xref: /linux/arch/x86/kernel/fpu/xstate.c (revision 8a5f956a9fb7d74fff681145082acfad5afa6bb8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16 #include <linux/coredump.h>
17 #include <linux/sort.h>
18 
19 #include <asm/fpu/api.h>
20 #include <asm/fpu/regset.h>
21 #include <asm/fpu/signal.h>
22 #include <asm/fpu/xcr.h>
23 
24 #include <asm/cpuid/api.h>
25 #include <asm/msr.h>
26 #include <asm/tlbflush.h>
27 #include <asm/prctl.h>
28 #include <asm/elf.h>
29 
30 #include <uapi/asm/elf.h>
31 
32 #include "context.h"
33 #include "internal.h"
34 #include "legacy.h"
35 #include "xstate.h"
36 
/*
 * Iterate over the set bits of @mask, starting at FIRST_EXTENDED_XFEATURE
 * so that the components below it are skipped.  @bit is the loop cursor;
 * @mask must be an lvalue because its address is taken.
 *
 * This expands to exactly one for statement, so it remains correct when
 * used as the sole body of an unbraced if/else.
 */
#define for_each_extended_xfeature(bit, mask)					\
	for ((bit) = FIRST_EXTENDED_XFEATURE;					\
	     (bit) = find_next_bit((unsigned long *)&(mask),			\
				   (8 * sizeof(mask)), (bit)),			\
	     (bit) < (8 * sizeof(mask));					\
	     (bit)++)
40 
/*
 * Human readable names of the xstate components.  cpu_has_xfeatures()
 * indexes this table with a bit position taken from an xfeature mask and
 * clamps positions beyond the table to the last entry, so the final
 * "unknown xstate feature" string doubles as the catch-all.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (KVM only)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"APX registers",
	"unknown xstate feature",
};
70 
/*
 * X86_FEATURE_* bit associated with each xfeature, indexed by xfeature
 * number.  fpu__init_system_xstate() walks this table and clears every
 * xfeature whose bit is not set on the boot CPU.  Slots without an
 * initializer are zero; that loop clears those as well, except for
 * XFEATURE_FP whose feature bit is itself 0.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_CET_KERNEL]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_APX]				= X86_FEATURE_APX,
};
89 
/*
 * Per-xfeature layout data, indexed by xfeature number and filled in by
 * setup_xstate_cache().  Offsets and sizes start out as -1 (all bits set
 * in the unsigned type); the offset of a supervisor feature is left at
 * that value on purpose.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
/* CPUID ECX output of each extended xfeature; tested via XSTATE_FLAG_*. */
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
95 
/*
 * Ordering of xstate components in uncompacted format:  The xfeature
 * number does not necessarily indicate its position in the XSAVE buffer.
 * This array defines the traversal order of xstate features.
 *
 * setup_xstate_cache() stores the non-supervisor extended xfeatures at the
 * front of the array and sorts them by offset.  The first entry that still
 * holds -1 terminates the list for next_xfeature_order().
 */
static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
103 
104 static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
105 {
106 	for (; xfeature_uncompact_order[i] != -1; i++) {
107 		if (mask & BIT_ULL(xfeature_uncompact_order[i]))
108 			break;
109 	}
110 
111 	return i;
112 }
113 
/*
 * Iterate xstate features in uncompacted order.
 *
 * @i walks the slots of xfeature_uncompact_order[] (it is a slot index, not
 * an xfeature number) and stops at the -1 terminator; slots whose xfeature
 * is not set in @mask are skipped.  Both arguments are parenthesized so
 * that arbitrary expressions can be passed safely.
 */
#define for_each_extended_xfeature_in_order(i, mask)	\
	for ((i) = 0;					\
	     (i) = next_xfeature_order((i), (mask)),	\
	     xfeature_uncompact_order[(i)] != -1;	\
	     (i)++)
120 
/*
 * Bits tested in xstate_flags[], which setup_xstate_cache() loads from the
 * ECX output of the per-xfeature CPUID subleaf:
 */
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)
123 
124 /*
125  * Return whether the system supports a given xfeature.
126  *
127  * Also return the name of the (most advanced) feature that the caller requested:
128  */
129 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
130 {
131 	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
132 
133 	if (unlikely(feature_name)) {
134 		long xfeature_idx, max_idx;
135 		u64 xfeatures_print;
136 		/*
137 		 * So we use FLS here to be able to print the most advanced
138 		 * feature that was requested but is missing. So if a driver
139 		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
140 		 * missing AVX feature - this is the most informative message
141 		 * to users:
142 		 */
143 		if (xfeatures_missing)
144 			xfeatures_print = xfeatures_missing;
145 		else
146 			xfeatures_print = xfeatures_needed;
147 
148 		xfeature_idx = fls64(xfeatures_print)-1;
149 		max_idx = ARRAY_SIZE(xfeature_names)-1;
150 		xfeature_idx = min(xfeature_idx, max_idx);
151 
152 		*feature_name = xfeature_names[xfeature_idx];
153 	}
154 
155 	if (xfeatures_missing)
156 		return 0;
157 
158 	return 1;
159 }
160 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
161 
162 static bool xfeature_is_aligned64(int xfeature_nr)
163 {
164 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
165 }
166 
167 static bool xfeature_is_supervisor(int xfeature_nr)
168 {
169 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
170 }
171 
172 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
173 {
174 	unsigned int offs, i;
175 
176 	/*
177 	 * Non-compacted format and legacy features use the cached fixed
178 	 * offsets.
179 	 */
180 	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
181 	    xfeature <= XFEATURE_SSE)
182 		return xstate_offsets[xfeature];
183 
184 	/*
185 	 * Compacted format offsets depend on the actual content of the
186 	 * compacted xsave area which is determined by the xcomp_bv header
187 	 * field.
188 	 */
189 	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
190 	for_each_extended_xfeature(i, xcomp_bv) {
191 		if (xfeature_is_aligned64(i))
192 			offs = ALIGN(offs, 64);
193 		if (i == xfeature)
194 			break;
195 		offs += xstate_sizes[i];
196 	}
197 	return offs;
198 }
199 
200 /*
201  * Enable the extended processor state save/restore feature.
202  * Called once per CPU onlining.
203  */
204 void fpu__init_cpu_xstate(void)
205 {
206 	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
207 		return;
208 
209 	cr4_set_bits(X86_CR4_OSXSAVE);
210 
211 	/*
212 	 * Must happen after CR4 setup and before xsetbv() to allow KVM
213 	 * lazy passthrough.  Write independent of the dynamic state static
214 	 * key as that does not work on the boot CPU. This also ensures
215 	 * that any stale state is wiped out from XFD. Reset the per CPU
216 	 * xfd cache too.
217 	 */
218 	if (cpu_feature_enabled(X86_FEATURE_XFD))
219 		xfd_set_state(init_fpstate.xfd);
220 
221 	/*
222 	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
223 	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
224 	 * states can be set here.
225 	 */
226 	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
227 
228 	/*
229 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
230 	 */
231 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
232 		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
233 				     xfeatures_mask_independent());
234 	}
235 }
236 
237 static bool xfeature_enabled(enum xfeature xfeature)
238 {
239 	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
240 }
241 
242 static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
243 {
244 	return  xstate_offsets[*(unsigned int *)xfeature1] -
245 		xstate_offsets[*(unsigned int *)xfeature2];
246 }
247 
248 /*
249  * Record the offsets and sizes of various xstates contained
250  * in the XSAVE state memory layout. Also, create an ordered
251  * list of xfeatures for handling out-of-order offsets.
252  */
253 static void __init setup_xstate_cache(void)
254 {
255 	u32 eax, ebx, ecx, edx, xfeature, i = 0;
256 	/*
257 	 * The FP xstates and SSE xstates are legacy states. They are always
258 	 * in the fixed offsets in the xsave area in either compacted form
259 	 * or standard form.
260 	 */
261 	xstate_offsets[XFEATURE_FP]	= 0;
262 	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
263 						   xmm_space);
264 
265 	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
266 	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
267 						       xmm_space);
268 
269 	for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
270 		cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);
271 
272 		xstate_sizes[xfeature] = eax;
273 		xstate_flags[xfeature] = ecx;
274 
275 		/*
276 		 * If an xfeature is supervisor state, the offset in EBX is
277 		 * invalid, leave it to -1.
278 		 */
279 		if (xfeature_is_supervisor(xfeature))
280 			continue;
281 
282 		xstate_offsets[xfeature] = ebx;
283 
284 		/* Populate the list of xfeatures before sorting */
285 		xfeature_uncompact_order[i++] = xfeature;
286 	}
287 
288 	/*
289 	 * Sort xfeatures by their offsets to support out-of-order
290 	 * offsets in the uncompacted format.
291 	 */
292 	sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
293 }
294 
295 /*
296  * Print out all the supported xstate features:
297  */
298 static void __init print_xstate_features(void)
299 {
300 	int i;
301 
302 	for (i = 0; i < XFEATURE_MAX; i++) {
303 		u64 mask = BIT_ULL(i);
304 		const char *name;
305 
306 		if (cpu_has_xfeatures(mask, &name))
307 			pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
308 	}
309 }
310 
/*
 * Warn when @nr is not the number of an extended xfeature, i.e. when it is
 * below FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 */
#define CHECK_XFEATURE(nr) do {			\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);		\
} while (0)
319 
320 /*
321  * Print out xstate component offsets and sizes
322  */
323 static void __init print_xstate_offset_size(void)
324 {
325 	int i;
326 
327 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
328 		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
329 			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
330 			i, xstate_sizes[i]);
331 	}
332 }
333 
334 /*
335  * This function is called only during boot time when x86 caps are not set
336  * up and alternative can not be used yet.
337  */
338 static __init void os_xrstor_booting(struct xregs_state *xstate)
339 {
340 	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
341 	u32 lmask = mask;
342 	u32 hmask = mask >> 32;
343 	int err;
344 
345 	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
346 		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
347 	else
348 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
349 
350 	/*
351 	 * We should never fault when copying from a kernel buffer, and the FPU
352 	 * state we set at boot time should be valid.
353 	 */
354 	WARN_ON_FPU(err);
355 }
356 
/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * setup_init_fpu_buf() fails the build when this list differs from the
 * union of the user and supervisor supported masks.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_CET_KERNEL |		\
	 XFEATURE_MASK_XTILE |			\
	 XFEATURE_MASK_APX)
379 
380 /*
381  * setup the xstate image representing the init state
382  */
383 static void __init setup_init_fpu_buf(void)
384 {
385 	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
386 		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
387 		     XFEATURES_INIT_FPSTATE_HANDLED);
388 
389 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
390 		return;
391 
392 	print_xstate_features();
393 
394 	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
395 
396 	/*
397 	 * Init all the features state with header.xfeatures being 0x0
398 	 */
399 	os_xrstor_booting(&init_fpstate.regs.xsave);
400 
401 	/*
402 	 * All components are now in init state. Read the state back so
403 	 * that init_fpstate contains all non-zero init state. This only
404 	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
405 	 * those use the init optimization which skips writing data for
406 	 * components in init state.
407 	 *
408 	 * XSAVE could be used, but that would require to reshuffle the
409 	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
410 	 * compaction. But doing so is a pointless exercise because most
411 	 * components have an all zeros init state except for the legacy
412 	 * ones (FP and SSE). Those can be saved with FXSAVE into the
413 	 * legacy area. Adding new features requires to ensure that init
414 	 * state is all zeroes or if not to add the necessary handling
415 	 * here.
416 	 */
417 	fxsave(&init_fpstate.regs.fxsave);
418 }
419 
420 int xfeature_size(int xfeature_nr)
421 {
422 	u32 eax, ebx, ecx, edx;
423 
424 	CHECK_XFEATURE(xfeature_nr);
425 	cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
426 	return eax;
427 }
428 
429 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
430 static int validate_user_xstate_header(const struct xstate_header *hdr,
431 				       struct fpstate *fpstate)
432 {
433 	/* No unknown or supervisor features may be set */
434 	if (hdr->xfeatures & ~fpstate->user_xfeatures)
435 		return -EINVAL;
436 
437 	/* Userspace must use the uncompacted format */
438 	if (hdr->xcomp_bv)
439 		return -EINVAL;
440 
441 	/*
442 	 * If 'reserved' is shrunken to add a new field, make sure to validate
443 	 * that new field here!
444 	 */
445 	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
446 
447 	/* No reserved bits may be set */
448 	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
449 		return -EINVAL;
450 
451 	return 0;
452 }
453 
454 static void __init __xstate_dump_leaves(void)
455 {
456 	int i;
457 	u32 eax, ebx, ecx, edx;
458 	static int should_dump = 1;
459 
460 	if (!should_dump)
461 		return;
462 	should_dump = 0;
463 	/*
464 	 * Dump out a few leaves past the ones that we support
465 	 * just in case there are some goodies up there
466 	 */
467 	for (i = 0; i < XFEATURE_MAX + 10; i++) {
468 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
469 		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
470 			CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
471 	}
472 }
473 
/*
 * Emit a one-time "XSAVE consistency problem" warning when @cond is true
 * and follow it up with a dump of the xstate CPUID leaves.
 */
#define XSTATE_WARN_ON(cond, fmt, ...) do {					\
	if (WARN_ONCE(cond, "XSAVE consistency problem: " fmt, ##__VA_ARGS__))	\
		__xstate_dump_leaves();						\
} while (0)
479 
/*
 * Compare the CPU reported size @sz of xfeature @nr with sizeof(@__struct).
 * A mismatch triggers a one-time warning plus a dump of the xstate CPUID
 * leaves, but the expression evaluates to true either way.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE((sz) != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[(nr)], sizeof(__struct), (sz))) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
488 
489 
490 /**
491  * check_xtile_data_against_struct - Check tile data state size.
492  *
493  * Calculate the state size by multiplying the single tile size which is
494  * recorded in a C struct, and the number of tiles that the CPU informs.
495  * Compare the provided size with the calculation.
496  *
497  * @size:	The tile data state size
498  *
499  * Returns:	0 on success, -EINVAL on mismatch.
500  */
501 static int __init check_xtile_data_against_struct(int size)
502 {
503 	u32 max_palid, palid, state_size;
504 	u32 eax, ebx, ecx, edx;
505 	u16 max_tile;
506 
507 	/*
508 	 * Check the maximum palette id:
509 	 *   eax: the highest numbered palette subleaf.
510 	 */
511 	cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
512 
513 	/*
514 	 * Cross-check each tile size and find the maximum number of
515 	 * supported tiles.
516 	 */
517 	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
518 		u16 tile_size, max;
519 
520 		/*
521 		 * Check the tile size info:
522 		 *   eax[31:16]:  bytes per title
523 		 *   ebx[31:16]:  the max names (or max number of tiles)
524 		 */
525 		cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
526 		tile_size = eax >> 16;
527 		max = ebx >> 16;
528 
529 		if (tile_size != sizeof(struct xtile_data)) {
530 			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
531 			       __stringify(XFEATURE_XTILE_DATA),
532 			       sizeof(struct xtile_data), tile_size);
533 			__xstate_dump_leaves();
534 			return -EINVAL;
535 		}
536 
537 		if (max > max_tile)
538 			max_tile = max;
539 	}
540 
541 	state_size = sizeof(struct xtile_data) * max_tile;
542 	if (size != state_size) {
543 		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
544 		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
545 		__xstate_dump_leaves();
546 		return -EINVAL;
547 	}
548 	return 0;
549 }
550 
551 /*
552  * We have a C struct for each 'xstate'.  We need to ensure
553  * that our software representation matches what the CPU
554  * tells us about the state's size.
555  */
556 static bool __init check_xstate_against_struct(int nr)
557 {
558 	/*
559 	 * Ask the CPU for the size of the state.
560 	 */
561 	int sz = xfeature_size(nr);
562 
563 	/*
564 	 * Match each CPU state with the corresponding software
565 	 * structure.
566 	 */
567 	switch (nr) {
568 	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
569 	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
570 	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
571 	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
572 	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
573 	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
574 	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
575 	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
576 	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
577 	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
578 	case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
579 	case XFEATURE_APX:        return XCHECK_SZ(sz, nr, struct apx_state);
580 	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
581 	default:
582 		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
583 		return false;
584 	}
585 
586 	return true;
587 }
588 
589 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
590 {
591 	unsigned int topmost = fls64(xfeatures) -  1;
592 	unsigned int offset, i;
593 
594 	if (topmost <= XFEATURE_SSE)
595 		return sizeof(struct xregs_state);
596 
597 	if (compacted) {
598 		offset = xfeature_get_offset(xfeatures, topmost);
599 	} else {
600 		/* Walk through the xfeature order to pick the last */
601 		for_each_extended_xfeature_in_order(i, xfeatures)
602 			topmost = xfeature_uncompact_order[i];
603 		offset = xstate_offsets[topmost];
604 	}
605 
606 	return offset + xstate_sizes[topmost];
607 }
608 
609 /*
610  * This essentially double-checks what the cpu told us about
611  * how large the XSAVE buffer needs to be.  We are recalculating
612  * it to be safe.
613  *
614  * Independent XSAVE features allocate their own buffers and are not
615  * covered by these checks. Only the size of the buffer for task->fpu
616  * is checked here.
617  */
618 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
619 {
620 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
621 	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
622 	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
623 	int i;
624 
625 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
626 		if (!check_xstate_against_struct(i))
627 			return false;
628 		/*
629 		 * Supervisor state components can be managed only by
630 		 * XSAVES.
631 		 */
632 		if (!xsaves && xfeature_is_supervisor(i)) {
633 			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
634 			return false;
635 		}
636 	}
637 	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
638 	XSTATE_WARN_ON(size != kernel_size,
639 		       "size %u != kernel_size %u\n", size, kernel_size);
640 	return size == kernel_size;
641 }
642 
643 /*
644  * Get total size of enabled xstates in XCR0 | IA32_XSS.
645  *
646  * Note the SDM's wording here.  "sub-function 0" only enumerates
647  * the size of the *user* states.  If we use it to size a buffer
648  * that we use 'XSAVES' on, we could potentially overflow the
649  * buffer because 'XSAVES' saves system states too.
650  *
651  * This also takes compaction into account. So this works for
652  * XSAVEC as well.
653  */
654 static unsigned int __init get_compacted_size(void)
655 {
656 	unsigned int eax, ebx, ecx, edx;
657 	/*
658 	 * - CPUID function 0DH, sub-function 1:
659 	 *    EBX enumerates the size (in bytes) required by
660 	 *    the XSAVES instruction for an XSAVE area
661 	 *    containing all the state components
662 	 *    corresponding to bits currently set in
663 	 *    XCR0 | IA32_XSS.
664 	 *
665 	 * When XSAVES is not available but XSAVEC is (virt), then there
666 	 * are no supervisor states, but XSAVEC still uses compacted
667 	 * format.
668 	 */
669 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
670 	return ebx;
671 }
672 
673 /*
674  * Get the total size of the enabled xstates without the independent supervisor
675  * features.
676  */
677 static unsigned int __init get_xsave_compacted_size(void)
678 {
679 	u64 mask = xfeatures_mask_independent();
680 	unsigned int size;
681 
682 	if (!mask)
683 		return get_compacted_size();
684 
685 	/* Disable independent features. */
686 	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());
687 
688 	/*
689 	 * Ask the hardware what size is required of the buffer.
690 	 * This is the size required for the task->fpu buffer.
691 	 */
692 	size = get_compacted_size();
693 
694 	/* Re-enable independent features so XSAVES will work on them again. */
695 	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
696 
697 	return size;
698 }
699 
700 static unsigned int __init get_xsave_size_user(void)
701 {
702 	unsigned int eax, ebx, ecx, edx;
703 	/*
704 	 * - CPUID function 0DH, sub-function 0:
705 	 *    EBX enumerates the size (in bytes) required by
706 	 *    the XSAVE instruction for an XSAVE area
707 	 *    containing all the *user* state components
708 	 *    corresponding to bits currently set in XCR0.
709 	 */
710 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
711 	return ebx;
712 }
713 
714 static int __init init_xstate_size(void)
715 {
716 	/* Recompute the context size for enabled features: */
717 	unsigned int user_size, kernel_size, kernel_default_size;
718 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
719 
720 	/* Uncompacted user space size */
721 	user_size = get_xsave_size_user();
722 
723 	/*
724 	 * XSAVES kernel size includes supervisor states and uses compacted
725 	 * format. XSAVEC uses compacted format, but does not save
726 	 * supervisor states.
727 	 *
728 	 * XSAVE[OPT] do not support supervisor states so kernel and user
729 	 * size is identical.
730 	 */
731 	if (compacted)
732 		kernel_size = get_xsave_compacted_size();
733 	else
734 		kernel_size = user_size;
735 
736 	kernel_default_size =
737 		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
738 
739 	if (!paranoid_xstate_size_valid(kernel_size))
740 		return -EINVAL;
741 
742 	fpu_kernel_cfg.max_size = kernel_size;
743 	fpu_user_cfg.max_size = user_size;
744 
745 	fpu_kernel_cfg.default_size = kernel_default_size;
746 	fpu_user_cfg.default_size =
747 		xstate_calculate_size(fpu_user_cfg.default_features, false);
748 
749 	guest_default_cfg.size =
750 		xstate_calculate_size(guest_default_cfg.features, compacted);
751 
752 	return 0;
753 }
754 
755 /*
756  * We enabled the XSAVE hardware, but something went wrong and
757  * we can not use it.  Disable it.
758  */
759 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
760 {
761 	pr_info("x86/fpu: XSAVE disabled\n");
762 
763 	fpu_kernel_cfg.max_features = 0;
764 	cr4_clear_bits(X86_CR4_OSXSAVE);
765 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
766 
767 	/* Restore the legacy size.*/
768 	fpu_kernel_cfg.max_size = legacy_size;
769 	fpu_kernel_cfg.default_size = legacy_size;
770 	fpu_user_cfg.max_size = legacy_size;
771 	fpu_user_cfg.default_size = legacy_size;
772 	guest_default_cfg.size = legacy_size;
773 
774 	/*
775 	 * Prevent enabling the static branch which enables writes to the
776 	 * XFD MSR.
777 	 */
778 	init_fpstate.xfd = 0;
779 
780 	fpstate_reset(x86_task_fpu(current));
781 }
782 
783 static u64 __init host_default_mask(void)
784 {
785 	/*
786 	 * Exclude dynamic features (require userspace opt-in) and features
787 	 * that are supported only for KVM guests.
788 	 */
789 	return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
790 }
791 
792 static u64 __init guest_default_mask(void)
793 {
794 	/*
795 	 * Exclude dynamic features, which require userspace opt-in even
796 	 * for KVM guests.
797 	 */
798 	return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
799 }
800 
801 /*
802  * Enable and initialize the xsave feature.
803  * Called once per system bootup.
804  */
805 void __init fpu__init_system_xstate(unsigned int legacy_size)
806 {
807 	unsigned int eax, ebx, ecx, edx;
808 	u64 xfeatures;
809 	int err;
810 	int i;
811 
812 	if (!boot_cpu_has(X86_FEATURE_FPU)) {
813 		pr_info("x86/fpu: No FPU detected\n");
814 		return;
815 	}
816 
817 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
818 		pr_info("x86/fpu: x87 FPU will use %s\n",
819 			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
820 		return;
821 	}
822 
823 	/*
824 	 * Find user xstates supported by the processor.
825 	 */
826 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
827 	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
828 
829 	/*
830 	 * Find supervisor xstates supported by the processor.
831 	 */
832 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
833 	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
834 
835 	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
836 		/*
837 		 * This indicates that something really unexpected happened
838 		 * with the enumeration.  Disable XSAVE and try to continue
839 		 * booting without it.  This is too early to BUG().
840 		 */
841 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
842 		       fpu_kernel_cfg.max_features);
843 		goto out_disable;
844 	}
845 
846 	if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
847 	    fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
848 		/*
849 		 * This is a problematic CPU configuration where two
850 		 * conflicting state components are both enumerated.
851 		 */
852 		pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
853 		       fpu_kernel_cfg.max_features);
854 		goto out_disable;
855 	}
856 
857 	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
858 					      XFEATURE_MASK_INDEPENDENT;
859 
860 	/*
861 	 * Clear XSAVE features that are disabled in the normal CPUID.
862 	 */
863 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
864 		unsigned short cid = xsave_cpuid_features[i];
865 
866 		/* Careful: X86_FEATURE_FPU is 0! */
867 		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
868 			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
869 	}
870 
871 	if (!cpu_feature_enabled(X86_FEATURE_XFD))
872 		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
873 
874 	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
875 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
876 	else
877 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
878 					XFEATURE_MASK_SUPERVISOR_SUPPORTED;
879 
880 	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
881 	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
882 
883 	/*
884 	 * Now, given maximum feature set, determine default values by
885 	 * applying default masks.
886 	 */
887 	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
888 	fpu_user_cfg.default_features   = fpu_user_cfg.max_features & host_default_mask();
889 	guest_default_cfg.features      = fpu_kernel_cfg.max_features & guest_default_mask();
890 
891 	/* Store it for paranoia check at the end */
892 	xfeatures = fpu_kernel_cfg.max_features;
893 
894 	/*
895 	 * Initialize the default XFD state in initfp_state and enable the
896 	 * dynamic sizing mechanism if dynamic states are available.  The
897 	 * static key cannot be enabled here because this runs before
898 	 * jump_label_init(). This is delayed to an initcall.
899 	 */
900 	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
901 
902 	/* Set up compaction feature bit */
903 	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
904 	    cpu_feature_enabled(X86_FEATURE_XSAVES))
905 		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
906 
907 	/* Enable xstate instructions to be able to continue with initialization: */
908 	fpu__init_cpu_xstate();
909 
910 	/* Cache size, offset and flags for initialization */
911 	setup_xstate_cache();
912 
913 	err = init_xstate_size();
914 	if (err)
915 		goto out_disable;
916 
917 	/*
918 	 * Update info used for ptrace frames; use standard-format size and no
919 	 * supervisor xstates:
920 	 */
921 	update_regset_xstate_info(fpu_user_cfg.max_size,
922 				  fpu_user_cfg.max_features);
923 
924 	/*
925 	 * init_fpstate excludes dynamic states as they are large but init
926 	 * state is zero.
927 	 */
928 	init_fpstate.size		= fpu_kernel_cfg.default_size;
929 	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;
930 
931 	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
932 		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
933 			sizeof(init_fpstate.regs), init_fpstate.size);
934 		goto out_disable;
935 	}
936 
937 	setup_init_fpu_buf();
938 
939 	/*
940 	 * Paranoia check whether something in the setup modified the
941 	 * xfeatures mask.
942 	 */
943 	if (xfeatures != fpu_kernel_cfg.max_features) {
944 		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
945 		       xfeatures, fpu_kernel_cfg.max_features);
946 		goto out_disable;
947 	}
948 
949 	/*
950 	 * CPU capabilities initialization runs before FPU init. So
951 	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
952 	 * functional, set the feature bit so depending code works.
953 	 */
954 	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
955 
956 	print_xstate_offset_size();
957 	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
958 		fpu_kernel_cfg.max_features,
959 		fpu_kernel_cfg.max_size,
960 		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
961 	return;
962 
963 out_disable:
964 	/* something went wrong, try to boot without any XSAVE support */
965 	fpu__init_disable_system_xstate(legacy_size);
966 }
967 
968 /*
969  * Restore minimal FPU state after suspend:
970  */
971 void fpu__resume_cpu(void)
972 {
973 	/*
974 	 * Restore XCR0 on xsave capable CPUs:
975 	 */
976 	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
977 		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
978 
979 	/*
980 	 * Restore IA32_XSS. The same CPUID bit enumerates support
981 	 * of XSAVES and MSR_IA32_XSS.
982 	 */
983 	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
984 		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
985 				     xfeatures_mask_independent());
986 	}
987 
988 	if (fpu_state_size_dynamic())
989 		wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
990 }
991 
992 /*
993  * Given an xstate feature nr, calculate where in the xsave
994  * buffer the state is.  Callers should ensure that the buffer
995  * is valid.
996  */
997 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
998 {
999 	u64 xcomp_bv = xsave->header.xcomp_bv;
1000 
1001 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1002 		return NULL;
1003 
1004 	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
1005 		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
1006 			return NULL;
1007 	}
1008 
1009 	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
1010 }
1011 
1012 /*
1013  * Given the xsave area and a state inside, this function returns the
1014  * address of the state.
1015  *
1016  * This is the API that is called to get xstate address in either
1017  * standard format or compacted format of xsave area.
1018  *
1019  * Note that if there is no data for the field in the xsave buffer
1020  * this will return NULL.
1021  *
1022  * Inputs:
1023  *	xstate: the thread's storage area for all FPU data
1024  *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
1025  *	XFEATURE_SSE, etc...)
1026  * Output:
1027  *	address of the state in the xsave area, or NULL if the
1028  *	field is not present in the xsave buffer.
1029  */
1030 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
1031 {
1032 	/*
1033 	 * Do we even *have* xsave state?
1034 	 */
1035 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
1036 		return NULL;
1037 
1038 	/*
1039 	 * We should not ever be requesting features that we
1040 	 * have not enabled.
1041 	 */
1042 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1043 		return NULL;
1044 
1045 	/*
1046 	 * This assumes the last 'xsave*' instruction to
1047 	 * have requested that 'xfeature_nr' be saved.
1048 	 * If it did not, we might be seeing and old value
1049 	 * of the field in the buffer.
1050 	 *
1051 	 * This can happen because the last 'xsave' did not
1052 	 * request that this feature be saved (unlikely)
1053 	 * or because the "init optimization" caused it
1054 	 * to not be saved.
1055 	 */
1056 	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
1057 		return NULL;
1058 
1059 	return __raw_xsave_addr(xsave, xfeature_nr);
1060 }
1061 EXPORT_SYMBOL_GPL(get_xsave_addr);
1062 
1063 /*
1064  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
1065  * The xsave buffer should be in standard format, not compacted (e.g. user mode
1066  * signal frames).
1067  */
1068 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1069 {
1070 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1071 		return NULL;
1072 
1073 	return (void __user *)xsave + xstate_offsets[xfeature_nr];
1074 }
1075 
#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * Modify the PKRU register so that the access rights for @pkey become
 * @init_val. PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE in @init_val are
 * translated to the PKRU access-disable and write-disable bits of @pkey;
 * the bits of all other keys are preserved.
 *
 * @tsk is not used by this implementation.
 *
 * Returns 0 on success, -EINVAL when protection keys are not enabled or
 * when @pkey is out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed. Negative values must be
	 * rejected too: they would turn into a negative shift
	 * count below.
	 */
	if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
1123 
1124 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1125 			 void *init_xstate, unsigned int size)
1126 {
1127 	membuf_write(to, from_xstate ? xstate : init_xstate, size);
1128 }
1129 
1130 /**
1131  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1132  * @to:		membuf descriptor
1133  * @fpstate:	The fpstate buffer from which to copy
1134  * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
1135  * @pkru_val:	The PKRU value to store in the PKRU component
1136  * @copy_mode:	The requested copy mode
1137  *
1138  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1139  * format, i.e. from the kernel internal hardware dependent storage format
1140  * to the requested @mode. UABI XSTATE is always uncompacted!
1141  *
1142  * It supports partial copy but @to.pos always starts from zero.
1143  */
1144 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1145 			       u64 xfeatures, u32 pkru_val,
1146 			       enum xstate_copy_mode copy_mode)
1147 {
1148 	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1149 	struct xregs_state *xinit = &init_fpstate.regs.xsave;
1150 	struct xregs_state *xsave = &fpstate->regs.xsave;
1151 	unsigned int zerofrom, i, xfeature;
1152 	struct xstate_header header;
1153 	u64 mask;
1154 
1155 	memset(&header, 0, sizeof(header));
1156 	header.xfeatures = xsave->header.xfeatures;
1157 
1158 	/* Mask out the feature bits depending on copy mode */
1159 	switch (copy_mode) {
1160 	case XSTATE_COPY_FP:
1161 		header.xfeatures &= XFEATURE_MASK_FP;
1162 		break;
1163 
1164 	case XSTATE_COPY_FX:
1165 		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1166 		break;
1167 
1168 	case XSTATE_COPY_XSAVE:
1169 		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1170 		break;
1171 	}
1172 
1173 	/* Copy FP state up to MXCSR */
1174 	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1175 		     &xinit->i387, off_mxcsr);
1176 
1177 	/* Copy MXCSR when SSE or YMM are set in the feature mask */
1178 	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1179 		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1180 		     MXCSR_AND_FLAGS_SIZE);
1181 
1182 	/* Copy the remaining FP state */
1183 	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1184 		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
1185 		     sizeof(xsave->i387.st_space));
1186 
1187 	/* Copy the SSE state - shared with YMM, but independently managed */
1188 	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1189 		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1190 		     sizeof(xsave->i387.xmm_space));
1191 
1192 	if (copy_mode != XSTATE_COPY_XSAVE)
1193 		goto out;
1194 
1195 	/* Zero the padding area */
1196 	membuf_zero(&to, sizeof(xsave->i387.padding));
1197 
1198 	/* Copy xsave->i387.sw_reserved */
1199 	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1200 
1201 	/* Copy the user space relevant state of @xsave->header */
1202 	membuf_write(&to, &header, sizeof(header));
1203 
1204 	zerofrom = offsetof(struct xregs_state, extended_state_area);
1205 
1206 	/*
1207 	 * This 'mask' indicates which states to copy from fpstate.
1208 	 * Those extended states that are not present in fpstate are
1209 	 * either disabled or initialized:
1210 	 *
1211 	 * In non-compacted format, disabled features still occupy
1212 	 * state space but there is no state to copy from in the
1213 	 * compacted init_fpstate. The gap tracking will zero these
1214 	 * states.
1215 	 *
1216 	 * The extended features have an all zeroes init state. Thus,
1217 	 * remove them from 'mask' to zero those features in the user
1218 	 * buffer instead of retrieving them from init_fpstate.
1219 	 */
1220 	mask = header.xfeatures;
1221 
1222 	for_each_extended_xfeature_in_order(i, mask) {
1223 		xfeature = xfeature_uncompact_order[i];
1224 		/*
1225 		 * If there was a feature or alignment gap, zero the space
1226 		 * in the destination buffer.
1227 		 */
1228 		if (zerofrom < xstate_offsets[xfeature])
1229 			membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);
1230 
1231 		if (xfeature == XFEATURE_PKRU) {
1232 			struct pkru_state pkru = {0};
1233 			/*
1234 			 * PKRU is not necessarily up to date in the
1235 			 * XSAVE buffer. Use the provided value.
1236 			 */
1237 			pkru.pkru = pkru_val;
1238 			membuf_write(&to, &pkru, sizeof(pkru));
1239 		} else {
1240 			membuf_write(&to,
1241 				     __raw_xsave_addr(xsave, xfeature),
1242 				     xstate_sizes[xfeature]);
1243 		}
1244 		/*
1245 		 * Keep track of the last copied state in the non-compacted
1246 		 * target buffer for gap zeroing.
1247 		 */
1248 		zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
1249 	}
1250 
1251 out:
1252 	if (to.left)
1253 		membuf_zero(&to, to.left);
1254 }
1255 
1256 /**
1257  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1258  * @to:		membuf descriptor
1259  * @tsk:	The task from which to copy the saved xstate
1260  * @copy_mode:	The requested copy mode
1261  *
1262  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1263  * format, i.e. from the kernel internal hardware dependent storage format
1264  * to the requested @mode. UABI XSTATE is always uncompacted!
1265  *
1266  * It supports partial copy but @to.pos always starts from zero.
1267  */
1268 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1269 			     enum xstate_copy_mode copy_mode)
1270 {
1271 	__copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
1272 				  x86_task_fpu(tsk)->fpstate->user_xfeatures,
1273 				  tsk->thread.pkru, copy_mode);
1274 }
1275 
1276 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1277 			    const void *kbuf, const void __user *ubuf)
1278 {
1279 	if (kbuf) {
1280 		memcpy(dst, kbuf + offset, size);
1281 	} else {
1282 		if (copy_from_user(dst, ubuf + offset, size))
1283 			return -EFAULT;
1284 	}
1285 	return 0;
1286 }
1287 
1288 
1289 /**
1290  * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1291  * @fpstate:	The fpstate buffer to copy to
1292  * @kbuf:	The UABI format buffer, if it comes from the kernel
1293  * @ubuf:	The UABI format buffer, if it comes from userspace
1294  * @pkru:	The location to write the PKRU value to
1295  *
1296  * Converts from the UABI format into the kernel internal hardware
1297  * dependent format.
1298  *
1299  * This function ultimately has three different callers with distinct PKRU
1300  * behavior.
1301  * 1.	When called from sigreturn the PKRU register will be restored from
1302  *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1303  *	@fpstate is sufficient to cover this case, but the caller will also
1304  *	pass a pointer to the thread_struct's pkru field in @pkru and updating
1305  *	it is harmless.
1306  * 2.	When called from ptrace the PKRU register will be restored from the
1307  *	thread_struct's pkru field. A pointer to that is passed in @pkru.
1308  *	The kernel will restore it manually, so the XRSTOR behavior that resets
1309  *	the PKRU register to the hardware init value (0) if the corresponding
1310  *	xfeatures bit is not set is emulated here.
1311  * 3.	When called from KVM the PKRU register will be restored from the vcpu's
1312  *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1313  *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
1314  *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1315  *	bit is not set.
1316  */
1317 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1318 			       const void __user *ubuf, u32 *pkru)
1319 {
1320 	struct xregs_state *xsave = &fpstate->regs.xsave;
1321 	unsigned int offset, size;
1322 	struct xstate_header hdr;
1323 	u64 mask;
1324 	int i;
1325 
1326 	offset = offsetof(struct xregs_state, header);
1327 	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1328 		return -EFAULT;
1329 
1330 	if (validate_user_xstate_header(&hdr, fpstate))
1331 		return -EINVAL;
1332 
1333 	/* Validate MXCSR when any of the related features is in use */
1334 	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1335 	if (hdr.xfeatures & mask) {
1336 		u32 mxcsr[2];
1337 
1338 		offset = offsetof(struct fxregs_state, mxcsr);
1339 		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1340 			return -EFAULT;
1341 
1342 		/* Reserved bits in MXCSR must be zero. */
1343 		if (mxcsr[0] & ~mxcsr_feature_mask)
1344 			return -EINVAL;
1345 
1346 		/* SSE and YMM require MXCSR even when FP is not in use. */
1347 		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1348 			xsave->i387.mxcsr = mxcsr[0];
1349 			xsave->i387.mxcsr_mask = mxcsr[1];
1350 		}
1351 	}
1352 
1353 	for (i = 0; i < XFEATURE_MAX; i++) {
1354 		mask = BIT_ULL(i);
1355 
1356 		if (hdr.xfeatures & mask) {
1357 			void *dst = __raw_xsave_addr(xsave, i);
1358 
1359 			offset = xstate_offsets[i];
1360 			size = xstate_sizes[i];
1361 
1362 			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1363 				return -EFAULT;
1364 		}
1365 	}
1366 
1367 	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1368 		struct pkru_state *xpkru;
1369 
1370 		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1371 		*pkru = xpkru->pkru;
1372 	} else {
1373 		/*
1374 		 * KVM may pass NULL here to indicate that it does not need
1375 		 * PKRU updated.
1376 		 */
1377 		if (pkru)
1378 			*pkru = 0;
1379 	}
1380 
1381 	/*
1382 	 * The state that came in from userspace was user-state only.
1383 	 * Mask all the user states out of 'xfeatures':
1384 	 */
1385 	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1386 
1387 	/*
1388 	 * Add back in the features that came in from userspace:
1389 	 */
1390 	xsave->header.xfeatures |= hdr.xfeatures;
1391 
1392 	return 0;
1393 }
1394 
1395 /*
1396  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1397  * format and copy to the target thread. Used by ptrace and KVM.
1398  */
1399 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1400 {
1401 	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1402 }
1403 
1404 /*
1405  * Convert from a sigreturn standard-format user-space buffer to kernel
1406  * XSAVE[S] format and copy to the target thread. This is called from the
1407  * sigreturn() and rt_sigreturn() system calls.
1408  */
1409 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1410 				      const void __user *ubuf)
1411 {
1412 	return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
1413 }
1414 
1415 static bool validate_independent_components(u64 mask)
1416 {
1417 	u64 xchk;
1418 
1419 	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1420 		return false;
1421 
1422 	xchk = ~xfeatures_mask_independent();
1423 
1424 	if (WARN_ON_ONCE(!mask || mask & xchk))
1425 		return false;
1426 
1427 	return true;
1428 }
1429 
1430 /**
1431  * xsaves - Save selected components to a kernel xstate buffer
1432  * @xstate:	Pointer to the buffer
1433  * @mask:	Feature mask to select the components to save
1434  *
1435  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1436  * XSAVES does not write the full xstate header. Before first use the
1437  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1438  * can #GP.
1439  *
1440  * The feature mask must be a subset of the independent features.
1441  */
1442 void xsaves(struct xregs_state *xstate, u64 mask)
1443 {
1444 	int err;
1445 
1446 	if (!validate_independent_components(mask))
1447 		return;
1448 
1449 	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1450 	WARN_ON_ONCE(err);
1451 }
1452 
1453 /**
1454  * xrstors - Restore selected components from a kernel xstate buffer
1455  * @xstate:	Pointer to the buffer
1456  * @mask:	Feature mask to select the components to restore
1457  *
1458  * The @xstate buffer must be 64 byte aligned and correctly initialized
1459  * otherwise XRSTORS from that buffer can #GP.
1460  *
1461  * Proper usage is to restore the state which was saved with
1462  * xsaves() into @xstate.
1463  *
1464  * The feature mask must be a subset of the independent features.
1465  */
1466 void xrstors(struct xregs_state *xstate, u64 mask)
1467 {
1468 	int err;
1469 
1470 	if (!validate_independent_components(mask))
1471 		return;
1472 
1473 	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1474 	WARN_ON_ONCE(err);
1475 }
1476 
1477 #if IS_ENABLED(CONFIG_KVM)
1478 void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
1479 {
1480 	void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);
1481 
1482 	if (addr)
1483 		memset(addr, 0, xstate_sizes[xfeature]);
1484 }
1485 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1486 #endif
1487 
1488 #ifdef CONFIG_X86_64
1489 
1490 #ifdef CONFIG_X86_DEBUG_FPU
1491 /*
1492  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1493  * can safely operate on the @fpstate buffer.
1494  */
1495 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1496 {
1497 	u64 xfd = __this_cpu_read(xfd_state);
1498 
1499 	if (fpstate->xfd == xfd)
1500 		return true;
1501 
1502 	 /*
1503 	  * The XFD MSR does not match fpstate->xfd. That's invalid when
1504 	  * the passed in fpstate is current's fpstate.
1505 	  */
1506 	if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
1507 		return false;
1508 
1509 	/*
1510 	 * XRSTOR(S) from init_fpstate are always correct as it will just
1511 	 * bring all components into init state and not read from the
1512 	 * buffer. XSAVE(S) raises #PF after init.
1513 	 */
1514 	if (fpstate == &init_fpstate)
1515 		return rstor;
1516 
1517 	/*
1518 	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1519 	 * XRSTORS(S): fpu_swap_kvm_fpstate()
1520 	 */
1521 
1522 	/*
1523 	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1524 	 * the buffer area for XFD-disabled state components.
1525 	 */
1526 	mask &= ~xfd;
1527 
1528 	/*
1529 	 * Remove features which are valid in fpstate. They
1530 	 * have space allocated in fpstate.
1531 	 */
1532 	mask &= ~fpstate->xfeatures;
1533 
1534 	/*
1535 	 * Any remaining state components in 'mask' might be written
1536 	 * by XSAVE/XRSTOR. Fail validation it found.
1537 	 */
1538 	return !mask;
1539 }
1540 
1541 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1542 {
1543 	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1544 }
1545 #endif /* CONFIG_X86_DEBUG_FPU */
1546 
1547 static int __init xfd_update_static_branch(void)
1548 {
1549 	/*
1550 	 * If init_fpstate.xfd has bits set then dynamic features are
1551 	 * available and the dynamic sizing must be enabled.
1552 	 */
1553 	if (init_fpstate.xfd)
1554 		static_branch_enable(&__fpu_state_size_dynamic);
1555 	return 0;
1556 }
1557 arch_initcall(xfd_update_static_branch)
1558 
1559 void fpstate_free(struct fpu *fpu)
1560 {
1561 	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1562 		vfree(fpu->fpstate);
1563 }
1564 
1565 /**
1566  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1567  *
1568  * @xfeatures:	A bitmap of xstate features which extend the enabled features
1569  *		of that task
1570  * @ksize:	The required size for the kernel buffer
1571  * @usize:	The required size for user space buffers
1572  * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
1573  *
1574  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1575  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1576  * with large states are likely to live longer.
1577  *
1578  * Returns: 0 on success, -ENOMEM on allocation error.
1579  */
1580 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1581 			   unsigned int usize, struct fpu_guest *guest_fpu)
1582 {
1583 	struct fpu *fpu = x86_task_fpu(current);
1584 	struct fpstate *curfps, *newfps = NULL;
1585 	unsigned int fpsize;
1586 	bool in_use;
1587 
1588 	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1589 
1590 	newfps = vzalloc(fpsize);
1591 	if (!newfps)
1592 		return -ENOMEM;
1593 	newfps->size = ksize;
1594 	newfps->user_size = usize;
1595 	newfps->is_valloc = true;
1596 
1597 	/*
1598 	 * When a guest FPU is supplied, use @guest_fpu->fpstate
1599 	 * as reference independent whether it is in use or not.
1600 	 */
1601 	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1602 
1603 	/* Determine whether @curfps is the active fpstate */
1604 	in_use = fpu->fpstate == curfps;
1605 
1606 	if (guest_fpu) {
1607 		newfps->is_guest = true;
1608 		newfps->is_confidential = curfps->is_confidential;
1609 		newfps->in_use = curfps->in_use;
1610 		guest_fpu->xfeatures |= xfeatures;
1611 		guest_fpu->uabi_size = usize;
1612 	}
1613 
1614 	fpregs_lock();
1615 	/*
1616 	 * If @curfps is in use, ensure that the current state is in the
1617 	 * registers before swapping fpstate as that might invalidate it
1618 	 * due to layout changes.
1619 	 */
1620 	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1621 		fpregs_restore_userregs();
1622 
1623 	newfps->xfeatures = curfps->xfeatures | xfeatures;
1624 	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1625 	newfps->xfd = curfps->xfd & ~xfeatures;
1626 
1627 	/* Do the final updates within the locked region */
1628 	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1629 
1630 	if (guest_fpu) {
1631 		guest_fpu->fpstate = newfps;
1632 		/* If curfps is active, update the FPU fpstate pointer */
1633 		if (in_use)
1634 			fpu->fpstate = newfps;
1635 	} else {
1636 		fpu->fpstate = newfps;
1637 	}
1638 
1639 	if (in_use)
1640 		xfd_update_state(fpu->fpstate);
1641 	fpregs_unlock();
1642 
1643 	/* Only free valloc'ed state */
1644 	if (curfps && curfps->is_valloc)
1645 		vfree(curfps);
1646 
1647 	return 0;
1648 }
1649 
1650 static int validate_sigaltstack(unsigned int usize)
1651 {
1652 	struct task_struct *thread, *leader = current->group_leader;
1653 	unsigned long framesize = get_sigframe_size();
1654 
1655 	lockdep_assert_held(&current->sighand->siglock);
1656 
1657 	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1658 	framesize -= fpu_user_cfg.max_size;
1659 	framesize += usize;
1660 	for_each_thread(leader, thread) {
1661 		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1662 			return -ENOSPC;
1663 	}
1664 	return 0;
1665 }
1666 
1667 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1668 {
1669 	/*
1670 	 * This deliberately does not exclude !XSAVES as we still might
1671 	 * decide to optionally context switch XCR0 or talk the silicon
1672 	 * vendors into extending XFD for the pre AMX states, especially
1673 	 * AVX512.
1674 	 */
1675 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1676 	struct fpu *fpu = x86_task_fpu(current->group_leader);
1677 	struct fpu_state_perm *perm;
1678 	unsigned int ksize, usize;
1679 	u64 mask;
1680 	int ret = 0;
1681 
1682 	/* Check whether fully enabled */
1683 	if ((permitted & requested) == requested)
1684 		return 0;
1685 
1686 	/*
1687 	 * Calculate the resulting kernel state size.  Note, @permitted also
1688 	 * contains supervisor xfeatures even though supervisor are always
1689 	 * permitted for kernel and guest FPUs, and never permitted for user
1690 	 * FPUs.
1691 	 */
1692 	mask = permitted | requested;
1693 	ksize = xstate_calculate_size(mask, compacted);
1694 
1695 	/*
1696 	 * Calculate the resulting user state size.  Take care not to clobber
1697 	 * the supervisor xfeatures in the new mask!
1698 	 */
1699 	usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
1700 
1701 	if (!guest) {
1702 		ret = validate_sigaltstack(usize);
1703 		if (ret)
1704 			return ret;
1705 	}
1706 
1707 	perm = guest ? &fpu->guest_perm : &fpu->perm;
1708 	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1709 	WRITE_ONCE(perm->__state_perm, mask);
1710 	/* Protected by sighand lock */
1711 	perm->__state_size = ksize;
1712 	perm->__user_state_size = usize;
1713 	return ret;
1714 }
1715 
1716 /*
1717  * Permissions array to map facilities with more than one component
1718  */
1719 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1720 	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1721 };
1722 
1723 static int xstate_request_perm(unsigned long idx, bool guest)
1724 {
1725 	u64 permitted, requested;
1726 	int ret;
1727 
1728 	if (idx >= XFEATURE_MAX)
1729 		return -EINVAL;
1730 
1731 	/*
1732 	 * Look up the facility mask which can require more than
1733 	 * one xstate component.
1734 	 */
1735 	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1736 	requested = xstate_prctl_req[idx];
1737 	if (!requested)
1738 		return -EOPNOTSUPP;
1739 
1740 	if ((fpu_user_cfg.max_features & requested) != requested)
1741 		return -EOPNOTSUPP;
1742 
1743 	/* Lockless quick check */
1744 	permitted = xstate_get_group_perm(guest);
1745 	if ((permitted & requested) == requested)
1746 		return 0;
1747 
1748 	/* Protect against concurrent modifications */
1749 	spin_lock_irq(&current->sighand->siglock);
1750 	permitted = xstate_get_group_perm(guest);
1751 
1752 	/* First vCPU allocation locks the permissions. */
1753 	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1754 		ret = -EBUSY;
1755 	else
1756 		ret = __xstate_request_perm(permitted, requested, guest);
1757 	spin_unlock_irq(&current->sighand->siglock);
1758 	return ret;
1759 }
1760 
1761 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1762 {
1763 	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1764 	struct fpu_state_perm *perm;
1765 	unsigned int ksize, usize;
1766 	struct fpu *fpu;
1767 
1768 	if (!xfd_event) {
1769 		if (!guest_fpu)
1770 			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1771 		return 0;
1772 	}
1773 
1774 	/* Protect against concurrent modifications */
1775 	spin_lock_irq(&current->sighand->siglock);
1776 
1777 	/* If not permitted let it die */
1778 	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1779 		spin_unlock_irq(&current->sighand->siglock);
1780 		return -EPERM;
1781 	}
1782 
1783 	fpu = x86_task_fpu(current->group_leader);
1784 	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1785 	ksize = perm->__state_size;
1786 	usize = perm->__user_state_size;
1787 
1788 	/*
1789 	 * The feature is permitted. State size is sufficient.  Dropping
1790 	 * the lock is safe here even if more features are added from
1791 	 * another task, the retrieved buffer sizes are valid for the
1792 	 * currently requested feature(s).
1793 	 */
1794 	spin_unlock_irq(&current->sighand->siglock);
1795 
1796 	/*
1797 	 * Try to allocate a new fpstate. If that fails there is no way
1798 	 * out.
1799 	 */
1800 	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1801 		return -EFAULT;
1802 	return 0;
1803 }
1804 
1805 int xfd_enable_feature(u64 xfd_err)
1806 {
1807 	return __xfd_enable_feature(xfd_err, NULL);
1808 }
1809 
1810 #else /* CONFIG_X86_64 */
1811 static inline int xstate_request_perm(unsigned long idx, bool guest)
1812 {
1813 	return -EPERM;
1814 }
1815 #endif  /* !CONFIG_X86_64 */
1816 
1817 u64 xstate_get_guest_group_perm(void)
1818 {
1819 	return xstate_get_group_perm(true);
1820 }
1821 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1822 
1823 /**
1824  * fpu_xstate_prctl - xstate permission operations
1825  * @option:	A subfunction of arch_prctl()
1826  * @arg2:	option argument
1827  * Return:	0 if successful; otherwise, an error code
1828  *
1829  * Option arguments:
1830  *
1831  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1832  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1833  * ARCH_REQ_XCOMP_PERM: Facility number requested
1834  *
1835  * For facilities which require more than one XSTATE component, the request
1836  * must be the highest state component number related to that facility,
1837  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1838  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1839  */
1840 long fpu_xstate_prctl(int option, unsigned long arg2)
1841 {
1842 	u64 __user *uptr = (u64 __user *)arg2;
1843 	u64 permitted, supported;
1844 	unsigned long idx = arg2;
1845 	bool guest = false;
1846 
1847 	switch (option) {
1848 	case ARCH_GET_XCOMP_SUPP:
1849 		supported = fpu_user_cfg.max_features |	fpu_user_cfg.legacy_features;
1850 		return put_user(supported, uptr);
1851 
1852 	case ARCH_GET_XCOMP_PERM:
1853 		/*
1854 		 * Lockless snapshot as it can also change right after the
1855 		 * dropping the lock.
1856 		 */
1857 		permitted = xstate_get_host_group_perm();
1858 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1859 		return put_user(permitted, uptr);
1860 
1861 	case ARCH_GET_XCOMP_GUEST_PERM:
1862 		permitted = xstate_get_guest_group_perm();
1863 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1864 		return put_user(permitted, uptr);
1865 
1866 	case ARCH_REQ_XCOMP_GUEST_PERM:
1867 		guest = true;
1868 		fallthrough;
1869 
1870 	case ARCH_REQ_XCOMP_PERM:
1871 		if (!IS_ENABLED(CONFIG_X86_64))
1872 			return -EOPNOTSUPP;
1873 
1874 		return xstate_request_perm(idx, guest);
1875 
1876 	default:
1877 		return -EINVAL;
1878 	}
1879 }
1880 
1881 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1882 /*
1883  * Report the amount of time elapsed in millisecond since last AVX512
1884  * use in the task. Report -1 if no AVX-512 usage.
1885  */
1886 static void avx512_status(struct seq_file *m, struct task_struct *task)
1887 {
1888 	unsigned long timestamp;
1889 	long delta = -1;
1890 
1891 	/* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
1892 	if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
1893 		return;
1894 
1895 	timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
1896 
1897 	if (timestamp) {
1898 		delta = (long)(jiffies - timestamp);
1899 		/*
1900 		 * Cap to LONG_MAX if time difference > LONG_MAX
1901 		 */
1902 		if (delta < 0)
1903 			delta = LONG_MAX;
1904 		delta = jiffies_to_msecs(delta);
1905 	}
1906 
1907 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1908 	seq_putc(m, '\n');
1909 }
1910 
1911 /*
1912  * Report architecture specific information
1913  */
1914 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1915 			struct pid *pid, struct task_struct *task)
1916 {
1917 	/*
1918 	 * Report AVX512 state if the processor and build option supported.
1919 	 */
1920 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1921 		avx512_status(m, task);
1922 
1923 	return 0;
1924 }
1925 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1926 
1927 #ifdef CONFIG_COREDUMP
/* Name field of the XSAVE layout note, emitted including its NUL byte */
static const char owner_name[] = "LINUX";
1929 
1930 /*
1931  * Dump type, size, offset and flag values for every xfeature that is present.
1932  */
1933 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1934 {
1935 	int num_records = 0;
1936 	int i;
1937 
1938 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1939 		struct x86_xfeat_component xc = {
1940 			.type   = i,
1941 			.size   = xstate_sizes[i],
1942 			.offset = xstate_offsets[i],
1943 			/* reserved for future use */
1944 			.flags  = 0,
1945 		};
1946 
1947 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1948 			return 0;
1949 
1950 		num_records++;
1951 	}
1952 	return num_records;
1953 }
1954 
1955 static u32 get_xsave_desc_size(void)
1956 {
1957 	u32 cnt = 0;
1958 	u32 i;
1959 
1960 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1961 		cnt++;
1962 
1963 	return cnt * (sizeof(struct x86_xfeat_component));
1964 }
1965 
1966 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1967 {
1968 	int num_records = 0;
1969 	struct elf_note en;
1970 
1971 	if (!fpu_user_cfg.max_features)
1972 		return 0;
1973 
1974 	en.n_namesz = sizeof(owner_name);
1975 	en.n_descsz = get_xsave_desc_size();
1976 	en.n_type = NT_X86_XSAVE_LAYOUT;
1977 
1978 	if (!dump_emit(cprm, &en, sizeof(en)))
1979 		return 1;
1980 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1981 		return 1;
1982 	if (!dump_align(cprm, 4))
1983 		return 1;
1984 
1985 	num_records = dump_xsave_layout_desc(cprm);
1986 	if (!num_records)
1987 		return 1;
1988 
1989 	/* Total size should be equal to the number of records */
1990 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1991 		return 1;
1992 
1993 	return 0;
1994 }
1995 
1996 int elf_coredump_extra_notes_size(void)
1997 {
1998 	int size;
1999 
2000 	if (!fpu_user_cfg.max_features)
2001 		return 0;
2002 
2003 	/* .note header */
2004 	size  = sizeof(struct elf_note);
2005 	/*  Name plus alignment to 4 bytes */
2006 	size += roundup(sizeof(owner_name), 4);
2007 	size += get_xsave_desc_size();
2008 
2009 	return size;
2010 }
2011 #endif /* CONFIG_COREDUMP */
2012