// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/coredump.h>
#include <linux/sort.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/cpuid/api.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include <uapi/asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (unused)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"APX registers",
	"unknown xstate feature",
};

static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_APX]				= X86_FEATURE_APX,
};

static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/*
 * Ordering of xstate components in uncompacted format:  The xfeature
 * number does not necessarily indicate its position in the XSAVE buffer.
 * This array defines the traversal order of xstate features.
 */
static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};

static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
{
	for (; xfeature_uncompact_order[i] != -1; i++) {
		if (mask & BIT_ULL(xfeature_uncompact_order[i]))
			break;
	}

	return i;
}

/* Iterate xstate features in uncompacted order: */
#define for_each_extended_xfeature_in_order(i, mask)	\
	for (i = 0;					\
	     i = next_xfeature_order(i, mask),		\
	     xfeature_uncompact_order[i] != -1;		\
	     i++)

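/*
 * Illustrative usage sketch (not part of the build; the mask below is
 * hypothetical): walking the enabled extended features in ascending
 * buffer-offset order rather than by feature number:
 *
 *	unsigned int i;
 *	u64 mask = XFEATURE_MASK_YMM | XFEATURE_MASK_PKRU;
 *
 *	for_each_extended_xfeature_in_order(i, mask)
 *		pr_debug("xfeature %u\n", xfeature_uncompact_order[i]);
 */
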
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)

/*
 * Return whether the system supports a given xfeature.
 *
 * Also return the name of the (most advanced) feature that the caller requested:
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

	if (unlikely(feature_name)) {
		long xfeature_idx, max_idx;
		u64 xfeatures_print;
		/*
		 * Use fls64() so that we print the most advanced feature
		 * that was requested but is missing. If a driver asks for
		 * "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we print the
		 * missing AVX feature - this is the most informative
		 * message for users:
		 */
		if (xfeatures_missing)
			xfeatures_print = xfeatures_missing;
		else
			xfeatures_print = xfeatures_needed;

		xfeature_idx = fls64(xfeatures_print)-1;
		max_idx = ARRAY_SIZE(xfeature_names)-1;
		xfeature_idx = min(xfeature_idx, max_idx);

		*feature_name = xfeature_names[xfeature_idx];
	}

	if (xfeatures_missing)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
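
/*
 * Example usage (a minimal sketch of a typical caller; the error path
 * and message wording are illustrative, not from this file):
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name)) {
 *		pr_info("AVX support missing: '%s'\n", name);
 *		return -ENODEV;
 *	}
 */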

static bool xfeature_is_aligned64(int xfeature_nr)
{
	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
}

static bool xfeature_is_supervisor(int xfeature_nr)
{
	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
}

static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
	unsigned int offs, i;

	/*
	 * Non-compacted format and legacy features use the cached fixed
	 * offsets.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
	    xfeature <= XFEATURE_SSE)
		return xstate_offsets[xfeature];

	/*
	 * Compacted format offsets depend on the actual content of the
	 * compacted xsave area which is determined by the xcomp_bv header
	 * field.
	 */
	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	for_each_extended_xfeature(i, xcomp_bv) {
		if (xfeature_is_aligned64(i))
			offs = ALIGN(offs, 64);
		if (i == xfeature)
			break;
		offs += xstate_sizes[i];
	}
	return offs;
}
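
/*
 * Worked example for the compacted path (illustrative numbers; the
 * real sizes come from CPUID): the first extended component always
 * starts at FXSAVE_SIZE + XSAVE_HDR_SIZE = 512 + 64 = 576. If an
 * 8-byte component sits at 576 and the next enabled component has the
 * ALIGNED64 flag set, that component lands at ALIGN(576 + 8, 64) = 640.
 */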

/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 */
void fpu__init_cpu_xstate(void)
{
	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
		return;

	cr4_set_bits(X86_CR4_OSXSAVE);

	/*
	 * Must happen after CR4 setup and before xsetbv() to allow KVM
	 * lazy passthrough.  Write the MSR independent of the dynamic
	 * state static key, as that key cannot be used on the boot CPU.
	 * This also ensures that any stale state is wiped out from XFD.
	 * Reset the per-CPU xfd cache too.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XFD))
		xfd_set_state(init_fpstate.xfd);

	/*
	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
	 * states can be set here.
	 */
	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

	/*
	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
	 */
	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
				     xfeatures_mask_independent());
	}
}

static bool xfeature_enabled(enum xfeature xfeature)
{
	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}

static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
{
	return xstate_offsets[*(unsigned int *)xfeature1] -
	       xstate_offsets[*(unsigned int *)xfeature2];
}

/*
 * Record the offsets and sizes of various xstates contained
 * in the XSAVE state memory layout. Also, create an ordered
 * list of xfeatures for handling out-of-order offsets.
 */
static void __init setup_xstate_cache(void)
{
	u32 eax, ebx, ecx, edx, xfeature, i = 0;
	/*
	 * The FP xstates and SSE xstates are legacy states. They are always
	 * at fixed offsets in the xsave area in either compacted form or
	 * standard form.
	 */
	xstate_offsets[XFEATURE_FP]	= 0;
	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
						   xmm_space);

	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
						       xmm_space);

	for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
		cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);

		xstate_sizes[xfeature] = eax;
		xstate_flags[xfeature] = ecx;

		/*
		 * If an xfeature is supervisor state, the offset in EBX is
		 * invalid; leave it as -1.
		 */
		if (xfeature_is_supervisor(xfeature))
			continue;

		xstate_offsets[xfeature] = ebx;

		/* Populate the list of xfeatures before sorting */
		xfeature_uncompact_order[i++] = xfeature;
	}

	/*
	 * Sort xfeatures by their offsets to support out-of-order
	 * offsets in the uncompacted format.
	 */
	sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
}

/*
 * Print out all the supported xstate features:
 */
static void __init print_xstate_features(void)
{
	int i;

	for (i = 0; i < XFEATURE_MAX; i++) {
		u64 mask = BIT_ULL(i);
		const char *name;

		if (cpu_has_xfeatures(mask, &name))
			pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
	}
}

/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 */
#define CHECK_XFEATURE(nr) do {		\
	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON(nr >= XFEATURE_MAX);	\
} while (0)

/*
 * Print out xstate component offsets and sizes
 */
static void __init print_xstate_offset_size(void)
{
	int i;

	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
			i, xstate_sizes[i]);
	}
}

/*
 * This function is called only during boot, when the x86 caps are not
 * set up yet and alternatives cannot be used.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
	u32 lmask = mask;
	u32 hmask = mask >> 32;
	int err;

	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
	else
		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

	/*
	 * We should never fault when copying from a kernel buffer, and the FPU
	 * state we set at boot time should be valid.
	 */
	WARN_ON_FPU(err);
}

/*
 * All supported features have either an all-zeros init state or are
 * handled individually in setup_init_fpu_buf(). This is deliberately
 * an explicit feature list rather than XFEATURE_MASK*SUPPORTED, so
 * that newly added supported features trip the build-time check below
 * and make people actually look at the init state for the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE |			\
	 XFEATURE_MASK_APX)

/*
 * setup the xstate image representing the init state
 */
static void __init setup_init_fpu_buf(void)
{
	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
		     XFEATURES_INIT_FPSTATE_HANDLED);

	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return;

	print_xstate_features();

	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

	/*
	 * Init all the features state with header.xfeatures being 0x0
	 */
	os_xrstor_booting(&init_fpstate.regs.xsave);

	/*
	 * All components are now in init state. Read the state back so
	 * that init_fpstate contains all non-zero init state. This only
	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
	 * those use the init optimization which skips writing data for
	 * components in init state.
	 *
	 * XSAVE could be used, but that would require reshuffling the
	 * data when XSAVEC/S is available, because XSAVEC/S uses xstate
	 * compaction. Doing so would be a pointless exercise because most
	 * components have an all-zeros init state except for the legacy
	 * ones (FP and SSE). Those can be saved with FXSAVE into the
	 * legacy area. Adding a new feature requires ensuring that its
	 * init state is all zeroes, or adding the necessary handling
	 * here if it is not.
	 */
	fxsave(&init_fpstate.regs.fxsave);
}

int xfeature_size(int xfeature_nr)
{
	u32 eax, ebx, ecx, edx;

	CHECK_XFEATURE(xfeature_nr);
	cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
	return eax;
}

/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
static int validate_user_xstate_header(const struct xstate_header *hdr,
				       struct fpstate *fpstate)
{
	/* No unknown or supervisor features may be set */
	if (hdr->xfeatures & ~fpstate->user_xfeatures)
		return -EINVAL;

	/* Userspace must use the uncompacted format */
	if (hdr->xcomp_bv)
		return -EINVAL;

	/*
	 * If 'reserved' is shrunk to add a new field, make sure to validate
	 * that new field here!
	 */
	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

	/* No reserved bits may be set */
	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
		return -EINVAL;

	return 0;
}
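
/*
 * Illustrative header values, assuming user_xfeatures contains only
 * FP, SSE and YMM (mask 0x7); these examples are not from this file:
 *
 *	.xfeatures = 0x3, .xcomp_bv = 0          -> accepted
 *	.xfeatures = 0x7, .xcomp_bv = BIT(63)|7  -> rejected (compacted)
 *	.xfeatures = 0x100                       -> rejected (not permitted)
 */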

static void __init __xstate_dump_leaves(void)
{
	int i;
	u32 eax, ebx, ecx, edx;
	static int should_dump = 1;

	if (!should_dump)
		return;
	should_dump = 0;
	/*
	 * Dump out a few leaves past the ones that we support
	 * just in case there are some goodies up there
	 */
	for (i = 0; i < XFEATURE_MAX + 10; i++) {
		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
			CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
	}
}

#define XSTATE_WARN_ON(x, fmt, ...) do {					\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)

#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE(sz != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})


/**
 * check_xtile_data_against_struct - Check tile data state size.
 *
 * Calculate the state size by multiplying the single tile size, which
 * is recorded in a C struct, by the number of tiles that the CPU
 * reports. Compare the provided size with the calculation.
 *
 * @size:	The tile data state size
 *
 * Returns:	0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
	u32 max_palid, palid, state_size;
	u32 eax, ebx, ecx, edx;
	u16 max_tile;

	/*
	 * Check the maximum palette id:
	 *   eax: the highest numbered palette subleaf.
	 */
	cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);

	/*
	 * Cross-check each tile size and find the maximum number of
	 * supported tiles.
	 */
	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
		u16 tile_size, max;

		/*
		 * Check the tile size info:
		 *   eax[31:16]:  bytes per tile
		 *   ebx[31:16]:  the max names (or max number of tiles)
		 */
		cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx);
		tile_size = eax >> 16;
		max = ebx >> 16;

		if (tile_size != sizeof(struct xtile_data)) {
			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
			       __stringify(XFEATURE_XTILE_DATA),
			       sizeof(struct xtile_data), tile_size);
			__xstate_dump_leaves();
			return -EINVAL;
		}

		if (max > max_tile)
			max_tile = max;
	}

	state_size = sizeof(struct xtile_data) * max_tile;
	if (size != state_size) {
		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
		__xstate_dump_leaves();
		return -EINVAL;
	}
	return 0;
}

/*
 * We have a C struct for each 'xstate'.  We need to ensure
 * that our software representation matches what the CPU
 * tells us about the state's size.
 */
static bool __init check_xstate_against_struct(int nr)
{
	/*
	 * Ask the CPU for the size of the state.
	 */
	int sz = xfeature_size(nr);

	/*
	 * Match each CPU state with the corresponding software
	 * structure.
	 */
	switch (nr) {
	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
	case XFEATURE_APX:        return XCHECK_SZ(sz, nr, struct apx_state);
	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
	default:
		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
		return false;
	}

	return true;
}

static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
	unsigned int topmost = fls64(xfeatures) - 1;
	unsigned int offset, i;

	if (topmost <= XFEATURE_SSE)
		return sizeof(struct xregs_state);

	if (compacted) {
		offset = xfeature_get_offset(xfeatures, topmost);
	} else {
		/* Walk through the xfeature order to pick the last */
		for_each_extended_xfeature_in_order(i, xfeatures)
			topmost = xfeature_uncompact_order[i];
		offset = xstate_offsets[topmost];
	}

	return offset + xstate_sizes[topmost];
}
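
/*
 * Illustrative arithmetic (made-up layout): if the topmost enabled
 * feature in the non-compacted format is YMM with
 * xstate_offsets[XFEATURE_YMM] == 576 and
 * xstate_sizes[XFEATURE_YMM] == 256, the required buffer size is
 * 576 + 256 = 832 bytes. In the compacted case the offset comes from
 * xfeature_get_offset() instead of the cached uncompacted offset.
 */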

/*
 * This essentially double-checks what the cpu told us about
 * how large the XSAVE buffer needs to be.  We are recalculating
 * it to be safe.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered by these checks. Only the size of the buffer for task->fpu
 * is checked here.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	int i;

	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
		if (!check_xstate_against_struct(i))
			return false;
		/*
		 * Supervisor state components can be managed only by
		 * XSAVES.
		 */
		if (!xsaves && xfeature_is_supervisor(i)) {
			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
			return false;
		}
	}
	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
	XSTATE_WARN_ON(size != kernel_size,
		       "size %u != kernel_size %u\n", size, kernel_size);
	return size == kernel_size;
}

/*
 * Get total size of enabled xstates in XCR0 | IA32_XSS.
 *
 * Note the SDM's wording here.  "sub-function 0" only enumerates
 * the size of the *user* states.  If we use it to size a buffer
 * that we use 'XSAVES' on, we could potentially overflow the
 * buffer because 'XSAVES' saves system states too.
 *
 * This also takes compaction into account. So this works for
 * XSAVEC as well.
 */
static unsigned int __init get_compacted_size(void)
{
	unsigned int eax, ebx, ecx, edx;
	/*
	 * - CPUID function 0DH, sub-function 1:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVES instruction for an XSAVE area
	 *    containing all the state components
	 *    corresponding to bits currently set in
	 *    XCR0 | IA32_XSS.
	 *
	 * When XSAVES is not available but XSAVEC is (virt), then there
	 * are no supervisor states, but XSAVEC still uses compacted
	 * format.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
	return ebx;
}

/*
 * Get the total size of the enabled xstates without the independent supervisor
 * features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
	u64 mask = xfeatures_mask_independent();
	unsigned int size;

	if (!mask)
		return get_compacted_size();

	/* Disable independent features. */
	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());

	/*
	 * Ask the hardware what size is required of the buffer.
	 * This is the size required for the task->fpu buffer.
	 */
	size = get_compacted_size();

	/* Re-enable independent features so XSAVES will work on them again. */
	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);

	return size;
}

static unsigned int __init get_xsave_size_user(void)
{
	unsigned int eax, ebx, ecx, edx;
	/*
	 * - CPUID function 0DH, sub-function 0:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVE instruction for an XSAVE area
	 *    containing all the *user* state components
	 *    corresponding to bits currently set in XCR0.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
	return ebx;
}

static int __init init_xstate_size(void)
{
	/* Recompute the context size for enabled features: */
	unsigned int user_size, kernel_size, kernel_default_size;
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);

	/* Uncompacted user space size */
	user_size = get_xsave_size_user();

	/*
	 * XSAVES kernel size includes supervisor states and uses compacted
	 * format. XSAVEC uses compacted format, but does not save
	 * supervisor states.
	 *
	 * XSAVE[OPT] do not support supervisor states so kernel and user
	 * size is identical.
	 */
	if (compacted)
		kernel_size = get_xsave_compacted_size();
	else
		kernel_size = user_size;

	kernel_default_size =
		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

	if (!paranoid_xstate_size_valid(kernel_size))
		return -EINVAL;

	fpu_kernel_cfg.max_size = kernel_size;
	fpu_user_cfg.max_size = user_size;

	fpu_kernel_cfg.default_size = kernel_default_size;
	fpu_user_cfg.default_size =
		xstate_calculate_size(fpu_user_cfg.default_features, false);

	return 0;
}

/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
	pr_info("x86/fpu: XSAVE disabled\n");

	fpu_kernel_cfg.max_features = 0;
	cr4_clear_bits(X86_CR4_OSXSAVE);
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);

	/* Restore the legacy size. */
	fpu_kernel_cfg.max_size = legacy_size;
	fpu_kernel_cfg.default_size = legacy_size;
	fpu_user_cfg.max_size = legacy_size;
	fpu_user_cfg.default_size = legacy_size;

	/*
	 * Prevent enabling the static branch which enables writes to the
	 * XFD MSR.
	 */
	init_fpstate.xfd = 0;

	fpstate_reset(x86_task_fpu(current));
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
	unsigned int eax, ebx, ecx, edx;
	u64 xfeatures;
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_FPU)) {
		pr_info("x86/fpu: No FPU detected\n");
		return;
	}

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: x87 FPU will use %s\n",
			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
		return;
	}

	/*
	 * Find user xstates supported by the processor.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

	/*
	 * Find supervisor xstates supported by the processor.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		/*
		 * This indicates that something really unexpected happened
		 * with the enumeration.  Disable XSAVE and try to continue
		 * booting without it.  This is too early to BUG().
		 */
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
	    fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
		/*
		 * This is a problematic CPU configuration where two
		 * conflicting state components are both enumerated.
		 */
		pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
					      XFEATURE_MASK_INDEPENDENT;

	/*
	 * Clear XSAVE features that are disabled in the normal CPUID.
	 */
	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
		unsigned short cid = xsave_cpuid_features[i];

		/* Careful: X86_FEATURE_FPU is 0! */
		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
	}

	if (!cpu_feature_enabled(X86_FEATURE_XFD))
		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
	else
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
					XFEATURE_MASK_SUPERVISOR_SUPPORTED;

	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

	/* Clean out dynamic features from default */
	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Store it for paranoia check at the end */
	xfeatures = fpu_kernel_cfg.max_features;

	/*
	 * Initialize the default XFD state in init_fpstate and enable the
	 * dynamic sizing mechanism if dynamic states are available.  The
	 * static key cannot be enabled here because this runs before
	 * jump_label_init(). This is delayed to an initcall.
	 */
	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

	/* Set up compaction feature bit */
	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
	    cpu_feature_enabled(X86_FEATURE_XSAVES))
		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();

	/* Cache size, offset and flags for initialization */
	setup_xstate_cache();

	err = init_xstate_size();
	if (err)
		goto out_disable;

	/*
	 * Update info used for ptrace frames; use standard-format size and no
	 * supervisor xstates:
	 */
	update_regset_xstate_info(fpu_user_cfg.max_size,
				  fpu_user_cfg.max_features);

	/*
	 * init_fpstate excludes dynamic states as they are large but init
	 * state is zero.
	 */
	init_fpstate.size		= fpu_kernel_cfg.default_size;
	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;

	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
			sizeof(init_fpstate.regs), init_fpstate.size);
		goto out_disable;
	}

	setup_init_fpu_buf();

	/*
	 * Paranoia check whether something in the setup modified the
	 * xfeatures mask.
	 */
	if (xfeatures != fpu_kernel_cfg.max_features) {
		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
		       xfeatures, fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * CPU capabilities initialization runs before FPU init. So
	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
	 * functional, set the feature bit so depending code works.
	 */
	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

	print_xstate_offset_size();
	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		fpu_kernel_cfg.max_features,
		fpu_kernel_cfg.max_size,
		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
	return;

out_disable:
	/* something went wrong, try to boot without any XSAVE support */
	fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
	/*
	 * Restore XCR0 on xsave capable CPUs:
	 */
	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

	/*
	 * Restore IA32_XSS. The same CPUID bit enumerates support
	 * of XSAVES and MSR_IA32_XSS.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
				     xfeatures_mask_independent());
	}

	if (fpu_state_size_dynamic())
		wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
}

/*
 * Given an xstate feature nr, calculate where in the xsave
 * buffer the state is.  Callers should ensure that the buffer
 * is valid.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
	u64 xcomp_bv = xsave->header.xcomp_bv;

	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
			return NULL;
	}

	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}

/*
 * Given the xsave area and a state inside, this function returns the
 * address of the state.
 *
 * This is the API that is called to get xstate address in either
 * standard format or compacted format of xsave area.
 *
 * Note that if there is no data for the field in the xsave buffer
 * this will return NULL.
 *
 * Inputs:
 *	xstate: the thread's storage area for all FPU data
 *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *	XFEATURE_SSE, etc...)
 * Output:
 *	address of the state in the xsave area, or NULL if the
 *	field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
	/*
	 * Do we even *have* xsave state?
	 */
	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return NULL;

	/*
	 * We should not ever be requesting features that we
	 * have not enabled.
	 */
	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	/*
	 * This assumes that the last 'xsave*' instruction requested that
	 * 'xfeature_nr' be saved. If it did not, we might be seeing an
	 * old value of the field in the buffer.
	 *
	 * This can happen because the last 'xsave' did not
	 * request that this feature be saved (unlikely)
	 * or because the "init optimization" caused it
	 * to not be saved.
	 */
	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
		return NULL;

	return __raw_xsave_addr(xsave, xfeature_nr);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
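
/*
 * Example (sketch; 'fpstate' and 'val' are assumed to exist in the
 * caller): reading the saved PKRU value out of an xsave buffer. The
 * NULL check matters because of the init optimization noted above.
 *
 *	struct pkru_state *pk;
 *
 *	pk = get_xsave_addr(&fpstate->regs.xsave, XFEATURE_PKRU);
 *	val = pk ? pk->pkru : 0;	// init state means PKRU is zero
 */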

/*
 * Given an xstate feature nr, calculate where in the xsave buffer the state is.
 * The xsave buffer should be in standard format, not compacted (e.g. user mode
 * signal frames).
 */
void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
{
	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	return (void __user *)xsave + xstate_offsets[xfeature_nr];
}

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify the PKRU register to set the access
 * rights for @pkey to @init_val.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed.
	 */
	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
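
/*
 * Worked example (assuming the usual two bits per key): for pkey 2
 * with PKEY_DISABLE_WRITE, pkey_shift is 2 * PKRU_BITS_PER_PKEY = 4,
 * so PKRU_WD_BIT (0x2) becomes 0x20 and only bits 5:4 of PKRU are
 * replaced; all other keys keep their access rights.
 */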
#endif /* CONFIG_ARCH_HAS_PKEYS */

static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
			 void *init_xstate, unsigned int size)
{
	membuf_write(to, from_xstate ? xstate : init_xstate, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @fpstate:	The fpstate buffer from which to copy
 * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:	The PKRU value to store in the PKRU component
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u64 xfeatures, u32 pkru_val,
			       enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	unsigned int zerofrom, i, xfeature;
	struct xstate_header header;
	u64 mask;

	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
		break;
	}

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * This 'mask' indicates which states to copy from fpstate.
	 * Those extended states that are not present in fpstate are
	 * either disabled or initialized:
	 *
	 * In non-compacted format, disabled features still occupy
	 * state space but there is no state to copy from in the
	 * compacted init_fpstate. The gap tracking will zero these
	 * states.
	 *
	 * The extended features have an all zeroes init state. Thus,
	 * remove them from 'mask' to zero those features in the user
	 * buffer instead of retrieving them from init_fpstate.
	 */
	mask = header.xfeatures;

	for_each_extended_xfeature_in_order(i, mask) {
		xfeature = xfeature_uncompact_order[i];
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[xfeature])
			membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);

		if (xfeature == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			membuf_write(&to,
				     __raw_xsave_addr(xsave, xfeature),
				     xstate_sizes[xfeature]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
	}

out:
	if (to.left)
		membuf_zero(&to, to.left);
}
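
/*
 * Gap-zeroing example (illustrative offsets): if YMM was copied to
 * uncompacted offset 576 with size 256, zerofrom becomes 832. If the
 * next enabled component starts at offset 960, membuf_zero() first
 * emits 960 - 832 = 128 zero bytes so the UABI layout stays intact.
 */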

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @tsk:	The task from which to copy the saved xstate
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
			     enum xstate_copy_mode copy_mode)
{
	__copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
				  x86_task_fpu(tsk)->fpstate->user_xfeatures,
				  tsk->thread.pkru, copy_mode);
}

static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
			    const void *kbuf, const void __user *ubuf)
{
	if (kbuf) {
		memcpy(dst, kbuf + offset, size);
	} else {
		if (copy_from_user(dst, ubuf + offset, size))
			return -EFAULT;
	}
	return 0;
}


/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:	The fpstate buffer to copy to
 * @kbuf:	The UABI format buffer, if it comes from the kernel
 * @ubuf:	The UABI format buffer, if it comes from userspace
 * @pkru:	The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.	When called from sigreturn the PKRU register will be restored from
 *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *	@fpstate is sufficient to cover this case, but the caller will also
 *	pass a pointer to the thread_struct's pkru field in @pkru and updating
 *	it is harmless.
 * 2.	When called from ptrace the PKRU register will be restored from the
 *	thread_struct's pkru field. A pointer to that is passed in @pkru.
 *	The kernel will restore it manually, so the XRSTOR behavior that resets
 *	the PKRU register to the hardware init value (0) if the corresponding
 *	xfeatures bit is not set is emulated here.
 * 3.	When called from KVM the PKRU register will be restored from the vcpu's
 *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *	bit is not set.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
			       const void __user *ubuf, u32 *pkru)
{
	struct xregs_state *xsave = &fpstate->regs.xsave;
	unsigned int offset, size;
	struct xstate_header hdr;
	u64 mask;
	int i;

	offset = offsetof(struct xregs_state, header);
	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
		return -EFAULT;

	if (validate_user_xstate_header(&hdr, fpstate))
		return -EINVAL;

	/* Validate MXCSR when any of the related features is in use */
	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
	if (hdr.xfeatures & mask) {
		u32 mxcsr[2];

		offset = offsetof(struct fxregs_state, mxcsr);
		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
			return -EFAULT;

		/* Reserved bits in MXCSR must be zero. */
		if (mxcsr[0] & ~mxcsr_feature_mask)
			return -EINVAL;

		/* SSE and YMM require MXCSR even when FP is not in use. */
		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
			xsave->i387.mxcsr = mxcsr[0];
			xsave->i387.mxcsr_mask = mxcsr[1];
		}
	}

	for (i = 0; i < XFEATURE_MAX; i++) {
		mask = BIT_ULL(i);

		if (hdr.xfeatures & mask) {
			void *dst = __raw_xsave_addr(xsave, i);

			offset = xstate_offsets[i];
			size = xstate_sizes[i];

			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
				return -EFAULT;
		}
	}

	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
		struct pkru_state *xpkru;

		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
		*pkru = xpkru->pkru;
	} else {
		/*
		 * KVM may pass NULL here to indicate that it does not need
		 * PKRU updated.
		 */
		if (pkru)
			*pkru = 0;
	}

	/*
	 * The state that came in from userspace was user-state only.
	 * Mask all the user states out of 'xfeatures':
	 */
	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

	/*
	 * Add back in the features that came in from userspace:
	 */
	xsave->header.xfeatures |= hdr.xfeatures;

	return 0;
}

/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}

/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
				      const void __user *ubuf)
{
	return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
}

static bool validate_independent_components(u64 mask)
{
	u64 xchk;

	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
		return false;

	xchk = ~xfeatures_mask_independent();

	if (WARN_ON_ONCE(!mask || mask & xchk))
		return false;

	return true;
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate:	Pointer to the buffer
 * @mask:	Feature mask to select the components to save
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized as
 * XSAVES does not write the full xstate header. Before first use the
 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
 * can #GP.
 *
 * The feature mask must be a subset of the independent features.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
	int err;

	if (!validate_independent_components(mask))
		return;

	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
	WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate:	Pointer to the buffer
 * @mask:	Feature mask to select the components to restore
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized
 * otherwise XRSTORS from that buffer can #GP.
 *
 * Proper usage is to restore the state which was saved with
 * xsaves() into @xstate.
 *
 * The feature mask must be a subset of the independent features.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
	int err;

	if (!validate_independent_components(mask))
		return;

	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
	WARN_ON_ONCE(err);
}
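
/*
 * Usage sketch for the pair above (the mask and buffer handling are
 * illustrative): @buf must be 64-byte aligned and zeroed before first
 * use, and @mask restricted to the independent features:
 *
 *	xsaves(buf, mask);	// stash the independent component(s)
 *	...			// the hardware state may change here
 *	xrstors(buf, mask);	// bring the saved state back
 */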

#if IS_ENABLED(CONFIG_KVM)
void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
{
	void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);

	if (addr)
		memset(addr, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
	u64 xfd = __this_cpu_read(xfd_state);

	if (fpstate->xfd == xfd)
		return true;

	/*
	 * The XFD MSR does not match fpstate->xfd. That's invalid when
	 * the passed in fpstate is current's fpstate.
	 */
	if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
		return false;

	/*
	 * XRSTOR(S) from init_fpstate are always correct as it will just
	 * bring all components into init state and not read from the
	 * buffer. XSAVE(S) raises #PF after init.
	 */
	if (fpstate == &init_fpstate)
		return rstor;

	/*
	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
	 * XRSTORS(S): fpu_swap_kvm_fpstate()
	 */

	/*
	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
	 * the buffer area for XFD-disabled state components.
	 */
	mask &= ~xfd;

	/*
	 * Remove features which are valid in fpstate. They
	 * have space allocated in fpstate.
	 */
	mask &= ~fpstate->xfeatures;

	/*
	 * Any remaining state components in 'mask' might be written
	 * by XSAVE/XRSTOR. Fail validation if any are found.
	 */
	return !mask;
}

void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
#endif /* CONFIG_X86_DEBUG_FPU */

static int __init xfd_update_static_branch(void)
{
	/*
	 * If init_fpstate.xfd has bits set then dynamic features are
	 * available and the dynamic sizing must be enabled.
	 */
	if (init_fpstate.xfd)
		static_branch_enable(&__fpu_state_size_dynamic);
	return 0;
}
arch_initcall(xfd_update_static_branch)

void fpstate_free(struct fpu *fpu)
{
	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
		vfree(fpu->fpstate);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:	A bitmap of xstate features which extend the enabled features
 *		of that task
 * @ksize:	The required size for the kernel buffer
 * @usize:	The required size for user space buffers
 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
			   unsigned int usize, struct fpu_guest *guest_fpu)
{
	struct fpu *fpu = x86_task_fpu(current);
	struct fpstate *curfps, *newfps = NULL;
	unsigned int fpsize;
	bool in_use;

	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

	newfps = vzalloc(fpsize);
	if (!newfps)
		return -ENOMEM;
	newfps->size = ksize;
	newfps->user_size = usize;
	newfps->is_valloc = true;

	/*
	 * When a guest FPU is supplied, use @guest_fpu->fpstate as the
	 * reference, independent of whether it is in use or not.
	 */
	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

	/* Determine whether @curfps is the active fpstate */
	in_use = fpu->fpstate == curfps;

	if (guest_fpu) {
		newfps->is_guest = true;
		newfps->is_confidential = curfps->is_confidential;
		newfps->in_use = curfps->in_use;
		guest_fpu->xfeatures |= xfeatures;
		guest_fpu->uabi_size = usize;
	}

	fpregs_lock();
	/*
	 * If @curfps is in use, ensure that the current state is in the
	 * registers before swapping fpstate as that might invalidate it
	 * due to layout changes.
	 */
	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpregs_restore_userregs();

	newfps->xfeatures = curfps->xfeatures | xfeatures;
	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
	newfps->xfd = curfps->xfd & ~xfeatures;

	/* Do the final updates within the locked region */
	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

	if (guest_fpu) {
		guest_fpu->fpstate = newfps;
		/* If curfps is active, update the FPU fpstate pointer */
		if (in_use)
			fpu->fpstate = newfps;
	} else {
		fpu->fpstate = newfps;
	}

	if (in_use)
		xfd_update_state(fpu->fpstate);
	fpregs_unlock();

	/* Only free valloc'ed state */
	if (curfps && curfps->is_valloc)
		vfree(curfps);

	return 0;
}

static int validate_sigaltstack(unsigned int usize)
{
	struct task_struct *thread, *leader = current->group_leader;
	unsigned long framesize = get_sigframe_size();

	lockdep_assert_held(&current->sighand->siglock);

	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
	framesize -= fpu_user_cfg.max_size;
	framesize += usize;
	for_each_thread(leader, thread) {
		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
			return -ENOSPC;
	}
	return 0;
}

static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
	/*
	 * This deliberately does not exclude !XSAVES as we still might
	 * decide to optionally context switch XCR0 or talk the silicon
	 * vendors into extending XFD for the pre AMX states, especially
	 * AVX512.
	 */
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	struct fpu *fpu = x86_task_fpu(current->group_leader);
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	u64 mask;
	int ret = 0;

	/* Check whether fully enabled */
	if ((permitted & requested) == requested)
		return 0;

	/*
	 * Calculate the resulting kernel state size.  Note, @permitted also
	 * contains supervisor xfeatures even though supervisor xfeatures are
	 * always permitted for kernel and guest FPUs, and never permitted
	 * for user FPUs.
	 */
	mask = permitted | requested;
	ksize = xstate_calculate_size(mask, compacted);

	/*
	 * Calculate the resulting user state size.  Take care not to clobber
	 * the supervisor xfeatures in the new mask!
	 */
	usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);

	if (!guest) {
		ret = validate_sigaltstack(usize);
		if (ret)
			return ret;
	}

	perm = guest ? &fpu->guest_perm : &fpu->perm;
	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
	WRITE_ONCE(perm->__state_perm, mask);
	/* Protected by sighand lock */
	perm->__state_size = ksize;
	perm->__user_state_size = usize;
	return ret;
}

/*
 * Permissions array to map facilities with more than one component
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

static int xstate_request_perm(unsigned long idx, bool guest)
{
	u64 permitted, requested;
	int ret;

	if (idx >= XFEATURE_MAX)
		return -EINVAL;

	/*
	 * Look up the facility mask which can require more than
	 * one xstate component.
	 */
	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
	requested = xstate_prctl_req[idx];
	if (!requested)
		return -EOPNOTSUPP;

	if ((fpu_user_cfg.max_features & requested) != requested)
		return -EOPNOTSUPP;

	/* Lockless quick check */
	permitted = xstate_get_group_perm(guest);
	if ((permitted & requested) == requested)
		return 0;

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);
	permitted = xstate_get_group_perm(guest);

	/* First vCPU allocation locks the permissions. */
	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
		ret = -EBUSY;
	else
		ret = __xstate_request_perm(permitted, requested, guest);
	spin_unlock_irq(&current->sighand->siglock);
	return ret;
}
1734 
1735 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1736 {
1737 	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1738 	struct fpu_state_perm *perm;
1739 	unsigned int ksize, usize;
1740 	struct fpu *fpu;
1741 
1742 	if (!xfd_event) {
1743 		if (!guest_fpu)
1744 			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1745 		return 0;
1746 	}
1747 
1748 	/* Protect against concurrent modifications */
1749 	spin_lock_irq(&current->sighand->siglock);
1750 
1751 	/* If not permitted, let it die */
1752 	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1753 		spin_unlock_irq(&current->sighand->siglock);
1754 		return -EPERM;
1755 	}
1756 
1757 	fpu = x86_task_fpu(current->group_leader);
1758 	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1759 	ksize = perm->__state_size;
1760 	usize = perm->__user_state_size;
1761 
1762 	/*
1763 	 * The feature is permitted and the state size is sufficient.  Dropping
1764 	 * the lock is safe here even if more features are added from
1765 	 * another task; the retrieved buffer sizes remain valid for the
1766 	 * currently requested feature(s).
1767 	 */
1768 	spin_unlock_irq(&current->sighand->siglock);
1769 
1770 	/*
1771 	 * Try to allocate a new fpstate. If that fails, there is no way
1772 	 * out.
1773 	 */
1774 	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1775 		return -EFAULT;
1776 	return 0;
1777 }
1778 
1779 int xfd_enable_feature(u64 xfd_err)
1780 {
1781 	return __xfd_enable_feature(xfd_err, NULL);
1782 }
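
/*
 * Call-path sketch (see the #NM handling in traps.c): when a task
 * touches a dynamically-enabled feature such as AMX tile data while
 * XFD is still armed for it, the CPU raises #NM and latches the
 * offending feature bits in MSR_IA32_XFD_ERR.  The trap handler feeds
 * that value in as @xfd_err; a zero return means the enlarged fpstate
 * is installed and the faulting instruction can simply be retried.
 */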
1783 
1784 #else /* CONFIG_X86_64 */
1785 static inline int xstate_request_perm(unsigned long idx, bool guest)
1786 {
1787 	return -EPERM;
1788 }
1789 #endif  /* !CONFIG_X86_64 */
1790 
1791 u64 xstate_get_guest_group_perm(void)
1792 {
1793 	return xstate_get_group_perm(true);
1794 }
1795 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1796 
1797 /**
1798  * fpu_xstate_prctl - xstate permission operations
1799  * @option:	A subfunction of arch_prctl()
1800  * @arg2:	option argument
1801  * Return:	0 if successful; otherwise, an error code
1802  *
1803  * Option arguments:
1804  *
1805  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1806  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1807  * ARCH_REQ_XCOMP_PERM: Facility number requested
1808  *
1809  * For facilities which require more than one XSTATE component, the request
1810  * must be the highest state component number related to that facility.
1811  * E.g. AMX requires XFEATURE_XTILE_CFG(17) and XFEATURE_XTILE_DATA(18),
1812  * so the request must be XFEATURE_XTILE_DATA(18).
1813  */
1814 long fpu_xstate_prctl(int option, unsigned long arg2)
1815 {
1816 	u64 __user *uptr = (u64 __user *)arg2;
1817 	u64 permitted, supported;
1818 	unsigned long idx = arg2;
1819 	bool guest = false;
1820 
1821 	switch (option) {
1822 	case ARCH_GET_XCOMP_SUPP:
1823 		supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1824 		return put_user(supported, uptr);
1825 
1826 	case ARCH_GET_XCOMP_PERM:
1827 		/*
1828 		 * Lockless snapshot as it can also change right after
1829 		 * dropping the lock.
1830 		 */
1831 		permitted = xstate_get_host_group_perm();
1832 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1833 		return put_user(permitted, uptr);
1834 
1835 	case ARCH_GET_XCOMP_GUEST_PERM:
1836 		permitted = xstate_get_guest_group_perm();
1837 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1838 		return put_user(permitted, uptr);
1839 
1840 	case ARCH_REQ_XCOMP_GUEST_PERM:
1841 		guest = true;
1842 		fallthrough;
1843 
1844 	case ARCH_REQ_XCOMP_PERM:
1845 		if (!IS_ENABLED(CONFIG_X86_64))
1846 			return -EOPNOTSUPP;
1847 
1848 		return xstate_request_perm(idx, guest);
1849 
1850 	default:
1851 		return -EINVAL;
1852 	}
1853 }
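
/*
 * Example usage (a userspace sketch, not part of this file; the
 * numeric constants assume the current uapi/asm/prctl.h and xstate
 * numbering):
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	#define ARCH_GET_XCOMP_SUPP	0x1021
 *	#define ARCH_REQ_XCOMP_PERM	0x1023
 *	#define XFEATURE_XTILE_DATA	18
 *
 *	unsigned long long supp;
 *
 *	// Query supported features, then request AMX tile data permission
 *	if (!syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supp) &&
 *	    (supp & (1ULL << XFEATURE_XTILE_DATA)))
 *		syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM,
 *			XFEATURE_XTILE_DATA);
 */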
1854 
1855 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1856 /*
1857  * Report the amount of time elapsed in milliseconds since the last
1858  * AVX512 use in the task.
1859  */
1860 static void avx512_status(struct seq_file *m, struct task_struct *task)
1861 {
1862 	unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
1863 	long delta;
1864 
1865 	if (!timestamp) {
1866 		/*
1867 		 * Report -1 if no AVX512 usage
1868 		 */
1869 		delta = -1;
1870 	} else {
1871 		delta = (long)(jiffies - timestamp);
1872 		/*
1873 		 * Cap to LONG_MAX if time difference > LONG_MAX
1874 		 */
1875 		if (delta < 0)
1876 			delta = LONG_MAX;
1877 		delta = jiffies_to_msecs(delta);
1878 	}
1879 
1880 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1881 	seq_putc(m, '\n');
1882 }
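
/*
 * The resulting /proc/<pid>/arch_status line looks like this (the
 * value is an example):
 *
 *	AVX512_elapsed_ms:	120
 *
 * -1 means the task never used AVX512; otherwise it is the time in
 * milliseconds since the task's AVX512 state was last saved.
 */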
1883 
1884 /*
1885  * Report architecture specific information
1886  */
1887 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1888 			struct pid *pid, struct task_struct *task)
1889 {
1890 	/*
1891 	 * Report AVX512 state if the processor and the build option support it.
1892 	 */
1893 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1894 		avx512_status(m, task);
1895 
1896 	return 0;
1897 }
1898 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1899 
1900 #ifdef CONFIG_COREDUMP
1901 static const char owner_name[] = "LINUX";
1902 
1903 /*
1904  * Dump type, size, offset and flag values for every xfeature that is present.
1905  */
1906 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1907 {
1908 	int num_records = 0;
1909 	int i;
1910 
1911 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1912 		struct x86_xfeat_component xc = {
1913 			.type   = i,
1914 			.size   = xstate_sizes[i],
1915 			.offset = xstate_offsets[i],
1916 			/* reserved for future use */
1917 			.flags  = 0,
1918 		};
1919 
1920 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1921 			return 0;
1922 
1923 		num_records++;
1924 	}
1925 	return num_records;
1926 }
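
/*
 * Example of a record emitted above, using the architecturally fixed
 * standard-format numbers for AVX: the YMM component is xfeature 2 and
 * sits at offset 576 with size 256, so its note entry would read
 * { .type = 2, .size = 256, .offset = 576, .flags = 0 }.
 */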
1927 
1928 static u32 get_xsave_desc_size(void)
1929 {
1930 	u32 cnt = 0;
1931 	u32 i;
1932 
1933 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1934 		cnt++;
1935 
1936 	return cnt * sizeof(struct x86_xfeat_component);
1937 }
1938 
1939 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1940 {
1941 	int num_records = 0;
1942 	struct elf_note en;
1943 
1944 	if (!fpu_user_cfg.max_features)
1945 		return 0;
1946 
1947 	en.n_namesz = sizeof(owner_name);
1948 	en.n_descsz = get_xsave_desc_size();
1949 	en.n_type = NT_X86_XSAVE_LAYOUT;
1950 
1951 	if (!dump_emit(cprm, &en, sizeof(en)))
1952 		return 1;
1953 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1954 		return 1;
1955 	if (!dump_align(cprm, 4))
1956 		return 1;
1957 
1958 	num_records = dump_xsave_layout_desc(cprm);
1959 	if (!num_records)
1960 		return 1;
1961 
1962 	/* Total size must equal the number of records times the record size */
1963 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1964 		return 1;
1965 
1966 	return 0;
1967 }
1968 
1969 int elf_coredump_extra_notes_size(void)
1970 {
1971 	int size;
1972 
1973 	if (!fpu_user_cfg.max_features)
1974 		return 0;
1975 
1976 	/* .note header */
1977 	size  = sizeof(struct elf_note);
1978 	/* Name plus alignment to 4 bytes */
1979 	size += roundup(sizeof(owner_name), 4);
1980 	size += get_xsave_desc_size();
1981 
1982 	return size;
1983 }
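
/*
 * Size sketch (assuming the current uapi layout): the elf_note header
 * is 12 bytes, "LINUX" plus its NUL terminator rounds up from 6 to 8
 * bytes, and each x86_xfeat_component record is 16 bytes.  A system
 * exposing five extended xfeatures thus contributes
 * 12 + 8 + 5 * 16 = 100 bytes of extra note data.
 */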
1984 #endif /* CONFIG_COREDUMP */
1985