xref: /linux/arch/x86/kernel/fpu/xstate.c (revision 8838a1a2d219a86ab05e679c73f68dd75a25aca5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16 #include <linux/coredump.h>
17 
18 #include <asm/fpu/api.h>
19 #include <asm/fpu/regset.h>
20 #include <asm/fpu/signal.h>
21 #include <asm/fpu/xcr.h>
22 
23 #include <asm/cpuid.h>
24 #include <asm/tlbflush.h>
25 #include <asm/prctl.h>
26 #include <asm/elf.h>
27 
28 #include <uapi/asm/elf.h>
29 
30 #include "context.h"
31 #include "internal.h"
32 #include "legacy.h"
33 #include "xstate.h"
34 
35 #define for_each_extended_xfeature(bit, mask)				\
36 	(bit) = FIRST_EXTENDED_XFEATURE;				\
37 	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
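
/*
 * Illustrative usage sketch (not from the original file): the macro walks
 * the extended feature bits set in @mask, starting at
 * FIRST_EXTENDED_XFEATURE and thus skipping the legacy FP/SSE bits:
 *
 *	u64 mask = fpu_kernel_cfg.max_features;
 *	int bit;
 *
 *	for_each_extended_xfeature(bit, mask)
 *		pr_info("xfeature %d enabled\n", bit);
 */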
38 
39 /*
40  * Although we spell it out in here, the Processor Trace
41  * xfeature is completely unused.  We use other mechanisms
42  * to save/restore PT state in Linux.
43  */
44 static const char *xfeature_names[] =
45 {
46 	"x87 floating point registers",
47 	"SSE registers",
48 	"AVX registers",
49 	"MPX bounds registers",
50 	"MPX CSR",
51 	"AVX-512 opmask",
52 	"AVX-512 Hi256",
53 	"AVX-512 ZMM_Hi256",
54 	"Processor Trace (unused)",
55 	"Protection Keys User registers",
56 	"PASID state",
57 	"Control-flow User registers",
58 	"Control-flow Kernel registers (unused)",
59 	"unknown xstate feature",
60 	"unknown xstate feature",
61 	"unknown xstate feature",
62 	"unknown xstate feature",
63 	"AMX Tile config",
64 	"AMX Tile data",
65 	"unknown xstate feature",
66 };
67 
68 static unsigned short xsave_cpuid_features[] __initdata = {
69 	[XFEATURE_FP]				= X86_FEATURE_FPU,
70 	[XFEATURE_SSE]				= X86_FEATURE_XMM,
71 	[XFEATURE_YMM]				= X86_FEATURE_AVX,
72 	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
73 	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
74 	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
75 	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
76 	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
77 	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
78 	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
79 	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
80 	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
81 	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
82 	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
83 };
84 
85 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
86 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
87 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
88 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
89 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
90 
91 #define XSTATE_FLAG_SUPERVISOR	BIT(0)
92 #define XSTATE_FLAG_ALIGNED64	BIT(1)
93 
94 /*
95  * Return whether the system supports a given xfeature.
96  *
97  * Also return the name of the (most advanced) feature that the caller requested:
98  */
99 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
100 {
101 	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
102 
103 	if (unlikely(feature_name)) {
104 		long xfeature_idx, max_idx;
105 		u64 xfeatures_print;
106 		/*
107 		 * We use fls64() here to be able to print the most advanced
108 		 * feature that was requested but is missing. If a driver
109 		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
110 		 * missing AVX feature - this is the most informative message
111 		 * to users:
112 		 */
113 		if (xfeatures_missing)
114 			xfeatures_print = xfeatures_missing;
115 		else
116 			xfeatures_print = xfeatures_needed;
117 
118 		xfeature_idx = fls64(xfeatures_print)-1;
119 		max_idx = ARRAY_SIZE(xfeature_names)-1;
120 		xfeature_idx = min(xfeature_idx, max_idx);
121 
122 		*feature_name = xfeature_names[xfeature_idx];
123 	}
124 
125 	if (xfeatures_missing)
126 		return 0;
127 
128 	return 1;
129 }
130 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
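
/*
 * Illustrative use, modeled on in-kernel SIMD users (the function name is
 * hypothetical): check the required xfeatures once at module init before
 * relying on AVX inside kernel_fpu_begin() sections.
 *
 *	static int __init example_simd_init(void)
 *	{
 *		const char *name;
 *
 *		if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name)) {
 *			pr_info("example: CPU lacks '%s' support\n", name);
 *			return -ENODEV;
 *		}
 *		return 0;
 *	}
 */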
131 
132 static bool xfeature_is_aligned64(int xfeature_nr)
133 {
134 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
135 }
136 
137 static bool xfeature_is_supervisor(int xfeature_nr)
138 {
139 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
140 }
141 
142 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
143 {
144 	unsigned int offs, i;
145 
146 	/*
147 	 * Non-compacted format and legacy features use the cached fixed
148 	 * offsets.
149 	 */
150 	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
151 	    xfeature <= XFEATURE_SSE)
152 		return xstate_offsets[xfeature];
153 
154 	/*
155 	 * Compacted format offsets depend on the actual content of the
156 	 * compacted xsave area which is determined by the xcomp_bv header
157 	 * field.
158 	 */
159 	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
160 	for_each_extended_xfeature(i, xcomp_bv) {
161 		if (xfeature_is_aligned64(i))
162 			offs = ALIGN(offs, 64);
163 		if (i == xfeature)
164 			break;
165 		offs += xstate_sizes[i];
166 	}
167 	return offs;
168 }
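
/*
 * Worked example with illustrative, CPU-enumerated sizes: for
 * xcomp_bv = YMM | OPMASK and a 256-byte YMM component, the walk starts
 * at offs = FXSAVE_SIZE + XSAVE_HDR_SIZE = 512 + 64 = 576, so YMM lands
 * at 576 and OPMASK at 576 + 256 = 832 (with an ALIGN(offs, 64) bump
 * first when the component has XSTATE_FLAG_ALIGNED64 set).
 */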
169 
170 /*
171  * Enable the extended processor state save/restore feature.
172  * Called once per CPU onlining.
173  */
174 void fpu__init_cpu_xstate(void)
175 {
176 	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
177 		return;
178 
179 	cr4_set_bits(X86_CR4_OSXSAVE);
180 
181 	/*
182 	 * Must happen after CR4 setup and before xsetbv() to allow KVM
183 	 * lazy passthrough.  Write independent of the dynamic state static
184 	 * key as that does not work on the boot CPU. This also ensures
185 	 * that any stale state is wiped out from XFD. Reset the per CPU
186 	 * xfd cache too.
187 	 */
188 	if (cpu_feature_enabled(X86_FEATURE_XFD))
189 		xfd_set_state(init_fpstate.xfd);
190 
191 	/*
192 	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
193 	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
194 	 * states can be set here.
195 	 */
196 	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
197 
198 	/*
199 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
200 	 */
201 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
202 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
203 				     xfeatures_mask_independent());
204 	}
205 }
206 
207 static bool xfeature_enabled(enum xfeature xfeature)
208 {
209 	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
210 }
211 
212 /*
213  * Record the offsets and sizes of various xstates contained
214  * in the XSAVE state memory layout.
215  */
216 static void __init setup_xstate_cache(void)
217 {
218 	u32 eax, ebx, ecx, edx, i;
219 	/* start at the beginning of the "extended state" */
220 	unsigned int last_good_offset = offsetof(struct xregs_state,
221 						 extended_state_area);
222 	/*
223 	 * The FP xstates and SSE xstates are legacy states. They are always
224 	 * in the fixed offsets in the xsave area in either compacted form
225 	 * or standard form.
226 	 */
227 	xstate_offsets[XFEATURE_FP]	= 0;
228 	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
229 						   xmm_space);
230 
231 	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
232 	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
233 						       xmm_space);
234 
235 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
236 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
237 
238 		xstate_sizes[i] = eax;
239 		xstate_flags[i] = ecx;
240 
241 		/*
242 		 * If an xfeature is supervisor state, the offset in EBX is
243 		 * invalid, leave it to -1.
244 		 */
245 		if (xfeature_is_supervisor(i))
246 			continue;
247 
248 		xstate_offsets[i] = ebx;
249 
250 		/*
251 		 * In our xstate size checks, we assume that the highest-numbered
252 		 * xstate feature has the highest offset in the buffer.  Ensure
253 		 * it does.
254 		 */
255 		WARN_ONCE(last_good_offset > xstate_offsets[i],
256 			  "x86/fpu: misordered xstate at %d\n", last_good_offset);
257 
258 		last_good_offset = xstate_offsets[i];
259 	}
260 }
261 
262 static void __init print_xstate_feature(u64 xstate_mask)
263 {
264 	const char *feature_name;
265 
266 	if (cpu_has_xfeatures(xstate_mask, &feature_name))
267 		pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
268 }
269 
270 /*
271  * Print out all the supported xstate features:
272  */
273 static void __init print_xstate_features(void)
274 {
275 	print_xstate_feature(XFEATURE_MASK_FP);
276 	print_xstate_feature(XFEATURE_MASK_SSE);
277 	print_xstate_feature(XFEATURE_MASK_YMM);
278 	print_xstate_feature(XFEATURE_MASK_BNDREGS);
279 	print_xstate_feature(XFEATURE_MASK_BNDCSR);
280 	print_xstate_feature(XFEATURE_MASK_OPMASK);
281 	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
282 	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
283 	print_xstate_feature(XFEATURE_MASK_PKRU);
284 	print_xstate_feature(XFEATURE_MASK_PASID);
285 	print_xstate_feature(XFEATURE_MASK_CET_USER);
286 	print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
287 	print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
288 }
289 
290 /*
291  * This check is important because it is easy to get XSTATE_*
292  * confused with XSTATE_BIT_*.
293  */
294 #define CHECK_XFEATURE(nr) do {		\
295 	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
296 	WARN_ON(nr >= XFEATURE_MAX);	\
297 } while (0)
298 
299 /*
300  * Print out xstate component offsets and sizes
301  */
302 static void __init print_xstate_offset_size(void)
303 {
304 	int i;
305 
306 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
307 		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
308 			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
309 			i, xstate_sizes[i]);
310 	}
311 }
312 
313 /*
314  * This function is called only during boot time when x86 caps are not set
315  * up and alternative can not be used yet.
316  */
317 static __init void os_xrstor_booting(struct xregs_state *xstate)
318 {
319 	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
320 	u32 lmask = mask;
321 	u32 hmask = mask >> 32;
322 	int err;
323 
324 	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
325 		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
326 	else
327 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
328 
329 	/*
330 	 * We should never fault when copying from a kernel buffer, and the FPU
331 	 * state we set at boot time should be valid.
332 	 */
333 	WARN_ON_FPU(err);
334 }
335 
336 /*
337  * All supported features have either init state all zeros or are
338  * handled in setup_init_fpu_buf() individually. This is an explicit
339  * feature list which deliberately does not use XFEATURE_MASK*SUPPORTED,
340  * so that newly added supported features are caught at build time and
341  * people actually look at the init state for the new feature.
342  */
343 #define XFEATURES_INIT_FPSTATE_HANDLED		\
344 	(XFEATURE_MASK_FP |			\
345 	 XFEATURE_MASK_SSE |			\
346 	 XFEATURE_MASK_YMM |			\
347 	 XFEATURE_MASK_OPMASK |			\
348 	 XFEATURE_MASK_ZMM_Hi256 |		\
349 	 XFEATURE_MASK_Hi16_ZMM	 |		\
350 	 XFEATURE_MASK_PKRU |			\
351 	 XFEATURE_MASK_BNDREGS |		\
352 	 XFEATURE_MASK_BNDCSR |			\
353 	 XFEATURE_MASK_PASID |			\
354 	 XFEATURE_MASK_CET_USER |		\
355 	 XFEATURE_MASK_XTILE)
356 
357 /*
358  * setup the xstate image representing the init state
359  */
360 static void __init setup_init_fpu_buf(void)
361 {
362 	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
363 		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
364 		     XFEATURES_INIT_FPSTATE_HANDLED);
365 
366 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
367 		return;
368 
369 	print_xstate_features();
370 
371 	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
372 
373 	/*
374 	 * Init all the features state with header.xfeatures being 0x0
375 	 */
376 	os_xrstor_booting(&init_fpstate.regs.xsave);
377 
378 	/*
379 	 * All components are now in init state. Read the state back so
380 	 * that init_fpstate contains all non-zero init state. This only
381 	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
382 	 * those use the init optimization which skips writing data for
383 	 * components in init state.
384 	 *
385 	 * XSAVE could be used, but that would require to reshuffle the
386 	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
387 	 * compaction. But doing so is a pointless exercise because most
388 	 * components have an all zeros init state except for the legacy
389 	 * ones (FP and SSE). Those can be saved with FXSAVE into the
390 	 * legacy area. Adding new features requires to ensure that init
391 	 * state is all zeroes or if not to add the necessary handling
392 	 * here.
393 	 */
394 	fxsave(&init_fpstate.regs.fxsave);
395 }
396 
397 int xfeature_size(int xfeature_nr)
398 {
399 	u32 eax, ebx, ecx, edx;
400 
401 	CHECK_XFEATURE(xfeature_nr);
402 	cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
403 	return eax;
404 }
405 
406 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
407 static int validate_user_xstate_header(const struct xstate_header *hdr,
408 				       struct fpstate *fpstate)
409 {
410 	/* No unknown or supervisor features may be set */
411 	if (hdr->xfeatures & ~fpstate->user_xfeatures)
412 		return -EINVAL;
413 
414 	/* Userspace must use the uncompacted format */
415 	if (hdr->xcomp_bv)
416 		return -EINVAL;
417 
418 	/*
419 	 * If 'reserved' is shrunken to add a new field, make sure to validate
420 	 * that new field here!
421 	 */
422 	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
423 
424 	/* No reserved bits may be set */
425 	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
426 		return -EINVAL;
427 
428 	return 0;
429 }
430 
431 static void __init __xstate_dump_leaves(void)
432 {
433 	int i;
434 	u32 eax, ebx, ecx, edx;
435 	static int should_dump = 1;
436 
437 	if (!should_dump)
438 		return;
439 	should_dump = 0;
440 	/*
441 	 * Dump out a few leaves past the ones that we support
442 	 * just in case there are some goodies up there
443 	 */
444 	for (i = 0; i < XFEATURE_MAX + 10; i++) {
445 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
446 		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
447 			CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
448 	}
449 }
450 
451 #define XSTATE_WARN_ON(x, fmt, ...) do {					\
452 	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
453 		__xstate_dump_leaves();						\
454 	}									\
455 } while (0)
456 
457 #define XCHECK_SZ(sz, nr, __struct) ({					\
458 	if (WARN_ONCE(sz != sizeof(__struct),				\
459 	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
460 	    xfeature_names[nr], sizeof(__struct), sz)) {		\
461 		__xstate_dump_leaves();					\
462 	}								\
463 	true;								\
464 })
465 
466 
467 /**
468  * check_xtile_data_against_struct - Check tile data state size.
469  *
470  * Calculate the state size by multiplying the single tile size which is
471  * recorded in a C struct, and the number of tiles that the CPU informs.
472  * Compare the provided size with the calculation.
473  *
474  * @size:	The tile data state size
475  *
476  * Returns:	0 on success, -EINVAL on mismatch.
477  */
478 static int __init check_xtile_data_against_struct(int size)
479 {
480 	u32 max_palid, palid, state_size;
481 	u32 eax, ebx, ecx, edx;
482 	u16 max_tile;
483 
484 	/*
485 	 * Check the maximum palette id:
486 	 *   eax: the highest numbered palette subleaf.
487 	 */
488 	cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
489 
490 	/*
491 	 * Cross-check each tile size and find the maximum number of
492 	 * supported tiles.
493 	 */
494 	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
495 		u16 tile_size, max;
496 
497 		/*
498 		 * Check the tile size info:
499 		 *   eax[31:16]:  bytes per tile
500 		 *   ebx[31:16]:  the max names (or max number of tiles)
501 		 */
502 		cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx);
503 		tile_size = eax >> 16;
504 		max = ebx >> 16;
505 
506 		if (tile_size != sizeof(struct xtile_data)) {
507 			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
508 			       __stringify(XFEATURE_XTILE_DATA),
509 			       sizeof(struct xtile_data), tile_size);
510 			__xstate_dump_leaves();
511 			return -EINVAL;
512 		}
513 
514 		if (max > max_tile)
515 			max_tile = max;
516 	}
517 
518 	state_size = sizeof(struct xtile_data) * max_tile;
519 	if (size != state_size) {
520 		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
521 		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
522 		__xstate_dump_leaves();
523 		return -EINVAL;
524 	}
525 	return 0;
526 }
527 
528 /*
529  * We have a C struct for each 'xstate'.  We need to ensure
530  * that our software representation matches what the CPU
531  * tells us about the state's size.
532  */
533 static bool __init check_xstate_against_struct(int nr)
534 {
535 	/*
536 	 * Ask the CPU for the size of the state.
537 	 */
538 	int sz = xfeature_size(nr);
539 
540 	/*
541 	 * Match each CPU state with the corresponding software
542 	 * structure.
543 	 */
544 	switch (nr) {
545 	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
546 	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
547 	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
548 	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
549 	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
550 	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
551 	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
552 	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
553 	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
554 	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
555 	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
556 	default:
557 		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
558 		return false;
559 	}
560 
561 	return true;
562 }
563 
564 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
565 {
566 	unsigned int topmost = fls64(xfeatures) - 1;
567 	unsigned int offset = xstate_offsets[topmost];
568 
569 	if (topmost <= XFEATURE_SSE)
570 		return sizeof(struct xregs_state);
571 
572 	if (compacted)
573 		offset = xfeature_get_offset(xfeatures, topmost);
574 	return offset + xstate_sizes[topmost];
575 }
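
/*
 * Worked example (sizes and offsets are CPU-enumerated; the numbers are
 * typical, not guaranteed): for xfeatures = FP | SSE | YMM the topmost
 * bit is XFEATURE_YMM, commonly at offset 576 with size 256 in the
 * standard layout, giving 576 + 256 = 832 bytes. With @compacted set,
 * the offset is recomputed via xfeature_get_offset() instead.
 */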
576 
577 /*
578  * This essentially double-checks what the cpu told us about
579  * how large the XSAVE buffer needs to be.  We are recalculating
580  * it to be safe.
581  *
582  * Independent XSAVE features allocate their own buffers and are not
583  * covered by these checks. Only the size of the buffer for task->fpu
584  * is checked here.
585  */
586 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
587 {
588 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
589 	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
590 	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
591 	int i;
592 
593 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
594 		if (!check_xstate_against_struct(i))
595 			return false;
596 		/*
597 		 * Supervisor state components can be managed only by
598 		 * XSAVES.
599 		 */
600 		if (!xsaves && xfeature_is_supervisor(i)) {
601 			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
602 			return false;
603 		}
604 	}
605 	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
606 	XSTATE_WARN_ON(size != kernel_size,
607 		       "size %u != kernel_size %u\n", size, kernel_size);
608 	return size == kernel_size;
609 }
610 
611 /*
612  * Get total size of enabled xstates in XCR0 | IA32_XSS.
613  *
614  * Note the SDM's wording here.  "sub-function 0" only enumerates
615  * the size of the *user* states.  If we use it to size a buffer
616  * that we use 'XSAVES' on, we could potentially overflow the
617  * buffer because 'XSAVES' saves system states too.
618  *
619  * This also takes compaction into account. So this works for
620  * XSAVEC as well.
621  */
622 static unsigned int __init get_compacted_size(void)
623 {
624 	unsigned int eax, ebx, ecx, edx;
625 	/*
626 	 * - CPUID function 0DH, sub-function 1:
627 	 *    EBX enumerates the size (in bytes) required by
628 	 *    the XSAVES instruction for an XSAVE area
629 	 *    containing all the state components
630 	 *    corresponding to bits currently set in
631 	 *    XCR0 | IA32_XSS.
632 	 *
633 	 * When XSAVES is not available but XSAVEC is (virt), then there
634 	 * are no supervisor states, but XSAVEC still uses compacted
635 	 * format.
636 	 */
637 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
638 	return ebx;
639 }
640 
641 /*
642  * Get the total size of the enabled xstates without the independent supervisor
643  * features.
644  */
645 static unsigned int __init get_xsave_compacted_size(void)
646 {
647 	u64 mask = xfeatures_mask_independent();
648 	unsigned int size;
649 
650 	if (!mask)
651 		return get_compacted_size();
652 
653 	/* Disable independent features. */
654 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
655 
656 	/*
657 	 * Ask the hardware what size is required of the buffer.
658 	 * This is the size required for the task->fpu buffer.
659 	 */
660 	size = get_compacted_size();
661 
662 	/* Re-enable independent features so XSAVES will work on them again. */
663 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
664 
665 	return size;
666 }
667 
668 static unsigned int __init get_xsave_size_user(void)
669 {
670 	unsigned int eax, ebx, ecx, edx;
671 	/*
672 	 * - CPUID function 0DH, sub-function 0:
673 	 *    EBX enumerates the size (in bytes) required by
674 	 *    the XSAVE instruction for an XSAVE area
675 	 *    containing all the *user* state components
676 	 *    corresponding to bits currently set in XCR0.
677 	 */
678 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
679 	return ebx;
680 }
681 
682 static int __init init_xstate_size(void)
683 {
684 	/* Recompute the context size for enabled features: */
685 	unsigned int user_size, kernel_size, kernel_default_size;
686 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
687 
688 	/* Uncompacted user space size */
689 	user_size = get_xsave_size_user();
690 
691 	/*
692 	 * XSAVES kernel size includes supervisor states and uses compacted
693 	 * format. XSAVEC uses compacted format, but does not save
694 	 * supervisor states.
695 	 *
696 	 * XSAVE[OPT] do not support supervisor states so kernel and user
697 	 * size is identical.
698 	 */
699 	if (compacted)
700 		kernel_size = get_xsave_compacted_size();
701 	else
702 		kernel_size = user_size;
703 
704 	kernel_default_size =
705 		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
706 
707 	if (!paranoid_xstate_size_valid(kernel_size))
708 		return -EINVAL;
709 
710 	fpu_kernel_cfg.max_size = kernel_size;
711 	fpu_user_cfg.max_size = user_size;
712 
713 	fpu_kernel_cfg.default_size = kernel_default_size;
714 	fpu_user_cfg.default_size =
715 		xstate_calculate_size(fpu_user_cfg.default_features, false);
716 
717 	return 0;
718 }
719 
720 /*
721  * We enabled the XSAVE hardware, but something went wrong and
722  * we can not use it.  Disable it.
723  */
724 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
725 {
726 	fpu_kernel_cfg.max_features = 0;
727 	cr4_clear_bits(X86_CR4_OSXSAVE);
728 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
729 
730 	/* Restore the legacy size. */
731 	fpu_kernel_cfg.max_size = legacy_size;
732 	fpu_kernel_cfg.default_size = legacy_size;
733 	fpu_user_cfg.max_size = legacy_size;
734 	fpu_user_cfg.default_size = legacy_size;
735 
736 	/*
737 	 * Prevent enabling the static branch which enables writes to the
738 	 * XFD MSR.
739 	 */
740 	init_fpstate.xfd = 0;
741 
742 	fpstate_reset(&current->thread.fpu);
743 }
744 
745 /*
746  * Enable and initialize the xsave feature.
747  * Called once per system bootup.
748  */
749 void __init fpu__init_system_xstate(unsigned int legacy_size)
750 {
751 	unsigned int eax, ebx, ecx, edx;
752 	u64 xfeatures;
753 	int err;
754 	int i;
755 
756 	if (!boot_cpu_has(X86_FEATURE_FPU)) {
757 		pr_info("x86/fpu: No FPU detected\n");
758 		return;
759 	}
760 
761 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
762 		pr_info("x86/fpu: x87 FPU will use %s\n",
763 			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
764 		return;
765 	}
766 
767 	/*
768 	 * Find user xstates supported by the processor.
769 	 */
770 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
771 	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
772 
773 	/*
774 	 * Find supervisor xstates supported by the processor.
775 	 */
776 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
777 	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
778 
779 	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
780 		/*
781 		 * This indicates that something really unexpected happened
782 		 * with the enumeration.  Disable XSAVE and try to continue
783 		 * booting without it.  This is too early to BUG().
784 		 */
785 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
786 		       fpu_kernel_cfg.max_features);
787 		goto out_disable;
788 	}
789 
790 	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
791 					      XFEATURE_MASK_INDEPENDENT;
792 
793 	/*
794 	 * Clear XSAVE features that are disabled in the normal CPUID.
795 	 */
796 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
797 		unsigned short cid = xsave_cpuid_features[i];
798 
799 		/* Careful: X86_FEATURE_FPU is 0! */
800 		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
801 			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
802 	}
803 
804 	if (!cpu_feature_enabled(X86_FEATURE_XFD))
805 		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
806 
807 	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
808 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
809 	else
810 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
811 					XFEATURE_MASK_SUPERVISOR_SUPPORTED;
812 
813 	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
814 	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
815 
816 	/* Clean out dynamic features from default */
817 	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
818 	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
819 
820 	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
821 	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
822 
823 	/* Store it for paranoia check at the end */
824 	xfeatures = fpu_kernel_cfg.max_features;
825 
826 	/*
827 	 * Initialize the default XFD state in init_fpstate and enable the
828 	 * dynamic sizing mechanism if dynamic states are available.  The
829 	 * static key cannot be enabled here because this runs before
830 	 * jump_label_init(). This is delayed to an initcall.
831 	 */
832 	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
833 
834 	/* Set up compaction feature bit */
835 	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
836 	    cpu_feature_enabled(X86_FEATURE_XSAVES))
837 		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
838 
839 	/* Enable xstate instructions to be able to continue with initialization: */
840 	fpu__init_cpu_xstate();
841 
842 	/* Cache size, offset and flags for initialization */
843 	setup_xstate_cache();
844 
845 	err = init_xstate_size();
846 	if (err)
847 		goto out_disable;
848 
849 	/* Reset the state for the current task */
850 	fpstate_reset(&current->thread.fpu);
851 
852 	/*
853 	 * Update info used for ptrace frames; use standard-format size and no
854 	 * supervisor xstates:
855 	 */
856 	update_regset_xstate_info(fpu_user_cfg.max_size,
857 				  fpu_user_cfg.max_features);
858 
859 	/*
860 	 * init_fpstate excludes dynamic states as they are large but init
861 	 * state is zero.
862 	 */
863 	init_fpstate.size		= fpu_kernel_cfg.default_size;
864 	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;
865 
866 	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
867 		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
868 			sizeof(init_fpstate.regs), init_fpstate.size);
869 		goto out_disable;
870 	}
871 
872 	setup_init_fpu_buf();
873 
874 	/*
875 	 * Paranoia check whether something in the setup modified the
876 	 * xfeatures mask.
877 	 */
878 	if (xfeatures != fpu_kernel_cfg.max_features) {
879 		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
880 		       xfeatures, fpu_kernel_cfg.max_features);
881 		goto out_disable;
882 	}
883 
884 	/*
885 	 * CPU capabilities initialization runs before FPU init. So
886 	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
887 	 * functional, set the feature bit so depending code works.
888 	 */
889 	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
890 
891 	print_xstate_offset_size();
892 	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
893 		fpu_kernel_cfg.max_features,
894 		fpu_kernel_cfg.max_size,
895 		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
896 	return;
897 
898 out_disable:
899 	/* something went wrong, try to boot without any XSAVE support */
900 	fpu__init_disable_system_xstate(legacy_size);
901 }
902 
903 /*
904  * Restore minimal FPU state after suspend:
905  */
906 void fpu__resume_cpu(void)
907 {
908 	/*
909 	 * Restore XCR0 on xsave capable CPUs:
910 	 */
911 	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
912 		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
913 
914 	/*
915 	 * Restore IA32_XSS. The same CPUID bit enumerates support
916 	 * of XSAVES and MSR_IA32_XSS.
917 	 */
918 	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
919 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
920 				     xfeatures_mask_independent());
921 	}
922 
923 	if (fpu_state_size_dynamic())
924 		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
925 }
926 
927 /*
928  * Given an xstate feature nr, calculate where in the xsave
929  * buffer the state is.  Callers should ensure that the buffer
930  * is valid.
931  */
932 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
933 {
934 	u64 xcomp_bv = xsave->header.xcomp_bv;
935 
936 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
937 		return NULL;
938 
939 	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
940 		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
941 			return NULL;
942 	}
943 
944 	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
945 }
946 
947 /*
948  * Given the xsave area and a state inside, this function returns the
949  * address of the state.
950  *
951  * This is the API that is called to get xstate address in either
952  * standard format or compacted format of xsave area.
953  *
954  * Note that if there is no data for the field in the xsave buffer
955  * this will return NULL.
956  *
957  * Inputs:
958  *	xstate: the thread's storage area for all FPU data
959  *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
960  *	XFEATURE_SSE, etc...)
961  * Output:
962  *	address of the state in the xsave area, or NULL if the
963  *	field is not present in the xsave buffer.
964  */
965 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
966 {
967 	/*
968 	 * Do we even *have* xsave state?
969 	 */
970 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
971 		return NULL;
972 
973 	/*
974 	 * We should not ever be requesting features that we
975 	 * have not enabled.
976 	 */
977 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
978 		return NULL;
979 
980 	/*
981 	 * This assumes the last 'xsave*' instruction to
982 	 * have requested that 'xfeature_nr' be saved.
983 	 * If it did not, we might be seeing an old value
984 	 * of the field in the buffer.
985 	 *
986 	 * This can happen because the last 'xsave' did not
987 	 * request that this feature be saved (unlikely)
988 	 * or because the "init optimization" caused it
989 	 * to not be saved.
990 	 */
991 	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
992 		return NULL;
993 
994 	return __raw_xsave_addr(xsave, xfeature_nr);
995 }
996 EXPORT_SYMBOL_GPL(get_xsave_addr);
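
/*
 * Illustrative use (mirrors fpstate_clear_xstate_component() further
 * down; assumes a struct fpstate *fpstate in scope): always check for
 * NULL, as the component may be in its init state and thus absent from
 * the buffer.
 *
 *	struct pkru_state *pk;
 *
 *	pk = get_xsave_addr(&fpstate->regs.xsave, XFEATURE_PKRU);
 *	if (pk)
 *		pr_info("saved PKRU: %x\n", pk->pkru);
 */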
997 
998 /*
999  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
1000  * The xsave buffer should be in standard format, not compacted (e.g. user mode
1001  * signal frames).
1002  */
1003 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1004 {
1005 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1006 		return NULL;
1007 
1008 	return (void __user *)xsave + xstate_offsets[xfeature_nr];
1009 }
1010 
1011 #ifdef CONFIG_ARCH_HAS_PKEYS
1012 
1013 /*
1014  * This will go out and modify PKRU register to set the access
1015  * rights for @pkey to @init_val.
1016  */
1017 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1018 			      unsigned long init_val)
1019 {
1020 	u32 old_pkru, new_pkru_bits = 0;
1021 	int pkey_shift;
1022 
1023 	/*
1024 	 * This check implies XSAVE support.  OSPKE only gets
1025 	 * set if we enable XSAVE and we enable PKU in XCR0.
1026 	 */
1027 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1028 		return -EINVAL;
1029 
1030 	/*
1031 	 * This code should only be called with valid 'pkey'
1032 	 * values originating from in-kernel users.  Complain
1033 	 * if a bad value is observed.
1034 	 */
1035 	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1036 		return -EINVAL;
1037 
1038 	/* Set the bits we need in PKRU:  */
1039 	if (init_val & PKEY_DISABLE_ACCESS)
1040 		new_pkru_bits |= PKRU_AD_BIT;
1041 	if (init_val & PKEY_DISABLE_WRITE)
1042 		new_pkru_bits |= PKRU_WD_BIT;
1043 
1044 	/* Shift the bits in to the correct place in PKRU for pkey: */
1045 	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1046 	new_pkru_bits <<= pkey_shift;
1047 
1048 	/* Get old PKRU and mask off any old bits in place: */
1049 	old_pkru = read_pkru();
1050 	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1051 
1052 	/* Write old part along with new part: */
1053 	write_pkru(old_pkru | new_pkru_bits);
1054 
1055 	return 0;
1056 }
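
/*
 * Worked example, following the PKRU layout used above: for pkey = 3 and
 * init_val = PKEY_DISABLE_WRITE, pkey_shift = 3 * PKRU_BITS_PER_PKEY = 6
 * and new_pkru_bits = PKRU_WD_BIT << 6 = 0x80. Bits 6 (AD) and 7 (WD) of
 * the old PKRU are cleared first, then 0x80 is merged back in.
 */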
1057 #endif /* CONFIG_ARCH_HAS_PKEYS */
1058 
1059 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1060 			 void *init_xstate, unsigned int size)
1061 {
1062 	membuf_write(to, from_xstate ? xstate : init_xstate, size);
1063 }
1064 
1065 /**
1066  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1067  * @to:		membuf descriptor
1068  * @fpstate:	The fpstate buffer from which to copy
1069  * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
1070  * @pkru_val:	The PKRU value to store in the PKRU component
1071  * @copy_mode:	The requested copy mode
1072  *
1073  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1074  * format, i.e. from the kernel internal hardware dependent storage format
1075  * to the requested @mode. UABI XSTATE is always uncompacted!
1076  *
1077  * It supports partial copy but @to.pos always starts from zero.
1078  */
1079 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1080 			       u64 xfeatures, u32 pkru_val,
1081 			       enum xstate_copy_mode copy_mode)
1082 {
1083 	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1084 	struct xregs_state *xinit = &init_fpstate.regs.xsave;
1085 	struct xregs_state *xsave = &fpstate->regs.xsave;
1086 	struct xstate_header header;
1087 	unsigned int zerofrom;
1088 	u64 mask;
1089 	int i;
1090 
1091 	memset(&header, 0, sizeof(header));
1092 	header.xfeatures = xsave->header.xfeatures;
1093 
1094 	/* Mask out the feature bits depending on copy mode */
1095 	switch (copy_mode) {
1096 	case XSTATE_COPY_FP:
1097 		header.xfeatures &= XFEATURE_MASK_FP;
1098 		break;
1099 
1100 	case XSTATE_COPY_FX:
1101 		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1102 		break;
1103 
1104 	case XSTATE_COPY_XSAVE:
1105 		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1106 		break;
1107 	}
1108 
1109 	/* Copy FP state up to MXCSR */
1110 	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1111 		     &xinit->i387, off_mxcsr);
1112 
1113 	/* Copy MXCSR when SSE or YMM are set in the feature mask */
1114 	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1115 		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1116 		     MXCSR_AND_FLAGS_SIZE);
1117 
1118 	/* Copy the remaining FP state */
1119 	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1120 		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
1121 		     sizeof(xsave->i387.st_space));
1122 
1123 	/* Copy the SSE state - shared with YMM, but independently managed */
1124 	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1125 		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1126 		     sizeof(xsave->i387.xmm_space));
1127 
1128 	if (copy_mode != XSTATE_COPY_XSAVE)
1129 		goto out;
1130 
1131 	/* Zero the padding area */
1132 	membuf_zero(&to, sizeof(xsave->i387.padding));
1133 
1134 	/* Copy xsave->i387.sw_reserved */
1135 	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1136 
1137 	/* Copy the user space relevant state of @xsave->header */
1138 	membuf_write(&to, &header, sizeof(header));
1139 
1140 	zerofrom = offsetof(struct xregs_state, extended_state_area);
1141 
1142 	/*
1143 	 * This 'mask' indicates which states to copy from fpstate.
1144 	 * Those extended states that are not present in fpstate are
1145 	 * either disabled or initialized:
1146 	 *
1147 	 * In non-compacted format, disabled features still occupy
1148 	 * state space but there is no state to copy from in the
1149 	 * compacted init_fpstate. The gap tracking will zero these
1150 	 * states.
1151 	 *
1152 	 * The extended features have an all zeroes init state. Thus,
1153 	 * remove them from 'mask' to zero those features in the user
1154 	 * buffer instead of retrieving them from init_fpstate.
1155 	 */
1156 	mask = header.xfeatures;
1157 
1158 	for_each_extended_xfeature(i, mask) {
1159 		/*
1160 		 * If there was a feature or alignment gap, zero the space
1161 		 * in the destination buffer.
1162 		 */
1163 		if (zerofrom < xstate_offsets[i])
1164 			membuf_zero(&to, xstate_offsets[i] - zerofrom);
1165 
1166 		if (i == XFEATURE_PKRU) {
1167 			struct pkru_state pkru = {0};
1168 			/*
1169 			 * PKRU is not necessarily up to date in the
1170 			 * XSAVE buffer. Use the provided value.
1171 			 */
1172 			pkru.pkru = pkru_val;
1173 			membuf_write(&to, &pkru, sizeof(pkru));
1174 		} else {
1175 			membuf_write(&to,
1176 				     __raw_xsave_addr(xsave, i),
1177 				     xstate_sizes[i]);
1178 		}
1179 		/*
1180 		 * Keep track of the last copied state in the non-compacted
1181 		 * target buffer for gap zeroing.
1182 		 */
1183 		zerofrom = xstate_offsets[i] + xstate_sizes[i];
1184 	}
1185 
1186 out:
1187 	if (to.left)
1188 		membuf_zero(&to, to.left);
1189 }
1190 
1191 /**
1192  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1193  * @to:		membuf descriptor
1194  * @tsk:	The task from which to copy the saved xstate
1195  * @copy_mode:	The requested copy mode
1196  *
1197  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1198  * format, i.e. from the kernel internal hardware dependent storage format
1199  * to the requested @mode. UABI XSTATE is always uncompacted!
1200  *
1201  * It supports partial copy but @to.pos always starts from zero.
1202  */
1203 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1204 			     enum xstate_copy_mode copy_mode)
1205 {
1206 	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1207 				  tsk->thread.fpu.fpstate->user_xfeatures,
1208 				  tsk->thread.pkru, copy_mode);
1209 }
1210 
1211 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1212 			    const void *kbuf, const void __user *ubuf)
1213 {
1214 	if (kbuf) {
1215 		memcpy(dst, kbuf + offset, size);
1216 	} else {
1217 		if (copy_from_user(dst, ubuf + offset, size))
1218 			return -EFAULT;
1219 	}
1220 	return 0;
1221 }
1222 
1223 
1224 /**
1225  * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1226  * @fpstate:	The fpstate buffer to copy to
1227  * @kbuf:	The UABI format buffer, if it comes from the kernel
1228  * @ubuf:	The UABI format buffer, if it comes from userspace
1229  * @pkru:	The location to write the PKRU value to
1230  *
1231  * Converts from the UABI format into the kernel internal hardware
1232  * dependent format.
1233  *
1234  * This function ultimately has three different callers with distinct PKRU
1235  * behavior.
1236  * 1.	When called from sigreturn the PKRU register will be restored from
1237  *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1238  *	@fpstate is sufficient to cover this case, but the caller will also
1239  *	pass a pointer to the thread_struct's pkru field in @pkru and updating
1240  *	it is harmless.
1241  * 2.	When called from ptrace the PKRU register will be restored from the
1242  *	thread_struct's pkru field. A pointer to that is passed in @pkru.
1243  *	The kernel will restore it manually, so the XRSTOR behavior that resets
1244  *	the PKRU register to the hardware init value (0) if the corresponding
1245  *	xfeatures bit is not set is emulated here.
1246  * 3.	When called from KVM the PKRU register will be restored from the vcpu's
1247  *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1248  *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
1249  *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1250  *	bit is not set.
1251  */
1252 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1253 			       const void __user *ubuf, u32 *pkru)
1254 {
1255 	struct xregs_state *xsave = &fpstate->regs.xsave;
1256 	unsigned int offset, size;
1257 	struct xstate_header hdr;
1258 	u64 mask;
1259 	int i;
1260 
1261 	offset = offsetof(struct xregs_state, header);
1262 	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1263 		return -EFAULT;
1264 
1265 	if (validate_user_xstate_header(&hdr, fpstate))
1266 		return -EINVAL;
1267 
1268 	/* Validate MXCSR when any of the related features is in use */
1269 	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1270 	if (hdr.xfeatures & mask) {
1271 		u32 mxcsr[2];
1272 
1273 		offset = offsetof(struct fxregs_state, mxcsr);
1274 		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1275 			return -EFAULT;
1276 
1277 		/* Reserved bits in MXCSR must be zero. */
1278 		if (mxcsr[0] & ~mxcsr_feature_mask)
1279 			return -EINVAL;
1280 
1281 		/* SSE and YMM require MXCSR even when FP is not in use. */
1282 		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1283 			xsave->i387.mxcsr = mxcsr[0];
1284 			xsave->i387.mxcsr_mask = mxcsr[1];
1285 		}
1286 	}
1287 
1288 	for (i = 0; i < XFEATURE_MAX; i++) {
1289 		mask = BIT_ULL(i);
1290 
1291 		if (hdr.xfeatures & mask) {
1292 			void *dst = __raw_xsave_addr(xsave, i);
1293 
1294 			offset = xstate_offsets[i];
1295 			size = xstate_sizes[i];
1296 
1297 			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1298 				return -EFAULT;
1299 		}
1300 	}
1301 
1302 	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1303 		struct pkru_state *xpkru;
1304 
1305 		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1306 		*pkru = xpkru->pkru;
1307 	} else {
1308 		/*
1309 		 * KVM may pass NULL here to indicate that it does not need
1310 		 * PKRU updated.
1311 		 */
1312 		if (pkru)
1313 			*pkru = 0;
1314 	}
1315 
1316 	/*
1317 	 * The state that came in from userspace was user-state only.
1318 	 * Mask all the user states out of 'xfeatures':
1319 	 */
1320 	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1321 
1322 	/*
1323 	 * Add back in the features that came in from userspace:
1324 	 */
1325 	xsave->header.xfeatures |= hdr.xfeatures;
1326 
1327 	return 0;
1328 }
1329 
1330 /*
1331  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1332  * format and copy to the target thread. Used by ptrace and KVM.
1333  */
1334 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1335 {
1336 	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1337 }
1338 
1339 /*
1340  * Convert from a sigreturn standard-format user-space buffer to kernel
1341  * XSAVE[S] format and copy to the target thread. This is called from the
1342  * sigreturn() and rt_sigreturn() system calls.
1343  */
1344 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1345 				      const void __user *ubuf)
1346 {
1347 	return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1348 }
1349 
1350 static bool validate_independent_components(u64 mask)
1351 {
1352 	u64 xchk;
1353 
1354 	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1355 		return false;
1356 
1357 	xchk = ~xfeatures_mask_independent();
1358 
1359 	if (WARN_ON_ONCE(!mask || mask & xchk))
1360 		return false;
1361 
1362 	return true;
1363 }
1364 
1365 /**
1366  * xsaves - Save selected components to a kernel xstate buffer
1367  * @xstate:	Pointer to the buffer
1368  * @mask:	Feature mask to select the components to save
1369  *
1370  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1371  * XSAVES does not write the full xstate header. Before first use the
1372  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1373  * can #GP.
1374  *
1375  * The feature mask must be a subset of the independent features.
1376  */
1377 void xsaves(struct xregs_state *xstate, u64 mask)
1378 {
1379 	int err;
1380 
1381 	if (!validate_independent_components(mask))
1382 		return;
1383 
1384 	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1385 	WARN_ON_ONCE(err);
1386 }
1387 
1388 /**
1389  * xrstors - Restore selected components from a kernel xstate buffer
1390  * @xstate:	Pointer to the buffer
1391  * @mask:	Feature mask to select the components to restore
1392  *
1393  * The @xstate buffer must be 64 byte aligned and correctly initialized
1394  * otherwise XRSTORS from that buffer can #GP.
1395  *
1396  * Proper usage is to restore the state which was saved with
1397  * xsaves() into @xstate.
1398  *
1399  * The feature mask must be a subset of the independent features.
1400  */
1401 void xrstors(struct xregs_state *xstate, u64 mask)
1402 {
1403 	int err;
1404 
1405 	if (!validate_independent_components(mask))
1406 		return;
1407 
1408 	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1409 	WARN_ON_ONCE(err);
1410 }
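
/*
 * Pairing sketch, modeled on the arch LBR code (the surrounding context
 * and the 'ctx' buffer are hypothetical): save an independent component
 * into a zeroed, 64-byte aligned buffer and restore it later.
 *
 *	xsaves(&ctx->xsave, XFEATURE_MASK_LBR);
 *	...
 *	xrstors(&ctx->xsave, XFEATURE_MASK_LBR);
 */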
1411 
1412 #if IS_ENABLED(CONFIG_KVM)
1413 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1414 {
1415 	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1416 
1417 	if (addr)
1418 		memset(addr, 0, xstate_sizes[xfeature]);
1419 }
1420 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1421 #endif
1422 
1423 #ifdef CONFIG_X86_64
1424 
1425 #ifdef CONFIG_X86_DEBUG_FPU
1426 /*
1427  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1428  * can safely operate on the @fpstate buffer.
1429  */
1430 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1431 {
1432 	u64 xfd = __this_cpu_read(xfd_state);
1433 
1434 	if (fpstate->xfd == xfd)
1435 		return true;
1436 
1437 	/*
1438 	 * The XFD MSR does not match fpstate->xfd. That's invalid when
1439 	 * the passed in fpstate is current's fpstate.
1440 	 */
1441 	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1442 		return false;
1443 
1444 	/*
1445 	 * XRSTOR(S) from init_fpstate are always correct as it will just
1446 	 * bring all components into init state and not read from the
1447 	 * buffer. XSAVE(S) raises #PF after init.
1448 	 */
1449 	if (fpstate == &init_fpstate)
1450 		return rstor;
1451 
1452 	/*
1453 	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1454 	 * XRSTOR(S): fpu_swap_kvm_fpstate()
1455 	 */
1456 
1457 	/*
1458 	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1459 	 * the buffer area for XFD-disabled state components.
1460 	 */
1461 	mask &= ~xfd;
1462 
1463 	/*
1464 	 * Remove features which are valid in fpstate. They
1465 	 * have space allocated in fpstate.
1466 	 */
1467 	mask &= ~fpstate->xfeatures;
1468 
1469 	/*
1470 	 * Any remaining state components in 'mask' might be written
1471 	 * by XSAVE/XRSTOR. Fail validation if found.
1472 	 */
1473 	return !mask;
1474 }
1475 
1476 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1477 {
1478 	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1479 }
1480 #endif /* CONFIG_X86_DEBUG_FPU */
1481 
1482 static int __init xfd_update_static_branch(void)
1483 {
1484 	/*
1485 	 * If init_fpstate.xfd has bits set then dynamic features are
1486 	 * available and the dynamic sizing must be enabled.
1487 	 */
1488 	if (init_fpstate.xfd)
1489 		static_branch_enable(&__fpu_state_size_dynamic);
1490 	return 0;
1491 }
1492 arch_initcall(xfd_update_static_branch)
1493 
1494 void fpstate_free(struct fpu *fpu)
1495 {
1496 	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1497 		vfree(fpu->fpstate);
1498 }
1499 
1500 /**
1501  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1502  *
1503  * @xfeatures:	A bitmap of xstate features which extend the enabled features
1504  *		of that task
1505  * @ksize:	The required size for the kernel buffer
1506  * @usize:	The required size for user space buffers
1507  * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
1508  *
1509  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1510  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1511  * with large states are likely to live longer.
1512  *
1513  * Returns: 0 on success, -ENOMEM on allocation error.
1514  */
1515 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1516 			   unsigned int usize, struct fpu_guest *guest_fpu)
1517 {
1518 	struct fpu *fpu = &current->thread.fpu;
1519 	struct fpstate *curfps, *newfps = NULL;
1520 	unsigned int fpsize;
1521 	bool in_use;
1522 
1523 	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1524 
1525 	newfps = vzalloc(fpsize);
1526 	if (!newfps)
1527 		return -ENOMEM;
1528 	newfps->size = ksize;
1529 	newfps->user_size = usize;
1530 	newfps->is_valloc = true;
1531 
1532 	/*
1533 	 * When a guest FPU is supplied, use @guest_fpu->fpstate
1534 	 * as the reference, independent of whether it is in use or not.
1535 	 */
1536 	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1537 
1538 	/* Determine whether @curfps is the active fpstate */
1539 	in_use = fpu->fpstate == curfps;
1540 
1541 	if (guest_fpu) {
1542 		newfps->is_guest = true;
1543 		newfps->is_confidential = curfps->is_confidential;
1544 		newfps->in_use = curfps->in_use;
1545 		guest_fpu->xfeatures |= xfeatures;
1546 		guest_fpu->uabi_size = usize;
1547 	}
1548 
1549 	fpregs_lock();
1550 	/*
1551 	 * If @curfps is in use, ensure that the current state is in the
1552 	 * registers before swapping fpstate as that might invalidate it
1553 	 * due to layout changes.
1554 	 */
1555 	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1556 		fpregs_restore_userregs();
1557 
1558 	newfps->xfeatures = curfps->xfeatures | xfeatures;
1559 	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1560 	newfps->xfd = curfps->xfd & ~xfeatures;
1561 
1562 	/* Do the final updates within the locked region */
1563 	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1564 
1565 	if (guest_fpu) {
1566 		guest_fpu->fpstate = newfps;
1567 		/* If curfps is active, update the FPU fpstate pointer */
1568 		if (in_use)
1569 			fpu->fpstate = newfps;
1570 	} else {
1571 		fpu->fpstate = newfps;
1572 	}
1573 
1574 	if (in_use)
1575 		xfd_update_state(fpu->fpstate);
1576 	fpregs_unlock();
1577 
1578 	/* Only free valloc'ed state */
1579 	if (curfps && curfps->is_valloc)
1580 		vfree(curfps);
1581 
1582 	return 0;
1583 }
1584 
1585 static int validate_sigaltstack(unsigned int usize)
1586 {
1587 	struct task_struct *thread, *leader = current->group_leader;
1588 	unsigned long framesize = get_sigframe_size();
1589 
1590 	lockdep_assert_held(&current->sighand->siglock);
1591 
1592 	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1593 	framesize -= fpu_user_cfg.max_size;
1594 	framesize += usize;
1595 	for_each_thread(leader, thread) {
1596 		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1597 			return -ENOSPC;
1598 	}
1599 	return 0;
1600 }
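
/*
 * Worked example with illustrative numbers: if get_sigframe_size()
 * reports 2048 bytes while fpu_user_cfg.max_size = 1024, and a
 * permission request grows the user state to usize = 9216, then the
 * projected frame is 2048 - 1024 + 9216 = 10240 bytes. Any thread that
 * already installed a smaller sigaltstack fails the request with
 * -ENOSPC.
 */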
1601 
1602 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1603 {
1604 	/*
1605 	 * This deliberately does not exclude !XSAVES as we still might
1606 	 * decide to optionally context switch XCR0 or talk the silicon
1607 	 * vendors into extending XFD for the pre AMX states, especially
1608 	 * AVX512.
1609 	 */
1610 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1611 	struct fpu *fpu = &current->group_leader->thread.fpu;
1612 	struct fpu_state_perm *perm;
1613 	unsigned int ksize, usize;
1614 	u64 mask;
1615 	int ret = 0;
1616 
1617 	/* Check whether fully enabled */
1618 	if ((permitted & requested) == requested)
1619 		return 0;
1620 
1621 	/* Calculate the resulting kernel state size */
1622 	mask = permitted | requested;
1623 	/* Take supervisor states into account on the host */
1624 	if (!guest)
1625 		mask |= xfeatures_mask_supervisor();
1626 	ksize = xstate_calculate_size(mask, compacted);
1627 
1628 	/* Calculate the resulting user state size */
1629 	mask &= XFEATURE_MASK_USER_SUPPORTED;
1630 	usize = xstate_calculate_size(mask, false);
1631 
1632 	if (!guest) {
1633 		ret = validate_sigaltstack(usize);
1634 		if (ret)
1635 			return ret;
1636 	}
1637 
1638 	perm = guest ? &fpu->guest_perm : &fpu->perm;
1639 	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1640 	WRITE_ONCE(perm->__state_perm, mask);
1641 	/* Protected by sighand lock */
1642 	perm->__state_size = ksize;
1643 	perm->__user_state_size = usize;
1644 	return ret;
1645 }
1646 
1647 /*
1648  * Permissions array to map facilities with more than one component
1649  */
1650 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1651 	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1652 };
1653 
1654 static int xstate_request_perm(unsigned long idx, bool guest)
1655 {
1656 	u64 permitted, requested;
1657 	int ret;
1658 
1659 	if (idx >= XFEATURE_MAX)
1660 		return -EINVAL;
1661 
1662 	/*
1663 	 * Look up the facility mask which can require more than
1664 	 * one xstate component.
1665 	 */
1666 	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1667 	requested = xstate_prctl_req[idx];
1668 	if (!requested)
1669 		return -EOPNOTSUPP;
1670 
1671 	if ((fpu_user_cfg.max_features & requested) != requested)
1672 		return -EOPNOTSUPP;
1673 
1674 	/* Lockless quick check */
1675 	permitted = xstate_get_group_perm(guest);
1676 	if ((permitted & requested) == requested)
1677 		return 0;
1678 
1679 	/* Protect against concurrent modifications */
1680 	spin_lock_irq(&current->sighand->siglock);
1681 	permitted = xstate_get_group_perm(guest);
1682 
1683 	/* First vCPU allocation locks the permissions. */
1684 	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1685 		ret = -EBUSY;
1686 	else
1687 		ret = __xstate_request_perm(permitted, requested, guest);
1688 	spin_unlock_irq(&current->sighand->siglock);
1689 	return ret;
1690 }
1691 
1692 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1693 {
1694 	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1695 	struct fpu_state_perm *perm;
1696 	unsigned int ksize, usize;
1697 	struct fpu *fpu;
1698 
1699 	if (!xfd_event) {
1700 		if (!guest_fpu)
1701 			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1702 		return 0;
1703 	}
1704 
1705 	/* Protect against concurrent modifications */
1706 	spin_lock_irq(&current->sighand->siglock);
1707 
1708 	/* If not permitted let it die */
1709 	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1710 		spin_unlock_irq(&current->sighand->siglock);
1711 		return -EPERM;
1712 	}
1713 
1714 	fpu = &current->group_leader->thread.fpu;
1715 	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1716 	ksize = perm->__state_size;
1717 	usize = perm->__user_state_size;
1718 
1719 	/*
1720 	 * The feature is permitted. State size is sufficient.  Dropping
1721 	 * the lock is safe here: even if more features are added from
1722 	 * another task, the retrieved buffer sizes are valid for the
1723 	 * currently requested feature(s).
1724 	 */
1725 	spin_unlock_irq(&current->sighand->siglock);
1726 
1727 	/*
1728 	 * Try to allocate a new fpstate. If that fails there is no way
1729 	 * out.
1730 	 */
1731 	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1732 		return -EFAULT;
1733 	return 0;
1734 }
1735 
1736 int xfd_enable_feature(u64 xfd_err)
1737 {
1738 	return __xfd_enable_feature(xfd_err, NULL);
1739 }
1740 
1741 #else /* CONFIG_X86_64 */
1742 static inline int xstate_request_perm(unsigned long idx, bool guest)
1743 {
1744 	return -EPERM;
1745 }
1746 #endif  /* !CONFIG_X86_64 */
1747 
1748 u64 xstate_get_guest_group_perm(void)
1749 {
1750 	return xstate_get_group_perm(true);
1751 }
1752 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1753 
1754 /**
1755  * fpu_xstate_prctl - xstate permission operations
1756  * @option:	A subfunction of arch_prctl()
1757  * @arg2:	option argument
1758  * Return:	0 if successful; otherwise, an error code
1759  *
1760  * Option arguments:
1761  *
1762  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1763  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1764  * ARCH_REQ_XCOMP_PERM: Facility number requested
1765  *
1766  * For facilities which require more than one XSTATE component, the request
1767  * must be the highest state component number related to that facility,
1768  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1769  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1770  */
1771 long fpu_xstate_prctl(int option, unsigned long arg2)
1772 {
1773 	u64 __user *uptr = (u64 __user *)arg2;
1774 	u64 permitted, supported;
1775 	unsigned long idx = arg2;
1776 	bool guest = false;
1777 
1778 	switch (option) {
1779 	case ARCH_GET_XCOMP_SUPP:
1780 		supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1781 		return put_user(supported, uptr);
1782 
1783 	case ARCH_GET_XCOMP_PERM:
1784 		/*
1785 		 * Lockless snapshot as it can also change right after
1786 		 * dropping the lock.
1787 		 */
1788 		permitted = xstate_get_host_group_perm();
1789 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1790 		return put_user(permitted, uptr);
1791 
1792 	case ARCH_GET_XCOMP_GUEST_PERM:
1793 		permitted = xstate_get_guest_group_perm();
1794 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1795 		return put_user(permitted, uptr);
1796 
1797 	case ARCH_REQ_XCOMP_GUEST_PERM:
1798 		guest = true;
1799 		fallthrough;
1800 
1801 	case ARCH_REQ_XCOMP_PERM:
1802 		if (!IS_ENABLED(CONFIG_X86_64))
1803 			return -EOPNOTSUPP;
1804 
1805 		return xstate_request_perm(idx, guest);
1806 
1807 	default:
1808 		return -EINVAL;
1809 	}
1810 }
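
/*
 * Userspace view, a sketch following the AMX flow documented in
 * Documentation/arch/x86/xstate.rst: request permission for the
 * dynamically enabled tile data state before touching it.
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <asm/prctl.h>
 *
 *	#define XFEATURE_XTILE_DATA 18
 *
 *	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA))
 *		err(1, "ARCH_REQ_XCOMP_PERM");
 */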
1811 
1812 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1813 /*
1814  * Report the amount of time elapsed in milliseconds since the last
1815  * AVX512 use in the task.
1816  */
1817 static void avx512_status(struct seq_file *m, struct task_struct *task)
1818 {
1819 	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1820 	long delta;
1821 
1822 	if (!timestamp) {
1823 		/*
1824 		 * Report -1 if no AVX512 usage
1825 		 */
1826 		delta = -1;
1827 	} else {
1828 		delta = (long)(jiffies - timestamp);
1829 		/*
1830 		 * Cap to LONG_MAX if time difference > LONG_MAX
1831 		 */
1832 		if (delta < 0)
1833 			delta = LONG_MAX;
1834 		delta = jiffies_to_msecs(delta);
1835 	}
1836 
1837 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1838 	seq_putc(m, '\n');
1839 }
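
/*
 * Example /proc/<pid>/arch_status output (the value is illustrative):
 *
 *	AVX512_elapsed_ms:	120
 *
 * -1 means the task has never used AVX512; see also
 * Documentation/filesystems/proc.rst.
 */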
1840 
1841 /*
1842  * Report architecture specific information
1843  */
1844 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1845 			struct pid *pid, struct task_struct *task)
1846 {
1847 	/*
1848 	 * Report AVX512 state if the processor and the build option support it.
1849 	 */
1850 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1851 		avx512_status(m, task);
1852 
1853 	return 0;
1854 }
1855 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1856 
1857 #ifdef CONFIG_COREDUMP
1858 static const char owner_name[] = "LINUX";
1859 
1860 /*
1861  * Dump type, size, offset and flag values for every xfeature that is present.
1862  */
1863 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1864 {
1865 	int num_records = 0;
1866 	int i;
1867 
1868 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1869 		struct x86_xfeat_component xc = {
1870 			.type   = i,
1871 			.size   = xstate_sizes[i],
1872 			.offset = xstate_offsets[i],
1873 			/* reserved for future use */
1874 			.flags  = 0,
1875 		};
1876 
1877 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1878 			return 0;
1879 
1880 		num_records++;
1881 	}
1882 	return num_records;
1883 }
1884 
1885 static u32 get_xsave_desc_size(void)
1886 {
1887 	u32 cnt = 0;
1888 	u32 i;
1889 
1890 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1891 		cnt++;
1892 
1893 	return cnt * (sizeof(struct x86_xfeat_component));
1894 }
1895 
1896 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1897 {
1898 	int num_records = 0;
1899 	struct elf_note en;
1900 
1901 	if (!fpu_user_cfg.max_features)
1902 		return 0;
1903 
1904 	en.n_namesz = sizeof(owner_name);
1905 	en.n_descsz = get_xsave_desc_size();
1906 	en.n_type = NT_X86_XSAVE_LAYOUT;
1907 
1908 	if (!dump_emit(cprm, &en, sizeof(en)))
1909 		return 1;
1910 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1911 		return 1;
1912 	if (!dump_align(cprm, 4))
1913 		return 1;
1914 
1915 	num_records = dump_xsave_layout_desc(cprm);
1916 	if (!num_records)
1917 		return 1;
1918 
1919 	/* Total size must equal the number of records times the record size */
1920 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1921 		return 1;
1922 
1923 	return 0;
1924 }
1925 
1926 int elf_coredump_extra_notes_size(void)
1927 {
1928 	int size;
1929 
1930 	if (!fpu_user_cfg.max_features)
1931 		return 0;
1932 
1933 	/* .note header */
1934 	size  = sizeof(struct elf_note);
1935 	/*  Name plus alignment to 4 bytes */
1936 	size += roundup(sizeof(owner_name), 4);
1937 	size += get_xsave_desc_size();
1938 
1939 	return size;
1940 }
1941 #endif /* CONFIG_COREDUMP */
1942