xref: /linux/arch/x86/kernel/cpu/resctrl/rdtgroup.c (revision e814f3fd16acfb7f9966773953de8f740a1e3202)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * User interface for Resource Allocation in Resource Director Technology(RDT)
4  *
5  * Copyright (C) 2016 Intel Corporation
6  *
7  * Author: Fenghua Yu <fenghua.yu@intel.com>
8  *
9  * More information about RDT can be found in the Intel (R) x86 Architecture
10  * Software Developer Manual.
11  */
12 
13 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
14 
15 #include <linux/cpu.h>
16 #include <linux/debugfs.h>
17 #include <linux/fs.h>
18 #include <linux/fs_parser.h>
19 #include <linux/sysfs.h>
20 #include <linux/kernfs.h>
21 #include <linux/seq_buf.h>
22 #include <linux/seq_file.h>
23 #include <linux/sched/signal.h>
24 #include <linux/sched/task.h>
25 #include <linux/slab.h>
26 #include <linux/task_work.h>
27 #include <linux/user_namespace.h>
28 
29 #include <uapi/linux/magic.h>
30 
31 #include <asm/resctrl.h>
32 #include "internal.h"
33 
34 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
35 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
36 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
37 
38 /* Mutex to protect rdtgroup access. */
39 DEFINE_MUTEX(rdtgroup_mutex);
40 
41 static struct kernfs_root *rdt_root;
42 struct rdtgroup rdtgroup_default;
43 LIST_HEAD(rdt_all_groups);
44 
45 /* list of entries for the schemata file */
46 LIST_HEAD(resctrl_schema_all);
47 
48 /* The filesystem can only be mounted once. */
49 bool resctrl_mounted;
50 
51 /* Kernel fs node for "info" directory under root */
52 static struct kernfs_node *kn_info;
53 
54 /* Kernel fs node for "mon_groups" directory under root */
55 static struct kernfs_node *kn_mongrp;
56 
57 /* Kernel fs node for "mon_data" directory under root */
58 static struct kernfs_node *kn_mondata;
59 
60 static struct seq_buf last_cmd_status;
61 static char last_cmd_status_buf[512];
62 
63 static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
64 static void rdtgroup_destroy_root(void);
65 
66 struct dentry *debugfs_resctrl;
67 
68 /*
69  * Memory bandwidth monitoring event to use for the default CTRL_MON group
70  * and each new CTRL_MON group created by the user.  Only relevant when
71  * the filesystem is mounted with the "mba_MBps" option so it does not
72  * matter that it remains uninitialized on systems that do not support
73  * the "mba_MBps" option.
74  */
75 enum resctrl_event_id mba_mbps_default_event;
76 
77 static bool resctrl_debug;
78 
79 void rdt_last_cmd_clear(void)
80 {
81 	lockdep_assert_held(&rdtgroup_mutex);
82 	seq_buf_clear(&last_cmd_status);
83 }
84 
85 void rdt_last_cmd_puts(const char *s)
86 {
87 	lockdep_assert_held(&rdtgroup_mutex);
88 	seq_buf_puts(&last_cmd_status, s);
89 }
90 
91 void rdt_last_cmd_printf(const char *fmt, ...)
92 {
93 	va_list ap;
94 
95 	va_start(ap, fmt);
96 	lockdep_assert_held(&rdtgroup_mutex);
97 	seq_buf_vprintf(&last_cmd_status, fmt, ap);
98 	va_end(ap);
99 }
100 
101 void rdt_staged_configs_clear(void)
102 {
103 	struct rdt_ctrl_domain *dom;
104 	struct rdt_resource *r;
105 
106 	lockdep_assert_held(&rdtgroup_mutex);
107 
108 	for_each_alloc_capable_rdt_resource(r) {
109 		list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
110 			memset(dom->staged_config, 0, sizeof(dom->staged_config));
111 	}
112 }
113 
114 /*
115  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
116  * we can keep a bitmap of free CLOSIDs in a single integer.
117  *
118  * Using a global CLOSID across all resources has some advantages and
119  * some drawbacks:
120  * + We can simply set current's closid to assign a task to a resource
121  *   group.
122  * + Context switch code can avoid extra memory references when deciding which
123  *   CLOSID to load into the PQR_ASSOC MSR.
124  * - We give up some options in configuring resource groups across multi-socket
125  *   systems.
126  * - Our choices on how to configure each resource become progressively more
127  *   limited as the number of resources grows.
128  */
129 static unsigned long closid_free_map;
130 static int closid_free_map_len;
131 
132 int closids_supported(void)
133 {
134 	return closid_free_map_len;
135 }
136 
137 static void closid_init(void)
138 {
139 	struct resctrl_schema *s;
140 	u32 rdt_min_closid = 32;
141 
142 	/* Compute rdt_min_closid across all resources */
143 	list_for_each_entry(s, &resctrl_schema_all, list)
144 		rdt_min_closid = min(rdt_min_closid, s->num_closid);
145 
146 	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
147 
148 	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
149 	__clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
150 	closid_free_map_len = rdt_min_closid;
151 }
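/*
 * Editor's illustration (not part of the kernel source): a worked example
 * of the CLOSID bitmap set up by closid_init() above. If the smallest
 * schema reports 16 CLOSIDs then, assuming RESCTRL_RESERVED_CLOSID is 0:
 *
 *	closid_free_map = BIT_MASK(16) - 1;	// 0xffff: CLOSIDs 0-15 free
 *	__clear_bit(0, &closid_free_map);	// 0xfffe: CLOSID 0 reserved
 *	closid_free_map_len = 16;		// closids_supported() == 16
 *
 * A later closid_alloc() (without CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)
 * then returns ffs(0xfffe) - 1 == 1 and clears bit 1, leaving 0xfffc.
 */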
152 
153 static int closid_alloc(void)
154 {
155 	int cleanest_closid;
156 	u32 closid;
157 
158 	lockdep_assert_held(&rdtgroup_mutex);
159 
160 	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
161 		cleanest_closid = resctrl_find_cleanest_closid();
162 		if (cleanest_closid < 0)
163 			return cleanest_closid;
164 		closid = cleanest_closid;
165 	} else {
166 		closid = ffs(closid_free_map);
167 		if (closid == 0)
168 			return -ENOSPC;
169 		closid--;
170 	}
171 	__clear_bit(closid, &closid_free_map);
172 
173 	return closid;
174 }
175 
176 void closid_free(int closid)
177 {
178 	lockdep_assert_held(&rdtgroup_mutex);
179 
180 	__set_bit(closid, &closid_free_map);
181 }
182 
183 /**
184  * closid_allocated - test if provided closid is in use
185  * @closid: closid to be tested
186  *
187  * Return: true if @closid is currently associated with a resource group,
188  * false if @closid is free
189  */
190 bool closid_allocated(unsigned int closid)
191 {
192 	lockdep_assert_held(&rdtgroup_mutex);
193 
194 	return !test_bit(closid, &closid_free_map);
195 }
196 
197 /**
198  * rdtgroup_mode_by_closid - Return mode of resource group with closid
199  * @closid: closid of the resource group
200  *
201  * Each resource group is associated with a @closid. Here the mode
202  * of a resource group can be queried by searching for it using its closid.
203  *
204  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
205  */
206 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
207 {
208 	struct rdtgroup *rdtgrp;
209 
210 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
211 		if (rdtgrp->closid == closid)
212 			return rdtgrp->mode;
213 	}
214 
215 	return RDT_NUM_MODES;
216 }
217 
218 static const char * const rdt_mode_str[] = {
219 	[RDT_MODE_SHAREABLE]		= "shareable",
220 	[RDT_MODE_EXCLUSIVE]		= "exclusive",
221 	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
222 	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
223 };
224 
225 /**
226  * rdtgroup_mode_str - Return the string representation of mode
227  * @mode: the resource group mode as &enum rdtgroup_mode
228  *
229  * Return: string representation of valid mode, "unknown" otherwise
230  */
231 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
232 {
233 	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
234 		return "unknown";
235 
236 	return rdt_mode_str[mode];
237 }
238 
239 /* set uid and gid of rdtgroup dirs and files to that of the creator */
240 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
241 {
242 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
243 				.ia_uid = current_fsuid(),
244 				.ia_gid = current_fsgid(), };
245 
246 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
247 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
248 		return 0;
249 
250 	return kernfs_setattr(kn, &iattr);
251 }
252 
253 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
254 {
255 	struct kernfs_node *kn;
256 	int ret;
257 
258 	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
259 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
260 				  0, rft->kf_ops, rft, NULL, NULL);
261 	if (IS_ERR(kn))
262 		return PTR_ERR(kn);
263 
264 	ret = rdtgroup_kn_set_ugid(kn);
265 	if (ret) {
266 		kernfs_remove(kn);
267 		return ret;
268 	}
269 
270 	return 0;
271 }
272 
273 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
274 {
275 	struct kernfs_open_file *of = m->private;
276 	struct rftype *rft = of->kn->priv;
277 
278 	if (rft->seq_show)
279 		return rft->seq_show(of, m, arg);
280 	return 0;
281 }
282 
283 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
284 				   size_t nbytes, loff_t off)
285 {
286 	struct rftype *rft = of->kn->priv;
287 
288 	if (rft->write)
289 		return rft->write(of, buf, nbytes, off);
290 
291 	return -EINVAL;
292 }
293 
294 static const struct kernfs_ops rdtgroup_kf_single_ops = {
295 	.atomic_write_len	= PAGE_SIZE,
296 	.write			= rdtgroup_file_write,
297 	.seq_show		= rdtgroup_seqfile_show,
298 };
299 
300 static const struct kernfs_ops kf_mondata_ops = {
301 	.atomic_write_len	= PAGE_SIZE,
302 	.seq_show		= rdtgroup_mondata_show,
303 };
304 
305 static bool is_cpu_list(struct kernfs_open_file *of)
306 {
307 	struct rftype *rft = of->kn->priv;
308 
309 	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
310 }
311 
312 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
313 			      struct seq_file *s, void *v)
314 {
315 	struct rdtgroup *rdtgrp;
316 	struct cpumask *mask;
317 	int ret = 0;
318 
319 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
320 
321 	if (rdtgrp) {
322 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
323 			if (!rdtgrp->plr->d) {
324 				rdt_last_cmd_clear();
325 				rdt_last_cmd_puts("Cache domain offline\n");
326 				ret = -ENODEV;
327 			} else {
328 				mask = &rdtgrp->plr->d->hdr.cpu_mask;
329 				seq_printf(s, is_cpu_list(of) ?
330 					   "%*pbl\n" : "%*pb\n",
331 					   cpumask_pr_args(mask));
332 			}
333 		} else {
334 			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
335 				   cpumask_pr_args(&rdtgrp->cpu_mask));
336 		}
337 	} else {
338 		ret = -ENOENT;
339 	}
340 	rdtgroup_kn_unlock(of->kn);
341 
342 	return ret;
343 }
344 
345 /*
346  * This is safe against resctrl_sched_in() called from __switch_to()
347  * because __switch_to() is executed with interrupts disabled. A local call
348  * from update_closid_rmid() is protected against __switch_to() because
349  * preemption is disabled.
350  */
351 static void update_cpu_closid_rmid(void *info)
352 {
353 	struct rdtgroup *r = info;
354 
355 	if (r) {
356 		this_cpu_write(pqr_state.default_closid, r->closid);
357 		this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
358 	}
359 
360 	/*
361 	 * We cannot unconditionally write the MSR because the current
362 	 * executing task might have its own closid selected. Just reuse
363 	 * the context switch code.
364 	 */
365 	resctrl_sched_in(current);
366 }
367 
368 /*
369  * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
370  *
371  * Per task closids/rmids must have been set up before calling this function.
372  */
373 static void
374 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
375 {
376 	on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
377 }
378 
379 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
380 			  cpumask_var_t tmpmask)
381 {
382 	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
383 	struct list_head *head;
384 
385 	/* Check whether cpus belong to parent ctrl group */
386 	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
387 	if (!cpumask_empty(tmpmask)) {
388 		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
389 		return -EINVAL;
390 	}
391 
392 	/* Check whether cpus are dropped from this group */
393 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
394 	if (!cpumask_empty(tmpmask)) {
395 		/* Give any dropped cpus to parent rdtgroup */
396 		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
397 		update_closid_rmid(tmpmask, prgrp);
398 	}
399 
400 	/*
401 	 * If we added cpus, remove them from previous group that owned them
402 	 * and update per-cpu rmid
403 	 */
404 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
405 	if (!cpumask_empty(tmpmask)) {
406 		head = &prgrp->mon.crdtgrp_list;
407 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
408 			if (crgrp == rdtgrp)
409 				continue;
410 			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
411 				       tmpmask);
412 		}
413 		update_closid_rmid(tmpmask, rdtgrp);
414 	}
415 
416 	/* Done pushing/pulling - update this group with new mask */
417 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
418 
419 	return 0;
420 }
421 
422 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
423 {
424 	struct rdtgroup *crgrp;
425 
426 	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
427 	/* update the child mon group masks as well */
428 	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
429 		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
430 }
431 
432 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
433 			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
434 {
435 	struct rdtgroup *r, *crgrp;
436 	struct list_head *head;
437 
438 	/* Check whether cpus are dropped from this group */
439 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
440 	if (!cpumask_empty(tmpmask)) {
441 		/* Can't drop from default group */
442 		if (rdtgrp == &rdtgroup_default) {
443 			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
444 			return -EINVAL;
445 		}
446 
447 		/* Give any dropped cpus to rdtgroup_default */
448 		cpumask_or(&rdtgroup_default.cpu_mask,
449 			   &rdtgroup_default.cpu_mask, tmpmask);
450 		update_closid_rmid(tmpmask, &rdtgroup_default);
451 	}
452 
453 	/*
454 	 * If we added cpus, remove them from previous group and
455 	 * the prev group's child groups that owned them
456 	 * and update per-cpu closid/rmid.
457 	 */
458 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
459 	if (!cpumask_empty(tmpmask)) {
460 		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
461 			if (r == rdtgrp)
462 				continue;
463 			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
464 			if (!cpumask_empty(tmpmask1))
465 				cpumask_rdtgrp_clear(r, tmpmask1);
466 		}
467 		update_closid_rmid(tmpmask, rdtgrp);
468 	}
469 
470 	/* Done pushing/pulling - update this group with new mask */
471 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
472 
473 	/*
474 	 * Clear child mon group masks since there is a new parent mask
475 	 * now and update the rmid for the cpus the child lost.
476 	 */
477 	head = &rdtgrp->mon.crdtgrp_list;
478 	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
479 		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
480 		update_closid_rmid(tmpmask, rdtgrp);
481 		cpumask_clear(&crgrp->cpu_mask);
482 	}
483 
484 	return 0;
485 }
486 
487 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
488 				   char *buf, size_t nbytes, loff_t off)
489 {
490 	cpumask_var_t tmpmask, newmask, tmpmask1;
491 	struct rdtgroup *rdtgrp;
492 	int ret;
493 
494 	if (!buf)
495 		return -EINVAL;
496 
497 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
498 		return -ENOMEM;
499 	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
500 		free_cpumask_var(tmpmask);
501 		return -ENOMEM;
502 	}
503 	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
504 		free_cpumask_var(tmpmask);
505 		free_cpumask_var(newmask);
506 		return -ENOMEM;
507 	}
508 
509 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
510 	if (!rdtgrp) {
511 		ret = -ENOENT;
512 		goto unlock;
513 	}
514 
515 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
516 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
517 		ret = -EINVAL;
518 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
519 		goto unlock;
520 	}
521 
522 	if (is_cpu_list(of))
523 		ret = cpulist_parse(buf, newmask);
524 	else
525 		ret = cpumask_parse(buf, newmask);
526 
527 	if (ret) {
528 		rdt_last_cmd_puts("Bad CPU list/mask\n");
529 		goto unlock;
530 	}
531 
532 	/* check that user didn't specify any offline cpus */
533 	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
534 	if (!cpumask_empty(tmpmask)) {
535 		ret = -EINVAL;
536 		rdt_last_cmd_puts("Can only assign online CPUs\n");
537 		goto unlock;
538 	}
539 
540 	if (rdtgrp->type == RDTCTRL_GROUP)
541 		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
542 	else if (rdtgrp->type == RDTMON_GROUP)
543 		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
544 	else
545 		ret = -EINVAL;
546 
547 unlock:
548 	rdtgroup_kn_unlock(of->kn);
549 	free_cpumask_var(tmpmask);
550 	free_cpumask_var(newmask);
551 	free_cpumask_var(tmpmask1);
552 
553 	return ret ?: nbytes;
554 }
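/*
 * Editor's illustration (not part of the kernel source): the two files
 * handled above differ only in input format. "cpus" is parsed with
 * cpumask_parse() and takes a hex mask, while "cpus_list" is parsed with
 * cpulist_parse() and takes a range list, so assigning CPUs 0-3 and 8 to
 * a group can be written either as the string "10f" to "cpus" or as
 * "0-3,8" to "cpus_list". Any offline CPU in the new mask is rejected
 * with -EINVAL by the cpu_online_mask check above.
 */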
555 
556 /**
557  * rdtgroup_remove - the helper to remove a resource group safely
558  * @rdtgrp: resource group to remove
559  *
560  * On resource group creation via a mkdir, an extra kernfs_node reference is
561  * taken to ensure that the rdtgroup structure remains accessible for the
562  * rdtgroup_kn_unlock() calls where it is removed.
563  *
564  * Drop the extra reference here, then free the rdtgroup structure.
565  *
566  * Return: void
567  */
568 static void rdtgroup_remove(struct rdtgroup *rdtgrp)
569 {
570 	kernfs_put(rdtgrp->kn);
571 	kfree(rdtgrp);
572 }
573 
574 static void _update_task_closid_rmid(void *task)
575 {
576 	/*
577 	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
578 	 * Otherwise, the MSR is updated when the task is scheduled in.
579 	 */
580 	if (task == current)
581 		resctrl_sched_in(task);
582 }
583 
584 static void update_task_closid_rmid(struct task_struct *t)
585 {
586 	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
587 		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
588 	else
589 		_update_task_closid_rmid(t);
590 }
591 
592 static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
593 {
594 	u32 closid, rmid = rdtgrp->mon.rmid;
595 
596 	if (rdtgrp->type == RDTCTRL_GROUP)
597 		closid = rdtgrp->closid;
598 	else if (rdtgrp->type == RDTMON_GROUP)
599 		closid = rdtgrp->mon.parent->closid;
600 	else
601 		return false;
602 
603 	return resctrl_arch_match_closid(tsk, closid) &&
604 	       resctrl_arch_match_rmid(tsk, closid, rmid);
605 }
606 
607 static int __rdtgroup_move_task(struct task_struct *tsk,
608 				struct rdtgroup *rdtgrp)
609 {
610 	/* If the task is already in rdtgrp, no need to move the task. */
611 	if (task_in_rdtgroup(tsk, rdtgrp))
612 		return 0;
613 
614 	/*
615 	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
616 	 * updated with them.
617 	 *
618 	 * For ctrl_mon groups, move both closid and rmid.
619 	 * For monitor groups, tasks can only be moved from
620 	 * their parent CTRL group.
621 	 */
622 	if (rdtgrp->type == RDTMON_GROUP &&
623 	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
624 		rdt_last_cmd_puts("Can't move task to different control group\n");
625 		return -EINVAL;
626 	}
627 
628 	if (rdtgrp->type == RDTMON_GROUP)
629 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
630 					     rdtgrp->mon.rmid);
631 	else
632 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
633 					     rdtgrp->mon.rmid);
634 
635 	/*
636 	 * Ensure the task's closid and rmid are written before determining if
637 	 * the task is current, which decides whether it will be interrupted.
638 	 * This pairs with the full barrier between the rq->curr update and
639 	 * resctrl_sched_in() during context switch.
640 	 */
641 	smp_mb();
642 
643 	/*
644 	 * By now, the task's closid and rmid are set. If the task is current
645 	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
646 	 * group go into effect. If the task is not current, the MSR will be
647 	 * updated when the task is scheduled in.
648 	 */
649 	update_task_closid_rmid(tsk);
650 
651 	return 0;
652 }
653 
654 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
655 {
656 	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
657 		resctrl_arch_match_closid(t, r->closid));
658 }
659 
660 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
661 {
662 	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
663 		resctrl_arch_match_rmid(t, r->mon.parent->closid,
664 					r->mon.rmid));
665 }
666 
667 /**
668  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
669  * @r: Resource group
670  *
671  * Return: 1 if tasks have been assigned to @r, 0 otherwise
672  */
673 int rdtgroup_tasks_assigned(struct rdtgroup *r)
674 {
675 	struct task_struct *p, *t;
676 	int ret = 0;
677 
678 	lockdep_assert_held(&rdtgroup_mutex);
679 
680 	rcu_read_lock();
681 	for_each_process_thread(p, t) {
682 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
683 			ret = 1;
684 			break;
685 		}
686 	}
687 	rcu_read_unlock();
688 
689 	return ret;
690 }
691 
692 static int rdtgroup_task_write_permission(struct task_struct *task,
693 					  struct kernfs_open_file *of)
694 {
695 	const struct cred *tcred = get_task_cred(task);
696 	const struct cred *cred = current_cred();
697 	int ret = 0;
698 
699 	/*
700 	 * Even if we're attaching all tasks in the thread group, we only
701 	 * need to check permissions on one of them.
702 	 */
703 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
704 	    !uid_eq(cred->euid, tcred->uid) &&
705 	    !uid_eq(cred->euid, tcred->suid)) {
706 		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
707 		ret = -EPERM;
708 	}
709 
710 	put_cred(tcred);
711 	return ret;
712 }
713 
714 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
715 			      struct kernfs_open_file *of)
716 {
717 	struct task_struct *tsk;
718 	int ret;
719 
720 	rcu_read_lock();
721 	if (pid) {
722 		tsk = find_task_by_vpid(pid);
723 		if (!tsk) {
724 			rcu_read_unlock();
725 			rdt_last_cmd_printf("No task %d\n", pid);
726 			return -ESRCH;
727 		}
728 	} else {
729 		tsk = current;
730 	}
731 
732 	get_task_struct(tsk);
733 	rcu_read_unlock();
734 
735 	ret = rdtgroup_task_write_permission(tsk, of);
736 	if (!ret)
737 		ret = __rdtgroup_move_task(tsk, rdtgrp);
738 
739 	put_task_struct(tsk);
740 	return ret;
741 }
742 
743 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
744 				    char *buf, size_t nbytes, loff_t off)
745 {
746 	struct rdtgroup *rdtgrp;
747 	char *pid_str;
748 	int ret = 0;
749 	pid_t pid;
750 
751 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
752 	if (!rdtgrp) {
753 		rdtgroup_kn_unlock(of->kn);
754 		return -ENOENT;
755 	}
756 	rdt_last_cmd_clear();
757 
758 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
759 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
760 		ret = -EINVAL;
761 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
762 		goto unlock;
763 	}
764 
765 	while (buf && buf[0] != '\0' && buf[0] != '\n') {
766 		pid_str = strim(strsep(&buf, ","));
767 
768 		if (kstrtoint(pid_str, 0, &pid)) {
769 			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
770 			ret = -EINVAL;
771 			break;
772 		}
773 
774 		if (pid < 0) {
775 			rdt_last_cmd_printf("Invalid pid %d\n", pid);
776 			ret = -EINVAL;
777 			break;
778 		}
779 
780 		ret = rdtgroup_move_task(pid, rdtgrp, of);
781 		if (ret) {
782 			rdt_last_cmd_printf("Error while processing task %d\n", pid);
783 			break;
784 		}
785 	}
786 
787 unlock:
788 	rdtgroup_kn_unlock(of->kn);
789 
790 	return ret ?: nbytes;
791 }
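/*
 * Editor's illustration (not part of the kernel source): the "tasks" file
 * written above accepts one or more comma-separated PIDs per write, e.g.
 * the strings "1234" or "1234,5678". Parsing stops at the first PID that
 * cannot be moved and the reason is reported via "info/last_cmd_status".
 */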
792 
793 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
794 {
795 	struct task_struct *p, *t;
796 	pid_t pid;
797 
798 	rcu_read_lock();
799 	for_each_process_thread(p, t) {
800 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
801 			pid = task_pid_vnr(t);
802 			if (pid)
803 				seq_printf(s, "%d\n", pid);
804 		}
805 	}
806 	rcu_read_unlock();
807 }
808 
809 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
810 			       struct seq_file *s, void *v)
811 {
812 	struct rdtgroup *rdtgrp;
813 	int ret = 0;
814 
815 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
816 	if (rdtgrp)
817 		show_rdt_tasks(rdtgrp, s);
818 	else
819 		ret = -ENOENT;
820 	rdtgroup_kn_unlock(of->kn);
821 
822 	return ret;
823 }
824 
825 static int rdtgroup_closid_show(struct kernfs_open_file *of,
826 				struct seq_file *s, void *v)
827 {
828 	struct rdtgroup *rdtgrp;
829 	int ret = 0;
830 
831 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
832 	if (rdtgrp)
833 		seq_printf(s, "%u\n", rdtgrp->closid);
834 	else
835 		ret = -ENOENT;
836 	rdtgroup_kn_unlock(of->kn);
837 
838 	return ret;
839 }
840 
841 static int rdtgroup_rmid_show(struct kernfs_open_file *of,
842 			      struct seq_file *s, void *v)
843 {
844 	struct rdtgroup *rdtgrp;
845 	int ret = 0;
846 
847 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
848 	if (rdtgrp)
849 		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
850 	else
851 		ret = -ENOENT;
852 	rdtgroup_kn_unlock(of->kn);
853 
854 	return ret;
855 }
856 
857 #ifdef CONFIG_PROC_CPU_RESCTRL
858 
859 /*
860  * A task can only be part of one resctrl control group and of one monitor
861  * group which is associated with that control group.
862  *
863  * 1)   res:
864  *      mon:
865  *
866  *    resctrl is not available.
867  *
868  * 2)   res:/
869  *      mon:
870  *
871  *    Task is part of the root resctrl control group, and it is not associated
872  *    with any monitor group.
873  *
874  * 3)  res:/
875  *     mon:mon0
876  *
877  *    Task is part of the root resctrl control group and monitor group mon0.
878  *
879  * 4)  res:group0
880  *     mon:
881  *
882  *    Task is part of resctrl control group group0, and it is not associated
883  *    with any monitor group.
884  *
885  * 5) res:group0
886  *    mon:mon1
887  *
888  *    Task is part of resctrl control group group0 and monitor group mon1.
889  */
890 int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
891 		      struct pid *pid, struct task_struct *tsk)
892 {
893 	struct rdtgroup *rdtg;
894 	int ret = 0;
895 
896 	mutex_lock(&rdtgroup_mutex);
897 
898 	/* Return empty if resctrl has not been mounted. */
899 	if (!resctrl_mounted) {
900 		seq_puts(s, "res:\nmon:\n");
901 		goto unlock;
902 	}
903 
904 	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
905 		struct rdtgroup *crg;
906 
907 		/*
908 		 * Task information is only relevant for shareable
909 		 * and exclusive groups.
910 		 */
911 		if (rdtg->mode != RDT_MODE_SHAREABLE &&
912 		    rdtg->mode != RDT_MODE_EXCLUSIVE)
913 			continue;
914 
915 		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
916 			continue;
917 
918 		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
919 			   rdtg->kn->name);
920 		seq_puts(s, "mon:");
921 		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
922 				    mon.crdtgrp_list) {
923 			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
924 						     crg->mon.rmid))
925 				continue;
926 			seq_printf(s, "%s", crg->kn->name);
927 			break;
928 		}
929 		seq_putc(s, '\n');
930 		goto unlock;
931 	}
932 	/*
933 	 * The above search should succeed. Otherwise return
934 	 * with an error.
935 	 */
936 	ret = -ENOENT;
937 unlock:
938 	mutex_unlock(&rdtgroup_mutex);
939 
940 	return ret;
941 }
942 #endif
943 
944 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
945 				    struct seq_file *seq, void *v)
946 {
947 	int len;
948 
949 	mutex_lock(&rdtgroup_mutex);
950 	len = seq_buf_used(&last_cmd_status);
951 	if (len)
952 		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
953 	else
954 		seq_puts(seq, "ok\n");
955 	mutex_unlock(&rdtgroup_mutex);
956 	return 0;
957 }
958 
959 static int rdt_num_closids_show(struct kernfs_open_file *of,
960 				struct seq_file *seq, void *v)
961 {
962 	struct resctrl_schema *s = of->kn->parent->priv;
963 
964 	seq_printf(seq, "%u\n", s->num_closid);
965 	return 0;
966 }
967 
968 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
969 			     struct seq_file *seq, void *v)
970 {
971 	struct resctrl_schema *s = of->kn->parent->priv;
972 	struct rdt_resource *r = s->res;
973 
974 	seq_printf(seq, "%x\n", r->default_ctrl);
975 	return 0;
976 }
977 
978 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
979 			     struct seq_file *seq, void *v)
980 {
981 	struct resctrl_schema *s = of->kn->parent->priv;
982 	struct rdt_resource *r = s->res;
983 
984 	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
985 	return 0;
986 }
987 
988 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
989 				   struct seq_file *seq, void *v)
990 {
991 	struct resctrl_schema *s = of->kn->parent->priv;
992 	struct rdt_resource *r = s->res;
993 
994 	seq_printf(seq, "%x\n", r->cache.shareable_bits);
995 	return 0;
996 }
997 
998 /*
999  * rdt_bit_usage_show - Display current usage of resources
1000  *
1001  * A domain is a shared resource that can now be allocated differently. Here
1002  * we display the current regions of the domain as an annotated bitmask.
1003  * For each domain of this resource its allocation bitmask
1004  * is annotated as below to indicate the current usage of the corresponding bit:
1005  *   0 - currently unused
1006  *   X - currently available for sharing and used by software and hardware
1007  *   H - currently used by hardware only but available for software use
1008  *   S - currently used and shareable by software only
1009  *   E - currently used exclusively by one resource group
1010  *   P - currently pseudo-locked by one resource group
1011  *   P - currently pseudo-locked by one resource group
 */
1012 static int rdt_bit_usage_show(struct kernfs_open_file *of,
1013 			      struct seq_file *seq, void *v)
1014 {
1015 	struct resctrl_schema *s = of->kn->parent->priv;
1016 	/*
1017 	 * Use unsigned long even though only 32 bits are used to ensure
1018 	 * test_bit() is used safely.
1019 	 */
1020 	unsigned long sw_shareable = 0, hw_shareable = 0;
1021 	unsigned long exclusive = 0, pseudo_locked = 0;
1022 	struct rdt_resource *r = s->res;
1023 	struct rdt_ctrl_domain *dom;
1024 	int i, hwb, swb, excl, psl;
1025 	enum rdtgrp_mode mode;
1026 	bool sep = false;
1027 	u32 ctrl_val;
1028 
1029 	cpus_read_lock();
1030 	mutex_lock(&rdtgroup_mutex);
1031 	hw_shareable = r->cache.shareable_bits;
1032 	list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
1033 		if (sep)
1034 			seq_putc(seq, ';');
1035 		sw_shareable = 0;
1036 		exclusive = 0;
1037 		seq_printf(seq, "%d=", dom->hdr.id);
1038 		for (i = 0; i < closids_supported(); i++) {
1039 			if (!closid_allocated(i))
1040 				continue;
1041 			ctrl_val = resctrl_arch_get_config(r, dom, i,
1042 							   s->conf_type);
1043 			mode = rdtgroup_mode_by_closid(i);
1044 			switch (mode) {
1045 			case RDT_MODE_SHAREABLE:
1046 				sw_shareable |= ctrl_val;
1047 				break;
1048 			case RDT_MODE_EXCLUSIVE:
1049 				exclusive |= ctrl_val;
1050 				break;
1051 			case RDT_MODE_PSEUDO_LOCKSETUP:
1052 			/*
1053 			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
1054 			 * here but not included since the CBM
1055 			 * associated with this CLOSID in this mode
1056 			 * is not initialized and no task or cpu can be
1057 			 * assigned this CLOSID.
1058 			 */
1059 				break;
1060 			case RDT_MODE_PSEUDO_LOCKED:
1061 			case RDT_NUM_MODES:
1062 				WARN(1,
1063 				     "invalid mode for closid %d\n", i);
1064 				break;
1065 			}
1066 		}
1067 		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
1068 			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
1069 			hwb = test_bit(i, &hw_shareable);
1070 			swb = test_bit(i, &sw_shareable);
1071 			excl = test_bit(i, &exclusive);
1072 			psl = test_bit(i, &pseudo_locked);
1073 			if (hwb && swb)
1074 				seq_putc(seq, 'X');
1075 			else if (hwb && !swb)
1076 				seq_putc(seq, 'H');
1077 			else if (!hwb && swb)
1078 				seq_putc(seq, 'S');
1079 			else if (excl)
1080 				seq_putc(seq, 'E');
1081 			else if (psl)
1082 				seq_putc(seq, 'P');
1083 			else /* Unused bits remain */
1084 				seq_putc(seq, '0');
1085 		}
1086 		sep = true;
1087 	}
1088 	seq_putc(seq, '\n');
1089 	mutex_unlock(&rdtgroup_mutex);
1090 	cpus_read_unlock();
1091 	return 0;
1092 }
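/*
 * Editor's illustration (not part of the kernel source): rdt_bit_usage_show()
 * above emits one annotated string per cache domain, most significant CBM
 * bit first, with domains separated by ';'. A hypothetical two-domain L3
 * with a 16-bit CBM might read:
 *
 *	0=SSSSSSSSSSSSEEEE;1=SSSSSSSSSSSSSSSS
 *
 * meaning the low four bits of domain 0 are claimed by an exclusive group
 * while every other bit is shareable, software-used capacity.
 */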
1093 
1094 static int rdt_min_bw_show(struct kernfs_open_file *of,
1095 			     struct seq_file *seq, void *v)
1096 {
1097 	struct resctrl_schema *s = of->kn->parent->priv;
1098 	struct rdt_resource *r = s->res;
1099 
1100 	seq_printf(seq, "%u\n", r->membw.min_bw);
1101 	return 0;
1102 }
1103 
1104 static int rdt_num_rmids_show(struct kernfs_open_file *of,
1105 			      struct seq_file *seq, void *v)
1106 {
1107 	struct rdt_resource *r = of->kn->parent->priv;
1108 
1109 	seq_printf(seq, "%d\n", r->num_rmid);
1110 
1111 	return 0;
1112 }
1113 
1114 static int rdt_mon_features_show(struct kernfs_open_file *of,
1115 				 struct seq_file *seq, void *v)
1116 {
1117 	struct rdt_resource *r = of->kn->parent->priv;
1118 	struct mon_evt *mevt;
1119 
1120 	list_for_each_entry(mevt, &r->evt_list, list) {
1121 		seq_printf(seq, "%s\n", mevt->name);
1122 		if (mevt->configurable)
1123 			seq_printf(seq, "%s_config\n", mevt->name);
1124 	}
1125 
1126 	return 0;
1127 }
1128 
1129 static int rdt_bw_gran_show(struct kernfs_open_file *of,
1130 			     struct seq_file *seq, void *v)
1131 {
1132 	struct resctrl_schema *s = of->kn->parent->priv;
1133 	struct rdt_resource *r = s->res;
1134 
1135 	seq_printf(seq, "%u\n", r->membw.bw_gran);
1136 	return 0;
1137 }
1138 
1139 static int rdt_delay_linear_show(struct kernfs_open_file *of,
1140 			     struct seq_file *seq, void *v)
1141 {
1142 	struct resctrl_schema *s = of->kn->parent->priv;
1143 	struct rdt_resource *r = s->res;
1144 
1145 	seq_printf(seq, "%u\n", r->membw.delay_linear);
1146 	return 0;
1147 }
1148 
1149 static int max_threshold_occ_show(struct kernfs_open_file *of,
1150 				  struct seq_file *seq, void *v)
1151 {
1152 	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
1153 
1154 	return 0;
1155 }
1156 
1157 static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
1158 					 struct seq_file *seq, void *v)
1159 {
1160 	struct resctrl_schema *s = of->kn->parent->priv;
1161 	struct rdt_resource *r = s->res;
1162 
1163 	if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
1164 		seq_puts(seq, "per-thread\n");
1165 	else
1166 		seq_puts(seq, "max\n");
1167 
1168 	return 0;
1169 }
1170 
1171 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
1172 				       char *buf, size_t nbytes, loff_t off)
1173 {
1174 	unsigned int bytes;
1175 	int ret;
1176 
1177 	ret = kstrtouint(buf, 0, &bytes);
1178 	if (ret)
1179 		return ret;
1180 
1181 	if (bytes > resctrl_rmid_realloc_limit)
1182 		return -EINVAL;
1183 
1184 	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
1185 
1186 	return nbytes;
1187 }
1188 
1189 /*
1190  * rdtgroup_mode_show - Display mode of this resource group
1191  */
1192 static int rdtgroup_mode_show(struct kernfs_open_file *of,
1193 			      struct seq_file *s, void *v)
1194 {
1195 	struct rdtgroup *rdtgrp;
1196 
1197 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1198 	if (!rdtgrp) {
1199 		rdtgroup_kn_unlock(of->kn);
1200 		return -ENOENT;
1201 	}
1202 
1203 	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1204 
1205 	rdtgroup_kn_unlock(of->kn);
1206 	return 0;
1207 }
1208 
1209 static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
1210 {
1211 	switch (my_type) {
1212 	case CDP_CODE:
1213 		return CDP_DATA;
1214 	case CDP_DATA:
1215 		return CDP_CODE;
1216 	default:
1217 	case CDP_NONE:
1218 		return CDP_NONE;
1219 	}
1220 }
1221 
1222 static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
1223 					struct seq_file *seq, void *v)
1224 {
1225 	struct resctrl_schema *s = of->kn->parent->priv;
1226 	struct rdt_resource *r = s->res;
1227 
1228 	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
1229 
1230 	return 0;
1231 }
1232 
1233 /**
1234  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1235  * @r: Resource to which domain instance @d belongs.
1236  * @d: The domain instance for which @closid is being tested.
1237  * @cbm: Capacity bitmask being tested.
1238  * @closid: Intended closid for @cbm.
1239  * @type: CDP type of @r.
1240  * @exclusive: Only check if overlaps with exclusive resource groups
1241  *
1242  * Checks if provided @cbm intended to be used for @closid on domain
1243  * @d overlaps with any other closids or other hardware usage associated
1244  * with this domain. If @exclusive is true then only overlaps with
1245  * resource groups in exclusive mode will be considered. If @exclusive
1246  * is false then overlaps with any resource group or hardware entities
1247  * will be considered.
1248  *
1249  * @cbm is unsigned long, even if only 32 bits are used, to make the
1250  * bitmap functions work correctly.
1251  *
1252  * Return: false if CBM does not overlap, true if it does.
1253  */
1254 static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1255 				    unsigned long cbm, int closid,
1256 				    enum resctrl_conf_type type, bool exclusive)
1257 {
1258 	enum rdtgrp_mode mode;
1259 	unsigned long ctrl_b;
1260 	int i;
1261 
1262 	/* Check for any overlap with regions used by hardware directly */
1263 	if (!exclusive) {
1264 		ctrl_b = r->cache.shareable_bits;
1265 		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1266 			return true;
1267 	}
1268 
1269 	/* Check for overlap with other resource groups */
1270 	for (i = 0; i < closids_supported(); i++) {
1271 		ctrl_b = resctrl_arch_get_config(r, d, i, type);
1272 		mode = rdtgroup_mode_by_closid(i);
1273 		if (closid_allocated(i) && i != closid &&
1274 		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1275 			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1276 				if (exclusive) {
1277 					if (mode == RDT_MODE_EXCLUSIVE)
1278 						return true;
1279 					continue;
1280 				}
1281 				return true;
1282 			}
1283 		}
1284 	}
1285 
1286 	return false;
1287 }
1288 
1289 /**
1290  * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1291  * @s: Schema for the resource to which domain instance @d belongs.
1292  * @d: The domain instance for which @closid is being tested.
1293  * @cbm: Capacity bitmask being tested.
1294  * @closid: Intended closid for @cbm.
1295  * @exclusive: Only check if overlaps with exclusive resource groups
1296  *
1297  * Resources that can be allocated using a CBM can use the CBM to control
1298  * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
1299  * for overlap. The overlap test is not limited to the specific resource for
1300  * which the CBM is intended though - when dealing with CDP resources that
1301  * share the underlying hardware, the overlap check should also be performed
1302  * on the CDP resource sharing the hardware.
1303  *
1304  * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1305  * overlap test.
1306  *
1307  * Return: true if CBM overlap detected, false if there is no overlap
1308  */
1309 bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
1310 			   unsigned long cbm, int closid, bool exclusive)
1311 {
1312 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
1313 	struct rdt_resource *r = s->res;
1314 
1315 	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
1316 				    exclusive))
1317 		return true;
1318 
1319 	if (!resctrl_arch_get_cdp_enabled(r->rid))
1320 		return false;
1321 	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
1322 }
1323 
1324 /**
1325  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1326  * @rdtgrp: Resource group identified through its closid.
1327  *
1328  * An exclusive resource group implies that there should be no sharing of
1329  * its allocated resources. At the time this group is considered to be
1330  * exclusive this test can determine if its current schemata supports this
1331  * setting by testing for overlap with all other resource groups.
1332  *
1333  * Return: true if resource group can be exclusive, false if there is overlap
1334  * with allocations of other resource groups and thus this resource group
1335  * cannot be exclusive.
1336  */
1337 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1338 {
1339 	int closid = rdtgrp->closid;
1340 	struct rdt_ctrl_domain *d;
1341 	struct resctrl_schema *s;
1342 	struct rdt_resource *r;
1343 	bool has_cache = false;
1344 	u32 ctrl;
1345 
1346 	/* Walking r->domains, ensure it can't race with cpuhp */
1347 	lockdep_assert_cpus_held();
1348 
1349 	list_for_each_entry(s, &resctrl_schema_all, list) {
1350 		r = s->res;
1351 		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
1352 			continue;
1353 		has_cache = true;
1354 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1355 			ctrl = resctrl_arch_get_config(r, d, closid,
1356 						       s->conf_type);
1357 			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
1358 				rdt_last_cmd_puts("Schemata overlaps\n");
1359 				return false;
1360 			}
1361 		}
1362 	}
1363 
1364 	if (!has_cache) {
1365 		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1366 		return false;
1367 	}
1368 
1369 	return true;
1370 }
1371 
1372 /*
1373  * rdtgroup_mode_write - Modify the resource group's mode
1374  */
1375 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1376 				   char *buf, size_t nbytes, loff_t off)
1377 {
1378 	struct rdtgroup *rdtgrp;
1379 	enum rdtgrp_mode mode;
1380 	int ret = 0;
1381 
1382 	/* Valid input requires a trailing newline */
1383 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1384 		return -EINVAL;
1385 	buf[nbytes - 1] = '\0';
1386 
1387 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1388 	if (!rdtgrp) {
1389 		rdtgroup_kn_unlock(of->kn);
1390 		return -ENOENT;
1391 	}
1392 
1393 	rdt_last_cmd_clear();
1394 
1395 	mode = rdtgrp->mode;
1396 
1397 	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1398 	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1399 	    (!strcmp(buf, "pseudo-locksetup") &&
1400 	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1401 	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1402 		goto out;
1403 
1404 	if (mode == RDT_MODE_PSEUDO_LOCKED) {
1405 		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1406 		ret = -EINVAL;
1407 		goto out;
1408 	}
1409 
1410 	if (!strcmp(buf, "shareable")) {
1411 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1412 			ret = rdtgroup_locksetup_exit(rdtgrp);
1413 			if (ret)
1414 				goto out;
1415 		}
1416 		rdtgrp->mode = RDT_MODE_SHAREABLE;
1417 	} else if (!strcmp(buf, "exclusive")) {
1418 		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1419 			ret = -EINVAL;
1420 			goto out;
1421 		}
1422 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1423 			ret = rdtgroup_locksetup_exit(rdtgrp);
1424 			if (ret)
1425 				goto out;
1426 		}
1427 		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1428 	} else if (!strcmp(buf, "pseudo-locksetup")) {
1429 		ret = rdtgroup_locksetup_enter(rdtgrp);
1430 		if (ret)
1431 			goto out;
1432 		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1433 	} else {
1434 		rdt_last_cmd_puts("Unknown or unsupported mode\n");
1435 		ret = -EINVAL;
1436 	}
1437 
1438 out:
1439 	rdtgroup_kn_unlock(of->kn);
1440 	return ret ?: nbytes;
1441 }
1442 
1443 /**
1444  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1445  * @r: RDT resource to which @d belongs.
1446  * @d: RDT domain instance.
1447  * @cbm: bitmask for which the size should be computed.
1448  *
1449  * The bitmask provided, associated with the RDT domain instance @d, will be
1450  * translated into how many bytes it represents. The size in bytes is
1451  * computed by first dividing the total cache size by the CBM length to
1452  * determine how many bytes each bit in the bitmask represents. The result
1453  * is multiplied with the number of bits set in the bitmask.
1454  *
1455  * @cbm is unsigned long, even if only 32 bits are used, to make the
1456  * bitmap functions work correctly.
1457  */
1458 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1459 				  struct rdt_ctrl_domain *d, unsigned long cbm)
1460 {
1461 	unsigned int size = 0;
1462 	struct cacheinfo *ci;
1463 	int num_b;
1464 
1465 	if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
1466 		return size;
1467 
1468 	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1469 	ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
1470 	if (ci)
1471 		size = ci->size / r->cache.cbm_len * num_b;
1472 
1473 	return size;
1474 }
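/*
 * Editor's illustration with made-up numbers (not part of the kernel
 * source): for a 32 MB L3 cache (ci->size == 33554432) and a 16 bit CBM,
 * each CBM bit represents 33554432 / 16 == 2097152 bytes, so a bitmask of
 * 0x00ff (8 bits set) is reported by rdtgroup_cbm_to_size() as
 * 8 * 2097152 == 16777216 bytes.
 */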
1475 
1476 /*
1477  * rdtgroup_size_show - Display size in bytes of allocated regions
1478  *
1479  * The "size" file mirrors the layout of the "schemata" file, printing the
1480  * size in bytes of each region instead of the capacity bitmask.
1481  */
1482 static int rdtgroup_size_show(struct kernfs_open_file *of,
1483 			      struct seq_file *s, void *v)
1484 {
1485 	struct resctrl_schema *schema;
1486 	enum resctrl_conf_type type;
1487 	struct rdt_ctrl_domain *d;
1488 	struct rdtgroup *rdtgrp;
1489 	struct rdt_resource *r;
1490 	unsigned int size;
1491 	int ret = 0;
1492 	u32 closid;
1493 	bool sep;
1494 	u32 ctrl;
1495 
1496 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1497 	if (!rdtgrp) {
1498 		rdtgroup_kn_unlock(of->kn);
1499 		return -ENOENT;
1500 	}
1501 
1502 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1503 		if (!rdtgrp->plr->d) {
1504 			rdt_last_cmd_clear();
1505 			rdt_last_cmd_puts("Cache domain offline\n");
1506 			ret = -ENODEV;
1507 		} else {
1508 			seq_printf(s, "%*s:", max_name_width,
1509 				   rdtgrp->plr->s->name);
1510 			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
1511 						    rdtgrp->plr->d,
1512 						    rdtgrp->plr->cbm);
1513 			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
1514 		}
1515 		goto out;
1516 	}
1517 
1518 	closid = rdtgrp->closid;
1519 
1520 	list_for_each_entry(schema, &resctrl_schema_all, list) {
1521 		r = schema->res;
1522 		type = schema->conf_type;
1523 		sep = false;
1524 		seq_printf(s, "%*s:", max_name_width, schema->name);
1525 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1526 			if (sep)
1527 				seq_putc(s, ';');
1528 			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1529 				size = 0;
1530 			} else {
1531 				if (is_mba_sc(r))
1532 					ctrl = d->mbps_val[closid];
1533 				else
1534 					ctrl = resctrl_arch_get_config(r, d,
1535 								       closid,
1536 								       type);
1537 				if (r->rid == RDT_RESOURCE_MBA ||
1538 				    r->rid == RDT_RESOURCE_SMBA)
1539 					size = ctrl;
1540 				else
1541 					size = rdtgroup_cbm_to_size(r, d, ctrl);
1542 			}
1543 			seq_printf(s, "%d=%u", d->hdr.id, size);
1544 			sep = true;
1545 		}
1546 		seq_putc(s, '\n');
1547 	}
1548 
1549 out:
1550 	rdtgroup_kn_unlock(of->kn);
1551 
1552 	return ret;
1553 }
1554 
1555 struct mon_config_info {
1556 	u32 evtid;
1557 	u32 mon_config;
1558 };
1559 
1560 #define INVALID_CONFIG_INDEX   UINT_MAX
1561 
1562 /**
1563  * mon_event_config_index_get - get the hardware index for the
1564  *                              configurable event
1565  * @evtid: event id.
1566  *
1567  * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
1568  *         1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
1569  *         INVALID_CONFIG_INDEX for invalid evtid
1570  */
1571 static inline unsigned int mon_event_config_index_get(u32 evtid)
1572 {
1573 	switch (evtid) {
1574 	case QOS_L3_MBM_TOTAL_EVENT_ID:
1575 		return 0;
1576 	case QOS_L3_MBM_LOCAL_EVENT_ID:
1577 		return 1;
1578 	default:
1579 		/* Should never reach here */
1580 		return INVALID_CONFIG_INDEX;
1581 	}
1582 }
1583 
1584 static void mon_event_config_read(void *info)
1585 {
1586 	struct mon_config_info *mon_info = info;
1587 	unsigned int index;
1588 	u64 msrval;
1589 
1590 	index = mon_event_config_index_get(mon_info->evtid);
1591 	if (index == INVALID_CONFIG_INDEX) {
1592 		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
1593 		return;
1594 	}
1595 	rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
1596 
1597 	/* Report only the valid event configuration bits */
1598 	mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
1599 }
1600 
1601 static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
1602 {
1603 	smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
1604 }
1605 
1606 static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
1607 {
1608 	struct mon_config_info mon_info;
1609 	struct rdt_mon_domain *dom;
1610 	bool sep = false;
1611 
1612 	cpus_read_lock();
1613 	mutex_lock(&rdtgroup_mutex);
1614 
1615 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
1616 		if (sep)
1617 			seq_puts(s, ";");
1618 
1619 		memset(&mon_info, 0, sizeof(struct mon_config_info));
1620 		mon_info.evtid = evtid;
1621 		mondata_config_read(dom, &mon_info);
1622 
1623 		seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
1624 		sep = true;
1625 	}
1626 	seq_puts(s, "\n");
1627 
1628 	mutex_unlock(&rdtgroup_mutex);
1629 	cpus_read_unlock();
1630 
1631 	return 0;
1632 }
1633 
1634 static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
1635 				       struct seq_file *seq, void *v)
1636 {
1637 	struct rdt_resource *r = of->kn->parent->priv;
1638 
1639 	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
1640 
1641 	return 0;
1642 }
1643 
1644 static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
1645 				       struct seq_file *seq, void *v)
1646 {
1647 	struct rdt_resource *r = of->kn->parent->priv;
1648 
1649 	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
1650 
1651 	return 0;
1652 }
1653 
1654 static void mon_event_config_write(void *info)
1655 {
1656 	struct mon_config_info *mon_info = info;
1657 	unsigned int index;
1658 
1659 	index = mon_event_config_index_get(mon_info->evtid);
1660 	if (index == INVALID_CONFIG_INDEX) {
1661 		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
1662 		return;
1663 	}
1664 	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
1665 }
1666 
1667 static void mbm_config_write_domain(struct rdt_resource *r,
1668 				    struct rdt_mon_domain *d, u32 evtid, u32 val)
1669 {
1670 	struct mon_config_info mon_info = {0};
1671 
1672 	/*
1673 	 * Read the current config value first. If it already matches the new
1674 	 * value then there is no need to write it again.
1675 	 */
1676 	mon_info.evtid = evtid;
1677 	mondata_config_read(d, &mon_info);
1678 	if (mon_info.mon_config == val)
1679 		return;
1680 
1681 	mon_info.mon_config = val;
1682 
1683 	/*
1684 	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
1685 	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
1686 	 * domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
1687 	 * on one CPU is observed by all the CPUs in the domain.
1688 	 */
1689 	smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
1690 			      &mon_info, 1);
1691 
1692 	/*
1693 	 * When an Event Configuration is changed, the bandwidth counters
1694 	 * for all RMIDs and Events will be cleared by the hardware. The
1695 	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
1696 	 * every RMID on the next read to any event for every RMID.
1697 	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
1698 	 * cleared while it is tracked by the hardware. Clear the
1699 	 * mbm_local and mbm_total counts for all the RMIDs.
1700 	 */
1701 	resctrl_arch_reset_rmid_all(r, d);
1702 }
1703 
1704 static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
1705 {
1706 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
1707 	char *dom_str = NULL, *id_str;
1708 	unsigned long dom_id, val;
1709 	struct rdt_mon_domain *d;
1710 
1711 	/* Walking r->domains, ensure it can't race with cpuhp */
1712 	lockdep_assert_cpus_held();
1713 
1714 next:
1715 	if (!tok || tok[0] == '\0')
1716 		return 0;
1717 
1718 	/* Start processing the strings for each domain */
1719 	dom_str = strim(strsep(&tok, ";"));
1720 	id_str = strsep(&dom_str, "=");
1721 
1722 	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
1723 		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
1724 		return -EINVAL;
1725 	}
1726 
1727 	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
1728 		rdt_last_cmd_puts("Non-numeric event configuration value\n");
1729 		return -EINVAL;
1730 	}
1731 
1732 	/* Value from user cannot be more than the supported set of events */
1733 	if ((val & hw_res->mbm_cfg_mask) != val) {
1734 		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
1735 				    hw_res->mbm_cfg_mask);
1736 		return -EINVAL;
1737 	}
1738 
1739 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
1740 		if (d->hdr.id == dom_id) {
1741 			mbm_config_write_domain(r, d, evtid, val);
1742 			goto next;
1743 		}
1744 	}
1745 
1746 	return -EINVAL;
1747 }
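/*
 * Editor's illustration (not part of the kernel source): mon_config_write()
 * above expects a ';'-separated list of "<decimal domain id>=<hex config>"
 * tokens, so a write of "0=0x30;1=0x30" to mbm_total_bytes_config would
 * apply event configuration 0x30 to monitor domains 0 and 1, provided the
 * value does not exceed hw_res->mbm_cfg_mask.
 */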
1748 
1749 static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
1750 					    char *buf, size_t nbytes,
1751 					    loff_t off)
1752 {
1753 	struct rdt_resource *r = of->kn->parent->priv;
1754 	int ret;
1755 
1756 	/* Valid input requires a trailing newline */
1757 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1758 		return -EINVAL;
1759 
1760 	cpus_read_lock();
1761 	mutex_lock(&rdtgroup_mutex);
1762 
1763 	rdt_last_cmd_clear();
1764 
1765 	buf[nbytes - 1] = '\0';
1766 
1767 	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
1768 
1769 	mutex_unlock(&rdtgroup_mutex);
1770 	cpus_read_unlock();
1771 
1772 	return ret ?: nbytes;
1773 }
1774 
1775 static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
1776 					    char *buf, size_t nbytes,
1777 					    loff_t off)
1778 {
1779 	struct rdt_resource *r = of->kn->parent->priv;
1780 	int ret;
1781 
1782 	/* Valid input requires a trailing newline */
1783 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1784 		return -EINVAL;
1785 
1786 	cpus_read_lock();
1787 	mutex_lock(&rdtgroup_mutex);
1788 
1789 	rdt_last_cmd_clear();
1790 
1791 	buf[nbytes - 1] = '\0';
1792 
1793 	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
1794 
1795 	mutex_unlock(&rdtgroup_mutex);
1796 	cpus_read_unlock();
1797 
1798 	return ret ?: nbytes;
1799 }
1800 
1801 /* rdtgroup information files for one cache resource. */
1802 static struct rftype res_common_files[] = {
1803 	{
1804 		.name		= "last_cmd_status",
1805 		.mode		= 0444,
1806 		.kf_ops		= &rdtgroup_kf_single_ops,
1807 		.seq_show	= rdt_last_cmd_status_show,
1808 		.fflags		= RFTYPE_TOP_INFO,
1809 	},
1810 	{
1811 		.name		= "num_closids",
1812 		.mode		= 0444,
1813 		.kf_ops		= &rdtgroup_kf_single_ops,
1814 		.seq_show	= rdt_num_closids_show,
1815 		.fflags		= RFTYPE_CTRL_INFO,
1816 	},
1817 	{
1818 		.name		= "mon_features",
1819 		.mode		= 0444,
1820 		.kf_ops		= &rdtgroup_kf_single_ops,
1821 		.seq_show	= rdt_mon_features_show,
1822 		.fflags		= RFTYPE_MON_INFO,
1823 	},
1824 	{
1825 		.name		= "num_rmids",
1826 		.mode		= 0444,
1827 		.kf_ops		= &rdtgroup_kf_single_ops,
1828 		.seq_show	= rdt_num_rmids_show,
1829 		.fflags		= RFTYPE_MON_INFO,
1830 	},
1831 	{
1832 		.name		= "cbm_mask",
1833 		.mode		= 0444,
1834 		.kf_ops		= &rdtgroup_kf_single_ops,
1835 		.seq_show	= rdt_default_ctrl_show,
1836 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1837 	},
1838 	{
1839 		.name		= "min_cbm_bits",
1840 		.mode		= 0444,
1841 		.kf_ops		= &rdtgroup_kf_single_ops,
1842 		.seq_show	= rdt_min_cbm_bits_show,
1843 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1844 	},
1845 	{
1846 		.name		= "shareable_bits",
1847 		.mode		= 0444,
1848 		.kf_ops		= &rdtgroup_kf_single_ops,
1849 		.seq_show	= rdt_shareable_bits_show,
1850 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1851 	},
1852 	{
1853 		.name		= "bit_usage",
1854 		.mode		= 0444,
1855 		.kf_ops		= &rdtgroup_kf_single_ops,
1856 		.seq_show	= rdt_bit_usage_show,
1857 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1858 	},
1859 	{
1860 		.name		= "min_bandwidth",
1861 		.mode		= 0444,
1862 		.kf_ops		= &rdtgroup_kf_single_ops,
1863 		.seq_show	= rdt_min_bw_show,
1864 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1865 	},
1866 	{
1867 		.name		= "bandwidth_gran",
1868 		.mode		= 0444,
1869 		.kf_ops		= &rdtgroup_kf_single_ops,
1870 		.seq_show	= rdt_bw_gran_show,
1871 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1872 	},
1873 	{
1874 		.name		= "delay_linear",
1875 		.mode		= 0444,
1876 		.kf_ops		= &rdtgroup_kf_single_ops,
1877 		.seq_show	= rdt_delay_linear_show,
1878 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1879 	},
1880 	/*
1881 	 * It is platform specific which (if any) capabilities are provided by
1882 	 * thread_throttle_mode. Defer "fflags" initialization to platform
1883 	 * discovery.
1884 	 */
1885 	{
1886 		.name		= "thread_throttle_mode",
1887 		.mode		= 0444,
1888 		.kf_ops		= &rdtgroup_kf_single_ops,
1889 		.seq_show	= rdt_thread_throttle_mode_show,
1890 	},
1891 	{
1892 		.name		= "max_threshold_occupancy",
1893 		.mode		= 0644,
1894 		.kf_ops		= &rdtgroup_kf_single_ops,
1895 		.write		= max_threshold_occ_write,
1896 		.seq_show	= max_threshold_occ_show,
1897 		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
1898 	},
1899 	{
1900 		.name		= "mbm_total_bytes_config",
1901 		.mode		= 0644,
1902 		.kf_ops		= &rdtgroup_kf_single_ops,
1903 		.seq_show	= mbm_total_bytes_config_show,
1904 		.write		= mbm_total_bytes_config_write,
1905 	},
1906 	{
1907 		.name		= "mbm_local_bytes_config",
1908 		.mode		= 0644,
1909 		.kf_ops		= &rdtgroup_kf_single_ops,
1910 		.seq_show	= mbm_local_bytes_config_show,
1911 		.write		= mbm_local_bytes_config_write,
1912 	},
1913 	{
1914 		.name		= "cpus",
1915 		.mode		= 0644,
1916 		.kf_ops		= &rdtgroup_kf_single_ops,
1917 		.write		= rdtgroup_cpus_write,
1918 		.seq_show	= rdtgroup_cpus_show,
1919 		.fflags		= RFTYPE_BASE,
1920 	},
1921 	{
1922 		.name		= "cpus_list",
1923 		.mode		= 0644,
1924 		.kf_ops		= &rdtgroup_kf_single_ops,
1925 		.write		= rdtgroup_cpus_write,
1926 		.seq_show	= rdtgroup_cpus_show,
1927 		.flags		= RFTYPE_FLAGS_CPUS_LIST,
1928 		.fflags		= RFTYPE_BASE,
1929 	},
1930 	{
1931 		.name		= "tasks",
1932 		.mode		= 0644,
1933 		.kf_ops		= &rdtgroup_kf_single_ops,
1934 		.write		= rdtgroup_tasks_write,
1935 		.seq_show	= rdtgroup_tasks_show,
1936 		.fflags		= RFTYPE_BASE,
1937 	},
1938 	{
1939 		.name		= "mon_hw_id",
1940 		.mode		= 0444,
1941 		.kf_ops		= &rdtgroup_kf_single_ops,
1942 		.seq_show	= rdtgroup_rmid_show,
1943 		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
1944 	},
1945 	{
1946 		.name		= "schemata",
1947 		.mode		= 0644,
1948 		.kf_ops		= &rdtgroup_kf_single_ops,
1949 		.write		= rdtgroup_schemata_write,
1950 		.seq_show	= rdtgroup_schemata_show,
1951 		.fflags		= RFTYPE_CTRL_BASE,
1952 	},
1953 	{
1954 		.name		= "mba_MBps_event",
1955 		.mode		= 0644,
1956 		.kf_ops		= &rdtgroup_kf_single_ops,
1957 		.write		= rdtgroup_mba_mbps_event_write,
1958 		.seq_show	= rdtgroup_mba_mbps_event_show,
1959 	},
1960 	{
1961 		.name		= "mode",
1962 		.mode		= 0644,
1963 		.kf_ops		= &rdtgroup_kf_single_ops,
1964 		.write		= rdtgroup_mode_write,
1965 		.seq_show	= rdtgroup_mode_show,
1966 		.fflags		= RFTYPE_CTRL_BASE,
1967 	},
1968 	{
1969 		.name		= "size",
1970 		.mode		= 0444,
1971 		.kf_ops		= &rdtgroup_kf_single_ops,
1972 		.seq_show	= rdtgroup_size_show,
1973 		.fflags		= RFTYPE_CTRL_BASE,
1974 	},
1975 	{
1976 		.name		= "sparse_masks",
1977 		.mode		= 0444,
1978 		.kf_ops		= &rdtgroup_kf_single_ops,
1979 		.seq_show	= rdt_has_sparse_bitmasks_show,
1980 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1981 	},
1982 	{
1983 		.name		= "ctrl_hw_id",
1984 		.mode		= 0444,
1985 		.kf_ops		= &rdtgroup_kf_single_ops,
1986 		.seq_show	= rdtgroup_closid_show,
1987 		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
1988 	},
1989 
1990 };
1991 
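/*
 * Create under @kn every file in res_common_files[] whose "fflags" are
 * all contained in @fflags. On failure the files added so far are
 * removed again.
 */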
1992 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1993 {
1994 	struct rftype *rfts, *rft;
1995 	int ret, len;
1996 
1997 	rfts = res_common_files;
1998 	len = ARRAY_SIZE(res_common_files);
1999 
2000 	lockdep_assert_held(&rdtgroup_mutex);
2001 
2002 	if (resctrl_debug)
2003 		fflags |= RFTYPE_DEBUG;
2004 
2005 	for (rft = rfts; rft < rfts + len; rft++) {
2006 		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
2007 			ret = rdtgroup_add_file(kn, rft);
2008 			if (ret)
2009 				goto error;
2010 		}
2011 	}
2012 
2013 	return 0;
2014 error:
2015 	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
2016 	while (--rft >= rfts) {
2017 		if ((fflags & rft->fflags) == rft->fflags)
2018 			kernfs_remove_by_name(kn, rft->name);
2019 	}
2020 	return ret;
2021 }
2022 
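/* Look up an entry in res_common_files[] by file name. */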
2023 static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
2024 {
2025 	struct rftype *rfts, *rft;
2026 	int len;
2027 
2028 	rfts = res_common_files;
2029 	len = ARRAY_SIZE(res_common_files);
2030 
2031 	for (rft = rfts; rft < rfts + len; rft++) {
2032 		if (!strcmp(rft->name, name))
2033 			return rft;
2034 	}
2035 
2036 	return NULL;
2037 }
2038 
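/**
 * resctrl_file_fflags_init - Set the "fflags" of a resctrl file
 * @config: Name of the file in res_common_files[]
 * @fflags: Flags selecting the directories in which the file is created
 */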
2039 void resctrl_file_fflags_init(const char *config, unsigned long fflags)
2040 {
2041 	struct rftype *rft;
2042 
2043 	rft = rdtgroup_get_rftype_by_name(config);
2044 	if (rft)
2045 		rft->fflags = fflags;
2046 }
2047 
2048 /**
2049  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
2050  * @r: The resource group with which the file is associated.
2051  * @name: Name of the file
2052  *
2053  * The permissions of the named resctrl file, directory, or link are modified
2054  * to not allow read, write, or execute by any user.
2055  *
2056  * WARNING: This function is intended to communicate to the user that the
2057  * resctrl file has been locked down - that it is not relevant to the
2058  * particular state the system finds itself in. It should not be relied
2059  * on to protect from user access because after the file's permissions
2060  * are restricted the user can still change the permissions using chmod
2061  * from the command line.
2062  *
2063  * Return: 0 on success, <0 on failure.
2064  */
2065 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
2066 {
2067 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2068 	struct kernfs_node *kn;
2069 	int ret = 0;
2070 
2071 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2072 	if (!kn)
2073 		return -ENOENT;
2074 
2075 	switch (kernfs_type(kn)) {
2076 	case KERNFS_DIR:
2077 		iattr.ia_mode = S_IFDIR;
2078 		break;
2079 	case KERNFS_FILE:
2080 		iattr.ia_mode = S_IFREG;
2081 		break;
2082 	case KERNFS_LINK:
2083 		iattr.ia_mode = S_IFLNK;
2084 		break;
2085 	}
2086 
2087 	ret = kernfs_setattr(kn, &iattr);
2088 	kernfs_put(kn);
2089 	return ret;
2090 }
2091 
2092 /**
2093  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
2094  * @r: The resource group with which the file is associated.
2095  * @name: Name of the file
2096  * @mask: Mask of permissions that should be restored
2097  *
2098  * Restore the permissions of the named file. If @name is a directory the
2099  * permissions of its parent will be used.
2100  *
2101  * Return: 0 on success, <0 on failure.
2102  */
2103 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
2104 			     umode_t mask)
2105 {
2106 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2107 	struct kernfs_node *kn, *parent;
2108 	struct rftype *rfts, *rft;
2109 	int ret, len;
2110 
2111 	rfts = res_common_files;
2112 	len = ARRAY_SIZE(res_common_files);
2113 
2114 	for (rft = rfts; rft < rfts + len; rft++) {
2115 		if (!strcmp(rft->name, name))
2116 			iattr.ia_mode = rft->mode & mask;
2117 	}
2118 
2119 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2120 	if (!kn)
2121 		return -ENOENT;
2122 
2123 	switch (kernfs_type(kn)) {
2124 	case KERNFS_DIR:
2125 		parent = kernfs_get_parent(kn);
2126 		if (parent) {
2127 			iattr.ia_mode |= parent->mode;
2128 			kernfs_put(parent);
2129 		}
2130 		iattr.ia_mode |= S_IFDIR;
2131 		break;
2132 	case KERNFS_FILE:
2133 		iattr.ia_mode |= S_IFREG;
2134 		break;
2135 	case KERNFS_LINK:
2136 		iattr.ia_mode |= S_IFLNK;
2137 		break;
2138 	}
2139 
2140 	ret = kernfs_setattr(kn, &iattr);
2141 	kernfs_put(kn);
2142 	return ret;
2143 }
2144 
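/*
 * Create one subdirectory of the "info" directory and populate it with
 * the files matching @fflags.
 */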
2145 static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
2146 				      unsigned long fflags)
2147 {
2148 	struct kernfs_node *kn_subdir;
2149 	int ret;
2150 
2151 	kn_subdir = kernfs_create_dir(kn_info, name,
2152 				      kn_info->mode, priv);
2153 	if (IS_ERR(kn_subdir))
2154 		return PTR_ERR(kn_subdir);
2155 
2156 	ret = rdtgroup_kn_set_ugid(kn_subdir);
2157 	if (ret)
2158 		return ret;
2159 
2160 	ret = rdtgroup_add_files(kn_subdir, fflags);
2161 	if (!ret)
2162 		kernfs_activate(kn_subdir);
2163 
2164 	return ret;
2165 }
2166 
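/*
 * Create the "info" directory: one subdirectory per schema of the
 * alloc capable resources and one "<resource>_MON" subdirectory per
 * mon capable resource.
 */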
2167 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
2168 {
2169 	struct resctrl_schema *s;
2170 	struct rdt_resource *r;
2171 	unsigned long fflags;
2172 	char name[32];
2173 	int ret;
2174 
2175 	/* create the directory */
2176 	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
2177 	if (IS_ERR(kn_info))
2178 		return PTR_ERR(kn_info);
2179 
2180 	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
2181 	if (ret)
2182 		goto out_destroy;
2183 
2184 	/* loop over enabled controls, these are all alloc_capable */
2185 	list_for_each_entry(s, &resctrl_schema_all, list) {
2186 		r = s->res;
2187 		fflags = r->fflags | RFTYPE_CTRL_INFO;
2188 		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
2189 		if (ret)
2190 			goto out_destroy;
2191 	}
2192 
2193 	for_each_mon_capable_rdt_resource(r) {
2194 		fflags = r->fflags | RFTYPE_MON_INFO;
2195 		sprintf(name, "%s_MON", r->name);
2196 		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
2197 		if (ret)
2198 			goto out_destroy;
2199 	}
2200 
2201 	ret = rdtgroup_kn_set_ugid(kn_info);
2202 	if (ret)
2203 		goto out_destroy;
2204 
2205 	kernfs_activate(kn_info);
2206 
2207 	return 0;
2208 
2209 out_destroy:
2210 	kernfs_remove(kn_info);
2211 	return ret;
2212 }
2213 
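/*
 * Create an empty directory @name under @parent_kn with @prgrp as its
 * private data and, if requested, return the new node via @dest_kn.
 */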
2214 static int
2215 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
2216 		    char *name, struct kernfs_node **dest_kn)
2217 {
2218 	struct kernfs_node *kn;
2219 	int ret;
2220 
2221 	/* create the directory */
2222 	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2223 	if (IS_ERR(kn))
2224 		return PTR_ERR(kn);
2225 
2226 	if (dest_kn)
2227 		*dest_kn = kn;
2228 
2229 	ret = rdtgroup_kn_set_ugid(kn);
2230 	if (ret)
2231 		goto out_destroy;
2232 
2233 	kernfs_activate(kn);
2234 
2235 	return 0;
2236 
2237 out_destroy:
2238 	kernfs_remove(kn);
2239 	return ret;
2240 }
2241 
2242 static void l3_qos_cfg_update(void *arg)
2243 {
2244 	bool *enable = arg;
2245 
2246 	wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
2247 }
2248 
2249 static void l2_qos_cfg_update(void *arg)
2250 {
2251 	bool *enable = arg;
2252 
2253 	wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
2254 }
2255 
2256 static inline bool is_mba_linear(void)
2257 {
2258 	return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
2259 }
2260 
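/*
 * Write the L2 or L3 QOS_CFG MSR on the CPUs of every domain of the
 * resource selected by @level to enable or disable CDP.
 */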
2261 static int set_cache_qos_cfg(int level, bool enable)
2262 {
2263 	void (*update)(void *arg);
2264 	struct rdt_ctrl_domain *d;
2265 	struct rdt_resource *r_l;
2266 	cpumask_var_t cpu_mask;
2267 	int cpu;
2268 
2269 	/* Walking r->domains, ensure it can't race with cpuhp */
2270 	lockdep_assert_cpus_held();
2271 
2272 	if (level == RDT_RESOURCE_L3)
2273 		update = l3_qos_cfg_update;
2274 	else if (level == RDT_RESOURCE_L2)
2275 		update = l2_qos_cfg_update;
2276 	else
2277 		return -EINVAL;
2278 
2279 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2280 		return -ENOMEM;
2281 
2282 	r_l = &rdt_resources_all[level].r_resctrl;
2283 	list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
2284 		if (r_l->cache.arch_has_per_cpu_cfg)
2285 			/* Pick all the CPUs in the domain instance */
2286 			for_each_cpu(cpu, &d->hdr.cpu_mask)
2287 				cpumask_set_cpu(cpu, cpu_mask);
2288 		else
2289 			/* Pick one CPU from each domain instance to update MSR */
2290 			cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
2291 	}
2292 
2293 	/* Update QOS_CFG MSR on all the CPUs in cpu_mask */
2294 	on_each_cpu_mask(cpu_mask, update, &enable, 1);
2295 
2296 	free_cpumask_var(cpu_mask);
2297 
2298 	return 0;
2299 }
2300 
2301 /* Restore the qos cfg state when a domain comes online */
2302 void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
2303 {
2304 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
2305 
2306 	if (!r->cdp_capable)
2307 		return;
2308 
2309 	if (r->rid == RDT_RESOURCE_L2)
2310 		l2_qos_cfg_update(&hw_res->cdp_enabled);
2311 
2312 	if (r->rid == RDT_RESOURCE_L3)
2313 		l3_qos_cfg_update(&hw_res->cdp_enabled);
2314 }
2315 
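/*
 * Allocate the per-CLOSID bandwidth values used by the MBA software
 * controller for domain @d, all initialized to MBA_MAX_MBPS.
 */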
2316 static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
2317 {
2318 	u32 num_closid = resctrl_arch_get_num_closid(r);
2319 	int cpu = cpumask_any(&d->hdr.cpu_mask);
2320 	int i;
2321 
2322 	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
2323 				   GFP_KERNEL, cpu_to_node(cpu));
2324 	if (!d->mbps_val)
2325 		return -ENOMEM;
2326 
2327 	for (i = 0; i < num_closid; i++)
2328 		d->mbps_val[i] = MBA_MAX_MBPS;
2329 
2330 	return 0;
2331 }
2332 
2333 static void mba_sc_domain_destroy(struct rdt_resource *r,
2334 				  struct rdt_ctrl_domain *d)
2335 {
2336 	kfree(d->mbps_val);
2337 	d->mbps_val = NULL;
2338 }
2339 
2340 /*
2341  * The MBA software controller is supported only if
2342  * MBM is supported, MBA is in linear scale,
2343  * and the MBM monitoring scope is the same as the
2344  * MBA control scope.
2345  */
2346 static bool supports_mba_mbps(void)
2347 {
2348 	struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
2349 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
2350 
2351 	return (is_mbm_enabled() &&
2352 		r->alloc_capable && is_mba_linear() &&
2353 		r->ctrl_scope == rmbm->mon_scope);
2354 }
2355 
2356 /*
2357  * Enable or disable the MBA software controller
2358  * which helps the user specify bandwidth in MBps.
2359  */
2360 static int set_mba_sc(bool mba_sc)
2361 {
2362 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
2363 	u32 num_closid = resctrl_arch_get_num_closid(r);
2364 	struct rdt_ctrl_domain *d;
2365 	unsigned long fflags;
2366 	int i;
2367 
2368 	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
2369 		return -EINVAL;
2370 
2371 	r->membw.mba_sc = mba_sc;
2372 
2373 	rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
2374 
2375 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2376 		for (i = 0; i < num_closid; i++)
2377 			d->mbps_val[i] = MBA_MAX_MBPS;
2378 	}
2379 
2380 	fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
2381 	resctrl_file_fflags_init("mba_MBps_event", fflags);
2382 
2383 	return 0;
2384 }
2385 
2386 static int cdp_enable(int level)
2387 {
2388 	struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
2389 	int ret;
2390 
2391 	if (!r_l->alloc_capable)
2392 		return -EINVAL;
2393 
2394 	ret = set_cache_qos_cfg(level, true);
2395 	if (!ret)
2396 		rdt_resources_all[level].cdp_enabled = true;
2397 
2398 	return ret;
2399 }
2400 
2401 static void cdp_disable(int level)
2402 {
2403 	struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
2404 
2405 	if (r_hw->cdp_enabled) {
2406 		set_cache_qos_cfg(level, false);
2407 		r_hw->cdp_enabled = false;
2408 	}
2409 }
2410 
2411 int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
2412 {
2413 	struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
2414 
2415 	if (!hw_res->r_resctrl.cdp_capable)
2416 		return -EINVAL;
2417 
2418 	if (enable)
2419 		return cdp_enable(l);
2420 
2421 	cdp_disable(l);
2422 
2423 	return 0;
2424 }
2425 
2426 /*
2427  * We don't allow rdtgroup directories to be created anywhere
2428  * except the root directory. Thus when looking for the rdtgroup
2429  * structure for a kernfs node we are either looking at a directory,
2430  * in which case the rdtgroup structure is pointed at by the "priv"
2431  * field, or at a file, in which case we need only look to the parent
2432  * to find the rdtgroup.
2433  */
2434 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2435 {
2436 	if (kernfs_type(kn) == KERNFS_DIR) {
2437 		/*
2438 		 * All the resource directories use "kn->priv"
2439 		 * to point to the "struct rdtgroup" for the
2440 		 * resource. "info" and its subdirectories don't
2441 		 * have rdtgroup structures, so return NULL here.
2442 		 */
2443 		if (kn == kn_info || kn->parent == kn_info)
2444 			return NULL;
2445 		else
2446 			return kn->priv;
2447 	} else {
2448 		return kn->parent->priv;
2449 	}
2450 }
2451 
2452 static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2453 {
2454 	atomic_inc(&rdtgrp->waitcount);
2455 	kernfs_break_active_protection(kn);
2456 }
2457 
2458 static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2459 {
2460 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2461 	    (rdtgrp->flags & RDT_DELETED)) {
2462 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2463 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2464 			rdtgroup_pseudo_lock_remove(rdtgrp);
2465 		kernfs_unbreak_active_protection(kn);
2466 		rdtgroup_remove(rdtgrp);
2467 	} else {
2468 		kernfs_unbreak_active_protection(kn);
2469 	}
2470 }
2471 
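/*
 * Pin the resource group behind @kn and acquire cpus_read_lock() and
 * rdtgroup_mutex. Returns NULL if @kn has no resource group or if the
 * group was deleted while waiting for the locks. Paired with
 * rdtgroup_kn_unlock().
 */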
2472 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2473 {
2474 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2475 
2476 	if (!rdtgrp)
2477 		return NULL;
2478 
2479 	rdtgroup_kn_get(rdtgrp, kn);
2480 
2481 	cpus_read_lock();
2482 	mutex_lock(&rdtgroup_mutex);
2483 
2484 	/* Was this group deleted while we waited? */
2485 	if (rdtgrp->flags & RDT_DELETED)
2486 		return NULL;
2487 
2488 	return rdtgrp;
2489 }
2490 
2491 void rdtgroup_kn_unlock(struct kernfs_node *kn)
2492 {
2493 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2494 
2495 	if (!rdtgrp)
2496 		return;
2497 
2498 	mutex_unlock(&rdtgroup_mutex);
2499 	cpus_read_unlock();
2500 
2501 	rdtgroup_kn_put(rdtgrp, kn);
2502 }
2503 
2504 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2505 			     struct rdtgroup *prgrp,
2506 			     struct kernfs_node **mon_data_kn);
2507 
2508 static void rdt_disable_ctx(void)
2509 {
2510 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2511 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2512 	set_mba_sc(false);
2513 
2514 	resctrl_debug = false;
2515 }
2516 
2517 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2518 {
2519 	int ret = 0;
2520 
2521 	if (ctx->enable_cdpl2) {
2522 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
2523 		if (ret)
2524 			goto out_done;
2525 	}
2526 
2527 	if (ctx->enable_cdpl3) {
2528 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
2529 		if (ret)
2530 			goto out_cdpl2;
2531 	}
2532 
2533 	if (ctx->enable_mba_mbps) {
2534 		ret = set_mba_sc(true);
2535 		if (ret)
2536 			goto out_cdpl3;
2537 	}
2538 
2539 	if (ctx->enable_debug)
2540 		resctrl_debug = true;
2541 
2542 	return 0;
2543 
2544 out_cdpl3:
2545 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2546 out_cdpl2:
2547 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2548 out_done:
2549 	return ret;
2550 }
2551 
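/*
 * Add one entry to the schemata list for resource @r with
 * configuration type @type, using the resource name plus a
 * "CODE"/"DATA" suffix when CDP is in use.
 */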
2552 static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
2553 {
2554 	struct resctrl_schema *s;
2555 	const char *suffix = "";
2556 	int ret, cl;
2557 
2558 	s = kzalloc(sizeof(*s), GFP_KERNEL);
2559 	if (!s)
2560 		return -ENOMEM;
2561 
2562 	s->res = r;
2563 	s->num_closid = resctrl_arch_get_num_closid(r);
2564 	if (resctrl_arch_get_cdp_enabled(r->rid))
2565 		s->num_closid /= 2;
2566 
2567 	s->conf_type = type;
2568 	switch (type) {
2569 	case CDP_CODE:
2570 		suffix = "CODE";
2571 		break;
2572 	case CDP_DATA:
2573 		suffix = "DATA";
2574 		break;
2575 	case CDP_NONE:
2576 		suffix = "";
2577 		break;
2578 	}
2579 
2580 	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
2581 	if (ret >= sizeof(s->name)) {
2582 		kfree(s);
2583 		return -EINVAL;
2584 	}
2585 
2586 	cl = strlen(s->name);
2587 
2588 	/*
2589 	 * If CDP is supported by this resource, but not enabled,
2590 	 * account for the suffix width. This ensures the tabular format of the
2591 	 * schemata file does not change between mounts of the filesystem.
2592 	 */
2593 	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
2594 		cl += 4;
2595 
2596 	if (cl > max_name_width)
2597 		max_name_width = cl;
2598 
2599 	INIT_LIST_HEAD(&s->list);
2600 	list_add(&s->list, &resctrl_schema_all);
2601 
2602 	return 0;
2603 }
2604 
2605 static int schemata_list_create(void)
2606 {
2607 	struct rdt_resource *r;
2608 	int ret = 0;
2609 
2610 	for_each_alloc_capable_rdt_resource(r) {
2611 		if (resctrl_arch_get_cdp_enabled(r->rid)) {
2612 			ret = schemata_list_add(r, CDP_CODE);
2613 			if (ret)
2614 				break;
2615 
2616 			ret = schemata_list_add(r, CDP_DATA);
2617 		} else {
2618 			ret = schemata_list_add(r, CDP_NONE);
2619 		}
2620 
2621 		if (ret)
2622 			break;
2623 	}
2624 
2625 	return ret;
2626 }
2627 
2628 static void schemata_list_destroy(void)
2629 {
2630 	struct resctrl_schema *s, *tmp;
2631 
2632 	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
2633 		list_del(&s->list);
2634 		kfree(s);
2635 	}
2636 }
2637 
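/*
 * Mount the resctrl filesystem: apply the mount options, create the
 * "info" directory and, on monitoring capable systems, the
 * "mon_groups" and "mon_data" hierarchy under the default group, then
 * start the MBM overflow handlers if MBM is enabled.
 */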
2638 static int rdt_get_tree(struct fs_context *fc)
2639 {
2640 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2641 	unsigned long flags = RFTYPE_CTRL_BASE;
2642 	struct rdt_mon_domain *dom;
2643 	struct rdt_resource *r;
2644 	int ret;
2645 
2646 	cpus_read_lock();
2647 	mutex_lock(&rdtgroup_mutex);
2648 	/*
2649 	 * resctrl file system can only be mounted once.
2650 	 */
2651 	if (resctrl_mounted) {
2652 		ret = -EBUSY;
2653 		goto out;
2654 	}
2655 
2656 	ret = rdtgroup_setup_root(ctx);
2657 	if (ret)
2658 		goto out;
2659 
2660 	ret = rdt_enable_ctx(ctx);
2661 	if (ret)
2662 		goto out_root;
2663 
2664 	ret = schemata_list_create();
2665 	if (ret) {
2666 		schemata_list_destroy();
2667 		goto out_ctx;
2668 	}
2669 
2670 	closid_init();
2671 
2672 	if (resctrl_arch_mon_capable())
2673 		flags |= RFTYPE_MON;
2674 
2675 	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
2676 	if (ret)
2677 		goto out_schemata_free;
2678 
2679 	kernfs_activate(rdtgroup_default.kn);
2680 
2681 	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2682 	if (ret < 0)
2683 		goto out_schemata_free;
2684 
2685 	if (resctrl_arch_mon_capable()) {
2686 		ret = mongroup_create_dir(rdtgroup_default.kn,
2687 					  &rdtgroup_default, "mon_groups",
2688 					  &kn_mongrp);
2689 		if (ret < 0)
2690 			goto out_info;
2691 
2692 		ret = mkdir_mondata_all(rdtgroup_default.kn,
2693 					&rdtgroup_default, &kn_mondata);
2694 		if (ret < 0)
2695 			goto out_mongrp;
2696 		rdtgroup_default.mon.mon_data_kn = kn_mondata;
2697 	}
2698 
2699 	ret = rdt_pseudo_lock_init();
2700 	if (ret)
2701 		goto out_mondata;
2702 
2703 	ret = kernfs_get_tree(fc);
2704 	if (ret < 0)
2705 		goto out_psl;
2706 
2707 	if (resctrl_arch_alloc_capable())
2708 		resctrl_arch_enable_alloc();
2709 	if (resctrl_arch_mon_capable())
2710 		resctrl_arch_enable_mon();
2711 
2712 	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
2713 		resctrl_mounted = true;
2714 
2715 	if (is_mbm_enabled()) {
2716 		r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
2717 		list_for_each_entry(dom, &r->mon_domains, hdr.list)
2718 			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
2719 						   RESCTRL_PICK_ANY_CPU);
2720 	}
2721 
2722 	goto out;
2723 
2724 out_psl:
2725 	rdt_pseudo_lock_release();
2726 out_mondata:
2727 	if (resctrl_arch_mon_capable())
2728 		kernfs_remove(kn_mondata);
2729 out_mongrp:
2730 	if (resctrl_arch_mon_capable())
2731 		kernfs_remove(kn_mongrp);
2732 out_info:
2733 	kernfs_remove(kn_info);
2734 out_schemata_free:
2735 	schemata_list_destroy();
2736 out_ctx:
2737 	rdt_disable_ctx();
2738 out_root:
2739 	rdtgroup_destroy_root();
2740 out:
2741 	rdt_last_cmd_clear();
2742 	mutex_unlock(&rdtgroup_mutex);
2743 	cpus_read_unlock();
2744 	return ret;
2745 }
2746 
2747 enum rdt_param {
2748 	Opt_cdp,
2749 	Opt_cdpl2,
2750 	Opt_mba_mbps,
2751 	Opt_debug,
2752 	nr__rdt_params
2753 };
2754 
2755 static const struct fs_parameter_spec rdt_fs_parameters[] = {
2756 	fsparam_flag("cdp",		Opt_cdp),
2757 	fsparam_flag("cdpl2",		Opt_cdpl2),
2758 	fsparam_flag("mba_MBps",	Opt_mba_mbps),
2759 	fsparam_flag("debug",		Opt_debug),
2760 	{}
2761 };
2762 
2763 static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2764 {
2765 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2766 	struct fs_parse_result result;
2767 	const char *msg;
2768 	int opt;
2769 
2770 	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2771 	if (opt < 0)
2772 		return opt;
2773 
2774 	switch (opt) {
2775 	case Opt_cdp:
2776 		ctx->enable_cdpl3 = true;
2777 		return 0;
2778 	case Opt_cdpl2:
2779 		ctx->enable_cdpl2 = true;
2780 		return 0;
2781 	case Opt_mba_mbps:
2782 		msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
2783 		if (!supports_mba_mbps())
2784 			return invalfc(fc, msg);
2785 		ctx->enable_mba_mbps = true;
2786 		return 0;
2787 	case Opt_debug:
2788 		ctx->enable_debug = true;
2789 		return 0;
2790 	}
2791 
2792 	return -EINVAL;
2793 }
2794 
2795 static void rdt_fs_context_free(struct fs_context *fc)
2796 {
2797 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2798 
2799 	kernfs_free_fs_context(fc);
2800 	kfree(ctx);
2801 }
2802 
2803 static const struct fs_context_operations rdt_fs_context_ops = {
2804 	.free		= rdt_fs_context_free,
2805 	.parse_param	= rdt_parse_param,
2806 	.get_tree	= rdt_get_tree,
2807 };
2808 
2809 static int rdt_init_fs_context(struct fs_context *fc)
2810 {
2811 	struct rdt_fs_context *ctx;
2812 
2813 	ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
2814 	if (!ctx)
2815 		return -ENOMEM;
2816 
2817 	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2818 	fc->fs_private = &ctx->kfc;
2819 	fc->ops = &rdt_fs_context_ops;
2820 	put_user_ns(fc->user_ns);
2821 	fc->user_ns = get_user_ns(&init_user_ns);
2822 	fc->global = true;
2823 	return 0;
2824 }
2825 
2826 static int reset_all_ctrls(struct rdt_resource *r)
2827 {
2828 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
2829 	struct rdt_hw_ctrl_domain *hw_dom;
2830 	struct msr_param msr_param;
2831 	struct rdt_ctrl_domain *d;
2832 	int i;
2833 
2834 	/* Walking r->domains, ensure it can't race with cpuhp */
2835 	lockdep_assert_cpus_held();
2836 
2837 	msr_param.res = r;
2838 	msr_param.low = 0;
2839 	msr_param.high = hw_res->num_closid;
2840 
2841 	/*
2842 	 * Disable resource control for this resource by setting all
2843 	 * control values in all ctrl_domains to their defaults. Pick one CPU
2844 	 * from each domain to update the MSRs below.
2845 	 */
2846 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2847 		hw_dom = resctrl_to_arch_ctrl_dom(d);
2848 
2849 		for (i = 0; i < hw_res->num_closid; i++)
2850 			hw_dom->ctrl_val[i] = r->default_ctrl;
2851 		msr_param.dom = d;
2852 		smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
2853 	}
2854 
2855 	return 0;
2856 }
2857 
2858 /*
2859  * Move tasks from one group to the other. If @from is NULL, then all tasks
2860  * in the system are moved unconditionally (used for teardown).
2861  *
2862  * If @mask is not NULL the cpus on which moved tasks are running are set
2863  * in that mask so the update smp function call is restricted to affected
2864  * cpus.
2865  */
2866 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2867 				 struct cpumask *mask)
2868 {
2869 	struct task_struct *p, *t;
2870 
2871 	read_lock(&tasklist_lock);
2872 	for_each_process_thread(p, t) {
2873 		if (!from || is_closid_match(t, from) ||
2874 		    is_rmid_match(t, from)) {
2875 			resctrl_arch_set_closid_rmid(t, to->closid,
2876 						     to->mon.rmid);
2877 
2878 			/*
2879 			 * Order the closid/rmid stores above before the loads
2880 			 * in task_curr(). This pairs with the full barrier
2881 			 * between the rq->curr update and resctrl_sched_in()
2882 			 * during context switch.
2883 			 */
2884 			smp_mb();
2885 
2886 			/*
2887 			 * If the task is on a CPU, set the CPU in the mask.
2888 			 * The detection is inaccurate as tasks might move or
2889 			 * schedule before the smp function call takes place.
2890 			 * In such a case the function call is pointless, but
2891 			 * there is no other side effect.
2892 			 */
2893 			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
2894 				cpumask_set_cpu(task_cpu(t), mask);
2895 		}
2896 	}
2897 	read_unlock(&tasklist_lock);
2898 }
2899 
2900 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2901 {
2902 	struct rdtgroup *sentry, *stmp;
2903 	struct list_head *head;
2904 
2905 	head = &rdtgrp->mon.crdtgrp_list;
2906 	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2907 		free_rmid(sentry->closid, sentry->mon.rmid);
2908 		list_del(&sentry->mon.crdtgrp_list);
2909 
2910 		if (atomic_read(&sentry->waitcount) != 0)
2911 			sentry->flags = RDT_DELETED;
2912 		else
2913 			rdtgroup_remove(sentry);
2914 	}
2915 }
2916 
2917 /*
2918  * Forcibly remove all subdirectories under the root directory.
2919  */
2920 static void rmdir_all_sub(void)
2921 {
2922 	struct rdtgroup *rdtgrp, *tmp;
2923 
2924 	/* Move all tasks to the default resource group */
2925 	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2926 
2927 	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2928 		/* Free any child rmids */
2929 		free_all_child_rdtgrp(rdtgrp);
2930 
2931 		/* Remove each rdtgroup other than root */
2932 		if (rdtgrp == &rdtgroup_default)
2933 			continue;
2934 
2935 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2936 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2937 			rdtgroup_pseudo_lock_remove(rdtgrp);
2938 
2939 		/*
2940 		 * Give any CPUs back to the default group. We cannot copy
2941 		 * cpu_online_mask because a CPU might have executed the
2942 		 * offline callback already, but is still marked online.
2943 		 */
2944 		cpumask_or(&rdtgroup_default.cpu_mask,
2945 			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2946 
2947 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
2948 
2949 		kernfs_remove(rdtgrp->kn);
2950 		list_del(&rdtgrp->rdtgroup_list);
2951 
2952 		if (atomic_read(&rdtgrp->waitcount) != 0)
2953 			rdtgrp->flags = RDT_DELETED;
2954 		else
2955 			rdtgroup_remove(rdtgrp);
2956 	}
2957 	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2958 	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2959 
2960 	kernfs_remove(kn_info);
2961 	kernfs_remove(kn_mongrp);
2962 	kernfs_remove(kn_mondata);
2963 }
2964 
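/* Unmount: reset all controls, remove all groups and disable resctrl. */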
2965 static void rdt_kill_sb(struct super_block *sb)
2966 {
2967 	struct rdt_resource *r;
2968 
2969 	cpus_read_lock();
2970 	mutex_lock(&rdtgroup_mutex);
2971 
2972 	rdt_disable_ctx();
2973 
2974 	/* Put everything back to default values. */
2975 	for_each_alloc_capable_rdt_resource(r)
2976 		reset_all_ctrls(r);
2977 	rmdir_all_sub();
2978 	rdt_pseudo_lock_release();
2979 	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2980 	schemata_list_destroy();
2981 	rdtgroup_destroy_root();
2982 	if (resctrl_arch_alloc_capable())
2983 		resctrl_arch_disable_alloc();
2984 	if (resctrl_arch_mon_capable())
2985 		resctrl_arch_disable_mon();
2986 	resctrl_mounted = false;
2987 	kernfs_kill_sb(sb);
2988 	mutex_unlock(&rdtgroup_mutex);
2989 	cpus_read_unlock();
2990 }
2991 
2992 static struct file_system_type rdt_fs_type = {
2993 	.name			= "resctrl",
2994 	.init_fs_context	= rdt_init_fs_context,
2995 	.parameters		= rdt_fs_parameters,
2996 	.kill_sb		= rdt_kill_sb,
2997 };
2998 
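/* Create a read-only monitoring data file @name under @parent_kn. */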
2999 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
3000 		       void *priv)
3001 {
3002 	struct kernfs_node *kn;
3003 	int ret = 0;
3004 
3005 	kn = __kernfs_create_file(parent_kn, name, 0444,
3006 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
3007 				  &kf_mondata_ops, priv, NULL, NULL);
3008 	if (IS_ERR(kn))
3009 		return PTR_ERR(kn);
3010 
3011 	ret = rdtgroup_kn_set_ugid(kn);
3012 	if (ret) {
3013 		kernfs_remove(kn);
3014 		return ret;
3015 	}
3016 
3017 	return ret;
3018 }
3019 
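/*
 * Remove @subname from the @name directory under @pkn, or remove @name
 * itself when it has no more than one subdirectory left.
 */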
3020 static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
3021 {
3022 	struct kernfs_node *kn;
3023 
3024 	kn = kernfs_find_and_get(pkn, name);
3025 	if (!kn)
3026 		return;
3027 	kernfs_put(kn);
3028 
3029 	if (kn->dir.subdirs <= 1)
3030 		kernfs_remove(kn);
3031 	else
3032 		kernfs_remove_by_name(kn, subname);
3033 }
3034 
3035 /*
3036  * Remove all subdirectories of mon_data of ctrl_mon groups
3037  * and monitor groups for the given domain.
3038  * Remove files and directories containing the "sum" of domain data
3039  * when the last domain being summed is removed.
3040  */
3041 static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3042 					   struct rdt_mon_domain *d)
3043 {
3044 	struct rdtgroup *prgrp, *crgrp;
3045 	char subname[32];
3046 	bool snc_mode;
3047 	char name[32];
3048 
3049 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3050 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3051 	if (snc_mode)
3052 		sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
3053 
3054 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3055 		mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
3056 
3057 		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
3058 			mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
3059 	}
3060 }
3061 
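/*
 * Create one file per monitoring event of @r under @kn. MBM events of
 * a non-summing domain get an initial read.
 */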
3062 static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
3063 			     struct rdt_resource *r, struct rdtgroup *prgrp,
3064 			     bool do_sum)
3065 {
3066 	struct rmid_read rr = {0};
3067 	union mon_data_bits priv;
3068 	struct mon_evt *mevt;
3069 	int ret;
3070 
3071 	if (WARN_ON(list_empty(&r->evt_list)))
3072 		return -EPERM;
3073 
3074 	priv.u.rid = r->rid;
3075 	priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
3076 	priv.u.sum = do_sum;
3077 	list_for_each_entry(mevt, &r->evt_list, list) {
3078 		priv.u.evtid = mevt->evtid;
3079 		ret = mon_addfile(kn, mevt->name, priv.priv);
3080 		if (ret)
3081 			return ret;
3082 
3083 		if (!do_sum && is_mbm_event(mevt->evtid))
3084 			mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
3085 	}
3086 
3087 	return 0;
3088 }
3089 
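/*
 * Create the "mon_<resource>_<id>" directory for domain @d if it does
 * not exist yet and, in SNC mode, a "mon_sub_<resource>_<id>"
 * directory below it, each populated with the monitoring event files.
 */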
3090 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
3091 				struct rdt_mon_domain *d,
3092 				struct rdt_resource *r, struct rdtgroup *prgrp)
3093 {
3094 	struct kernfs_node *kn, *ckn;
3095 	char name[32];
3096 	bool snc_mode;
3097 	int ret = 0;
3098 
3099 	lockdep_assert_held(&rdtgroup_mutex);
3100 
3101 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3102 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3103 	kn = kernfs_find_and_get(parent_kn, name);
3104 	if (kn) {
3105 		/*
3106 		 * rdtgroup_mutex will prevent this directory from being
3107 		 * removed. No need to keep the extra reference.
3108 		 */
3109 		kernfs_put(kn);
3110 	} else {
3111 		kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
3112 		if (IS_ERR(kn))
3113 			return PTR_ERR(kn);
3114 
3115 		ret = rdtgroup_kn_set_ugid(kn);
3116 		if (ret)
3117 			goto out_destroy;
3118 		ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
3119 		if (ret)
3120 			goto out_destroy;
3121 	}
3122 
3123 	if (snc_mode) {
3124 		sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
3125 		ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
3126 		if (IS_ERR(ckn)) {
3127 			ret = -EINVAL;
3128 			goto out_destroy;
3129 		}
3130 
3131 		ret = rdtgroup_kn_set_ugid(ckn);
3132 		if (ret)
3133 			goto out_destroy;
3134 
3135 		ret = mon_add_all_files(ckn, d, r, prgrp, false);
3136 		if (ret)
3137 			goto out_destroy;
3138 	}
3139 
3140 	kernfs_activate(kn);
3141 	return 0;
3142 
3143 out_destroy:
3144 	kernfs_remove(kn);
3145 	return ret;
3146 }
3147 
3148 /*
3149  * Add all subdirectories of mon_data for "ctrl_mon" groups
3150  * and "monitor" groups with the given domain id.
3151  */
3152 static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3153 					   struct rdt_mon_domain *d)
3154 {
3155 	struct kernfs_node *parent_kn;
3156 	struct rdtgroup *prgrp, *crgrp;
3157 	struct list_head *head;
3158 
3159 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3160 		parent_kn = prgrp->mon.mon_data_kn;
3161 		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
3162 
3163 		head = &prgrp->mon.crdtgrp_list;
3164 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
3165 			parent_kn = crgrp->mon.mon_data_kn;
3166 			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
3167 		}
3168 	}
3169 }
3170 
3171 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
3172 				       struct rdt_resource *r,
3173 				       struct rdtgroup *prgrp)
3174 {
3175 	struct rdt_mon_domain *dom;
3176 	int ret;
3177 
3178 	/* Walking r->domains, ensure it can't race with cpuhp */
3179 	lockdep_assert_cpus_held();
3180 
3181 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
3182 		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
3183 		if (ret)
3184 			return ret;
3185 	}
3186 
3187 	return 0;
3188 }
3189 
3190 /*
3191  * This creates a directory mon_data which contains the monitored data.
3192  *
3193  * mon_data has one directory for each domain which are named
3194  * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
3195  * with L3 domain looks as below:
3196  * ./mon_data:
3197  * mon_L3_00
3198  * mon_L3_01
3199  * mon_L3_02
3200  * ...
3201  *
3202  * Each domain directory has one file per event:
3203  * ./mon_L3_00/:
3204  * llc_occupancy
3205  *
3206  */
3207 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
3208 			     struct rdtgroup *prgrp,
3209 			     struct kernfs_node **dest_kn)
3210 {
3211 	struct rdt_resource *r;
3212 	struct kernfs_node *kn;
3213 	int ret;
3214 
3215 	/*
3216 	 * Create the mon_data directory first.
3217 	 */
3218 	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
3219 	if (ret)
3220 		return ret;
3221 
3222 	if (dest_kn)
3223 		*dest_kn = kn;
3224 
3225 	/*
3226 	 * Create the subdirectories for each domain. Note that all events
3227 	 * in a domain like L3 are grouped into a resource whose domain is L3
3228 	 */
3229 	for_each_mon_capable_rdt_resource(r) {
3230 		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
3231 		if (ret)
3232 			goto out_destroy;
3233 	}
3234 
3235 	return 0;
3236 
3237 out_destroy:
3238 	kernfs_remove(kn);
3239 	return ret;
3240 }
3241 
3242 /**
3243  * cbm_ensure_valid - Enforce validity on provided CBM
3244  * @_val:	Candidate CBM
3245  * @r:		RDT resource to which the CBM belongs
3246  *
3247  * The provided CBM represents all cache portions available for use. This
3248  * may be represented by a bitmap that does not consist of contiguous ones
3249  * and thus be an invalid CBM.
3250  * Here the provided CBM is forced to be a valid CBM by only considering
3251  * the first set of contiguous bits as valid and clearing all other bits.
3252  * The intention here is to provide a valid default CBM with which a new
3253  * resource group is initialized. The user can follow this with a
3254  * modification to the CBM if the default does not satisfy the
3255  * requirements.
3256  */
3257 static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
3258 {
3259 	unsigned int cbm_len = r->cache.cbm_len;
3260 	unsigned long first_bit, zero_bit;
3261 	unsigned long val = _val;
3262 
3263 	if (!val)
3264 		return 0;
3265 
3266 	first_bit = find_first_bit(&val, cbm_len);
3267 	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
3268 
3269 	/* Clear any remaining bits to ensure contiguous region */
3270 	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
3271 	return (u32)val;
3272 }
3273 
3274 /*
3275  * Initialize cache resources per RDT domain
3276  *
3277  * Set the RDT domain up to start off with all usable allocations. That is,
3278  * all shareable and unused bits. All-zero CBM is invalid.
3279  */
3280 static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
3281 				 u32 closid)
3282 {
3283 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
3284 	enum resctrl_conf_type t = s->conf_type;
3285 	struct resctrl_staged_config *cfg;
3286 	struct rdt_resource *r = s->res;
3287 	u32 used_b = 0, unused_b = 0;
3288 	unsigned long tmp_cbm;
3289 	enum rdtgrp_mode mode;
3290 	u32 peer_ctl, ctrl_val;
3291 	int i;
3292 
3293 	cfg = &d->staged_config[t];
3294 	cfg->have_new_ctrl = false;
3295 	cfg->new_ctrl = r->cache.shareable_bits;
3296 	used_b = r->cache.shareable_bits;
3297 	for (i = 0; i < closids_supported(); i++) {
3298 		if (closid_allocated(i) && i != closid) {
3299 			mode = rdtgroup_mode_by_closid(i);
3300 			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
3301 				/*
3302 				 * ctrl values for locksetup aren't relevant
3303 				 * until the schemata is written, and the mode
3304 				 * becomes RDT_MODE_PSEUDO_LOCKED.
3305 				 */
3306 				continue;
3307 			/*
3308 			 * If CDP is active include peer domain's
3309 			 * usage to ensure there is no overlap
3310 			 * with an exclusive group.
3311 			 */
3312 			if (resctrl_arch_get_cdp_enabled(r->rid))
3313 				peer_ctl = resctrl_arch_get_config(r, d, i,
3314 								   peer_type);
3315 			else
3316 				peer_ctl = 0;
3317 			ctrl_val = resctrl_arch_get_config(r, d, i,
3318 							   s->conf_type);
3319 			used_b |= ctrl_val | peer_ctl;
3320 			if (mode == RDT_MODE_SHAREABLE)
3321 				cfg->new_ctrl |= ctrl_val | peer_ctl;
3322 		}
3323 	}
3324 	if (d->plr && d->plr->cbm > 0)
3325 		used_b |= d->plr->cbm;
3326 	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
3327 	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
3328 	cfg->new_ctrl |= unused_b;
3329 	/*
3330 	 * Force the initial CBM to be valid; the user can
3331 	 * modify the CBM based on system availability.
3332 	 */
3333 	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
3334 	/*
3335 	 * Assign the u32 CBM to an unsigned long to ensure that
3336 	 * bitmap_weight() does not access out-of-bounds memory.
3337 	 */
3338 	tmp_cbm = cfg->new_ctrl;
3339 	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
3340 		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
3341 		return -ENOSPC;
3342 	}
3343 	cfg->have_new_ctrl = true;
3344 
3345 	return 0;
3346 }
3347 
3348 /*
3349  * Initialize cache resources with default values.
3350  *
3351  * A new RDT group is being created on a system supporting cache
3352  * allocation (CAT). Set this group up to start off with all usable
3353  * allocations.
3354  *
3355  * If there are no more shareable bits available on any domain then
3356  * the entire allocation will fail.
3357  */
3358 static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
3359 {
3360 	struct rdt_ctrl_domain *d;
3361 	int ret;
3362 
3363 	list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
3364 		ret = __init_one_rdt_domain(d, s, closid);
3365 		if (ret < 0)
3366 			return ret;
3367 	}
3368 
3369 	return 0;
3370 }
3371 
3372 /* Initialize MBA resource with default values. */
3373 static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
3374 {
3375 	struct resctrl_staged_config *cfg;
3376 	struct rdt_ctrl_domain *d;
3377 
3378 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
3379 		if (is_mba_sc(r)) {
3380 			d->mbps_val[closid] = MBA_MAX_MBPS;
3381 			continue;
3382 		}
3383 
3384 		cfg = &d->staged_config[CDP_NONE];
3385 		cfg->new_ctrl = r->default_ctrl;
3386 		cfg->have_new_ctrl = true;
3387 	}
3388 }
3389 
3390 /* Initialize the RDT group's allocations. */
3391 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
3392 {
3393 	struct resctrl_schema *s;
3394 	struct rdt_resource *r;
3395 	int ret = 0;
3396 
3397 	rdt_staged_configs_clear();
3398 
3399 	list_for_each_entry(s, &resctrl_schema_all, list) {
3400 		r = s->res;
3401 		if (r->rid == RDT_RESOURCE_MBA ||
3402 		    r->rid == RDT_RESOURCE_SMBA) {
3403 			rdtgroup_init_mba(r, rdtgrp->closid);
3404 			if (is_mba_sc(r))
3405 				continue;
3406 		} else {
3407 			ret = rdtgroup_init_cat(s, rdtgrp->closid);
3408 			if (ret < 0)
3409 				goto out;
3410 		}
3411 
3412 		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
3413 		if (ret < 0) {
3414 			rdt_last_cmd_puts("Failed to initialize allocations\n");
3415 			goto out;
3416 		}
3417 
3418 	}
3419 
3420 	rdtgrp->mode = RDT_MODE_SHAREABLE;
3421 
3422 out:
3423 	rdt_staged_configs_clear();
3424 	return ret;
3425 }
3426 
3427 static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
3428 {
3429 	int ret;
3430 
3431 	if (!resctrl_arch_mon_capable())
3432 		return 0;
3433 
3434 	ret = alloc_rmid(rdtgrp->closid);
3435 	if (ret < 0) {
3436 		rdt_last_cmd_puts("Out of RMIDs\n");
3437 		return ret;
3438 	}
3439 	rdtgrp->mon.rmid = ret;
3440 
3441 	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
3442 	if (ret) {
3443 		rdt_last_cmd_puts("kernfs subdir error\n");
3444 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3445 		return ret;
3446 	}
3447 
3448 	return 0;
3449 }
3450 
3451 static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
3452 {
3453 	if (resctrl_arch_mon_capable())
3454 		free_rmid(rgrp->closid, rgrp->mon.rmid);
3455 }
3456 
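/*
 * Common part of creating a resource group: allocate the rdtgroup,
 * create its kernfs directory and populate it with the files matching
 * @rtype. On success parent_kn remains locked; the caller releases it
 * with rdtgroup_kn_unlock().
 */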
3457 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
3458 			     const char *name, umode_t mode,
3459 			     enum rdt_group_type rtype, struct rdtgroup **r)
3460 {
3461 	struct rdtgroup *prdtgrp, *rdtgrp;
3462 	unsigned long files = 0;
3463 	struct kernfs_node *kn;
3464 	int ret;
3465 
3466 	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
3467 	if (!prdtgrp) {
3468 		ret = -ENODEV;
3469 		goto out_unlock;
3470 	}
3471 
3472 	if (rtype == RDTMON_GROUP &&
3473 	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3474 	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
3475 		ret = -EINVAL;
3476 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
3477 		goto out_unlock;
3478 	}
3479 
3480 	/* allocate the rdtgroup. */
3481 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
3482 	if (!rdtgrp) {
3483 		ret = -ENOSPC;
3484 		rdt_last_cmd_puts("Kernel out of memory\n");
3485 		goto out_unlock;
3486 	}
3487 	*r = rdtgrp;
3488 	rdtgrp->mon.parent = prdtgrp;
3489 	rdtgrp->type = rtype;
3490 	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
3491 
3492 	/* kernfs creates the directory for rdtgrp */
3493 	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
3494 	if (IS_ERR(kn)) {
3495 		ret = PTR_ERR(kn);
3496 		rdt_last_cmd_puts("kernfs create error\n");
3497 		goto out_free_rgrp;
3498 	}
3499 	rdtgrp->kn = kn;
3500 
3501 	/*
3502 	 * kernfs_remove() will drop the reference count on "kn" which
3503 	 * will free it. But we still need it to stick around for the
3504 	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
3505 	 * which will be dropped by kernfs_put() in rdtgroup_remove().
3506 	 */
3507 	kernfs_get(kn);
3508 
3509 	ret = rdtgroup_kn_set_ugid(kn);
3510 	if (ret) {
3511 		rdt_last_cmd_puts("kernfs perm error\n");
3512 		goto out_destroy;
3513 	}
3514 
3515 	if (rtype == RDTCTRL_GROUP) {
3516 		files = RFTYPE_BASE | RFTYPE_CTRL;
3517 		if (resctrl_arch_mon_capable())
3518 			files |= RFTYPE_MON;
3519 	} else {
3520 		files = RFTYPE_BASE | RFTYPE_MON;
3521 	}
3522 
3523 	ret = rdtgroup_add_files(kn, files);
3524 	if (ret) {
3525 		rdt_last_cmd_puts("kernfs fill error\n");
3526 		goto out_destroy;
3527 	}
3528 
3529 	/*
3530 	 * The caller unlocks the parent_kn upon success.
3531 	 */
3532 	return 0;
3533 
3534 out_destroy:
3535 	kernfs_put(rdtgrp->kn);
3536 	kernfs_remove(rdtgrp->kn);
3537 out_free_rgrp:
3538 	kfree(rdtgrp);
3539 out_unlock:
3540 	rdtgroup_kn_unlock(parent_kn);
3541 	return ret;
3542 }
3543 
3544 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
3545 {
3546 	kernfs_remove(rgrp->kn);
3547 	rdtgroup_remove(rgrp);
3548 }
3549 
3550 /*
3551  * Create a monitor group under "mon_groups" directory of a control
3552  * and monitor group(ctrl_mon). This is a resource group
3553  * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
3554  */
3555 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
3556 			      const char *name, umode_t mode)
3557 {
3558 	struct rdtgroup *rdtgrp, *prgrp;
3559 	int ret;
3560 
3561 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
3562 	if (ret)
3563 		return ret;
3564 
3565 	prgrp = rdtgrp->mon.parent;
3566 	rdtgrp->closid = prgrp->closid;
3567 
3568 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3569 	if (ret) {
3570 		mkdir_rdt_prepare_clean(rdtgrp);
3571 		goto out_unlock;
3572 	}
3573 
3574 	kernfs_activate(rdtgrp->kn);
3575 
3576 	/*
3577 	 * Add the rdtgrp to the list of rdtgrps the parent
3578 	 * ctrl_mon group has to track.
3579 	 */
3580 	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
3581 
3582 out_unlock:
3583 	rdtgroup_kn_unlock(parent_kn);
3584 	return ret;
3585 }
3586 
3587 /*
3588  * These are rdtgroups created under the root directory. Can be used
3589  * to allocate and monitor resources.
3590  */
3591 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
3592 				   const char *name, umode_t mode)
3593 {
3594 	struct rdtgroup *rdtgrp;
3595 	struct kernfs_node *kn;
3596 	u32 closid;
3597 	int ret;
3598 
3599 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
3600 	if (ret)
3601 		return ret;
3602 
3603 	kn = rdtgrp->kn;
3604 	ret = closid_alloc();
3605 	if (ret < 0) {
3606 		rdt_last_cmd_puts("Out of CLOSIDs\n");
3607 		goto out_common_fail;
3608 	}
3609 	closid = ret;
3610 	ret = 0;
3611 
3612 	rdtgrp->closid = closid;
3613 
3614 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3615 	if (ret)
3616 		goto out_closid_free;
3617 
3618 	kernfs_activate(rdtgrp->kn);
3619 
3620 	ret = rdtgroup_init_alloc(rdtgrp);
3621 	if (ret < 0)
3622 		goto out_rmid_free;
3623 
3624 	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
3625 
3626 	if (resctrl_arch_mon_capable()) {
3627 		/*
3628 		 * Create an empty mon_groups directory to hold the subset
3629 		 * of tasks and cpus to monitor.
3630 		 */
3631 		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
3632 		if (ret) {
3633 			rdt_last_cmd_puts("kernfs subdir error\n");
3634 			goto out_del_list;
3635 		}
3636 		if (is_mba_sc(NULL))
3637 			rdtgrp->mba_mbps_event = mba_mbps_default_event;
3638 	}
3639 
3640 	goto out_unlock;
3641 
3642 out_del_list:
3643 	list_del(&rdtgrp->rdtgroup_list);
3644 out_rmid_free:
3645 	mkdir_rdt_prepare_rmid_free(rdtgrp);
3646 out_closid_free:
3647 	closid_free(closid);
3648 out_common_fail:
3649 	mkdir_rdt_prepare_clean(rdtgrp);
3650 out_unlock:
3651 	rdtgroup_kn_unlock(parent_kn);
3652 	return ret;
3653 }
3654 
3655 /*
3656  * We allow creating mon groups only with in a directory called "mon_groups"
3657  * which is present in every ctrl_mon group. Check if this is a valid
3658  * "mon_groups" directory.
3659  *
3660  * 1. The directory should be named "mon_groups".
3661  * 2. The mon group itself should "not" be named "mon_groups".
3662  *   This makes sure "mon_groups" directory always has a ctrl_mon group
3663  *   This makes sure the "mon_groups" directory always has a ctrl_mon group
3664  */
3665 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3666 {
3667 	return (!strcmp(kn->name, "mon_groups") &&
3668 		strcmp(name, "mon_groups"));
3669 }
3670 
3671 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3672 			  umode_t mode)
3673 {
3674 	/* Do not accept '\n' to avoid an unparsable name. */
3675 	if (strchr(name, '\n'))
3676 		return -EINVAL;
3677 
3678 	/*
3679 	 * If the parent directory is the root directory and RDT
3680 	 * allocation is supported, add a control and monitoring
3681 	 * subdirectory.
3682 	 */
3683 	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
3684 		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3685 
3686 	/*
3687 	 * If RDT monitoring is supported and the parent directory is a valid
3688 	 * "mon_groups" directory, add a monitoring subdirectory.
3689 	 */
3690 	if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
3691 		return rdtgroup_mkdir_mon(parent_kn, name, mode);
3692 
3693 	return -EPERM;
3694 }
3695 
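/*
 * Remove a MON group: hand its tasks back to the parent ctrl_mon
 * group, reset the RMID of the CPUs it monitored to the parent's,
 * free its RMID and remove its kernfs directory.
 */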
3696 static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3697 {
3698 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3699 	int cpu;
3700 
3701 	/* Give any tasks back to the parent group */
3702 	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3703 
3704 	/* Update per cpu rmid of the moved CPUs first */
3705 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3706 		per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
3707 	/*
3708 	 * Update the MSR on moved CPUs and CPUs which have a moved
3709 	 * task running on them.
3710 	 */
3711 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3712 	update_closid_rmid(tmpmask, NULL);
3713 
3714 	rdtgrp->flags = RDT_DELETED;
3715 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3716 
3717 	/*
3718 	 * Remove the rdtgrp from the parent ctrl_mon group's list
3719 	 */
3720 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3721 	list_del(&rdtgrp->mon.crdtgrp_list);
3722 
3723 	kernfs_remove(rdtgrp->kn);
3724 
3725 	return 0;
3726 }
3727 
3728 static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
3729 {
3730 	rdtgrp->flags = RDT_DELETED;
3731 	list_del(&rdtgrp->rdtgroup_list);
3732 
3733 	kernfs_remove(rdtgrp->kn);
3734 	return 0;
3735 }
3736 
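/*
 * Remove a CTRL_MON group: hand its tasks and CPUs back to the default
 * group, free its CLOSID and RMID as well as the RMIDs of its child
 * MON groups, and remove its kernfs directory.
 */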
3737 static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3738 {
3739 	int cpu;
3740 
3741 	/* Give any tasks back to the default group */
3742 	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3743 
3744 	/* Give any CPUs back to the default group */
3745 	cpumask_or(&rdtgroup_default.cpu_mask,
3746 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3747 
3748 	/* Update per cpu closid and rmid of the moved CPUs first */
3749 	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
3750 		per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
3751 		per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
3752 	}
3753 
3754 	/*
3755 	 * Update the MSR on moved CPUs and CPUs which have a moved
3756 	 * task running on them.
3757 	 */
3758 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3759 	update_closid_rmid(tmpmask, NULL);
3760 
3761 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3762 	closid_free(rdtgrp->closid);
3763 
3764 	rdtgroup_ctrl_remove(rdtgrp);
3765 
3766 	/*
3767 	 * Free all the child monitor group rmids.
3768 	 */
3769 	free_all_child_rdtgrp(rdtgrp);
3770 
3771 	return 0;
3772 }
3773 
3774 static int rdtgroup_rmdir(struct kernfs_node *kn)
3775 {
3776 	struct kernfs_node *parent_kn = kn->parent;
3777 	struct rdtgroup *rdtgrp;
3778 	cpumask_var_t tmpmask;
3779 	int ret = 0;
3780 
3781 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
3782 		return -ENOMEM;
3783 
3784 	rdtgrp = rdtgroup_kn_lock_live(kn);
3785 	if (!rdtgrp) {
3786 		ret = -EPERM;
3787 		goto out;
3788 	}
3789 
3790 	/*
3791 	 * If the rdtgroup is a ctrl_mon group and parent directory
3792 	 * is the root directory, remove the ctrl_mon group.
3793 	 *
3794 	 * If the rdtgroup is a mon group and parent directory
3795 	 * is a valid "mon_groups" directory, remove the mon group.
3796 	 */
3797 	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
3798 	    rdtgrp != &rdtgroup_default) {
3799 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3800 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
3801 			ret = rdtgroup_ctrl_remove(rdtgrp);
3802 		} else {
3803 			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
3804 		}
3805 	} else if (rdtgrp->type == RDTMON_GROUP &&
3806 		 is_mon_groups(parent_kn, kn->name)) {
3807 		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
3808 	} else {
3809 		ret = -EPERM;
3810 	}
3811 
3812 out:
3813 	rdtgroup_kn_unlock(kn);
3814 	free_cpumask_var(tmpmask);
3815 	return ret;
3816 }
3817 
3818 /**
3819  * mongrp_reparent() - replace parent CTRL_MON group of a MON group
3820  * @rdtgrp:		the MON group whose parent should be replaced
3821  * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
3822  * @cpus:		cpumask provided by the caller for use during this call
3823  *
3824  * Replaces the parent CTRL_MON group for a MON group, resulting in all member
3825  * tasks' CLOSID immediately changing to that of the new parent group.
3826  * Monitoring data for the group is unaffected by this operation.
3827  */
3828 static void mongrp_reparent(struct rdtgroup *rdtgrp,
3829 			    struct rdtgroup *new_prdtgrp,
3830 			    cpumask_var_t cpus)
3831 {
3832 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3833 
3834 	WARN_ON(rdtgrp->type != RDTMON_GROUP);
3835 	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
3836 
3837 	/* Nothing to do when simply renaming a MON group. */
3838 	if (prdtgrp == new_prdtgrp)
3839 		return;
3840 
3841 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3842 	list_move_tail(&rdtgrp->mon.crdtgrp_list,
3843 		       &new_prdtgrp->mon.crdtgrp_list);
3844 
3845 	rdtgrp->mon.parent = new_prdtgrp;
3846 	rdtgrp->closid = new_prdtgrp->closid;
3847 
3848 	/* Propagate updated closid to all tasks in this group. */
3849 	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
3850 
3851 	update_closid_rmid(cpus, NULL);
3852 }
3853 
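/*
 * Only renames that move a MON group from one "mon_groups" directory
 * to another are supported; anything else is rejected.
 */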
3854 static int rdtgroup_rename(struct kernfs_node *kn,
3855 			   struct kernfs_node *new_parent, const char *new_name)
3856 {
3857 	struct rdtgroup *new_prdtgrp;
3858 	struct rdtgroup *rdtgrp;
3859 	cpumask_var_t tmpmask;
3860 	int ret;
3861 
3862 	rdtgrp = kernfs_to_rdtgroup(kn);
3863 	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
3864 	if (!rdtgrp || !new_prdtgrp)
3865 		return -ENOENT;
3866 
3867 	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
3868 	rdtgroup_kn_get(rdtgrp, kn);
3869 	rdtgroup_kn_get(new_prdtgrp, new_parent);
3870 
3871 	mutex_lock(&rdtgroup_mutex);
3872 
3873 	rdt_last_cmd_clear();
3874 
3875 	/*
3876 	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
3877 	 * either kernfs_node is a file.
3878 	 */
3879 	if (kernfs_type(kn) != KERNFS_DIR ||
3880 	    kernfs_type(new_parent) != KERNFS_DIR) {
3881 		rdt_last_cmd_puts("Source and destination must be directories\n");
3882 		ret = -EPERM;
3883 		goto out;
3884 	}
3885 
3886 	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
3887 		ret = -ENOENT;
3888 		goto out;
3889 	}
3890 
3891 	if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
3892 	    !is_mon_groups(kn->parent, kn->name)) {
3893 		rdt_last_cmd_puts("Source must be a MON group\n");
3894 		ret = -EPERM;
3895 		goto out;
3896 	}
3897 
3898 	if (!is_mon_groups(new_parent, new_name)) {
3899 		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
3900 		ret = -EPERM;
3901 		goto out;
3902 	}
3903 
3904 	/*
3905 	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
3906 	 * current parent CTRL_MON group and therefore cannot be assigned to
3907 	 * the new parent, making the move illegal.
3908 	 */
3909 	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
3910 	    rdtgrp->mon.parent != new_prdtgrp) {
3911 		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
3912 		ret = -EPERM;
3913 		goto out;
3914 	}
3915 
3916 	/*
3917 	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
3918 	 * possibility of failing to allocate it after kernfs_rename() has
3919 	 * succeeded.
3920 	 */
3921 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
3922 		ret = -ENOMEM;
3923 		goto out;
3924 	}
3925 
3926 	/*
3927 	 * Perform all input validation and allocations needed to ensure
3928 	 * mongrp_reparent() will succeed before calling kernfs_rename(),
3929 	 * otherwise it would be necessary to revert this call if
3930 	 * mongrp_reparent() failed.
3931 	 */
3932 	ret = kernfs_rename(kn, new_parent, new_name);
3933 	if (!ret)
3934 		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
3935 
3936 	free_cpumask_var(tmpmask);
3937 
3938 out:
3939 	mutex_unlock(&rdtgroup_mutex);
3940 	rdtgroup_kn_put(rdtgrp, kn);
3941 	rdtgroup_kn_put(new_prdtgrp, new_parent);
3942 	return ret;
3943 }
3944 
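/* Show the mount options that deviate from the defaults, e.g. in /proc/mounts. */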
3945 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
3946 {
3947 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
3948 		seq_puts(seq, ",cdp");
3949 
3950 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
3951 		seq_puts(seq, ",cdpl2");
3952 
3953 	if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
3954 		seq_puts(seq, ",mba_MBps");
3955 
3956 	if (resctrl_debug)
3957 		seq_puts(seq, ",debug");
3958 
3959 	return 0;
3960 }
3961 
3962 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
3963 	.mkdir		= rdtgroup_mkdir,
3964 	.rmdir		= rdtgroup_rmdir,
3965 	.rename		= rdtgroup_rename,
3966 	.show_options	= rdtgroup_show_options,
3967 };
3968 
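/*
 * Create the kernfs root for resctrl with the default resource group as its
 * private data. KERNFS_ROOT_CREATE_DEACTIVATED keeps the new hierarchy
 * invisible until the rest of the mount setup activates it.
 */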
3969 static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
3970 {
3971 	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
3972 				      KERNFS_ROOT_CREATE_DEACTIVATED |
3973 				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
3974 				      &rdtgroup_default);
3975 	if (IS_ERR(rdt_root))
3976 		return PTR_ERR(rdt_root);
3977 
3978 	ctx->kfc.root = rdt_root;
3979 	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
3980 
3981 	return 0;
3982 }
3983 
3984 static void rdtgroup_destroy_root(void)
3985 {
3986 	kernfs_destroy_root(rdt_root);
3987 	rdtgroup_default.kn = NULL;
3988 }
3989 
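/*
 * Initialize the default resource group: it uses the reserved CLOSID and
 * RMID and is the first entry on the global list of resource groups.
 */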
3990 static void __init rdtgroup_setup_default(void)
3991 {
3992 	mutex_lock(&rdtgroup_mutex);
3993 
3994 	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
3995 	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
3996 	rdtgroup_default.type = RDTCTRL_GROUP;
3997 	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
3998 
3999 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
4000 
4001 	mutex_unlock(&rdtgroup_mutex);
4002 }
4003 
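/* Free the per-domain monitoring state allocated by domain_setup_mon_state(). */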
4004 static void domain_destroy_mon_state(struct rdt_mon_domain *d)
4005 {
4006 	bitmap_free(d->rmid_busy_llc);
4007 	kfree(d->mbm_total);
4008 	kfree(d->mbm_local);
4009 }
4010 
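/*
 * A control domain is going offline. Free the memory bandwidth software
 * controller state used by the "mba_MBps" mount option, if applicable.
 */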
4011 void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4012 {
4013 	mutex_lock(&rdtgroup_mutex);
4014 
4015 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
4016 		mba_sc_domain_destroy(r, d);
4017 
4018 	mutex_unlock(&rdtgroup_mutex);
4019 }
4020 
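/*
 * A monitoring domain is going offline: remove its mon_data directories,
 * stop the MBM overflow and CQM limbo workers and free the per-domain
 * monitoring state.
 */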
4021 void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4022 {
4023 	mutex_lock(&rdtgroup_mutex);
4024 
4025 	/*
4026 	 * If resctrl is mounted, remove all the
4027 	 * per domain monitor data directories.
4028 	 */
4029 	if (resctrl_mounted && resctrl_arch_mon_capable())
4030 		rmdir_mondata_subdir_allrdtgrp(r, d);
4031 
4032 	if (is_mbm_enabled())
4033 		cancel_delayed_work(&d->mbm_over);
4034 	if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
4035 		/*
4036 		 * When a package is going down, forcefully
4037 		 * decrement rmid->ebusy. There is no way to know
4038 		 * whether the L3 was flushed, which may lead to
4039 		 * incorrect counts in rare scenarios, but leaving
4040 		 * the RMID as busy creates RMID leaks if the
4041 		 * package never comes back.
4042 		 */
4043 		__check_limbo(d, true);
4044 		cancel_delayed_work(&d->cqm_limbo);
4045 	}
4046 
4047 	domain_destroy_mon_state(d);
4048 
4049 	mutex_unlock(&rdtgroup_mutex);
4050 }
4051 
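/*
 * Allocate the per-domain state needed by the enabled monitoring events:
 * the limbo bitmap for LLC occupancy and the MBM total/local counter
 * arrays, each sized for the number of RMIDs (indexes) supported.
 */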
4052 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
4053 {
4054 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
4055 	size_t tsize;
4056 
4057 	if (is_llc_occupancy_enabled()) {
4058 		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
4059 		if (!d->rmid_busy_llc)
4060 			return -ENOMEM;
4061 	}
4062 	if (is_mbm_total_enabled()) {
4063 		tsize = sizeof(*d->mbm_total);
4064 		d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
4065 		if (!d->mbm_total) {
4066 			bitmap_free(d->rmid_busy_llc);
4067 			return -ENOMEM;
4068 		}
4069 	}
4070 	if (is_mbm_local_enabled()) {
4071 		tsize = sizeof(*d->mbm_local);
4072 		d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
4073 		if (!d->mbm_local) {
4074 			bitmap_free(d->rmid_busy_llc);
4075 			kfree(d->mbm_total);
4076 			return -ENOMEM;
4077 		}
4078 	}
4079 
4080 	return 0;
4081 }
4082 
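/*
 * A new control domain has come online. Allocate the memory bandwidth
 * software controller state for it if the "mba_MBps" option is supported.
 */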
4083 int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4084 {
4085 	int err = 0;
4086 
4087 	mutex_lock(&rdtgroup_mutex);
4088 
4089 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
4090 		/* RDT_RESOURCE_MBA is never mon_capable */
4091 		err = mba_sc_domain_allocate(r, d);
4092 	}
4093 
4094 	mutex_unlock(&rdtgroup_mutex);
4095 
4096 	return err;
4097 }
4098 
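/*
 * A new monitoring domain has come online: allocate its monitoring state,
 * arm the MBM overflow worker, prepare the CQM limbo worker and create the
 * per-domain mon_data directories if resctrl is already mounted.
 */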
4099 int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4100 {
4101 	int err;
4102 
4103 	mutex_lock(&rdtgroup_mutex);
4104 
4105 	err = domain_setup_mon_state(r, d);
4106 	if (err)
4107 		goto out_unlock;
4108 
4109 	if (is_mbm_enabled()) {
4110 		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
4111 		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
4112 					   RESCTRL_PICK_ANY_CPU);
4113 	}
4114 
4115 	if (is_llc_occupancy_enabled())
4116 		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
4117 
4118 	/*
4119 	 * If the filesystem is not mounted then only the default resource group
4120 	 * exists. Creation of its directories is deferred until mount time
4121 	 * by rdt_get_tree() calling mkdir_mondata_all().
4122 	 * If resctrl is mounted, add per domain monitor data directories.
4123 	 */
4124 	if (resctrl_mounted && resctrl_arch_mon_capable())
4125 		mkdir_mondata_subdir_allrdtgrp(r, d);
4126 
4127 out_unlock:
4128 	mutex_unlock(&rdtgroup_mutex);
4129 
4130 	return err;
4131 }
4132 
4133 void resctrl_online_cpu(unsigned int cpu)
4134 {
4135 	mutex_lock(&rdtgroup_mutex);
4136 	/* The CPU is set in default rdtgroup after online. */
4137 	/* The CPU is added to the default rdtgroup when it comes online. */
4138 	mutex_unlock(&rdtgroup_mutex);
4139 }
4140 
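/*
 * Remove @cpu from the cpu_mask of the child MON group (if any) of @r that
 * currently has it assigned.
 */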
4141 static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
4142 {
4143 	struct rdtgroup *cr;
4144 
4145 	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
4146 		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
4147 			break;
4148 	}
4149 }
4150 
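/*
 * A CPU is going offline: remove it from the resource group that owns it
 * and, if it was running the MBM overflow or CQM limbo worker for its L3
 * domain, move that work to another CPU in the domain.
 */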
4151 void resctrl_offline_cpu(unsigned int cpu)
4152 {
4153 	struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
4154 	struct rdt_mon_domain *d;
4155 	struct rdtgroup *rdtgrp;
4156 
4157 	mutex_lock(&rdtgroup_mutex);
4158 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
4159 		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
4160 			clear_childcpus(rdtgrp, cpu);
4161 			break;
4162 		}
4163 	}
4164 
4165 	if (!l3->mon_capable)
4166 		goto out_unlock;
4167 
4168 	d = get_mon_domain_from_cpu(cpu, l3);
4169 	if (d) {
4170 		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
4171 			cancel_delayed_work(&d->mbm_over);
4172 			mbm_setup_overflow_handler(d, 0, cpu);
4173 		}
4174 		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
4175 		    has_busy_rmid(d)) {
4176 			cancel_delayed_work(&d->cqm_limbo);
4177 			cqm_setup_limbo_handler(d, 0, cpu);
4178 		}
4179 	}
4180 
4181 out_unlock:
4182 	mutex_unlock(&rdtgroup_mutex);
4183 }
4184 
4185 /*
4186  * rdtgroup_init - rdtgroup initialization
4187  *
4188  * Setup resctrl file system including set up root, create mount point,
4189  * register rdtgroup filesystem, and initialize files under root directory.
4190  *
4191  * Return: 0 on success or -errno
4192  */
4193 int __init rdtgroup_init(void)
4194 {
4195 	int ret = 0;
4196 
4197 	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
4198 		     sizeof(last_cmd_status_buf));
4199 
4200 	rdtgroup_setup_default();
4201 
4202 	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
4203 	if (ret)
4204 		return ret;
4205 
4206 	ret = register_filesystem(&rdt_fs_type);
4207 	if (ret)
4208 		goto cleanup_mountpoint;
4209 
4210 	/*
4211 	 * Adding the resctrl debugfs directory here may not be ideal since
4212 	 * it would let the resctrl debugfs directory appear on the debugfs
4213 	 * filesystem before the resctrl filesystem is mounted.
4214 	 * It may also be ok since that would enable debugging of RDT before
4215 	 * resctrl is mounted.
4216 	 * The reason why the debugfs directory is created here and not in
4217 	 * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
4218 	 * during the debugfs directory creation also &sb->s_type->i_mutex_key
4219 	 * (the lockdep class of inode->i_rwsem). Other filesystem
4220 	 * interactions (e.g. SyS_getdents) have the lock ordering:
4221 	 * &sb->s_type->i_mutex_key --> &mm->mmap_lock
4222 	 * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
4223 	 * is taken, thus creating the dependency:
4224 	 * &mm->mmap_lock --> rdtgroup_mutex, which can cause issues when
4225 	 * combined with the other two lock dependencies above.
4226 	 * By creating the debugfs directory here we avoid a dependency
4227 	 * that may cause deadlock (even though file operations cannot
4228 	 * occur until the filesystem is mounted, there is no obvious
4229 	 * way to tell lockdep that).
4230 	 */
4231 	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
4232 
4233 	return 0;
4234 
4235 cleanup_mountpoint:
4236 	sysfs_remove_mount_point(fs_kobj, "resctrl");
4237 
4238 	return ret;
4239 }
4240 
4241 void __exit rdtgroup_exit(void)
4242 {
4243 	debugfs_remove_recursive(debugfs_resctrl);
4244 	unregister_filesystem(&rdt_fs_type);
4245 	sysfs_remove_mount_point(fs_kobj, "resctrl");
4246 }
4247