xref: /linux/arch/x86/kernel/cpu/resctrl/rdtgroup.c (revision fb1ceb29b27cda91af35851ebab01f298d82162e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * User interface for Resource Allocation in Resource Director Technology(RDT)
4  *
5  * Copyright (C) 2016 Intel Corporation
6  *
7  * Author: Fenghua Yu <fenghua.yu@intel.com>
8  *
9  * More information about RDT can be found in the Intel (R) x86 Architecture
10  * Software Developer Manual.
11  */
12 
13 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
14 
15 #include <linux/cpu.h>
16 #include <linux/debugfs.h>
17 #include <linux/fs.h>
18 #include <linux/fs_parser.h>
19 #include <linux/sysfs.h>
20 #include <linux/kernfs.h>
21 #include <linux/seq_buf.h>
22 #include <linux/seq_file.h>
23 #include <linux/sched/signal.h>
24 #include <linux/sched/task.h>
25 #include <linux/slab.h>
26 #include <linux/task_work.h>
27 #include <linux/user_namespace.h>
28 
29 #include <uapi/linux/magic.h>
30 
31 #include <asm/resctrl.h>
32 #include "internal.h"
33 
34 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
35 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
36 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
37 
38 /* Mutex to protect rdtgroup access. */
39 DEFINE_MUTEX(rdtgroup_mutex);
40 
41 static struct kernfs_root *rdt_root;
42 struct rdtgroup rdtgroup_default;
43 LIST_HEAD(rdt_all_groups);
44 
45 /* list of entries for the schemata file */
46 LIST_HEAD(resctrl_schema_all);
47 
48 /* The filesystem can only be mounted once. */
49 bool resctrl_mounted;
50 
51 /* Kernel fs node for "info" directory under root */
52 static struct kernfs_node *kn_info;
53 
54 /* Kernel fs node for "mon_groups" directory under root */
55 static struct kernfs_node *kn_mongrp;
56 
57 /* Kernel fs node for "mon_data" directory under root */
58 static struct kernfs_node *kn_mondata;
59 
60 /*
61  * Used to store the max resource name width to display the schemata names in
62  * a tabular format.
63  */
64 int max_name_width;
65 
66 static struct seq_buf last_cmd_status;
67 static char last_cmd_status_buf[512];
68 
69 static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
70 static void rdtgroup_destroy_root(void);
71 
72 struct dentry *debugfs_resctrl;
73 
74 /*
75  * Memory bandwidth monitoring event to use for the default CTRL_MON group
76  * and each new CTRL_MON group created by the user.  Only relevant when
77  * the filesystem is mounted with the "mba_MBps" option so it does not
78  * matter that it remains uninitialized on systems that do not support
79  * the "mba_MBps" option.
80  */
81 enum resctrl_event_id mba_mbps_default_event;
82 
83 static bool resctrl_debug;
84 
85 void rdt_last_cmd_clear(void)
86 {
87 	lockdep_assert_held(&rdtgroup_mutex);
88 	seq_buf_clear(&last_cmd_status);
89 }
90 
91 void rdt_last_cmd_puts(const char *s)
92 {
93 	lockdep_assert_held(&rdtgroup_mutex);
94 	seq_buf_puts(&last_cmd_status, s);
95 }
96 
97 void rdt_last_cmd_printf(const char *fmt, ...)
98 {
99 	va_list ap;
100 
101 	va_start(ap, fmt);
102 	lockdep_assert_held(&rdtgroup_mutex);
103 	seq_buf_vprintf(&last_cmd_status, fmt, ap);
104 	va_end(ap);
105 }
106 
107 void rdt_staged_configs_clear(void)
108 {
109 	struct rdt_ctrl_domain *dom;
110 	struct rdt_resource *r;
111 
112 	lockdep_assert_held(&rdtgroup_mutex);
113 
114 	for_each_alloc_capable_rdt_resource(r) {
115 		list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
116 			memset(dom->staged_config, 0, sizeof(dom->staged_config));
117 	}
118 }
119 
120 static bool resctrl_is_mbm_enabled(void)
121 {
122 	return (resctrl_arch_is_mbm_total_enabled() ||
123 		resctrl_arch_is_mbm_local_enabled());
124 }
125 
126 static bool resctrl_is_mbm_event(int e)
127 {
128 	return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
129 		e <= QOS_L3_MBM_LOCAL_EVENT_ID);
130 }
131 
132 /*
133  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
134  * we can keep a bitmap of free CLOSIDs in a single integer.
135  *
136  * Using a global CLOSID across all resources has some advantages and
137  * some drawbacks:
138  * + We can simply set current's closid to assign a task to a resource
139  *   group.
140  * + Context switch code can avoid extra memory references deciding which
141  *   CLOSID to load into the PQR_ASSOC MSR
142  * - We give up some options in configuring resource groups across multi-socket
143  *   systems.
144  * - Our choices on how to configure each resource become progressively more
145  *   limited as the number of resources grows.
146  */
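/*
 * Illustrative example (not part of the original code): on a system where
 * every resource supports 16 CLOSIDs, closid_init() below computes
 * rdt_min_closid = 16, so closid_free_map becomes 0xffff and then 0xfffe
 * once RESCTRL_RESERVED_CLOSID (bit 0) is reserved for the default group.
 * closid_alloc() then hands out the lowest free bit unless the "cleanest"
 * CLOSID is requested for LLC occupancy reasons.
 */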
147 static unsigned long closid_free_map;
148 static int closid_free_map_len;
149 
150 int closids_supported(void)
151 {
152 	return closid_free_map_len;
153 }
154 
155 static void closid_init(void)
156 {
157 	struct resctrl_schema *s;
158 	u32 rdt_min_closid = 32;
159 
160 	/* Compute rdt_min_closid across all resources */
161 	list_for_each_entry(s, &resctrl_schema_all, list)
162 		rdt_min_closid = min(rdt_min_closid, s->num_closid);
163 
164 	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
165 
166 	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
167 	__clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
168 	closid_free_map_len = rdt_min_closid;
169 }
170 
171 static int closid_alloc(void)
172 {
173 	int cleanest_closid;
174 	u32 closid;
175 
176 	lockdep_assert_held(&rdtgroup_mutex);
177 
178 	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
179 	    resctrl_arch_is_llc_occupancy_enabled()) {
180 		cleanest_closid = resctrl_find_cleanest_closid();
181 		if (cleanest_closid < 0)
182 			return cleanest_closid;
183 		closid = cleanest_closid;
184 	} else {
185 		closid = ffs(closid_free_map);
186 		if (closid == 0)
187 			return -ENOSPC;
188 		closid--;
189 	}
190 	__clear_bit(closid, &closid_free_map);
191 
192 	return closid;
193 }
194 
195 void closid_free(int closid)
196 {
197 	lockdep_assert_held(&rdtgroup_mutex);
198 
199 	__set_bit(closid, &closid_free_map);
200 }
201 
202 /**
203  * closid_allocated - test if provided closid is in use
204  * @closid: closid to be tested
205  *
206  * Return: true if @closid is currently associated with a resource group,
207  * false if @closid is free
208  */
209 bool closid_allocated(unsigned int closid)
210 {
211 	lockdep_assert_held(&rdtgroup_mutex);
212 
213 	return !test_bit(closid, &closid_free_map);
214 }
215 
216 /**
217  * rdtgroup_mode_by_closid - Return mode of resource group with closid
218  * @closid: closid of the resource group
219  *
220  * Each resource group is associated with a @closid. Here the mode
221  * of a resource group can be queried by searching for it using its closid.
222  *
223  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
224  */
225 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
226 {
227 	struct rdtgroup *rdtgrp;
228 
229 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
230 		if (rdtgrp->closid == closid)
231 			return rdtgrp->mode;
232 	}
233 
234 	return RDT_NUM_MODES;
235 }
236 
237 static const char * const rdt_mode_str[] = {
238 	[RDT_MODE_SHAREABLE]		= "shareable",
239 	[RDT_MODE_EXCLUSIVE]		= "exclusive",
240 	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
241 	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
242 };
243 
244 /**
245  * rdtgroup_mode_str - Return the string representation of mode
246  * @mode: the resource group mode as &enum rdtgroup_mode
247  *
248  * Return: string representation of valid mode, "unknown" otherwise
249  */
250 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
251 {
252 	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
253 		return "unknown";
254 
255 	return rdt_mode_str[mode];
256 }
257 
258 /* set uid and gid of rdtgroup dirs and files to that of the creator */
259 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
260 {
261 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
262 				.ia_uid = current_fsuid(),
263 				.ia_gid = current_fsgid(), };
264 
265 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
266 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
267 		return 0;
268 
269 	return kernfs_setattr(kn, &iattr);
270 }
271 
272 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
273 {
274 	struct kernfs_node *kn;
275 	int ret;
276 
277 	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
278 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
279 				  0, rft->kf_ops, rft, NULL, NULL);
280 	if (IS_ERR(kn))
281 		return PTR_ERR(kn);
282 
283 	ret = rdtgroup_kn_set_ugid(kn);
284 	if (ret) {
285 		kernfs_remove(kn);
286 		return ret;
287 	}
288 
289 	return 0;
290 }
291 
292 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
293 {
294 	struct kernfs_open_file *of = m->private;
295 	struct rftype *rft = of->kn->priv;
296 
297 	if (rft->seq_show)
298 		return rft->seq_show(of, m, arg);
299 	return 0;
300 }
301 
302 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
303 				   size_t nbytes, loff_t off)
304 {
305 	struct rftype *rft = of->kn->priv;
306 
307 	if (rft->write)
308 		return rft->write(of, buf, nbytes, off);
309 
310 	return -EINVAL;
311 }
312 
313 static const struct kernfs_ops rdtgroup_kf_single_ops = {
314 	.atomic_write_len	= PAGE_SIZE,
315 	.write			= rdtgroup_file_write,
316 	.seq_show		= rdtgroup_seqfile_show,
317 };
318 
319 static const struct kernfs_ops kf_mondata_ops = {
320 	.atomic_write_len	= PAGE_SIZE,
321 	.seq_show		= rdtgroup_mondata_show,
322 };
323 
324 static bool is_cpu_list(struct kernfs_open_file *of)
325 {
326 	struct rftype *rft = of->kn->priv;
327 
328 	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
329 }
330 
331 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
332 			      struct seq_file *s, void *v)
333 {
334 	struct rdtgroup *rdtgrp;
335 	struct cpumask *mask;
336 	int ret = 0;
337 
338 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
339 
340 	if (rdtgrp) {
341 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
342 			if (!rdtgrp->plr->d) {
343 				rdt_last_cmd_clear();
344 				rdt_last_cmd_puts("Cache domain offline\n");
345 				ret = -ENODEV;
346 			} else {
347 				mask = &rdtgrp->plr->d->hdr.cpu_mask;
348 				seq_printf(s, is_cpu_list(of) ?
349 					   "%*pbl\n" : "%*pb\n",
350 					   cpumask_pr_args(mask));
351 			}
352 		} else {
353 			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
354 				   cpumask_pr_args(&rdtgrp->cpu_mask));
355 		}
356 	} else {
357 		ret = -ENOENT;
358 	}
359 	rdtgroup_kn_unlock(of->kn);
360 
361 	return ret;
362 }
363 
364 /*
365  * This is safe against resctrl_sched_in() called from __switch_to()
366  * because __switch_to() is executed with interrupts disabled. A local call
367  * from update_closid_rmid() is protected against __switch_to() because
368  * preemption is disabled.
369  */
370 void resctrl_arch_sync_cpu_closid_rmid(void *info)
371 {
372 	struct resctrl_cpu_defaults *r = info;
373 
374 	if (r) {
375 		this_cpu_write(pqr_state.default_closid, r->closid);
376 		this_cpu_write(pqr_state.default_rmid, r->rmid);
377 	}
378 
379 	/*
380 	 * We cannot unconditionally write the MSR because the currently
381 	 * executing task might have its own closid selected. Just reuse
382 	 * the context switch code.
383 	 */
384 	resctrl_sched_in(current);
385 }
386 
387 /*
388  * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
389  *
390  * Per task closids/rmids must have been set up before calling this function.
391  * @r may be NULL.
392  */
393 static void
394 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
395 {
396 	struct resctrl_cpu_defaults defaults, *p = NULL;
397 
398 	if (r) {
399 		defaults.closid = r->closid;
400 		defaults.rmid = r->mon.rmid;
401 		p = &defaults;
402 	}
403 
404 	on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
405 }
406 
407 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
408 			  cpumask_var_t tmpmask)
409 {
410 	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
411 	struct list_head *head;
412 
413 	/* Check whether cpus belong to parent ctrl group */
414 	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
415 	if (!cpumask_empty(tmpmask)) {
416 		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
417 		return -EINVAL;
418 	}
419 
420 	/* Check whether cpus are dropped from this group */
421 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
422 	if (!cpumask_empty(tmpmask)) {
423 		/* Give any dropped cpus to parent rdtgroup */
424 		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
425 		update_closid_rmid(tmpmask, prgrp);
426 	}
427 
428 	/*
429 	 * If we added cpus, remove them from previous group that owned them
430 	 * and update per-cpu rmid
431 	 */
432 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
433 	if (!cpumask_empty(tmpmask)) {
434 		head = &prgrp->mon.crdtgrp_list;
435 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
436 			if (crgrp == rdtgrp)
437 				continue;
438 			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
439 				       tmpmask);
440 		}
441 		update_closid_rmid(tmpmask, rdtgrp);
442 	}
443 
444 	/* Done pushing/pulling - update this group with new mask */
445 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
446 
447 	return 0;
448 }
449 
450 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
451 {
452 	struct rdtgroup *crgrp;
453 
454 	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
455 	/* update the child mon group masks as well */
456 	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
457 		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
458 }
459 
460 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
461 			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
462 {
463 	struct rdtgroup *r, *crgrp;
464 	struct list_head *head;
465 
466 	/* Check whether cpus are dropped from this group */
467 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
468 	if (!cpumask_empty(tmpmask)) {
469 		/* Can't drop from default group */
470 		if (rdtgrp == &rdtgroup_default) {
471 			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
472 			return -EINVAL;
473 		}
474 
475 		/* Give any dropped cpus to rdtgroup_default */
476 		cpumask_or(&rdtgroup_default.cpu_mask,
477 			   &rdtgroup_default.cpu_mask, tmpmask);
478 		update_closid_rmid(tmpmask, &rdtgroup_default);
479 	}
480 
481 	/*
482 	 * If we added cpus, remove them from previous group and
483 	 * the prev group's child groups that owned them
484 	 * and update per-cpu closid/rmid.
485 	 */
486 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
487 	if (!cpumask_empty(tmpmask)) {
488 		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
489 			if (r == rdtgrp)
490 				continue;
491 			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
492 			if (!cpumask_empty(tmpmask1))
493 				cpumask_rdtgrp_clear(r, tmpmask1);
494 		}
495 		update_closid_rmid(tmpmask, rdtgrp);
496 	}
497 
498 	/* Done pushing/pulling - update this group with new mask */
499 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
500 
501 	/*
502 	 * Clear child mon group masks since there is a new parent mask
503 	 * now and update the rmid for the cpus the child lost.
504 	 */
505 	head = &rdtgrp->mon.crdtgrp_list;
506 	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
507 		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
508 		update_closid_rmid(tmpmask, rdtgrp);
509 		cpumask_clear(&crgrp->cpu_mask);
510 	}
511 
512 	return 0;
513 }
514 
515 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
516 				   char *buf, size_t nbytes, loff_t off)
517 {
518 	cpumask_var_t tmpmask, newmask, tmpmask1;
519 	struct rdtgroup *rdtgrp;
520 	int ret;
521 
522 	if (!buf)
523 		return -EINVAL;
524 
525 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
526 		return -ENOMEM;
527 	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
528 		free_cpumask_var(tmpmask);
529 		return -ENOMEM;
530 	}
531 	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
532 		free_cpumask_var(tmpmask);
533 		free_cpumask_var(newmask);
534 		return -ENOMEM;
535 	}
536 
537 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
538 	if (!rdtgrp) {
539 		ret = -ENOENT;
540 		goto unlock;
541 	}
542 
543 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
544 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
545 		ret = -EINVAL;
546 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
547 		goto unlock;
548 	}
549 
550 	if (is_cpu_list(of))
551 		ret = cpulist_parse(buf, newmask);
552 	else
553 		ret = cpumask_parse(buf, newmask);
554 
555 	if (ret) {
556 		rdt_last_cmd_puts("Bad CPU list/mask\n");
557 		goto unlock;
558 	}
559 
560 	/* check that user didn't specify any offline cpus */
561 	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
562 	if (!cpumask_empty(tmpmask)) {
563 		ret = -EINVAL;
564 		rdt_last_cmd_puts("Can only assign online CPUs\n");
565 		goto unlock;
566 	}
567 
568 	if (rdtgrp->type == RDTCTRL_GROUP)
569 		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
570 	else if (rdtgrp->type == RDTMON_GROUP)
571 		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
572 	else
573 		ret = -EINVAL;
574 
575 unlock:
576 	rdtgroup_kn_unlock(of->kn);
577 	free_cpumask_var(tmpmask);
578 	free_cpumask_var(newmask);
579 	free_cpumask_var(tmpmask1);
580 
581 	return ret ?: nbytes;
582 }
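/*
 * Usage sketch (illustrative; assumes resctrl is mounted at /sys/fs/resctrl
 * and a group named "g1" exists):
 *
 *   # assign CPUs 0-3 via the list format
 *   echo 0-3 > /sys/fs/resctrl/g1/cpus_list
 *
 *   # or the equivalent hexadecimal mask
 *   echo f > /sys/fs/resctrl/g1/cpus
 *
 * CPUs dropped from a control group are returned to the default group.
 */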
583 
584 /**
585  * rdtgroup_remove - the helper to remove resource group safely
586  * @rdtgrp: resource group to remove
587  *
588  * On resource group creation via a mkdir, an extra kernfs_node reference is
589  * taken to ensure that the rdtgroup structure remains accessible for the
590  * rdtgroup_kn_unlock() calls where it is removed.
591  *
592  * Drop the extra reference here, then free the rdtgroup structure.
593  *
594  * Return: void
595  */
596 static void rdtgroup_remove(struct rdtgroup *rdtgrp)
597 {
598 	kernfs_put(rdtgrp->kn);
599 	kfree(rdtgrp);
600 }
601 
602 static void _update_task_closid_rmid(void *task)
603 {
604 	/*
605 	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
606 	 * Otherwise, the MSR is updated when the task is scheduled in.
607 	 */
608 	if (task == current)
609 		resctrl_sched_in(task);
610 }
611 
612 static void update_task_closid_rmid(struct task_struct *t)
613 {
614 	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
615 		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
616 	else
617 		_update_task_closid_rmid(t);
618 }
619 
620 static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
621 {
622 	u32 closid, rmid = rdtgrp->mon.rmid;
623 
624 	if (rdtgrp->type == RDTCTRL_GROUP)
625 		closid = rdtgrp->closid;
626 	else if (rdtgrp->type == RDTMON_GROUP)
627 		closid = rdtgrp->mon.parent->closid;
628 	else
629 		return false;
630 
631 	return resctrl_arch_match_closid(tsk, closid) &&
632 	       resctrl_arch_match_rmid(tsk, closid, rmid);
633 }
634 
635 static int __rdtgroup_move_task(struct task_struct *tsk,
636 				struct rdtgroup *rdtgrp)
637 {
638 	/* If the task is already in rdtgrp, no need to move the task. */
639 	if (task_in_rdtgroup(tsk, rdtgrp))
640 		return 0;
641 
642 	/*
643 	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
644 	 * updated by them.
645 	 *
646 	 * For ctrl_mon groups, move both closid and rmid.
647 	 * For monitor groups, tasks can only be moved from
648 	 * their parent CTRL group.
649 	 */
650 	if (rdtgrp->type == RDTMON_GROUP &&
651 	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
652 		rdt_last_cmd_puts("Can't move task to different control group\n");
653 		return -EINVAL;
654 	}
655 
656 	if (rdtgrp->type == RDTMON_GROUP)
657 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
658 					     rdtgrp->mon.rmid);
659 	else
660 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
661 					     rdtgrp->mon.rmid);
662 
663 	/*
664 	 * Ensure the task's closid and rmid are written before determining if
665 	 * the task is current, which decides whether it will be interrupted.
666 	 * This pairs with the full barrier between the rq->curr update and
667 	 * resctrl_sched_in() during context switch.
668 	 */
669 	smp_mb();
670 
671 	/*
672 	 * By now, the task's closid and rmid are set. If the task is current
673 	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
674 	 * group go into effect. If the task is not current, the MSR will be
675 	 * updated when the task is scheduled in.
676 	 */
677 	update_task_closid_rmid(tsk);
678 
679 	return 0;
680 }
681 
682 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
683 {
684 	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
685 		resctrl_arch_match_closid(t, r->closid));
686 }
687 
688 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
689 {
690 	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
691 		resctrl_arch_match_rmid(t, r->mon.parent->closid,
692 					r->mon.rmid));
693 }
694 
695 /**
696  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
697  * @r: Resource group
698  *
699  * Return: 1 if tasks have been assigned to @r, 0 otherwise
700  */
701 int rdtgroup_tasks_assigned(struct rdtgroup *r)
702 {
703 	struct task_struct *p, *t;
704 	int ret = 0;
705 
706 	lockdep_assert_held(&rdtgroup_mutex);
707 
708 	rcu_read_lock();
709 	for_each_process_thread(p, t) {
710 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
711 			ret = 1;
712 			break;
713 		}
714 	}
715 	rcu_read_unlock();
716 
717 	return ret;
718 }
719 
720 static int rdtgroup_task_write_permission(struct task_struct *task,
721 					  struct kernfs_open_file *of)
722 {
723 	const struct cred *tcred = get_task_cred(task);
724 	const struct cred *cred = current_cred();
725 	int ret = 0;
726 
727 	/*
728 	 * Even if we're attaching all tasks in the thread group, we only
729 	 * need to check permissions on one of them.
730 	 */
731 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
732 	    !uid_eq(cred->euid, tcred->uid) &&
733 	    !uid_eq(cred->euid, tcred->suid)) {
734 		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
735 		ret = -EPERM;
736 	}
737 
738 	put_cred(tcred);
739 	return ret;
740 }
741 
742 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
743 			      struct kernfs_open_file *of)
744 {
745 	struct task_struct *tsk;
746 	int ret;
747 
748 	rcu_read_lock();
749 	if (pid) {
750 		tsk = find_task_by_vpid(pid);
751 		if (!tsk) {
752 			rcu_read_unlock();
753 			rdt_last_cmd_printf("No task %d\n", pid);
754 			return -ESRCH;
755 		}
756 	} else {
757 		tsk = current;
758 	}
759 
760 	get_task_struct(tsk);
761 	rcu_read_unlock();
762 
763 	ret = rdtgroup_task_write_permission(tsk, of);
764 	if (!ret)
765 		ret = __rdtgroup_move_task(tsk, rdtgrp);
766 
767 	put_task_struct(tsk);
768 	return ret;
769 }
770 
771 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
772 				    char *buf, size_t nbytes, loff_t off)
773 {
774 	struct rdtgroup *rdtgrp;
775 	char *pid_str;
776 	int ret = 0;
777 	pid_t pid;
778 
779 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
780 	if (!rdtgrp) {
781 		rdtgroup_kn_unlock(of->kn);
782 		return -ENOENT;
783 	}
784 	rdt_last_cmd_clear();
785 
786 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
787 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
788 		ret = -EINVAL;
789 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
790 		goto unlock;
791 	}
792 
793 	while (buf && buf[0] != '\0' && buf[0] != '\n') {
794 		pid_str = strim(strsep(&buf, ","));
795 
796 		if (kstrtoint(pid_str, 0, &pid)) {
797 			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
798 			ret = -EINVAL;
799 			break;
800 		}
801 
802 		if (pid < 0) {
803 			rdt_last_cmd_printf("Invalid pid %d\n", pid);
804 			ret = -EINVAL;
805 			break;
806 		}
807 
808 		ret = rdtgroup_move_task(pid, rdtgrp, of);
809 		if (ret) {
810 			rdt_last_cmd_printf("Error while processing task %d\n", pid);
811 			break;
812 		}
813 	}
814 
815 unlock:
816 	rdtgroup_kn_unlock(of->kn);
817 
818 	return ret ?: nbytes;
819 }
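/*
 * Usage sketch (illustrative; "g1" is a hypothetical group): the "tasks"
 * file accepts a comma separated list of PIDs, e.g.
 *
 *   echo "1234,5678" > /sys/fs/resctrl/g1/tasks
 *
 * Processing stops at the first PID that cannot be moved.
 */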
820 
821 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
822 {
823 	struct task_struct *p, *t;
824 	pid_t pid;
825 
826 	rcu_read_lock();
827 	for_each_process_thread(p, t) {
828 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
829 			pid = task_pid_vnr(t);
830 			if (pid)
831 				seq_printf(s, "%d\n", pid);
832 		}
833 	}
834 	rcu_read_unlock();
835 }
836 
837 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
838 			       struct seq_file *s, void *v)
839 {
840 	struct rdtgroup *rdtgrp;
841 	int ret = 0;
842 
843 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
844 	if (rdtgrp)
845 		show_rdt_tasks(rdtgrp, s);
846 	else
847 		ret = -ENOENT;
848 	rdtgroup_kn_unlock(of->kn);
849 
850 	return ret;
851 }
852 
853 static int rdtgroup_closid_show(struct kernfs_open_file *of,
854 				struct seq_file *s, void *v)
855 {
856 	struct rdtgroup *rdtgrp;
857 	int ret = 0;
858 
859 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
860 	if (rdtgrp)
861 		seq_printf(s, "%u\n", rdtgrp->closid);
862 	else
863 		ret = -ENOENT;
864 	rdtgroup_kn_unlock(of->kn);
865 
866 	return ret;
867 }
868 
869 static int rdtgroup_rmid_show(struct kernfs_open_file *of,
870 			      struct seq_file *s, void *v)
871 {
872 	struct rdtgroup *rdtgrp;
873 	int ret = 0;
874 
875 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
876 	if (rdtgrp)
877 		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
878 	else
879 		ret = -ENOENT;
880 	rdtgroup_kn_unlock(of->kn);
881 
882 	return ret;
883 }
884 
885 #ifdef CONFIG_PROC_CPU_RESCTRL
886 
887 /*
888  * A task can only be part of one resctrl control group and of one monitor
889  * group which is associated with that control group.
890  *
891  * 1)   res:
892  *      mon:
893  *
894  *    resctrl is not available.
895  *
896  * 2)   res:/
897  *      mon:
898  *
899  *    Task is part of the root resctrl control group, and it is not associated
900  *    with any monitor group.
901  *
902  * 3)  res:/
903  *     mon:mon0
904  *
905  *    Task is part of the root resctrl control group and monitor group mon0.
906  *
907  * 4)  res:group0
908  *     mon:
909  *
910  *    Task is part of resctrl control group group0, and it is not associated
911  *    with any monitor group.
912  *
913  * 5) res:group0
914  *    mon:mon1
915  *
916  *    Task is part of resctrl control group group0 and monitor group mon1.
917  */
918 int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
919 		      struct pid *pid, struct task_struct *tsk)
920 {
921 	struct rdtgroup *rdtg;
922 	int ret = 0;
923 
924 	mutex_lock(&rdtgroup_mutex);
925 
926 	/* Return empty if resctrl has not been mounted. */
927 	if (!resctrl_mounted) {
928 		seq_puts(s, "res:\nmon:\n");
929 		goto unlock;
930 	}
931 
932 	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
933 		struct rdtgroup *crg;
934 
935 		/*
936 		 * Task information is only relevant for shareable
937 		 * and exclusive groups.
938 		 */
939 		if (rdtg->mode != RDT_MODE_SHAREABLE &&
940 		    rdtg->mode != RDT_MODE_EXCLUSIVE)
941 			continue;
942 
943 		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
944 			continue;
945 
946 		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
947 			   rdtg->kn->name);
948 		seq_puts(s, "mon:");
949 		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
950 				    mon.crdtgrp_list) {
951 			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
952 						     crg->mon.rmid))
953 				continue;
954 			seq_printf(s, "%s", crg->kn->name);
955 			break;
956 		}
957 		seq_putc(s, '\n');
958 		goto unlock;
959 	}
960 	/*
961 	 * The above search should succeed. Otherwise return
962 	 * with an error.
963 	 */
964 	ret = -ENOENT;
965 unlock:
966 	mutex_unlock(&rdtgroup_mutex);
967 
968 	return ret;
969 }
970 #endif
971 
972 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
973 				    struct seq_file *seq, void *v)
974 {
975 	int len;
976 
977 	mutex_lock(&rdtgroup_mutex);
978 	len = seq_buf_used(&last_cmd_status);
979 	if (len)
980 		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
981 	else
982 		seq_puts(seq, "ok\n");
983 	mutex_unlock(&rdtgroup_mutex);
984 	return 0;
985 }
986 
987 static int rdt_num_closids_show(struct kernfs_open_file *of,
988 				struct seq_file *seq, void *v)
989 {
990 	struct resctrl_schema *s = of->kn->parent->priv;
991 
992 	seq_printf(seq, "%u\n", s->num_closid);
993 	return 0;
994 }
995 
996 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
997 			     struct seq_file *seq, void *v)
998 {
999 	struct resctrl_schema *s = of->kn->parent->priv;
1000 	struct rdt_resource *r = s->res;
1001 
1002 	seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
1003 	return 0;
1004 }
1005 
1006 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
1007 			     struct seq_file *seq, void *v)
1008 {
1009 	struct resctrl_schema *s = of->kn->parent->priv;
1010 	struct rdt_resource *r = s->res;
1011 
1012 	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
1013 	return 0;
1014 }
1015 
1016 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
1017 				   struct seq_file *seq, void *v)
1018 {
1019 	struct resctrl_schema *s = of->kn->parent->priv;
1020 	struct rdt_resource *r = s->res;
1021 
1022 	seq_printf(seq, "%x\n", r->cache.shareable_bits);
1023 	return 0;
1024 }
1025 
1026 /*
1027  * rdt_bit_usage_show - Display current usage of resources
1028  *
1029  * A domain is a shared resource that can now be allocated differently. Here
1030  * we display the current regions of the domain as an annotated bitmask.
1031  * For each domain of this resource its allocation bitmask
1032  * is annotated as below to indicate the current usage of the corresponding bit:
1033  *   0 - currently unused
1034  *   X - currently available for sharing and used by software and hardware
1035  *   H - currently used by hardware only but available for software use
1036  *   S - currently used and shareable by software only
1037  *   E - currently used exclusively by one resource group
1038  *   P - currently pseudo-locked by one resource group
1039  */
1040 static int rdt_bit_usage_show(struct kernfs_open_file *of,
1041 			      struct seq_file *seq, void *v)
1042 {
1043 	struct resctrl_schema *s = of->kn->parent->priv;
1044 	/*
1045 	 * Use unsigned long even though only 32 bits are used to ensure
1046 	 * test_bit() is used safely.
1047 	 */
1048 	unsigned long sw_shareable = 0, hw_shareable = 0;
1049 	unsigned long exclusive = 0, pseudo_locked = 0;
1050 	struct rdt_resource *r = s->res;
1051 	struct rdt_ctrl_domain *dom;
1052 	int i, hwb, swb, excl, psl;
1053 	enum rdtgrp_mode mode;
1054 	bool sep = false;
1055 	u32 ctrl_val;
1056 
1057 	cpus_read_lock();
1058 	mutex_lock(&rdtgroup_mutex);
1059 	hw_shareable = r->cache.shareable_bits;
1060 	list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
1061 		if (sep)
1062 			seq_putc(seq, ';');
1063 		sw_shareable = 0;
1064 		exclusive = 0;
1065 		seq_printf(seq, "%d=", dom->hdr.id);
1066 		for (i = 0; i < closids_supported(); i++) {
1067 			if (!closid_allocated(i))
1068 				continue;
1069 			ctrl_val = resctrl_arch_get_config(r, dom, i,
1070 							   s->conf_type);
1071 			mode = rdtgroup_mode_by_closid(i);
1072 			switch (mode) {
1073 			case RDT_MODE_SHAREABLE:
1074 				sw_shareable |= ctrl_val;
1075 				break;
1076 			case RDT_MODE_EXCLUSIVE:
1077 				exclusive |= ctrl_val;
1078 				break;
1079 			case RDT_MODE_PSEUDO_LOCKSETUP:
1080 			/*
1081 			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
1082 			 * here but not included since the CBM
1083 			 * associated with this CLOSID in this mode
1084 			 * is not initialized and no task or cpu can be
1085 			 * assigned this CLOSID.
1086 			 */
1087 				break;
1088 			case RDT_MODE_PSEUDO_LOCKED:
1089 			case RDT_NUM_MODES:
1090 				WARN(1,
1091 				     "invalid mode for closid %d\n", i);
1092 				break;
1093 			}
1094 		}
1095 		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
1096 			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
1097 			hwb = test_bit(i, &hw_shareable);
1098 			swb = test_bit(i, &sw_shareable);
1099 			excl = test_bit(i, &exclusive);
1100 			psl = test_bit(i, &pseudo_locked);
1101 			if (hwb && swb)
1102 				seq_putc(seq, 'X');
1103 			else if (hwb && !swb)
1104 				seq_putc(seq, 'H');
1105 			else if (!hwb && swb)
1106 				seq_putc(seq, 'S');
1107 			else if (excl)
1108 				seq_putc(seq, 'E');
1109 			else if (psl)
1110 				seq_putc(seq, 'P');
1111 			else /* Unused bits remain */
1112 				seq_putc(seq, '0');
1113 		}
1114 		sep = true;
1115 	}
1116 	seq_putc(seq, '\n');
1117 	mutex_unlock(&rdtgroup_mutex);
1118 	cpus_read_unlock();
1119 	return 0;
1120 }
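/*
 * Example "bit_usage" output (illustrative): for a resource with an 8-bit
 * CBM, no hardware-owned bits, and two cache domains where one group owns
 * the top half of domain 1 exclusively, the file would read:
 *
 *   0=SSSSSSSS;1=EEEESSSS
 */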
1121 
1122 static int rdt_min_bw_show(struct kernfs_open_file *of,
1123 			     struct seq_file *seq, void *v)
1124 {
1125 	struct resctrl_schema *s = of->kn->parent->priv;
1126 	struct rdt_resource *r = s->res;
1127 
1128 	seq_printf(seq, "%u\n", r->membw.min_bw);
1129 	return 0;
1130 }
1131 
1132 static int rdt_num_rmids_show(struct kernfs_open_file *of,
1133 			      struct seq_file *seq, void *v)
1134 {
1135 	struct rdt_resource *r = of->kn->parent->priv;
1136 
1137 	seq_printf(seq, "%d\n", r->num_rmid);
1138 
1139 	return 0;
1140 }
1141 
1142 static int rdt_mon_features_show(struct kernfs_open_file *of,
1143 				 struct seq_file *seq, void *v)
1144 {
1145 	struct rdt_resource *r = of->kn->parent->priv;
1146 	struct mon_evt *mevt;
1147 
1148 	list_for_each_entry(mevt, &r->evt_list, list) {
1149 		seq_printf(seq, "%s\n", mevt->name);
1150 		if (mevt->configurable)
1151 			seq_printf(seq, "%s_config\n", mevt->name);
1152 	}
1153 
1154 	return 0;
1155 }
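/*
 * Example "mon_features" output (illustrative, hardware with all three L3
 * events and configurable MBM counters):
 *
 *   llc_occupancy
 *   mbm_total_bytes
 *   mbm_total_bytes_config
 *   mbm_local_bytes
 *   mbm_local_bytes_config
 */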
1156 
1157 static int rdt_bw_gran_show(struct kernfs_open_file *of,
1158 			     struct seq_file *seq, void *v)
1159 {
1160 	struct resctrl_schema *s = of->kn->parent->priv;
1161 	struct rdt_resource *r = s->res;
1162 
1163 	seq_printf(seq, "%u\n", r->membw.bw_gran);
1164 	return 0;
1165 }
1166 
1167 static int rdt_delay_linear_show(struct kernfs_open_file *of,
1168 			     struct seq_file *seq, void *v)
1169 {
1170 	struct resctrl_schema *s = of->kn->parent->priv;
1171 	struct rdt_resource *r = s->res;
1172 
1173 	seq_printf(seq, "%u\n", r->membw.delay_linear);
1174 	return 0;
1175 }
1176 
1177 static int max_threshold_occ_show(struct kernfs_open_file *of,
1178 				  struct seq_file *seq, void *v)
1179 {
1180 	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
1181 
1182 	return 0;
1183 }
1184 
1185 static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
1186 					 struct seq_file *seq, void *v)
1187 {
1188 	struct resctrl_schema *s = of->kn->parent->priv;
1189 	struct rdt_resource *r = s->res;
1190 
1191 	switch (r->membw.throttle_mode) {
1192 	case THREAD_THROTTLE_PER_THREAD:
1193 		seq_puts(seq, "per-thread\n");
1194 		return 0;
1195 	case THREAD_THROTTLE_MAX:
1196 		seq_puts(seq, "max\n");
1197 		return 0;
1198 	case THREAD_THROTTLE_UNDEFINED:
1199 		seq_puts(seq, "undefined\n");
1200 		return 0;
1201 	}
1202 
1203 	WARN_ON_ONCE(1);
1204 
1205 	return 0;
1206 }
1207 
1208 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
1209 				       char *buf, size_t nbytes, loff_t off)
1210 {
1211 	unsigned int bytes;
1212 	int ret;
1213 
1214 	ret = kstrtouint(buf, 0, &bytes);
1215 	if (ret)
1216 		return ret;
1217 
1218 	if (bytes > resctrl_rmid_realloc_limit)
1219 		return -EINVAL;
1220 
1221 	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
1222 
1223 	return nbytes;
1224 }
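/*
 * Usage sketch (illustrative): the threshold is written in bytes and is
 * adjusted by resctrl_arch_round_mon_val() to what the monitoring counters
 * can represent, e.g.
 *
 *   echo 65536 > /sys/fs/resctrl/info/L3_MON/max_threshold_occupancy
 */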
1225 
1226 /*
1227  * rdtgroup_mode_show - Display mode of this resource group
1228  */
1229 static int rdtgroup_mode_show(struct kernfs_open_file *of,
1230 			      struct seq_file *s, void *v)
1231 {
1232 	struct rdtgroup *rdtgrp;
1233 
1234 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1235 	if (!rdtgrp) {
1236 		rdtgroup_kn_unlock(of->kn);
1237 		return -ENOENT;
1238 	}
1239 
1240 	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1241 
1242 	rdtgroup_kn_unlock(of->kn);
1243 	return 0;
1244 }
1245 
1246 static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
1247 {
1248 	switch (my_type) {
1249 	case CDP_CODE:
1250 		return CDP_DATA;
1251 	case CDP_DATA:
1252 		return CDP_CODE;
1253 	default:
1254 	case CDP_NONE:
1255 		return CDP_NONE;
1256 	}
1257 }
1258 
1259 static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
1260 					struct seq_file *seq, void *v)
1261 {
1262 	struct resctrl_schema *s = of->kn->parent->priv;
1263 	struct rdt_resource *r = s->res;
1264 
1265 	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
1266 
1267 	return 0;
1268 }
1269 
1270 /**
1271  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1272  * @r: Resource to which domain instance @d belongs.
1273  * @d: The domain instance for which @closid is being tested.
1274  * @cbm: Capacity bitmask being tested.
1275  * @closid: Intended closid for @cbm.
1276  * @type: CDP type of @r.
1277  * @exclusive: Only check if overlaps with exclusive resource groups
1278  *
1279  * Checks if provided @cbm intended to be used for @closid on domain
1280  * @d overlaps with any other closids or other hardware usage associated
1281  * with this domain. If @exclusive is true then only overlaps with
1282  * resource groups in exclusive mode will be considered. If @exclusive
1283  * is false then overlaps with any resource group or hardware entities
1284  * will be considered.
1285  *
1286  * @cbm is unsigned long, even if only 32 bits are used, to make the
1287  * bitmap functions work correctly.
1288  *
1289  * Return: false if CBM does not overlap, true if it does.
1290  */
1291 static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1292 				    unsigned long cbm, int closid,
1293 				    enum resctrl_conf_type type, bool exclusive)
1294 {
1295 	enum rdtgrp_mode mode;
1296 	unsigned long ctrl_b;
1297 	int i;
1298 
1299 	/* Check for any overlap with regions used by hardware directly */
1300 	if (!exclusive) {
1301 		ctrl_b = r->cache.shareable_bits;
1302 		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1303 			return true;
1304 	}
1305 
1306 	/* Check for overlap with other resource groups */
1307 	for (i = 0; i < closids_supported(); i++) {
1308 		ctrl_b = resctrl_arch_get_config(r, d, i, type);
1309 		mode = rdtgroup_mode_by_closid(i);
1310 		if (closid_allocated(i) && i != closid &&
1311 		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1312 			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1313 				if (exclusive) {
1314 					if (mode == RDT_MODE_EXCLUSIVE)
1315 						return true;
1316 					continue;
1317 				}
1318 				return true;
1319 			}
1320 		}
1321 	}
1322 
1323 	return false;
1324 }
1325 
1326 /**
1327  * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1328  * @s: Schema for the resource to which domain instance @d belongs.
1329  * @d: The domain instance for which @closid is being tested.
1330  * @cbm: Capacity bitmask being tested.
1331  * @closid: Intended closid for @cbm.
1332  * @exclusive: Only check if overlaps with exclusive resource groups
1333  *
1334  * Resources that can be allocated using a CBM can use the CBM to control
1335  * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
1336  * for overlap. Overlap test is not limited to the specific resource for
1337  * which the CBM is intended though - when dealing with CDP resources that
1338  * share the underlying hardware the overlap check should be performed on
1339  * the CDP resource sharing the hardware also.
1340  *
1341  * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1342  * overlap test.
1343  *
1344  * Return: true if CBM overlap detected, false if there is no overlap
1345  */
1346 bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
1347 			   unsigned long cbm, int closid, bool exclusive)
1348 {
1349 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
1350 	struct rdt_resource *r = s->res;
1351 
1352 	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
1353 				    exclusive))
1354 		return true;
1355 
1356 	if (!resctrl_arch_get_cdp_enabled(r->rid))
1357 		return false;
1358 	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
1359 }
1360 
1361 /**
1362  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1363  * @rdtgrp: Resource group identified through its closid.
1364  *
1365  * An exclusive resource group implies that there should be no sharing of
1366  * its allocated resources. At the time this group is considered to be
1367  * exclusive this test can determine if its current schemata supports this
1368  * setting by testing for overlap with all other resource groups.
1369  *
1370  * Return: true if resource group can be exclusive, false if there is overlap
1371  * with allocations of other resource groups and thus this resource group
1372  * cannot be exclusive.
1373  */
1374 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1375 {
1376 	int closid = rdtgrp->closid;
1377 	struct rdt_ctrl_domain *d;
1378 	struct resctrl_schema *s;
1379 	struct rdt_resource *r;
1380 	bool has_cache = false;
1381 	u32 ctrl;
1382 
1383 	/* Walking r->domains, ensure it can't race with cpuhp */
1384 	lockdep_assert_cpus_held();
1385 
1386 	list_for_each_entry(s, &resctrl_schema_all, list) {
1387 		r = s->res;
1388 		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
1389 			continue;
1390 		has_cache = true;
1391 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1392 			ctrl = resctrl_arch_get_config(r, d, closid,
1393 						       s->conf_type);
1394 			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
1395 				rdt_last_cmd_puts("Schemata overlaps\n");
1396 				return false;
1397 			}
1398 		}
1399 	}
1400 
1401 	if (!has_cache) {
1402 		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1403 		return false;
1404 	}
1405 
1406 	return true;
1407 }
1408 
1409 /*
1410  * rdtgroup_mode_write - Modify the resource group's mode
1411  */
1412 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1413 				   char *buf, size_t nbytes, loff_t off)
1414 {
1415 	struct rdtgroup *rdtgrp;
1416 	enum rdtgrp_mode mode;
1417 	int ret = 0;
1418 
1419 	/* Valid input requires a trailing newline */
1420 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1421 		return -EINVAL;
1422 	buf[nbytes - 1] = '\0';
1423 
1424 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1425 	if (!rdtgrp) {
1426 		rdtgroup_kn_unlock(of->kn);
1427 		return -ENOENT;
1428 	}
1429 
1430 	rdt_last_cmd_clear();
1431 
1432 	mode = rdtgrp->mode;
1433 
1434 	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1435 	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1436 	    (!strcmp(buf, "pseudo-locksetup") &&
1437 	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1438 	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1439 		goto out;
1440 
1441 	if (mode == RDT_MODE_PSEUDO_LOCKED) {
1442 		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1443 		ret = -EINVAL;
1444 		goto out;
1445 	}
1446 
1447 	if (!strcmp(buf, "shareable")) {
1448 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1449 			ret = rdtgroup_locksetup_exit(rdtgrp);
1450 			if (ret)
1451 				goto out;
1452 		}
1453 		rdtgrp->mode = RDT_MODE_SHAREABLE;
1454 	} else if (!strcmp(buf, "exclusive")) {
1455 		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1456 			ret = -EINVAL;
1457 			goto out;
1458 		}
1459 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1460 			ret = rdtgroup_locksetup_exit(rdtgrp);
1461 			if (ret)
1462 				goto out;
1463 		}
1464 		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1465 	} else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
1466 		   !strcmp(buf, "pseudo-locksetup")) {
1467 		ret = rdtgroup_locksetup_enter(rdtgrp);
1468 		if (ret)
1469 			goto out;
1470 		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1471 	} else {
1472 		rdt_last_cmd_puts("Unknown or unsupported mode\n");
1473 		ret = -EINVAL;
1474 	}
1475 
1476 out:
1477 	rdtgroup_kn_unlock(of->kn);
1478 	return ret ?: nbytes;
1479 }
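/*
 * Usage sketch (illustrative; "g1" is a hypothetical group): the "mode"
 * file accepts one of the strings from rdt_mode_str[] plus a trailing
 * newline, e.g.
 *
 *   echo exclusive > /sys/fs/resctrl/g1/mode
 */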
1480 
1481 /**
1482  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1483  * @r: RDT resource to which @d belongs.
1484  * @d: RDT domain instance.
1485  * @cbm: bitmask for which the size should be computed.
1486  *
1487  * The bitmask provided, associated with the RDT domain instance @d, will be
1488  * translated into how many bytes it represents. The size in bytes is
1489  * computed by first dividing the total cache size by the CBM length to
1490  * determine how many bytes each bit in the bitmask represents. The result
1491  * is multiplied with the number of bits set in the bitmask.
1492  *
1493  * @cbm is unsigned long, even if only 32 bits are used, to make the
1494  * bitmap functions work correctly.
1495  */
1496 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1497 				  struct rdt_ctrl_domain *d, unsigned long cbm)
1498 {
1499 	unsigned int size = 0;
1500 	struct cacheinfo *ci;
1501 	int num_b;
1502 
1503 	if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
1504 		return size;
1505 
1506 	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1507 	ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
1508 	if (ci)
1509 		size = ci->size / r->cache.cbm_len * num_b;
1510 
1511 	return size;
1512 }
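/*
 * Worked example (illustrative figures, not from real hardware): with a
 * 32 MB cache and a 16 bit CBM, each bit represents 32 MB / 16 = 2 MB, so
 * a cbm of 0x00ff (8 bits set) translates to 8 * 2 MB = 16 MB.
 */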
1513 
1514 /*
1515  * rdtgroup_size_show - Display size in bytes of allocated regions
1516  *
1517  * The "size" file mirrors the layout of the "schemata" file, printing the
1518  * size in bytes of each region instead of the capacity bitmask.
1519  */
1520 static int rdtgroup_size_show(struct kernfs_open_file *of,
1521 			      struct seq_file *s, void *v)
1522 {
1523 	struct resctrl_schema *schema;
1524 	enum resctrl_conf_type type;
1525 	struct rdt_ctrl_domain *d;
1526 	struct rdtgroup *rdtgrp;
1527 	struct rdt_resource *r;
1528 	unsigned int size;
1529 	int ret = 0;
1530 	u32 closid;
1531 	bool sep;
1532 	u32 ctrl;
1533 
1534 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1535 	if (!rdtgrp) {
1536 		rdtgroup_kn_unlock(of->kn);
1537 		return -ENOENT;
1538 	}
1539 
1540 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1541 		if (!rdtgrp->plr->d) {
1542 			rdt_last_cmd_clear();
1543 			rdt_last_cmd_puts("Cache domain offline\n");
1544 			ret = -ENODEV;
1545 		} else {
1546 			seq_printf(s, "%*s:", max_name_width,
1547 				   rdtgrp->plr->s->name);
1548 			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
1549 						    rdtgrp->plr->d,
1550 						    rdtgrp->plr->cbm);
1551 			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
1552 		}
1553 		goto out;
1554 	}
1555 
1556 	closid = rdtgrp->closid;
1557 
1558 	list_for_each_entry(schema, &resctrl_schema_all, list) {
1559 		r = schema->res;
1560 		type = schema->conf_type;
1561 		sep = false;
1562 		seq_printf(s, "%*s:", max_name_width, schema->name);
1563 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1564 			if (sep)
1565 				seq_putc(s, ';');
1566 			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1567 				size = 0;
1568 			} else {
1569 				if (is_mba_sc(r))
1570 					ctrl = d->mbps_val[closid];
1571 				else
1572 					ctrl = resctrl_arch_get_config(r, d,
1573 								       closid,
1574 								       type);
1575 				if (r->rid == RDT_RESOURCE_MBA ||
1576 				    r->rid == RDT_RESOURCE_SMBA)
1577 					size = ctrl;
1578 				else
1579 					size = rdtgroup_cbm_to_size(r, d, ctrl);
1580 			}
1581 			seq_printf(s, "%d=%u", d->hdr.id, size);
1582 			sep = true;
1583 		}
1584 		seq_putc(s, '\n');
1585 	}
1586 
1587 out:
1588 	rdtgroup_kn_unlock(of->kn);
1589 
1590 	return ret;
1591 }
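/*
 * Example "size" output (illustrative): same layout as "schemata" but with
 * bytes per domain for cache resources and the raw control value for
 * bandwidth resources, e.g.
 *
 *      L3:0=4194304;1=4194304
 *      MB:0=100;1=100
 */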
1592 
1593 #define INVALID_CONFIG_INDEX   UINT_MAX
1594 
1595 /**
1596  * mon_event_config_index_get - get the hardware index for the
1597  *                              configurable event
1598  * @evtid: event id.
1599  *
1600  * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
1601  *         1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
1602  *         INVALID_CONFIG_INDEX for invalid evtid
1603  */
1604 static inline unsigned int mon_event_config_index_get(u32 evtid)
1605 {
1606 	switch (evtid) {
1607 	case QOS_L3_MBM_TOTAL_EVENT_ID:
1608 		return 0;
1609 	case QOS_L3_MBM_LOCAL_EVENT_ID:
1610 		return 1;
1611 	default:
1612 		/* Should never reach here */
1613 		return INVALID_CONFIG_INDEX;
1614 	}
1615 }
1616 
1617 void resctrl_arch_mon_event_config_read(void *_config_info)
1618 {
1619 	struct resctrl_mon_config_info *config_info = _config_info;
1620 	unsigned int index;
1621 	u64 msrval;
1622 
1623 	index = mon_event_config_index_get(config_info->evtid);
1624 	if (index == INVALID_CONFIG_INDEX) {
1625 		pr_warn_once("Invalid event id %d\n", config_info->evtid);
1626 		return;
1627 	}
1628 	rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
1629 
1630 	/* Report only the valid event configuration bits */
1631 	config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
1632 }
1633 
1634 static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
1635 {
1636 	smp_call_function_any(&mon_info->d->hdr.cpu_mask,
1637 			      resctrl_arch_mon_event_config_read, mon_info, 1);
1638 }
1639 
1640 static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
1641 {
1642 	struct resctrl_mon_config_info mon_info;
1643 	struct rdt_mon_domain *dom;
1644 	bool sep = false;
1645 
1646 	cpus_read_lock();
1647 	mutex_lock(&rdtgroup_mutex);
1648 
1649 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
1650 		if (sep)
1651 			seq_puts(s, ";");
1652 
1653 		memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
1654 		mon_info.r = r;
1655 		mon_info.d = dom;
1656 		mon_info.evtid = evtid;
1657 		mondata_config_read(&mon_info);
1658 
1659 		seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
1660 		sep = true;
1661 	}
1662 	seq_puts(s, "\n");
1663 
1664 	mutex_unlock(&rdtgroup_mutex);
1665 	cpus_read_unlock();
1666 
1667 	return 0;
1668 }
1669 
1670 static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
1671 				       struct seq_file *seq, void *v)
1672 {
1673 	struct rdt_resource *r = of->kn->parent->priv;
1674 
1675 	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
1676 
1677 	return 0;
1678 }
1679 
1680 static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
1681 				       struct seq_file *seq, void *v)
1682 {
1683 	struct rdt_resource *r = of->kn->parent->priv;
1684 
1685 	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
1686 
1687 	return 0;
1688 }
1689 
1690 void resctrl_arch_mon_event_config_write(void *_config_info)
1691 {
1692 	struct resctrl_mon_config_info *config_info = _config_info;
1693 	unsigned int index;
1694 
1695 	index = mon_event_config_index_get(config_info->evtid);
1696 	if (index == INVALID_CONFIG_INDEX) {
1697 		pr_warn_once("Invalid event id %d\n", config_info->evtid);
1698 		return;
1699 	}
1700 	wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
1701 }
1702 
1703 static void mbm_config_write_domain(struct rdt_resource *r,
1704 				    struct rdt_mon_domain *d, u32 evtid, u32 val)
1705 {
1706 	struct resctrl_mon_config_info mon_info = {0};
1707 
1708 	/*
1709 	 * Read the current config value first. If both are the same then
1710 	 * no need to write it again.
1711 	 */
1712 	mon_info.r = r;
1713 	mon_info.d = d;
1714 	mon_info.evtid = evtid;
1715 	mondata_config_read(&mon_info);
1716 	if (mon_info.mon_config == val)
1717 		return;
1718 
1719 	mon_info.mon_config = val;
1720 
1721 	/*
1722 	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
1723 	 * domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
1724 	 * are scoped at the domain level. Writing any of these MSRs
1725 	 * on one CPU is observed by all the CPUs in the domain.
1726 	 */
1727 	smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
1728 			      &mon_info, 1);
1729 
1730 	/*
1731 	 * When an Event Configuration is changed, the bandwidth counters
1732 	 * for all RMIDs and Events will be cleared by the hardware. The
1733 	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
1734 	 * every RMID on the next read to any event.
1735 	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
1736 	 * cleared while it is tracked by the hardware. Clear the
1737 	 * mbm_local and mbm_total counts for all the RMIDs.
1738 	 */
1739 	resctrl_arch_reset_rmid_all(r, d);
1740 }
1741 
1742 static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
1743 {
1744 	char *dom_str = NULL, *id_str;
1745 	unsigned long dom_id, val;
1746 	struct rdt_mon_domain *d;
1747 
1748 	/* Walking r->domains, ensure it can't race with cpuhp */
1749 	lockdep_assert_cpus_held();
1750 
1751 next:
1752 	if (!tok || tok[0] == '\0')
1753 		return 0;
1754 
1755 	/* Start processing the strings for each domain */
1756 	dom_str = strim(strsep(&tok, ";"));
1757 	id_str = strsep(&dom_str, "=");
1758 
1759 	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
1760 		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
1761 		return -EINVAL;
1762 	}
1763 
1764 	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
1765 		rdt_last_cmd_puts("Non-numeric event configuration value\n");
1766 		return -EINVAL;
1767 	}
1768 
1769 	/* Value from user cannot be more than the supported set of events */
1770 	if ((val & r->mbm_cfg_mask) != val) {
1771 		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
1772 				    r->mbm_cfg_mask);
1773 		return -EINVAL;
1774 	}
1775 
1776 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
1777 		if (d->hdr.id == dom_id) {
1778 			mbm_config_write_domain(r, d, evtid, val);
1779 			goto next;
1780 		}
1781 	}
1782 
1783 	return -EINVAL;
1784 }
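/*
 * Input format sketch (illustrative): one "<domain id>=<hex value>" pair
 * per domain, pairs separated by ';', e.g.
 *
 *   echo "0=0x30;1=0x30" > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
 *
 * The hexadecimal value must be a subset of the bits in r->mbm_cfg_mask.
 */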
1785 
1786 static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
1787 					    char *buf, size_t nbytes,
1788 					    loff_t off)
1789 {
1790 	struct rdt_resource *r = of->kn->parent->priv;
1791 	int ret;
1792 
1793 	/* Valid input requires a trailing newline */
1794 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1795 		return -EINVAL;
1796 
1797 	cpus_read_lock();
1798 	mutex_lock(&rdtgroup_mutex);
1799 
1800 	rdt_last_cmd_clear();
1801 
1802 	buf[nbytes - 1] = '\0';
1803 
1804 	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
1805 
1806 	mutex_unlock(&rdtgroup_mutex);
1807 	cpus_read_unlock();
1808 
1809 	return ret ?: nbytes;
1810 }
1811 
1812 static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
1813 					    char *buf, size_t nbytes,
1814 					    loff_t off)
1815 {
1816 	struct rdt_resource *r = of->kn->parent->priv;
1817 	int ret;
1818 
1819 	/* Valid input requires a trailing newline */
1820 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1821 		return -EINVAL;
1822 
1823 	cpus_read_lock();
1824 	mutex_lock(&rdtgroup_mutex);
1825 
1826 	rdt_last_cmd_clear();
1827 
1828 	buf[nbytes - 1] = '\0';
1829 
1830 	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
1831 
1832 	mutex_unlock(&rdtgroup_mutex);
1833 	cpus_read_unlock();
1834 
1835 	return ret ?: nbytes;
1836 }
1837 
1838 /* rdtgroup information files for one cache resource. */
1839 static struct rftype res_common_files[] = {
1840 	{
1841 		.name		= "last_cmd_status",
1842 		.mode		= 0444,
1843 		.kf_ops		= &rdtgroup_kf_single_ops,
1844 		.seq_show	= rdt_last_cmd_status_show,
1845 		.fflags		= RFTYPE_TOP_INFO,
1846 	},
1847 	{
1848 		.name		= "num_closids",
1849 		.mode		= 0444,
1850 		.kf_ops		= &rdtgroup_kf_single_ops,
1851 		.seq_show	= rdt_num_closids_show,
1852 		.fflags		= RFTYPE_CTRL_INFO,
1853 	},
1854 	{
1855 		.name		= "mon_features",
1856 		.mode		= 0444,
1857 		.kf_ops		= &rdtgroup_kf_single_ops,
1858 		.seq_show	= rdt_mon_features_show,
1859 		.fflags		= RFTYPE_MON_INFO,
1860 	},
1861 	{
1862 		.name		= "num_rmids",
1863 		.mode		= 0444,
1864 		.kf_ops		= &rdtgroup_kf_single_ops,
1865 		.seq_show	= rdt_num_rmids_show,
1866 		.fflags		= RFTYPE_MON_INFO,
1867 	},
1868 	{
1869 		.name		= "cbm_mask",
1870 		.mode		= 0444,
1871 		.kf_ops		= &rdtgroup_kf_single_ops,
1872 		.seq_show	= rdt_default_ctrl_show,
1873 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1874 	},
1875 	{
1876 		.name		= "min_cbm_bits",
1877 		.mode		= 0444,
1878 		.kf_ops		= &rdtgroup_kf_single_ops,
1879 		.seq_show	= rdt_min_cbm_bits_show,
1880 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1881 	},
1882 	{
1883 		.name		= "shareable_bits",
1884 		.mode		= 0444,
1885 		.kf_ops		= &rdtgroup_kf_single_ops,
1886 		.seq_show	= rdt_shareable_bits_show,
1887 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1888 	},
1889 	{
1890 		.name		= "bit_usage",
1891 		.mode		= 0444,
1892 		.kf_ops		= &rdtgroup_kf_single_ops,
1893 		.seq_show	= rdt_bit_usage_show,
1894 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1895 	},
1896 	{
1897 		.name		= "min_bandwidth",
1898 		.mode		= 0444,
1899 		.kf_ops		= &rdtgroup_kf_single_ops,
1900 		.seq_show	= rdt_min_bw_show,
1901 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1902 	},
1903 	{
1904 		.name		= "bandwidth_gran",
1905 		.mode		= 0444,
1906 		.kf_ops		= &rdtgroup_kf_single_ops,
1907 		.seq_show	= rdt_bw_gran_show,
1908 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1909 	},
1910 	{
1911 		.name		= "delay_linear",
1912 		.mode		= 0444,
1913 		.kf_ops		= &rdtgroup_kf_single_ops,
1914 		.seq_show	= rdt_delay_linear_show,
1915 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1916 	},
1917 	/*
1918 	 * Which (if any) capabilities are provided by thread_throttle_mode is
1919 	 * platform specific. Defer "fflags" initialization to platform
1920 	 * discovery.
1921 	 */
1922 	{
1923 		.name		= "thread_throttle_mode",
1924 		.mode		= 0444,
1925 		.kf_ops		= &rdtgroup_kf_single_ops,
1926 		.seq_show	= rdt_thread_throttle_mode_show,
1927 	},
1928 	{
1929 		.name		= "max_threshold_occupancy",
1930 		.mode		= 0644,
1931 		.kf_ops		= &rdtgroup_kf_single_ops,
1932 		.write		= max_threshold_occ_write,
1933 		.seq_show	= max_threshold_occ_show,
1934 		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
1935 	},
1936 	{
1937 		.name		= "mbm_total_bytes_config",
1938 		.mode		= 0644,
1939 		.kf_ops		= &rdtgroup_kf_single_ops,
1940 		.seq_show	= mbm_total_bytes_config_show,
1941 		.write		= mbm_total_bytes_config_write,
1942 	},
1943 	{
1944 		.name		= "mbm_local_bytes_config",
1945 		.mode		= 0644,
1946 		.kf_ops		= &rdtgroup_kf_single_ops,
1947 		.seq_show	= mbm_local_bytes_config_show,
1948 		.write		= mbm_local_bytes_config_write,
1949 	},
1950 	{
1951 		.name		= "cpus",
1952 		.mode		= 0644,
1953 		.kf_ops		= &rdtgroup_kf_single_ops,
1954 		.write		= rdtgroup_cpus_write,
1955 		.seq_show	= rdtgroup_cpus_show,
1956 		.fflags		= RFTYPE_BASE,
1957 	},
1958 	{
1959 		.name		= "cpus_list",
1960 		.mode		= 0644,
1961 		.kf_ops		= &rdtgroup_kf_single_ops,
1962 		.write		= rdtgroup_cpus_write,
1963 		.seq_show	= rdtgroup_cpus_show,
1964 		.flags		= RFTYPE_FLAGS_CPUS_LIST,
1965 		.fflags		= RFTYPE_BASE,
1966 	},
1967 	{
1968 		.name		= "tasks",
1969 		.mode		= 0644,
1970 		.kf_ops		= &rdtgroup_kf_single_ops,
1971 		.write		= rdtgroup_tasks_write,
1972 		.seq_show	= rdtgroup_tasks_show,
1973 		.fflags		= RFTYPE_BASE,
1974 	},
1975 	{
1976 		.name		= "mon_hw_id",
1977 		.mode		= 0444,
1978 		.kf_ops		= &rdtgroup_kf_single_ops,
1979 		.seq_show	= rdtgroup_rmid_show,
1980 		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
1981 	},
1982 	{
1983 		.name		= "schemata",
1984 		.mode		= 0644,
1985 		.kf_ops		= &rdtgroup_kf_single_ops,
1986 		.write		= rdtgroup_schemata_write,
1987 		.seq_show	= rdtgroup_schemata_show,
1988 		.fflags		= RFTYPE_CTRL_BASE,
1989 	},
1990 	{
1991 		.name		= "mba_MBps_event",
1992 		.mode		= 0644,
1993 		.kf_ops		= &rdtgroup_kf_single_ops,
1994 		.write		= rdtgroup_mba_mbps_event_write,
1995 		.seq_show	= rdtgroup_mba_mbps_event_show,
1996 	},
1997 	{
1998 		.name		= "mode",
1999 		.mode		= 0644,
2000 		.kf_ops		= &rdtgroup_kf_single_ops,
2001 		.write		= rdtgroup_mode_write,
2002 		.seq_show	= rdtgroup_mode_show,
2003 		.fflags		= RFTYPE_CTRL_BASE,
2004 	},
2005 	{
2006 		.name		= "size",
2007 		.mode		= 0444,
2008 		.kf_ops		= &rdtgroup_kf_single_ops,
2009 		.seq_show	= rdtgroup_size_show,
2010 		.fflags		= RFTYPE_CTRL_BASE,
2011 	},
2012 	{
2013 		.name		= "sparse_masks",
2014 		.mode		= 0444,
2015 		.kf_ops		= &rdtgroup_kf_single_ops,
2016 		.seq_show	= rdt_has_sparse_bitmasks_show,
2017 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
2018 	},
2019 	{
2020 		.name		= "ctrl_hw_id",
2021 		.mode		= 0444,
2022 		.kf_ops		= &rdtgroup_kf_single_ops,
2023 		.seq_show	= rdtgroup_closid_show,
2024 		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
2025 	},
2026 
2027 };
2028 
2029 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
2030 {
2031 	struct rftype *rfts, *rft;
2032 	int ret, len;
2033 
2034 	rfts = res_common_files;
2035 	len = ARRAY_SIZE(res_common_files);
2036 
2037 	lockdep_assert_held(&rdtgroup_mutex);
2038 
2039 	if (resctrl_debug)
2040 		fflags |= RFTYPE_DEBUG;
2041 
2042 	for (rft = rfts; rft < rfts + len; rft++) {
2043 		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
2044 			ret = rdtgroup_add_file(kn, rft);
2045 			if (ret)
2046 				goto error;
2047 		}
2048 	}
2049 
2050 	return 0;
2051 error:
2052 	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
2053 	while (--rft >= rfts) {
2054 		if ((fflags & rft->fflags) == rft->fflags)
2055 			kernfs_remove_by_name(kn, rft->name);
2056 	}
2057 	return ret;
2058 }
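
/*
 * Illustration of the "fflags" gating done by rdtgroup_add_files() above
 * (values assumed for the sake of the example): when the L3 info directory
 * is created, fflags is RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, so entries such
 * as "cbm_mask" (RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE) are added while
 * "min_bandwidth" (RFTYPE_CTRL_INFO | RFTYPE_RES_MB) is skipped. Files with
 * RFTYPE_DEBUG in their fflags only appear when mounted with "-o debug".
 */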
2059 
2060 static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
2061 {
2062 	struct rftype *rfts, *rft;
2063 	int len;
2064 
2065 	rfts = res_common_files;
2066 	len = ARRAY_SIZE(res_common_files);
2067 
2068 	for (rft = rfts; rft < rfts + len; rft++) {
2069 		if (!strcmp(rft->name, name))
2070 			return rft;
2071 	}
2072 
2073 	return NULL;
2074 }
2075 
2076 static void thread_throttle_mode_init(void)
2077 {
2078 	enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
2079 	struct rdt_resource *r_mba, *r_smba;
2080 
2081 	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2082 	if (r_mba->alloc_capable &&
2083 	    r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2084 		throttle_mode = r_mba->membw.throttle_mode;
2085 
2086 	r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
2087 	if (r_smba->alloc_capable &&
2088 	    r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2089 		throttle_mode = r_smba->membw.throttle_mode;
2090 
2091 	if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
2092 		return;
2093 
2094 	resctrl_file_fflags_init("thread_throttle_mode",
2095 				 RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
2096 }
2097 
2098 void resctrl_file_fflags_init(const char *config, unsigned long fflags)
2099 {
2100 	struct rftype *rft;
2101 
2102 	rft = rdtgroup_get_rftype_by_name(config);
2103 	if (rft)
2104 		rft->fflags = fflags;
2105 }
2106 
2107 /**
2108  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
2109  * @r: The resource group with which the file is associated.
2110  * @name: Name of the file
2111  *
2112  * The permissions of the named resctrl file, directory, or link are
2113  * modified to disallow read, write, or execute by any user.
2114  *
2115  * WARNING: This function is intended to communicate to the user that the
2116  * resctrl file has been locked down - that it is not relevant to the
2117  * particular state the system finds itself in. It should not be relied
2118  * on to protect from user access because after the file's permissions
2119  * are restricted the user can still change the permissions using chmod
2120  * from the command line.
2121  *
2122  * Return: 0 on success, <0 on failure.
2123  */
2124 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
2125 {
2126 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2127 	struct kernfs_node *kn;
2128 	int ret = 0;
2129 
2130 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2131 	if (!kn)
2132 		return -ENOENT;
2133 
2134 	switch (kernfs_type(kn)) {
2135 	case KERNFS_DIR:
2136 		iattr.ia_mode = S_IFDIR;
2137 		break;
2138 	case KERNFS_FILE:
2139 		iattr.ia_mode = S_IFREG;
2140 		break;
2141 	case KERNFS_LINK:
2142 		iattr.ia_mode = S_IFLNK;
2143 		break;
2144 	}
2145 
2146 	ret = kernfs_setattr(kn, &iattr);
2147 	kernfs_put(kn);
2148 	return ret;
2149 }
2150 
2151 /**
2152  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
2153  * @r: The resource group with which the file is associated.
2154  * @name: Name of the file
2155  * @mask: Mask of permissions that should be restored
2156  *
2157  * Restore the permissions of the named file. If @name is a directory the
2158  * permissions of its parent will be used.
2159  *
2160  * Return: 0 on success, <0 on failure.
2161  */
2162 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
2163 			     umode_t mask)
2164 {
2165 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2166 	struct kernfs_node *kn, *parent;
2167 	struct rftype *rfts, *rft;
2168 	int ret, len;
2169 
2170 	rfts = res_common_files;
2171 	len = ARRAY_SIZE(res_common_files);
2172 
2173 	for (rft = rfts; rft < rfts + len; rft++) {
2174 		if (!strcmp(rft->name, name))
2175 			iattr.ia_mode = rft->mode & mask;
2176 	}
2177 
2178 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2179 	if (!kn)
2180 		return -ENOENT;
2181 
2182 	switch (kernfs_type(kn)) {
2183 	case KERNFS_DIR:
2184 		parent = kernfs_get_parent(kn);
2185 		if (parent) {
2186 			iattr.ia_mode |= parent->mode;
2187 			kernfs_put(parent);
2188 		}
2189 		iattr.ia_mode |= S_IFDIR;
2190 		break;
2191 	case KERNFS_FILE:
2192 		iattr.ia_mode |= S_IFREG;
2193 		break;
2194 	case KERNFS_LINK:
2195 		iattr.ia_mode |= S_IFLNK;
2196 		break;
2197 	}
2198 
2199 	ret = kernfs_setattr(kn, &iattr);
2200 	kernfs_put(kn);
2201 	return ret;
2202 }
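
/*
 * Typical pairing of the two helpers above (a sketch, not a verbatim copy
 * of any caller): a feature such as pseudo-locking setup may temporarily
 * lock down a file and later restore its default mode:
 *
 *	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
 *	...
 *	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
 *
 * The restore path looks up the name in res_common_files[] so the restored
 * mode never exceeds the permissions declared in the table.
 */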
2203 
2204 static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
2205 				      unsigned long fflags)
2206 {
2207 	struct kernfs_node *kn_subdir;
2208 	int ret;
2209 
2210 	kn_subdir = kernfs_create_dir(kn_info, name,
2211 				      kn_info->mode, priv);
2212 	if (IS_ERR(kn_subdir))
2213 		return PTR_ERR(kn_subdir);
2214 
2215 	ret = rdtgroup_kn_set_ugid(kn_subdir);
2216 	if (ret)
2217 		return ret;
2218 
2219 	ret = rdtgroup_add_files(kn_subdir, fflags);
2220 	if (!ret)
2221 		kernfs_activate(kn_subdir);
2222 
2223 	return ret;
2224 }
2225 
2226 static unsigned long fflags_from_resource(struct rdt_resource *r)
2227 {
2228 	switch (r->rid) {
2229 	case RDT_RESOURCE_L3:
2230 	case RDT_RESOURCE_L2:
2231 		return RFTYPE_RES_CACHE;
2232 	case RDT_RESOURCE_MBA:
2233 	case RDT_RESOURCE_SMBA:
2234 		return RFTYPE_RES_MB;
2235 	}
2236 
2237 	return WARN_ON_ONCE(1);
2238 }
2239 
2240 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
2241 {
2242 	struct resctrl_schema *s;
2243 	struct rdt_resource *r;
2244 	unsigned long fflags;
2245 	char name[32];
2246 	int ret;
2247 
2248 	/* create the directory */
2249 	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
2250 	if (IS_ERR(kn_info))
2251 		return PTR_ERR(kn_info);
2252 
2253 	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
2254 	if (ret)
2255 		goto out_destroy;
2256 
2257 	/* loop over enabled controls, these are all alloc_capable */
2258 	list_for_each_entry(s, &resctrl_schema_all, list) {
2259 		r = s->res;
2260 		fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
2261 		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
2262 		if (ret)
2263 			goto out_destroy;
2264 	}
2265 
2266 	for_each_mon_capable_rdt_resource(r) {
2267 		fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
2268 		sprintf(name, "%s_MON", r->name);
2269 		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
2270 		if (ret)
2271 			goto out_destroy;
2272 	}
2273 
2274 	ret = rdtgroup_kn_set_ugid(kn_info);
2275 	if (ret)
2276 		goto out_destroy;
2277 
2278 	kernfs_activate(kn_info);
2279 
2280 	return 0;
2281 
2282 out_destroy:
2283 	kernfs_remove(kn_info);
2284 	return ret;
2285 }
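
/*
 * Resulting layout (illustrative, depends on the resources present): on a
 * system with L3 allocation and monitoring plus MB allocation, the function
 * above produces:
 *
 *	/sys/fs/resctrl/info/last_cmd_status
 *	/sys/fs/resctrl/info/L3/	(L3CODE/ and L3DATA/ when CDP is on)
 *	/sys/fs/resctrl/info/L3_MON/
 *	/sys/fs/resctrl/info/MB/
 */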
2286 
2287 static int
2288 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
2289 		    char *name, struct kernfs_node **dest_kn)
2290 {
2291 	struct kernfs_node *kn;
2292 	int ret;
2293 
2294 	/* create the directory */
2295 	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2296 	if (IS_ERR(kn))
2297 		return PTR_ERR(kn);
2298 
2299 	if (dest_kn)
2300 		*dest_kn = kn;
2301 
2302 	ret = rdtgroup_kn_set_ugid(kn);
2303 	if (ret)
2304 		goto out_destroy;
2305 
2306 	kernfs_activate(kn);
2307 
2308 	return 0;
2309 
2310 out_destroy:
2311 	kernfs_remove(kn);
2312 	return ret;
2313 }
2314 
2315 static void l3_qos_cfg_update(void *arg)
2316 {
2317 	bool *enable = arg;
2318 
2319 	wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
2320 }
2321 
2322 static void l2_qos_cfg_update(void *arg)
2323 {
2324 	bool *enable = arg;
2325 
2326 	wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
2327 }
2328 
2329 static inline bool is_mba_linear(void)
2330 {
2331 	return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
2332 }
2333 
2334 static int set_cache_qos_cfg(int level, bool enable)
2335 {
2336 	void (*update)(void *arg);
2337 	struct rdt_ctrl_domain *d;
2338 	struct rdt_resource *r_l;
2339 	cpumask_var_t cpu_mask;
2340 	int cpu;
2341 
2342 	/* Walking r->domains, ensure it can't race with cpuhp */
2343 	lockdep_assert_cpus_held();
2344 
2345 	if (level == RDT_RESOURCE_L3)
2346 		update = l3_qos_cfg_update;
2347 	else if (level == RDT_RESOURCE_L2)
2348 		update = l2_qos_cfg_update;
2349 	else
2350 		return -EINVAL;
2351 
2352 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2353 		return -ENOMEM;
2354 
2355 	r_l = &rdt_resources_all[level].r_resctrl;
2356 	list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
2357 		if (r_l->cache.arch_has_per_cpu_cfg)
2358 			/* Pick all the CPUs in the domain instance */
2359 			for_each_cpu(cpu, &d->hdr.cpu_mask)
2360 				cpumask_set_cpu(cpu, cpu_mask);
2361 		else
2362 			/* Pick one CPU from each domain instance to update MSR */
2363 			cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
2364 	}
2365 
2366 	/* Update QOS_CFG MSR on all the CPUs in cpu_mask */
2367 	on_each_cpu_mask(cpu_mask, update, &enable, 1);
2368 
2369 	free_cpumask_var(cpu_mask);
2370 
2371 	return 0;
2372 }
2373 
2374 /* Restore the qos cfg state when a domain comes online */
2375 void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
2376 {
2377 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
2378 
2379 	if (!r->cdp_capable)
2380 		return;
2381 
2382 	if (r->rid == RDT_RESOURCE_L2)
2383 		l2_qos_cfg_update(&hw_res->cdp_enabled);
2384 
2385 	if (r->rid == RDT_RESOURCE_L3)
2386 		l3_qos_cfg_update(&hw_res->cdp_enabled);
2387 }
2388 
2389 static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
2390 {
2391 	u32 num_closid = resctrl_arch_get_num_closid(r);
2392 	int cpu = cpumask_any(&d->hdr.cpu_mask);
2393 	int i;
2394 
2395 	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
2396 				   GFP_KERNEL, cpu_to_node(cpu));
2397 	if (!d->mbps_val)
2398 		return -ENOMEM;
2399 
2400 	for (i = 0; i < num_closid; i++)
2401 		d->mbps_val[i] = MBA_MAX_MBPS;
2402 
2403 	return 0;
2404 }
2405 
2406 static void mba_sc_domain_destroy(struct rdt_resource *r,
2407 				  struct rdt_ctrl_domain *d)
2408 {
2409 	kfree(d->mbps_val);
2410 	d->mbps_val = NULL;
2411 }
2412 
2413 /*
2414  * The MBA software controller is supported only if MBM is supported,
2415  * MBA is in linear scale, and the MBM monitor scope is the same as
2416  * the MBA control scope.
2418  */
2419 static bool supports_mba_mbps(void)
2420 {
2421 	struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2422 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2423 
2424 	return (resctrl_is_mbm_enabled() &&
2425 		r->alloc_capable && is_mba_linear() &&
2426 		r->ctrl_scope == rmbm->mon_scope);
2427 }
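
/*
 * When supports_mba_mbps() holds, the software controller is requested at
 * mount time (illustrative command, /sys/fs/resctrl is the conventional
 * mount point):
 *
 *	# mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
 */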
2428 
2429 /*
2430  * Enable or disable the MBA software controller
2431  * which helps user specify bandwidth in MBps.
2432  */
2433 static int set_mba_sc(bool mba_sc)
2434 {
2435 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2436 	u32 num_closid = resctrl_arch_get_num_closid(r);
2437 	struct rdt_ctrl_domain *d;
2438 	unsigned long fflags;
2439 	int i;
2440 
2441 	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
2442 		return -EINVAL;
2443 
2444 	r->membw.mba_sc = mba_sc;
2445 
2446 	rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
2447 
2448 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2449 		for (i = 0; i < num_closid; i++)
2450 			d->mbps_val[i] = MBA_MAX_MBPS;
2451 	}
2452 
2453 	fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
2454 	resctrl_file_fflags_init("mba_MBps_event", fflags);
2455 
2456 	return 0;
2457 }
2458 
2459 static int cdp_enable(int level)
2460 {
2461 	struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
2462 	int ret;
2463 
2464 	if (!r_l->alloc_capable)
2465 		return -EINVAL;
2466 
2467 	ret = set_cache_qos_cfg(level, true);
2468 	if (!ret)
2469 		rdt_resources_all[level].cdp_enabled = true;
2470 
2471 	return ret;
2472 }
2473 
2474 static void cdp_disable(int level)
2475 {
2476 	struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
2477 
2478 	if (r_hw->cdp_enabled) {
2479 		set_cache_qos_cfg(level, false);
2480 		r_hw->cdp_enabled = false;
2481 	}
2482 }
2483 
2484 int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
2485 {
2486 	struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
2487 
2488 	if (!hw_res->r_resctrl.cdp_capable)
2489 		return -EINVAL;
2490 
2491 	if (enable)
2492 		return cdp_enable(l);
2493 
2494 	cdp_disable(l);
2495 
2496 	return 0;
2497 }
2498 
2499 /*
2500  * We don't allow rdtgroup directories to be created anywhere
2501  * except the root directory. Thus when looking for the rdtgroup
2502  * structure for a kernfs node we are either looking at a directory,
2503  * in which case the rdtgroup structure is pointed at by the "priv"
2504  * field, or at a file, in which case we need only look to the parent
2505  * to find the rdtgroup.
2506  */
2507 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2508 {
2509 	if (kernfs_type(kn) == KERNFS_DIR) {
2510 		/*
2511 		 * All the resource directories use "kn->priv"
2512 		 * to point to the "struct rdtgroup" for the
2513 		 * resource. "info" and its subdirectories don't
2514 		 * have rdtgroup structures, so return NULL here.
2515 		 */
2516 		if (kn == kn_info || kn->parent == kn_info)
2517 			return NULL;
2518 		else
2519 			return kn->priv;
2520 	} else {
2521 		return kn->parent->priv;
2522 	}
2523 }
2524 
2525 static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2526 {
2527 	atomic_inc(&rdtgrp->waitcount);
2528 	kernfs_break_active_protection(kn);
2529 }
2530 
2531 static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2532 {
2533 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2534 	    (rdtgrp->flags & RDT_DELETED)) {
2535 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2536 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2537 			rdtgroup_pseudo_lock_remove(rdtgrp);
2538 		kernfs_unbreak_active_protection(kn);
2539 		rdtgroup_remove(rdtgrp);
2540 	} else {
2541 		kernfs_unbreak_active_protection(kn);
2542 	}
2543 }
2544 
2545 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2546 {
2547 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2548 
2549 	if (!rdtgrp)
2550 		return NULL;
2551 
2552 	rdtgroup_kn_get(rdtgrp, kn);
2553 
2554 	cpus_read_lock();
2555 	mutex_lock(&rdtgroup_mutex);
2556 
2557 	/* Was this group deleted while we waited? */
2558 	if (rdtgrp->flags & RDT_DELETED)
2559 		return NULL;
2560 
2561 	return rdtgrp;
2562 }
2563 
2564 void rdtgroup_kn_unlock(struct kernfs_node *kn)
2565 {
2566 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2567 
2568 	if (!rdtgrp)
2569 		return;
2570 
2571 	mutex_unlock(&rdtgroup_mutex);
2572 	cpus_read_unlock();
2573 
2574 	rdtgroup_kn_put(rdtgrp, kn);
2575 }
2576 
2577 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2578 			     struct rdtgroup *prgrp,
2579 			     struct kernfs_node **mon_data_kn);
2580 
2581 static void rdt_disable_ctx(void)
2582 {
2583 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2584 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2585 	set_mba_sc(false);
2586 
2587 	resctrl_debug = false;
2588 }
2589 
2590 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2591 {
2592 	int ret = 0;
2593 
2594 	if (ctx->enable_cdpl2) {
2595 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
2596 		if (ret)
2597 			goto out_done;
2598 	}
2599 
2600 	if (ctx->enable_cdpl3) {
2601 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
2602 		if (ret)
2603 			goto out_cdpl2;
2604 	}
2605 
2606 	if (ctx->enable_mba_mbps) {
2607 		ret = set_mba_sc(true);
2608 		if (ret)
2609 			goto out_cdpl3;
2610 	}
2611 
2612 	if (ctx->enable_debug)
2613 		resctrl_debug = true;
2614 
2615 	return 0;
2616 
2617 out_cdpl3:
2618 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2619 out_cdpl2:
2620 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2621 out_done:
2622 	return ret;
2623 }
2624 
2625 static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
2626 {
2627 	struct resctrl_schema *s;
2628 	const char *suffix = "";
2629 	int ret, cl;
2630 
2631 	s = kzalloc(sizeof(*s), GFP_KERNEL);
2632 	if (!s)
2633 		return -ENOMEM;
2634 
2635 	s->res = r;
2636 	s->num_closid = resctrl_arch_get_num_closid(r);
2637 	if (resctrl_arch_get_cdp_enabled(r->rid))
2638 		s->num_closid /= 2;
2639 
2640 	s->conf_type = type;
2641 	switch (type) {
2642 	case CDP_CODE:
2643 		suffix = "CODE";
2644 		break;
2645 	case CDP_DATA:
2646 		suffix = "DATA";
2647 		break;
2648 	case CDP_NONE:
2649 		suffix = "";
2650 		break;
2651 	}
2652 
2653 	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
2654 	if (ret >= sizeof(s->name)) {
2655 		kfree(s);
2656 		return -EINVAL;
2657 	}
2658 
2659 	cl = strlen(s->name);
2660 
2661 	/*
2662 	 * If CDP is supported by this resource, but not enabled,
2663 	 * reserve space for the suffix. This ensures the tabular format of the
2664 	 * schemata file does not change between mounts of the filesystem.
2665 	 */
2666 	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
2667 		cl += 4;
2668 
2669 	if (cl > max_name_width)
2670 		max_name_width = cl;
2671 
2672 	switch (r->schema_fmt) {
2673 	case RESCTRL_SCHEMA_BITMAP:
2674 		s->fmt_str = "%d=%x";
2675 		break;
2676 	case RESCTRL_SCHEMA_RANGE:
2677 		s->fmt_str = "%d=%u";
2678 		break;
2679 	}
2680 
2681 	if (WARN_ON_ONCE(!s->fmt_str)) {
2682 		kfree(s);
2683 		return -EINVAL;
2684 	}
2685 
2686 	INIT_LIST_HEAD(&s->list);
2687 	list_add(&s->list, &resctrl_schema_all);
2688 
2689 	return 0;
2690 }
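
/*
 * Example of the resulting schemata lines (values are illustrative): a
 * bitmap-format cache schema created here prints as "L3:0=ffff;1=ffff"
 * (or "L3CODE:"/"L3DATA:" rows when CDP is enabled), while a range-format
 * schema such as MB prints as "MB:0=100;1=100", matching the "%d=%x" and
 * "%d=%u" fmt_str choices above.
 */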
2691 
2692 static int schemata_list_create(void)
2693 {
2694 	struct rdt_resource *r;
2695 	int ret = 0;
2696 
2697 	for_each_alloc_capable_rdt_resource(r) {
2698 		if (resctrl_arch_get_cdp_enabled(r->rid)) {
2699 			ret = schemata_list_add(r, CDP_CODE);
2700 			if (ret)
2701 				break;
2702 
2703 			ret = schemata_list_add(r, CDP_DATA);
2704 		} else {
2705 			ret = schemata_list_add(r, CDP_NONE);
2706 		}
2707 
2708 		if (ret)
2709 			break;
2710 	}
2711 
2712 	return ret;
2713 }
2714 
2715 static void schemata_list_destroy(void)
2716 {
2717 	struct resctrl_schema *s, *tmp;
2718 
2719 	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
2720 		list_del(&s->list);
2721 		kfree(s);
2722 	}
2723 }
2724 
2725 static int rdt_get_tree(struct fs_context *fc)
2726 {
2727 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2728 	unsigned long flags = RFTYPE_CTRL_BASE;
2729 	struct rdt_mon_domain *dom;
2730 	struct rdt_resource *r;
2731 	int ret;
2732 
2733 	cpus_read_lock();
2734 	mutex_lock(&rdtgroup_mutex);
2735 	/*
2736 	 * resctrl file system can only be mounted once.
2737 	 */
2738 	if (resctrl_mounted) {
2739 		ret = -EBUSY;
2740 		goto out;
2741 	}
2742 
2743 	ret = rdtgroup_setup_root(ctx);
2744 	if (ret)
2745 		goto out;
2746 
2747 	ret = rdt_enable_ctx(ctx);
2748 	if (ret)
2749 		goto out_root;
2750 
2751 	ret = schemata_list_create();
2752 	if (ret) {
2753 		schemata_list_destroy();
2754 		goto out_ctx;
2755 	}
2756 
2757 	closid_init();
2758 
2759 	if (resctrl_arch_mon_capable())
2760 		flags |= RFTYPE_MON;
2761 
2762 	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
2763 	if (ret)
2764 		goto out_schemata_free;
2765 
2766 	kernfs_activate(rdtgroup_default.kn);
2767 
2768 	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2769 	if (ret < 0)
2770 		goto out_schemata_free;
2771 
2772 	if (resctrl_arch_mon_capable()) {
2773 		ret = mongroup_create_dir(rdtgroup_default.kn,
2774 					  &rdtgroup_default, "mon_groups",
2775 					  &kn_mongrp);
2776 		if (ret < 0)
2777 			goto out_info;
2778 
2779 		ret = mkdir_mondata_all(rdtgroup_default.kn,
2780 					&rdtgroup_default, &kn_mondata);
2781 		if (ret < 0)
2782 			goto out_mongrp;
2783 		rdtgroup_default.mon.mon_data_kn = kn_mondata;
2784 	}
2785 
2786 	ret = rdt_pseudo_lock_init();
2787 	if (ret)
2788 		goto out_mondata;
2789 
2790 	ret = kernfs_get_tree(fc);
2791 	if (ret < 0)
2792 		goto out_psl;
2793 
2794 	if (resctrl_arch_alloc_capable())
2795 		resctrl_arch_enable_alloc();
2796 	if (resctrl_arch_mon_capable())
2797 		resctrl_arch_enable_mon();
2798 
2799 	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
2800 		resctrl_mounted = true;
2801 
2802 	if (resctrl_is_mbm_enabled()) {
2803 		r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2804 		list_for_each_entry(dom, &r->mon_domains, hdr.list)
2805 			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
2806 						   RESCTRL_PICK_ANY_CPU);
2807 	}
2808 
2809 	goto out;
2810 
2811 out_psl:
2812 	rdt_pseudo_lock_release();
2813 out_mondata:
2814 	if (resctrl_arch_mon_capable())
2815 		kernfs_remove(kn_mondata);
2816 out_mongrp:
2817 	if (resctrl_arch_mon_capable())
2818 		kernfs_remove(kn_mongrp);
2819 out_info:
2820 	kernfs_remove(kn_info);
2821 out_schemata_free:
2822 	schemata_list_destroy();
2823 out_ctx:
2824 	rdt_disable_ctx();
2825 out_root:
2826 	rdtgroup_destroy_root();
2827 out:
2828 	rdt_last_cmd_clear();
2829 	mutex_unlock(&rdtgroup_mutex);
2830 	cpus_read_unlock();
2831 	return ret;
2832 }
2833 
2834 enum rdt_param {
2835 	Opt_cdp,
2836 	Opt_cdpl2,
2837 	Opt_mba_mbps,
2838 	Opt_debug,
2839 	nr__rdt_params
2840 };
2841 
2842 static const struct fs_parameter_spec rdt_fs_parameters[] = {
2843 	fsparam_flag("cdp",		Opt_cdp),
2844 	fsparam_flag("cdpl2",		Opt_cdpl2),
2845 	fsparam_flag("mba_MBps",	Opt_mba_mbps),
2846 	fsparam_flag("debug",		Opt_debug),
2847 	{}
2848 };
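
/*
 * All of the options above are plain flags, e.g. (illustrative command):
 *
 *	# mount -t resctrl resctrl -o cdp,mba_MBps,debug /sys/fs/resctrl
 *
 * "cdp"/"cdpl2" split L3/L2 allocation into code and data masks, "mba_MBps"
 * enables the MBA software controller and "debug" exposes the RFTYPE_DEBUG
 * files.
 */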
2849 
2850 static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2851 {
2852 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2853 	struct fs_parse_result result;
2854 	const char *msg;
2855 	int opt;
2856 
2857 	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2858 	if (opt < 0)
2859 		return opt;
2860 
2861 	switch (opt) {
2862 	case Opt_cdp:
2863 		ctx->enable_cdpl3 = true;
2864 		return 0;
2865 	case Opt_cdpl2:
2866 		ctx->enable_cdpl2 = true;
2867 		return 0;
2868 	case Opt_mba_mbps:
2869 		msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
2870 		if (!supports_mba_mbps())
2871 			return invalfc(fc, msg);
2872 		ctx->enable_mba_mbps = true;
2873 		return 0;
2874 	case Opt_debug:
2875 		ctx->enable_debug = true;
2876 		return 0;
2877 	}
2878 
2879 	return -EINVAL;
2880 }
2881 
2882 static void rdt_fs_context_free(struct fs_context *fc)
2883 {
2884 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2885 
2886 	kernfs_free_fs_context(fc);
2887 	kfree(ctx);
2888 }
2889 
2890 static const struct fs_context_operations rdt_fs_context_ops = {
2891 	.free		= rdt_fs_context_free,
2892 	.parse_param	= rdt_parse_param,
2893 	.get_tree	= rdt_get_tree,
2894 };
2895 
2896 static int rdt_init_fs_context(struct fs_context *fc)
2897 {
2898 	struct rdt_fs_context *ctx;
2899 
2900 	ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
2901 	if (!ctx)
2902 		return -ENOMEM;
2903 
2904 	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2905 	fc->fs_private = &ctx->kfc;
2906 	fc->ops = &rdt_fs_context_ops;
2907 	put_user_ns(fc->user_ns);
2908 	fc->user_ns = get_user_ns(&init_user_ns);
2909 	fc->global = true;
2910 	return 0;
2911 }
2912 
2913 void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
2914 {
2915 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
2916 	struct rdt_hw_ctrl_domain *hw_dom;
2917 	struct msr_param msr_param;
2918 	struct rdt_ctrl_domain *d;
2919 	int i;
2920 
2921 	/* Walking r->domains, ensure it can't race with cpuhp */
2922 	lockdep_assert_cpus_held();
2923 
2924 	msr_param.res = r;
2925 	msr_param.low = 0;
2926 	msr_param.high = hw_res->num_closid;
2927 
2928 	/*
2929 	 * Disable resource control for this resource by setting all
2930 	 * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
2931 	 * from each domain to update the MSRs below.
2932 	 */
2933 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2934 		hw_dom = resctrl_to_arch_ctrl_dom(d);
2935 
2936 		for (i = 0; i < hw_res->num_closid; i++)
2937 			hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
2938 		msr_param.dom = d;
2939 		smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
2940 	}
2941 
2942 	return;
2943 }
2944 
2945 /*
2946  * Move tasks from one to the other group. If @from is NULL, then all tasks
2947  * in the system are moved unconditionally (used for teardown).
2948  *
2949  * If @mask is not NULL the cpus on which moved tasks are running are set
2950  * in that mask so the update smp function call is restricted to affected
2951  * cpus.
2952  */
2953 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2954 				 struct cpumask *mask)
2955 {
2956 	struct task_struct *p, *t;
2957 
2958 	read_lock(&tasklist_lock);
2959 	for_each_process_thread(p, t) {
2960 		if (!from || is_closid_match(t, from) ||
2961 		    is_rmid_match(t, from)) {
2962 			resctrl_arch_set_closid_rmid(t, to->closid,
2963 						     to->mon.rmid);
2964 
2965 			/*
2966 			 * Order the closid/rmid stores above before the loads
2967 			 * in task_curr(). This pairs with the full barrier
2968 			 * between the rq->curr update and resctrl_sched_in()
2969 			 * during context switch.
2970 			 */
2971 			smp_mb();
2972 
2973 			/*
2974 			 * If the task is on a CPU, set the CPU in the mask.
2975 			 * The detection is inaccurate as tasks might move or
2976 			 * schedule before the smp function call takes place.
2977 			 * In such a case the function call is pointless, but
2978 			 * there is no other side effect.
2979 			 */
2980 			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
2981 				cpumask_set_cpu(task_cpu(t), mask);
2982 		}
2983 	}
2984 	read_unlock(&tasklist_lock);
2985 }
2986 
2987 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2988 {
2989 	struct rdtgroup *sentry, *stmp;
2990 	struct list_head *head;
2991 
2992 	head = &rdtgrp->mon.crdtgrp_list;
2993 	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2994 		free_rmid(sentry->closid, sentry->mon.rmid);
2995 		list_del(&sentry->mon.crdtgrp_list);
2996 
2997 		if (atomic_read(&sentry->waitcount) != 0)
2998 			sentry->flags = RDT_DELETED;
2999 		else
3000 			rdtgroup_remove(sentry);
3001 	}
3002 }
3003 
3004 /*
3005  * Forcibly remove all subdirectories under the root directory.
3006  */
3007 static void rmdir_all_sub(void)
3008 {
3009 	struct rdtgroup *rdtgrp, *tmp;
3010 
3011 	/* Move all tasks to the default resource group */
3012 	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
3013 
3014 	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
3015 		/* Free any child rmids */
3016 		free_all_child_rdtgrp(rdtgrp);
3017 
3018 		/* Remove each rdtgroup other than root */
3019 		if (rdtgrp == &rdtgroup_default)
3020 			continue;
3021 
3022 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3023 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
3024 			rdtgroup_pseudo_lock_remove(rdtgrp);
3025 
3026 		/*
3027 		 * Give any CPUs back to the default group. We cannot copy
3028 		 * cpu_online_mask because a CPU might have executed the
3029 		 * offline callback already, but is still marked online.
3030 		 */
3031 		cpumask_or(&rdtgroup_default.cpu_mask,
3032 			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3033 
3034 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3035 
3036 		kernfs_remove(rdtgrp->kn);
3037 		list_del(&rdtgrp->rdtgroup_list);
3038 
3039 		if (atomic_read(&rdtgrp->waitcount) != 0)
3040 			rdtgrp->flags = RDT_DELETED;
3041 		else
3042 			rdtgroup_remove(rdtgrp);
3043 	}
3044 	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
3045 	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
3046 
3047 	kernfs_remove(kn_info);
3048 	kernfs_remove(kn_mongrp);
3049 	kernfs_remove(kn_mondata);
3050 }
3051 
3052 static void rdt_kill_sb(struct super_block *sb)
3053 {
3054 	struct rdt_resource *r;
3055 
3056 	cpus_read_lock();
3057 	mutex_lock(&rdtgroup_mutex);
3058 
3059 	rdt_disable_ctx();
3060 
3061 	/* Put everything back to default values. */
3062 	for_each_alloc_capable_rdt_resource(r)
3063 		resctrl_arch_reset_all_ctrls(r);
3064 
3065 	rmdir_all_sub();
3066 	rdt_pseudo_lock_release();
3067 	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
3068 	schemata_list_destroy();
3069 	rdtgroup_destroy_root();
3070 	if (resctrl_arch_alloc_capable())
3071 		resctrl_arch_disable_alloc();
3072 	if (resctrl_arch_mon_capable())
3073 		resctrl_arch_disable_mon();
3074 	resctrl_mounted = false;
3075 	kernfs_kill_sb(sb);
3076 	mutex_unlock(&rdtgroup_mutex);
3077 	cpus_read_unlock();
3078 }
3079 
3080 static struct file_system_type rdt_fs_type = {
3081 	.name			= "resctrl",
3082 	.init_fs_context	= rdt_init_fs_context,
3083 	.parameters		= rdt_fs_parameters,
3084 	.kill_sb		= rdt_kill_sb,
3085 };
3086 
3087 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
3088 		       void *priv)
3089 {
3090 	struct kernfs_node *kn;
3091 	int ret = 0;
3092 
3093 	kn = __kernfs_create_file(parent_kn, name, 0444,
3094 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
3095 				  &kf_mondata_ops, priv, NULL, NULL);
3096 	if (IS_ERR(kn))
3097 		return PTR_ERR(kn);
3098 
3099 	ret = rdtgroup_kn_set_ugid(kn);
3100 	if (ret) {
3101 		kernfs_remove(kn);
3102 		return ret;
3103 	}
3104 
3105 	return ret;
3106 }
3107 
3108 static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
3109 {
3110 	struct kernfs_node *kn;
3111 
3112 	kn = kernfs_find_and_get(pkn, name);
3113 	if (!kn)
3114 		return;
3115 	kernfs_put(kn);
3116 
3117 	if (kn->dir.subdirs <= 1)
3118 		kernfs_remove(kn);
3119 	else
3120 		kernfs_remove_by_name(kn, subname);
3121 }
3122 
3123 /*
3124  * Remove all subdirectories of mon_data of ctrl_mon groups
3125  * and monitor groups for the given domain.
3126  * Remove files and directories containing "sum" of domain data
3127  * when the last domain being summed is removed.
3128  */
3129 static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3130 					   struct rdt_mon_domain *d)
3131 {
3132 	struct rdtgroup *prgrp, *crgrp;
3133 	char subname[32];
3134 	bool snc_mode;
3135 	char name[32];
3136 
3137 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3138 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3139 	if (snc_mode)
3140 		sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
3141 
3142 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3143 		mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
3144 
3145 		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
3146 			mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
3147 	}
3148 }
3149 
3150 static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
3151 			     struct rdt_resource *r, struct rdtgroup *prgrp,
3152 			     bool do_sum)
3153 {
3154 	struct rmid_read rr = {0};
3155 	union mon_data_bits priv;
3156 	struct mon_evt *mevt;
3157 	int ret;
3158 
3159 	if (WARN_ON(list_empty(&r->evt_list)))
3160 		return -EPERM;
3161 
3162 	priv.u.rid = r->rid;
3163 	priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
3164 	priv.u.sum = do_sum;
3165 	list_for_each_entry(mevt, &r->evt_list, list) {
3166 		priv.u.evtid = mevt->evtid;
3167 		ret = mon_addfile(kn, mevt->name, priv.priv);
3168 		if (ret)
3169 			return ret;
3170 
3171 		if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
3172 			mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
3173 	}
3174 
3175 	return 0;
3176 }
3177 
3178 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
3179 				struct rdt_mon_domain *d,
3180 				struct rdt_resource *r, struct rdtgroup *prgrp)
3181 {
3182 	struct kernfs_node *kn, *ckn;
3183 	char name[32];
3184 	bool snc_mode;
3185 	int ret = 0;
3186 
3187 	lockdep_assert_held(&rdtgroup_mutex);
3188 
3189 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3190 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3191 	kn = kernfs_find_and_get(parent_kn, name);
3192 	if (kn) {
3193 		/*
3194 		 * rdtgroup_mutex will prevent this directory from being
3195 		 * removed. No need to keep this hold.
3196 		 */
3197 		kernfs_put(kn);
3198 	} else {
3199 		kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
3200 		if (IS_ERR(kn))
3201 			return PTR_ERR(kn);
3202 
3203 		ret = rdtgroup_kn_set_ugid(kn);
3204 		if (ret)
3205 			goto out_destroy;
3206 		ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
3207 		if (ret)
3208 			goto out_destroy;
3209 	}
3210 
3211 	if (snc_mode) {
3212 		sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
3213 		ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
3214 		if (IS_ERR(ckn)) {
3215 			ret = -EINVAL;
3216 			goto out_destroy;
3217 		}
3218 
3219 		ret = rdtgroup_kn_set_ugid(ckn);
3220 		if (ret)
3221 			goto out_destroy;
3222 
3223 		ret = mon_add_all_files(ckn, d, r, prgrp, false);
3224 		if (ret)
3225 			goto out_destroy;
3226 	}
3227 
3228 	kernfs_activate(kn);
3229 	return 0;
3230 
3231 out_destroy:
3232 	kernfs_remove(kn);
3233 	return ret;
3234 }
3235 
3236 /*
3237  * Add all subdirectories of mon_data for "ctrl_mon" groups
3238  * and "monitor" groups with the given domain id.
3239  */
3240 static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3241 					   struct rdt_mon_domain *d)
3242 {
3243 	struct kernfs_node *parent_kn;
3244 	struct rdtgroup *prgrp, *crgrp;
3245 	struct list_head *head;
3246 
3247 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3248 		parent_kn = prgrp->mon.mon_data_kn;
3249 		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
3250 
3251 		head = &prgrp->mon.crdtgrp_list;
3252 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
3253 			parent_kn = crgrp->mon.mon_data_kn;
3254 			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
3255 		}
3256 	}
3257 }
3258 
3259 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
3260 				       struct rdt_resource *r,
3261 				       struct rdtgroup *prgrp)
3262 {
3263 	struct rdt_mon_domain *dom;
3264 	int ret;
3265 
3266 	/* Walking r->domains, ensure it can't race with cpuhp */
3267 	lockdep_assert_cpus_held();
3268 
3269 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
3270 		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
3271 		if (ret)
3272 			return ret;
3273 	}
3274 
3275 	return 0;
3276 }
3277 
3278 /*
3279  * This creates a directory mon_data which contains the monitored data.
3280  *
3281  * mon_data has one directory for each domain which are named
3282  * in the format mon_<domain_name>_<domain_id>. For example, mon_data
3283  * on a system with L3 domains looks as below:
3284  * ./mon_data:
3285  * mon_L3_00
3286  * mon_L3_01
3287  * mon_L3_02
3288  * ...
3289  *
3290  * Each domain directory has one file per event:
3291  * ./mon_L3_00/:
3292  * llc_occupancy
3293  *
3294  */
3295 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
3296 			     struct rdtgroup *prgrp,
3297 			     struct kernfs_node **dest_kn)
3298 {
3299 	struct rdt_resource *r;
3300 	struct kernfs_node *kn;
3301 	int ret;
3302 
3303 	/*
3304 	 * Create the mon_data directory first.
3305 	 */
3306 	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
3307 	if (ret)
3308 		return ret;
3309 
3310 	if (dest_kn)
3311 		*dest_kn = kn;
3312 
3313 	/*
3314 	 * Create the subdirectories for each domain. Note that all events
3315 	 * in a domain like L3 are grouped into a resource whose domain is L3.
3316 	 */
3317 	for_each_mon_capable_rdt_resource(r) {
3318 		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
3319 		if (ret)
3320 			goto out_destroy;
3321 	}
3322 
3323 	return 0;
3324 
3325 out_destroy:
3326 	kernfs_remove(kn);
3327 	return ret;
3328 }
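
/*
 * Reading the files created above returns the current counter value for the
 * chosen event and domain, e.g. (illustrative path):
 *
 *	# cat /sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy
 *
 * With Sub-NUMA Clustering, each mon_L3_XX directory additionally contains
 * mon_sub_L3_YY directories holding the per-node counts that are summed in
 * the parent directory's files.
 */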
3329 
3330 /**
3331  * cbm_ensure_valid - Enforce validity on provided CBM
3332  * @_val:	Candidate CBM
3333  * @r:		RDT resource to which the CBM belongs
3334  *
3335  * The provided CBM represents all cache portions available for use. This
3336  * may be represented by a bitmap that does not consist of contiguous ones
3337  * and thus be an invalid CBM.
3338  * Here the provided CBM is forced to be a valid CBM by only considering
3339  * the first set of contiguous bits as valid and clearing all other bits.
3340  * The intention here is to provide a valid default CBM with which a new
3341  * resource group is initialized. The user can follow this with a
3342  * modification to the CBM if the default does not satisfy the
3343  * requirements.
3344  */
3345 static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
3346 {
3347 	unsigned int cbm_len = r->cache.cbm_len;
3348 	unsigned long first_bit, zero_bit;
3349 	unsigned long val = _val;
3350 
3351 	if (!val)
3352 		return 0;
3353 
3354 	first_bit = find_first_bit(&val, cbm_len);
3355 	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
3356 
3357 	/* Clear any remaining bits to ensure contiguous region */
3358 	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
3359 	return (u32)val;
3360 }
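
/*
 * Worked example for cbm_ensure_valid() (assuming cbm_len = 5): for
 * _val = 0b11011 the first set bit is bit 0 and the first zero above it is
 * bit 2, so bits 2..4 are cleared and the returned CBM is 0b00011, the
 * first contiguous run of set bits.
 */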
3361 
3362 /*
3363  * Initialize cache resources per RDT domain
3364  *
3365  * Set the RDT domain up to start off with all usable allocations. That is,
3366  * all shareable and unused bits. All-zero CBM is invalid.
3367  */
3368 static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
3369 				 u32 closid)
3370 {
3371 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
3372 	enum resctrl_conf_type t = s->conf_type;
3373 	struct resctrl_staged_config *cfg;
3374 	struct rdt_resource *r = s->res;
3375 	u32 used_b = 0, unused_b = 0;
3376 	unsigned long tmp_cbm;
3377 	enum rdtgrp_mode mode;
3378 	u32 peer_ctl, ctrl_val;
3379 	int i;
3380 
3381 	cfg = &d->staged_config[t];
3382 	cfg->have_new_ctrl = false;
3383 	cfg->new_ctrl = r->cache.shareable_bits;
3384 	used_b = r->cache.shareable_bits;
3385 	for (i = 0; i < closids_supported(); i++) {
3386 		if (closid_allocated(i) && i != closid) {
3387 			mode = rdtgroup_mode_by_closid(i);
3388 			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
3389 				/*
3390 				 * ctrl values for locksetup aren't relevant
3391 				 * until the schemata is written, and the mode
3392 				 * becomes RDT_MODE_PSEUDO_LOCKED.
3393 				 */
3394 				continue;
3395 			/*
3396 			 * If CDP is active include peer domain's
3397 			 * usage to ensure there is no overlap
3398 			 * with an exclusive group.
3399 			 */
3400 			if (resctrl_arch_get_cdp_enabled(r->rid))
3401 				peer_ctl = resctrl_arch_get_config(r, d, i,
3402 								   peer_type);
3403 			else
3404 				peer_ctl = 0;
3405 			ctrl_val = resctrl_arch_get_config(r, d, i,
3406 							   s->conf_type);
3407 			used_b |= ctrl_val | peer_ctl;
3408 			if (mode == RDT_MODE_SHAREABLE)
3409 				cfg->new_ctrl |= ctrl_val | peer_ctl;
3410 		}
3411 	}
3412 	if (d->plr && d->plr->cbm > 0)
3413 		used_b |= d->plr->cbm;
3414 	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
3415 	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
3416 	cfg->new_ctrl |= unused_b;
3417 	/*
3418 	 * Force the initial CBM to be valid, user can
3419 	 * modify the CBM based on system availability.
3420 	 */
3421 	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
3422 	/*
3423 	 * Assign the u32 CBM to an unsigned long to ensure that
3424 	 * bitmap_weight() does not access out-of-bound memory.
3425 	 */
3426 	tmp_cbm = cfg->new_ctrl;
3427 	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
3428 		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
3429 		return -ENOSPC;
3430 	}
3431 	cfg->have_new_ctrl = true;
3432 
3433 	return 0;
3434 }
3435 
3436 /*
3437  * Initialize cache resources with default values.
3438  *
3439  * A new RDT group is being created on an allocation capable (CAT)
3440  * supporting system. Set this group up to start off with all usable
3441  * allocations.
3442  *
3443  * If there are no more shareable bits available on any domain then
3444  * the entire allocation will fail.
3445  */
3446 static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
3447 {
3448 	struct rdt_ctrl_domain *d;
3449 	int ret;
3450 
3451 	list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
3452 		ret = __init_one_rdt_domain(d, s, closid);
3453 		if (ret < 0)
3454 			return ret;
3455 	}
3456 
3457 	return 0;
3458 }
3459 
3460 /* Initialize MBA resource with default values. */
3461 static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
3462 {
3463 	struct resctrl_staged_config *cfg;
3464 	struct rdt_ctrl_domain *d;
3465 
3466 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
3467 		if (is_mba_sc(r)) {
3468 			d->mbps_val[closid] = MBA_MAX_MBPS;
3469 			continue;
3470 		}
3471 
3472 		cfg = &d->staged_config[CDP_NONE];
3473 		cfg->new_ctrl = resctrl_get_default_ctrl(r);
3474 		cfg->have_new_ctrl = true;
3475 	}
3476 }
3477 
3478 /* Initialize the RDT group's allocations. */
3479 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
3480 {
3481 	struct resctrl_schema *s;
3482 	struct rdt_resource *r;
3483 	int ret = 0;
3484 
3485 	rdt_staged_configs_clear();
3486 
3487 	list_for_each_entry(s, &resctrl_schema_all, list) {
3488 		r = s->res;
3489 		if (r->rid == RDT_RESOURCE_MBA ||
3490 		    r->rid == RDT_RESOURCE_SMBA) {
3491 			rdtgroup_init_mba(r, rdtgrp->closid);
3492 			if (is_mba_sc(r))
3493 				continue;
3494 		} else {
3495 			ret = rdtgroup_init_cat(s, rdtgrp->closid);
3496 			if (ret < 0)
3497 				goto out;
3498 		}
3499 
3500 		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
3501 		if (ret < 0) {
3502 			rdt_last_cmd_puts("Failed to initialize allocations\n");
3503 			goto out;
3504 		}
3505 
3506 	}
3507 
3508 	rdtgrp->mode = RDT_MODE_SHAREABLE;
3509 
3510 out:
3511 	rdt_staged_configs_clear();
3512 	return ret;
3513 }
3514 
3515 static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
3516 {
3517 	int ret;
3518 
3519 	if (!resctrl_arch_mon_capable())
3520 		return 0;
3521 
3522 	ret = alloc_rmid(rdtgrp->closid);
3523 	if (ret < 0) {
3524 		rdt_last_cmd_puts("Out of RMIDs\n");
3525 		return ret;
3526 	}
3527 	rdtgrp->mon.rmid = ret;
3528 
3529 	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
3530 	if (ret) {
3531 		rdt_last_cmd_puts("kernfs subdir error\n");
3532 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3533 		return ret;
3534 	}
3535 
3536 	return 0;
3537 }
3538 
3539 static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
3540 {
3541 	if (resctrl_arch_mon_capable())
3542 		free_rmid(rgrp->closid, rgrp->mon.rmid);
3543 }
3544 
3545 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
3546 			     const char *name, umode_t mode,
3547 			     enum rdt_group_type rtype, struct rdtgroup **r)
3548 {
3549 	struct rdtgroup *prdtgrp, *rdtgrp;
3550 	unsigned long files = 0;
3551 	struct kernfs_node *kn;
3552 	int ret;
3553 
3554 	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
3555 	if (!prdtgrp) {
3556 		ret = -ENODEV;
3557 		goto out_unlock;
3558 	}
3559 
3560 	if (rtype == RDTMON_GROUP &&
3561 	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3562 	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
3563 		ret = -EINVAL;
3564 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
3565 		goto out_unlock;
3566 	}
3567 
3568 	/* allocate the rdtgroup. */
3569 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
3570 	if (!rdtgrp) {
3571 		ret = -ENOSPC;
3572 		rdt_last_cmd_puts("Kernel out of memory\n");
3573 		goto out_unlock;
3574 	}
3575 	*r = rdtgrp;
3576 	rdtgrp->mon.parent = prdtgrp;
3577 	rdtgrp->type = rtype;
3578 	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
3579 
3580 	/* kernfs creates the directory for rdtgrp */
3581 	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
3582 	if (IS_ERR(kn)) {
3583 		ret = PTR_ERR(kn);
3584 		rdt_last_cmd_puts("kernfs create error\n");
3585 		goto out_free_rgrp;
3586 	}
3587 	rdtgrp->kn = kn;
3588 
3589 	/*
3590 	 * kernfs_remove() will drop the reference count on "kn" which
3591 	 * will free it. But we still need it to stick around for the
3592 	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
3593 	 * which will be dropped by kernfs_put() in rdtgroup_remove().
3594 	 */
3595 	kernfs_get(kn);
3596 
3597 	ret = rdtgroup_kn_set_ugid(kn);
3598 	if (ret) {
3599 		rdt_last_cmd_puts("kernfs perm error\n");
3600 		goto out_destroy;
3601 	}
3602 
3603 	if (rtype == RDTCTRL_GROUP) {
3604 		files = RFTYPE_BASE | RFTYPE_CTRL;
3605 		if (resctrl_arch_mon_capable())
3606 			files |= RFTYPE_MON;
3607 	} else {
3608 		files = RFTYPE_BASE | RFTYPE_MON;
3609 	}
3610 
3611 	ret = rdtgroup_add_files(kn, files);
3612 	if (ret) {
3613 		rdt_last_cmd_puts("kernfs fill error\n");
3614 		goto out_destroy;
3615 	}
3616 
3617 	/*
3618 	 * The caller unlocks the parent_kn upon success.
3619 	 */
3620 	return 0;
3621 
3622 out_destroy:
3623 	kernfs_put(rdtgrp->kn);
3624 	kernfs_remove(rdtgrp->kn);
3625 out_free_rgrp:
3626 	kfree(rdtgrp);
3627 out_unlock:
3628 	rdtgroup_kn_unlock(parent_kn);
3629 	return ret;
3630 }
3631 
3632 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
3633 {
3634 	kernfs_remove(rgrp->kn);
3635 	rdtgroup_remove(rgrp);
3636 }
3637 
3638 /*
3639  * Create a monitor group under the "mon_groups" directory of a control
3640  * and monitor group (ctrl_mon). This is a resource group used to
3641  * monitor a subset of tasks and cpus in its parent ctrl_mon group.
3642  */
3643 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
3644 			      const char *name, umode_t mode)
3645 {
3646 	struct rdtgroup *rdtgrp, *prgrp;
3647 	int ret;
3648 
3649 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
3650 	if (ret)
3651 		return ret;
3652 
3653 	prgrp = rdtgrp->mon.parent;
3654 	rdtgrp->closid = prgrp->closid;
3655 
3656 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3657 	if (ret) {
3658 		mkdir_rdt_prepare_clean(rdtgrp);
3659 		goto out_unlock;
3660 	}
3661 
3662 	kernfs_activate(rdtgrp->kn);
3663 
3664 	/*
3665 	 * Add the rdtgrp to the list of rdtgrps the parent
3666 	 * ctrl_mon group has to track.
3667 	 */
3668 	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
3669 
3670 out_unlock:
3671 	rdtgroup_kn_unlock(parent_kn);
3672 	return ret;
3673 }
3674 
3675 /*
3676  * These are rdtgroups created under the root directory. They can be
3677  * used to allocate and monitor resources.
3678  */
3679 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
3680 				   const char *name, umode_t mode)
3681 {
3682 	struct rdtgroup *rdtgrp;
3683 	struct kernfs_node *kn;
3684 	u32 closid;
3685 	int ret;
3686 
3687 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
3688 	if (ret)
3689 		return ret;
3690 
3691 	kn = rdtgrp->kn;
3692 	ret = closid_alloc();
3693 	if (ret < 0) {
3694 		rdt_last_cmd_puts("Out of CLOSIDs\n");
3695 		goto out_common_fail;
3696 	}
3697 	closid = ret;
3698 	ret = 0;
3699 
3700 	rdtgrp->closid = closid;
3701 
3702 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3703 	if (ret)
3704 		goto out_closid_free;
3705 
3706 	kernfs_activate(rdtgrp->kn);
3707 
3708 	ret = rdtgroup_init_alloc(rdtgrp);
3709 	if (ret < 0)
3710 		goto out_rmid_free;
3711 
3712 	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
3713 
3714 	if (resctrl_arch_mon_capable()) {
3715 		/*
3716 		 * Create an empty mon_groups directory to hold the subset
3717 		 * of tasks and cpus to monitor.
3718 		 */
3719 		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
3720 		if (ret) {
3721 			rdt_last_cmd_puts("kernfs subdir error\n");
3722 			goto out_del_list;
3723 		}
3724 		if (is_mba_sc(NULL))
3725 			rdtgrp->mba_mbps_event = mba_mbps_default_event;
3726 	}
3727 
3728 	goto out_unlock;
3729 
3730 out_del_list:
3731 	list_del(&rdtgrp->rdtgroup_list);
3732 out_rmid_free:
3733 	mkdir_rdt_prepare_rmid_free(rdtgrp);
3734 out_closid_free:
3735 	closid_free(closid);
3736 out_common_fail:
3737 	mkdir_rdt_prepare_clean(rdtgrp);
3738 out_unlock:
3739 	rdtgroup_kn_unlock(parent_kn);
3740 	return ret;
3741 }
3742 
3743  * We allow creating mon groups only within a directory called "mon_groups"
3744  * We allow creating mon groups only with in a directory called "mon_groups"
3745  * which is present in every ctrl_mon group. Check if this is a valid
3746  * "mon_groups" directory.
3747  *
3748  * 1. The directory should be named "mon_groups".
3749  * 2. The mon group itself should "not" be named "mon_groups".
3750  *   This makes sure "mon_groups" directory always has a ctrl_mon group
3751  *   as parent.
3752  */
3753 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3754 {
3755 	return (!strcmp(kn->name, "mon_groups") &&
3756 		strcmp(name, "mon_groups"));
3757 }
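
/*
 * Example of the rule enforced above (paths are illustrative):
 *
 *	# mkdir /sys/fs/resctrl/grp0/mon_groups/m0		<- allowed
 *	# mkdir /sys/fs/resctrl/grp0/mon_groups/mon_groups	<- rejected
 *
 * The second case is refused so that a directory named "mon_groups" can
 * only ever appear directly under a ctrl_mon group.
 */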
3758 
3759 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3760 			  umode_t mode)
3761 {
3762 	/* Do not accept '\n' to avoid unparsable situation. */
3763 	if (strchr(name, '\n'))
3764 		return -EINVAL;
3765 
3766 	/*
3767 	 * If the parent directory is the root directory and RDT
3768 	 * allocation is supported, add a control and monitoring
3769 	 * subdirectory
3770 	 */
3771 	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
3772 		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3773 
3774 	/*
3775 	 * If RDT monitoring is supported and the parent directory is a valid
3776 	 * "mon_groups" directory, add a monitoring subdirectory.
3777 	 */
3778 	if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
3779 		return rdtgroup_mkdir_mon(parent_kn, name, mode);
3780 
3781 	return -EPERM;
3782 }
3783 
3784 static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3785 {
3786 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3787 	u32 closid, rmid;
3788 	int cpu;
3789 
3790 	/* Give any tasks back to the parent group */
3791 	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3792 
3793 	/*
3794 	 * Update per cpu closid/rmid of the moved CPUs first.
3795 	 * Note: the closid will not change, but the arch code still needs it.
3796 	 */
3797 	closid = prdtgrp->closid;
3798 	rmid = prdtgrp->mon.rmid;
3799 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3800 		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3801 
3802 	/*
3803 	 * Update the MSR on moved CPUs and CPUs which have moved
3804 	 * task running on them.
3805 	 */
3806 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3807 	update_closid_rmid(tmpmask, NULL);
3808 
3809 	rdtgrp->flags = RDT_DELETED;
3810 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3811 
3812 	/*
3813 	 * Remove the rdtgrp from the parent ctrl_mon group's list
3814 	 */
3815 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3816 	list_del(&rdtgrp->mon.crdtgrp_list);
3817 
3818 	kernfs_remove(rdtgrp->kn);
3819 
3820 	return 0;
3821 }
3822 
3823 static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
3824 {
3825 	rdtgrp->flags = RDT_DELETED;
3826 	list_del(&rdtgrp->rdtgroup_list);
3827 
3828 	kernfs_remove(rdtgrp->kn);
3829 	return 0;
3830 }
3831 
3832 static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3833 {
3834 	u32 closid, rmid;
3835 	int cpu;
3836 
3837 	/* Give any tasks back to the default group */
3838 	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3839 
3840 	/* Give any CPUs back to the default group */
3841 	cpumask_or(&rdtgroup_default.cpu_mask,
3842 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3843 
3844 	/* Update per cpu closid and rmid of the moved CPUs first */
3845 	closid = rdtgroup_default.closid;
3846 	rmid = rdtgroup_default.mon.rmid;
3847 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3848 		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3849 
3850 	/*
3851 	 * Update the MSR on moved CPUs and CPUs which have moved
3852 	 * task running on them.
3853 	 */
3854 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3855 	update_closid_rmid(tmpmask, NULL);
3856 
3857 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3858 	closid_free(rdtgrp->closid);
3859 
3860 	rdtgroup_ctrl_remove(rdtgrp);
3861 
3862 	/*
3863 	 * Free all the child monitor group rmids.
3864 	 */
3865 	free_all_child_rdtgrp(rdtgrp);
3866 
3867 	return 0;
3868 }
3869 
3870 static int rdtgroup_rmdir(struct kernfs_node *kn)
3871 {
3872 	struct kernfs_node *parent_kn = kn->parent;
3873 	struct rdtgroup *rdtgrp;
3874 	cpumask_var_t tmpmask;
3875 	int ret = 0;
3876 
3877 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
3878 		return -ENOMEM;
3879 
3880 	rdtgrp = rdtgroup_kn_lock_live(kn);
3881 	if (!rdtgrp) {
3882 		ret = -EPERM;
3883 		goto out;
3884 	}
3885 
3886 	/*
3887 	 * If the rdtgroup is a ctrl_mon group and parent directory
3888 	 * is the root directory, remove the ctrl_mon group.
3889 	 *
3890 	 * If the rdtgroup is a mon group and parent directory
3891 	 * is a valid "mon_groups" directory, remove the mon group.
3892 	 */
3893 	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
3894 	    rdtgrp != &rdtgroup_default) {
3895 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3896 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
3897 			ret = rdtgroup_ctrl_remove(rdtgrp);
3898 		} else {
3899 			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
3900 		}
3901 	} else if (rdtgrp->type == RDTMON_GROUP &&
3902 		 is_mon_groups(parent_kn, kn->name)) {
3903 		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
3904 	} else {
3905 		ret = -EPERM;
3906 	}
3907 
3908 out:
3909 	rdtgroup_kn_unlock(kn);
3910 	free_cpumask_var(tmpmask);
3911 	return ret;
3912 }
3913 
3914 /**
3915  * mongrp_reparent() - replace parent CTRL_MON group of a MON group
3916  * @rdtgrp:		the MON group whose parent should be replaced
3917  * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
3918  * @cpus:		cpumask provided by the caller for use during this call
3919  *
3920  * Replaces the parent CTRL_MON group for a MON group, resulting in all member
3921  * tasks' CLOSID immediately changing to that of the new parent group.
3922  * Monitoring data for the group is unaffected by this operation.
3923  */
3924 static void mongrp_reparent(struct rdtgroup *rdtgrp,
3925 			    struct rdtgroup *new_prdtgrp,
3926 			    cpumask_var_t cpus)
3927 {
3928 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3929 
3930 	WARN_ON(rdtgrp->type != RDTMON_GROUP);
3931 	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
3932 
3933 	/* Nothing to do when simply renaming a MON group. */
3934 	if (prdtgrp == new_prdtgrp)
3935 		return;
3936 
3937 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3938 	list_move_tail(&rdtgrp->mon.crdtgrp_list,
3939 		       &new_prdtgrp->mon.crdtgrp_list);
3940 
3941 	rdtgrp->mon.parent = new_prdtgrp;
3942 	rdtgrp->closid = new_prdtgrp->closid;
3943 
3944 	/* Propagate updated closid to all tasks in this group. */
3945 	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
3946 
3947 	update_closid_rmid(cpus, NULL);
3948 }
3949 
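/*
 * kernfs .rename callback. Only renaming a MON group within its
 * "mon_groups" directory, or moving it to the "mon_groups" directory of
 * another CTRL_MON group, is supported.
 */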
3950 static int rdtgroup_rename(struct kernfs_node *kn,
3951 			   struct kernfs_node *new_parent, const char *new_name)
3952 {
3953 	struct rdtgroup *new_prdtgrp;
3954 	struct rdtgroup *rdtgrp;
3955 	cpumask_var_t tmpmask;
3956 	int ret;
3957 
3958 	rdtgrp = kernfs_to_rdtgroup(kn);
3959 	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
3960 	if (!rdtgrp || !new_prdtgrp)
3961 		return -ENOENT;
3962 
3963 	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
3964 	rdtgroup_kn_get(rdtgrp, kn);
3965 	rdtgroup_kn_get(new_prdtgrp, new_parent);
3966 
3967 	mutex_lock(&rdtgroup_mutex);
3968 
3969 	rdt_last_cmd_clear();
3970 
3971 	/*
3972 	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
3973 	 * either kernfs_node is a file.
3974 	 */
3975 	if (kernfs_type(kn) != KERNFS_DIR ||
3976 	    kernfs_type(new_parent) != KERNFS_DIR) {
3977 		rdt_last_cmd_puts("Source and destination must be directories\n");
3978 		ret = -EPERM;
3979 		goto out;
3980 	}
3981 
3982 	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
3983 		ret = -ENOENT;
3984 		goto out;
3985 	}
3986 
3987 	if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
3988 	    !is_mon_groups(kn->parent, kn->name)) {
3989 		rdt_last_cmd_puts("Source must be a MON group\n");
3990 		ret = -EPERM;
3991 		goto out;
3992 	}
3993 
3994 	if (!is_mon_groups(new_parent, new_name)) {
3995 		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
3996 		ret = -EPERM;
3997 		goto out;
3998 	}
3999 
4000 	/*
4001 	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
4002 	 * current parent CTRL_MON group and therefore cannot be assigned to
4003 	 * the new parent, making the move illegal.
4004 	 */
4005 	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
4006 	    rdtgrp->mon.parent != new_prdtgrp) {
4007 		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
4008 		ret = -EPERM;
4009 		goto out;
4010 	}
4011 
4012 	/*
4013 	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
4014 	 * possibility of failing to allocate it after kernfs_rename() has
4015 	 * succeeded.
4016 	 */
4017 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
4018 		ret = -ENOMEM;
4019 		goto out;
4020 	}
4021 
4022 	/*
4023 	 * All input validation and allocations needed by mongrp_reparent()
4024 	 * were done above so that it cannot fail after kernfs_rename()
4025 	 * succeeds; otherwise kernfs_rename() would have to be reverted if
4026 	 * mongrp_reparent() failed.
4027 	 */
4028 	ret = kernfs_rename(kn, new_parent, new_name);
4029 	if (!ret)
4030 		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
4031 
4032 	free_cpumask_var(tmpmask);
4033 
4034 out:
4035 	mutex_unlock(&rdtgroup_mutex);
4036 	rdtgroup_kn_put(rdtgrp, kn);
4037 	rdtgroup_kn_put(new_prdtgrp, new_parent);
4038 	return ret;
4039 }
4040 
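/* Show the mount options that are in effect for the resctrl filesystem. */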
4041 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
4042 {
4043 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
4044 		seq_puts(seq, ",cdp");
4045 
4046 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
4047 		seq_puts(seq, ",cdpl2");
4048 
4049 	if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
4050 		seq_puts(seq, ",mba_MBps");
4051 
4052 	if (resctrl_debug)
4053 		seq_puts(seq, ",debug");
4054 
4055 	return 0;
4056 }
4057 
4058 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
4059 	.mkdir		= rdtgroup_mkdir,
4060 	.rmdir		= rdtgroup_rmdir,
4061 	.rename		= rdtgroup_rename,
4062 	.show_options	= rdtgroup_show_options,
4063 };
4064 
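/*
 * Create the kernfs root of the resctrl filesystem with the default
 * resource group as its root directory.
 */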
4065 static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
4066 {
4067 	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
4068 				      KERNFS_ROOT_CREATE_DEACTIVATED |
4069 				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
4070 				      &rdtgroup_default);
4071 	if (IS_ERR(rdt_root))
4072 		return PTR_ERR(rdt_root);
4073 
4074 	ctx->kfc.root = rdt_root;
4075 	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
4076 
4077 	return 0;
4078 }
4079 
4080 static void rdtgroup_destroy_root(void)
4081 {
4082 	kernfs_destroy_root(rdt_root);
4083 	rdtgroup_default.kn = NULL;
4084 }
4085 
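/*
 * Initialise the default resource group, which uses the reserved CLOSID
 * and RMID and to which all tasks and CPUs belong until moved elsewhere.
 */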
4086 static void __init rdtgroup_setup_default(void)
4087 {
4088 	mutex_lock(&rdtgroup_mutex);
4089 
4090 	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
4091 	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
4092 	rdtgroup_default.type = RDTCTRL_GROUP;
4093 	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
4094 
4095 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
4096 
4097 	mutex_unlock(&rdtgroup_mutex);
4098 }
4099 
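/* Free the per-domain monitoring state allocated by domain_setup_mon_state(). */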
4100 static void domain_destroy_mon_state(struct rdt_mon_domain *d)
4101 {
4102 	bitmap_free(d->rmid_busy_llc);
4103 	kfree(d->mbm_total);
4104 	kfree(d->mbm_local);
4105 }
4106 
4107 void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4108 {
4109 	mutex_lock(&rdtgroup_mutex);
4110 
4111 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
4112 		mba_sc_domain_destroy(r, d);
4113 
4114 	mutex_unlock(&rdtgroup_mutex);
4115 }
4116 
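/*
 * Tear down monitoring for a domain that is going offline: remove its
 * mon_data directories, cancel pending overflow and limbo work, and free
 * its monitoring state.
 */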
4117 void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4118 {
4119 	mutex_lock(&rdtgroup_mutex);
4120 
4121 	/*
4122 	 * If resctrl is mounted, remove all the
4123 	 * per domain monitor data directories.
4124 	 */
4125 	if (resctrl_mounted && resctrl_arch_mon_capable())
4126 		rmdir_mondata_subdir_allrdtgrp(r, d);
4127 
4128 	if (resctrl_is_mbm_enabled())
4129 		cancel_delayed_work(&d->mbm_over);
4130 	if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
4131 		/*
4132 		 * When a package is going down, forcefully
4133 		 * decrement rmid->ebusy. There is no way to know
4134 		 * that the L3 was flushed and hence may lead to
4135 		 * incorrect counts in rare scenarios, but leaving
4136 		 * the RMID as busy creates RMID leaks if the
4137 		 * package never comes back.
4138 		 */
4139 		__check_limbo(d, true);
4140 		cancel_delayed_work(&d->cqm_limbo);
4141 	}
4142 
4143 	domain_destroy_mon_state(d);
4144 
4145 	mutex_unlock(&rdtgroup_mutex);
4146 }
4147 
4148 /**
4149  * domain_setup_mon_state() -  Initialise domain monitoring structures.
4150  * @r:	The resource for the newly online domain.
4151  * @d:	The newly online domain.
4152  *
4153  * Allocate monitor resources that belong to this domain.
4154  * Called when the first CPU of a domain comes online, regardless of whether
4155  * the filesystem is mounted.
4156  * During boot this may be called before global allocations have been made by
4157  * resctrl_mon_resource_init().
4158  *
4159  * Returns 0 for success, or -ENOMEM.
4160  */
4161 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
4162 {
4163 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
4164 	size_t tsize;
4165 
4166 	if (resctrl_arch_is_llc_occupancy_enabled()) {
4167 		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
4168 		if (!d->rmid_busy_llc)
4169 			return -ENOMEM;
4170 	}
4171 	if (resctrl_arch_is_mbm_total_enabled()) {
4172 		tsize = sizeof(*d->mbm_total);
4173 		d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
4174 		if (!d->mbm_total) {
4175 			bitmap_free(d->rmid_busy_llc);
4176 			return -ENOMEM;
4177 		}
4178 	}
4179 	if (resctrl_arch_is_mbm_local_enabled()) {
4180 		tsize = sizeof(*d->mbm_local);
4181 		d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
4182 		if (!d->mbm_local) {
4183 			bitmap_free(d->rmid_busy_llc);
4184 			kfree(d->mbm_total);
4185 			return -ENOMEM;
4186 		}
4187 	}
4188 
4189 	return 0;
4190 }
4191 
4192 int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4193 {
4194 	int err = 0;
4195 
4196 	mutex_lock(&rdtgroup_mutex);
4197 
4198 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
4199 		/* RDT_RESOURCE_MBA is never mon_capable */
4200 		err = mba_sc_domain_allocate(r, d);
4201 	}
4202 
4203 	mutex_unlock(&rdtgroup_mutex);
4204 
4205 	return err;
4206 }
4207 
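/*
 * Set up monitoring for a newly online domain: allocate its monitoring
 * state, initialise its overflow and limbo workers and, if resctrl is
 * mounted, create its per-domain monitor data directories.
 */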
4208 int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4209 {
4210 	int err;
4211 
4212 	mutex_lock(&rdtgroup_mutex);
4213 
4214 	err = domain_setup_mon_state(r, d);
4215 	if (err)
4216 		goto out_unlock;
4217 
4218 	if (resctrl_is_mbm_enabled()) {
4219 		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
4220 		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
4221 					   RESCTRL_PICK_ANY_CPU);
4222 	}
4223 
4224 	if (resctrl_arch_is_llc_occupancy_enabled())
4225 		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
4226 
4227 	/*
4228 	 * If the filesystem is not mounted then only the default resource group
4229 	 * exists. Creation of its directories is deferred until mount time
4230 	 * by rdt_get_tree() calling mkdir_mondata_all().
4231 	 * If resctrl is mounted, add per domain monitor data directories.
4232 	 */
4233 	if (resctrl_mounted && resctrl_arch_mon_capable())
4234 		mkdir_mondata_subdir_allrdtgrp(r, d);
4235 
4236 out_unlock:
4237 	mutex_unlock(&rdtgroup_mutex);
4238 
4239 	return err;
4240 }
4241 
4242 void resctrl_online_cpu(unsigned int cpu)
4243 {
4244 	mutex_lock(&rdtgroup_mutex);
4245 	/* A CPU is added to the default rdtgroup when it comes online. */
4246 	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
4247 	mutex_unlock(&rdtgroup_mutex);
4248 }
4249 
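/*
 * Clear an offlined CPU from the cpu_mask of whichever child MON group
 * of @r it is assigned to.
 */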
4250 static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
4251 {
4252 	struct rdtgroup *cr;
4253 
4254 	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
4255 		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
4256 			break;
4257 	}
4258 }
4259 
4260 static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
4261 						      struct rdt_resource *r)
4262 {
4263 	struct rdt_mon_domain *d;
4264 
4265 	lockdep_assert_cpus_held();
4266 
4267 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
4268 		/* Find the domain that contains this CPU */
4269 		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
4270 			return d;
4271 	}
4272 
4273 	return NULL;
4274 }
4275 
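/*
 * Remove a CPU that is going offline from the resource group it belongs
 * to and, if it was running the MBM overflow or limbo worker for its
 * domain, hand that work over to another CPU in the domain.
 */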
4276 void resctrl_offline_cpu(unsigned int cpu)
4277 {
4278 	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
4279 	struct rdt_mon_domain *d;
4280 	struct rdtgroup *rdtgrp;
4281 
4282 	mutex_lock(&rdtgroup_mutex);
4283 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
4284 		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
4285 			clear_childcpus(rdtgrp, cpu);
4286 			break;
4287 		}
4288 	}
4289 
4290 	if (!l3->mon_capable)
4291 		goto out_unlock;
4292 
4293 	d = get_mon_domain_from_cpu(cpu, l3);
4294 	if (d) {
4295 		if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
4296 			cancel_delayed_work(&d->mbm_over);
4297 			mbm_setup_overflow_handler(d, 0, cpu);
4298 		}
4299 		if (resctrl_arch_is_llc_occupancy_enabled() &&
4300 		    cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
4301 			cancel_delayed_work(&d->cqm_limbo);
4302 			cqm_setup_limbo_handler(d, 0, cpu);
4303 		}
4304 	}
4305 
4306 out_unlock:
4307 	mutex_unlock(&rdtgroup_mutex);
4308 }
4309 
4310 /*
4311  * resctrl_init - resctrl filesystem initialization
4312  *
4313  * Set up the resctrl filesystem: set up the root, create the mount point,
4314  * register the resctrl filesystem and initialize files under the root directory.
4315  *
4316  * Return: 0 on success or -errno
4317  */
4318 int __init resctrl_init(void)
4319 {
4320 	int ret = 0;
4321 
4322 	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
4323 		     sizeof(last_cmd_status_buf));
4324 
4325 	rdtgroup_setup_default();
4326 
4327 	thread_throttle_mode_init();
4328 
4329 	ret = resctrl_mon_resource_init();
4330 	if (ret)
4331 		return ret;
4332 
4333 	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
4334 	if (ret) {
4335 		resctrl_mon_resource_exit();
4336 		return ret;
4337 	}
4338 
4339 	ret = register_filesystem(&rdt_fs_type);
4340 	if (ret)
4341 		goto cleanup_mountpoint;
4342 
4343 	/*
4344 	 * Creating the resctrl debugfs directory here may not be ideal
4345 	 * since it lets the resctrl debugfs directory appear on the
4346 	 * debugfs filesystem before the resctrl filesystem is mounted.
4347 	 * It may also be OK since it enables debugging of RDT before
4348 	 * resctrl is mounted.
4349 	 * The debugfs directory is created here rather than in
4350 	 * rdt_get_tree() because rdt_get_tree() takes rdtgroup_mutex and,
4351 	 * during the debugfs directory creation, also &sb->s_type->i_mutex_key
4352 	 * (the lockdep class of inode->i_rwsem). Other filesystem
4353 	 * interactions (e.g. SyS_getdents) have the lock ordering:
4354 	 *   &sb->s_type->i_mutex_key --> &mm->mmap_lock
4355 	 * During mmap(), called with &mm->mmap_lock held, rdtgroup_mutex
4356 	 * is taken, creating the dependency:
4357 	 *   &mm->mmap_lock --> rdtgroup_mutex
4358 	 * which, combined with the other two lock dependencies, could
4359 	 * lead to a deadlock.
4360 	 * Creating the debugfs directory here avoids that dependency.
4361 	 * File operations cannot occur until the filesystem is mounted,
4362 	 * but there is no way to express that constraint to lockdep.
4363 	 */
4364 	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
4365 
4366 	return 0;
4367 
4368 cleanup_mountpoint:
4369 	sysfs_remove_mount_point(fs_kobj, "resctrl");
4370 	resctrl_mon_resource_exit();
4371 
4372 	return ret;
4373 }
4374 
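/*
 * Remove the debugfs directory and mount point, unregister the resctrl
 * filesystem and free the monitoring resources allocated by
 * resctrl_mon_resource_init().
 */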
4375 void __exit resctrl_exit(void)
4376 {
4377 	debugfs_remove_recursive(debugfs_resctrl);
4378 	unregister_filesystem(&rdt_fs_type);
4379 	sysfs_remove_mount_point(fs_kobj, "resctrl");
4380 
4381 	resctrl_mon_resource_exit();
4382 }
4383