// SPDX-License-Identifier: GPL-2.0-only
/*
* User interface for Resource Allocation in Resource Director Technology (RDT)
*
* Copyright (C) 2016 Intel Corporation
*
* Author: Fenghua Yu <fenghua.yu@intel.com>
*
* More information about RDT can be found in the Intel (R) x86 Architecture
* Software Developer Manual.
*/
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/cpu.h>
16 #include <linux/debugfs.h>
17 #include <linux/fs.h>
18 #include <linux/fs_parser.h>
19 #include <linux/sysfs.h>
20 #include <linux/kernfs.h>
21 #include <linux/resctrl.h>
22 #include <linux/seq_buf.h>
23 #include <linux/seq_file.h>
24 #include <linux/sched/task.h>
25 #include <linux/slab.h>
26 #include <linux/user_namespace.h>
27
28 #include <uapi/linux/magic.h>
29
30 #include "internal.h"
31
32 /* Mutex to protect rdtgroup access. */
33 DEFINE_MUTEX(rdtgroup_mutex);
34
35 static struct kernfs_root *rdt_root;
36
37 struct rdtgroup rdtgroup_default;
38
39 LIST_HEAD(rdt_all_groups);
40
41 /* list of entries for the schemata file */
42 LIST_HEAD(resctrl_schema_all);
43
44 /*
45 * List of struct mon_data containing private data of event files for use by
46 * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
47 */
48 static LIST_HEAD(mon_data_kn_priv_list);
49
50 /* The filesystem can only be mounted once. */
51 bool resctrl_mounted;
52
53 /* Kernel fs node for "info" directory under root */
54 static struct kernfs_node *kn_info;
55
56 /* Kernel fs node for "mon_groups" directory under root */
57 static struct kernfs_node *kn_mongrp;
58
59 /* Kernel fs node for "mon_data" directory under root */
60 static struct kernfs_node *kn_mondata;
61
62 /*
63 * Used to store the max resource name width to display the schemata names in
64 * a tabular format.
65 */
66 int max_name_width;
67
68 static struct seq_buf last_cmd_status;
69
70 static char last_cmd_status_buf[512];
71
72 static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
73
74 static void rdtgroup_destroy_root(void);
75
76 struct dentry *debugfs_resctrl;
77
78 /*
79 * Memory bandwidth monitoring event to use for the default CTRL_MON group
80 * and each new CTRL_MON group created by the user. Only relevant when
81 * the filesystem is mounted with the "mba_MBps" option so it does not
82 * matter that it remains uninitialized on systems that do not support
83 * the "mba_MBps" option.
84 */
85 enum resctrl_event_id mba_mbps_default_event;
86
87 static bool resctrl_debug;
88
void rdt_last_cmd_clear(void)
90 {
91 lockdep_assert_held(&rdtgroup_mutex);
92 seq_buf_clear(&last_cmd_status);
93 }
94
void rdt_last_cmd_puts(const char *s)
96 {
97 lockdep_assert_held(&rdtgroup_mutex);
98 seq_buf_puts(&last_cmd_status, s);
99 }
100
void rdt_last_cmd_printf(const char *fmt, ...)
102 {
103 va_list ap;
104
105 va_start(ap, fmt);
106 lockdep_assert_held(&rdtgroup_mutex);
107 seq_buf_vprintf(&last_cmd_status, fmt, ap);
108 va_end(ap);
109 }
110
void rdt_staged_configs_clear(void)
112 {
113 struct rdt_ctrl_domain *dom;
114 struct rdt_resource *r;
115
116 lockdep_assert_held(&rdtgroup_mutex);
117
118 for_each_alloc_capable_rdt_resource(r) {
119 list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
120 memset(dom->staged_config, 0, sizeof(dom->staged_config));
121 }
122 }
123
static bool resctrl_is_mbm_enabled(void)
125 {
126 return (resctrl_arch_is_mbm_total_enabled() ||
127 resctrl_arch_is_mbm_local_enabled());
128 }
129
static bool resctrl_is_mbm_event(int e)
131 {
132 return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
133 e <= QOS_L3_MBM_LOCAL_EVENT_ID);
134 }
135
136 /*
137 * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
138 * of free CLOSIDs.
139 *
140 * Using a global CLOSID across all resources has some advantages and
141 * some drawbacks:
142 * + We can simply set current's closid to assign a task to a resource
143 * group.
144 * + Context switch code can avoid extra memory references deciding which
145 * CLOSID to load into the PQR_ASSOC MSR
146 * - We give up some options in configuring resource groups across multi-socket
147 * systems.
148 * - Our choices on how to configure each resource become progressively more
149 * limited as the number of resources grows.
150 */
151 static unsigned long *closid_free_map;
152
153 static int closid_free_map_len;
154
int closids_supported(void)
156 {
157 return closid_free_map_len;
158 }
159
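/*
* Initialize the CLOSID free map. The number of usable CLOSIDs is the
* minimum num_closid across all enabled schemata, since a resource group
* uses the same CLOSID for every resource.
*/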
static int closid_init(void)
161 {
162 struct resctrl_schema *s;
163 u32 rdt_min_closid = ~0;
164
165 /* Monitor only platforms still call closid_init() */
166 if (list_empty(&resctrl_schema_all))
167 return 0;
168
169 /* Compute rdt_min_closid across all resources */
170 list_for_each_entry(s, &resctrl_schema_all, list)
171 rdt_min_closid = min(rdt_min_closid, s->num_closid);
172
173 closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
174 if (!closid_free_map)
175 return -ENOMEM;
176 bitmap_fill(closid_free_map, rdt_min_closid);
177
178 /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
179 __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
180 closid_free_map_len = rdt_min_closid;
181
182 return 0;
183 }
184
static void closid_exit(void)
186 {
187 bitmap_free(closid_free_map);
188 closid_free_map = NULL;
189 }
190
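/*
* Pick a free CLOSID. When limbo monitoring depends on the CLOSID, prefer
* the "cleanest" CLOSID reported by resctrl_find_cleanest_closid();
* otherwise take the first free bit in the map. Returns the CLOSID or a
* negative error code.
*/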
static int closid_alloc(void)
192 {
193 int cleanest_closid;
194 u32 closid;
195
196 lockdep_assert_held(&rdtgroup_mutex);
197
198 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
199 resctrl_arch_is_llc_occupancy_enabled()) {
200 cleanest_closid = resctrl_find_cleanest_closid();
201 if (cleanest_closid < 0)
202 return cleanest_closid;
203 closid = cleanest_closid;
204 } else {
205 closid = find_first_bit(closid_free_map, closid_free_map_len);
206 if (closid == closid_free_map_len)
207 return -ENOSPC;
208 }
209 __clear_bit(closid, closid_free_map);
210
211 return closid;
212 }
213
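/* Return @closid to the free map. */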
void closid_free(int closid)
215 {
216 lockdep_assert_held(&rdtgroup_mutex);
217
218 __set_bit(closid, closid_free_map);
219 }
220
221 /**
222 * closid_allocated - test if provided closid is in use
223 * @closid: closid to be tested
224 *
225 * Return: true if @closid is currently associated with a resource group,
226 * false if @closid is free
227 */
bool closid_allocated(unsigned int closid)
229 {
230 lockdep_assert_held(&rdtgroup_mutex);
231
232 return !test_bit(closid, closid_free_map);
233 }
234
235 /**
236 * rdtgroup_mode_by_closid - Return mode of resource group with closid
* @closid: closid of the resource group
238 *
239 * Each resource group is associated with a @closid. Here the mode
240 * of a resource group can be queried by searching for it using its closid.
241 *
242 * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
243 */
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
245 {
246 struct rdtgroup *rdtgrp;
247
248 list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
249 if (rdtgrp->closid == closid)
250 return rdtgrp->mode;
251 }
252
253 return RDT_NUM_MODES;
254 }
255
256 static const char * const rdt_mode_str[] = {
257 [RDT_MODE_SHAREABLE] = "shareable",
258 [RDT_MODE_EXCLUSIVE] = "exclusive",
259 [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
260 [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
261 };
262
263 /**
264 * rdtgroup_mode_str - Return the string representation of mode
265 * @mode: the resource group mode as &enum rdtgroup_mode
266 *
267 * Return: string representation of valid mode, "unknown" otherwise
268 */
static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
270 {
271 if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
272 return "unknown";
273
274 return rdt_mode_str[mode];
275 }
276
277 /* set uid and gid of rdtgroup dirs and files to that of the creator */
static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
279 {
280 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
281 .ia_uid = current_fsuid(),
282 .ia_gid = current_fsgid(), };
283
284 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
285 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
286 return 0;
287
288 return kernfs_setattr(kn, &iattr);
289 }
290
static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
292 {
293 struct kernfs_node *kn;
294 int ret;
295
296 kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
297 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
298 0, rft->kf_ops, rft, NULL, NULL);
299 if (IS_ERR(kn))
300 return PTR_ERR(kn);
301
302 ret = rdtgroup_kn_set_ugid(kn);
303 if (ret) {
304 kernfs_remove(kn);
305 return ret;
306 }
307
308 return 0;
309 }
310
static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
312 {
313 struct kernfs_open_file *of = m->private;
314 struct rftype *rft = of->kn->priv;
315
316 if (rft->seq_show)
317 return rft->seq_show(of, m, arg);
318 return 0;
319 }
320
static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
323 {
324 struct rftype *rft = of->kn->priv;
325
326 if (rft->write)
327 return rft->write(of, buf, nbytes, off);
328
329 return -EINVAL;
330 }
331
332 static const struct kernfs_ops rdtgroup_kf_single_ops = {
333 .atomic_write_len = PAGE_SIZE,
334 .write = rdtgroup_file_write,
335 .seq_show = rdtgroup_seqfile_show,
336 };
337
338 static const struct kernfs_ops kf_mondata_ops = {
339 .atomic_write_len = PAGE_SIZE,
340 .seq_show = rdtgroup_mondata_show,
341 };
342
static bool is_cpu_list(struct kernfs_open_file *of)
344 {
345 struct rftype *rft = of->kn->priv;
346
347 return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
348 }
349
static int rdtgroup_cpus_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
352 {
353 struct rdtgroup *rdtgrp;
354 struct cpumask *mask;
355 int ret = 0;
356
357 rdtgrp = rdtgroup_kn_lock_live(of->kn);
358
359 if (rdtgrp) {
360 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
361 if (!rdtgrp->plr->d) {
362 rdt_last_cmd_clear();
363 rdt_last_cmd_puts("Cache domain offline\n");
364 ret = -ENODEV;
365 } else {
366 mask = &rdtgrp->plr->d->hdr.cpu_mask;
367 seq_printf(s, is_cpu_list(of) ?
368 "%*pbl\n" : "%*pb\n",
369 cpumask_pr_args(mask));
370 }
371 } else {
372 seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
373 cpumask_pr_args(&rdtgrp->cpu_mask));
374 }
375 } else {
376 ret = -ENOENT;
377 }
378 rdtgroup_kn_unlock(of->kn);
379
380 return ret;
381 }
382
383 /*
* Update the PQR_ASSOC MSR on all CPUs in @cpu_mask.
385 *
386 * Per task closids/rmids must have been set up before calling this function.
387 * @r may be NULL.
388 */
389 static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
391 {
392 struct resctrl_cpu_defaults defaults, *p = NULL;
393
394 if (r) {
395 defaults.closid = r->closid;
396 defaults.rmid = r->mon.rmid;
397 p = &defaults;
398 }
399
400 on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
401 }
402
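/*
* Update the CPU mask of a monitor group. CPUs may only be added if they
* belong to the parent control group; CPUs dropped from the group revert
* to the parent, and CPUs added are pulled from sibling monitor groups.
*/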
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
cpumask_var_t tmpmask)
405 {
406 struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
407 struct list_head *head;
408
409 /* Check whether cpus belong to parent ctrl group */
410 cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
411 if (!cpumask_empty(tmpmask)) {
412 rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
413 return -EINVAL;
414 }
415
416 /* Check whether cpus are dropped from this group */
417 cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
418 if (!cpumask_empty(tmpmask)) {
419 /* Give any dropped cpus to parent rdtgroup */
420 cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
421 update_closid_rmid(tmpmask, prgrp);
422 }
423
424 /*
425 * If we added cpus, remove them from previous group that owned them
426 * and update per-cpu rmid
427 */
428 cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
429 if (!cpumask_empty(tmpmask)) {
430 head = &prgrp->mon.crdtgrp_list;
431 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
432 if (crgrp == rdtgrp)
433 continue;
434 cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
435 tmpmask);
436 }
437 update_closid_rmid(tmpmask, rdtgrp);
438 }
439
440 /* Done pushing/pulling - update this group with new mask */
441 cpumask_copy(&rdtgrp->cpu_mask, newmask);
442
443 return 0;
444 }
445
static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
447 {
448 struct rdtgroup *crgrp;
449
450 cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
/* Update the child mon group masks as well. */
452 list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
453 cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
454 }
455
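/*
* Update the CPU mask of a control group. CPUs dropped from the group are
* given back to the default group, CPUs added are removed from whichever
* group (and its child monitor groups) previously owned them, and the
* per-CPU CLOSID/RMID state is resynchronized.
*/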
static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
458 {
459 struct rdtgroup *r, *crgrp;
460 struct list_head *head;
461
462 /* Check whether cpus are dropped from this group */
463 cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
464 if (!cpumask_empty(tmpmask)) {
465 /* Can't drop from default group */
466 if (rdtgrp == &rdtgroup_default) {
467 rdt_last_cmd_puts("Can't drop CPUs from default group\n");
468 return -EINVAL;
469 }
470
471 /* Give any dropped cpus to rdtgroup_default */
472 cpumask_or(&rdtgroup_default.cpu_mask,
473 &rdtgroup_default.cpu_mask, tmpmask);
474 update_closid_rmid(tmpmask, &rdtgroup_default);
475 }
476
477 /*
478 * If we added cpus, remove them from previous group and
479 * the prev group's child groups that owned them
480 * and update per-cpu closid/rmid.
481 */
482 cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
483 if (!cpumask_empty(tmpmask)) {
484 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
485 if (r == rdtgrp)
486 continue;
487 cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
488 if (!cpumask_empty(tmpmask1))
489 cpumask_rdtgrp_clear(r, tmpmask1);
490 }
491 update_closid_rmid(tmpmask, rdtgrp);
492 }
493
494 /* Done pushing/pulling - update this group with new mask */
495 cpumask_copy(&rdtgrp->cpu_mask, newmask);
496
497 /*
498 * Clear child mon group masks since there is a new parent mask
499 * now and update the rmid for the cpus the child lost.
500 */
501 head = &rdtgrp->mon.crdtgrp_list;
502 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
503 cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
504 update_closid_rmid(tmpmask, rdtgrp);
505 cpumask_clear(&crgrp->cpu_mask);
506 }
507
508 return 0;
509 }
510
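/*
* Handler for writes to the "cpus" and "cpus_list" files. Parses the new
* CPU mask (or list), rejects offline CPUs, and applies it to the control
* or monitor group backing the file.
*/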
static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
513 {
514 cpumask_var_t tmpmask, newmask, tmpmask1;
515 struct rdtgroup *rdtgrp;
516 int ret;
517
518 if (!buf)
519 return -EINVAL;
520
521 if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
522 return -ENOMEM;
523 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
524 free_cpumask_var(tmpmask);
525 return -ENOMEM;
526 }
527 if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
528 free_cpumask_var(tmpmask);
529 free_cpumask_var(newmask);
530 return -ENOMEM;
531 }
532
533 rdtgrp = rdtgroup_kn_lock_live(of->kn);
534 if (!rdtgrp) {
535 ret = -ENOENT;
536 goto unlock;
537 }
538
539 rdt_last_cmd_clear();
540
541 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
542 rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
543 ret = -EINVAL;
544 rdt_last_cmd_puts("Pseudo-locking in progress\n");
545 goto unlock;
546 }
547
548 if (is_cpu_list(of))
549 ret = cpulist_parse(buf, newmask);
550 else
551 ret = cpumask_parse(buf, newmask);
552
553 if (ret) {
554 rdt_last_cmd_puts("Bad CPU list/mask\n");
555 goto unlock;
556 }
557
558 /* check that user didn't specify any offline cpus */
559 cpumask_andnot(tmpmask, newmask, cpu_online_mask);
560 if (!cpumask_empty(tmpmask)) {
561 ret = -EINVAL;
562 rdt_last_cmd_puts("Can only assign online CPUs\n");
563 goto unlock;
564 }
565
566 if (rdtgrp->type == RDTCTRL_GROUP)
567 ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
568 else if (rdtgrp->type == RDTMON_GROUP)
569 ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
570 else
571 ret = -EINVAL;
572
573 unlock:
574 rdtgroup_kn_unlock(of->kn);
575 free_cpumask_var(tmpmask);
576 free_cpumask_var(newmask);
577 free_cpumask_var(tmpmask1);
578
579 return ret ?: nbytes;
580 }
581
582 /**
583 * rdtgroup_remove - the helper to remove resource group safely
584 * @rdtgrp: resource group to remove
585 *
586 * On resource group creation via a mkdir, an extra kernfs_node reference is
587 * taken to ensure that the rdtgroup structure remains accessible for the
588 * rdtgroup_kn_unlock() calls where it is removed.
589 *
590 * Drop the extra reference here, then free the rdtgroup structure.
591 *
592 * Return: void
593 */
static void rdtgroup_remove(struct rdtgroup *rdtgrp)
595 {
596 kernfs_put(rdtgrp->kn);
597 kfree(rdtgrp);
598 }
599
static void _update_task_closid_rmid(void *task)
601 {
602 /*
603 * If the task is still current on this CPU, update PQR_ASSOC MSR.
604 * Otherwise, the MSR is updated when the task is scheduled in.
605 */
606 if (task == current)
607 resctrl_arch_sched_in(task);
608 }
609
static void update_task_closid_rmid(struct task_struct *t)
611 {
612 if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
613 smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
614 else
615 _update_task_closid_rmid(t);
616 }
617
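/* Return true if @tsk's CLOSID and RMID already match @rdtgrp. */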
static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
619 {
620 u32 closid, rmid = rdtgrp->mon.rmid;
621
622 if (rdtgrp->type == RDTCTRL_GROUP)
623 closid = rdtgrp->closid;
624 else if (rdtgrp->type == RDTMON_GROUP)
625 closid = rdtgrp->mon.parent->closid;
626 else
627 return false;
628
629 return resctrl_arch_match_closid(tsk, closid) &&
630 resctrl_arch_match_rmid(tsk, closid, rmid);
631 }
632
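/*
* Move @tsk into @rdtgrp by updating its CLOSID/RMID, then nudge the CPU
* the task is running on so the change takes effect immediately.
*/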
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
635 {
636 /* If the task is already in rdtgrp, no need to move the task. */
637 if (task_in_rdtgroup(tsk, rdtgrp))
638 return 0;
639
640 /*
641 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
642 * updated by them.
643 *
644 * For ctrl_mon groups, move both closid and rmid.
645 * For monitor groups, can move the tasks only from
646 * their parent CTRL group.
647 */
648 if (rdtgrp->type == RDTMON_GROUP &&
649 !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
650 rdt_last_cmd_puts("Can't move task to different control group\n");
651 return -EINVAL;
652 }
653
654 if (rdtgrp->type == RDTMON_GROUP)
655 resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
656 rdtgrp->mon.rmid);
657 else
658 resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
659 rdtgrp->mon.rmid);
660
661 /*
* Ensure the task's closid and rmid are written before determining if
* the task is current, since that decides whether it will be interrupted.
664 * This pairs with the full barrier between the rq->curr update and
665 * resctrl_arch_sched_in() during context switch.
666 */
667 smp_mb();
668
669 /*
670 * By now, the task's closid and rmid are set. If the task is current
671 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
672 * group go into effect. If the task is not current, the MSR will be
673 * updated when the task is scheduled in.
674 */
675 update_task_closid_rmid(tsk);
676
677 return 0;
678 }
679
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
681 {
682 return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
683 resctrl_arch_match_closid(t, r->closid));
684 }
685
static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
687 {
688 return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
689 resctrl_arch_match_rmid(t, r->mon.parent->closid,
690 r->mon.rmid));
691 }
692
693 /**
694 * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
695 * @r: Resource group
696 *
697 * Return: 1 if tasks have been assigned to @r, 0 otherwise
698 */
int rdtgroup_tasks_assigned(struct rdtgroup *r)
700 {
701 struct task_struct *p, *t;
702 int ret = 0;
703
704 lockdep_assert_held(&rdtgroup_mutex);
705
706 rcu_read_lock();
707 for_each_process_thread(p, t) {
708 if (is_closid_match(t, r) || is_rmid_match(t, r)) {
709 ret = 1;
710 break;
711 }
712 }
713 rcu_read_unlock();
714
715 return ret;
716 }
717
static int rdtgroup_task_write_permission(struct task_struct *task,
struct kernfs_open_file *of)
720 {
721 const struct cred *tcred = get_task_cred(task);
722 const struct cred *cred = current_cred();
723 int ret = 0;
724
725 /*
726 * Even if we're attaching all tasks in the thread group, we only
727 * need to check permissions on one of them.
728 */
729 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
730 !uid_eq(cred->euid, tcred->uid) &&
731 !uid_eq(cred->euid, tcred->suid)) {
732 rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
733 ret = -EPERM;
734 }
735
736 put_cred(tcred);
737 return ret;
738 }
739
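/*
* Look up the task for @pid (0 means the caller), check that the writer
* has permission to move it, and move it into @rdtgrp.
*/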
static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
struct kernfs_open_file *of)
742 {
743 struct task_struct *tsk;
744 int ret;
745
746 rcu_read_lock();
747 if (pid) {
748 tsk = find_task_by_vpid(pid);
749 if (!tsk) {
750 rcu_read_unlock();
751 rdt_last_cmd_printf("No task %d\n", pid);
752 return -ESRCH;
753 }
754 } else {
755 tsk = current;
756 }
757
758 get_task_struct(tsk);
759 rcu_read_unlock();
760
761 ret = rdtgroup_task_write_permission(tsk, of);
762 if (!ret)
763 ret = __rdtgroup_move_task(tsk, rdtgrp);
764
765 put_task_struct(tsk);
766 return ret;
767 }
768
static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
771 {
772 struct rdtgroup *rdtgrp;
773 char *pid_str;
774 int ret = 0;
775 pid_t pid;
776
777 rdtgrp = rdtgroup_kn_lock_live(of->kn);
778 if (!rdtgrp) {
779 rdtgroup_kn_unlock(of->kn);
780 return -ENOENT;
781 }
782 rdt_last_cmd_clear();
783
784 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
785 rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
786 ret = -EINVAL;
787 rdt_last_cmd_puts("Pseudo-locking in progress\n");
788 goto unlock;
789 }
790
791 while (buf && buf[0] != '\0' && buf[0] != '\n') {
792 pid_str = strim(strsep(&buf, ","));
793
794 if (kstrtoint(pid_str, 0, &pid)) {
795 rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
796 ret = -EINVAL;
797 break;
798 }
799
800 if (pid < 0) {
801 rdt_last_cmd_printf("Invalid pid %d\n", pid);
802 ret = -EINVAL;
803 break;
804 }
805
806 ret = rdtgroup_move_task(pid, rdtgrp, of);
807 if (ret) {
808 rdt_last_cmd_printf("Error while processing task %d\n", pid);
809 break;
810 }
811 }
812
813 unlock:
814 rdtgroup_kn_unlock(of->kn);
815
816 return ret ?: nbytes;
817 }
818
static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
820 {
821 struct task_struct *p, *t;
822 pid_t pid;
823
824 rcu_read_lock();
825 for_each_process_thread(p, t) {
826 if (is_closid_match(t, r) || is_rmid_match(t, r)) {
827 pid = task_pid_vnr(t);
828 if (pid)
829 seq_printf(s, "%d\n", pid);
830 }
831 }
832 rcu_read_unlock();
833 }
834
static int rdtgroup_tasks_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
837 {
838 struct rdtgroup *rdtgrp;
839 int ret = 0;
840
841 rdtgrp = rdtgroup_kn_lock_live(of->kn);
842 if (rdtgrp)
843 show_rdt_tasks(rdtgrp, s);
844 else
845 ret = -ENOENT;
846 rdtgroup_kn_unlock(of->kn);
847
848 return ret;
849 }
850
static int rdtgroup_closid_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
853 {
854 struct rdtgroup *rdtgrp;
855 int ret = 0;
856
857 rdtgrp = rdtgroup_kn_lock_live(of->kn);
858 if (rdtgrp)
859 seq_printf(s, "%u\n", rdtgrp->closid);
860 else
861 ret = -ENOENT;
862 rdtgroup_kn_unlock(of->kn);
863
864 return ret;
865 }
866
static int rdtgroup_rmid_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
869 {
870 struct rdtgroup *rdtgrp;
871 int ret = 0;
872
873 rdtgrp = rdtgroup_kn_lock_live(of->kn);
874 if (rdtgrp)
875 seq_printf(s, "%u\n", rdtgrp->mon.rmid);
876 else
877 ret = -ENOENT;
878 rdtgroup_kn_unlock(of->kn);
879
880 return ret;
881 }
882
883 #ifdef CONFIG_PROC_CPU_RESCTRL
884 /*
885 * A task can only be part of one resctrl control group and of one monitor
886 * group which is associated to that control group.
887 *
888 * 1) res:
889 * mon:
890 *
891 * resctrl is not available.
892 *
893 * 2) res:/
894 * mon:
895 *
896 * Task is part of the root resctrl control group, and it is not associated
897 * to any monitor group.
898 *
899 * 3) res:/
900 * mon:mon0
901 *
902 * Task is part of the root resctrl control group and monitor group mon0.
903 *
904 * 4) res:group0
905 * mon:
906 *
907 * Task is part of resctrl control group group0, and it is not associated
908 * to any monitor group.
909 *
910 * 5) res:group0
911 * mon:mon1
912 *
913 * Task is part of resctrl control group group0 and monitor group mon1.
914 */
int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
917 {
918 struct rdtgroup *rdtg;
919 int ret = 0;
920
921 mutex_lock(&rdtgroup_mutex);
922
923 /* Return empty if resctrl has not been mounted. */
924 if (!resctrl_mounted) {
925 seq_puts(s, "res:\nmon:\n");
926 goto unlock;
927 }
928
929 list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
930 struct rdtgroup *crg;
931
932 /*
933 * Task information is only relevant for shareable
934 * and exclusive groups.
935 */
936 if (rdtg->mode != RDT_MODE_SHAREABLE &&
937 rdtg->mode != RDT_MODE_EXCLUSIVE)
938 continue;
939
940 if (!resctrl_arch_match_closid(tsk, rdtg->closid))
941 continue;
942
943 seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
944 rdt_kn_name(rdtg->kn));
945 seq_puts(s, "mon:");
946 list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
947 mon.crdtgrp_list) {
948 if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
949 crg->mon.rmid))
950 continue;
951 seq_printf(s, "%s", rdt_kn_name(crg->kn));
952 break;
953 }
954 seq_putc(s, '\n');
955 goto unlock;
956 }
957 /*
958 * The above search should succeed. Otherwise return
959 * with an error.
960 */
961 ret = -ENOENT;
962 unlock:
963 mutex_unlock(&rdtgroup_mutex);
964
965 return ret;
966 }
967 #endif
968
static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
971 {
972 int len;
973
974 mutex_lock(&rdtgroup_mutex);
975 len = seq_buf_used(&last_cmd_status);
976 if (len)
977 seq_printf(seq, "%.*s", len, last_cmd_status_buf);
978 else
979 seq_puts(seq, "ok\n");
980 mutex_unlock(&rdtgroup_mutex);
981 return 0;
982 }
983
static void *rdt_kn_parent_priv(struct kernfs_node *kn)
985 {
986 /*
987 * The parent pointer is only valid within RCU section since it can be
988 * replaced.
989 */
990 guard(rcu)();
991 return rcu_dereference(kn->__parent)->priv;
992 }
993
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
996 {
997 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
998
999 seq_printf(seq, "%u\n", s->num_closid);
1000 return 0;
1001 }
1002
static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1005 {
1006 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1007 struct rdt_resource *r = s->res;
1008
1009 seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
1010 return 0;
1011 }
1012
static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1015 {
1016 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1017 struct rdt_resource *r = s->res;
1018
1019 seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
1020 return 0;
1021 }
1022
static int rdt_shareable_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1025 {
1026 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1027 struct rdt_resource *r = s->res;
1028
1029 seq_printf(seq, "%x\n", r->cache.shareable_bits);
1030 return 0;
1031 }
1032
1033 /*
1034 * rdt_bit_usage_show - Display current usage of resources
1035 *
1036 * A domain is a shared resource that can now be allocated differently. Here
1037 * we display the current regions of the domain as an annotated bitmask.
1038 * For each domain of this resource its allocation bitmask
1039 * is annotated as below to indicate the current usage of the corresponding bit:
1040 * 0 - currently unused
1041 * X - currently available for sharing and used by software and hardware
1042 * H - currently used by hardware only but available for software use
1043 * S - currently used and shareable by software only
1044 * E - currently used exclusively by one resource group
1045 * P - currently pseudo-locked by one resource group
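*
* Example (hypothetical): a resource with a 16-bit CBM and two cache
* domains might show "0=XXXXXXXX000000SS;1=XXXXXXXXEEEE0000".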
1046 */
static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1049 {
1050 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1051 /*
1052 * Use unsigned long even though only 32 bits are used to ensure
1053 * test_bit() is used safely.
1054 */
1055 unsigned long sw_shareable = 0, hw_shareable = 0;
1056 unsigned long exclusive = 0, pseudo_locked = 0;
1057 struct rdt_resource *r = s->res;
1058 struct rdt_ctrl_domain *dom;
1059 int i, hwb, swb, excl, psl;
1060 enum rdtgrp_mode mode;
1061 bool sep = false;
1062 u32 ctrl_val;
1063
1064 cpus_read_lock();
1065 mutex_lock(&rdtgroup_mutex);
1066 hw_shareable = r->cache.shareable_bits;
1067 list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
1068 if (sep)
1069 seq_putc(seq, ';');
1070 sw_shareable = 0;
1071 exclusive = 0;
1072 seq_printf(seq, "%d=", dom->hdr.id);
1073 for (i = 0; i < closids_supported(); i++) {
1074 if (!closid_allocated(i))
1075 continue;
1076 ctrl_val = resctrl_arch_get_config(r, dom, i,
1077 s->conf_type);
1078 mode = rdtgroup_mode_by_closid(i);
1079 switch (mode) {
1080 case RDT_MODE_SHAREABLE:
1081 sw_shareable |= ctrl_val;
1082 break;
1083 case RDT_MODE_EXCLUSIVE:
1084 exclusive |= ctrl_val;
1085 break;
1086 case RDT_MODE_PSEUDO_LOCKSETUP:
1087 /*
1088 * RDT_MODE_PSEUDO_LOCKSETUP is possible
1089 * here but not included since the CBM
1090 * associated with this CLOSID in this mode
1091 * is not initialized and no task or cpu can be
1092 * assigned this CLOSID.
1093 */
1094 break;
1095 case RDT_MODE_PSEUDO_LOCKED:
1096 case RDT_NUM_MODES:
1097 WARN(1,
1098 "invalid mode for closid %d\n", i);
1099 break;
1100 }
1101 }
1102 for (i = r->cache.cbm_len - 1; i >= 0; i--) {
1103 pseudo_locked = dom->plr ? dom->plr->cbm : 0;
1104 hwb = test_bit(i, &hw_shareable);
1105 swb = test_bit(i, &sw_shareable);
1106 excl = test_bit(i, &exclusive);
1107 psl = test_bit(i, &pseudo_locked);
1108 if (hwb && swb)
1109 seq_putc(seq, 'X');
1110 else if (hwb && !swb)
1111 seq_putc(seq, 'H');
1112 else if (!hwb && swb)
1113 seq_putc(seq, 'S');
1114 else if (excl)
1115 seq_putc(seq, 'E');
1116 else if (psl)
1117 seq_putc(seq, 'P');
1118 else /* Unused bits remain */
1119 seq_putc(seq, '0');
1120 }
1121 sep = true;
1122 }
1123 seq_putc(seq, '\n');
1124 mutex_unlock(&rdtgroup_mutex);
1125 cpus_read_unlock();
1126 return 0;
1127 }
1128
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1131 {
1132 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1133 struct rdt_resource *r = s->res;
1134
1135 seq_printf(seq, "%u\n", r->membw.min_bw);
1136 return 0;
1137 }
1138
static int rdt_num_rmids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1141 {
1142 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1143
1144 seq_printf(seq, "%d\n", r->num_rmid);
1145
1146 return 0;
1147 }
1148
static int rdt_mon_features_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1151 {
1152 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1153 struct mon_evt *mevt;
1154
1155 list_for_each_entry(mevt, &r->evt_list, list) {
1156 seq_printf(seq, "%s\n", mevt->name);
1157 if (mevt->configurable)
1158 seq_printf(seq, "%s_config\n", mevt->name);
1159 }
1160
1161 return 0;
1162 }
1163
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1166 {
1167 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1168 struct rdt_resource *r = s->res;
1169
1170 seq_printf(seq, "%u\n", r->membw.bw_gran);
1171 return 0;
1172 }
1173
static int rdt_delay_linear_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1176 {
1177 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1178 struct rdt_resource *r = s->res;
1179
1180 seq_printf(seq, "%u\n", r->membw.delay_linear);
1181 return 0;
1182 }
1183
static int max_threshold_occ_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1186 {
1187 seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
1188
1189 return 0;
1190 }
1191
static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1194 {
1195 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1196 struct rdt_resource *r = s->res;
1197
1198 switch (r->membw.throttle_mode) {
1199 case THREAD_THROTTLE_PER_THREAD:
1200 seq_puts(seq, "per-thread\n");
1201 return 0;
1202 case THREAD_THROTTLE_MAX:
1203 seq_puts(seq, "max\n");
1204 return 0;
1205 case THREAD_THROTTLE_UNDEFINED:
1206 seq_puts(seq, "undefined\n");
1207 return 0;
1208 }
1209
1210 WARN_ON_ONCE(1);
1211
1212 return 0;
1213 }
1214
static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
1217 {
1218 unsigned int bytes;
1219 int ret;
1220
1221 ret = kstrtouint(buf, 0, &bytes);
1222 if (ret)
1223 return ret;
1224
1225 if (bytes > resctrl_rmid_realloc_limit)
1226 return -EINVAL;
1227
1228 resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
1229
1230 return nbytes;
1231 }
1232
1233 /*
1234 * rdtgroup_mode_show - Display mode of this resource group
1235 */
static int rdtgroup_mode_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
1238 {
1239 struct rdtgroup *rdtgrp;
1240
1241 rdtgrp = rdtgroup_kn_lock_live(of->kn);
1242 if (!rdtgrp) {
1243 rdtgroup_kn_unlock(of->kn);
1244 return -ENOENT;
1245 }
1246
1247 seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1248
1249 rdtgroup_kn_unlock(of->kn);
1250 return 0;
1251 }
1252
static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
1254 {
1255 switch (my_type) {
1256 case CDP_CODE:
1257 return CDP_DATA;
1258 case CDP_DATA:
1259 return CDP_CODE;
1260 default:
1261 case CDP_NONE:
1262 return CDP_NONE;
1263 }
1264 }
1265
static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1268 {
1269 struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1270 struct rdt_resource *r = s->res;
1271
1272 seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
1273
1274 return 0;
1275 }
1276
1277 /**
1278 * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1279 * @r: Resource to which domain instance @d belongs.
1280 * @d: The domain instance for which @closid is being tested.
1281 * @cbm: Capacity bitmask being tested.
1282 * @closid: Intended closid for @cbm.
1283 * @type: CDP type of @r.
1284 * @exclusive: Only check if overlaps with exclusive resource groups
1285 *
1286 * Checks if provided @cbm intended to be used for @closid on domain
1287 * @d overlaps with any other closids or other hardware usage associated
1288 * with this domain. If @exclusive is true then only overlaps with
1289 * resource groups in exclusive mode will be considered. If @exclusive
1290 * is false then overlaps with any resource group or hardware entities
1291 * will be considered.
1292 *
1293 * @cbm is unsigned long, even if only 32 bits are used, to make the
1294 * bitmap functions work correctly.
1295 *
1296 * Return: false if CBM does not overlap, true if it does.
1297 */
static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid,
enum resctrl_conf_type type, bool exclusive)
1301 {
1302 enum rdtgrp_mode mode;
1303 unsigned long ctrl_b;
1304 int i;
1305
1306 /* Check for any overlap with regions used by hardware directly */
1307 if (!exclusive) {
1308 ctrl_b = r->cache.shareable_bits;
1309 if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1310 return true;
1311 }
1312
1313 /* Check for overlap with other resource groups */
1314 for (i = 0; i < closids_supported(); i++) {
1315 ctrl_b = resctrl_arch_get_config(r, d, i, type);
1316 mode = rdtgroup_mode_by_closid(i);
1317 if (closid_allocated(i) && i != closid &&
1318 mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1319 if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1320 if (exclusive) {
1321 if (mode == RDT_MODE_EXCLUSIVE)
1322 return true;
1323 continue;
1324 }
1325 return true;
1326 }
1327 }
1328 }
1329
1330 return false;
1331 }
1332
1333 /**
1334 * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1335 * @s: Schema for the resource to which domain instance @d belongs.
1336 * @d: The domain instance for which @closid is being tested.
1337 * @cbm: Capacity bitmask being tested.
1338 * @closid: Intended closid for @cbm.
1339 * @exclusive: Only check if overlaps with exclusive resource groups
1340 *
1341 * Resources that can be allocated using a CBM can use the CBM to control
* the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
1343 * for overlap. Overlap test is not limited to the specific resource for
1344 * which the CBM is intended though - when dealing with CDP resources that
1345 * share the underlying hardware the overlap check should be performed on
1346 * the CDP resource sharing the hardware also.
1347 *
1348 * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1349 * overlap test.
1350 *
1351 * Return: true if CBM overlap detected, false if there is no overlap
1352 */
bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive)
1355 {
1356 enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
1357 struct rdt_resource *r = s->res;
1358
1359 if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
1360 exclusive))
1361 return true;
1362
1363 if (!resctrl_arch_get_cdp_enabled(r->rid))
1364 return false;
1365 return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
1366 }
1367
1368 /**
1369 * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1370 * @rdtgrp: Resource group identified through its closid.
1371 *
1372 * An exclusive resource group implies that there should be no sharing of
1373 * its allocated resources. At the time this group is considered to be
1374 * exclusive this test can determine if its current schemata supports this
1375 * setting by testing for overlap with all other resource groups.
1376 *
1377 * Return: true if resource group can be exclusive, false if there is overlap
1378 * with allocations of other resource groups and thus this resource group
1379 * cannot be exclusive.
1380 */
static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1382 {
1383 int closid = rdtgrp->closid;
1384 struct rdt_ctrl_domain *d;
1385 struct resctrl_schema *s;
1386 struct rdt_resource *r;
1387 bool has_cache = false;
1388 u32 ctrl;
1389
1390 /* Walking r->domains, ensure it can't race with cpuhp */
1391 lockdep_assert_cpus_held();
1392
1393 list_for_each_entry(s, &resctrl_schema_all, list) {
1394 r = s->res;
1395 if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
1396 continue;
1397 has_cache = true;
1398 list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1399 ctrl = resctrl_arch_get_config(r, d, closid,
1400 s->conf_type);
1401 if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
1402 rdt_last_cmd_puts("Schemata overlaps\n");
1403 return false;
1404 }
1405 }
1406 }
1407
1408 if (!has_cache) {
1409 rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1410 return false;
1411 }
1412
1413 return true;
1414 }
1415
1416 /*
1417 * rdtgroup_mode_write - Modify the resource group's mode
1418 */
static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
1421 {
1422 struct rdtgroup *rdtgrp;
1423 enum rdtgrp_mode mode;
1424 int ret = 0;
1425
1426 /* Valid input requires a trailing newline */
1427 if (nbytes == 0 || buf[nbytes - 1] != '\n')
1428 return -EINVAL;
1429 buf[nbytes - 1] = '\0';
1430
1431 rdtgrp = rdtgroup_kn_lock_live(of->kn);
1432 if (!rdtgrp) {
1433 rdtgroup_kn_unlock(of->kn);
1434 return -ENOENT;
1435 }
1436
1437 rdt_last_cmd_clear();
1438
1439 mode = rdtgrp->mode;
1440
1441 if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1442 (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1443 (!strcmp(buf, "pseudo-locksetup") &&
1444 mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1445 (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1446 goto out;
1447
1448 if (mode == RDT_MODE_PSEUDO_LOCKED) {
1449 rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1450 ret = -EINVAL;
1451 goto out;
1452 }
1453
1454 if (!strcmp(buf, "shareable")) {
1455 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1456 ret = rdtgroup_locksetup_exit(rdtgrp);
1457 if (ret)
1458 goto out;
1459 }
1460 rdtgrp->mode = RDT_MODE_SHAREABLE;
1461 } else if (!strcmp(buf, "exclusive")) {
1462 if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1463 ret = -EINVAL;
1464 goto out;
1465 }
1466 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1467 ret = rdtgroup_locksetup_exit(rdtgrp);
1468 if (ret)
1469 goto out;
1470 }
1471 rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1472 } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
1473 !strcmp(buf, "pseudo-locksetup")) {
1474 ret = rdtgroup_locksetup_enter(rdtgrp);
1475 if (ret)
1476 goto out;
1477 rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1478 } else {
1479 rdt_last_cmd_puts("Unknown or unsupported mode\n");
1480 ret = -EINVAL;
1481 }
1482
1483 out:
1484 rdtgroup_kn_unlock(of->kn);
1485 return ret ?: nbytes;
1486 }
1487
1488 /**
1489 * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1490 * @r: RDT resource to which @d belongs.
1491 * @d: RDT domain instance.
1492 * @cbm: bitmask for which the size should be computed.
1493 *
1494 * The bitmask provided associated with the RDT domain instance @d will be
1495 * translated into how many bytes it represents. The size in bytes is
1496 * computed by first dividing the total cache size by the CBM length to
1497 * determine how many bytes each bit in the bitmask represents. The result
1498 * is multiplied with the number of bits set in the bitmask.
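*
* For example (hypothetical numbers): a 32 MB cache with a 16-bit CBM
* gives 2 MB per bit, so a bitmask with four bits set maps to 8 MB.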
1499 *
* @cbm is unsigned long, even if only 32 bits are used, to make the
* bitmap functions work correctly.
1502 */
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
struct rdt_ctrl_domain *d, unsigned long cbm)
1505 {
1506 unsigned int size = 0;
1507 struct cacheinfo *ci;
1508 int num_b;
1509
1510 if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
1511 return size;
1512
1513 num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1514 ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
1515 if (ci)
1516 size = ci->size / r->cache.cbm_len * num_b;
1517
1518 return size;
1519 }
1520
bool is_mba_sc(struct rdt_resource *r)
1522 {
1523 if (!r)
1524 r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
1525
1526 /*
1527 * The software controller support is only applicable to MBA resource.
1528 * Make sure to check for resource type.
1529 */
1530 if (r->rid != RDT_RESOURCE_MBA)
1531 return false;
1532
1533 return r->membw.mba_sc;
1534 }
1535
1536 /*
1537 * rdtgroup_size_show - Display size in bytes of allocated regions
1538 *
1539 * The "size" file mirrors the layout of the "schemata" file, printing the
1540 * size in bytes of each region instead of the capacity bitmask.
1541 */
static int rdtgroup_size_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
1544 {
1545 struct resctrl_schema *schema;
1546 enum resctrl_conf_type type;
1547 struct rdt_ctrl_domain *d;
1548 struct rdtgroup *rdtgrp;
1549 struct rdt_resource *r;
1550 unsigned int size;
1551 int ret = 0;
1552 u32 closid;
1553 bool sep;
1554 u32 ctrl;
1555
1556 rdtgrp = rdtgroup_kn_lock_live(of->kn);
1557 if (!rdtgrp) {
1558 rdtgroup_kn_unlock(of->kn);
1559 return -ENOENT;
1560 }
1561
1562 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1563 if (!rdtgrp->plr->d) {
1564 rdt_last_cmd_clear();
1565 rdt_last_cmd_puts("Cache domain offline\n");
1566 ret = -ENODEV;
1567 } else {
1568 seq_printf(s, "%*s:", max_name_width,
1569 rdtgrp->plr->s->name);
1570 size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
1571 rdtgrp->plr->d,
1572 rdtgrp->plr->cbm);
1573 seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
1574 }
1575 goto out;
1576 }
1577
1578 closid = rdtgrp->closid;
1579
1580 list_for_each_entry(schema, &resctrl_schema_all, list) {
1581 r = schema->res;
1582 type = schema->conf_type;
1583 sep = false;
1584 seq_printf(s, "%*s:", max_name_width, schema->name);
1585 list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1586 if (sep)
1587 seq_putc(s, ';');
1588 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1589 size = 0;
1590 } else {
1591 if (is_mba_sc(r))
1592 ctrl = d->mbps_val[closid];
1593 else
1594 ctrl = resctrl_arch_get_config(r, d,
1595 closid,
1596 type);
1597 if (r->rid == RDT_RESOURCE_MBA ||
1598 r->rid == RDT_RESOURCE_SMBA)
1599 size = ctrl;
1600 else
1601 size = rdtgroup_cbm_to_size(r, d, ctrl);
1602 }
1603 seq_printf(s, "%d=%u", d->hdr.id, size);
1604 sep = true;
1605 }
1606 seq_putc(s, '\n');
1607 }
1608
1609 out:
1610 rdtgroup_kn_unlock(of->kn);
1611
1612 return ret;
1613 }
1614
static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
1616 {
1617 smp_call_function_any(&mon_info->d->hdr.cpu_mask,
1618 resctrl_arch_mon_event_config_read, mon_info, 1);
1619 }
1620
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
1622 {
1623 struct resctrl_mon_config_info mon_info;
1624 struct rdt_mon_domain *dom;
1625 bool sep = false;
1626
1627 cpus_read_lock();
1628 mutex_lock(&rdtgroup_mutex);
1629
1630 list_for_each_entry(dom, &r->mon_domains, hdr.list) {
1631 if (sep)
1632 seq_puts(s, ";");
1633
1634 memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
1635 mon_info.r = r;
1636 mon_info.d = dom;
1637 mon_info.evtid = evtid;
1638 mondata_config_read(&mon_info);
1639
1640 seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
1641 sep = true;
1642 }
1643 seq_puts(s, "\n");
1644
1645 mutex_unlock(&rdtgroup_mutex);
1646 cpus_read_unlock();
1647
1648 return 0;
1649 }
1650
static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1653 {
1654 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1655
1656 mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
1657
1658 return 0;
1659 }
1660
static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
1663 {
1664 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1665
1666 mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
1667
1668 return 0;
1669 }
1670
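/*
* Apply a new event configuration value to one monitoring domain, skipping
* the MSR write if the hardware already holds the requested value.
*/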
static void mbm_config_write_domain(struct rdt_resource *r,
struct rdt_mon_domain *d, u32 evtid, u32 val)
1673 {
1674 struct resctrl_mon_config_info mon_info = {0};
1675
1676 /*
1677 * Read the current config value first. If both are the same then
1678 * no need to write it again.
1679 */
1680 mon_info.r = r;
1681 mon_info.d = d;
1682 mon_info.evtid = evtid;
1683 mondata_config_read(&mon_info);
1684 if (mon_info.mon_config == val)
1685 return;
1686
1687 mon_info.mon_config = val;
1688
1689 /*
1690 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
* domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
1692 * are scoped at the domain level. Writing any of these MSRs
1693 * on one CPU is observed by all the CPUs in the domain.
1694 */
1695 smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
1696 &mon_info, 1);
1697
1698 /*
1699 * When an Event Configuration is changed, the bandwidth counters
1700 * for all RMIDs and Events will be cleared by the hardware. The
1701 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
1702 * every RMID on the next read to any event for every RMID.
1703 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
1704 * cleared while it is tracked by the hardware. Clear the
1705 * mbm_local and mbm_total counts for all the RMIDs.
1706 */
1707 resctrl_arch_reset_rmid_all(r, d);
1708 }
1709
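/*
* Parse an event configuration string of the form "<domain>=<value>[;...]"
* and apply each value to the matching monitoring domain.
*/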
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
1711 {
1712 char *dom_str = NULL, *id_str;
1713 unsigned long dom_id, val;
1714 struct rdt_mon_domain *d;
1715
1716 /* Walking r->domains, ensure it can't race with cpuhp */
1717 lockdep_assert_cpus_held();
1718
1719 next:
1720 if (!tok || tok[0] == '\0')
1721 return 0;
1722
1723 /* Start processing the strings for each domain */
1724 dom_str = strim(strsep(&tok, ";"));
1725 id_str = strsep(&dom_str, "=");
1726
1727 if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
1728 rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
1729 return -EINVAL;
1730 }
1731
1732 if (!dom_str || kstrtoul(dom_str, 16, &val)) {
1733 rdt_last_cmd_puts("Non-numeric event configuration value\n");
1734 return -EINVAL;
1735 }
1736
1737 /* Value from user cannot be more than the supported set of events */
1738 if ((val & r->mbm_cfg_mask) != val) {
1739 rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
1740 r->mbm_cfg_mask);
1741 return -EINVAL;
1742 }
1743
1744 list_for_each_entry(d, &r->mon_domains, hdr.list) {
1745 if (d->hdr.id == dom_id) {
1746 mbm_config_write_domain(r, d, evtid, val);
1747 goto next;
1748 }
1749 }
1750
1751 return -EINVAL;
1752 }
1753
static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
1757 {
1758 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1759 int ret;
1760
1761 /* Valid input requires a trailing newline */
1762 if (nbytes == 0 || buf[nbytes - 1] != '\n')
1763 return -EINVAL;
1764
1765 cpus_read_lock();
1766 mutex_lock(&rdtgroup_mutex);
1767
1768 rdt_last_cmd_clear();
1769
1770 buf[nbytes - 1] = '\0';
1771
1772 ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
1773
1774 mutex_unlock(&rdtgroup_mutex);
1775 cpus_read_unlock();
1776
1777 return ret ?: nbytes;
1778 }
1779
static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
1783 {
1784 struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1785 int ret;
1786
1787 /* Valid input requires a trailing newline */
1788 if (nbytes == 0 || buf[nbytes - 1] != '\n')
1789 return -EINVAL;
1790
1791 cpus_read_lock();
1792 mutex_lock(&rdtgroup_mutex);
1793
1794 rdt_last_cmd_clear();
1795
1796 buf[nbytes - 1] = '\0';
1797
1798 ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
1799
1800 mutex_unlock(&rdtgroup_mutex);
1801 cpus_read_unlock();
1802
1803 return ret ?: nbytes;
1804 }
1805
1806 /* rdtgroup information files for one cache resource. */
1807 static struct rftype res_common_files[] = {
1808 {
1809 .name = "last_cmd_status",
1810 .mode = 0444,
1811 .kf_ops = &rdtgroup_kf_single_ops,
1812 .seq_show = rdt_last_cmd_status_show,
1813 .fflags = RFTYPE_TOP_INFO,
1814 },
1815 {
1816 .name = "num_closids",
1817 .mode = 0444,
1818 .kf_ops = &rdtgroup_kf_single_ops,
1819 .seq_show = rdt_num_closids_show,
1820 .fflags = RFTYPE_CTRL_INFO,
1821 },
1822 {
1823 .name = "mon_features",
1824 .mode = 0444,
1825 .kf_ops = &rdtgroup_kf_single_ops,
1826 .seq_show = rdt_mon_features_show,
1827 .fflags = RFTYPE_MON_INFO,
1828 },
1829 {
1830 .name = "num_rmids",
1831 .mode = 0444,
1832 .kf_ops = &rdtgroup_kf_single_ops,
1833 .seq_show = rdt_num_rmids_show,
1834 .fflags = RFTYPE_MON_INFO,
1835 },
1836 {
1837 .name = "cbm_mask",
1838 .mode = 0444,
1839 .kf_ops = &rdtgroup_kf_single_ops,
1840 .seq_show = rdt_default_ctrl_show,
1841 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1842 },
1843 {
1844 .name = "min_cbm_bits",
1845 .mode = 0444,
1846 .kf_ops = &rdtgroup_kf_single_ops,
1847 .seq_show = rdt_min_cbm_bits_show,
1848 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1849 },
1850 {
1851 .name = "shareable_bits",
1852 .mode = 0444,
1853 .kf_ops = &rdtgroup_kf_single_ops,
1854 .seq_show = rdt_shareable_bits_show,
1855 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1856 },
1857 {
1858 .name = "bit_usage",
1859 .mode = 0444,
1860 .kf_ops = &rdtgroup_kf_single_ops,
1861 .seq_show = rdt_bit_usage_show,
1862 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1863 },
1864 {
1865 .name = "min_bandwidth",
1866 .mode = 0444,
1867 .kf_ops = &rdtgroup_kf_single_ops,
1868 .seq_show = rdt_min_bw_show,
1869 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1870 },
1871 {
1872 .name = "bandwidth_gran",
1873 .mode = 0444,
1874 .kf_ops = &rdtgroup_kf_single_ops,
1875 .seq_show = rdt_bw_gran_show,
1876 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1877 },
1878 {
1879 .name = "delay_linear",
1880 .mode = 0444,
1881 .kf_ops = &rdtgroup_kf_single_ops,
1882 .seq_show = rdt_delay_linear_show,
1883 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1884 },
1885 /*
1886 	 * It is platform specific which (if any) capabilities are provided
1887 	 * by thread_throttle_mode. Defer "fflags" initialization until
1888 	 * platform discovery.
1889 */
1890 {
1891 .name = "thread_throttle_mode",
1892 .mode = 0444,
1893 .kf_ops = &rdtgroup_kf_single_ops,
1894 .seq_show = rdt_thread_throttle_mode_show,
1895 },
1896 {
1897 .name = "max_threshold_occupancy",
1898 .mode = 0644,
1899 .kf_ops = &rdtgroup_kf_single_ops,
1900 .write = max_threshold_occ_write,
1901 .seq_show = max_threshold_occ_show,
1902 .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
1903 },
1904 {
1905 .name = "mbm_total_bytes_config",
1906 .mode = 0644,
1907 .kf_ops = &rdtgroup_kf_single_ops,
1908 .seq_show = mbm_total_bytes_config_show,
1909 .write = mbm_total_bytes_config_write,
1910 },
1911 {
1912 .name = "mbm_local_bytes_config",
1913 .mode = 0644,
1914 .kf_ops = &rdtgroup_kf_single_ops,
1915 .seq_show = mbm_local_bytes_config_show,
1916 .write = mbm_local_bytes_config_write,
1917 },
1918 {
1919 .name = "cpus",
1920 .mode = 0644,
1921 .kf_ops = &rdtgroup_kf_single_ops,
1922 .write = rdtgroup_cpus_write,
1923 .seq_show = rdtgroup_cpus_show,
1924 .fflags = RFTYPE_BASE,
1925 },
1926 {
1927 .name = "cpus_list",
1928 .mode = 0644,
1929 .kf_ops = &rdtgroup_kf_single_ops,
1930 .write = rdtgroup_cpus_write,
1931 .seq_show = rdtgroup_cpus_show,
1932 .flags = RFTYPE_FLAGS_CPUS_LIST,
1933 .fflags = RFTYPE_BASE,
1934 },
1935 {
1936 .name = "tasks",
1937 .mode = 0644,
1938 .kf_ops = &rdtgroup_kf_single_ops,
1939 .write = rdtgroup_tasks_write,
1940 .seq_show = rdtgroup_tasks_show,
1941 .fflags = RFTYPE_BASE,
1942 },
1943 {
1944 .name = "mon_hw_id",
1945 .mode = 0444,
1946 .kf_ops = &rdtgroup_kf_single_ops,
1947 .seq_show = rdtgroup_rmid_show,
1948 .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
1949 },
1950 {
1951 .name = "schemata",
1952 .mode = 0644,
1953 .kf_ops = &rdtgroup_kf_single_ops,
1954 .write = rdtgroup_schemata_write,
1955 .seq_show = rdtgroup_schemata_show,
1956 .fflags = RFTYPE_CTRL_BASE,
1957 },
1958 {
1959 .name = "mba_MBps_event",
1960 .mode = 0644,
1961 .kf_ops = &rdtgroup_kf_single_ops,
1962 .write = rdtgroup_mba_mbps_event_write,
1963 .seq_show = rdtgroup_mba_mbps_event_show,
1964 },
1965 {
1966 .name = "mode",
1967 .mode = 0644,
1968 .kf_ops = &rdtgroup_kf_single_ops,
1969 .write = rdtgroup_mode_write,
1970 .seq_show = rdtgroup_mode_show,
1971 .fflags = RFTYPE_CTRL_BASE,
1972 },
1973 {
1974 .name = "size",
1975 .mode = 0444,
1976 .kf_ops = &rdtgroup_kf_single_ops,
1977 .seq_show = rdtgroup_size_show,
1978 .fflags = RFTYPE_CTRL_BASE,
1979 },
1980 {
1981 .name = "sparse_masks",
1982 .mode = 0444,
1983 .kf_ops = &rdtgroup_kf_single_ops,
1984 .seq_show = rdt_has_sparse_bitmasks_show,
1985 .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1986 },
1987 {
1988 .name = "ctrl_hw_id",
1989 .mode = 0444,
1990 .kf_ops = &rdtgroup_kf_single_ops,
1991 .seq_show = rdtgroup_closid_show,
1992 .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
1993 },
1994 };
1995
1996 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1997 {
1998 struct rftype *rfts, *rft;
1999 int ret, len;
2000
2001 rfts = res_common_files;
2002 len = ARRAY_SIZE(res_common_files);
2003
2004 lockdep_assert_held(&rdtgroup_mutex);
2005
2006 if (resctrl_debug)
2007 fflags |= RFTYPE_DEBUG;
2008
2009 for (rft = rfts; rft < rfts + len; rft++) {
2010 if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
2011 ret = rdtgroup_add_file(kn, rft);
2012 if (ret)
2013 goto error;
2014 }
2015 }
2016
2017 return 0;
2018 error:
2019 pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
2020 while (--rft >= rfts) {
2021 if ((fflags & rft->fflags) == rft->fflags)
2022 kernfs_remove_by_name(kn, rft->name);
2023 }
2024 return ret;
2025 }
2026
2027 static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
2028 {
2029 struct rftype *rfts, *rft;
2030 int len;
2031
2032 rfts = res_common_files;
2033 len = ARRAY_SIZE(res_common_files);
2034
2035 for (rft = rfts; rft < rfts + len; rft++) {
2036 if (!strcmp(rft->name, name))
2037 return rft;
2038 }
2039
2040 return NULL;
2041 }
2042
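/*
 * Enable the "thread_throttle_mode" info file only if MBA or SMBA is
 * alloc capable and reports a defined throttle mode.
 */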
2043 static void thread_throttle_mode_init(void)
2044 {
2045 enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
2046 struct rdt_resource *r_mba, *r_smba;
2047
2048 r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2049 if (r_mba->alloc_capable &&
2050 r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2051 throttle_mode = r_mba->membw.throttle_mode;
2052
2053 r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
2054 if (r_smba->alloc_capable &&
2055 r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2056 throttle_mode = r_smba->membw.throttle_mode;
2057
2058 if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
2059 return;
2060
2061 resctrl_file_fflags_init("thread_throttle_mode",
2062 RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
2063 }
2064
2065 void resctrl_file_fflags_init(const char *config, unsigned long fflags)
2066 {
2067 struct rftype *rft;
2068
2069 rft = rdtgroup_get_rftype_by_name(config);
2070 if (rft)
2071 rft->fflags = fflags;
2072 }
2073
2074 /**
2075 * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
2076 * @r: The resource group with which the file is associated.
2077 * @name: Name of the file
2078 *
2079  * The permissions of the named resctrl file, directory, or link are modified
2080 * to not allow read, write, or execute by any user.
2081 *
2082 * WARNING: This function is intended to communicate to the user that the
2083 * resctrl file has been locked down - that it is not relevant to the
2084 * particular state the system finds itself in. It should not be relied
2085 * on to protect from user access because after the file's permissions
2086 * are restricted the user can still change the permissions using chmod
2087 * from the command line.
2088 *
2089 * Return: 0 on success, <0 on failure.
2090 */
2091 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
2092 {
2093 struct iattr iattr = {.ia_valid = ATTR_MODE,};
2094 struct kernfs_node *kn;
2095 int ret = 0;
2096
2097 kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2098 if (!kn)
2099 return -ENOENT;
2100
2101 switch (kernfs_type(kn)) {
2102 case KERNFS_DIR:
2103 iattr.ia_mode = S_IFDIR;
2104 break;
2105 case KERNFS_FILE:
2106 iattr.ia_mode = S_IFREG;
2107 break;
2108 case KERNFS_LINK:
2109 iattr.ia_mode = S_IFLNK;
2110 break;
2111 }
2112
2113 ret = kernfs_setattr(kn, &iattr);
2114 kernfs_put(kn);
2115 return ret;
2116 }
2117
2118 /**
2119 * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
2120 * @r: The resource group with which the file is associated.
2121 * @name: Name of the file
2122 * @mask: Mask of permissions that should be restored
2123 *
2124 * Restore the permissions of the named file. If @name is a directory the
2125 * permissions of its parent will be used.
2126 *
2127 * Return: 0 on success, <0 on failure.
2128 */
2129 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
2130 umode_t mask)
2131 {
2132 struct iattr iattr = {.ia_valid = ATTR_MODE,};
2133 struct kernfs_node *kn, *parent;
2134 struct rftype *rfts, *rft;
2135 int ret, len;
2136
2137 rfts = res_common_files;
2138 len = ARRAY_SIZE(res_common_files);
2139
2140 for (rft = rfts; rft < rfts + len; rft++) {
2141 if (!strcmp(rft->name, name))
2142 iattr.ia_mode = rft->mode & mask;
2143 }
2144
2145 kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2146 if (!kn)
2147 return -ENOENT;
2148
2149 switch (kernfs_type(kn)) {
2150 case KERNFS_DIR:
2151 parent = kernfs_get_parent(kn);
2152 if (parent) {
2153 iattr.ia_mode |= parent->mode;
2154 kernfs_put(parent);
2155 }
2156 iattr.ia_mode |= S_IFDIR;
2157 break;
2158 case KERNFS_FILE:
2159 iattr.ia_mode |= S_IFREG;
2160 break;
2161 case KERNFS_LINK:
2162 iattr.ia_mode |= S_IFLNK;
2163 break;
2164 }
2165
2166 ret = kernfs_setattr(kn, &iattr);
2167 kernfs_put(kn);
2168 return ret;
2169 }
2170
2171 static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
2172 unsigned long fflags)
2173 {
2174 struct kernfs_node *kn_subdir;
2175 int ret;
2176
2177 kn_subdir = kernfs_create_dir(kn_info, name,
2178 kn_info->mode, priv);
2179 if (IS_ERR(kn_subdir))
2180 return PTR_ERR(kn_subdir);
2181
2182 ret = rdtgroup_kn_set_ugid(kn_subdir);
2183 if (ret)
2184 return ret;
2185
2186 ret = rdtgroup_add_files(kn_subdir, fflags);
2187 if (!ret)
2188 kernfs_activate(kn_subdir);
2189
2190 return ret;
2191 }
2192
2193 static unsigned long fflags_from_resource(struct rdt_resource *r)
2194 {
2195 switch (r->rid) {
2196 case RDT_RESOURCE_L3:
2197 case RDT_RESOURCE_L2:
2198 return RFTYPE_RES_CACHE;
2199 case RDT_RESOURCE_MBA:
2200 case RDT_RESOURCE_SMBA:
2201 return RFTYPE_RES_MB;
2202 }
2203
2204 return WARN_ON_ONCE(1);
2205 }
2206
2207 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
2208 {
2209 struct resctrl_schema *s;
2210 struct rdt_resource *r;
2211 unsigned long fflags;
2212 char name[32];
2213 int ret;
2214
2215 /* create the directory */
2216 kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
2217 if (IS_ERR(kn_info))
2218 return PTR_ERR(kn_info);
2219
2220 ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
2221 if (ret)
2222 goto out_destroy;
2223
2224 /* loop over enabled controls, these are all alloc_capable */
2225 list_for_each_entry(s, &resctrl_schema_all, list) {
2226 r = s->res;
2227 fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
2228 ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
2229 if (ret)
2230 goto out_destroy;
2231 }
2232
2233 for_each_mon_capable_rdt_resource(r) {
2234 fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
2235 sprintf(name, "%s_MON", r->name);
2236 ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
2237 if (ret)
2238 goto out_destroy;
2239 }
2240
2241 ret = rdtgroup_kn_set_ugid(kn_info);
2242 if (ret)
2243 goto out_destroy;
2244
2245 kernfs_activate(kn_info);
2246
2247 return 0;
2248
2249 out_destroy:
2250 kernfs_remove(kn_info);
2251 return ret;
2252 }
2253
2254 static int
2255 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
2256 char *name, struct kernfs_node **dest_kn)
2257 {
2258 struct kernfs_node *kn;
2259 int ret;
2260
2261 /* create the directory */
2262 kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2263 if (IS_ERR(kn))
2264 return PTR_ERR(kn);
2265
2266 if (dest_kn)
2267 *dest_kn = kn;
2268
2269 ret = rdtgroup_kn_set_ugid(kn);
2270 if (ret)
2271 goto out_destroy;
2272
2273 kernfs_activate(kn);
2274
2275 return 0;
2276
2277 out_destroy:
2278 kernfs_remove(kn);
2279 return ret;
2280 }
2281
2282 static inline bool is_mba_linear(void)
2283 {
2284 return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
2285 }
2286
2287 static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
2288 {
2289 u32 num_closid = resctrl_arch_get_num_closid(r);
2290 int cpu = cpumask_any(&d->hdr.cpu_mask);
2291 int i;
2292
2293 d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
2294 GFP_KERNEL, cpu_to_node(cpu));
2295 if (!d->mbps_val)
2296 return -ENOMEM;
2297
2298 for (i = 0; i < num_closid; i++)
2299 d->mbps_val[i] = MBA_MAX_MBPS;
2300
2301 return 0;
2302 }
2303
2304 static void mba_sc_domain_destroy(struct rdt_resource *r,
2305 struct rdt_ctrl_domain *d)
2306 {
2307 kfree(d->mbps_val);
2308 d->mbps_val = NULL;
2309 }
2310
2311 /*
2312 * MBA software controller is supported only if
2313 * MBM is supported and MBA is in linear scale,
2314 * and the MBM monitor scope is the same as MBA
2315 * control scope.
2316 */
2317 static bool supports_mba_mbps(void)
2318 {
2319 struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2320 struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2321
2322 return (resctrl_is_mbm_enabled() &&
2323 r->alloc_capable && is_mba_linear() &&
2324 r->ctrl_scope == rmbm->mon_scope);
2325 }
2326
2327 /*
2328 * Enable or disable the MBA software controller
2329 * which helps user specify bandwidth in MBps.
2330 */
2331 static int set_mba_sc(bool mba_sc)
2332 {
2333 struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2334 u32 num_closid = resctrl_arch_get_num_closid(r);
2335 struct rdt_ctrl_domain *d;
2336 unsigned long fflags;
2337 int i;
2338
2339 if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
2340 return -EINVAL;
2341
2342 r->membw.mba_sc = mba_sc;
2343
2344 rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
2345
2346 list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2347 for (i = 0; i < num_closid; i++)
2348 d->mbps_val[i] = MBA_MAX_MBPS;
2349 }
2350
2351 fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
2352 resctrl_file_fflags_init("mba_MBps_event", fflags);
2353
2354 return 0;
2355 }
2356
2357 /*
2358 * We don't allow rdtgroup directories to be created anywhere
2359 * except the root directory. Thus when looking for the rdtgroup
2360 * structure for a kernfs node we are either looking at a directory,
2361 * in which case the rdtgroup structure is pointed at by the "priv"
2362  * field, or at a file, in which case we need only look to the parent
2363 * to find the rdtgroup.
2364 */
2365 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2366 {
2367 if (kernfs_type(kn) == KERNFS_DIR) {
2368 /*
2369 * All the resource directories use "kn->priv"
2370 * to point to the "struct rdtgroup" for the
2371 * resource. "info" and its subdirectories don't
2372 * have rdtgroup structures, so return NULL here.
2373 */
2374 if (kn == kn_info ||
2375 rcu_access_pointer(kn->__parent) == kn_info)
2376 return NULL;
2377 else
2378 return kn->priv;
2379 } else {
2380 return rdt_kn_parent_priv(kn);
2381 }
2382 }
2383
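/*
 * Bump the group's waitcount and drop kernfs' active protection on @kn so
 * the group may be deleted (e.g. by rmdir) while the caller sleeps on
 * rdtgroup_mutex. rdtgroup_kn_put() undoes this and frees the group if it
 * was deleted in the meantime.
 */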
2384 static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2385 {
2386 atomic_inc(&rdtgrp->waitcount);
2387 kernfs_break_active_protection(kn);
2388 }
2389
2390 static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2391 {
2392 if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2393 (rdtgrp->flags & RDT_DELETED)) {
2394 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2395 rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2396 rdtgroup_pseudo_lock_remove(rdtgrp);
2397 kernfs_unbreak_active_protection(kn);
2398 rdtgroup_remove(rdtgrp);
2399 } else {
2400 kernfs_unbreak_active_protection(kn);
2401 }
2402 }
2403
2404 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2405 {
2406 struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2407
2408 if (!rdtgrp)
2409 return NULL;
2410
2411 rdtgroup_kn_get(rdtgrp, kn);
2412
2413 cpus_read_lock();
2414 mutex_lock(&rdtgroup_mutex);
2415
2416 /* Was this group deleted while we waited? */
2417 if (rdtgrp->flags & RDT_DELETED)
2418 return NULL;
2419
2420 return rdtgrp;
2421 }
2422
2423 void rdtgroup_kn_unlock(struct kernfs_node *kn)
2424 {
2425 struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2426
2427 if (!rdtgrp)
2428 return;
2429
2430 mutex_unlock(&rdtgroup_mutex);
2431 cpus_read_unlock();
2432
2433 rdtgroup_kn_put(rdtgrp, kn);
2434 }
2435
2436 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2437 struct rdtgroup *prgrp,
2438 struct kernfs_node **mon_data_kn);
2439
2440 static void rdt_disable_ctx(void)
2441 {
2442 resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2443 resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2444 set_mba_sc(false);
2445
2446 resctrl_debug = false;
2447 }
2448
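/*
 * Enable the features requested via mount options, in order. On failure,
 * unwind the already enabled features in reverse order.
 */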
2449 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2450 {
2451 int ret = 0;
2452
2453 if (ctx->enable_cdpl2) {
2454 ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
2455 if (ret)
2456 goto out_done;
2457 }
2458
2459 if (ctx->enable_cdpl3) {
2460 ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
2461 if (ret)
2462 goto out_cdpl2;
2463 }
2464
2465 if (ctx->enable_mba_mbps) {
2466 ret = set_mba_sc(true);
2467 if (ret)
2468 goto out_cdpl3;
2469 }
2470
2471 if (ctx->enable_debug)
2472 resctrl_debug = true;
2473
2474 return 0;
2475
2476 out_cdpl3:
2477 resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2478 out_cdpl2:
2479 resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2480 out_done:
2481 return ret;
2482 }
2483
2484 static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
2485 {
2486 struct resctrl_schema *s;
2487 const char *suffix = "";
2488 int ret, cl;
2489
2490 s = kzalloc(sizeof(*s), GFP_KERNEL);
2491 if (!s)
2492 return -ENOMEM;
2493
2494 s->res = r;
2495 s->num_closid = resctrl_arch_get_num_closid(r);
2496 if (resctrl_arch_get_cdp_enabled(r->rid))
2497 s->num_closid /= 2;
2498
2499 s->conf_type = type;
2500 switch (type) {
2501 case CDP_CODE:
2502 suffix = "CODE";
2503 break;
2504 case CDP_DATA:
2505 suffix = "DATA";
2506 break;
2507 case CDP_NONE:
2508 suffix = "";
2509 break;
2510 }
2511
2512 ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
2513 if (ret >= sizeof(s->name)) {
2514 kfree(s);
2515 return -EINVAL;
2516 }
2517
2518 cl = strlen(s->name);
2519
2520 /*
2521 * If CDP is supported by this resource, but not enabled,
2522 * include the suffix. This ensures the tabular format of the
2523 * schemata file does not change between mounts of the filesystem.
2524 */
2525 if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
2526 cl += 4;
2527
2528 if (cl > max_name_width)
2529 max_name_width = cl;
2530
2531 switch (r->schema_fmt) {
2532 case RESCTRL_SCHEMA_BITMAP:
2533 s->fmt_str = "%d=%x";
2534 break;
2535 case RESCTRL_SCHEMA_RANGE:
2536 s->fmt_str = "%d=%u";
2537 break;
2538 }
2539
2540 if (WARN_ON_ONCE(!s->fmt_str)) {
2541 kfree(s);
2542 return -EINVAL;
2543 }
2544
2545 INIT_LIST_HEAD(&s->list);
2546 list_add(&s->list, &resctrl_schema_all);
2547
2548 return 0;
2549 }
2550
2551 static int schemata_list_create(void)
2552 {
2553 struct rdt_resource *r;
2554 int ret = 0;
2555
2556 for_each_alloc_capable_rdt_resource(r) {
2557 if (resctrl_arch_get_cdp_enabled(r->rid)) {
2558 ret = schemata_list_add(r, CDP_CODE);
2559 if (ret)
2560 break;
2561
2562 ret = schemata_list_add(r, CDP_DATA);
2563 } else {
2564 ret = schemata_list_add(r, CDP_NONE);
2565 }
2566
2567 if (ret)
2568 break;
2569 }
2570
2571 return ret;
2572 }
2573
2574 static void schemata_list_destroy(void)
2575 {
2576 struct resctrl_schema *s, *tmp;
2577
2578 list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
2579 list_del(&s->list);
2580 kfree(s);
2581 }
2582 }
2583
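/*
 * Mount the resctrl filesystem: set up the root directory and default
 * group, apply the mount options, build the schemata list and "info"
 * directory, create the monitoring directories when supported and, once
 * everything succeeded, mark the filesystem mounted and start the MBM
 * overflow handlers.
 */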
2584 static int rdt_get_tree(struct fs_context *fc)
2585 {
2586 struct rdt_fs_context *ctx = rdt_fc2context(fc);
2587 unsigned long flags = RFTYPE_CTRL_BASE;
2588 struct rdt_mon_domain *dom;
2589 struct rdt_resource *r;
2590 int ret;
2591
2592 cpus_read_lock();
2593 mutex_lock(&rdtgroup_mutex);
2594 /*
2595 * resctrl file system can only be mounted once.
2596 */
2597 if (resctrl_mounted) {
2598 ret = -EBUSY;
2599 goto out;
2600 }
2601
2602 ret = rdtgroup_setup_root(ctx);
2603 if (ret)
2604 goto out;
2605
2606 ret = rdt_enable_ctx(ctx);
2607 if (ret)
2608 goto out_root;
2609
2610 ret = schemata_list_create();
2611 if (ret) {
2612 schemata_list_destroy();
2613 goto out_ctx;
2614 }
2615
2616 ret = closid_init();
2617 if (ret)
2618 goto out_schemata_free;
2619
2620 if (resctrl_arch_mon_capable())
2621 flags |= RFTYPE_MON;
2622
2623 ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
2624 if (ret)
2625 goto out_closid_exit;
2626
2627 kernfs_activate(rdtgroup_default.kn);
2628
2629 ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2630 if (ret < 0)
2631 goto out_closid_exit;
2632
2633 if (resctrl_arch_mon_capable()) {
2634 ret = mongroup_create_dir(rdtgroup_default.kn,
2635 &rdtgroup_default, "mon_groups",
2636 &kn_mongrp);
2637 if (ret < 0)
2638 goto out_info;
2639
2640 ret = mkdir_mondata_all(rdtgroup_default.kn,
2641 &rdtgroup_default, &kn_mondata);
2642 if (ret < 0)
2643 goto out_mongrp;
2644 rdtgroup_default.mon.mon_data_kn = kn_mondata;
2645 }
2646
2647 ret = rdt_pseudo_lock_init();
2648 if (ret)
2649 goto out_mondata;
2650
2651 ret = kernfs_get_tree(fc);
2652 if (ret < 0)
2653 goto out_psl;
2654
2655 if (resctrl_arch_alloc_capable())
2656 resctrl_arch_enable_alloc();
2657 if (resctrl_arch_mon_capable())
2658 resctrl_arch_enable_mon();
2659
2660 if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
2661 resctrl_mounted = true;
2662
2663 if (resctrl_is_mbm_enabled()) {
2664 r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2665 list_for_each_entry(dom, &r->mon_domains, hdr.list)
2666 mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
2667 RESCTRL_PICK_ANY_CPU);
2668 }
2669
2670 goto out;
2671
2672 out_psl:
2673 rdt_pseudo_lock_release();
2674 out_mondata:
2675 if (resctrl_arch_mon_capable())
2676 kernfs_remove(kn_mondata);
2677 out_mongrp:
2678 if (resctrl_arch_mon_capable())
2679 kernfs_remove(kn_mongrp);
2680 out_info:
2681 kernfs_remove(kn_info);
2682 out_closid_exit:
2683 closid_exit();
2684 out_schemata_free:
2685 schemata_list_destroy();
2686 out_ctx:
2687 rdt_disable_ctx();
2688 out_root:
2689 rdtgroup_destroy_root();
2690 out:
2691 rdt_last_cmd_clear();
2692 mutex_unlock(&rdtgroup_mutex);
2693 cpus_read_unlock();
2694 return ret;
2695 }
2696
2697 enum rdt_param {
2698 Opt_cdp,
2699 Opt_cdpl2,
2700 Opt_mba_mbps,
2701 Opt_debug,
2702 nr__rdt_params
2703 };
2704
2705 static const struct fs_parameter_spec rdt_fs_parameters[] = {
2706 fsparam_flag("cdp", Opt_cdp),
2707 fsparam_flag("cdpl2", Opt_cdpl2),
2708 fsparam_flag("mba_MBps", Opt_mba_mbps),
2709 fsparam_flag("debug", Opt_debug),
2710 {}
2711 };
2712
2713 static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2714 {
2715 struct rdt_fs_context *ctx = rdt_fc2context(fc);
2716 struct fs_parse_result result;
2717 const char *msg;
2718 int opt;
2719
2720 opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2721 if (opt < 0)
2722 return opt;
2723
2724 switch (opt) {
2725 case Opt_cdp:
2726 ctx->enable_cdpl3 = true;
2727 return 0;
2728 case Opt_cdpl2:
2729 ctx->enable_cdpl2 = true;
2730 return 0;
2731 case Opt_mba_mbps:
2732 msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
2733 if (!supports_mba_mbps())
2734 return invalfc(fc, msg);
2735 ctx->enable_mba_mbps = true;
2736 return 0;
2737 case Opt_debug:
2738 ctx->enable_debug = true;
2739 return 0;
2740 }
2741
2742 return -EINVAL;
2743 }
2744
2745 static void rdt_fs_context_free(struct fs_context *fc)
2746 {
2747 struct rdt_fs_context *ctx = rdt_fc2context(fc);
2748
2749 kernfs_free_fs_context(fc);
2750 kfree(ctx);
2751 }
2752
2753 static const struct fs_context_operations rdt_fs_context_ops = {
2754 .free = rdt_fs_context_free,
2755 .parse_param = rdt_parse_param,
2756 .get_tree = rdt_get_tree,
2757 };
2758
2759 static int rdt_init_fs_context(struct fs_context *fc)
2760 {
2761 struct rdt_fs_context *ctx;
2762
2763 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2764 if (!ctx)
2765 return -ENOMEM;
2766
2767 ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2768 fc->fs_private = &ctx->kfc;
2769 fc->ops = &rdt_fs_context_ops;
2770 put_user_ns(fc->user_ns);
2771 fc->user_ns = get_user_ns(&init_user_ns);
2772 fc->global = true;
2773 return 0;
2774 }
2775
2776 /*
2777 * Move tasks from one to the other group. If @from is NULL, then all tasks
2778 * in the systems are moved unconditionally (used for teardown).
2779 *
2780  * If @mask is not NULL, the CPUs on which moved tasks are running are set
2781  * in that mask so that the SMP update function call is restricted to the
2782  * affected CPUs.
2783 */
2784 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2785 struct cpumask *mask)
2786 {
2787 struct task_struct *p, *t;
2788
2789 read_lock(&tasklist_lock);
2790 for_each_process_thread(p, t) {
2791 if (!from || is_closid_match(t, from) ||
2792 is_rmid_match(t, from)) {
2793 resctrl_arch_set_closid_rmid(t, to->closid,
2794 to->mon.rmid);
2795
2796 /*
2797 * Order the closid/rmid stores above before the loads
2798 * in task_curr(). This pairs with the full barrier
2799 * between the rq->curr update and
2800 * resctrl_arch_sched_in() during context switch.
2801 */
2802 smp_mb();
2803
2804 /*
2805 * If the task is on a CPU, set the CPU in the mask.
2806 * The detection is inaccurate as tasks might move or
2807 * schedule before the smp function call takes place.
2808 * In such a case the function call is pointless, but
2809 * there is no other side effect.
2810 */
2811 if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
2812 cpumask_set_cpu(task_cpu(t), mask);
2813 }
2814 }
2815 read_unlock(&tasklist_lock);
2816 }
2817
2818 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2819 {
2820 struct rdtgroup *sentry, *stmp;
2821 struct list_head *head;
2822
2823 head = &rdtgrp->mon.crdtgrp_list;
2824 list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2825 free_rmid(sentry->closid, sentry->mon.rmid);
2826 list_del(&sentry->mon.crdtgrp_list);
2827
2828 if (atomic_read(&sentry->waitcount) != 0)
2829 sentry->flags = RDT_DELETED;
2830 else
2831 rdtgroup_remove(sentry);
2832 }
2833 }
2834
2835 /*
2836  * Forcibly remove all subdirectories under the root directory.
2837 */
2838 static void rmdir_all_sub(void)
2839 {
2840 struct rdtgroup *rdtgrp, *tmp;
2841
2842 /* Move all tasks to the default resource group */
2843 rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2844
2845 list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2846 /* Free any child rmids */
2847 free_all_child_rdtgrp(rdtgrp);
2848
2849 /* Remove each rdtgroup other than root */
2850 if (rdtgrp == &rdtgroup_default)
2851 continue;
2852
2853 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2854 rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2855 rdtgroup_pseudo_lock_remove(rdtgrp);
2856
2857 /*
2858 * Give any CPUs back to the default group. We cannot copy
2859 * cpu_online_mask because a CPU might have executed the
2860 * offline callback already, but is still marked online.
2861 */
2862 cpumask_or(&rdtgroup_default.cpu_mask,
2863 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2864
2865 free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
2866
2867 kernfs_remove(rdtgrp->kn);
2868 list_del(&rdtgrp->rdtgroup_list);
2869
2870 if (atomic_read(&rdtgrp->waitcount) != 0)
2871 rdtgrp->flags = RDT_DELETED;
2872 else
2873 rdtgroup_remove(rdtgrp);
2874 }
2875 /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2876 update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2877
2878 kernfs_remove(kn_info);
2879 kernfs_remove(kn_mongrp);
2880 kernfs_remove(kn_mondata);
2881 }
2882
2883 /**
2884 * mon_get_kn_priv() - Get the mon_data priv data for this event.
2885 *
2886 * The same values are used across the mon_data directories of all control and
2887 * monitor groups for the same event in the same domain. Keep a list of
2888 * allocated structures and re-use an existing one with the same values for
2889 * @rid, @domid, etc.
2890 *
2891 * @rid: The resource id for the event file being created.
2892 * @domid: The domain id for the event file being created.
2893 * @mevt: The type of event file being created.
2894 * @do_sum: Whether SNC summing monitors are being created.
2895 */
2896 static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
2897 struct mon_evt *mevt,
2898 bool do_sum)
2899 {
2900 struct mon_data *priv;
2901
2902 lockdep_assert_held(&rdtgroup_mutex);
2903
2904 list_for_each_entry(priv, &mon_data_kn_priv_list, list) {
2905 if (priv->rid == rid && priv->domid == domid &&
2906 priv->sum == do_sum && priv->evtid == mevt->evtid)
2907 return priv;
2908 }
2909
2910 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2911 if (!priv)
2912 return NULL;
2913
2914 priv->rid = rid;
2915 priv->domid = domid;
2916 priv->sum = do_sum;
2917 priv->evtid = mevt->evtid;
2918 list_add_tail(&priv->list, &mon_data_kn_priv_list);
2919
2920 return priv;
2921 }
2922
2923 /**
2924 * mon_put_kn_priv() - Free all allocated mon_data structures.
2925 *
2926 * Called when resctrl file system is unmounted.
2927 */
2928 static void mon_put_kn_priv(void)
2929 {
2930 struct mon_data *priv, *tmp;
2931
2932 lockdep_assert_held(&rdtgroup_mutex);
2933
2934 list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) {
2935 list_del(&priv->list);
2936 kfree(priv);
2937 }
2938 }
2939
2940 static void resctrl_fs_teardown(void)
2941 {
2942 lockdep_assert_held(&rdtgroup_mutex);
2943
2944 /* Cleared by rdtgroup_destroy_root() */
2945 if (!rdtgroup_default.kn)
2946 return;
2947
2948 rmdir_all_sub();
2949 mon_put_kn_priv();
2950 rdt_pseudo_lock_release();
2951 rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2952 closid_exit();
2953 schemata_list_destroy();
2954 rdtgroup_destroy_root();
2955 }
2956
2957 static void rdt_kill_sb(struct super_block *sb)
2958 {
2959 struct rdt_resource *r;
2960
2961 cpus_read_lock();
2962 mutex_lock(&rdtgroup_mutex);
2963
2964 rdt_disable_ctx();
2965
2966 /* Put everything back to default values. */
2967 for_each_alloc_capable_rdt_resource(r)
2968 resctrl_arch_reset_all_ctrls(r);
2969
2970 resctrl_fs_teardown();
2971 if (resctrl_arch_alloc_capable())
2972 resctrl_arch_disable_alloc();
2973 if (resctrl_arch_mon_capable())
2974 resctrl_arch_disable_mon();
2975 resctrl_mounted = false;
2976 kernfs_kill_sb(sb);
2977 mutex_unlock(&rdtgroup_mutex);
2978 cpus_read_unlock();
2979 }
2980
2981 static struct file_system_type rdt_fs_type = {
2982 .name = "resctrl",
2983 .init_fs_context = rdt_init_fs_context,
2984 .parameters = rdt_fs_parameters,
2985 .kill_sb = rdt_kill_sb,
2986 };
2987
2988 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2989 void *priv)
2990 {
2991 struct kernfs_node *kn;
2992 int ret = 0;
2993
2994 kn = __kernfs_create_file(parent_kn, name, 0444,
2995 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2996 &kf_mondata_ops, priv, NULL, NULL);
2997 if (IS_ERR(kn))
2998 return PTR_ERR(kn);
2999
3000 ret = rdtgroup_kn_set_ugid(kn);
3001 if (ret) {
3002 kernfs_remove(kn);
3003 return ret;
3004 }
3005
3006 return ret;
3007 }
3008
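/*
 * Remove the "name" directory under @pkn. If it still has other children
 * (SNC sub-domain directories) remove only the "subname" child, otherwise
 * remove the whole "name" directory.
 */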
3009 static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
3010 {
3011 struct kernfs_node *kn;
3012
3013 kn = kernfs_find_and_get(pkn, name);
3014 if (!kn)
3015 return;
3016 kernfs_put(kn);
3017
3018 if (kn->dir.subdirs <= 1)
3019 kernfs_remove(kn);
3020 else
3021 kernfs_remove_by_name(kn, subname);
3022 }
3023
3024 /*
3025 * Remove all subdirectories of mon_data of ctrl_mon groups
3026 * and monitor groups for the given domain.
3027 * Remove files and directories containing "sum" of domain data
3028  * when the last domain being summed is removed.
3029 */
3030 static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3031 struct rdt_mon_domain *d)
3032 {
3033 struct rdtgroup *prgrp, *crgrp;
3034 char subname[32];
3035 bool snc_mode;
3036 char name[32];
3037
3038 snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3039 sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
3040 if (snc_mode)
3041 sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
3042
3043 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3044 mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
3045
3046 list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
3047 mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
3048 }
3049 }
3050
3051 static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
3052 struct rdt_resource *r, struct rdtgroup *prgrp,
3053 bool do_sum)
3054 {
3055 struct rmid_read rr = {0};
3056 struct mon_data *priv;
3057 struct mon_evt *mevt;
3058 int ret, domid;
3059
3060 if (WARN_ON(list_empty(&r->evt_list)))
3061 return -EPERM;
3062
3063 list_for_each_entry(mevt, &r->evt_list, list) {
3064 domid = do_sum ? d->ci_id : d->hdr.id;
3065 priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
3066 if (WARN_ON_ONCE(!priv))
3067 return -EINVAL;
3068
3069 ret = mon_addfile(kn, mevt->name, priv);
3070 if (ret)
3071 return ret;
3072
3073 if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
3074 mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
3075 }
3076
3077 return 0;
3078 }
3079
3080 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
3081 struct rdt_mon_domain *d,
3082 struct rdt_resource *r, struct rdtgroup *prgrp)
3083 {
3084 struct kernfs_node *kn, *ckn;
3085 char name[32];
3086 bool snc_mode;
3087 int ret = 0;
3088
3089 lockdep_assert_held(&rdtgroup_mutex);
3090
3091 snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3092 sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
3093 kn = kernfs_find_and_get(parent_kn, name);
3094 if (kn) {
3095 /*
3096 * rdtgroup_mutex will prevent this directory from being
3097 * removed. No need to keep this hold.
3098 */
3099 kernfs_put(kn);
3100 } else {
3101 kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
3102 if (IS_ERR(kn))
3103 return PTR_ERR(kn);
3104
3105 ret = rdtgroup_kn_set_ugid(kn);
3106 if (ret)
3107 goto out_destroy;
3108 ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
3109 if (ret)
3110 goto out_destroy;
3111 }
3112
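	/*
	 * With SNC the parent directory is shared by all nodes of the L3
	 * cache instance and each SNC node gets its own "mon_sub_*" child
	 * directory below it.
	 */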
3113 if (snc_mode) {
3114 sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
3115 ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
3116 if (IS_ERR(ckn)) {
3117 ret = -EINVAL;
3118 goto out_destroy;
3119 }
3120
3121 ret = rdtgroup_kn_set_ugid(ckn);
3122 if (ret)
3123 goto out_destroy;
3124
3125 ret = mon_add_all_files(ckn, d, r, prgrp, false);
3126 if (ret)
3127 goto out_destroy;
3128 }
3129
3130 kernfs_activate(kn);
3131 return 0;
3132
3133 out_destroy:
3134 kernfs_remove(kn);
3135 return ret;
3136 }
3137
3138 /*
3139 * Add all subdirectories of mon_data for "ctrl_mon" groups
3140  * and "monitor" groups with the given domain id.
3141 */
3142 static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3143 struct rdt_mon_domain *d)
3144 {
3145 struct kernfs_node *parent_kn;
3146 struct rdtgroup *prgrp, *crgrp;
3147 struct list_head *head;
3148
3149 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3150 parent_kn = prgrp->mon.mon_data_kn;
3151 mkdir_mondata_subdir(parent_kn, d, r, prgrp);
3152
3153 head = &prgrp->mon.crdtgrp_list;
3154 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
3155 parent_kn = crgrp->mon.mon_data_kn;
3156 mkdir_mondata_subdir(parent_kn, d, r, crgrp);
3157 }
3158 }
3159 }
3160
3161 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
3162 struct rdt_resource *r,
3163 struct rdtgroup *prgrp)
3164 {
3165 struct rdt_mon_domain *dom;
3166 int ret;
3167
3168 /* Walking r->domains, ensure it can't race with cpuhp */
3169 lockdep_assert_cpus_held();
3170
3171 list_for_each_entry(dom, &r->mon_domains, hdr.list) {
3172 ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
3173 if (ret)
3174 return ret;
3175 }
3176
3177 return 0;
3178 }
3179
3180 /*
3181 * This creates a directory mon_data which contains the monitored data.
3182 *
3183  * mon_data has one directory for each domain, each named
3184  * in the format mon_<domain_name>_<domain_id>. For example, a mon_data
3185 * with L3 domain looks as below:
3186 * ./mon_data:
3187 * mon_L3_00
3188 * mon_L3_01
3189 * mon_L3_02
3190 * ...
3191 *
3192 * Each domain directory has one file per event:
3193 * ./mon_L3_00/:
3194 * llc_occupancy
3195 *
3196 */
3197 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
3198 struct rdtgroup *prgrp,
3199 struct kernfs_node **dest_kn)
3200 {
3201 struct rdt_resource *r;
3202 struct kernfs_node *kn;
3203 int ret;
3204
3205 /*
3206 * Create the mon_data directory first.
3207 */
3208 ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
3209 if (ret)
3210 return ret;
3211
3212 if (dest_kn)
3213 *dest_kn = kn;
3214
3215 /*
3216 * Create the subdirectories for each domain. Note that all events
3217 	 * in a domain like L3 are grouped into a resource whose domain is L3.
3218 */
3219 for_each_mon_capable_rdt_resource(r) {
3220 ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
3221 if (ret)
3222 goto out_destroy;
3223 }
3224
3225 return 0;
3226
3227 out_destroy:
3228 kernfs_remove(kn);
3229 return ret;
3230 }
3231
3232 /**
3233 * cbm_ensure_valid - Enforce validity on provided CBM
3234 * @_val: Candidate CBM
3235 * @r: RDT resource to which the CBM belongs
3236 *
3237 * The provided CBM represents all cache portions available for use. This
3238 * may be represented by a bitmap that does not consist of contiguous ones
3239 * and thus be an invalid CBM.
3240 * Here the provided CBM is forced to be a valid CBM by only considering
3241  * the first set of contiguous bits as valid and clearing all other bits.
3242 * The intention here is to provide a valid default CBM with which a new
3243 * resource group is initialized. The user can follow this with a
3244 * modification to the CBM if the default does not satisfy the
3245 * requirements.
3246 */
3247 static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
3248 {
3249 unsigned int cbm_len = r->cache.cbm_len;
3250 unsigned long first_bit, zero_bit;
3251 unsigned long val = _val;
3252
3253 if (!val)
3254 return 0;
3255
3256 first_bit = find_first_bit(&val, cbm_len);
3257 zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
3258
3259 /* Clear any remaining bits to ensure contiguous region */
3260 bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
3261 return (u32)val;
3262 }
3263
3264 /*
3265 * Initialize cache resources per RDT domain
3266 *
3267 * Set the RDT domain up to start off with all usable allocations. That is,
3268 * all shareable and unused bits. All-zero CBM is invalid.
3269 */
3270 static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
3271 u32 closid)
3272 {
3273 enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
3274 enum resctrl_conf_type t = s->conf_type;
3275 struct resctrl_staged_config *cfg;
3276 struct rdt_resource *r = s->res;
3277 u32 used_b = 0, unused_b = 0;
3278 unsigned long tmp_cbm;
3279 enum rdtgrp_mode mode;
3280 u32 peer_ctl, ctrl_val;
3281 int i;
3282
3283 cfg = &d->staged_config[t];
3284 cfg->have_new_ctrl = false;
3285 cfg->new_ctrl = r->cache.shareable_bits;
3286 used_b = r->cache.shareable_bits;
3287 for (i = 0; i < closids_supported(); i++) {
3288 if (closid_allocated(i) && i != closid) {
3289 mode = rdtgroup_mode_by_closid(i);
3290 if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
3291 /*
3292 * ctrl values for locksetup aren't relevant
3293 * until the schemata is written, and the mode
3294 * becomes RDT_MODE_PSEUDO_LOCKED.
3295 */
3296 continue;
3297 /*
3298 * If CDP is active include peer domain's
3299 * usage to ensure there is no overlap
3300 * with an exclusive group.
3301 */
3302 if (resctrl_arch_get_cdp_enabled(r->rid))
3303 peer_ctl = resctrl_arch_get_config(r, d, i,
3304 peer_type);
3305 else
3306 peer_ctl = 0;
3307 ctrl_val = resctrl_arch_get_config(r, d, i,
3308 s->conf_type);
3309 used_b |= ctrl_val | peer_ctl;
3310 if (mode == RDT_MODE_SHAREABLE)
3311 cfg->new_ctrl |= ctrl_val | peer_ctl;
3312 }
3313 }
3314 if (d->plr && d->plr->cbm > 0)
3315 used_b |= d->plr->cbm;
3316 unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
3317 unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
3318 cfg->new_ctrl |= unused_b;
3319 /*
3320 * Force the initial CBM to be valid, user can
3321 * modify the CBM based on system availability.
3322 */
3323 cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
3324 /*
3325 * Assign the u32 CBM to an unsigned long to ensure that
3326 * bitmap_weight() does not access out-of-bound memory.
3327 */
3328 tmp_cbm = cfg->new_ctrl;
3329 if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
3330 rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
3331 return -ENOSPC;
3332 }
3333 cfg->have_new_ctrl = true;
3334
3335 return 0;
3336 }
3337
3338 /*
3339 * Initialize cache resources with default values.
3340 *
3341 * A new RDT group is being created on an allocation capable (CAT)
3342 * supporting system. Set this group up to start off with all usable
3343 * allocations.
3344 *
3345 * If there are no more shareable bits available on any domain then
3346 * the entire allocation will fail.
3347 */
3348 static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
3349 {
3350 struct rdt_ctrl_domain *d;
3351 int ret;
3352
3353 list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
3354 ret = __init_one_rdt_domain(d, s, closid);
3355 if (ret < 0)
3356 return ret;
3357 }
3358
3359 return 0;
3360 }
3361
3362 /* Initialize MBA resource with default values. */
3363 static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
3364 {
3365 struct resctrl_staged_config *cfg;
3366 struct rdt_ctrl_domain *d;
3367
3368 list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
3369 if (is_mba_sc(r)) {
3370 d->mbps_val[closid] = MBA_MAX_MBPS;
3371 continue;
3372 }
3373
3374 cfg = &d->staged_config[CDP_NONE];
3375 cfg->new_ctrl = resctrl_get_default_ctrl(r);
3376 cfg->have_new_ctrl = true;
3377 }
3378 }
3379
3380 /* Initialize the RDT group's allocations. */
3381 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
3382 {
3383 struct resctrl_schema *s;
3384 struct rdt_resource *r;
3385 int ret = 0;
3386
3387 rdt_staged_configs_clear();
3388
3389 list_for_each_entry(s, &resctrl_schema_all, list) {
3390 r = s->res;
3391 if (r->rid == RDT_RESOURCE_MBA ||
3392 r->rid == RDT_RESOURCE_SMBA) {
3393 rdtgroup_init_mba(r, rdtgrp->closid);
3394 if (is_mba_sc(r))
3395 continue;
3396 } else {
3397 ret = rdtgroup_init_cat(s, rdtgrp->closid);
3398 if (ret < 0)
3399 goto out;
3400 }
3401
3402 ret = resctrl_arch_update_domains(r, rdtgrp->closid);
3403 if (ret < 0) {
3404 rdt_last_cmd_puts("Failed to initialize allocations\n");
3405 goto out;
3406 }
3407 }
3408
3409 rdtgrp->mode = RDT_MODE_SHAREABLE;
3410
3411 out:
3412 rdt_staged_configs_clear();
3413 return ret;
3414 }
3415
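/*
 * Allocate an RMID for a new group and create its "mon_data" directory.
 * No-op when the system is not monitor capable.
 */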
3416 static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
3417 {
3418 int ret;
3419
3420 if (!resctrl_arch_mon_capable())
3421 return 0;
3422
3423 ret = alloc_rmid(rdtgrp->closid);
3424 if (ret < 0) {
3425 rdt_last_cmd_puts("Out of RMIDs\n");
3426 return ret;
3427 }
3428 rdtgrp->mon.rmid = ret;
3429
3430 ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
3431 if (ret) {
3432 rdt_last_cmd_puts("kernfs subdir error\n");
3433 free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3434 return ret;
3435 }
3436
3437 return 0;
3438 }
3439
3440 static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
3441 {
3442 if (resctrl_arch_mon_capable())
3443 free_rmid(rgrp->closid, rgrp->mon.rmid);
3444 }
3445
3446 /*
3447  * We allow creating mon groups only within a directory called "mon_groups"
3448 * which is present in every ctrl_mon group. Check if this is a valid
3449 * "mon_groups" directory.
3450 *
3451 * 1. The directory should be named "mon_groups".
3452 * 2. The mon group itself should "not" be named "mon_groups".
3453 * This makes sure "mon_groups" directory always has a ctrl_mon group
3454 * as parent.
3455 */
3456 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3457 {
3458 return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
3459 strcmp(name, "mon_groups"));
3460 }
3461
3462 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
3463 const char *name, umode_t mode,
3464 enum rdt_group_type rtype, struct rdtgroup **r)
3465 {
3466 struct rdtgroup *prdtgrp, *rdtgrp;
3467 unsigned long files = 0;
3468 struct kernfs_node *kn;
3469 int ret;
3470
3471 prdtgrp = rdtgroup_kn_lock_live(parent_kn);
3472 if (!prdtgrp) {
3473 ret = -ENODEV;
3474 goto out_unlock;
3475 }
3476
3477 rdt_last_cmd_clear();
3478
3479 /*
3480 * Check that the parent directory for a monitor group is a "mon_groups"
3481 * directory.
3482 */
3483 if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
3484 ret = -EPERM;
3485 goto out_unlock;
3486 }
3487
3488 if (rtype == RDTMON_GROUP &&
3489 (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3490 prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
3491 ret = -EINVAL;
3492 rdt_last_cmd_puts("Pseudo-locking in progress\n");
3493 goto out_unlock;
3494 }
3495
3496 /* allocate the rdtgroup. */
3497 rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
3498 if (!rdtgrp) {
3499 ret = -ENOSPC;
3500 rdt_last_cmd_puts("Kernel out of memory\n");
3501 goto out_unlock;
3502 }
3503 *r = rdtgrp;
3504 rdtgrp->mon.parent = prdtgrp;
3505 rdtgrp->type = rtype;
3506 INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
3507
3508 /* kernfs creates the directory for rdtgrp */
3509 kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
3510 if (IS_ERR(kn)) {
3511 ret = PTR_ERR(kn);
3512 rdt_last_cmd_puts("kernfs create error\n");
3513 goto out_free_rgrp;
3514 }
3515 rdtgrp->kn = kn;
3516
3517 /*
3518 * kernfs_remove() will drop the reference count on "kn" which
3519 * will free it. But we still need it to stick around for the
3520 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
3521 * which will be dropped by kernfs_put() in rdtgroup_remove().
3522 */
3523 kernfs_get(kn);
3524
3525 ret = rdtgroup_kn_set_ugid(kn);
3526 if (ret) {
3527 rdt_last_cmd_puts("kernfs perm error\n");
3528 goto out_destroy;
3529 }
3530
3531 if (rtype == RDTCTRL_GROUP) {
3532 files = RFTYPE_BASE | RFTYPE_CTRL;
3533 if (resctrl_arch_mon_capable())
3534 files |= RFTYPE_MON;
3535 } else {
3536 files = RFTYPE_BASE | RFTYPE_MON;
3537 }
3538
3539 ret = rdtgroup_add_files(kn, files);
3540 if (ret) {
3541 rdt_last_cmd_puts("kernfs fill error\n");
3542 goto out_destroy;
3543 }
3544
3545 /*
3546 * The caller unlocks the parent_kn upon success.
3547 */
3548 return 0;
3549
3550 out_destroy:
3551 kernfs_put(rdtgrp->kn);
3552 kernfs_remove(rdtgrp->kn);
3553 out_free_rgrp:
3554 kfree(rdtgrp);
3555 out_unlock:
3556 rdtgroup_kn_unlock(parent_kn);
3557 return ret;
3558 }
3559
3560 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
3561 {
3562 kernfs_remove(rgrp->kn);
3563 rdtgroup_remove(rgrp);
3564 }
3565
3566 /*
3567 * Create a monitor group under "mon_groups" directory of a control
3568 * and monitor group(ctrl_mon). This is a resource group
3569 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
3570 */
3571 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
3572 const char *name, umode_t mode)
3573 {
3574 struct rdtgroup *rdtgrp, *prgrp;
3575 int ret;
3576
3577 ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
3578 if (ret)
3579 return ret;
3580
3581 prgrp = rdtgrp->mon.parent;
3582 rdtgrp->closid = prgrp->closid;
3583
3584 ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3585 if (ret) {
3586 mkdir_rdt_prepare_clean(rdtgrp);
3587 goto out_unlock;
3588 }
3589
3590 kernfs_activate(rdtgrp->kn);
3591
3592 /*
3593 * Add the rdtgrp to the list of rdtgrps the parent
3594 * ctrl_mon group has to track.
3595 */
3596 list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
3597
3598 out_unlock:
3599 rdtgroup_kn_unlock(parent_kn);
3600 return ret;
3601 }
3602
3603 /*
3604 * These are rdtgroups created under the root directory. Can be used
3605 * to allocate and monitor resources.
3606 */
3607 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
3608 const char *name, umode_t mode)
3609 {
3610 struct rdtgroup *rdtgrp;
3611 struct kernfs_node *kn;
3612 u32 closid;
3613 int ret;
3614
3615 ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
3616 if (ret)
3617 return ret;
3618
3619 kn = rdtgrp->kn;
3620 ret = closid_alloc();
3621 if (ret < 0) {
3622 rdt_last_cmd_puts("Out of CLOSIDs\n");
3623 goto out_common_fail;
3624 }
3625 closid = ret;
3626 ret = 0;
3627
3628 rdtgrp->closid = closid;
3629
3630 ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3631 if (ret)
3632 goto out_closid_free;
3633
3634 kernfs_activate(rdtgrp->kn);
3635
3636 ret = rdtgroup_init_alloc(rdtgrp);
3637 if (ret < 0)
3638 goto out_rmid_free;
3639
3640 list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
3641
3642 if (resctrl_arch_mon_capable()) {
3643 /*
3644 * Create an empty mon_groups directory to hold the subset
3645 * of tasks and cpus to monitor.
3646 */
3647 ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
3648 if (ret) {
3649 rdt_last_cmd_puts("kernfs subdir error\n");
3650 goto out_del_list;
3651 }
3652 if (is_mba_sc(NULL))
3653 rdtgrp->mba_mbps_event = mba_mbps_default_event;
3654 }
3655
3656 goto out_unlock;
3657
3658 out_del_list:
3659 list_del(&rdtgrp->rdtgroup_list);
3660 out_rmid_free:
3661 mkdir_rdt_prepare_rmid_free(rdtgrp);
3662 out_closid_free:
3663 closid_free(closid);
3664 out_common_fail:
3665 mkdir_rdt_prepare_clean(rdtgrp);
3666 out_unlock:
3667 rdtgroup_kn_unlock(parent_kn);
3668 return ret;
3669 }
3670
3671 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3672 umode_t mode)
3673 {
3674 	/* Do not accept '\n' to avoid an unparsable situation. */
3675 if (strchr(name, '\n'))
3676 return -EINVAL;
3677
3678 /*
3679 * If the parent directory is the root directory and RDT
3680 * allocation is supported, add a control and monitoring
3681 * subdirectory
3682 */
3683 if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
3684 return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3685
3686 /* Else, attempt to add a monitoring subdirectory. */
3687 if (resctrl_arch_mon_capable())
3688 return rdtgroup_mkdir_mon(parent_kn, name, mode);
3689
3690 return -EPERM;
3691 }
3692
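/*
 * Remove a MON group: move its tasks and CPUs back to the parent CTRL_MON
 * group, update the affected CPUs, free the RMID and remove the directory.
 */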
3693 static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3694 {
3695 struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3696 u32 closid, rmid;
3697 int cpu;
3698
3699 /* Give any tasks back to the parent group */
3700 rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3701
3702 /*
3703 * Update per cpu closid/rmid of the moved CPUs first.
3704 * Note: the closid will not change, but the arch code still needs it.
3705 */
3706 closid = prdtgrp->closid;
3707 rmid = prdtgrp->mon.rmid;
3708 for_each_cpu(cpu, &rdtgrp->cpu_mask)
3709 resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3710
3711 /*
3712 * Update the MSR on moved CPUs and CPUs which have moved
3713 * task running on them.
3714 */
3715 cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3716 update_closid_rmid(tmpmask, NULL);
3717
3718 rdtgrp->flags = RDT_DELETED;
3719 free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3720
3721 /*
3722 * Remove the rdtgrp from the parent ctrl_mon group's list
3723 */
3724 WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3725 list_del(&rdtgrp->mon.crdtgrp_list);
3726
3727 kernfs_remove(rdtgrp->kn);
3728
3729 return 0;
3730 }
3731
3732 static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
3733 {
3734 rdtgrp->flags = RDT_DELETED;
3735 list_del(&rdtgrp->rdtgroup_list);
3736
3737 kernfs_remove(rdtgrp->kn);
3738 return 0;
3739 }
3740
3741 static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3742 {
3743 u32 closid, rmid;
3744 int cpu;
3745
3746 /* Give any tasks back to the default group */
3747 rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3748
3749 /* Give any CPUs back to the default group */
3750 cpumask_or(&rdtgroup_default.cpu_mask,
3751 &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3752
3753 /* Update per cpu closid and rmid of the moved CPUs first */
3754 closid = rdtgroup_default.closid;
3755 rmid = rdtgroup_default.mon.rmid;
3756 for_each_cpu(cpu, &rdtgrp->cpu_mask)
3757 resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3758
3759 /*
3760 * Update the MSR on moved CPUs and CPUs which have moved
3761 * task running on them.
3762 */
3763 cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3764 update_closid_rmid(tmpmask, NULL);
3765
3766 free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3767 closid_free(rdtgrp->closid);
3768
3769 rdtgroup_ctrl_remove(rdtgrp);
3770
3771 /*
3772 * Free all the child monitor group rmids.
3773 */
3774 free_all_child_rdtgrp(rdtgrp);
3775
3776 return 0;
3777 }
3778
3779 static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
3780 {
3781 /*
3782 * Valid within the RCU section it was obtained or while rdtgroup_mutex
3783 * is held.
3784 */
3785 return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
3786 }
3787
3788 static int rdtgroup_rmdir(struct kernfs_node *kn)
3789 {
3790 struct kernfs_node *parent_kn;
3791 struct rdtgroup *rdtgrp;
3792 cpumask_var_t tmpmask;
3793 int ret = 0;
3794
3795 if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
3796 return -ENOMEM;
3797
3798 rdtgrp = rdtgroup_kn_lock_live(kn);
3799 if (!rdtgrp) {
3800 ret = -EPERM;
3801 goto out;
3802 }
3803 parent_kn = rdt_kn_parent(kn);
3804
3805 /*
3806 * If the rdtgroup is a ctrl_mon group and parent directory
3807 * is the root directory, remove the ctrl_mon group.
3808 *
3809 * If the rdtgroup is a mon group and parent directory
3810 * is a valid "mon_groups" directory, remove the mon group.
3811 */
3812 if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
3813 rdtgrp != &rdtgroup_default) {
3814 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3815 rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
3816 ret = rdtgroup_ctrl_remove(rdtgrp);
3817 } else {
3818 ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
3819 }
3820 } else if (rdtgrp->type == RDTMON_GROUP &&
3821 is_mon_groups(parent_kn, rdt_kn_name(kn))) {
3822 ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
3823 } else {
3824 ret = -EPERM;
3825 }
3826
3827 out:
3828 rdtgroup_kn_unlock(kn);
3829 free_cpumask_var(tmpmask);
3830 return ret;
3831 }
3832
3833 /**
3834 * mongrp_reparent() - replace parent CTRL_MON group of a MON group
3835 * @rdtgrp: the MON group whose parent should be replaced
3836 * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
3837 * @cpus: cpumask provided by the caller for use during this call
3838 *
3839 * Replaces the parent CTRL_MON group for a MON group, resulting in all member
3840 * tasks' CLOSID immediately changing to that of the new parent group.
3841 * Monitoring data for the group is unaffected by this operation.
3842 */
3843 static void mongrp_reparent(struct rdtgroup *rdtgrp,
3844 struct rdtgroup *new_prdtgrp,
3845 cpumask_var_t cpus)
3846 {
3847 struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3848
3849 WARN_ON(rdtgrp->type != RDTMON_GROUP);
3850 WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
3851
3852 /* Nothing to do when simply renaming a MON group. */
3853 if (prdtgrp == new_prdtgrp)
3854 return;
3855
3856 WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3857 list_move_tail(&rdtgrp->mon.crdtgrp_list,
3858 &new_prdtgrp->mon.crdtgrp_list);
3859
3860 rdtgrp->mon.parent = new_prdtgrp;
3861 rdtgrp->closid = new_prdtgrp->closid;
3862
3863 /* Propagate updated closid to all tasks in this group. */
3864 rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
3865
3866 update_closid_rmid(cpus, NULL);
3867 }
3868
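/*
 * Illustrative only, with hypothetical group names: a MON group can be
 * moved to another CTRL_MON group with, for example,
 *
 *   mv /sys/fs/resctrl/grp0/mon_groups/m0 /sys/fs/resctrl/grp1/mon_groups/
 *
 * which reaches rdtgroup_rename() below; only MON group sources and
 * "mon_groups" destinations are accepted.
 */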
3869 static int rdtgroup_rename(struct kernfs_node *kn,
3870 struct kernfs_node *new_parent, const char *new_name)
3871 {
3872 struct kernfs_node *kn_parent;
3873 struct rdtgroup *new_prdtgrp;
3874 struct rdtgroup *rdtgrp;
3875 cpumask_var_t tmpmask;
3876 int ret;
3877
3878 rdtgrp = kernfs_to_rdtgroup(kn);
3879 new_prdtgrp = kernfs_to_rdtgroup(new_parent);
3880 if (!rdtgrp || !new_prdtgrp)
3881 return -ENOENT;
3882
3883 /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
3884 rdtgroup_kn_get(rdtgrp, kn);
3885 rdtgroup_kn_get(new_prdtgrp, new_parent);
3886
3887 mutex_lock(&rdtgroup_mutex);
3888
3889 rdt_last_cmd_clear();
3890
3891 /*
3892 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
3893 * either kernfs_node is a file.
3894 */
3895 if (kernfs_type(kn) != KERNFS_DIR ||
3896 kernfs_type(new_parent) != KERNFS_DIR) {
3897 rdt_last_cmd_puts("Source and destination must be directories\n");
3898 ret = -EPERM;
3899 goto out;
3900 }
3901
3902 if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
3903 ret = -ENOENT;
3904 goto out;
3905 }
3906
3907 kn_parent = rdt_kn_parent(kn);
3908 if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
3909 !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
3910 rdt_last_cmd_puts("Source must be a MON group\n");
3911 ret = -EPERM;
3912 goto out;
3913 }
3914
3915 if (!is_mon_groups(new_parent, new_name)) {
3916 rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
3917 ret = -EPERM;
3918 goto out;
3919 }
3920
3921 /*
3922 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
3923 * current parent CTRL_MON group and therefore cannot be assigned to
3924 * the new parent, making the move illegal.
3925 */
3926 if (!cpumask_empty(&rdtgrp->cpu_mask) &&
3927 rdtgrp->mon.parent != new_prdtgrp) {
3928 rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
3929 ret = -EPERM;
3930 goto out;
3931 }
3932
3933 /*
3934 * Allocate the cpumask for use in mongrp_reparent() to avoid the
3935 * possibility of failing to allocate it after kernfs_rename() has
3936 * succeeded.
3937 */
3938 if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
3939 ret = -ENOMEM;
3940 goto out;
3941 }
3942
3943 /*
3944 * Perform all input validation and allocations needed to ensure
3945 * mongrp_reparent() will succeed before calling kernfs_rename(),
3946 * otherwise it would be necessary to revert this call if
3947 * mongrp_reparent() failed.
3948 */
3949 ret = kernfs_rename(kn, new_parent, new_name);
3950 if (!ret)
3951 mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
3952
3953 free_cpumask_var(tmpmask);
3954
3955 out:
3956 mutex_unlock(&rdtgroup_mutex);
3957 rdtgroup_kn_put(rdtgrp, kn);
3958 rdtgroup_kn_put(new_prdtgrp, new_parent);
3959 return ret;
3960 }
3961
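/* Emit the active mount options: "cdp", "cdpl2", "mba_MBps" and "debug". */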
3962 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
3963 {
3964 if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
3965 seq_puts(seq, ",cdp");
3966
3967 if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
3968 seq_puts(seq, ",cdpl2");
3969
3970 if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
3971 seq_puts(seq, ",mba_MBps");
3972
3973 if (resctrl_debug)
3974 seq_puts(seq, ",debug");
3975
3976 return 0;
3977 }
3978
3979 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
3980 .mkdir = rdtgroup_mkdir,
3981 .rmdir = rdtgroup_rmdir,
3982 .rename = rdtgroup_rename,
3983 .show_options = rdtgroup_show_options,
3984 };
3985
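/*
 * Create the kernfs root for the resctrl filesystem. The root is created
 * with KERNFS_ROOT_CREATE_DEACTIVATED so nodes added to it stay invisible
 * until they are explicitly activated during mount.
 */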
3986 static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
3987 {
3988 rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
3989 KERNFS_ROOT_CREATE_DEACTIVATED |
3990 KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
3991 &rdtgroup_default);
3992 if (IS_ERR(rdt_root))
3993 return PTR_ERR(rdt_root);
3994
3995 ctx->kfc.root = rdt_root;
3996 rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
3997
3998 return 0;
3999 }
4000
4001 static void rdtgroup_destroy_root(void)
4002 {
4003 lockdep_assert_held(&rdtgroup_mutex);
4004
4005 kernfs_destroy_root(rdt_root);
4006 rdtgroup_default.kn = NULL;
4007 }
4008
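/*
 * Initialise the default resource group: it owns the reserved CLOSID and
 * RMID and is always on the list of all groups.
 */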
4009 static void rdtgroup_setup_default(void)
4010 {
4011 mutex_lock(&rdtgroup_mutex);
4012
4013 rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
4014 rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
4015 rdtgroup_default.type = RDTCTRL_GROUP;
4016 INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
4017
4018 list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
4019
4020 mutex_unlock(&rdtgroup_mutex);
4021 }
4022
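/* Free the per-domain monitoring state allocated by domain_setup_mon_state(). */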
4023 static void domain_destroy_mon_state(struct rdt_mon_domain *d)
4024 {
4025 bitmap_free(d->rmid_busy_llc);
4026 kfree(d->mbm_total);
4027 kfree(d->mbm_local);
4028 }
4029
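/*
 * resctrl_offline_ctrl_domain() - Called by the architecture code when a
 * control domain goes offline. Tears down the MBA software controller
 * state for the domain when the "mba_MBps" option is in use.
 */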
4030 void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4031 {
4032 mutex_lock(&rdtgroup_mutex);
4033
4034 if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
4035 mba_sc_domain_destroy(r, d);
4036
4037 mutex_unlock(&rdtgroup_mutex);
4038 }
4039
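/*
 * resctrl_offline_mon_domain() - Called by the architecture code when a
 * monitoring domain goes offline. Removes the domain's mon_data
 * directories (if mounted), cancels the MBM overflow and limbo workers
 * and frees the domain's monitoring state.
 */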
4040 void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4041 {
4042 mutex_lock(&rdtgroup_mutex);
4043
4044 /*
4045 * If resctrl is mounted, remove all the
4046 * per domain monitor data directories.
4047 */
4048 if (resctrl_mounted && resctrl_arch_mon_capable())
4049 rmdir_mondata_subdir_allrdtgrp(r, d);
4050
4051 if (resctrl_is_mbm_enabled())
4052 cancel_delayed_work(&d->mbm_over);
4053 if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
4054 /*
4055 * When a package is going down, forcefully
4056 * decrement rmid->ebusy. There is no way to know
4057 * whether the L3 was flushed, so the counts may be
4058 * incorrect in rare scenarios, but leaving
4059 * the RMID as busy creates RMID leaks if the
4060 * package never comes back.
4061 */
4062 __check_limbo(d, true);
4063 cancel_delayed_work(&d->cqm_limbo);
4064 }
4065
4066 domain_destroy_mon_state(d);
4067
4068 mutex_unlock(&rdtgroup_mutex);
4069 }
4070
4071 /**
4072 * domain_setup_mon_state() - Initialise domain monitoring structures.
4073 * @r: The resource for the newly online domain.
4074 * @d: The newly online domain.
4075 *
4076 * Allocate monitor resources that belong to this domain.
4077 * Called when the first CPU of a domain comes online, regardless of whether
4078 * the filesystem is mounted.
4079 * During boot this may be called before global allocations have been made by
4080 * resctrl_mon_resource_init().
4081 *
4082 * Returns 0 for success, or -ENOMEM.
4083 */
4084 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
4085 {
4086 u32 idx_limit = resctrl_arch_system_num_rmid_idx();
4087 size_t tsize;
4088
4089 if (resctrl_arch_is_llc_occupancy_enabled()) {
4090 d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
4091 if (!d->rmid_busy_llc)
4092 return -ENOMEM;
4093 }
4094 if (resctrl_arch_is_mbm_total_enabled()) {
4095 tsize = sizeof(*d->mbm_total);
4096 d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
4097 if (!d->mbm_total) {
4098 bitmap_free(d->rmid_busy_llc);
4099 return -ENOMEM;
4100 }
4101 }
4102 if (resctrl_arch_is_mbm_local_enabled()) {
4103 tsize = sizeof(*d->mbm_local);
4104 d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
4105 if (!d->mbm_local) {
4106 bitmap_free(d->rmid_busy_llc);
4107 kfree(d->mbm_total);
4108 return -ENOMEM;
4109 }
4110 }
4111
4112 return 0;
4113 }
4114
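/*
 * resctrl_online_ctrl_domain() - Called by the architecture code when a new
 * control domain comes online. Allocates the MBA software controller state
 * for the MBA resource when the "mba_MBps" option is supported.
 */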
4115 int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4116 {
4117 int err = 0;
4118
4119 mutex_lock(&rdtgroup_mutex);
4120
4121 if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
4122 /* RDT_RESOURCE_MBA is never mon_capable */
4123 err = mba_sc_domain_allocate(r, d);
4124 }
4125
4126 mutex_unlock(&rdtgroup_mutex);
4127
4128 return err;
4129 }
4130
4131 int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4132 {
4133 int err;
4134
4135 mutex_lock(&rdtgroup_mutex);
4136
4137 err = domain_setup_mon_state(r, d);
4138 if (err)
4139 goto out_unlock;
4140
4141 if (resctrl_is_mbm_enabled()) {
4142 INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
4143 mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
4144 RESCTRL_PICK_ANY_CPU);
4145 }
4146
4147 if (resctrl_arch_is_llc_occupancy_enabled())
4148 INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
4149
4150 /*
4151 * If the filesystem is not mounted then only the default resource group
4152 * exists. Creation of its directories is deferred until mount time
4153 * by rdt_get_tree() calling mkdir_mondata_all().
4154 * If resctrl is mounted, add per domain monitor data directories.
4155 */
4156 if (resctrl_mounted && resctrl_arch_mon_capable())
4157 mkdir_mondata_subdir_allrdtgrp(r, d);
4158
4159 out_unlock:
4160 mutex_unlock(&rdtgroup_mutex);
4161
4162 return err;
4163 }
4164
4165 void resctrl_online_cpu(unsigned int cpu)
4166 {
4167 mutex_lock(&rdtgroup_mutex);
4168 /* The CPU is added to the default rdtgroup when it comes online. */
4169 cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
4170 mutex_unlock(&rdtgroup_mutex);
4171 }
4172
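/* Remove @cpu from the cpu_mask of the child MON group that owns it, if any. */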
4173 static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
4174 {
4175 struct rdtgroup *cr;
4176
4177 list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
4178 if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
4179 break;
4180 }
4181 }
4182
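/* Return the monitoring domain of @r that contains @cpu, or NULL if none. */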
4183 static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
4184 struct rdt_resource *r)
4185 {
4186 struct rdt_mon_domain *d;
4187
4188 lockdep_assert_cpus_held();
4189
4190 list_for_each_entry(d, &r->mon_domains, hdr.list) {
4191 /* Find the domain that contains this CPU */
4192 if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
4193 return d;
4194 }
4195
4196 return NULL;
4197 }
4198
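/*
 * resctrl_offline_cpu() - A CPU is going offline. Remove it from its
 * resource group and any child MON group, and if it was running the MBM
 * overflow or limbo worker for its L3 domain, move that work to another
 * CPU in the domain.
 */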
4199 void resctrl_offline_cpu(unsigned int cpu)
4200 {
4201 struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
4202 struct rdt_mon_domain *d;
4203 struct rdtgroup *rdtgrp;
4204
4205 mutex_lock(&rdtgroup_mutex);
4206 list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
4207 if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
4208 clear_childcpus(rdtgrp, cpu);
4209 break;
4210 }
4211 }
4212
4213 if (!l3->mon_capable)
4214 goto out_unlock;
4215
4216 d = get_mon_domain_from_cpu(cpu, l3);
4217 if (d) {
4218 if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
4219 cancel_delayed_work(&d->mbm_over);
4220 mbm_setup_overflow_handler(d, 0, cpu);
4221 }
4222 if (resctrl_arch_is_llc_occupancy_enabled() &&
4223 cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
4224 cancel_delayed_work(&d->cqm_limbo);
4225 cqm_setup_limbo_handler(d, 0, cpu);
4226 }
4227 }
4228
4229 out_unlock:
4230 mutex_unlock(&rdtgroup_mutex);
4231 }
4232
4233 /*
4234 * resctrl_init - resctrl filesystem initialization
4235 *
4236 * Set up the resctrl filesystem: set up the root, create the mount point,
4237 * register the resctrl filesystem type, and initialize files under the root directory.
4238 *
4239 * Return: 0 on success or -errno
4240 */
4241 int resctrl_init(void)
4242 {
4243 int ret = 0;
4244
4245 seq_buf_init(&last_cmd_status, last_cmd_status_buf,
4246 sizeof(last_cmd_status_buf));
4247
4248 rdtgroup_setup_default();
4249
4250 thread_throttle_mode_init();
4251
4252 ret = resctrl_mon_resource_init();
4253 if (ret)
4254 return ret;
4255
4256 ret = sysfs_create_mount_point(fs_kobj, "resctrl");
4257 if (ret) {
4258 resctrl_mon_resource_exit();
4259 return ret;
4260 }
4261
4262 ret = register_filesystem(&rdt_fs_type);
4263 if (ret)
4264 goto cleanup_mountpoint;
4265
4266 /*
4267 * Adding the resctrl debugfs directory here may not be ideal since
4268 * it would let the resctrl debugfs directory appear on the debugfs
4269 * filesystem before the resctrl filesystem is mounted.
4270 * It may also be ok since that would enable debugging of RDT before
4271 * resctrl is mounted.
4272 * The reason why the debugfs directory is created here and not in
4273 * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
4274 * during the debugfs directory creation also &sb->s_type->i_mutex_key
4275 * (the lockdep class of inode->i_rwsem). Other filesystem
4276 * interactions (eg. SyS_getdents) have the lock ordering:
4277 * &sb->s_type->i_mutex_key --> &mm->mmap_lock
4278 * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
4279 * is taken, thus creating dependency:
4280 * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
4281 * issues considering the other two lock dependencies.
4282 * By creating the debugfs directory here we avoid a dependency
4283 * that may cause deadlock (even though file operations cannot
4284 * occur until the filesystem is mounted, there is no obvious way
4285 * to tell lockdep that).
4286 */
4287 debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
4288
4289 return 0;
4290
4291 cleanup_mountpoint:
4292 sysfs_remove_mount_point(fs_kobj, "resctrl");
4293 resctrl_mon_resource_exit();
4294
4295 return ret;
4296 }
4297
4298 static bool resctrl_online_domains_exist(void)
4299 {
4300 struct rdt_resource *r;
4301
4302 /*
4303 * Only walk capable resources to allow resctrl_arch_get_resource()
4304 * to return dummy 'not capable' resources.
4305 */
4306 for_each_alloc_capable_rdt_resource(r) {
4307 if (!list_empty(&r->ctrl_domains))
4308 return true;
4309 }
4310
4311 for_each_mon_capable_rdt_resource(r) {
4312 if (!list_empty(&r->mon_domains))
4313 return true;
4314 }
4315
4316 return false;
4317 }
4318
4319 /**
4320 * resctrl_exit() - Remove the resctrl filesystem and free resources.
4321 *
4322 * Called by the architecture code in response to a fatal error.
4323 * Removes resctrl files and structures from kernfs to prevent further
4324 * configuration.
4325 *
4326 * When called by the architecture code, all CPUs and resctrl domains must be
4327 * offline. This ensures the limbo and overflow handlers are not scheduled to
4328 * run, meaning the data structures they access can be freed by
4329 * resctrl_mon_resource_exit().
4330 *
4331 * After resctrl_exit() returns, the architecture code should return an
4332 * error from all resctrl_arch_ functions that can do this.
4333 * resctrl_arch_get_resource() must continue to return struct rdt_resources
4334 * with the correct rid field to ensure the filesystem can be unmounted.
4335 */
4336 void resctrl_exit(void)
4337 {
4338 cpus_read_lock();
4339 WARN_ON_ONCE(resctrl_online_domains_exist());
4340
4341 mutex_lock(&rdtgroup_mutex);
4342 resctrl_fs_teardown();
4343 mutex_unlock(&rdtgroup_mutex);
4344
4345 cpus_read_unlock();
4346
4347 debugfs_remove_recursive(debugfs_resctrl);
4348 debugfs_resctrl = NULL;
4349 unregister_filesystem(&rdt_fs_type);
4350
4351 /*
4352 * Do not remove the sysfs mount point added by resctrl_init() so that
4353 * it can be used to umount resctrl.
4354 */
4355
4356 resctrl_mon_resource_exit();
4357 }
4358