// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <reinette.chatre@intel.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/cacheflush.h>
#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

/*
 * The bits needed to disable hardware prefetching vary based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/*
 * Major number assigned to and shared by all devices exposing
 * pseudo-locked regions.
 */
static unsigned int pseudo_lock_major;
static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);

static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
{
	const struct rdtgroup *rdtgrp;

	rdtgrp = dev_get_drvdata(dev);
	if (mode)
		*mode = 0600;
	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
}

static const struct class pseudo_lock_class = {
	.name = "pseudo_lock",
	.devnode = pseudo_lock_devnode,
};

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under a variety of load conditions
 * as well as that these pseudo-locked regions can maintain their low cache
 * miss rates under a variety of load conditions for significant lengths of
 * time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency()
 *
 * Return:
 * If platform is supported, the bits to disable hardware prefetchers, 0
 * if platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0     L2 Hardware Prefetcher Disable (R/W)
		 * 1     Reserved
		 * 2     DCU Hardware Prefetcher Disable (R/W)
		 * 63:3  Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * pseudo_lock_minor_get - Obtain available minor number
 * @minor: Pointer to where new minor number will be stored
 *
 * A bitmask is used to track available minor numbers. Here the next free
 * minor number is marked as unavailable and returned.
 *
 * Return: 0 on success, <0 on failure.
 */
static int pseudo_lock_minor_get(unsigned int *minor)
{
	unsigned long first_bit;

	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);

	if (first_bit == MINORBITS)
		return -ENOSPC;

	__clear_bit(first_bit, &pseudo_lock_minor_avail);
	*minor = first_bit;

	return 0;
}

/**
 * pseudo_lock_minor_release - Return minor number to available
 * @minor: The minor number made available
 */
static void pseudo_lock_minor_release(unsigned int minor)
{
	__set_bit(minor, &pseudo_lock_minor_avail);
}

/**
 * region_find_by_minor - Locate a pseudo-lock region by inode minor number
 * @minor: The minor number of the device representing pseudo-locked region
 *
 * When the character device is accessed we need to determine which
 * pseudo-locked region it belongs to. This is done by matching the minor
 * number of the device to the pseudo-locked region to which it belongs.
 *
 * Minor numbers are assigned at the time a pseudo-locked region is associated
 * with a cache instance.
 *
 * Return: On success return pointer to resource group owning the pseudo-locked
 * region, NULL on failure.
 */
static struct rdtgroup *region_find_by_minor(unsigned int minor)
{
	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;

	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
			rdtgrp_match = rdtgrp;
			break;
		}
	}
	return rdtgrp_match;
}

/**
 * struct pseudo_lock_pm_req - A power management QoS request list entry
 * @list:	Entry within the @pm_reqs list for a pseudo-locked region
 * @req:	PM QoS request
 */
struct pseudo_lock_pm_req {
	struct list_head list;
	struct dev_pm_qos_request req;
};

static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
{
	struct pseudo_lock_pm_req *pm_req, *next;

	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
		dev_pm_qos_remove_request(&pm_req->req);
		list_del(&pm_req->list);
		kfree(pm_req);
	}
}

/**
 * pseudo_lock_cstates_constrain - Restrict cores from entering C6
 * @plr: Pseudo-locked region
 *
 * To prevent the cache from being affected by power management, entering
 * C6 has to be avoided. This is accomplished by requesting a latency
 * requirement lower than the lowest C6 exit latency of all supported
 * platforms as found in the cpuidle state tables in the intel_idle driver.
 * At this time it is possible to do so with a single latency requirement
 * for all supported platforms.
 *
 * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
 * the ACPI latencies need to be considered while keeping in mind that C2
 * may be set to map to deeper sleep states. In this case the latency
 * requirement needs to prevent entering C2 also.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
{
	struct pseudo_lock_pm_req *pm_req;
	int cpu;
	int ret;

	for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
		if (!pm_req) {
			rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
			ret = -ENOMEM;
			goto out_err;
		}
		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
					     &pm_req->req,
					     DEV_PM_QOS_RESUME_LATENCY,
					     30);
		if (ret < 0) {
			rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
					    cpu);
			kfree(pm_req);
			ret = -1;
			goto out_err;
		}
		list_add(&pm_req->list, &plr->pm_reqs);
	}

	return 0;

out_err:
	pseudo_lock_cstates_relax(plr);
	return ret;
}

/**
 * pseudo_lock_region_clear - Reset pseudo-lock region data
 * @plr: pseudo-lock region
 *
 * All content of the pseudo-locked region is reset - any memory allocated
 * is freed.
 *
 * Return: void
 */
static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
{
	plr->size = 0;
	plr->line_size = 0;
	kfree(plr->kmem);
	plr->kmem = NULL;
	plr->s = NULL;
	if (plr->d)
		plr->d->plr = NULL;
	plr->d = NULL;
	plr->cbm = 0;
	plr->debugfs_dir = NULL;
}

/**
 * pseudo_lock_region_init - Initialize pseudo-lock region information
 * @plr: pseudo-lock region
 *
 * Called after user provided a schemata to be pseudo-locked. From the
 * schemata the &struct pseudo_lock_region is on entry already initialized
 * with the resource, domain, and capacity bitmask. Here the information
 * required for pseudo-locking is deduced from this data and &struct
 * pseudo_lock_region initialized further. This information includes:
 * - size in bytes of the region to be pseudo-locked
 * - cache line size to know the stride with which data needs to be accessed
 *   to be pseudo-locked
 * - a cpu associated with the cache instance on which the pseudo-locking
 *   flow can be executed
 *
 * Return: 0 on success, <0 on failure. Descriptive error will be written
 * to last_cmd_status buffer.
 */
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
	enum resctrl_scope scope = plr->s->res->ctrl_scope;
	struct cacheinfo *ci;
	int ret;

	if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
		return -ENODEV;

	/* Pick the first cpu we find that is associated with the cache. */
	plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);

	if (!cpu_online(plr->cpu)) {
		rdt_last_cmd_printf("CPU %u associated with cache not online\n",
				    plr->cpu);
		ret = -ENODEV;
		goto out_region;
	}

	ci = get_cpu_cacheinfo_level(plr->cpu, scope);
	if (ci) {
		plr->line_size = ci->coherency_line_size;
		plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
		return 0;
	}

	ret = -1;
	rdt_last_cmd_puts("Unable to determine cache line size\n");
out_region:
	pseudo_lock_region_clear(plr);
	return ret;
}

/**
 * pseudo_lock_init - Initialize a pseudo-lock region
 * @rdtgrp: resource group to which new pseudo-locked region will belong
 *
 * A pseudo-locked region is associated with a resource group. When this
 * association is created the pseudo-locked region is initialized. The
 * details of the pseudo-locked region are not known at this time so only
 * allocation is done and association established.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_init(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr;

	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
	if (!plr)
		return -ENOMEM;

	init_waitqueue_head(&plr->lock_thread_wq);
	INIT_LIST_HEAD(&plr->pm_reqs);
	rdtgrp->plr = plr;
	return 0;
}

/**
 * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
 * @plr: pseudo-lock region
 *
 * Initialize the details required to set up the pseudo-locked region and
 * allocate the contiguous memory that will be pseudo-locked to the cache.
 *
 * Return: 0 on success, <0 on failure. Descriptive error will be written
 * to last_cmd_status buffer.
 */
static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
{
	int ret;

	ret = pseudo_lock_region_init(plr);
	if (ret < 0)
		return ret;

	/*
	 * We do not yet support contiguous regions larger than
	 * KMALLOC_MAX_SIZE.
	 */
	if (plr->size > KMALLOC_MAX_SIZE) {
		rdt_last_cmd_puts("Requested region exceeds maximum size\n");
		ret = -E2BIG;
		goto out_region;
	}

	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
	if (!plr->kmem) {
		rdt_last_cmd_puts("Unable to allocate memory\n");
		ret = -ENOMEM;
		goto out_region;
	}

	ret = 0;
	goto out;
out_region:
	pseudo_lock_region_clear(plr);
out:
	return ret;
}

/**
 * pseudo_lock_free - Free a pseudo-locked region
 * @rdtgrp: resource group to which pseudo-locked region belonged
 *
 * The pseudo-locked region's resources have already been released, or not
 * yet created at this point. Now it can be freed and disassociated from the
 * resource group.
 *
 * Return: void
 */
static void pseudo_lock_free(struct rdtgroup *rdtgrp)
{
	pseudo_lock_region_clear(rdtgrp->plr);
	kfree(rdtgrp->plr);
	rdtgrp->plr = NULL;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while the core is running
 * with class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in the loop below from outside of the
	 * pseudo-locked region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase likelihood that allocated cache portion will be filled
	 * with associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * we ensure that we will not be preempted during this critical
	 * section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL);
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	__wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	__wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * rdtgroup_monitor_in_progress - Test if monitoring in progress
 * @rdtgrp: resource group being queried
 *
 * Return: 1 if monitor groups have been created for this resource
 * group, 0 otherwise.
 */
static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
{
	return !list_empty(&rdtgrp->mon.crdtgrp_list);
}

/**
 * rdtgroup_locksetup_user_restrict - Restrict user access to group
 * @rdtgrp: resource group needing access restricted
 *
 * A resource group used for cache pseudo-locking cannot have cpus or tasks
 * assigned to it. This is communicated to the user by restricting access
 * to all the files that can be used to make such changes.
 *
 * Permissions restored with rdtgroup_locksetup_user_restore()
 *
 * Return: 0 on success, <0 on failure. If a failure occurs during the
 * restriction of access an attempt will be made to restore permissions but
 * the state of the mode of these files will be uncertain when a failure
 * occurs.
 */
static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
{
	int ret;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
	if (ret)
		return ret;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
	if (ret)
		goto err_tasks;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
	if (ret)
		goto err_cpus;

	if (resctrl_arch_mon_capable()) {
		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
		if (ret)
			goto err_cpus_list;
	}

	ret = 0;
	goto out;

err_cpus_list:
	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
err_cpus:
	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
err_tasks:
	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
out:
	return ret;
}

/**
 * rdtgroup_locksetup_user_restore - Restore user access to group
 * @rdtgrp: resource group needing access restored
 *
 * Restore all file access previously removed using
 * rdtgroup_locksetup_user_restrict()
 *
 * Return: 0 on success, <0 on failure. If a failure occurs during the
 * restoration of access an attempt will be made to restrict permissions
 * again but the state of the mode of these files will be uncertain when
 * a failure occurs.
 */
static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
{
	int ret;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
	if (ret)
		return ret;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
	if (ret)
		goto err_tasks;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
	if (ret)
		goto err_cpus;

	if (resctrl_arch_mon_capable()) {
		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
		if (ret)
			goto err_cpus_list;
	}

	ret = 0;
	goto out;

err_cpus_list:
	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
err_cpus:
	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
err_tasks:
	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
out:
	return ret;
}

/**
 * rdtgroup_locksetup_enter - Resource group enters locksetup mode
 * @rdtgrp: resource group requested to enter locksetup mode
 *
 * A resource group enters locksetup mode to reflect that it would be used
 * to represent a pseudo-locked region and is in the process of being set
 * up to do so. A resource group used for a pseudo-locked region would
 * lose the closid associated with it so we cannot allow it to have any
 * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
 * future. Monitoring of a pseudo-locked region is not allowed either.
 *
 * The above and more restrictions on a pseudo-locked region are checked
 * for and enforced before the resource group enters the locksetup mode.
 *
 * Returns: 0 if the resource group successfully entered locksetup mode, <0
 * on failure. On failure the last_cmd_status buffer is updated with text to
 * communicate details of failure to the user.
 */
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
{
	int ret;

	/*
	 * The default resource group can neither be removed nor lose the
	 * default closid associated with it.
	 */
	if (rdtgrp == &rdtgroup_default) {
		rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
		return -EINVAL;
	}

	/*
	 * Cache Pseudo-locking not supported when CDP is enabled.
	 *
	 * Some things to consider if you would like to enable this
	 * support (using L3 CDP as example):
	 * - When CDP is enabled two separate resources are exposed,
	 *   L3DATA and L3CODE, but they are actually on the same cache.
	 *   The implication for pseudo-locking is that if a
	 *   pseudo-locked region is created on a domain of one
	 *   resource (eg. L3CODE), then a pseudo-locked region cannot
	 *   be created on that same domain of the other resource
	 *   (eg. L3DATA). This is because the creation of a
	 *   pseudo-locked region involves a call to wbinvd that will
	 *   affect all cache allocations on the particular domain.
	 * - Considering the previous, it may be possible to only
	 *   expose one of the CDP resources to pseudo-locking and
	 *   hide the other. For example, we could consider only
	 *   exposing L3DATA and since the L3 cache is unified it is
	 *   still possible to place instructions there and execute them.
	 * - If only one region is exposed to pseudo-locking we should
	 *   still keep in mind that availability of a portion of cache
	 *   for pseudo-locking should take into account both resources.
	 *   Similarly, if a pseudo-locked region is created in one
	 *   resource, the portion of cache used by it should be made
	 *   unavailable to all future allocations from both resources.
	 */
	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
		rdt_last_cmd_puts("CDP enabled\n");
		return -EINVAL;
	}

	/*
	 * Not knowing the bits to disable prefetching implies that this
	 * platform does not support Cache Pseudo-Locking.
	 */
	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
		rdt_last_cmd_puts("Pseudo-locking not supported\n");
		return -EINVAL;
	}

	if (rdtgroup_monitor_in_progress(rdtgrp)) {
		rdt_last_cmd_puts("Monitoring in progress\n");
		return -EINVAL;
	}

	if (rdtgroup_tasks_assigned(rdtgrp)) {
		rdt_last_cmd_puts("Tasks assigned to resource group\n");
		return -EINVAL;
	}

	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
		rdt_last_cmd_puts("CPUs assigned to resource group\n");
		return -EINVAL;
	}

	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
		return -EIO;
	}

	ret = pseudo_lock_init(rdtgrp);
	if (ret) {
		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
		goto out_release;
	}

	/*
	 * If this system is capable of monitoring, an RMID would have been
	 * allocated when the control group was created. It is not needed
	 * anymore once this group is used for pseudo-locking. This is safe
	 * to call on platforms not capable of monitoring.
	 */
	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);

	ret = 0;
	goto out;

out_release:
	rdtgroup_locksetup_user_restore(rdtgrp);
out:
	return ret;
}
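
/*
 * For reference, a possible user space flow that exercises
 * rdtgroup_locksetup_enter() above and rdtgroup_pseudo_lock_create()
 * below could look as follows (the mount point, group name "newlock"
 * and the L2 schemata are illustrative only):
 *
 *	mkdir /sys/fs/resctrl/newlock
 *	echo pseudo-locksetup > /sys/fs/resctrl/newlock/mode
 *	echo 'L2:1=0x3' > /sys/fs/resctrl/newlock/schemata
 *	cat /sys/fs/resctrl/newlock/mode
 *	pseudo-locked
 *
 * Writing "pseudo-locksetup" to the mode file enters locksetup mode and
 * writing the schemata of a group in that mode triggers the creation of
 * the pseudo-locked region.
 */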

/**
 * rdtgroup_locksetup_exit - Resource group exits locksetup mode
 * @rdtgrp: resource group
 *
 * When a resource group exits locksetup mode the earlier restrictions are
 * lifted.
 *
 * Return: 0 on success, <0 on failure
 */
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{
	int ret;

	if (resctrl_arch_mon_capable()) {
		ret = alloc_rmid(rdtgrp->closid);
		if (ret < 0) {
			rdt_last_cmd_puts("Out of RMIDs\n");
			return ret;
		}
		rdtgrp->mon.rmid = ret;
	}

	ret = rdtgroup_locksetup_user_restore(rdtgrp);
	if (ret) {
		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
		return ret;
	}

	pseudo_lock_free(rdtgrp);
	return 0;
}

/**
 * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
 * @d: RDT domain
 * @cbm: CBM to test
 *
 * @d represents a cache instance and @cbm a capacity bitmask that is
 * considered for it. Determine if @cbm overlaps with any existing
 * pseudo-locked region on @d.
 *
 * @cbm is unsigned long, even if only 32 bits are used, to make the
 * bitmap functions work correctly.
 *
 * Return: true if @cbm overlaps with pseudo-locked region on @d, false
 * otherwise.
 */
bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
{
	unsigned int cbm_len;
	unsigned long cbm_b;

	if (d->plr) {
		cbm_len = d->plr->s->res->cache.cbm_len;
		cbm_b = d->plr->cbm;
		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
			return true;
	}
	return false;
}

/**
 * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
 * @d: RDT domain under test
 *
 * The setup of a pseudo-locked region affects all cache instances within
 * the hierarchy of the region. It is thus essential to know if any
 * pseudo-locked regions exist within a cache hierarchy to prevent any
 * attempts to create new pseudo-locked regions in the same hierarchy.
 *
 * Return: true if a pseudo-locked region exists in the hierarchy of @d or
 *	   if it is not possible to test due to a memory allocation issue,
 *	   false otherwise.
 */
bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
{
	struct rdt_ctrl_domain *d_i;
	cpumask_var_t cpu_with_psl;
	struct rdt_resource *r;
	bool ret = false;

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
		return true;

	/*
	 * First determine which cpus have pseudo-locked regions
	 * associated with them.
	 */
	for_each_alloc_capable_rdt_resource(r) {
		list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
			if (d_i->plr)
				cpumask_or(cpu_with_psl, cpu_with_psl,
					   &d_i->hdr.cpu_mask);
		}
	}

	/*
	 * Next test if new pseudo-locked region would intersect with
	 * existing region.
	 */
	if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
		ret = true;

	free_cpumask_var(cpu_with_psl);
	return ret;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory; the speed of
 * access is a good way to learn how close to the cpu the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check any possible error state of events used by performing
	 * one local read.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);

	/* Initialize rest of local variables */
	/*
	 * Performance event has been validated right before this with
	 * interrupts disabled - it is thus safe to read the counter index.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read counter variables twice - first to load the instructions
	 * used in L1 cache, second to capture an accurate value that does
	 * not include cache misses incurred because of instruction loads.
	 */
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	/*
	 * From SDM: performing back-to-back fast reads is not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	rdpmcl(hit_pmcnum, hits_after);
	rdpmcl(miss_pmcnum, miss_after);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after = miss_after;
	counts->hits_after = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT   02H
	 *     L2_MISS  10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *	REFERENCE 4FH
	 *	MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "hits" counter is a bit more than the
		 * references, for example, x references but x + 1 hits.
		 * To not report invalid hit values in this case we treat
		 * that as misses equal to references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
 * @rdtgrp: Resource group to which the pseudo-locked region belongs.
 * @sel: Selector of which measurement to perform on a pseudo-locked region.
 *
 * The measurement of latency to access a pseudo-locked region should be
 * done from a cpu that is associated with that pseudo-locked region.
 * Determine which cpu is associated with this region and start a thread on
 * that cpu to perform the measurement, then wait for that thread to complete.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;
	struct task_struct *thread;
	unsigned int cpu;
	int ret = -1;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	if (rdtgrp->flags & RDT_DELETED) {
		ret = -ENODEV;
		goto out;
	}

	if (!plr->d) {
		ret = -ENODEV;
		goto out;
	}

	plr->thread_done = 0;
	cpu = cpumask_first(&plr->d->hdr.cpu_mask);
	if (!cpu_online(cpu)) {
		ret = -ENODEV;
		goto out;
	}

	plr->cpu = cpu;

	if (sel == 1)
		thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
					    plr, cpu, "pseudo_lock_measure/%u");
	else if (sel == 2)
		thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
					    plr, cpu, "pseudo_lock_measure/%u");
	else if (sel == 3)
		thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
					    plr, cpu, "pseudo_lock_measure/%u");
	else
		goto out;

	if (IS_ERR(thread)) {
		ret = PTR_ERR(thread);
		goto out;
	}

	ret = wait_event_interruptible(plr->lock_thread_wq,
				       plr->thread_done == 1);
	if (ret < 0)
		goto out;

	ret = 0;

out:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
	return ret;
}

static ssize_t pseudo_lock_measure_trigger(struct file *file,
					   const char __user *user_buf,
					   size_t count, loff_t *ppos)
{
	struct rdtgroup *rdtgrp = file->private_data;
	size_t buf_size;
	char buf[32];
	int ret;
	int sel;

	buf_size = min(count, (sizeof(buf) - 1));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;

	buf[buf_size] = '\0';
	ret = kstrtoint(buf, 10, &sel);
	if (ret == 0) {
		if (sel != 1 && sel != 2 && sel != 3)
			return -EINVAL;
		ret = debugfs_file_get(file->f_path.dentry);
		if (ret)
			return ret;
		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
		if (ret == 0)
			ret = count;
		debugfs_file_put(file->f_path.dentry);
	}

	return ret;
}

static const struct file_operations pseudo_measure_fops = {
	.write = pseudo_lock_measure_trigger,
	.open = simple_open,
	.llseek = default_llseek,
};
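
/*
 * A measurement is triggered by writing the selector to the
 * "pseudo_lock_measure" debugfs file created for the resource group in
 * rdtgroup_pseudo_lock_create() below. For example (the group name
 * "newlock" is illustrative, assuming debugfs is mounted at
 * /sys/kernel/debug):
 *
 *	echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
 *
 * Selector 1 runs the latency measurement, 2 the L2 residency
 * measurement and 3 the L3 residency measurement. Results are reported
 * via the pseudo_lock_mem_latency, pseudo_lock_l2 and pseudo_lock_l3
 * tracepoints.
 */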

/**
 * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
 * @rdtgrp: resource group to which pseudo-lock region belongs
 *
 * Called when a resource group in the pseudo-locksetup mode receives a
 * valid schemata that should be pseudo-locked. Since the resource group is
 * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
 * allocated and initialized with the essential information. If a failure
 * occurs the resource group remains in the pseudo-locksetup mode with the
 * &struct pseudo_lock_region associated with it, but cleared from all
 * information and ready for the user to re-attempt pseudo-locking by
 * writing the schemata again.
 *
 * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
 * on failure. Descriptive error will be written to last_cmd_status buffer.
 */
int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;
	struct task_struct *thread;
	unsigned int new_minor;
	struct device *dev;
	int ret;

	ret = pseudo_lock_region_alloc(plr);
	if (ret < 0)
		return ret;

	ret = pseudo_lock_cstates_constrain(plr);
	if (ret < 0) {
		ret = -EINVAL;
		goto out_region;
	}

	plr->thread_done = 0;

	thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
				    plr->cpu, "pseudo_lock/%u");
	if (IS_ERR(thread)) {
		ret = PTR_ERR(thread);
		rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
		goto out_cstates;
	}

	ret = wait_event_interruptible(plr->lock_thread_wq,
				       plr->thread_done == 1);
	if (ret < 0) {
		/*
		 * If the thread does not get on the CPU for whatever
		 * reason and the process which sets up the region is
		 * interrupted then this will leave the thread in runnable
		 * state and once it gets on the CPU it will dereference
		 * the cleared, but not freed, plr struct resulting in an
		 * empty pseudo-locking loop.
		 */
		rdt_last_cmd_puts("Locking thread interrupted\n");
		goto out_cstates;
	}

	ret = pseudo_lock_minor_get(&new_minor);
	if (ret < 0) {
		rdt_last_cmd_puts("Unable to obtain a new minor number\n");
		goto out_cstates;
	}

	/*
	 * Unlock access but do not release the reference. The
	 * pseudo-locked region will still be here on return.
	 *
	 * The mutex has to be released temporarily to avoid a potential
	 * deadlock with the mm->mmap_lock which is obtained in the
	 * device_create() and debugfs_create_dir() callpath below as well as
	 * before the mmap() callback is called.
	 */
	mutex_unlock(&rdtgroup_mutex);

	if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
		plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
						      debugfs_resctrl);
		if (!IS_ERR_OR_NULL(plr->debugfs_dir))
			debugfs_create_file("pseudo_lock_measure", 0200,
					    plr->debugfs_dir, rdtgrp,
					    &pseudo_measure_fops);
	}

	dev = device_create(&pseudo_lock_class, NULL,
			    MKDEV(pseudo_lock_major, new_minor),
			    rdtgrp, "%s", rdtgrp->kn->name);

	mutex_lock(&rdtgroup_mutex);

	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		rdt_last_cmd_printf("Failed to create character device: %d\n",
				    ret);
		goto out_debugfs;
	}

	/* We released the mutex - check if group was removed while we did so */
	if (rdtgrp->flags & RDT_DELETED) {
		ret = -ENODEV;
		goto out_device;
	}

	plr->minor = new_minor;

	rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
	closid_free(rdtgrp->closid);
	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);

	ret = 0;
	goto out;

out_device:
	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
out_debugfs:
	debugfs_remove_recursive(plr->debugfs_dir);
	pseudo_lock_minor_release(new_minor);
out_cstates:
	pseudo_lock_cstates_relax(plr);
out_region:
	pseudo_lock_region_clear(plr);
out:
	return ret;
}

/**
 * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
 * @rdtgrp: resource group to which the pseudo-locked region belongs
 *
 * The removal of a pseudo-locked region can be initiated when the resource
 * group is removed via a "rmdir" from user space or the unmount of the
 * resctrl filesystem. On removal the resource group does not go back to
 * pseudo-locksetup mode before it is removed, instead it is removed
 * directly. There is thus asymmetry with the creation where the
 * &struct pseudo_lock_region is removed here while it was not created in
 * rdtgroup_pseudo_lock_create().
 *
 * Return: void
 */
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;

	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
		/*
		 * Default group cannot be a pseudo-locked region so we can
		 * free closid here.
		 */
		closid_free(rdtgrp->closid);
		goto free;
	}

	pseudo_lock_cstates_relax(plr);
	debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
	pseudo_lock_minor_release(plr->minor);

free:
	pseudo_lock_free(rdtgrp);
}
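
/*
 * The pseudo-locked memory is exposed to user space as a character
 * device named after the resource group (see pseudo_lock_devnode()).
 * A sketch of the expected application flow, with error handling
 * omitted and the group name "newlock" and CPU number purely
 * illustrative:
 *
 *	cpu_set_t cpuset;
 *
 *	CPU_ZERO(&cpuset);
 *	CPU_SET(2, &cpuset);	// a CPU of the locked cache instance
 *	sched_setaffinity(0, sizeof(cpuset), &cpuset);
 *	fd = open("/dev/pseudo_lock/newlock", O_RDWR);
 *	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The affinity and MAP_SHARED requirements are enforced by
 * pseudo_lock_dev_mmap() below.
 */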

static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
{
	struct rdtgroup *rdtgrp;

	mutex_lock(&rdtgroup_mutex);

	rdtgrp = region_find_by_minor(iminor(inode));
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	filp->private_data = rdtgrp;
	atomic_inc(&rdtgrp->waitcount);
	/* Perform a non-seekable open - llseek is not supported */
	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	mutex_unlock(&rdtgroup_mutex);

	return 0;
}

static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
{
	struct rdtgroup *rdtgrp;

	mutex_lock(&rdtgroup_mutex);
	rdtgrp = filp->private_data;
	WARN_ON(!rdtgrp);
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}
	filp->private_data = NULL;
	atomic_dec(&rdtgrp->waitcount);
	mutex_unlock(&rdtgroup_mutex);
	return 0;
}

static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
{
	/* Not supported */
	return -EINVAL;
}

static const struct vm_operations_struct pseudo_mmap_ops = {
	.mremap = pseudo_lock_dev_mremap,
};

static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long vsize = vma->vm_end - vma->vm_start;
	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
	struct pseudo_lock_region *plr;
	struct rdtgroup *rdtgrp;
	unsigned long physical;
	unsigned long psize;

	mutex_lock(&rdtgroup_mutex);

	rdtgrp = filp->private_data;
	WARN_ON(!rdtgrp);
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	plr = rdtgrp->plr;

	if (!plr->d) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	/*
	 * The task is required to run with affinity to the cpus associated
	 * with the pseudo-locked region. If this is not the case the task
	 * may be scheduled elsewhere and invalidate entries in the
	 * pseudo-locked region.
	 */
	if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EINVAL;
	}

	physical = __pa(plr->kmem) >> PAGE_SHIFT;
	psize = plr->size - off;

	if (off > plr->size) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENOSPC;
	}

	/*
	 * Ensure changes are carried directly to the memory being mapped,
	 * do not allow copy-on-write mapping.
	 */
	if (!(vma->vm_flags & VM_SHARED)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EINVAL;
	}

	if (vsize > psize) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENOSPC;
	}

	memset(plr->kmem + off, 0, vsize);

	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
			    vsize, vma->vm_page_prot)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EAGAIN;
	}
	vma->vm_ops = &pseudo_mmap_ops;
	mutex_unlock(&rdtgroup_mutex);
	return 0;
}

static const struct file_operations pseudo_lock_dev_fops = {
	.owner =	THIS_MODULE,
	.read =		NULL,
	.write =	NULL,
	.open =		pseudo_lock_dev_open,
	.release =	pseudo_lock_dev_release,
	.mmap =		pseudo_lock_dev_mmap,
};

int rdt_pseudo_lock_init(void)
{
	int ret;

	ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
	if (ret < 0)
		return ret;

	pseudo_lock_major = ret;

	ret = class_register(&pseudo_lock_class);
	if (ret) {
		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
		return ret;
	}

	return 0;
}

void rdt_pseudo_lock_release(void)
{
	class_unregister(&pseudo_lock_class);
	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
	pseudo_lock_major = 0;
}