// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <reinette.chatre@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/cacheflush.h>
#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

/*
 * The bits needed to disable hardware prefetching vary based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/*
 * Major number assigned to and shared by all devices exposing
 * pseudo-locked regions.
 */
static unsigned int pseudo_lock_major;
static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);

static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
{
	const struct rdtgroup *rdtgrp;

	rdtgrp = dev_get_drvdata(dev);
	if (mode)
		*mode = 0600;
	guard(mutex)(&rdtgroup_mutex);
	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
}

static const struct class pseudo_lock_class = {
	.name = "pseudo_lock",
	.devnode = pseudo_lock_devnode,
};

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under a variety of load conditions
 * as well as that these pseudo-locked regions can maintain their low cache
 * miss rates under a variety of load conditions for significant lengths of
 * time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency()
 *
 * Return:
 * If platform is supported, the bits to disable hardware prefetchers, 0
 * if platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    Reserved
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 63:3 Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * pseudo_lock_minor_get - Obtain available minor number
 * @minor: Pointer to where new minor number will be stored
 *
 * A bitmask is used to track available minor numbers. Here the next free
 * minor number is marked as unavailable and returned.
 *
 * Return: 0 on success, <0 on failure.
 */
static int pseudo_lock_minor_get(unsigned int *minor)
{
	unsigned long first_bit;

	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);

	if (first_bit == MINORBITS)
		return -ENOSPC;

	__clear_bit(first_bit, &pseudo_lock_minor_avail);
	*minor = first_bit;

	return 0;
}

/**
 * pseudo_lock_minor_release - Return minor number to available
 * @minor: The minor number made available
 */
static void pseudo_lock_minor_release(unsigned int minor)
{
	__set_bit(minor, &pseudo_lock_minor_avail);
}

/**
 * region_find_by_minor - Locate a pseudo-lock region by inode minor number
 * @minor: The minor number of the device representing the pseudo-locked region
 *
 * When the character device is accessed we need to determine which
 * pseudo-locked region it belongs to. This is done by matching the minor
 * number of the device to the pseudo-locked region it belongs to.
 *
 * Minor numbers are assigned at the time a pseudo-locked region is associated
 * with a cache instance.
 *
 * Return: On success return pointer to resource group owning the pseudo-locked
 * region, NULL on failure.
 */
static struct rdtgroup *region_find_by_minor(unsigned int minor)
{
	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;

	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
			rdtgrp_match = rdtgrp;
			break;
		}
	}
	return rdtgrp_match;
}

/**
 * struct pseudo_lock_pm_req - A power management QoS request list entry
 * @list: Entry within the @pm_reqs list for a pseudo-locked region
 * @req:  PM QoS request
 */
struct pseudo_lock_pm_req {
	struct list_head list;
	struct dev_pm_qos_request req;
};

static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
{
	struct pseudo_lock_pm_req *pm_req, *next;

	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
		dev_pm_qos_remove_request(&pm_req->req);
		list_del(&pm_req->list);
		kfree(pm_req);
	}
}

/**
 * pseudo_lock_cstates_constrain - Restrict cores from entering C6
 * @plr: Pseudo-locked region
 *
 * To prevent the cache from being affected by power management, entering
 * C6 has to be avoided. This is accomplished by requesting a latency
 * requirement lower than the lowest C6 exit latency of all supported
 * platforms as found in the cpuidle state tables in the intel_idle driver.
 * At this time it is possible to do so with a single latency requirement
 * for all supported platforms.
 *
 * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
 * the ACPI latencies need to be considered while keeping in mind that C2
 * may be set to map to deeper sleep states. In this case the latency
 * requirement needs to prevent entering C2 also.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
{
	struct pseudo_lock_pm_req *pm_req;
	int cpu;
	int ret;

	for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
		if (!pm_req) {
			rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
			ret = -ENOMEM;
			goto out_err;
		}
		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
					     &pm_req->req,
					     DEV_PM_QOS_RESUME_LATENCY,
					     30);
		if (ret < 0) {
			rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
					    cpu);
			kfree(pm_req);
			ret = -1;
			goto out_err;
		}
		list_add(&pm_req->list, &plr->pm_reqs);
	}

	return 0;

out_err:
	pseudo_lock_cstates_relax(plr);
	return ret;
}

/**
 * pseudo_lock_region_clear - Reset pseudo-lock region data
 * @plr: pseudo-lock region
 *
 * All content of the pseudo-locked region is reset - any memory allocated
 * is freed.
 *
 * Return: void
 */
static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
{
	plr->size = 0;
	plr->line_size = 0;
	kfree(plr->kmem);
	plr->kmem = NULL;
	plr->s = NULL;
	if (plr->d)
		plr->d->plr = NULL;
	plr->d = NULL;
	plr->cbm = 0;
	plr->debugfs_dir = NULL;
}

/**
 * pseudo_lock_region_init - Initialize pseudo-lock region information
 * @plr: pseudo-lock region
 *
 * Called after user provided a schemata to be pseudo-locked. From the
 * schemata the &struct pseudo_lock_region is on entry already initialized
 * with the resource, domain, and capacity bitmask. Here the information
 * required for pseudo-locking is deduced from this data and &struct
 * pseudo_lock_region initialized further. This information includes:
 * - size in bytes of the region to be pseudo-locked
 * - cache line size to know the stride with which data needs to be accessed
 *   to be pseudo-locked
 * - a cpu associated with the cache instance on which the pseudo-locking
 *   flow can be executed
 *
 * Return: 0 on success, <0 on failure. Descriptive error will be written
 * to last_cmd_status buffer.
 */
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
	enum resctrl_scope scope = plr->s->res->ctrl_scope;
	struct cacheinfo *ci;
	int ret;

	if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
		return -ENODEV;

	/* Pick the first cpu we find that is associated with the cache. */
	plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);

	if (!cpu_online(plr->cpu)) {
		rdt_last_cmd_printf("CPU %u associated with cache not online\n",
				    plr->cpu);
		ret = -ENODEV;
		goto out_region;
	}

	ci = get_cpu_cacheinfo_level(plr->cpu, scope);
	if (ci) {
		plr->line_size = ci->coherency_line_size;
		plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
		return 0;
	}

	ret = -1;
	rdt_last_cmd_puts("Unable to determine cache line size\n");
out_region:
	pseudo_lock_region_clear(plr);
	return ret;
}

/**
 * pseudo_lock_init - Initialize a pseudo-lock region
 * @rdtgrp: resource group to which new pseudo-locked region will belong
 *
 * A pseudo-locked region is associated with a resource group. When this
 * association is created the pseudo-locked region is initialized. The
 * details of the pseudo-locked region are not known at this time so only
 * allocation is done and association established.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_init(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr;

	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
	if (!plr)
		return -ENOMEM;

	init_waitqueue_head(&plr->lock_thread_wq);
	INIT_LIST_HEAD(&plr->pm_reqs);
	rdtgrp->plr = plr;
	return 0;
}

/**
 * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
 * @plr: pseudo-lock region
 *
 * Initialize the details required to set up the pseudo-locked region and
 * allocate the contiguous memory that will be pseudo-locked to the cache.
 *
 * Return: 0 on success, <0 on failure. Descriptive error will be written
 * to last_cmd_status buffer.
 */
static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
{
	int ret;

	ret = pseudo_lock_region_init(plr);
	if (ret < 0)
		return ret;

	/*
	 * We do not yet support contiguous regions larger than
	 * KMALLOC_MAX_SIZE.
	 */
	if (plr->size > KMALLOC_MAX_SIZE) {
		rdt_last_cmd_puts("Requested region exceeds maximum size\n");
		ret = -E2BIG;
		goto out_region;
	}

	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
	if (!plr->kmem) {
		rdt_last_cmd_puts("Unable to allocate memory\n");
		ret = -ENOMEM;
		goto out_region;
	}

	ret = 0;
	goto out;
out_region:
	pseudo_lock_region_clear(plr);
out:
	return ret;
}

/**
 * pseudo_lock_free - Free a pseudo-locked region
 * @rdtgrp: resource group to which pseudo-locked region belonged
 *
 * The pseudo-locked region's resources have already been released, or not
 * yet created, at this point. Now it can be freed and disassociated from the
 * resource group.
 *
 * Return: void
 */
static void pseudo_lock_free(struct rdtgroup *rdtgrp)
{
	pseudo_lock_region_clear(rdtgrp->plr);
	kfree(rdtgrp->plr);
	rdtgrp->plr = NULL;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while the core is running
 * with its class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in below loop from outside of pseudo-locked
	 * region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase likelihood that allocated cache portion will be filled
	 * with associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * ensure that we will not be preempted during this critical section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL);
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	__wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	__wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * rdtgroup_monitor_in_progress - Test if monitoring in progress
 * @rdtgrp: resource group being queried
 *
 * Return: 1 if monitor groups have been created for this resource
 * group, 0 otherwise.
 */
static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
{
	return !list_empty(&rdtgrp->mon.crdtgrp_list);
}

/**
 * rdtgroup_locksetup_user_restrict - Restrict user access to group
 * @rdtgrp: resource group needing access restricted
 *
 * A resource group used for cache pseudo-locking cannot have cpus or tasks
 * assigned to it. This is communicated to the user by restricting access
 * to all the files that can be used to make such changes.
 *
 * Permissions restored with rdtgroup_locksetup_user_restore()
 *
 * Return: 0 on success, <0 on failure. If a failure occurs during the
 * restriction of access an attempt will be made to restore permissions but
 * the state of the mode of these files will be uncertain when a failure
 * occurs.
 */
static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
{
	int ret;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
	if (ret)
		return ret;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
	if (ret)
		goto err_tasks;

	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
	if (ret)
		goto err_cpus;

	if (resctrl_arch_mon_capable()) {
		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
		if (ret)
			goto err_cpus_list;
	}

	ret = 0;
	goto out;

err_cpus_list:
	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
err_cpus:
	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
err_tasks:
	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
out:
	return ret;
}

/**
 * rdtgroup_locksetup_user_restore - Restore user access to group
 * @rdtgrp: resource group needing access restored
 *
 * Restore all file access previously removed using
 * rdtgroup_locksetup_user_restrict()
 *
 * Return: 0 on success, <0 on failure. If a failure occurs during the
 * restoration of access an attempt will be made to restrict permissions
 * again but the state of the mode of these files will be uncertain when
 * a failure occurs.
 */
static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
{
	int ret;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
	if (ret)
		return ret;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
	if (ret)
		goto err_tasks;

	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
	if (ret)
		goto err_cpus;

	if (resctrl_arch_mon_capable()) {
		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
		if (ret)
			goto err_cpus_list;
	}

	ret = 0;
	goto out;

err_cpus_list:
	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
err_cpus:
	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
err_tasks:
	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
out:
	return ret;
}
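
/*
 * Illustrative sketch (not kernel code): the resctrl filesystem flow that
 * drives the locksetup/locking entry points below. The group name "p0" and
 * the schemata line are hypothetical examples; see the resctrl documentation
 * for the authoritative interface description.
 *
 *   mkdir /sys/fs/resctrl/p0
 *   echo pseudo-locksetup > /sys/fs/resctrl/p0/mode  # rdtgroup_locksetup_enter()
 *   echo "L2:1=0x3" > /sys/fs/resctrl/p0/schemata    # rdtgroup_pseudo_lock_create()
 *   cat /sys/fs/resctrl/p0/mode                      # now reports "pseudo-locked"
 */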

/**
 * rdtgroup_locksetup_enter - Resource group enters locksetup mode
 * @rdtgrp: resource group requested to enter locksetup mode
 *
 * A resource group enters locksetup mode to reflect that it would be used
 * to represent a pseudo-locked region and is in the process of being set
 * up to do so. A resource group used for a pseudo-locked region would
 * lose the closid associated with it so we cannot allow it to have any
 * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
 * future. Monitoring of a pseudo-locked region is not allowed either.
 *
 * The above and more restrictions on a pseudo-locked region are checked
 * for and enforced before the resource group enters the locksetup mode.
 *
 * Returns: 0 if the resource group successfully entered locksetup mode, <0
 * on failure. On failure the last_cmd_status buffer is updated with text to
 * communicate details of failure to the user.
 */
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
{
	int ret;

	/*
	 * The default resource group can neither be removed nor lose the
	 * default closid associated with it.
	 */
	if (rdtgrp == &rdtgroup_default) {
		rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
		return -EINVAL;
	}

	/*
	 * Cache Pseudo-locking not supported when CDP is enabled.
	 *
	 * Some things to consider if you would like to enable this
	 * support (using L3 CDP as example):
	 * - When CDP is enabled two separate resources are exposed,
	 *   L3DATA and L3CODE, but they are actually on the same cache.
	 *   The implication for pseudo-locking is that if a
	 *   pseudo-locked region is created on a domain of one
	 *   resource (eg. L3CODE), then a pseudo-locked region cannot
	 *   be created on that same domain of the other resource
	 *   (eg. L3DATA). This is because the creation of a
	 *   pseudo-locked region involves a call to wbinvd that will
	 *   affect all cache allocations on a particular domain.
	 * - Considering the previous, it may be possible to only
	 *   expose one of the CDP resources to pseudo-locking and
	 *   hide the other. For example, we could consider to only
	 *   expose L3DATA and since the L3 cache is unified it is
	 *   still possible to place instructions there and execute them.
	 * - If only one region is exposed to pseudo-locking we should
	 *   still keep in mind that availability of a portion of cache
	 *   for pseudo-locking should take into account both resources.
	 *   Similarly, if a pseudo-locked region is created in one
	 *   resource, the portion of cache used by it should be made
	 *   unavailable to all future allocations from both resources.
	 */
	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
		rdt_last_cmd_puts("CDP enabled\n");
		return -EINVAL;
	}

	/*
	 * Not knowing the bits to disable prefetching implies that this
	 * platform does not support Cache Pseudo-Locking.
	 */
	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
		rdt_last_cmd_puts("Pseudo-locking not supported\n");
		return -EINVAL;
	}

	if (rdtgroup_monitor_in_progress(rdtgrp)) {
		rdt_last_cmd_puts("Monitoring in progress\n");
		return -EINVAL;
	}

	if (rdtgroup_tasks_assigned(rdtgrp)) {
		rdt_last_cmd_puts("Tasks assigned to resource group\n");
		return -EINVAL;
	}

	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
		rdt_last_cmd_puts("CPUs assigned to resource group\n");
		return -EINVAL;
	}

	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
		return -EIO;
	}

	ret = pseudo_lock_init(rdtgrp);
	if (ret) {
		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
		goto out_release;
	}

	/*
	 * If this system is capable of monitoring an RMID would have been
	 * allocated when the control group was created. This is not needed
	 * anymore when this group would be used for pseudo-locking. This
	 * is safe to call on platforms not capable of monitoring.
	 */
	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);

	ret = 0;
	goto out;

out_release:
	rdtgroup_locksetup_user_restore(rdtgrp);
out:
	return ret;
}

/**
 * rdtgroup_locksetup_exit - Resource group exits locksetup mode
 * @rdtgrp: resource group
 *
 * When a resource group exits locksetup mode the earlier restrictions are
 * lifted.
 *
 * Return: 0 on success, <0 on failure
 */
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{
	int ret;

	if (resctrl_arch_mon_capable()) {
		ret = alloc_rmid(rdtgrp->closid);
		if (ret < 0) {
			rdt_last_cmd_puts("Out of RMIDs\n");
			return ret;
		}
		rdtgrp->mon.rmid = ret;
	}

	ret = rdtgroup_locksetup_user_restore(rdtgrp);
	if (ret) {
		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
		return ret;
	}

	pseudo_lock_free(rdtgrp);
	return 0;
}

/**
 * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
 * @d: RDT domain
 * @cbm: CBM to test
 *
 * @d represents a cache instance and @cbm a capacity bitmask that is
 * considered for it. Determine if @cbm overlaps with any existing
 * pseudo-locked region on @d.
 *
 * @cbm is unsigned long, even if only 32 bits are used, to make the
 * bitmap functions work correctly.
 *
 * Return: true if @cbm overlaps with pseudo-locked region on @d, false
 * otherwise.
 */
bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
{
	unsigned int cbm_len;
	unsigned long cbm_b;

	if (d->plr) {
		cbm_len = d->plr->s->res->cache.cbm_len;
		cbm_b = d->plr->cbm;
		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
			return true;
	}
	return false;
}

/**
 * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
 * @d: RDT domain under test
 *
 * The setup of a pseudo-locked region affects all cache instances within
 * the hierarchy of the region. It is thus essential to know if any
 * pseudo-locked regions exist within a cache hierarchy to prevent any
 * attempts to create new pseudo-locked regions in the same hierarchy.
 *
 * Return: true if a pseudo-locked region exists in the hierarchy of @d or
 *         if it is not possible to test due to memory allocation issue,
 *         false otherwise.
 */
bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
{
	struct rdt_ctrl_domain *d_i;
	cpumask_var_t cpu_with_psl;
	struct rdt_resource *r;
	bool ret = false;

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
		return true;

	/*
	 * First determine which cpus have pseudo-locked regions
	 * associated with them.
	 */
	for_each_alloc_capable_rdt_resource(r) {
		list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
			if (d_i->plr)
				cpumask_or(cpu_with_psl, cpu_with_psl,
					   &d_i->hdr.cpu_mask);
		}
	}

	/*
	 * Next test if new pseudo-locked region would intersect with
	 * existing region.
	 */
	if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
		ret = true;

	free_cpumask_var(cpu_with_psl);
	return ret;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory; the speed of
 * access is a good way to learn how close to the cpu the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check any possible error state of events used by performing
	 * one local read.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);

	/* Initialize rest of local variables */
	/*
	 * Performance event has been validated right before this with
	 * interrupts disabled - it is thus safe to read the counter index.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read counter variables twice - first to load the instructions
	 * used in L1 cache, second to capture accurate value that does not
	 * include cache misses incurred because of instruction loads.
	 */
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	/*
	 * From SDM: performing back-to-back fast reads is not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	rdpmcl(hit_pmcnum, hits_after);
	rdpmcl(miss_pmcnum, miss_after);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after = miss_after;
	counts->hits_after = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT   02H
	 *     L2_MISS  10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *     REFERENCE 4FH
	 *     MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "hits" counter is a bit more than the
		 * references, for example, x references but x + 1 hits.
		 * To not report invalid hit values in this case we treat
		 * that as misses equal to references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
 * @rdtgrp: Resource group to which the pseudo-locked region belongs.
 * @sel: Selector of which measurement to perform on a pseudo-locked region.
 *
 * The measurement of latency to access a pseudo-locked region should be
 * done from a cpu that is associated with that pseudo-locked region.
 * Determine which cpu is associated with this region and start a thread on
 * that cpu to perform the measurement, then wait for that thread to complete.
 *
 * Return: 0 on success, <0 on failure
 */
static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;
	struct task_struct *thread;
	unsigned int cpu;
	int ret = -1;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	if (rdtgrp->flags & RDT_DELETED) {
		ret = -ENODEV;
		goto out;
	}

	if (!plr->d) {
		ret = -ENODEV;
		goto out;
	}

	plr->thread_done = 0;
	cpu = cpumask_first(&plr->d->hdr.cpu_mask);
	if (!cpu_online(cpu)) {
		ret = -ENODEV;
		goto out;
	}

	plr->cpu = cpu;

	if (sel == 1)
		thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
					    plr, cpu, "pseudo_lock_measure/%u");
	else if (sel == 2)
		thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
					    plr, cpu, "pseudo_lock_measure/%u");
	else if (sel == 3)
		thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
					    plr, cpu, "pseudo_lock_measure/%u");
	else
		goto out;

	if (IS_ERR(thread)) {
		ret = PTR_ERR(thread);
		goto out;
	}

	ret = wait_event_interruptible(plr->lock_thread_wq,
				       plr->thread_done == 1);
	if (ret < 0)
		goto out;

	ret = 0;

out:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
	return ret;
}

static ssize_t pseudo_lock_measure_trigger(struct file *file,
					   const char __user *user_buf,
					   size_t count, loff_t *ppos)
{
	struct rdtgroup *rdtgrp = file->private_data;
	size_t buf_size;
	char buf[32];
	int ret;
	int sel;

	buf_size = min(count, (sizeof(buf) - 1));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;

	buf[buf_size] = '\0';
	ret = kstrtoint(buf, 10, &sel);
	if (ret == 0) {
		if (sel != 1 && sel != 2 && sel != 3)
			return -EINVAL;
		ret = debugfs_file_get(file->f_path.dentry);
		if (ret)
			return ret;
		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
		if (ret == 0)
			ret = count;
		debugfs_file_put(file->f_path.dentry);
	}

	return ret;
}

static const struct file_operations pseudo_measure_fops = {
	.write = pseudo_lock_measure_trigger,
	.open = simple_open,
	.llseek = default_llseek,
};
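
/*
 * Illustrative sketch: the debugfs file created for a pseudo-locked region
 * (see rdtgroup_pseudo_lock_create() below) is the user interface to the
 * measurement selectors handled above. Assuming debugfs is mounted at
 * /sys/kernel/debug and a hypothetical group named "p0":
 *
 *   echo 1 > /sys/kernel/debug/resctrl/p0/pseudo_lock_measure  # read latency
 *   echo 2 > /sys/kernel/debug/resctrl/p0/pseudo_lock_measure  # L2 residency
 *   echo 3 > /sys/kernel/debug/resctrl/p0/pseudo_lock_measure  # L3 residency
 *
 * Results are reported via the pseudo_lock_mem_latency, pseudo_lock_l2 and
 * pseudo_lock_l3 tracepoints rather than through the file itself.
 */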

/**
 * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
 * @rdtgrp: resource group to which pseudo-lock region belongs
 *
 * Called when a resource group in the pseudo-locksetup mode receives a
 * valid schemata that should be pseudo-locked. Since the resource group is
 * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
 * allocated and initialized with the essential information. If a failure
 * occurs the resource group remains in the pseudo-locksetup mode with the
 * &struct pseudo_lock_region associated with it, but cleared from all
 * information and ready for the user to re-attempt pseudo-locking by
 * writing the schemata again.
 *
 * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
 * on failure. Descriptive error will be written to last_cmd_status buffer.
 */
int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;
	struct task_struct *thread;
	unsigned int new_minor;
	struct device *dev;
	char *kn_name __free(kfree) = NULL;
	int ret;

	ret = pseudo_lock_region_alloc(plr);
	if (ret < 0)
		return ret;

	ret = pseudo_lock_cstates_constrain(plr);
	if (ret < 0) {
		ret = -EINVAL;
		goto out_region;
	}
	kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
	if (!kn_name) {
		ret = -ENOMEM;
		goto out_cstates;
	}

	plr->thread_done = 0;

	thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
				    plr->cpu, "pseudo_lock/%u");
	if (IS_ERR(thread)) {
		ret = PTR_ERR(thread);
		rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
		goto out_cstates;
	}

	ret = wait_event_interruptible(plr->lock_thread_wq,
				       plr->thread_done == 1);
	if (ret < 0) {
		/*
		 * If the thread does not get on the CPU for whatever
		 * reason and the process which sets up the region is
		 * interrupted then this will leave the thread in runnable
		 * state and once it gets on the CPU it will dereference
		 * the cleared, but not freed, plr struct resulting in an
		 * empty pseudo-locking loop.
		 */
		rdt_last_cmd_puts("Locking thread interrupted\n");
		goto out_cstates;
	}

	ret = pseudo_lock_minor_get(&new_minor);
	if (ret < 0) {
		rdt_last_cmd_puts("Unable to obtain a new minor number\n");
		goto out_cstates;
	}

	/*
	 * Unlock access but do not release the reference. The
	 * pseudo-locked region will still be here on return.
	 *
	 * The mutex has to be released temporarily to avoid a potential
	 * deadlock with the mm->mmap_lock which is obtained in the
	 * device_create() and debugfs_create_dir() callpath below as well as
	 * before the mmap() callback is called.
	 */
	mutex_unlock(&rdtgroup_mutex);

	if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
		plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
		if (!IS_ERR_OR_NULL(plr->debugfs_dir))
			debugfs_create_file("pseudo_lock_measure", 0200,
					    plr->debugfs_dir, rdtgrp,
					    &pseudo_measure_fops);
	}

	dev = device_create(&pseudo_lock_class, NULL,
			    MKDEV(pseudo_lock_major, new_minor),
			    rdtgrp, "%s", kn_name);

	mutex_lock(&rdtgroup_mutex);

	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		rdt_last_cmd_printf("Failed to create character device: %d\n",
				    ret);
		goto out_debugfs;
	}

	/* We released the mutex - check if group was removed while we did so */
	if (rdtgrp->flags & RDT_DELETED) {
		ret = -ENODEV;
		goto out_device;
	}

	plr->minor = new_minor;

	rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
	closid_free(rdtgrp->closid);
	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);

	ret = 0;
	goto out;

out_device:
	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
out_debugfs:
	debugfs_remove_recursive(plr->debugfs_dir);
	pseudo_lock_minor_release(new_minor);
out_cstates:
	pseudo_lock_cstates_relax(plr);
out_region:
	pseudo_lock_region_clear(plr);
out:
	return ret;
}

/**
 * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
 * @rdtgrp: resource group to which the pseudo-locked region belongs
 *
 * The removal of a pseudo-locked region can be initiated when the resource
 * group is removed via a "rmdir" from userspace or the unmount of the
 * resctrl filesystem. On removal the resource group does not go back to
 * pseudo-locksetup mode before it is removed, instead it is removed
 * directly. There is thus asymmetry with the creation where the
 * &struct pseudo_lock_region is removed here while it was not created in
 * rdtgroup_pseudo_lock_create().
 *
 * Return: void
 */
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
{
	struct pseudo_lock_region *plr = rdtgrp->plr;

	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
		/*
		 * Default group cannot be a pseudo-locked region so we can
		 * free closid here.
		 */
		closid_free(rdtgrp->closid);
		goto free;
	}

	pseudo_lock_cstates_relax(plr);
	debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
	pseudo_lock_minor_release(plr->minor);

free:
	pseudo_lock_free(rdtgrp);
}

static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
{
	struct rdtgroup *rdtgrp;

	mutex_lock(&rdtgroup_mutex);

	rdtgrp = region_find_by_minor(iminor(inode));
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	filp->private_data = rdtgrp;
	atomic_inc(&rdtgrp->waitcount);
	/* Perform a non-seekable open - llseek is not supported */
	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	mutex_unlock(&rdtgroup_mutex);

	return 0;
}

static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
{
	struct rdtgroup *rdtgrp;

	mutex_lock(&rdtgroup_mutex);
	rdtgrp = filp->private_data;
	WARN_ON(!rdtgrp);
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}
	filp->private_data = NULL;
	atomic_dec(&rdtgrp->waitcount);
	mutex_unlock(&rdtgroup_mutex);
	return 0;
}

static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
{
	/* Not supported */
	return -EINVAL;
}

static const struct vm_operations_struct pseudo_mmap_ops = {
	.mremap = pseudo_lock_dev_mremap,
};

static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long vsize = vma->vm_end - vma->vm_start;
	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
	struct pseudo_lock_region *plr;
	struct rdtgroup *rdtgrp;
	unsigned long physical;
	unsigned long psize;

	mutex_lock(&rdtgroup_mutex);

	rdtgrp = filp->private_data;
	WARN_ON(!rdtgrp);
	if (!rdtgrp) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	plr = rdtgrp->plr;

	if (!plr->d) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENODEV;
	}

	/*
	 * Task is required to run with affinity to the cpus associated
	 * with the pseudo-locked region. If this is not the case the task
	 * may be scheduled elsewhere and invalidate entries in the
	 * pseudo-locked region.
	 */
	if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EINVAL;
	}

	physical = __pa(plr->kmem) >> PAGE_SHIFT;
	psize = plr->size - off;

	if (off > plr->size) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENOSPC;
	}

	/*
	 * Ensure changes are carried directly to the memory being mapped,
	 * do not allow copy-on-write mapping.
	 */
	if (!(vma->vm_flags & VM_SHARED)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EINVAL;
	}

	if (vsize > psize) {
		mutex_unlock(&rdtgroup_mutex);
		return -ENOSPC;
	}

	memset(plr->kmem + off, 0, vsize);

	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
			    vsize, vma->vm_page_prot)) {
		mutex_unlock(&rdtgroup_mutex);
		return -EAGAIN;
	}
	vma->vm_ops = &pseudo_mmap_ops;
	mutex_unlock(&rdtgroup_mutex);
	return 0;
}
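
/*
 * Illustrative sketch: a minimal outline of how an application is expected
 * to use the character device handled above. The group name "p0" and the
 * mapping size are hypothetical; the resctrl documentation carries a full
 * example. The task must first restrict its CPU affinity to the CPUs of the
 * cache instance (enforced by the cpumask_subset() check in
 * pseudo_lock_dev_mmap()), then map the region MAP_SHARED:
 *
 *   fd = open("/dev/pseudo_lock/p0", O_RDWR);
 *   mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */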

static const struct file_operations pseudo_lock_dev_fops = {
	.owner =	THIS_MODULE,
	.read =		NULL,
	.write =	NULL,
	.open =		pseudo_lock_dev_open,
	.release =	pseudo_lock_dev_release,
	.mmap =		pseudo_lock_dev_mmap,
};

int rdt_pseudo_lock_init(void)
{
	int ret;

	ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
	if (ret < 0)
		return ret;

	pseudo_lock_major = ret;

	ret = class_register(&pseudo_lock_class);
	if (ret) {
		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
		return ret;
	}

	return 0;
}

void rdt_pseudo_lock_release(void)
{
	class_unregister(&pseudo_lock_class);
	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
	pseudo_lock_major = 0;
}