/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * CPU usage of all projects running inside the zone. If the zone CPU cap is
 * set below the project CPU cap, the latter has no effect.
 *
 * When the CPU usage of projects and/or zones reaches the specified caps,
 * threads in them do not get scheduled and are instead placed on wait queues
 * associated with a cap. Such threads start running again only when CPU
 * usage drops below the cap level. Each zone and each project has its own
 * wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ flag in the thread's t_schedflag
 * field and is requested to surrender its CPU. This causes the scheduling
 * class specific CL_PREEMPT() callback to be invoked. The callback function
 * places threads marked as TS_PROJWAITQ on a wait queue and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
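 *
 * For example, under this scheme a cap value of 50 (see cpucaps_zone_set()
 * and cpucaps_project_set() below) limits the zone or project to roughly
 * half of one CPU's worth of run time: once the decayed usage accumulated
 * by its threads reaches that level, its runnable user threads sit on the
 * corresponding wait queue until decay brings the usage back under the cap.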
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving
 * CPU. New time means time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one percent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick.
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 *	Notifies the framework of a new project. It should be put on the
 *	capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 *	Removes the association between the specified project and its cap.
 *	Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 *	Sets the project cap of the specified project to the specified value.
 *	Setting the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 *	Sets the zone cap of the specified zone to the specified value.
 *	Setting the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 *	Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 *	Charges the specified thread's project the amount of on-CPU time that
 *	it used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 *	Otherwise returns True if the project or zone should be penalized
 *	because it is exceeding its cap, and also sets the TS_PROJWAITQ or
 *	TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t *)
 *
 *	Enforces CPU caps for a specified thread. Places LWPs running in
 *	LWP_USER state on project or zone wait queues, as requested by the
 *	TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 *	thread was placed on a wait queue or False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 *	Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual caps structures and their lists are protected by a
 * global caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, to its zone. The
 * association can not break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-pil spin-lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock since it may be very hot. All
 * threads in the project contend for the same cache line doing cap usage
 * updates.
 */

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone cap flag, but we don't keep
 * any cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick
 * Decay one percent of value per tick
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))
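
/*
 * A sanity check of the decay arithmetic (assuming the common hz value of
 * 100, so cap_tick_cost = 10,000,000 ns): a project keeping one CPU
 * continuously busy is charged cap_tick_cost per tick while one percent of
 * its usage decays away, so its usage converges to the steady state
 *
 *	usage = cap_tick_cost * CAP_DECAY_FACTOR
 *
 * i.e. 100 ticks worth of CPU time, which is exactly the internal cap_value
 * for a cap of 100 (one full CPU). This is what makes cap values behave as
 * percentages of a single CPU.
 */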

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects
 * - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate cap - insert into active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * Cap can not be already enabled
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 * - Block its wait queue. This prevents any new threads from being
 *   enqueued there and moves all enqueued threads to the run queue.
 * - Remove cap from list l.
 * - Disable CPU caps globally if there are no capped projects or zones
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for a project kpj
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 * Decay cap usage by CAP_DECAY_FACTOR
 * Add this cap project usage to its zone usage.
 * Kick off a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add the project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_lbolt = lbolt64;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from a wait queue to a run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}
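
/*
 * To illustrate the aggregation above: for a zone Z with capped projects P1
 * and P2, a single tick of caps_update() proceeds as follows:
 *
 *	walker sees P1 first:	Z's old usage is folded into cap_maxusage,
 *				cap_usage is reset to 0 and stamped with
 *				lbolt64, then P1's usage is added;
 *	walker sees P2 next:	cap_lbolt already equals lbolt64, so P2's
 *				usage is simply added on top;
 *	cap_poke_waitq(Z):	the zone walk then compares the freshly
 *				aggregated usage with Z's cap_value.
 */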

/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock
	 * held - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}
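
/*
 * The allocate-then-double-check dance above (and in cpucaps_zone_set(),
 * cpucaps_project_add() and cpucaps_project_set() below) follows a single
 * pattern, sketched here with an illustrative obj/obj_cap pair:
 *
 *	cap = NULL;
 *	if (obj->obj_cap == NULL)
 *		cap = cap_alloc();		KM_SLEEP, may block
 *	mutex_enter(&caps_lock);		no blocking from here on
 *	if (obj->obj_cap == NULL)
 *		obj->obj_cap = cap;		we won the race
 *	else if (cap != NULL)
 *		cap_free(cap);			someone else installed one
 *	cap = obj->obj_cap;
 *
 * The unlocked NULL check may race with another thread doing the same, so
 * the check is repeated under caps_lock and the loser frees its copy.
 */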

/*
 * Set the zone cap to cap_val
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are
	 * off or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);

			/*
			 * Disable caps for all projects belonging to this
			 * zone unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable the cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}
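
/*
 * Summary of the cap_val/state combinations handled by cpucaps_zone_set():
 *
 *	cap_val		current zone cap	action
 *	-------		----------------	------
 *	0		any			EINVAL
 *	NOCAP		disabled		nothing to do
 *	NOCAP		enabled			disable zone cap, walk projects
 *	1..MAXCAP	disabled		enable zone cap, walk projects
 *	1..MAXCAP	enabled			just update cap_value
 *
 * The project walks run without caps_lock held; cpucaps_busy serializes
 * concurrent zone cap updates during that window (they get EBUSY).
 */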

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap
	 * structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable the project cap and caps are
	 * not enabled, or if trying to disable the cap on a project that
	 * does not have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it
		 * is part of a capped zone, keep the cap and set the value
		 * to MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ?
	    (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the time thread t spent on CPU since
 * it was previously adjusted.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine
 * is executed. This should not cause any issues since the association
 * between the thread and its project is protected by the thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since the birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it
		 * outside the lock to reduce the time spent while holding
		 * the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}
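
/*
 * For example (assuming hz = 100, so cap_tick_cost = 10ms): if a thread ran
 * for 3ms since caps_charge_adjust() last saw it, its project is charged the
 * full 3ms; if it ran for 15ms (e.g. several ticks passed while it stayed
 * on-CPU), the charge is clamped to the 10ms cap_tick_cost. The clamping
 * keeps a single adjustment from charging more than one tick's worth.
 */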

/*
 * Charge the thread's project and return True if the project or zone should
 * be penalized because it is exceeding its cap. Also sets TS_PROJWAITQ or
 * TS_ZONEWAITQ in this case.
 *
 * It is possible that the project cap is being disabled while this routine
 * is executed. This should not cause any issues since the association
 * between the thread and its project is protected by the thread lock. This
 * routine will still set TS_PROJWAITQ/TS_ZONEWAITQ in this case, but
 * cpucaps_enforce() will not place anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested to charge the project usage, not the
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that
 * it does not hold any kernel locks, so enqueue ourselves on the waitq, if
 * needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}
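
/*
 * A minimal sketch of how a scheduling class is expected to drive the two
 * entry points above (field and function names here are illustrative, not
 * the actual TS/FSS code):
 *
 *	xx_tick(kthread_t *t)			called once per tick
 *	{
 *		...
 *		if (CPUCAPS_ON() &&
 *		    cpucaps_charge(t, &xxpp->xx_caps, charge-and-enforce))
 *			cpu_surrender(t);	ask t to yield the CPU
 *		...
 *	}
 *
 *	xx_preempt(kthread_t *t)		CL_PREEMPT() callback
 *	{
 *		...
 *		if (CPUCAPS_ENFORCE(t))		t is now parked on a
 *			return;			cap wait queue
 *		...
 *	}
 *
 * where "charge-and-enforce" stands for the cpucaps_charge_t value other
 * than CPUCAPS_CHARGE_ONLY.
 */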

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}
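
/*
 * A note on the units in cap_kstat_update(): cap_value, cap_usage and
 * cap_maxusage are kept internally in nanoseconds scaled by cap_tick_cost,
 * so dividing by cap_tick_cost converts them back to the percentage scale
 * the caller supplied; cap_below and cap_above count clock ticks, so
 * dividing by SEC_TO_TICK(1) (i.e. hz, typically 100) reports them as the
 * "below_sec"/"above_sec" seconds seen by kstat consumers.
 */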