/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * CPU usage of all projects running inside the zone. If the zone CPU cap is
 * set below the project CPU cap, the latter has no effect.
 *
 * When the CPU usage of projects and/or zones reaches the specified caps,
 * threads in them do not get scheduled and are instead placed on wait queues
 * associated with a cap. Such threads start running again only when CPU usage
 * drops below the cap level. Each zone and each project has its own wait
 * queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped
 * zone is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is
 * considered over-capped. Every user thread caught running in an over-capped
 * project or zone is marked by setting the TS_PROJWAITQ flag in the thread's
 * t_schedflag field and is requested to surrender its CPU. This causes the
 * scheduling class specific CL_PREEMPT() callback to be invoked. The callback
 * function places threads marked as TS_PROJWAITQ on a wait queue and calls
 * swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
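 *
 * To summarize the per-tick flow implemented below (a simplified sketch,
 * not an exact call graph): caps_update() decays each cap's usage on every
 * clock tick and may release one waiting thread per cap; cpucaps_charge()
 * accumulates a running thread's on-CPU time and sets TS_PROJWAITQ and/or
 * TS_ZONEWAITQ once a cap is exceeded; and CPUCAPS_ENFORCE(), on the way
 * back to user-land, parks the flagged thread on its project's or zone's
 * wait queue.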
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * a CPU. Scheduling classes also add new on-CPU time for any thread leaving a
 * CPU. "New" time means the time since it was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one percent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick.
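 *
 * As a worked example (illustrative numbers only): a workload consuming a
 * fraction f of one CPU adds about f * cap_tick_cost of usage per tick,
 * while the decay removes roughly 1/100 of the accumulated value, so the
 * usage settles near 100 * f * cap_tick_cost. A cap of P percent is stored
 * as P * cap_tick_cost, which is why the cap starts to take effect just as
 * f exceeds P/100.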
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 *	Notifies the framework of a new project. It should be put on the
 *	capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 *	Removes the association between the specified project and its cap.
 *	Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified project to the specified value. Setting
 *	the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified zone to the specified value. Setting the
 *	value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 *	Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 *	Charges the specified thread's project the amount of on-CPU time that
 *	it used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 *	Otherwise returns True if the thread should be penalized because its
 *	project or zone is exceeding its cap. Also sets the TS_PROJWAITQ or
 *	TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t *)
 *
 *	Enforces CPU caps for a specified thread. Places LWPs running in
 *	LWP_USER state on project or zone wait queues, as requested by the
 *	TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 *	thread was placed on a wait queue or False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 *	Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual caps structures and their lists are protected by a
 * global caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock since it may be very hot. All
 * threads in the project contend for the same cache line doing cap usage
 * updates.
 */

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone flag, but we don't keep any
 * per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick.
 * Decay one percent of the value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))
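
/*
 * For example (illustrative numbers, assuming a 100 Hz clock): cap_tick_cost
 * is TICK_TO_NSEC(1) == 10,000,000 ns, and ROUND_SCALE(1234567890,
 * cap_tick_cost) == 123, i.e. the quotient rounded to the nearest integer
 * rather than truncated toward zero.
 */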

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};


static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects
 * - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables.
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure.
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure.
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active.
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}
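
/*
 * cap_alloc() uses KM_SLEEP and therefore must not be called with caps_lock
 * held. The callers below all follow the same idiom, sketched here with a
 * hypothetical 'obj' standing in for the project or zone being capped:
 *
 *	if (obj->cpucap == NULL)
 *		cap = cap_alloc();	<- may sleep, no locks held
 *	mutex_enter(&caps_lock);
 *	if (obj->cpucap == NULL)	<- re-check under the lock
 *		obj->cpucap = cap;
 *	else if (cap != NULL)
 *		cap_free(cap);		<- lost the race, free our copy
 *	mutex_exit(&caps_lock);
 */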

/*
 * Activate cap - insert into the active list and unblock its wait queue.
 * Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap cannot already be enabled.
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 * - Block its wait queue. This prevents any new threads from being
 *   enqueued there and moves all enqueued threads to the run queue.
 * - Remove cap from list l.
 * - Disable CPU caps globally if there are no capped projects or zones.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * The cap should be currently active.
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}
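
/*
 * Global state transitions driven by the two routines above, summarized here
 * for reference:
 *
 *	first cap_enable()	cpucaps_enabled = B_TRUE
 *				cpucaps_clock_callout = caps_update
 *	last cap_disable()	cpucaps_enabled = B_FALSE
 *				cpucaps_clock_callout = NULL
 */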

/*
 * Enable cap for a project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list 'l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 * Decay cap usage by CAP_DECAY_FACTOR.
 * Add this cap's project usage to its zone usage.
 * Release a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t *zone = cap->cap_zone;
	hrtime_t cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Account for the project being either above or below its cap and
	 * release one waiting thread if the cap is not reached.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add the project's CPU usage to its zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			zcap->cap_lbolt = lbolt64;
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}
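
/*
 * To illustrate the aggregation above with made-up numbers: if a capped zone
 * contains two capped projects whose decayed usages are 30 * cap_tick_cost
 * and 25 * cap_tick_cost, the first walker visit this tick resets the zone
 * usage and adds 30, the second adds 25, leaving the zone at
 * 55 * cap_tick_cost, i.e. roughly 55 percent of one CPU.
 */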

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled, and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated yet, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using a KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock
	 * held - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are
	 * off or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);

			/*
			 * Disable caps for all projects belonging to this
			 * zone unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable the cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}
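
/*
 * For example (values shown for illustration only): cpucaps_zone_set(zone,
 * 150) stores a cap_value of 150 * cap_tick_cost, letting the zone's
 * projects together accumulate about one and a half CPUs' worth of decayed
 * on-CPU time, while cpucaps_zone_set(zone, NOCAP) tears the cap down again.
 */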

/*
 * The project is going away, so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap
	 * structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled, or when the project does not have a cap.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it is
		 * part of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}
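
/*
 * Since a steadily busy workload settles near 100 * f * cap_tick_cost of
 * decayed usage (see the Decay discussion at the top of this file, with
 * illustrative numbers), the division above reports usage as a percentage
 * of one CPU, matching the units in which caps are set.
 */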

/*
 * Get the current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get the current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the time it spent on CPU since the last
 * adjustment.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick's worth of time.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t *kpj = ttoproj(t);
	hrtime_t new_usage;
	hrtime_t usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(PROJECT_IS_CAPPED(kpj));

	/* Get on-CPU time since the birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick's worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it outside
		 * the lock to reduce the time spent while holding the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}
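
/*
 * For instance (made-up numbers): with a 100 Hz clock, cap_tick_cost is 10ms.
 * A thread found to have accumulated 37ms of unaccounted on-CPU time is
 * charged only 10ms, while csc_cputime still records the full amount, so the
 * excess is simply dropped rather than charged later.
 */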

/*
 * Charge the thread's project and return True if the thread should be
 * penalized because its project or zone is exceeding its cap. Also set the
 * TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag in this case.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t *kpj = ttoproj(t);
	klwp_t *lwp = t->t_lwp;
	zone_t *zone;
	cpucap_t *project_cap;
	boolean_t rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested that the project usage be charged;
	 * skip the enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue it on the waitq, if needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}