/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. The zone CPU cap limits
 * the CPU usage of all projects running inside the zone. If the zone CPU cap
 * is set below the project CPU cap, the latter has no effect.
 *
 * When the CPU usage of projects and/or zones reaches the specified caps,
 * threads in them do not get scheduled and instead are placed on wait queues
 * associated with a cap. Such threads start running again only when CPU
 * usage drops below the cap level. Each zone and each project has its own
 * wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
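 *
 * To illustrate the units (an added example, derived from the code below):
 * caps are expressed as a percentage of a single CPU, so a cap of 50 allows
 * a project or zone the equivalent of half a CPU, while a cap of 200 allows
 * the equivalent of two full CPUs.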
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped
 * zone is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is
 * considered over-capped. Every user thread caught running in an over-capped
 * project or zone is marked by setting the TS_PROJWAITQ flag in the thread's
 * t_schedflag field and is requested to surrender its CPU. This causes the
 * scheduling class specific CL_PREEMPT() callback to be invoked. The callback
 * function places threads marked as TS_PROJWAITQ on a wait queue and calls
 * swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving
 * CPU. New time means time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine, which is called once
 * every clock tick. It walks the list of project caps and decays their
 * usages by one percent. If CPU usage drops below cap levels, threads on the
 * wait queue are made runnable again, one thread per clock tick.
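 *
 * A worked example of the decay behavior (added illustration, derived from
 * the code below rather than stated here originally): each tick a project's
 * usage gains up to one tick worth of on-CPU time per running thread and
 * loses 1/100 of its current value, so a project continuously consuming a
 * fraction f of one CPU converges to a usage of roughly
 * 100 * f * cap_tick_cost. Since a cap percentage p is stored internally as
 * p * cap_tick_cost, the usage crosses the stored cap value exactly when f
 * exceeds p/100.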
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 *    Notifies the framework of a new project. It should be put on the
 *    capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 *    Removes the association between the specified project and its cap.
 *    Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 *    Sets the project cap of the specified project to the specified value.
 *    Setting the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 *    Sets the zone cap of the specified zone to the specified value. Setting
 *    the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 *    Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 *    Charges the specified thread's project the amount of on-CPU time that
 *    it used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 *    Otherwise returns True if the thread should be penalized because its
 *    project or zone is exceeding its cap, and also sets the TS_PROJWAITQ or
 *    TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t *)
 *
 *    Enforces CPU caps for the specified thread. Places LWPs running in
 *    LWP_USER state on project or zone wait queues, as requested by the
 *    TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 *    thread was placed on a wait queue, False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 *    Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All of the individual caps structures and their lists are protected by the
 * global caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at
 * high PIL and by the clock thread performing usage decay. We should do as
 * little work as possible while holding the lock, since it may be very hot.
 * All threads in the project contend for the same cache line doing cap usage
 * updates.
 */
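
/*
 * Illustrative sketch (not part of the original interfaces): a hypothetical
 * caller reacting to an rctl change could apply a new project cap through
 * the interfaces above as follows; the wrapper name is made up, while
 * cpucaps_project_set() and NOCAP are the real interface. Passing NOCAP
 * drops the cap, any other value sets it:
 *
 *	static int
 *	example_apply_project_cap(kproject_t *kpj, rctl_qty_t val)
 *	{
 *		return (cpucaps_project_set(kpj, val));
 *	}
 */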

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state, and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap
 * is modified in parallel. This could be a per-zone cap flag, but we don't
 * keep any per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick:
 * decay one percent of the value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects
 * - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}
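
/*
 * Illustrative arithmetic (assuming the common clock rate of 100 ticks per
 * second; the actual value depends on hz): one tick is 10 ms, so
 * cap_tick_cost is 10,000,000 ns. A cap percentage of 150 is then stored as
 * cap_value = 150 * 10,000,000 ns, and ROUND_SCALE(cap_value, cap_tick_cost)
 * converts it back to 150 for the kstat interface below.
 */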

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}
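
/*
 * Callers that may need a fresh cpucap_t follow a common pattern (see
 * cpucaps_zone_set() and cpucaps_project_set() below): allocate with
 * cap_alloc() before taking caps_lock, since blocking KM_SLEEP allocations
 * must not happen under caps_lock, then re-check under the lock and free the
 * spare copy if another thread won the race. A condensed sketch, with "obj"
 * standing in for a zone or project:
 *
 *	cap = (obj->cpucap == NULL) ? cap_alloc() : NULL;
 *	mutex_enter(&caps_lock);
 *	if (obj->cpucap == NULL)
 *		obj->cpucap = cap;
 *	else if (cap != NULL)
 *		cap_free(cap);
 *	...
 *	mutex_exit(&caps_lock);
 */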

/*
 * Activate cap - insert into active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * Cap cannot be already enabled
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 * - Block its wait queue. This prevents any new threads from being
 *   enqueued there and moves all enqueued threads to the run queue.
 * - Remove cap from list l.
 * - Disable CPU caps globally if there are no capped projects or zones
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for a project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If
 * a new thread is placed on the waitq right after the check, it will be
 * picked up during the next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 * - Decay cap usage by CAP_DECAY_FACTOR.
 * - Add this cap's project usage to its zone usage.
 * - Kick off a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add the project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}
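
/*
 * The DTrace SDT probe above can be observed from user-land; an illustrative
 * one-liner (probe naming follows the usual SDT double-underscore to dash
 * convention, and arg1 is the charged usage):
 *
 *	dtrace -n 'sdt:::cpucaps-zusage { trace(arg1); }'
 */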

/*
 * On every clock tick walk the list of project caps and update the CPU
 * usage. Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * This function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The
 * allocation is performed without holding caps_lock to avoid using KM_SLEEP
 * allocation with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL, now with caps_lock
	 * held, and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are
	 * off or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign the newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this
			 * zone unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable the cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away, so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list
 * if its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap
	 * structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled, or if trying to disable a cap on a project that does not
	 * have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it
		 * is part of a capped zone, keep the cap and set the value
		 * to MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the time it spent on CPU since the
 * last adjustment.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(PROJECT_IS_CAPPED(kpj));

	/* Get on-CPU time since birth of a thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it
		 * outside the lock to reduce the time spent while holding
		 * the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the thread should be
 * penalized because its project or zone is exceeding its cap. Also sets
 * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested to charge the project usage, no
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}
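
/*
 * Illustrative sketch (simplified; not code from this file): a scheduling
 * class typically charges at clock-tick time and enforces at preemption
 * time, roughly as follows, where csc points at the caps_sc_t embedded in
 * its per-thread data. The enforcing charge type is assumed to be
 * CPUCAPS_CHARGE_ENFORCE here:
 *
 *	(in the class tick handler)
 *	if (CPUCAPS_ON())
 *		(void) cpucaps_charge(t, csc, CPUCAPS_CHARGE_ENFORCE);
 *
 *	(in the class preempt handler)
 *	if (CPUCAPS_ON() && CPUCAPS_ENFORCE(t))
 *		return;
 */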

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that
 * it does not hold any kernel locks, so enqueue ourselves on the waitq, if
 * needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}
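
/*
 * The kstats populated above can be read from user-land; an illustrative
 * invocation (the module and instance naming comes from the rctl kstat
 * helpers and may vary):
 *
 *	kstat -m caps
 */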