/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. The zone CPU cap limits the
 * CPU usage for all projects running inside the zone. If the zone CPU cap is
 * set below the project CPU cap, the latter will have no effect.
 *
 * When CPU usage of projects and/or zones reaches the specified caps, threads
 * in them do not get scheduled and instead are placed on wait queues
 * associated with a cap. Such threads will start running again only when CPU
 * usage drops below the cap level. Each zone and each project has its own
 * wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ (or TS_ZONEWAITQ) flag in the
 * thread's t_schedflag field and is requested to surrender its CPU. This
 * causes the scheduling class specific CL_PREEMPT() callback to be invoked.
 * The callback function places threads marked as TS_PROJWAITQ on a wait queue
 * and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means the time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages of all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one per cent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick.
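 *
 * With a decay of one per cent per tick, the accumulated usage roughly
 * converges to one hundred times the amount charged per tick (charging and
 * decay balance out when usage / 100 equals the per-tick charge). This is,
 * ignoring rounding, why the decayed cap_usage can be compared directly
 * against cap_value (the cap percentage multiplied by the per-tick cost in
 * nanoseconds) further down in this file.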
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 *	Notifies the framework of a new project. It should be put on the
 *	capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 *	Removes the association between the specified project and its cap.
 *	Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified project to the specified value. Setting
 *	the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified zone to the specified value. Setting the
 *	value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 *	Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 *	Charges the specified thread's project the amount of on-CPU time that
 *	it used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 *	Otherwise returns True if the project or zone should be penalized
 *	because it is exceeding its cap. Also sets the TS_PROJWAITQ or
 *	TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t *)
 *
 *	Enforces CPU caps for the specified thread. Places LWPs running in
 *	LWP_USER state on project or zone wait queues, as requested by the
 *	TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 *	thread was placed on a wait queue or False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 *	Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual caps structures and their lists are protected by a
 * global caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock since it may be very hot. All
 * threads in the project contend for the same cache line doing cap usage
 * updates.
 */
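
/*
 * Illustrative sketch (not part of this file, and not compiled): roughly how
 * a scheduling class is expected to drive the charge/enforce interfaces
 * above when a running thread is preempted. The function name is
 * hypothetical, and CPUCAPS_CHARGE_ENFORCE is assumed here to be the
 * "charge and check" value of cpucaps_charge_t; everything else follows the
 * Interfaces section above.
 *
 *	static void
 *	example_cl_preempt(kthread_t *t, caps_sc_t *csc)
 *	{
 *		ASSERT(THREAD_LOCK_HELD(t));
 *
 *		if (cpucaps_charge(t, csc, CPUCAPS_CHARGE_ENFORCE) &&
 *		    CPUCAPS_ENFORCE(t))
 *			return;	(the thread is now on a project/zone waitq)
 *
 *		setbackdq(t);	(not over a cap, back onto a dispatch queue)
 *	}
 */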

/*
 * caps_lock protects the lists of capped projects and zones, changes in the
 * cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone flag, but we don't keep any
 * per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/*    - list of zones with caps */
static list_t capped_projects;		/*    - list of projects with caps */
boolean_t cpucaps_enabled;		/*    - are there any caps defined? */
boolean_t cpucaps_busy;			/*    - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick:
 * decay one per cent of the value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))
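
/*
 * Worked example (illustrative only): with CAP_DECAY_FACTOR of 100, a usage
 * value of 350 is decayed by ROUND_SCALE(350, 100) == (350 + 50) / 100 == 4,
 * i.e. roughly one per cent per tick, with the integer arithmetic rounding to
 * the nearest unit.
 */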

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects
 * - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate cap - insert into active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * Cap can not be already enabled
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 *   - Block its wait queue. This prevents any new threads from being
 *     enqueued there and moves all enqueued threads to the run queue.
 *   - Remove cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for a project kpj
 * It is safe to enable already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable project cap.
 * It is safe to disable already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone
 * It is safe to enable already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable zone cap.
 * It is safe to disable already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If cap limit is not reached, make one thread from wait queue runnable.
 * The waitq_isempty check is performed without the waitq lock. If a new thread
 * is placed on the waitq right after the check, it will be picked up during the
 * next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on capped_projects list.
 * Decay cap usage by CAP_DECAY_FACTOR
 * Add this cap project usage to its zone usage.
 * Kick off a thread from the cap waitq if cap is not reached.
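 *
 * For example, if a project enters the tick with cap_usage of 4,000,000ns,
 * that amount is added to its capped zone's usage for this tick and the
 * project usage is then decayed by ROUND_SCALE(4000000, 100) == 40,000ns.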
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t *zone = cap->cap_zone;
	hrtime_t cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first zone's project we see during this
		 * tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_lbolt = lbolt64;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
	 * - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set zone cap to cap_val
 * If cap_val is equal to NOCAP, disable zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
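 *
 * cap_val is expressed the same way as the zone.cpu-cap resource control,
 * i.e. as a percentage of a single CPU (100 corresponds to one full CPU); it
 * is converted to an internal limit of cap_val * cap_tick_cost nanoseconds
 * before being compared with the decayed usage.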
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are off
	 * or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove cap for the zone
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this zone
			 * unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * New project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set project cap to cap_val
 * If cap_val is equal to NOCAP, disable project cap.
 *
 * If this is the first time a cap is set on a project, allocate cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable project cap and caps are not
	 * enabled or if trying to disable cap on a project that does not have
	 * cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * User requested to drop a cap on the project. If it is part of
		 * capped zone, keep the cap and set the value to MAX_USAGE,
		 * otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge project of thread t the time thread t spent on CPU since previously
 * adjusted.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by thread lock.
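 *
 * For example, with hz set to 100 a tick is 10ms, so cap_tick_cost is
 * 10,000,000ns; a thread that accumulated 25ms of new on-CPU time since the
 * last adjustment is still charged only one tick (10ms) worth of it here.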
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t *kpj = ttoproj(t);
	hrtime_t new_usage;
	hrtime_t usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since birth of a thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it outside
		 * the lock to reduce the time spent while holding the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the project or zone should
 * be penalized because it is exceeding its cap. Also sets TS_PROJWAITQ or
 * TS_ZONEWAITQ in t_schedflag in this case.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by thread lock. It will still set
 * TS_PROJWAITQ/TS_ZONEWAITQ in this case, but cpucaps_enforce will not place
 * anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t *kpj = ttoproj(t);
	klwp_t *lwp = t->t_lwp;
	zone_t *zone;
	cpucap_t *project_cap;
	boolean_t rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested to charge the project usage, no
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue ourselves on the waitq, if
 * needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}