1c97ad5cdSakolb /* 2c97ad5cdSakolb * CDDL HEADER START 3c97ad5cdSakolb * 4c97ad5cdSakolb * The contents of this file are subject to the terms of the 5c97ad5cdSakolb * Common Development and Distribution License (the "License"). 6c97ad5cdSakolb * You may not use this file except in compliance with the License. 7c97ad5cdSakolb * 8c97ad5cdSakolb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9c97ad5cdSakolb * or http://www.opensolaris.org/os/licensing. 10c97ad5cdSakolb * See the License for the specific language governing permissions 11c97ad5cdSakolb * and limitations under the License. 12c97ad5cdSakolb * 13c97ad5cdSakolb * When distributing Covered Code, include this CDDL HEADER in each 14c97ad5cdSakolb * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15c97ad5cdSakolb * If applicable, add the following below this CDDL HEADER, with the 16c97ad5cdSakolb * fields enclosed by brackets "[]" replaced with your own identifying 17c97ad5cdSakolb * information: Portions Copyright [yyyy] [name of copyright owner] 18c97ad5cdSakolb * 19c97ad5cdSakolb * CDDL HEADER END 20c97ad5cdSakolb */ 21c97ad5cdSakolb 22c97ad5cdSakolb /* 23*d3d50737SRafael Vanoni * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24c97ad5cdSakolb * Use is subject to license terms. 
25c97ad5cdSakolb */ 26c97ad5cdSakolb 27c97ad5cdSakolb #include <sys/disp.h> 28c97ad5cdSakolb #include <sys/param.h> 29c97ad5cdSakolb #include <sys/systm.h> 30c97ad5cdSakolb #include <sys/sysmacros.h> 31c97ad5cdSakolb #include <sys/atomic.h> 32c97ad5cdSakolb #include <sys/cpucaps_impl.h> 33c97ad5cdSakolb #include <sys/dtrace.h> 34c97ad5cdSakolb #include <sys/sdt.h> 35c97ad5cdSakolb #include <sys/debug.h> 36c97ad5cdSakolb #include <sys/rctl.h> 37c97ad5cdSakolb #include <sys/errno.h> 38c97ad5cdSakolb 39c97ad5cdSakolb /* 40c97ad5cdSakolb * CPU Caps implementation 41c97ad5cdSakolb * ======================= 42c97ad5cdSakolb * 43c97ad5cdSakolb * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU 44c97ad5cdSakolb * usage for all projects running inside the zone. If the zone CPU cap is set 45c97ad5cdSakolb * below the project CPU cap, the latter will have no effect. 46c97ad5cdSakolb * 47c97ad5cdSakolb * When CPU usage of projects and/or zones reaches specified caps, threads in 48c97ad5cdSakolb * them do not get scheduled and instead are placed on wait queues associated 49c97ad5cdSakolb * with a cap. Such threads will start running again only when CPU usage drops 50c97ad5cdSakolb * below the cap level. Each zone and each project has its own wait queue. 51c97ad5cdSakolb * 52c97ad5cdSakolb * When CPU cap is set, the kernel continously keeps track of CPU time used by 53c97ad5cdSakolb * capped zones and/or projects over a short time interval and calculates their 54c97ad5cdSakolb * current CPU usage as a percentage. When the accumulated usage reaches the CPU 55c97ad5cdSakolb * cap, LWPs running in the user-land (when they are not holding any critical 56c97ad5cdSakolb * kernel locks) are placed on special wait queues until their project's or 57c97ad5cdSakolb * zone's CPU usage drops below the cap. 58c97ad5cdSakolb * 59c97ad5cdSakolb * The system maintains a list of all capped projects and all capped zones. 
On 60c97ad5cdSakolb * every clock tick every active thread belonging to a capped project adds its 61c97ad5cdSakolb * CPU usage to its project. Usage from all projects belonging to a capped zone 62c97ad5cdSakolb * is aggregated to get the zone usage. 63c97ad5cdSakolb * 64c97ad5cdSakolb * When the current CPU usage is above the cap, a project or zone is considered 65c97ad5cdSakolb * over-capped. Every user thread caught running in an over-capped project or 66c97ad5cdSakolb * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and 67c97ad5cdSakolb * is requested to surrender its CPU. This causes scheduling class specific 68c97ad5cdSakolb * CL_PREEMPT() callback to be invoked. The callback function places threads 69c97ad5cdSakolb * marked as TS_PROJWAIT on a wait queue and calls switch(). 70c97ad5cdSakolb * 71c97ad5cdSakolb * Threads are only placed on wait queues after trapping from user-land 72c97ad5cdSakolb * (they could be holding some user locks, but no kernel locks) and while 73c97ad5cdSakolb * returning from the trap back to the user-land when no kernel locks are held. 74c97ad5cdSakolb * Putting threads on wait queues in random places while running in the 75c97ad5cdSakolb * kernel might lead to all kinds of locking problems. 76c97ad5cdSakolb * 77c97ad5cdSakolb * Accounting 78c97ad5cdSakolb * ========== 79c97ad5cdSakolb * 80c97ad5cdSakolb * Accounting of CPU usage is based on per-thread micro-state accounting data. 81c97ad5cdSakolb * On every clock tick clock() adds new on-CPU time for every thread found on 82c97ad5cdSakolb * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU. 83c97ad5cdSakolb * New times means time since it was last accounted for. On-CPU times greater 84c97ad5cdSakolb * than 1 tick are truncated to 1 tick. 85c97ad5cdSakolb * 86c97ad5cdSakolb * Project CPU usage is aggregated from all threads within the project. 87c97ad5cdSakolb * Zone CPU usage is the sum of usages for all projects within the zone. 
Zone 88c97ad5cdSakolb * CPU usage is calculated on every clock tick by walking list of projects and 89c97ad5cdSakolb * adding their usage together. 90c97ad5cdSakolb * 91c97ad5cdSakolb * Decay 92c97ad5cdSakolb * ===== 93c97ad5cdSakolb * 94c97ad5cdSakolb * CPU usage is decayed by the caps_update() routine which is called once per 95c97ad5cdSakolb * every clock tick. It walks lists of project caps and decays their usages by 96c97ad5cdSakolb * one per cent. If CPU usage drops below cap levels, threads on the wait queue 97c97ad5cdSakolb * are made runnable again, one thread per clock tick. 98c97ad5cdSakolb * 99c97ad5cdSakolb * Interfaces 100c97ad5cdSakolb * ========== 101c97ad5cdSakolb * 102c97ad5cdSakolb * The CPU Caps facility provides the following interfaces to the rest of the 103c97ad5cdSakolb * system: 104c97ad5cdSakolb * 105c97ad5cdSakolb * cpucaps_project_add(kproject_t *) 106c97ad5cdSakolb * 107c97ad5cdSakolb * Notifies the framework of a new project. It should be put on the 108c97ad5cdSakolb * capped_projects list if its zone has a cap. 109c97ad5cdSakolb * 110c97ad5cdSakolb * cpucaps_project_remove(kproject_t *) 111c97ad5cdSakolb * 112c97ad5cdSakolb * Remove the association between the specified project and its cap. 113c97ad5cdSakolb * Called right before the project is destroyed. 114c97ad5cdSakolb * 115c97ad5cdSakolb * cpucaps_project_set(kproject_t *, rctl_qty_t) 116c97ad5cdSakolb * 117c97ad5cdSakolb * Set project cap of the specified project to the specified value. Setting the 118c97ad5cdSakolb * value to NOCAP is equivalent to removing the cap. 119c97ad5cdSakolb * 120c97ad5cdSakolb * cpucaps_zone_set(zone_t *, rctl_qty_t) 121c97ad5cdSakolb * 122c97ad5cdSakolb * Set zone cap of the specified zone to the specified value. Setting the value 123c97ad5cdSakolb * to NOCAP is equivalent to removing the cap. 
124c97ad5cdSakolb * 125c97ad5cdSakolb * cpucaps_zone_remove(zone_t *) 126c97ad5cdSakolb * 127c97ad5cdSakolb * Remove the association between the zone and its cap. 128c97ad5cdSakolb * 129c97ad5cdSakolb * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t) 130c97ad5cdSakolb * 131c97ad5cdSakolb * Charges specified thread's project the amount of on-CPU time that it used. 132c97ad5cdSakolb * If the third argument is CPUCAPS_CHARGE_ONLY returns False. 133c97ad5cdSakolb * Otherwise returns True if project or zone should be penalized because its 134c97ad5cdSakolb * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ 135c97ad5cdSakolb * bits in t_schedflag in this case. 136c97ad5cdSakolb * 137c97ad5cdSakolb * CPUCAPS_ENFORCE(kthread_id_t *) 138c97ad5cdSakolb * 139c97ad5cdSakolb * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER 140c97ad5cdSakolb * state on project or zone wait queues, as requested by TS_PROJWAITQ or 141c97ad5cdSakolb * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a 142c97ad5cdSakolb * wait queue or False otherwise. 143c97ad5cdSakolb * 144c97ad5cdSakolb * cpucaps_sc_init(caps_sc_t *) 145c97ad5cdSakolb * 146c97ad5cdSakolb * Initializes the scheduling-class specific CPU Caps data for a thread. 147c97ad5cdSakolb * 148c97ad5cdSakolb * LOCKS 149c97ad5cdSakolb * ===== 150c97ad5cdSakolb * 151c97ad5cdSakolb * all the individual caps structures and their lists are protected by a global 152c97ad5cdSakolb * caps_lock mutex. The lock is grabbed either by clock() or by events modifying 153c97ad5cdSakolb * caps, so it is usually uncontended. We avoid all blocking memory allocations 154c97ad5cdSakolb * while holding caps_lock to prevent clock() from blocking. 155c97ad5cdSakolb * 156c97ad5cdSakolb * Thread state is protected by the thread lock. It protects the association 157c97ad5cdSakolb * between a thread and its project and, as a consequence, to its zone. 
 * The association can not break while thread lock is held, so the project or
 * zone cap are not going to disappear while thread lock is held.
 *
 * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
 * grabbed by scheduling classes already holding thread lock at high PIL and by
 * clock thread performing usage decay. We should do as little work as possible
 * while holding the lock since it may be very hot. All threads in the project
 * contend for the same cache line doing cap usage updates.
 */

/*
 * caps_lock protects list of capped projects and zones, changes in the cap
 * state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This can be per-zone cap flag, but we don't keep any
 * cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick.
 * Decay one per cent of value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))

static void caps_update();

/*
 * CAP kstats.  A single static template is shared by all caps; ks_update
 * (cap_kstat_update) fills it from the per-cap structure under
 * cap_kstat_lock before each read.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};


static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects
 * - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data for a thread.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure.  May block in kmem_zalloc()
 * (KM_SLEEP), so callers allocate before taking caps_lock.
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure.  Safe to call with NULL.
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate cap - insert into active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 * If this is the first cap in the system, also turns the global
 * framework on by installing the clock callback.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * Cap can not be already enabled
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 *   - Block its wait queue. This prevents any new threads from being
 *	enqueued there and moves all enqueued threads to the run queue.
 *   - Remove cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones
 *     (clears cpucaps_enabled and removes the clock callback).
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for a project kpj
 * It is safe to enable already enabled project cap.
 * Should be called with caps_lock held.
362c97ad5cdSakolb */ 363c97ad5cdSakolb static void 364c97ad5cdSakolb cap_project_enable(kproject_t *kpj, hrtime_t value) 365c97ad5cdSakolb { 366c97ad5cdSakolb cpucap_t *cap = kpj->kpj_cpucap; 367c97ad5cdSakolb 368c97ad5cdSakolb ASSERT(MUTEX_HELD(&caps_lock)); 369c97ad5cdSakolb ASSERT(cap != NULL); 370c97ad5cdSakolb 371c97ad5cdSakolb if (CAP_DISABLED(cap)) { 372c97ad5cdSakolb ASSERT(cap->cap_kstat == NULL); 373c97ad5cdSakolb cap_enable(&capped_projects, cap, value); 374c97ad5cdSakolb cap->cap_project = kpj; 375c97ad5cdSakolb cap->cap_zone = kpj->kpj_zone; 376c97ad5cdSakolb 377c97ad5cdSakolb /* 378c97ad5cdSakolb * Create cap kstats 379c97ad5cdSakolb */ 380c97ad5cdSakolb if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps", 381c97ad5cdSakolb KSTAT_TYPE_NAMED, 382c97ad5cdSakolb sizeof (cap_kstat) / sizeof (kstat_named_t), 383c97ad5cdSakolb KSTAT_FLAG_VIRTUAL)) != NULL) { 384c97ad5cdSakolb cap->cap_kstat->ks_data_size += 385c97ad5cdSakolb strlen(cap->cap_zone->zone_name) + 1; 386c97ad5cdSakolb cap->cap_kstat->ks_lock = &cap_kstat_lock; 387c97ad5cdSakolb cap->cap_kstat->ks_data = &cap_kstat; 388c97ad5cdSakolb cap->cap_kstat->ks_update = cap_kstat_update; 389c97ad5cdSakolb cap->cap_kstat->ks_private = cap; 390c97ad5cdSakolb kstat_install(cap->cap_kstat); 391c97ad5cdSakolb } 392c97ad5cdSakolb } 393c97ad5cdSakolb } 394c97ad5cdSakolb 395c97ad5cdSakolb /* 396c97ad5cdSakolb * Disable project cap. 397c97ad5cdSakolb * It is safe to disable already disabled project cap. 398c97ad5cdSakolb * Should be called with caps_lock held. 
399c97ad5cdSakolb */ 400c97ad5cdSakolb static void 401c97ad5cdSakolb cap_project_disable(kproject_t *kpj) 402c97ad5cdSakolb { 403c97ad5cdSakolb cpucap_t *cap = kpj->kpj_cpucap; 404c97ad5cdSakolb 405c97ad5cdSakolb ASSERT(MUTEX_HELD(&caps_lock)); 406c97ad5cdSakolb ASSERT(cap != NULL); 407c97ad5cdSakolb ASSERT(cap->cap_project == kpj); 408c97ad5cdSakolb 409c97ad5cdSakolb if (CAP_ENABLED(cap)) 410c97ad5cdSakolb cap_disable(&capped_projects, cap); 411c97ad5cdSakolb } 412c97ad5cdSakolb 413c97ad5cdSakolb /* 414c97ad5cdSakolb * Enable cap for a zone 415c97ad5cdSakolb * It is safe to enable already enabled zone cap. 416c97ad5cdSakolb * Should be called with caps_lock held. 417c97ad5cdSakolb */ 418c97ad5cdSakolb static void 419c97ad5cdSakolb cap_zone_enable(zone_t *zone, hrtime_t value) 420c97ad5cdSakolb { 421c97ad5cdSakolb cpucap_t *cap = zone->zone_cpucap; 422c97ad5cdSakolb 423c97ad5cdSakolb ASSERT(MUTEX_HELD(&caps_lock)); 424c97ad5cdSakolb ASSERT(cap != NULL); 425c97ad5cdSakolb 426c97ad5cdSakolb if (CAP_DISABLED(cap)) { 427c97ad5cdSakolb ASSERT(cap->cap_kstat == NULL); 428c97ad5cdSakolb cap_enable(&capped_zones, cap, value); 429c97ad5cdSakolb cap->cap_zone = zone; 430c97ad5cdSakolb 431c97ad5cdSakolb /* 432c97ad5cdSakolb * Create cap kstats 433c97ad5cdSakolb */ 434c97ad5cdSakolb if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps", 435c97ad5cdSakolb KSTAT_TYPE_NAMED, 436c97ad5cdSakolb sizeof (cap_kstat) / sizeof (kstat_named_t), 437c97ad5cdSakolb KSTAT_FLAG_VIRTUAL)) != NULL) { 438c97ad5cdSakolb cap->cap_kstat->ks_data_size += 439c97ad5cdSakolb strlen(cap->cap_zone->zone_name) + 1; 440c97ad5cdSakolb cap->cap_kstat->ks_lock = &cap_kstat_lock; 441c97ad5cdSakolb cap->cap_kstat->ks_data = &cap_kstat; 442c97ad5cdSakolb cap->cap_kstat->ks_update = cap_kstat_update; 443c97ad5cdSakolb cap->cap_kstat->ks_private = cap; 444c97ad5cdSakolb kstat_install(cap->cap_kstat); 445c97ad5cdSakolb } 446c97ad5cdSakolb } 447c97ad5cdSakolb } 448c97ad5cdSakolb 449c97ad5cdSakolb /* 
450c97ad5cdSakolb * Disable zone cap. 451c97ad5cdSakolb * It is safe to disable already disabled zone cap. 452c97ad5cdSakolb * Should be called with caps_lock held. 453c97ad5cdSakolb */ 454c97ad5cdSakolb static void 455c97ad5cdSakolb cap_zone_disable(zone_t *zone) 456c97ad5cdSakolb { 457c97ad5cdSakolb cpucap_t *cap = zone->zone_cpucap; 458c97ad5cdSakolb 459c97ad5cdSakolb ASSERT(MUTEX_HELD(&caps_lock)); 460c97ad5cdSakolb ASSERT(cap != NULL); 461c97ad5cdSakolb ASSERT(cap->cap_zone == zone); 462c97ad5cdSakolb 463c97ad5cdSakolb if (CAP_ENABLED(cap)) 464c97ad5cdSakolb cap_disable(&capped_zones, cap); 465c97ad5cdSakolb } 466c97ad5cdSakolb 467c97ad5cdSakolb /* 468c97ad5cdSakolb * Apply specified callback to all caps contained in the list `l'. 469c97ad5cdSakolb */ 470c97ad5cdSakolb static void 471*d3d50737SRafael Vanoni cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) 472c97ad5cdSakolb { 473*d3d50737SRafael Vanoni static uint64_t cpucap_walk_gen; 474c97ad5cdSakolb cpucap_t *cap; 475c97ad5cdSakolb 476c97ad5cdSakolb ASSERT(MUTEX_HELD(&caps_lock)); 477c97ad5cdSakolb 478c97ad5cdSakolb for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) { 479*d3d50737SRafael Vanoni (*cb)(cap, cpucap_walk_gen); 480c97ad5cdSakolb } 481*d3d50737SRafael Vanoni 482*d3d50737SRafael Vanoni atomic_inc_64(&cpucap_walk_gen); 483c97ad5cdSakolb } 484c97ad5cdSakolb 485c97ad5cdSakolb /* 486c97ad5cdSakolb * If cap limit is not reached, make one thread from wait queue runnable. 487c97ad5cdSakolb * The waitq_isempty check is performed without the waitq lock. If a new thread 488c97ad5cdSakolb * is placed on the waitq right after the check, it will be picked up during the 489c97ad5cdSakolb * next invocation of cap_poke_waitq(). 
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		/* At most one thread is released per tick (see Decay above) */
		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on capped_projects list.
 * Decay cap usage by CAP_DECAY_FACTOR
 * Add this cap project usage to its zone usage.
 * Kick off a thread from the cap waitq if cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap, 0);

	/*
	 * Add project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_gen field is used to check
		 * whether this is the first zone's project we see during this
		 * tick or a subsequent one.
		 */
		if (zcap->cap_gen != gen) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_gen = gen;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if zone cap is enabled and disables if the
 * zone cap is disabled and project doesn't have its own cap.
 *
 * For each project that does not have cpucap structure allocated it allocates a
 * new structure and assigns to kpj->cpu_cap. The allocation is performed
 * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
 * held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
	 * and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		/* Lost the race; discard our speculative allocation */
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without caps
		 * from the capped_projects list.
		 * (cap_value == MAX_USAGE marks a project capped only
		 * because of its zone's cap, not by its own rctl.)
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set zone cap to cap_val
 * If cap_val is equal to NOCAP, disable zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate cpucap structure
 * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	/* A cap of zero makes no sense; reject it */
	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are off
	 * or a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	/* Allocate before taking caps_lock - cap_alloc() may sleep */
	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	/*
	 * Another zone cap change is in the middle of its project walk;
	 * refuse rather than race with it.
	 */
	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	/*
	 * Convert percentage of a CPU into nanoseconds-per-tick.
	 * NOTE(review): the negative check presumably catches multiplication
	 * overflow of hrtime_t (signed overflow is UB in C); clamping to
	 * MAX_USAGE treats an overflowed cap as effectively unlimited.
	 */
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;


	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove cap for the zone.  cpucaps_busy guards the
			 * window where caps_lock is dropped so that
			 * project_walk_all() (which may sleep) can run.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all project belonging to this zone
			 * unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
777c97ad5cdSakolb */ 778c97ad5cdSakolb void 779c97ad5cdSakolb cpucaps_zone_remove(zone_t *zone) 780c97ad5cdSakolb { 781c97ad5cdSakolb mutex_enter(&caps_lock); 782c97ad5cdSakolb while (ZONE_IS_CAPPED(zone)) { 783c97ad5cdSakolb mutex_exit(&caps_lock); 784c97ad5cdSakolb (void) cpucaps_zone_set(zone, NOCAP); 785c97ad5cdSakolb mutex_enter(&caps_lock); 786c97ad5cdSakolb } 787c97ad5cdSakolb if (zone->zone_cpucap != NULL) { 788c97ad5cdSakolb cap_free(zone->zone_cpucap); 789c97ad5cdSakolb zone->zone_cpucap = NULL; 790c97ad5cdSakolb } 791c97ad5cdSakolb mutex_exit(&caps_lock); 792c97ad5cdSakolb } 793c97ad5cdSakolb 794c97ad5cdSakolb /* 795c97ad5cdSakolb * New project was created. It should be put on the capped_projects list if 796c97ad5cdSakolb * its zone has a cap. 797c97ad5cdSakolb */ 798c97ad5cdSakolb void 799c97ad5cdSakolb cpucaps_project_add(kproject_t *kpj) 800c97ad5cdSakolb { 801c97ad5cdSakolb cpucap_t *cap = NULL; 802c97ad5cdSakolb 803c97ad5cdSakolb if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone)) 804c97ad5cdSakolb return; 805c97ad5cdSakolb 806c97ad5cdSakolb /* 807c97ad5cdSakolb * This project was never capped before, so allocate its cap structure. 808c97ad5cdSakolb */ 809c97ad5cdSakolb if (kpj->kpj_cpucap == NULL) 810c97ad5cdSakolb cap = cap_alloc(); 811c97ad5cdSakolb 812c97ad5cdSakolb mutex_enter(&caps_lock); 813c97ad5cdSakolb /* 814c97ad5cdSakolb * Double-check with caps_lock held 815c97ad5cdSakolb */ 816c97ad5cdSakolb if (kpj->kpj_cpucap == NULL) { 817c97ad5cdSakolb kpj->kpj_cpucap = cap; 818c97ad5cdSakolb } else if (cap != NULL) { 819c97ad5cdSakolb cap_free(cap); 820c97ad5cdSakolb } 821c97ad5cdSakolb 822c97ad5cdSakolb if (ZONE_IS_CAPPED(kpj->kpj_zone)) 823c97ad5cdSakolb cap_project_enable(kpj, MAX_USAGE); 824c97ad5cdSakolb 825c97ad5cdSakolb mutex_exit(&caps_lock); 826c97ad5cdSakolb } 827c97ad5cdSakolb 828c97ad5cdSakolb /* 829c97ad5cdSakolb * Set project cap to cap_val 830c97ad5cdSakolb * If cap_val is equal to NOCAP, disable project cap. 
831c97ad5cdSakolb * 832c97ad5cdSakolb * If this is the first time a cap is set on a project, allocate cpucap 833c97ad5cdSakolb * structure without holding caps_lock to avoid KM_SLEEP allocation with 834c97ad5cdSakolb * caps_lock held. 835c97ad5cdSakolb */ 836c97ad5cdSakolb int 837c97ad5cdSakolb cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) 838c97ad5cdSakolb { 839c97ad5cdSakolb cpucap_t *cap = NULL; 840c97ad5cdSakolb hrtime_t value; 841c97ad5cdSakolb 842c97ad5cdSakolb if (cap_val == 0) 843c97ad5cdSakolb return (EINVAL); 844c97ad5cdSakolb 845c97ad5cdSakolb ASSERT(cap_val <= MAXCAP); 846c97ad5cdSakolb if (cap_val > MAXCAP) 847c97ad5cdSakolb cap_val = MAXCAP; 848c97ad5cdSakolb 849c97ad5cdSakolb /* 850c97ad5cdSakolb * Nothing to do if trying to disable project cap and caps are not 851c97ad5cdSakolb * enabled or if trying to disable cap on a project that does not have 852c97ad5cdSakolb * cap enabled. 853c97ad5cdSakolb */ 854c97ad5cdSakolb if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj))) 855c97ad5cdSakolb return (0); 856c97ad5cdSakolb 857c97ad5cdSakolb if (kpj->kpj_cpucap == NULL) { 858c97ad5cdSakolb /* 859c97ad5cdSakolb * This project was never capped before, so allocate its cap 860c97ad5cdSakolb * structure. 861c97ad5cdSakolb */ 862c97ad5cdSakolb cap = cap_alloc(); 863c97ad5cdSakolb } 864c97ad5cdSakolb 865c97ad5cdSakolb mutex_enter(&caps_lock); 866c97ad5cdSakolb 867c97ad5cdSakolb /* 868c97ad5cdSakolb * Double-check with caps_lock held. 869c97ad5cdSakolb */ 870c97ad5cdSakolb if (kpj->kpj_cpucap == NULL) { 871c97ad5cdSakolb kpj->kpj_cpucap = cap; 872c97ad5cdSakolb } else if (cap != NULL) { 873c97ad5cdSakolb cap_free(cap); 874c97ad5cdSakolb } 875c97ad5cdSakolb 876c97ad5cdSakolb /* 877c97ad5cdSakolb * Get the actual pointer to the project cap. 
878c97ad5cdSakolb */ 879c97ad5cdSakolb cap = kpj->kpj_cpucap; 880c97ad5cdSakolb value = cap_val * cap_tick_cost; 881c97ad5cdSakolb if (value < 0) 882c97ad5cdSakolb value = MAX_USAGE; 883c97ad5cdSakolb 884c97ad5cdSakolb /* 885c97ad5cdSakolb * Nothing to do if the value is not changing 886c97ad5cdSakolb */ 887c97ad5cdSakolb if (value == cap->cap_value) { 888c97ad5cdSakolb mutex_exit(&caps_lock); 889c97ad5cdSakolb return (0); 890c97ad5cdSakolb } 891c97ad5cdSakolb 892c97ad5cdSakolb /* 893c97ad5cdSakolb * Clear cap statistics since the cap value itself changes. 894c97ad5cdSakolb */ 895c97ad5cdSakolb cap->cap_above = cap->cap_below = 0; 896c97ad5cdSakolb cap->cap_maxusage = 0; 897c97ad5cdSakolb 898c97ad5cdSakolb if (cap_val != NOCAP) { 899c97ad5cdSakolb /* 900c97ad5cdSakolb * Enable this cap if it is not already enabled. 901c97ad5cdSakolb */ 902c97ad5cdSakolb if (CAP_DISABLED(cap)) 903c97ad5cdSakolb cap_project_enable(kpj, value); 904c97ad5cdSakolb else 905c97ad5cdSakolb cap->cap_value = value; 906c97ad5cdSakolb } else if (CAP_ENABLED(cap)) { 907c97ad5cdSakolb /* 908c97ad5cdSakolb * User requested to drop a cap on the project. If it is part of 909c97ad5cdSakolb * capped zone, keep the cap and set the value to MAX_USAGE, 910c97ad5cdSakolb * otherwise disable the cap. 911c97ad5cdSakolb */ 912c97ad5cdSakolb if (ZONE_IS_CAPPED(kpj->kpj_zone)) { 913c97ad5cdSakolb cap->cap_value = MAX_USAGE; 914c97ad5cdSakolb } else { 915c97ad5cdSakolb cap_project_disable(kpj); 916c97ad5cdSakolb } 917c97ad5cdSakolb } 918c97ad5cdSakolb mutex_exit(&caps_lock); 919c97ad5cdSakolb 920c97ad5cdSakolb return (0); 921c97ad5cdSakolb } 922c97ad5cdSakolb 923c97ad5cdSakolb /* 924c97ad5cdSakolb * Get cap usage. 925c97ad5cdSakolb */ 926c97ad5cdSakolb static rctl_qty_t 927c97ad5cdSakolb cap_get(cpucap_t *cap) 928c97ad5cdSakolb { 929c97ad5cdSakolb return (cap != NULL ? 
(rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0); 930c97ad5cdSakolb } 931c97ad5cdSakolb 932c97ad5cdSakolb /* 933c97ad5cdSakolb * Get current project usage. 934c97ad5cdSakolb */ 935c97ad5cdSakolb rctl_qty_t 936c97ad5cdSakolb cpucaps_project_get(kproject_t *kpj) 937c97ad5cdSakolb { 938c97ad5cdSakolb return (cap_get(kpj->kpj_cpucap)); 939c97ad5cdSakolb } 940c97ad5cdSakolb 941c97ad5cdSakolb /* 942c97ad5cdSakolb * Get current zone usage. 943c97ad5cdSakolb */ 944c97ad5cdSakolb rctl_qty_t 945c97ad5cdSakolb cpucaps_zone_get(zone_t *zone) 946c97ad5cdSakolb { 947c97ad5cdSakolb return (cap_get(zone->zone_cpucap)); 948c97ad5cdSakolb } 949c97ad5cdSakolb 950c97ad5cdSakolb /* 951c97ad5cdSakolb * Charge project of thread t the time thread t spent on CPU since previously 952c97ad5cdSakolb * adjusted. 953c97ad5cdSakolb * 954c97ad5cdSakolb * Record the current on-CPU time in the csc structure. 955c97ad5cdSakolb * 956c97ad5cdSakolb * Do not adjust for more than one tick worth of time. 957c97ad5cdSakolb * 9584b175f6fSakolb * It is possible that the project cap is being disabled while this routine is 9594b175f6fSakolb * executed. This should not cause any issues since the association between the 9604b175f6fSakolb * thread and its project is protected by thread lock. 
961c97ad5cdSakolb */ 962c97ad5cdSakolb static void 963c97ad5cdSakolb caps_charge_adjust(kthread_id_t t, caps_sc_t *csc) 964c97ad5cdSakolb { 965c97ad5cdSakolb kproject_t *kpj = ttoproj(t); 966c97ad5cdSakolb hrtime_t new_usage; 967c97ad5cdSakolb hrtime_t usage_delta; 968c97ad5cdSakolb 969c97ad5cdSakolb ASSERT(THREAD_LOCK_HELD(t)); 9704b175f6fSakolb ASSERT(kpj->kpj_cpucap != NULL); 971c97ad5cdSakolb 972c97ad5cdSakolb /* Get on-CPU time since birth of a thread */ 973c97ad5cdSakolb new_usage = mstate_thread_onproc_time(t); 974c97ad5cdSakolb 975c97ad5cdSakolb /* Time spent on CPU since last checked */ 976c97ad5cdSakolb usage_delta = new_usage - csc->csc_cputime; 977c97ad5cdSakolb 978c97ad5cdSakolb /* Save the accumulated on-CPU time */ 979c97ad5cdSakolb csc->csc_cputime = new_usage; 980c97ad5cdSakolb 981c97ad5cdSakolb /* Charge at most one tick worth of on-CPU time */ 982c97ad5cdSakolb if (usage_delta > cap_tick_cost) 983c97ad5cdSakolb usage_delta = cap_tick_cost; 984c97ad5cdSakolb 985c97ad5cdSakolb /* Add usage_delta to the project usage value. */ 986c97ad5cdSakolb if (usage_delta > 0) { 987c97ad5cdSakolb cpucap_t *cap = kpj->kpj_cpucap; 988c97ad5cdSakolb 989c97ad5cdSakolb DTRACE_PROBE2(cpucaps__project__charge, 990c97ad5cdSakolb kthread_id_t, t, hrtime_t, usage_delta); 991c97ad5cdSakolb 992c97ad5cdSakolb disp_lock_enter_high(&cap->cap_usagelock); 993c97ad5cdSakolb cap->cap_usage += usage_delta; 994c97ad5cdSakolb 995c97ad5cdSakolb /* Check for overflows */ 996c97ad5cdSakolb if (cap->cap_usage < 0) 997c97ad5cdSakolb cap->cap_usage = MAX_USAGE - 1; 998c97ad5cdSakolb 999c97ad5cdSakolb disp_lock_exit_high(&cap->cap_usagelock); 1000c97ad5cdSakolb 1001c97ad5cdSakolb /* 1002c97ad5cdSakolb * cap_maxusage is only kept for observability. Move it outside 1003c97ad5cdSakolb * the lock to reduce the time spent while holding the lock. 
1004c97ad5cdSakolb */ 1005c97ad5cdSakolb if (cap->cap_usage > cap->cap_maxusage) 1006c97ad5cdSakolb cap->cap_maxusage = cap->cap_usage; 1007c97ad5cdSakolb } 1008c97ad5cdSakolb } 1009c97ad5cdSakolb 1010c97ad5cdSakolb /* 1011c97ad5cdSakolb * Charge thread's project and return True if project or zone should be 1012c97ad5cdSakolb * penalized because its project or zone is exceeding its cap. Also sets 1013c97ad5cdSakolb * TS_PROJWAITQ or TS_ZONEWAITQ in this case. 10144b175f6fSakolb * 10154b175f6fSakolb * It is possible that the project cap is being disabled while this routine is 10164b175f6fSakolb * executed. This should not cause any issues since the association between the 10174b175f6fSakolb * thread and its project is protected by thread lock. It will still set 10184b175f6fSakolb * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place 10194b175f6fSakolb * anything on the blocked wait queue. 10204b175f6fSakolb * 1021c97ad5cdSakolb */ 1022c97ad5cdSakolb boolean_t 1023c97ad5cdSakolb cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) 1024c97ad5cdSakolb { 1025c97ad5cdSakolb kproject_t *kpj = ttoproj(t); 1026c97ad5cdSakolb klwp_t *lwp = t->t_lwp; 1027c97ad5cdSakolb zone_t *zone; 1028c97ad5cdSakolb cpucap_t *project_cap; 1029c97ad5cdSakolb boolean_t rc = B_FALSE; 1030c97ad5cdSakolb 1031c97ad5cdSakolb ASSERT(THREAD_LOCK_HELD(t)); 1032c97ad5cdSakolb 1033c97ad5cdSakolb /* Nothing to do for projects that are not capped. */ 1034c97ad5cdSakolb if (lwp == NULL || !PROJECT_IS_CAPPED(kpj)) 1035c97ad5cdSakolb return (B_FALSE); 1036c97ad5cdSakolb 1037c97ad5cdSakolb caps_charge_adjust(t, csc); 1038c97ad5cdSakolb 1039c97ad5cdSakolb /* 1040c97ad5cdSakolb * The caller only requested to charge the project usage, no enforcement 1041c97ad5cdSakolb * part. 
1042c97ad5cdSakolb */ 1043c97ad5cdSakolb if (charge_type == CPUCAPS_CHARGE_ONLY) 1044c97ad5cdSakolb return (B_FALSE); 1045c97ad5cdSakolb 1046c97ad5cdSakolb project_cap = kpj->kpj_cpucap; 1047c97ad5cdSakolb 1048c97ad5cdSakolb if (project_cap->cap_usage >= project_cap->cap_value) { 1049c97ad5cdSakolb t->t_schedflag |= TS_PROJWAITQ; 1050c97ad5cdSakolb rc = B_TRUE; 1051c97ad5cdSakolb } else if (t->t_schedflag & TS_PROJWAITQ) { 1052c97ad5cdSakolb t->t_schedflag &= ~TS_PROJWAITQ; 1053c97ad5cdSakolb } 1054c97ad5cdSakolb 1055c97ad5cdSakolb zone = ttozone(t); 1056c97ad5cdSakolb if (!ZONE_IS_CAPPED(zone)) { 1057c97ad5cdSakolb if (t->t_schedflag & TS_ZONEWAITQ) 1058c97ad5cdSakolb t->t_schedflag &= ~TS_ZONEWAITQ; 1059c97ad5cdSakolb } else { 1060c97ad5cdSakolb cpucap_t *zone_cap = zone->zone_cpucap; 1061c97ad5cdSakolb 1062c97ad5cdSakolb if (zone_cap->cap_usage >= zone_cap->cap_value) { 1063c97ad5cdSakolb t->t_schedflag |= TS_ZONEWAITQ; 1064c97ad5cdSakolb rc = B_TRUE; 1065c97ad5cdSakolb } else if (t->t_schedflag & TS_ZONEWAITQ) { 1066c97ad5cdSakolb t->t_schedflag &= ~TS_ZONEWAITQ; 1067c97ad5cdSakolb } 1068c97ad5cdSakolb } 1069c97ad5cdSakolb 1070c97ad5cdSakolb 1071c97ad5cdSakolb return (rc); 1072c97ad5cdSakolb } 1073c97ad5cdSakolb 1074c97ad5cdSakolb /* 1075c97ad5cdSakolb * Enforce CPU caps. If got preempted in the user-land, we know that thread does 1076c97ad5cdSakolb * not hold any kernel locks, so enqueue ourselves on the waitq, if needed. 1077c97ad5cdSakolb * 1078c97ad5cdSakolb * CPU Caps are only enforced for user threads. 1079c97ad5cdSakolb * 1080c97ad5cdSakolb * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and 1081c97ad5cdSakolb * threads marked with TS_ZONEWAITQ are placed on their zone wait queue. 1082c97ad5cdSakolb * 1083c97ad5cdSakolb * It is possible that by the time we enter cpucaps_enforce() the cap is already 1084c97ad5cdSakolb * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. 
We 1085c97ad5cdSakolb * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer 1086c97ad5cdSakolb * apply. 1087c97ad5cdSakolb */ 1088c97ad5cdSakolb boolean_t 1089c97ad5cdSakolb cpucaps_enforce(kthread_t *t) 1090c97ad5cdSakolb { 1091c97ad5cdSakolb klwp_t *lwp = t->t_lwp; 1092c97ad5cdSakolb 1093c97ad5cdSakolb ASSERT(THREAD_LOCK_HELD(t)); 1094c97ad5cdSakolb 1095c97ad5cdSakolb if (lwp != NULL && lwp->lwp_state == LWP_USER) { 1096c97ad5cdSakolb if (t->t_schedflag & TS_PROJWAITQ) { 1097c97ad5cdSakolb ASSERT(ttoproj(t)->kpj_cpucap != NULL); 1098c97ad5cdSakolb t->t_schedflag &= ~TS_ANYWAITQ; 1099c97ad5cdSakolb if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq), 1100c97ad5cdSakolb t)) { 1101c97ad5cdSakolb return (B_TRUE); 1102c97ad5cdSakolb } 1103c97ad5cdSakolb } 1104c97ad5cdSakolb if (t->t_schedflag & TS_ZONEWAITQ) { 1105c97ad5cdSakolb ASSERT(ttozone(t)->zone_cpucap != NULL); 1106c97ad5cdSakolb t->t_schedflag &= ~TS_ZONEWAITQ; 1107c97ad5cdSakolb if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq), 1108c97ad5cdSakolb t)) { 1109c97ad5cdSakolb return (B_TRUE); 1110c97ad5cdSakolb } 1111c97ad5cdSakolb } 1112c97ad5cdSakolb } 1113c97ad5cdSakolb 1114c97ad5cdSakolb /* 1115c97ad5cdSakolb * The thread is not enqueued on the wait queue. 1116c97ad5cdSakolb */ 1117c97ad5cdSakolb return (B_FALSE); 1118c97ad5cdSakolb } 1119c97ad5cdSakolb 1120c97ad5cdSakolb /* 1121c97ad5cdSakolb * Convert internal cap statistics into values exported by cap kstat. 
1122c97ad5cdSakolb */ 1123c97ad5cdSakolb static int 1124c97ad5cdSakolb cap_kstat_update(kstat_t *ksp, int rw) 1125c97ad5cdSakolb { 1126c97ad5cdSakolb struct cap_kstat *capsp = &cap_kstat; 1127c97ad5cdSakolb cpucap_t *cap = ksp->ks_private; 1128c97ad5cdSakolb clock_t tick_sec = SEC_TO_TICK(1); 1129c97ad5cdSakolb char *zonename = cap->cap_zone->zone_name; 1130c97ad5cdSakolb 1131c97ad5cdSakolb if (rw == KSTAT_WRITE) 1132c97ad5cdSakolb return (EACCES); 1133c97ad5cdSakolb 1134c97ad5cdSakolb capsp->cap_value.value.ui64 = 1135c97ad5cdSakolb ROUND_SCALE(cap->cap_value, cap_tick_cost); 1136c97ad5cdSakolb capsp->cap_usage.value.ui64 = 1137c97ad5cdSakolb ROUND_SCALE(cap->cap_usage, cap_tick_cost); 1138c97ad5cdSakolb capsp->cap_maxusage.value.ui64 = 1139c97ad5cdSakolb ROUND_SCALE(cap->cap_maxusage, cap_tick_cost); 1140c97ad5cdSakolb capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; 1141c97ad5cdSakolb capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); 1142c97ad5cdSakolb capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); 1143c97ad5cdSakolb kstat_named_setstr(&capsp->cap_zonename, zonename); 1144c97ad5cdSakolb 1145c97ad5cdSakolb return (0); 1146c97ad5cdSakolb } 1147