xref: /illumos-gate/usr/src/uts/common/disp/cpucaps.c (revision 2aeafac3612e19716bf8164f89c3c9196342979c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/disp.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/atomic.h>
32 #include <sys/cpucaps_impl.h>
33 #include <sys/dtrace.h>
34 #include <sys/sdt.h>
35 #include <sys/debug.h>
36 #include <sys/rctl.h>
37 #include <sys/errno.h>
38 
39 /*
40  * CPU Caps implementation
41  * =======================
42  *
43  * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
44  * usage for all projects running inside the zone. If the zone CPU cap is set
45  * below the project CPU cap, the latter will have no effect.
46  *
47  * When CPU usage of projects and/or zones reaches specified caps, threads in
48  * them do not get scheduled and instead are placed on wait queues associated
49  * with a cap. Such threads will start running again only when CPU usage drops
50  * below the cap level. Each zone and each project has its own wait queue.
51  *
52  * When CPU cap is set, the kernel continuously keeps track of CPU time used
53  * by capped zones and/or projects over a short time interval and calculates
54  * their current CPU usage as a percentage. When the accumulated usage reaches
55  * the CPU cap, LWPs running in the user-land (when they are not holding any
56  * critical kernel locks) are placed on special wait queues until their
57  * project's or zone's CPU usage drops below the cap.
58  *
59  * The system maintains a list of all capped projects and all capped zones. On
60  * every clock tick every active thread belonging to a capped project adds its
61  * CPU usage to its project. Usage from all projects belonging to a capped zone
62  * is aggregated to get the zone usage.
63  *
64  * When the current CPU usage is above the cap, a project or zone is considered
65  * over-capped. Every user thread caught running in an over-capped project or
66  * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
67  * is requested to surrender its CPU. This causes scheduling class specific
68  * CL_PREEMPT() callback to be invoked. The callback function places threads
69  * marked as TS_PROJWAITQ on a wait queue and calls swtch().
70  *
71  * Threads are only placed on wait queues after trapping from user-land
72  * (they could be holding some user locks, but no kernel locks) and while
73  * returning from the trap back to the user-land when no kernel locks are held.
74  * Putting threads on wait queues in random places while running in the
75  * kernel might lead to all kinds of locking problems.
76  *
77  * Accounting
78  * ==========
79  *
80  * Accounting of CPU usage is based on per-thread micro-state accounting data.
81  * On every clock tick clock() adds new on-CPU time for every thread found on
82  * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
83  * New times means time since it was last accounted for. On-CPU times greater
84  * than 1 tick are truncated to 1 tick.
85  *
86  * Project CPU usage is aggregated from all threads within the project.
87  * Zone CPU usage is the sum of usages for all projects within the zone. Zone
88  * CPU usage is calculated on every clock tick by walking list of projects and
89  * adding their usage together.
90  *
91  * Decay
92  * =====
93  *
94  * CPU usage is decayed by the caps_update() routine which is called once per
95  * every clock tick. It walks lists of project caps and decays their usages by
96  * one per cent. If CPU usage drops below cap levels, threads on the wait queue
97  * are made runnable again, one thread per clock tick.
98  *
99  * Interfaces
100  * ==========
101  *
102  * The CPU Caps facility provides the following interfaces to the rest of the
103  * system:
104  *
105  *   cpucaps_project_add(kproject_t *)
106  *
107  * Notifies the framework of a new project. It should be put on the
108  * capped_projects list if its zone has a cap.
109  *
110  *   cpucaps_project_remove(kproject_t *)
111  *
112  * Remove the association between the specified project and its cap.
113  * Called right before the project is destroyed.
114  *
115  *   cpucaps_project_set(kproject_t *, rctl_qty_t)
116  *
117  * Set project cap of the specified project to the specified value. Setting the
118  * value to NOCAP is equivalent to removing the cap.
119  *
120  *   cpucaps_zone_set(zone_t *, rctl_qty_t)
121  *
122  * Set zone cap of the specified zone to the specified value. Setting the value
123  * to NOCAP is equivalent to removing the cap.
124  *
125  *   cpucaps_zone_remove(zone_t *)
126  *
127  * Remove the association between the zone and its cap.
128  *
129  *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
130  *
131  * Charges specified thread's project the amount of on-CPU time that it used.
132  * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
133  * Otherwise returns True if project or zone should be penalized because its
134  * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
135  * bits in t_schedflag in this case.
136  *
137  *   CPUCAPS_ENFORCE(kthread_t *)
138  *
139  * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
140  * state on project or zone wait queues, as requested by TS_PROJWAITQ or
141  * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
142  * wait queue or False otherwise.
143  *
144  *   cpucaps_sc_init(caps_sc_t *)
145  *
146  * Initializes the scheduling-class specific CPU Caps data for a thread.
147  *
148  * LOCKS
149  * =====
150  *
151  * All the individual caps structures and their lists are protected by a global
152  * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
153  * caps, so it is usually uncontended. We avoid all blocking memory allocations
154  * while holding caps_lock to prevent clock() from blocking.
155  *
156  * Thread state is protected by the thread lock. It protects the association
157  * between a thread and its project and, as a consequence, to its zone. The
158  * association can not break while thread lock is held, so the project or zone
159  * cap are not going to disappear while thread lock is held.
160  *
161  * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
162  * grabbed by scheduling classes already holding thread lock at high PIL and by
163  * clock thread performing usage decay. We should do as little work as possible
164  * while holding the lock since it may be very hot. All threads in the project
165  * contend for the same cache line doing cap usage updates.
166  */
167 
168 /*
169  * caps_lock protects list of capped projects and zones, changes in the cap
170  * state and changes of the global cpucaps_enabled flag.
171  *
172  * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
173  * modified in parallel. This can be per-zone cap flag, but we don't keep any
174  * cap state for now.
175  */
176 static kmutex_t caps_lock;		/* lock to protect: */
177 static list_t capped_zones;		/* - list of zones with caps */
178 static list_t capped_projects;		/* - list of projects with caps */
179 boolean_t cpucaps_enabled;		/* - are there any caps defined? */
180 boolean_t cpucaps_busy;			/* - is framework busy? */
181 
182 /*
183  * The accounting is based on the number of nanoseconds threads spend running
184  * during a tick which is kept in the cap_tick_cost variable.
185  */
186 static hrtime_t cap_tick_cost;
187 
/*
 * Fraction of the usage value that is decayed on every clock tick: the usage
 * is reduced by usage / CAP_DECAY_FACTOR, i.e. by one per cent per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Divide x by y, rounding the quotient to the closest integer value (half of
 * the divisor is added before the integer division).
 */
#define	ROUND_SCALE(x, y)	(((x) + ((y) / 2)) / (y))
198 
/* Per-tick usage update routine, installed as cpucaps_clock_callout. */
static void caps_update(void);
200 
201 /*
202  * CAP kstats.
203  */
204 struct cap_kstat {
205 	kstat_named_t	cap_value;
206 	kstat_named_t	cap_usage;
207 	kstat_named_t	cap_nwait;
208 	kstat_named_t	cap_below;
209 	kstat_named_t	cap_above;
210 	kstat_named_t	cap_maxusage;
211 	kstat_named_t	cap_zonename;
212 } cap_kstat = {
213 	{ "value",	KSTAT_DATA_UINT64 },
214 	{ "usage",	KSTAT_DATA_UINT64 },
215 	{ "nwait",	KSTAT_DATA_UINT64 },
216 	{ "below_sec",	KSTAT_DATA_UINT64 },
217 	{ "above_sec",	KSTAT_DATA_UINT64 },
218 	{ "maxusage",	KSTAT_DATA_UINT64 },
219 	{ "zonename",	KSTAT_DATA_STRING },
220 };
221 
222 
223 static kmutex_t cap_kstat_lock;
224 static int cap_kstat_update(kstat_t *, int);
225 
226 /*
227  * Initialize CPU caps infrastructure.
228  *   - Initialize lists of capped zones and capped projects
229  *   - Set cpucaps_clock_callout to NULL
230  */
231 void
232 cpucaps_init()
233 {
234 	/*
235 	 * Initialize global variables
236 	 */
237 	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
238 
239 	list_create(&capped_zones, sizeof (cpucap_t),
240 	    offsetof(cpucap_t, cap_link));
241 	list_create(&capped_projects, sizeof (cpucap_t),
242 	    offsetof(cpucap_t, cap_link));
243 
244 	cpucaps_enabled = B_FALSE;
245 	cpucaps_busy = B_FALSE;
246 	cpucaps_clock_callout = NULL;
247 }
248 
249 /*
250  * Initialize scheduling-class specific CPU Caps data.
251  */
252 void
253 cpucaps_sc_init(caps_sc_t *csc)
254 {
255 	csc->csc_cputime = 0;
256 }
257 
258 /*
259  * Allocate and initialize cpucap structure
260  */
261 static cpucap_t *
262 cap_alloc(void)
263 {
264 	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
265 
266 	DISP_LOCK_INIT(&cap->cap_usagelock);
267 	waitq_init(&cap->cap_waitq);
268 
269 	return (cap);
270 }
271 
272 /*
273  * Free cpucap structure
274  */
275 static void
276 cap_free(cpucap_t *cap)
277 {
278 	if (cap == NULL)
279 		return;
280 
281 	/*
282 	 * This cap should not be active
283 	 */
284 	ASSERT(!list_link_active(&cap->cap_link));
285 	ASSERT(cap->cap_value == 0);
286 	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
287 
288 	waitq_fini(&cap->cap_waitq);
289 	DISP_LOCK_DESTROY(&cap->cap_usagelock);
290 
291 	kmem_free(cap, sizeof (cpucap_t));
292 }
293 
294 /*
295  * Activate cap - insert into active list and unblock its
296  * wait queue. Should be called with caps_lock held.
297  * The cap_value field is set to the value supplied.
298  */
299 static void
300 cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
301 {
302 	ASSERT(MUTEX_HELD(&caps_lock));
303 
304 	/*
305 	 * Cap can not be already enabled
306 	 */
307 	ASSERT(!CAP_ENABLED(cap));
308 	ASSERT(!list_link_active(&cap->cap_link));
309 
310 	list_insert_tail(l, cap);
311 	cap->cap_below = cap->cap_above = 0;
312 	cap->cap_maxusage = 0;
313 	cap->cap_usage = 0;
314 	cap->cap_value = value;
315 	waitq_unblock(&cap->cap_waitq);
316 	if (CPUCAPS_OFF()) {
317 		cpucaps_enabled = B_TRUE;
318 		cpucaps_clock_callout = caps_update;
319 	}
320 }
321 
322 /*
323  * Deactivate cap
324  *   - Block its wait queue. This prevents any new threads from being
325  *	enqueued there and moves all enqueued threads to the run queue.
326  *   - Remove cap from list l.
327  *   - Disable CPU caps globally if there are no capped projects or zones
328  *
329  * Should be called with caps_lock held.
330  */
331 static void
332 cap_disable(list_t *l, cpucap_t *cap)
333 {
334 	ASSERT(MUTEX_HELD(&caps_lock));
335 	/*
336 	 * Cap should be currently active
337 	 */
338 	ASSERT(CPUCAPS_ON());
339 	ASSERT(list_link_active(&cap->cap_link));
340 	ASSERT(CAP_ENABLED(cap));
341 
342 	waitq_block(&cap->cap_waitq);
343 	list_remove(l, cap);
344 	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
345 		cpucaps_enabled = B_FALSE;
346 		cpucaps_clock_callout = NULL;
347 	}
348 	cap->cap_value = 0;
349 	cap->cap_project = NULL;
350 	cap->cap_zone = NULL;
351 	if (cap->cap_kstat != NULL) {
352 		kstat_delete(cap->cap_kstat);
353 		cap->cap_kstat = NULL;
354 	}
355 
356 }
357 
358 /*
359  * Enable cap for a project kpj
360  * It is safe to enable already enabled project cap.
361  * Should be called with caps_lock held.
362  */
363 static void
364 cap_project_enable(kproject_t *kpj, hrtime_t value)
365 {
366 	cpucap_t *cap = kpj->kpj_cpucap;
367 
368 	ASSERT(MUTEX_HELD(&caps_lock));
369 	ASSERT(cap != NULL);
370 
371 	if (CAP_DISABLED(cap)) {
372 		ASSERT(cap->cap_kstat == NULL);
373 		cap_enable(&capped_projects, cap, value);
374 		cap->cap_project = kpj;
375 		cap->cap_zone = kpj->kpj_zone;
376 
377 		/*
378 		 * Create cap kstats
379 		 */
380 		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
381 		    KSTAT_TYPE_NAMED,
382 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
383 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
384 			cap->cap_kstat->ks_data_size +=
385 			    strlen(cap->cap_zone->zone_name) + 1;
386 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
387 			cap->cap_kstat->ks_data = &cap_kstat;
388 			cap->cap_kstat->ks_update = cap_kstat_update;
389 			cap->cap_kstat->ks_private = cap;
390 			kstat_install(cap->cap_kstat);
391 		}
392 	}
393 }
394 
395 /*
396  * Disable project cap.
397  * It is safe to disable already disabled project cap.
398  * Should be called with caps_lock held.
399  */
400 static void
401 cap_project_disable(kproject_t *kpj)
402 {
403 	cpucap_t *cap = kpj->kpj_cpucap;
404 
405 	ASSERT(MUTEX_HELD(&caps_lock));
406 	ASSERT(cap != NULL);
407 	ASSERT(cap->cap_project == kpj);
408 
409 	if (CAP_ENABLED(cap))
410 		cap_disable(&capped_projects, cap);
411 }
412 
413 /*
414  * Enable cap for a zone
415  * It is safe to enable already enabled zone cap.
416  * Should be called with caps_lock held.
417  */
418 static void
419 cap_zone_enable(zone_t *zone, hrtime_t value)
420 {
421 	cpucap_t *cap = zone->zone_cpucap;
422 
423 	ASSERT(MUTEX_HELD(&caps_lock));
424 	ASSERT(cap != NULL);
425 
426 	if (CAP_DISABLED(cap)) {
427 		ASSERT(cap->cap_kstat == NULL);
428 		cap_enable(&capped_zones, cap, value);
429 		cap->cap_zone = zone;
430 
431 		/*
432 		 * Create cap kstats
433 		 */
434 		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
435 		    KSTAT_TYPE_NAMED,
436 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
437 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
438 			cap->cap_kstat->ks_data_size +=
439 			    strlen(cap->cap_zone->zone_name) + 1;
440 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
441 			cap->cap_kstat->ks_data = &cap_kstat;
442 			cap->cap_kstat->ks_update = cap_kstat_update;
443 			cap->cap_kstat->ks_private = cap;
444 			kstat_install(cap->cap_kstat);
445 		}
446 	}
447 }
448 
449 /*
450  * Disable zone cap.
451  * It is safe to disable already disabled zone cap.
452  * Should be called with caps_lock held.
453  */
454 static void
455 cap_zone_disable(zone_t *zone)
456 {
457 	cpucap_t *cap = zone->zone_cpucap;
458 
459 	ASSERT(MUTEX_HELD(&caps_lock));
460 	ASSERT(cap != NULL);
461 	ASSERT(cap->cap_zone == zone);
462 
463 	if (CAP_ENABLED(cap))
464 		cap_disable(&capped_zones, cap);
465 }
466 
467 /*
468  * Apply specified callback to all caps contained in the list `l'.
469  */
470 static void
471 cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
472 {
473 	static uint64_t cpucap_walk_gen;
474 	cpucap_t *cap;
475 
476 	ASSERT(MUTEX_HELD(&caps_lock));
477 
478 	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
479 		(*cb)(cap, cpucap_walk_gen);
480 	}
481 
482 	atomic_inc_64(&cpucap_walk_gen);
483 }
484 
485 /*
486  * If cap limit is not reached, make one thread from wait queue runnable.
487  * The waitq_isempty check is performed without the waitq lock. If a new thread
488  * is placed on the waitq right after the check, it will be picked up during the
489  * next invocation of cap_poke_waitq().
490  */
491 /* ARGSUSED */
492 static void
493 cap_poke_waitq(cpucap_t *cap, int64_t gen)
494 {
495 	ASSERT(MUTEX_HELD(&caps_lock));
496 
497 	if (cap->cap_usage >= cap->cap_value) {
498 		cap->cap_above++;
499 	} else {
500 		waitq_t *wq = &cap->cap_waitq;
501 
502 		cap->cap_below++;
503 
504 		if (!waitq_isempty(wq))
505 			waitq_runone(wq);
506 	}
507 }
508 
509 /*
510  * The callback function called for every cap on capped_projects list.
511  * Decay cap usage by CAP_DECAY_FACTOR
512  * Add this cap project usage to its zone usage.
513  * Kick off a thread from the cap waitq if cap is not reached.
514  */
515 static void
516 cap_project_usage_walker(cpucap_t *cap, int64_t gen)
517 {
518 	zone_t		*zone = cap->cap_zone;
519 	hrtime_t	cap_usage = cap->cap_usage;
520 
521 	ASSERT(MUTEX_HELD(&caps_lock));
522 	ASSERT(cap->cap_project->kpj_cpucap == cap);
523 	ASSERT(zone == cap->cap_project->kpj_zone);
524 	ASSERT(CAP_ENABLED(cap));
525 
526 	/*
527 	 * Set or clear the CAP_REACHED flag based on the current usage.
528 	 * Only projects having their own caps are ever marked as CAP_REACHED.
529 	 */
530 	cap_poke_waitq(cap, 0);
531 
532 	/*
533 	 * Add project's CPU usage to our zone's CPU usage.
534 	 */
535 	if (ZONE_IS_CAPPED(zone)) {
536 		cpucap_t *zcap = zone->zone_cpucap;
537 
538 		ASSERT(zcap->cap_zone == zone);
539 
540 		/*
541 		 * If we haven't reset this zone's usage during this clock tick
542 		 * yet, then do it now. The cap_gen field is used to check
543 		 * whether this is the first zone's project we see during this
544 		 * tick or a subsequent one.
545 		 */
546 		if (zcap->cap_gen != gen) {
547 			if (zcap->cap_usage > zcap->cap_maxusage)
548 				zcap->cap_maxusage = zcap->cap_usage;
549 			zcap->cap_usage = 0;
550 			zcap->cap_gen = gen;
551 		}
552 		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
553 		    hrtime_t, cap_usage);
554 		zcap->cap_usage += cap_usage;
555 		/* Check for overflows */
556 		if (zcap->cap_usage < 0)
557 			zcap->cap_usage = MAX_USAGE - 1;
558 	}
559 
560 	/*
561 	 * Decay project usage.
562 	 */
563 	disp_lock_enter(&cap->cap_usagelock);
564 	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
565 	disp_lock_exit(&cap->cap_usagelock);
566 }
567 
568 /*
569  * On every clock tick walk the list of project caps and update the CPU usage.
570  * Also walk the list of zone caps checking whether any threads should
571  * transition from wait queue to run queue.
572  *
573  * This function gets called by the clock thread directly when there are any
574  * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
575  * caps_lock for long periods of time, so there should be almost no contention
576  * for it.
577  */
578 static void
579 caps_update()
580 {
581 	mutex_enter(&caps_lock);
582 	cap_walk(&capped_projects, cap_project_usage_walker);
583 	cap_walk(&capped_zones, cap_poke_waitq);
584 	mutex_exit(&caps_lock);
585 }
586 
587 /*
588  * The function is called for each project in a zone when the zone cap is
589  * modified. It enables project caps if zone cap is enabled and disables if the
590  * zone cap is disabled and project doesn't have its own cap.
591  *
592  * For each project that does not have cpucap structure allocated it allocates a
593  * new structure and assigns to kpj->cpu_cap. The allocation is performed
594  * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
595  * held.
596  */
597 static int
598 cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
599 {
600 	cpucap_t *project_cap = NULL;
601 	cpucap_t *zone_cap = (cpucap_t *)arg;
602 
603 	ASSERT(zone_cap != NULL);
604 
605 	if (kpj->kpj_cpucap == NULL) {
606 		/*
607 		 * This is the first time any cap was established for this
608 		 * project. Allocate a new cpucap structure for it.
609 		 */
610 		project_cap = cap_alloc();
611 	}
612 
613 	mutex_enter(&caps_lock);
614 
615 	/*
616 	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
617 	 * and assign the newly allocated cpucap structure to it.
618 	 */
619 	if (kpj->kpj_cpucap == NULL) {
620 		kpj->kpj_cpucap = project_cap;
621 	} else if (project_cap != NULL) {
622 		cap_free(project_cap);
623 	}
624 
625 	project_cap = kpj->kpj_cpucap;
626 
627 	if (CAP_DISABLED(zone_cap)) {
628 		/*
629 		 * Remove all projects in this zone without caps
630 		 * from the capped_projects list.
631 		 */
632 		if (project_cap->cap_value == MAX_USAGE) {
633 			cap_project_disable(kpj);
634 		}
635 	} else if (CAP_DISABLED(project_cap)) {
636 		/*
637 		 * Add the project to capped_projects list.
638 		 */
639 		ASSERT(project_cap->cap_value == 0);
640 		cap_project_enable(kpj, MAX_USAGE);
641 	}
642 	mutex_exit(&caps_lock);
643 
644 	return (0);
645 }
646 
647 /*
648  * Set zone cap to cap_val
649  * If cap_val is equal to NOCAP, disable zone cap.
650  *
651  * If this is the first time a cap is set on a zone, allocate cpucap structure
652  * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
653  */
654 int
655 cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
656 {
657 	cpucap_t *cap = NULL;
658 	hrtime_t value;
659 
660 	if (cap_val == 0)
661 		return (EINVAL);
662 
663 	ASSERT(cap_val <= MAXCAP);
664 	if (cap_val > MAXCAP)
665 		cap_val = MAXCAP;
666 
667 	/*
668 	 * Nothing to do if trying to disable a cap on a zone when caps are off
669 	 * or a zone which does not have a cap yet.
670 	 */
671 	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
672 		return (0);
673 
674 	if (zone->zone_cpucap == NULL)
675 		cap = cap_alloc();
676 
677 	mutex_enter(&caps_lock);
678 
679 	if (cpucaps_busy) {
680 		mutex_exit(&caps_lock);
681 		return (EBUSY);
682 	}
683 
684 	/*
685 	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
686 	 * held. If it is still NULL, assign a newly allocated cpucap to it.
687 	 */
688 	if (zone->zone_cpucap == NULL) {
689 		zone->zone_cpucap = cap;
690 	} else if (cap != NULL) {
691 		cap_free(cap);
692 	}
693 
694 	cap = zone->zone_cpucap;
695 	value = cap_val * cap_tick_cost;
696 	if (value < 0)
697 		value = MAX_USAGE;
698 
699 	/* Nothing to do if the value is staying the same */
700 	if (value == cap->cap_value) {
701 		mutex_exit(&caps_lock);
702 		return (0);
703 	}
704 
705 	/*
706 	 * Clear cap statistics since the cap value itself changes.
707 	 */
708 	cap->cap_above = cap->cap_below = 0;
709 
710 
711 	if (cap_val == NOCAP) {
712 		if (CAP_ENABLED(cap)) {
713 			/*
714 			 * Remove cap for the zone
715 			 */
716 			cap_zone_disable(zone);
717 			cpucaps_busy = B_TRUE;
718 			mutex_exit(&caps_lock);
719 			/*
720 			 * Disable caps for all project belonging to this zone
721 			 * unless they have their own cap.
722 			 */
723 			(void) project_walk_all(zone->zone_id,
724 			    cap_project_zone_modify_walker, cap);
725 
726 			mutex_enter(&caps_lock);
727 			cpucaps_busy = B_FALSE;
728 		}
729 	} else if (CAP_DISABLED(cap)) {
730 		/*
731 		 * Set a cap on a zone which previously was not capped.
732 		 */
733 		cap_zone_enable(zone, value);
734 		cpucaps_busy = B_TRUE;
735 		mutex_exit(&caps_lock);
736 
737 		/*
738 		 * Enable cap for all projects belonging to this zone.
739 		 */
740 		(void) project_walk_all(zone->zone_id,
741 		    cap_project_zone_modify_walker, cap);
742 
743 		mutex_enter(&caps_lock);
744 		cpucaps_busy = B_FALSE;
745 	} else {
746 		/*
747 		 * No state transitions, just change the value
748 		 */
749 		cap->cap_value = value;
750 	}
751 
752 	ASSERT(MUTEX_HELD(&caps_lock));
753 	ASSERT(!cpucaps_busy);
754 	mutex_exit(&caps_lock);
755 
756 	return (0);
757 }
758 
759 /*
760  * The project is going away so disable its cap.
761  */
762 void
763 cpucaps_project_remove(kproject_t *kpj)
764 {
765 	mutex_enter(&caps_lock);
766 	if (PROJECT_IS_CAPPED(kpj))
767 		cap_project_disable(kpj);
768 	if (kpj->kpj_cpucap != NULL) {
769 		cap_free(kpj->kpj_cpucap);
770 		kpj->kpj_cpucap = NULL;
771 	}
772 	mutex_exit(&caps_lock);
773 }
774 
775 /*
776  * The zone is going away, so disable its cap.
777  */
778 void
779 cpucaps_zone_remove(zone_t *zone)
780 {
781 	mutex_enter(&caps_lock);
782 	while (ZONE_IS_CAPPED(zone)) {
783 		mutex_exit(&caps_lock);
784 		(void) cpucaps_zone_set(zone, NOCAP);
785 		mutex_enter(&caps_lock);
786 	}
787 	if (zone->zone_cpucap != NULL) {
788 		cap_free(zone->zone_cpucap);
789 		zone->zone_cpucap = NULL;
790 	}
791 	mutex_exit(&caps_lock);
792 }
793 
794 /*
795  * New project was created. It should be put on the capped_projects list if
796  * its zone has a cap.
797  */
798 void
799 cpucaps_project_add(kproject_t *kpj)
800 {
801 	cpucap_t *cap = NULL;
802 
803 	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
804 		return;
805 
806 	/*
807 	 * This project was never capped before, so allocate its cap structure.
808 	 */
809 	if (kpj->kpj_cpucap == NULL)
810 		cap = cap_alloc();
811 
812 	mutex_enter(&caps_lock);
813 	/*
814 	 * Double-check with caps_lock held
815 	 */
816 	if (kpj->kpj_cpucap == NULL) {
817 		kpj->kpj_cpucap = cap;
818 	} else if (cap != NULL) {
819 		cap_free(cap);
820 	}
821 
822 	if (ZONE_IS_CAPPED(kpj->kpj_zone))
823 		cap_project_enable(kpj, MAX_USAGE);
824 
825 	mutex_exit(&caps_lock);
826 }
827 
828 /*
829  * Set project cap to cap_val
830  * If cap_val is equal to NOCAP, disable project cap.
831  *
832  * If this is the first time a cap is set on a project, allocate cpucap
833  * structure without holding caps_lock to avoid KM_SLEEP allocation with
834  * caps_lock held.
835  */
836 int
837 cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
838 {
839 	cpucap_t *cap = NULL;
840 	hrtime_t value;
841 
842 	if (cap_val == 0)
843 		return (EINVAL);
844 
845 	ASSERT(cap_val <= MAXCAP);
846 	if (cap_val > MAXCAP)
847 		cap_val = MAXCAP;
848 
849 	/*
850 	 * Nothing to do if trying to disable project cap and caps are not
851 	 * enabled or if trying to disable cap on a project that does not have
852 	 * cap enabled.
853 	 */
854 	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
855 		return (0);
856 
857 	if (kpj->kpj_cpucap == NULL) {
858 		/*
859 		 * This project was never capped before, so allocate its cap
860 		 * structure.
861 		 */
862 		cap = cap_alloc();
863 	}
864 
865 	mutex_enter(&caps_lock);
866 
867 	/*
868 	 * Double-check with caps_lock held.
869 	 */
870 	if (kpj->kpj_cpucap == NULL) {
871 		kpj->kpj_cpucap = cap;
872 	} else if (cap != NULL) {
873 		cap_free(cap);
874 	}
875 
876 	/*
877 	 * Get the actual pointer to the project cap.
878 	 */
879 	cap = kpj->kpj_cpucap;
880 	value = cap_val * cap_tick_cost;
881 	if (value < 0)
882 		value = MAX_USAGE;
883 
884 	/*
885 	 * Nothing to do if the value is not changing
886 	 */
887 	if (value == cap->cap_value) {
888 		mutex_exit(&caps_lock);
889 		return (0);
890 	}
891 
892 	/*
893 	 * Clear cap statistics since the cap value itself changes.
894 	 */
895 	cap->cap_above = cap->cap_below = 0;
896 	cap->cap_maxusage = 0;
897 
898 	if (cap_val != NOCAP) {
899 		/*
900 		 * Enable this cap if it is not already enabled.
901 		 */
902 		if (CAP_DISABLED(cap))
903 			cap_project_enable(kpj, value);
904 		else
905 			cap->cap_value = value;
906 	} else if (CAP_ENABLED(cap)) {
907 		/*
908 		 * User requested to drop a cap on the project. If it is part of
909 		 * capped zone, keep the cap and set the value to MAX_USAGE,
910 		 * otherwise disable the cap.
911 		 */
912 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
913 			cap->cap_value = MAX_USAGE;
914 		} else {
915 			cap_project_disable(kpj);
916 		}
917 	}
918 	mutex_exit(&caps_lock);
919 
920 	return (0);
921 }
922 
923 /*
924  * Get cap usage.
925  */
926 static rctl_qty_t
927 cap_get(cpucap_t *cap)
928 {
929 	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
930 }
931 
932 /*
933  * Get current project usage.
934  */
935 rctl_qty_t
936 cpucaps_project_get(kproject_t *kpj)
937 {
938 	return (cap_get(kpj->kpj_cpucap));
939 }
940 
941 /*
942  * Get current zone usage.
943  */
944 rctl_qty_t
945 cpucaps_zone_get(zone_t *zone)
946 {
947 	return (cap_get(zone->zone_cpucap));
948 }
949 
950 /*
951  * Charge project of thread t the time thread t spent on CPU since previously
952  * adjusted.
953  *
954  * Record the current on-CPU time in the csc structure.
955  *
956  * Do not adjust for more than one tick worth of time.
957  *
958  * It is possible that the project cap is being disabled while this routine is
959  * executed. This should not cause any issues since the association between the
960  * thread and its project is protected by thread lock.
961  */
962 static void
963 caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
964 {
965 	kproject_t	*kpj = ttoproj(t);
966 	hrtime_t	new_usage;
967 	hrtime_t	usage_delta;
968 
969 	ASSERT(THREAD_LOCK_HELD(t));
970 	ASSERT(kpj->kpj_cpucap != NULL);
971 
972 	/* Get on-CPU time since birth of a thread */
973 	new_usage = mstate_thread_onproc_time(t);
974 
975 	/* Time spent on CPU since last checked */
976 	usage_delta = new_usage - csc->csc_cputime;
977 
978 	/* Save the accumulated on-CPU time */
979 	csc->csc_cputime = new_usage;
980 
981 	/* Charge at most one tick worth of on-CPU time */
982 	if (usage_delta > cap_tick_cost)
983 		usage_delta = cap_tick_cost;
984 
985 	/* Add usage_delta to the project usage value. */
986 	if (usage_delta > 0) {
987 		cpucap_t *cap = kpj->kpj_cpucap;
988 
989 		DTRACE_PROBE2(cpucaps__project__charge,
990 		    kthread_id_t, t, hrtime_t, usage_delta);
991 
992 		disp_lock_enter_high(&cap->cap_usagelock);
993 		cap->cap_usage += usage_delta;
994 
995 		/* Check for overflows */
996 		if (cap->cap_usage < 0)
997 			cap->cap_usage = MAX_USAGE - 1;
998 
999 		disp_lock_exit_high(&cap->cap_usagelock);
1000 
1001 		/*
1002 		 * cap_maxusage is only kept for observability. Move it outside
1003 		 * the lock to reduce the time spent while holding the lock.
1004 		 */
1005 		if (cap->cap_usage > cap->cap_maxusage)
1006 			cap->cap_maxusage = cap->cap_usage;
1007 	}
1008 }
1009 
1010 /*
1011  * Charge thread's project and return True if project or zone should be
1012  * penalized because its project or zone is exceeding its cap. Also sets
1013  * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
1014  *
1015  * It is possible that the project cap is being disabled while this routine is
1016  * executed. This should not cause any issues since the association between the
1017  * thread and its project is protected by thread lock. It will still set
1018  * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
1019  * anything on the blocked wait queue.
1020  *
1021  */
1022 boolean_t
1023 cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
1024 {
1025 	kproject_t	*kpj = ttoproj(t);
1026 	klwp_t		*lwp = t->t_lwp;
1027 	zone_t		*zone;
1028 	cpucap_t	*project_cap;
1029 	boolean_t	rc = B_FALSE;
1030 
1031 	ASSERT(THREAD_LOCK_HELD(t));
1032 
1033 	/* Nothing to do for projects that are not capped. */
1034 	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
1035 		return (B_FALSE);
1036 
1037 	caps_charge_adjust(t, csc);
1038 
1039 	/*
1040 	 * The caller only requested to charge the project usage, no enforcement
1041 	 * part.
1042 	 */
1043 	if (charge_type == CPUCAPS_CHARGE_ONLY)
1044 		return (B_FALSE);
1045 
1046 	project_cap = kpj->kpj_cpucap;
1047 
1048 	if (project_cap->cap_usage >= project_cap->cap_value) {
1049 		t->t_schedflag |= TS_PROJWAITQ;
1050 		rc = B_TRUE;
1051 	} else if (t->t_schedflag & TS_PROJWAITQ) {
1052 		t->t_schedflag &= ~TS_PROJWAITQ;
1053 	}
1054 
1055 	zone = ttozone(t);
1056 	if (!ZONE_IS_CAPPED(zone)) {
1057 		if (t->t_schedflag & TS_ZONEWAITQ)
1058 			t->t_schedflag &= ~TS_ZONEWAITQ;
1059 	} else {
1060 		cpucap_t *zone_cap = zone->zone_cpucap;
1061 
1062 		if (zone_cap->cap_usage >= zone_cap->cap_value) {
1063 			t->t_schedflag |= TS_ZONEWAITQ;
1064 			rc = B_TRUE;
1065 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
1066 			t->t_schedflag &= ~TS_ZONEWAITQ;
1067 		}
1068 	}
1069 
1070 
1071 	return (rc);
1072 }
1073 
1074 /*
1075  * Enforce CPU caps. If got preempted in the user-land, we know that thread does
1076  * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
1077  *
1078  * CPU Caps are only enforced for user threads.
1079  *
1080  * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
1081  * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
1082  *
1083  * It is possible that by the time we enter cpucaps_enforce() the cap is already
1084  * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
1085  * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
1086  * apply.
1087  */
1088 boolean_t
1089 cpucaps_enforce(kthread_t *t)
1090 {
1091 	klwp_t *lwp = t->t_lwp;
1092 
1093 	ASSERT(THREAD_LOCK_HELD(t));
1094 
1095 	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
1096 		if (t->t_schedflag & TS_PROJWAITQ) {
1097 			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
1098 			t->t_schedflag &= ~TS_ANYWAITQ;
1099 			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
1100 			    t)) {
1101 				return (B_TRUE);
1102 			}
1103 		}
1104 		if (t->t_schedflag & TS_ZONEWAITQ) {
1105 			ASSERT(ttozone(t)->zone_cpucap != NULL);
1106 			t->t_schedflag &= ~TS_ZONEWAITQ;
1107 			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
1108 			    t)) {
1109 				return (B_TRUE);
1110 			}
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * The thread is not enqueued on the wait queue.
1116 	 */
1117 	return (B_FALSE);
1118 }
1119 
1120 /*
1121  * Convert internal cap statistics into values exported by cap kstat.
1122  */
1123 static int
1124 cap_kstat_update(kstat_t *ksp, int rw)
1125 {
1126 	struct cap_kstat *capsp = &cap_kstat;
1127 	cpucap_t *cap = ksp->ks_private;
1128 	clock_t	tick_sec = SEC_TO_TICK(1);
1129 	char *zonename = cap->cap_zone->zone_name;
1130 
1131 	if (rw == KSTAT_WRITE)
1132 		return (EACCES);
1133 
1134 	capsp->cap_value.value.ui64 =
1135 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
1136 	capsp->cap_usage.value.ui64 =
1137 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
1138 	capsp->cap_maxusage.value.ui64 =
1139 	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
1140 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
1141 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
1142 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
1143 	kstat_named_setstr(&capsp->cap_zonename, zonename);
1144 
1145 	return (0);
1146 }
1147