/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
 * usage for all projects running inside the zone. If the zone CPU cap is set
 * below the project CPU cap, the latter will have no effect.
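 *
 * For example (values are illustrative): with a zone cap of 100 (one full
 * CPU worth of time) and a project cap of 200 inside that zone, the
 * zone-wide limit dominates and the project can never approach its own
 * two-CPU limit.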
 *
 * When CPU usage of projects and/or zones reaches specified caps, threads in
 * them do not get scheduled and instead are placed on wait queues associated
 * with a cap. Such threads will start running again only when CPU usage drops
 * below the cap level. Each zone and each project has its own wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ flag in the thread's t_schedflag
 * field and is requested to surrender its CPU. This causes the scheduling
 * class specific CL_PREEMPT() callback to be invoked. The callback function
 * places threads marked as TS_PROJWAITQ on a wait queue and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means time since it was last accounted for. On-CPU times greater
 * than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the lists of project caps and decays their usages by
 * one per cent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick.
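 *
 * To put a number on the decay (simple arithmetic, not code from this
 * file): a capped project that stops running entirely sees its usage
 * shrink by a factor of 0.99 per tick, i.e. to about 0.99^100 ~= 37% of
 * its starting value after 100 ticks, so waiting threads are released
 * gradually rather than in a single burst.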
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 *   cpucaps_project_add(kproject_t *)
 *
 * Notifies the framework of a new project. It should be put on the
 * capped_projects list if its zone has a cap.
 *
 *   cpucaps_project_remove(kproject_t *)
 *
 * Removes the association between the specified project and its cap.
 * Called right before the project is destroyed.
 *
 *   cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 * Sets the cap of the specified project to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 * Sets the cap of the specified zone to the specified value. Setting the value
 * to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_remove(zone_t *)
 *
 * Removes the association between the zone and its cap.
 *
 *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 * Charges the specified thread's project the amount of on-CPU time that it
 * used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 * Otherwise returns True if the thread's project or zone is exceeding its cap
 * and should be penalized, and also sets the TS_PROJWAITQ or TS_ZONEWAITQ bits
 * in t_schedflag.
 *
 *   CPUCAPS_ENFORCE(kthread_id_t)
 *
 * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
 * state on project or zone wait queues, as requested by the TS_PROJWAITQ or
 * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
 * wait queue or False otherwise.
 *
 *   cpucaps_sc_init(caps_sc_t *)
 *
 * Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual caps structures and their lists are protected by a global
 * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
 * caps, so it is usually uncontended. We avoid all blocking memory allocations
 * while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association can not break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-pil spin-lock cap_usagelock. It
 * is grabbed by scheduling classes already holding the thread lock at high PIL
 * and by the clock thread performing usage decay. We should do as little work
 * as possible while holding the lock since it may be very hot. All threads in
 * the project contend for the same cache line doing cap usage updates.
 */
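
/*
 * A minimal sketch of how a scheduling class is expected to drive this
 * framework (modeled on the TS class; the xx_* names are illustrative
 * placeholders, not real identifiers). On every tick the class charges
 * the thread and, if the cap is exceeded, forces it off CPU; the
 * CL_PREEMPT() callback then parks it on the cap's wait queue:
 *
 *	static void
 *	xx_tick(kthread_t *t)
 *	{
 *		thread_lock(t);
 *		if (CPUCAPS_ON() && CPUCAPS_CHARGE(t, &xxpp->xx_caps,
 *		    CPUCAPS_CHARGE_ENFORCE))
 *			xx_force_surrender(t);
 *		thread_unlock(t);
 *	}
 *
 *	static void
 *	xx_preempt(void)
 *	{
 *		if (CPUCAPS_ON() && CPUCAPS_ENFORCE(curthread))
 *			return;
 *		...
 *	}
 */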

/*
 * caps_lock protects the list of capped projects and zones, changes in the cap
 * state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone cap flag, but we don't keep
 * any per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;
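
/*
 * For example, assuming the default hz of 100, one tick is 10,000,000 ns,
 * so cap_tick_cost is 10,000,000. A cap value of 100 (i.e. one full CPU)
 * then translates into cap_value = 100 * cap_tick_cost = 1,000,000,000 ns
 * in cpucaps_zone_set() and cpucaps_project_set() below.
 */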

/*
 * How much of the usage value is decayed every clock tick:
 * decay one per cent of the value per tick.
 */
#define	CAP_DECAY_FACTOR 100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
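
/*
 * For example, ROUND_SCALE(149, 100) == 1 while ROUND_SCALE(150, 100) == 2:
 * integer division with round-half-up behavior.
 */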

static void caps_update(void);

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};
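
/*
 * These counters are installed per capped project/zone via
 * rctl_kstat_create_project()/rctl_kstat_create_zone() below and can be
 * read from user-land through the kstat framework, e.g. with
 * "kstat -m caps" (the module name is set by the rctl kstat helpers, not
 * in this file, so treat it as an assumption).
 */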


static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 *   - Initialize lists of capped zones and capped projects
 *   - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate cap - insert into active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap must not be enabled already
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 *   - Block its wait queue. This prevents any new threads from being
 *	enqueued there and moves all enqueued threads to the run queue.
 *   - Remove cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for a project kpj
 * It is safe to enable already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable project cap.
 * It is safe to disable already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone
 * It is safe to enable already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable zone cap.
 * It is safe to disable already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list 'l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 *   - Decay the cap usage by CAP_DECAY_FACTOR.
 *   - Add this project's usage to its zone's usage.
 *   - Kick off a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add the project's CPU usage to its zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_lbolt = lbolt64;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay the project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update(void)
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
	 * - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are off
	 * or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this zone
			 * unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}
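
/*
 * For reference, this is normally driven by the zone.cpu-cap resource
 * control rather than called directly. A hedged user-land example
 * (assuming a zone named "myzone"): setting a 1.5-CPU cap with
 * "prctl -n zone.cpu-cap -t privileged -v 150 -r -i zone myzone"
 * eventually funnels down to cpucaps_zone_set(zone, 150).
 */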

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * New project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable the project cap while caps are
	 * not enabled, or if trying to disable the cap on a project that does
	 * not have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * User requested to drop the cap on the project. If it is part
		 * of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}
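
/*
 * Note that since cap_value is cap_val * cap_tick_cost, dividing the raw
 * usage by cap_tick_cost converts it back into the percent-of-a-CPU units
 * used by the resource controls: e.g. with hz = 100 a cap_usage of
 * 500,000,000 ns reads back as 50, i.e. half of one CPU.
 */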

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t the time thread t spent on CPU since it was
 * last adjusted.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since birth of a thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it outside
		 * the lock to reduce the time spent while holding the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if its project or zone should be
 * penalized because it is exceeding its cap. Also sets TS_PROJWAITQ or
 * TS_ZONEWAITQ in this case.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by thread lock. It will still set
 * TS_PROJWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
 * anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested that we charge the project usage; skip the
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue it on the waitq, if needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t	tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}
1145