/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * CPU usage of all projects running inside the zone. If a zone CPU cap is set
 * below a project CPU cap, the latter has no effect.
 *
 * When the CPU usage of a project or zone reaches its cap, threads in it are
 * not scheduled and are instead placed on wait queues associated with the cap.
 * Such threads start running again only when the CPU usage drops below the cap
 * level. Each zone and each project has its own wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ flag in the thread's t_schedflag
 * field and is requested to surrender its CPU. This causes the scheduling
 * class specific CL_PREEMPT() callback to be invoked. The callback function
 * places threads marked as TS_PROJWAITQ on a wait queue and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means the time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one per cent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick.
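 *
 * For example, with CAP_DECAY_FACTOR of 100, a project that has accumulated
 * 2,000,000 ns of capped usage has ROUND_SCALE(2000000, 100) = 20,000 ns
 * subtracted on the next tick, so idle usage drains geometrically rather than
 * all at once. Since a continuously running thread feeds at most
 * cap_tick_cost ns per tick into the usage while one per cent decays away,
 * the usage settles near one hundred times the average per-tick input; this
 * is why a cap of cap_val is stored as cap_val * cap_tick_cost and roughly
 * corresponds to cap_val per cent of one CPU.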
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 *   cpucaps_project_add(kproject_t *)
 *
 * Notifies the framework of a new project. It should be put on the
 * capped_projects list if its zone has a cap.
 *
 *   cpucaps_project_remove(kproject_t *)
 *
 * Removes the association between the specified project and its cap.
 * Called right before the project is destroyed.
 *
 *   cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 * Sets the cap of the specified project to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 * Sets the cap of the specified zone to the specified value. Setting the value
 * to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_remove(zone_t *)
 *
 * Removes the association between the zone and its cap.
 *
 *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 * Charges the specified thread's project the amount of on-CPU time that it
 * used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 * Otherwise returns True if the project or zone should be penalized because
 * it is exceeding its cap, and also sets the TS_PROJWAITQ or TS_ZONEWAITQ
 * bits in t_schedflag in this case.
 *
 *   CPUCAPS_ENFORCE(kthread_id_t)
 *
 * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
 * state on project or zone wait queues, as requested by the TS_PROJWAITQ or
 * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
 * wait queue or False otherwise.
 *
 *   cpucaps_sc_init(caps_sc_t *)
 *
 * Initializes the scheduling-class specific CPU Caps data for a thread.
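 *
 * A scheduling class would typically combine the charge and enforcement
 * interfaces in its tick or preemption handler along these lines (an
 * illustrative sketch, not a verbatim excerpt from any class; csc is the
 * per-thread caps_sc_t kept by the class, and CPUCAPS_CHARGE_ENFORCE denotes
 * the charge type other than CPUCAPS_CHARGE_ONLY):
 *
 *	if (CPUCAPS_ON() &&
 *	    cpucaps_charge(t, &csc, CPUCAPS_CHARGE_ENFORCE) &&
 *	    CPUCAPS_ENFORCE(t))
 *		return;		(the thread is now parked on a wait queue)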
 *
 * LOCKS
 * =====
 *
 * All the individual cap structures and their lists are protected by a global
 * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
 * caps, so it is usually uncontended. We avoid all blocking memory allocations
 * while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock. It
 * is grabbed by scheduling classes already holding the thread lock at high PIL
 * and by the clock thread performing usage decay. We should do as little work
 * as possible while holding the lock since it may be very hot. All threads in
 * the project contend for the same cache line doing cap usage updates.
 */

/*
 * caps_lock protects the list of capped projects and zones, changes in the cap
 * state, and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone cap flag, but we don't keep
 * any per-cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds a thread spends running
 * during a clock tick. The cost of one full tick in nanoseconds is kept in the
 * cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick:
 * decay one per cent of the value per tick.
 */
#define	CAP_DECAY_FACTOR 100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
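
/*
 * For instance, ROUND_SCALE(1500, 1000) evaluates to 2 and
 * ROUND_SCALE(1499, 1000) evaluates to 1, i.e. ordinary rounding rather than
 * truncating integer division.
 */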

static void caps_update(void);

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};
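
/*
 * cap_kstat is a single static template shared by all caps: each cap's
 * virtual kstat points its ks_data at this structure and uses cap_kstat_lock
 * as its ks_lock, so the kstat framework serializes cap_kstat_update() calls
 * against readers snapshotting the data.
 */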

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 *   - Initialize lists of capped zones and capped projects
 *   - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables.
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
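	/* E.g. with the traditional hz value of 100 this is 10,000,000 ns. */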

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize a cpucap structure.
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free a cpucap structure.
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active.
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate the cap: insert it into the active list l and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap cannot be enabled already.
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate the cap:
 *   - Block its wait queue. This prevents any new threads from being
 *	enqueued there and moves all enqueued threads to the run queue.
 *   - Remove the cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * The cap should be currently active.
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable the cap for project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable the project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable the cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable the zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *))
{
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap);
	}
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
static void
cap_poke_waitq(cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}
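
/*
 * Note that cap_above and cap_below count clock ticks spent at-or-above and
 * below the cap respectively; cap_kstat_update() exports them as the
 * above_sec and below_sec kstats by dividing by the tick rate.
 */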

/*
 * The callback function called for every cap on the capped_projects list:
 *   - Decay the cap usage by CAP_DECAY_FACTOR.
 *   - Add this project's usage to its zone's usage.
 *   - Kick off a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap);

	/*
	 * Add the project's CPU usage to its zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_lbolt field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_lbolt != lbolt64) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay the project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * This function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated yet, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
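 *
 * Projects without a cap of their own inside a capped zone stay on the
 * capped_projects list with cap_value set to MAX_USAGE: such a cap is never
 * reached, so it never blocks the project's threads, but it keeps the
 * project's usage feeding into the zone's aggregate usage.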
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL, now with caps_lock held,
	 * and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are off
	 * or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign the newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this zone
			 * unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable caps for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away, so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
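	/*
	 * cpucaps_zone_set() takes caps_lock itself and can fail with EBUSY
	 * while the framework is busy, so drop the lock around the call and
	 * loop until the cap is actually gone.
	 */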
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled or when the project does not have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it is
		 * part of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}
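
/*
 * Dividing the nanosecond-scaled cap_usage by cap_tick_cost converts it back
 * into the units in which caps are set by cpucaps_project_set() and
 * cpucaps_zone_set(), mirroring the cap_val * cap_tick_cost conversion there.
 */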

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the on-CPU time it accumulated since the
 * previous adjustment.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(PROJECT_IS_CAPPED(kpj));

	/* Get on-CPU time since birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it outside
		 * the lock to reduce the time spent while holding the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the project or zone should
 * be penalized because it is exceeding its cap. Also sets TS_PROJWAITQ or
 * TS_ZONEWAITQ in this case.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested charging the project usage; skip the
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue ourselves on the waitq, if
 * needed.
 *
 * CPU caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
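			/*
			 * TS_ANYWAITQ covers both TS_PROJWAITQ and
			 * TS_ZONEWAITQ: once the thread is headed for its
			 * project wait queue, there is no need to also queue
			 * it for the zone.
			 */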
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread was not enqueued on a wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by the cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t	tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}