xref: /titanic_51/usr/src/uts/common/disp/cpucaps.c (revision d3d50737e566cade9a08d73d2af95105ac7cd960)
1c97ad5cdSakolb /*
2c97ad5cdSakolb  * CDDL HEADER START
3c97ad5cdSakolb  *
4c97ad5cdSakolb  * The contents of this file are subject to the terms of the
5c97ad5cdSakolb  * Common Development and Distribution License (the "License").
6c97ad5cdSakolb  * You may not use this file except in compliance with the License.
7c97ad5cdSakolb  *
8c97ad5cdSakolb  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9c97ad5cdSakolb  * or http://www.opensolaris.org/os/licensing.
10c97ad5cdSakolb  * See the License for the specific language governing permissions
11c97ad5cdSakolb  * and limitations under the License.
12c97ad5cdSakolb  *
13c97ad5cdSakolb  * When distributing Covered Code, include this CDDL HEADER in each
14c97ad5cdSakolb  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15c97ad5cdSakolb  * If applicable, add the following below this CDDL HEADER, with the
16c97ad5cdSakolb  * fields enclosed by brackets "[]" replaced with your own identifying
17c97ad5cdSakolb  * information: Portions Copyright [yyyy] [name of copyright owner]
18c97ad5cdSakolb  *
19c97ad5cdSakolb  * CDDL HEADER END
20c97ad5cdSakolb  */
21c97ad5cdSakolb 
22c97ad5cdSakolb /*
23*d3d50737SRafael Vanoni  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24c97ad5cdSakolb  * Use is subject to license terms.
25c97ad5cdSakolb  */
26c97ad5cdSakolb 
27c97ad5cdSakolb #include <sys/disp.h>
28c97ad5cdSakolb #include <sys/param.h>
29c97ad5cdSakolb #include <sys/systm.h>
30c97ad5cdSakolb #include <sys/sysmacros.h>
31c97ad5cdSakolb #include <sys/atomic.h>
32c97ad5cdSakolb #include <sys/cpucaps_impl.h>
33c97ad5cdSakolb #include <sys/dtrace.h>
34c97ad5cdSakolb #include <sys/sdt.h>
35c97ad5cdSakolb #include <sys/debug.h>
36c97ad5cdSakolb #include <sys/rctl.h>
37c97ad5cdSakolb #include <sys/errno.h>
38c97ad5cdSakolb 
39c97ad5cdSakolb /*
40c97ad5cdSakolb  * CPU Caps implementation
41c97ad5cdSakolb  * =======================
42c97ad5cdSakolb  *
43c97ad5cdSakolb  * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
44c97ad5cdSakolb  * usage for all projects running inside the zone. If the zone CPU cap is set
45c97ad5cdSakolb  * below the project CPU cap, the latter will have no effect.
46c97ad5cdSakolb  *
47c97ad5cdSakolb  * When CPU usage of projects and/or zones reaches specified caps, threads in
48c97ad5cdSakolb  * them do not get scheduled and instead are placed on wait queues associated
49c97ad5cdSakolb  * with a cap. Such threads will start running again only when CPU usage drops
50c97ad5cdSakolb  * below the cap level. Each zone and each project has its own wait queue.
51c97ad5cdSakolb  *
 * When CPU cap is set, the kernel continuously keeps track of CPU time used by
53c97ad5cdSakolb  * capped zones and/or projects over a short time interval and calculates their
54c97ad5cdSakolb  * current CPU usage as a percentage. When the accumulated usage reaches the CPU
55c97ad5cdSakolb  * cap, LWPs running in the user-land (when they are not holding any critical
56c97ad5cdSakolb  * kernel locks) are placed on special wait queues until their project's or
57c97ad5cdSakolb  * zone's CPU usage drops below the cap.
58c97ad5cdSakolb  *
59c97ad5cdSakolb  * The system maintains a list of all capped projects and all capped zones. On
60c97ad5cdSakolb  * every clock tick every active thread belonging to a capped project adds its
61c97ad5cdSakolb  * CPU usage to its project. Usage from all projects belonging to a capped zone
62c97ad5cdSakolb  * is aggregated to get the zone usage.
63c97ad5cdSakolb  *
64c97ad5cdSakolb  * When the current CPU usage is above the cap, a project or zone is considered
65c97ad5cdSakolb  * over-capped. Every user thread caught running in an over-capped project or
66c97ad5cdSakolb  * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
67c97ad5cdSakolb  * is requested to surrender its CPU. This causes scheduling class specific
68c97ad5cdSakolb  * CL_PREEMPT() callback to be invoked. The callback function places threads
 * marked as TS_PROJWAITQ on a wait queue and calls swtch().
70c97ad5cdSakolb  *
71c97ad5cdSakolb  * Threads are only placed on wait queues after trapping from user-land
72c97ad5cdSakolb  * (they could be holding some user locks, but no kernel locks) and while
73c97ad5cdSakolb  * returning from the trap back to the user-land when no kernel locks are held.
74c97ad5cdSakolb  * Putting threads on wait queues in random places while running in the
75c97ad5cdSakolb  * kernel might lead to all kinds of locking problems.
76c97ad5cdSakolb  *
77c97ad5cdSakolb  * Accounting
78c97ad5cdSakolb  * ==========
79c97ad5cdSakolb  *
80c97ad5cdSakolb  * Accounting of CPU usage is based on per-thread micro-state accounting data.
81c97ad5cdSakolb  * On every clock tick clock() adds new on-CPU time for every thread found on
82c97ad5cdSakolb  * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
83c97ad5cdSakolb  * New times means time since it was last accounted for. On-CPU times greater
84c97ad5cdSakolb  * than 1 tick are truncated to 1 tick.
85c97ad5cdSakolb  *
86c97ad5cdSakolb  * Project CPU usage is aggregated from all threads within the project.
87c97ad5cdSakolb  * Zone CPU usage is the sum of usages for all projects within the zone. Zone
88c97ad5cdSakolb  * CPU usage is calculated on every clock tick by walking list of projects and
89c97ad5cdSakolb  * adding their usage together.
90c97ad5cdSakolb  *
91c97ad5cdSakolb  * Decay
92c97ad5cdSakolb  * =====
93c97ad5cdSakolb  *
94c97ad5cdSakolb  * CPU usage is decayed by the caps_update() routine which is called once per
95c97ad5cdSakolb  * every clock tick. It walks lists of project caps and decays their usages by
96c97ad5cdSakolb  * one per cent. If CPU usage drops below cap levels, threads on the wait queue
97c97ad5cdSakolb  * are made runnable again, one thread per clock tick.
98c97ad5cdSakolb  *
99c97ad5cdSakolb  * Interfaces
100c97ad5cdSakolb  * ==========
101c97ad5cdSakolb  *
102c97ad5cdSakolb  * The CPU Caps facility provides the following interfaces to the rest of the
103c97ad5cdSakolb  * system:
104c97ad5cdSakolb  *
105c97ad5cdSakolb  *   cpucaps_project_add(kproject_t *)
106c97ad5cdSakolb  *
107c97ad5cdSakolb  * Notifies the framework of a new project. It should be put on the
108c97ad5cdSakolb  * capped_projects list if its zone has a cap.
109c97ad5cdSakolb  *
110c97ad5cdSakolb  *   cpucaps_project_remove(kproject_t *)
111c97ad5cdSakolb  *
112c97ad5cdSakolb  * Remove the association between the specified project and its cap.
113c97ad5cdSakolb  * Called right before the project is destroyed.
114c97ad5cdSakolb  *
 *   cpucaps_project_set(kproject_t *, rctl_qty_t)
116c97ad5cdSakolb  *
117c97ad5cdSakolb  * Set project cap of the specified project to the specified value. Setting the
118c97ad5cdSakolb  * value to NOCAP is equivalent to removing the cap.
119c97ad5cdSakolb  *
120c97ad5cdSakolb  *   cpucaps_zone_set(zone_t *, rctl_qty_t)
121c97ad5cdSakolb  *
122c97ad5cdSakolb  * Set zone cap of the specified zone to the specified value. Setting the value
123c97ad5cdSakolb  * to NOCAP is equivalent to removing the cap.
124c97ad5cdSakolb  *
125c97ad5cdSakolb  *   cpucaps_zone_remove(zone_t *)
126c97ad5cdSakolb  *
127c97ad5cdSakolb  * Remove the association between the zone and its cap.
128c97ad5cdSakolb  *
129c97ad5cdSakolb  *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
130c97ad5cdSakolb  *
131c97ad5cdSakolb  * Charges specified thread's project the amount of on-CPU time that it used.
132c97ad5cdSakolb  * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
133c97ad5cdSakolb  * Otherwise returns True if project or zone should be penalized because its
134c97ad5cdSakolb  * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
135c97ad5cdSakolb  * bits in t_schedflag in this case.
136c97ad5cdSakolb  *
137c97ad5cdSakolb  *   CPUCAPS_ENFORCE(kthread_id_t *)
138c97ad5cdSakolb  *
139c97ad5cdSakolb  * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
140c97ad5cdSakolb  * state on project or zone wait queues, as requested by TS_PROJWAITQ or
141c97ad5cdSakolb  * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
142c97ad5cdSakolb  * wait queue or False otherwise.
143c97ad5cdSakolb  *
144c97ad5cdSakolb  *   cpucaps_sc_init(caps_sc_t *)
145c97ad5cdSakolb  *
146c97ad5cdSakolb  * Initializes the scheduling-class specific CPU Caps data for a thread.
147c97ad5cdSakolb  *
148c97ad5cdSakolb  * LOCKS
149c97ad5cdSakolb  * =====
150c97ad5cdSakolb  *
 * All the individual caps structures and their lists are protected by a global
152c97ad5cdSakolb  * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
153c97ad5cdSakolb  * caps, so it is usually uncontended. We avoid all blocking memory allocations
154c97ad5cdSakolb  * while holding caps_lock to prevent clock() from blocking.
155c97ad5cdSakolb  *
156c97ad5cdSakolb  * Thread state is protected by the thread lock. It protects the association
157c97ad5cdSakolb  * between a thread and its project and, as a consequence, to its zone. The
158c97ad5cdSakolb  * association can not break while thread lock is held, so the project or zone
159c97ad5cdSakolb  * cap are not going to disappear while thread lock is held.
160c97ad5cdSakolb  *
161c97ad5cdSakolb  * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
162c97ad5cdSakolb  * grabbed by scheduling classes already holding thread lock at high PIL and by
163c97ad5cdSakolb  * clock thread performing usage decay. We should do as little work as possible
164c97ad5cdSakolb  * while holding the lock since it may be very hot. All threads in the project
165c97ad5cdSakolb  * contend for the same cache line doing cap usage updates.
166c97ad5cdSakolb  */
167c97ad5cdSakolb 
/*
 * caps_lock protects list of capped projects and zones, changes in the cap
 * state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This can be per-zone cap flag, but we don't keep any
 * cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick.
 * Decay one per cent of value per tick.
 */
#define	CAP_DECAY_FACTOR 100

/*
 * Scale the value and round it to the closest integer value.
 * NOTE: both arguments are evaluated more than once, so only pass
 * side-effect-free expressions.
 */
#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))

/* Per-tick usage update routine, installed as cpucaps_clock_callout. */
static void caps_update();
200c97ad5cdSakolb 
/*
 * CAP kstats.
 *
 * A single static template is shared by all caps: every cap's kstat points
 * its ks_data at this structure (see cap_project_enable()/cap_zone_enable())
 * and per-cap values are filled in by the cap_kstat_update() routine under
 * cap_kstat_lock before each read.
 */
struct cap_kstat {
	kstat_named_t	cap_value;	/* configured cap value */
	kstat_named_t	cap_usage;	/* current usage */
	kstat_named_t	cap_nwait;	/* threads on the cap wait queue */
	kstat_named_t	cap_below;	/* time below cap (exported as sec) */
	kstat_named_t	cap_above;	/* time at/above cap (exported as sec) */
	kstat_named_t	cap_maxusage;	/* maximum usage observed */
	kstat_named_t	cap_zonename;	/* name of the owning zone */
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};


static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);
225c97ad5cdSakolb 
226c97ad5cdSakolb /*
227c97ad5cdSakolb  * Initialize CPU caps infrastructure.
228c97ad5cdSakolb  *   - Initialize lists of capped zones and capped projects
229c97ad5cdSakolb  *   - Set cpucaps_clock_callout to NULL
230c97ad5cdSakolb  */
231c97ad5cdSakolb void
232c97ad5cdSakolb cpucaps_init()
233c97ad5cdSakolb {
234c97ad5cdSakolb 	/*
235c97ad5cdSakolb 	 * Initialize global variables
236c97ad5cdSakolb 	 */
237c97ad5cdSakolb 	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
238c97ad5cdSakolb 
239c97ad5cdSakolb 	list_create(&capped_zones, sizeof (cpucap_t),
240c97ad5cdSakolb 	    offsetof(cpucap_t, cap_link));
241c97ad5cdSakolb 	list_create(&capped_projects, sizeof (cpucap_t),
242c97ad5cdSakolb 	    offsetof(cpucap_t, cap_link));
243c97ad5cdSakolb 
244c97ad5cdSakolb 	cpucaps_enabled = B_FALSE;
245c97ad5cdSakolb 	cpucaps_busy = B_FALSE;
246c97ad5cdSakolb 	cpucaps_clock_callout = NULL;
247c97ad5cdSakolb }
248c97ad5cdSakolb 
/*
 * Initialize scheduling-class specific CPU Caps data for a thread.
 * Resets the thread's not-yet-charged on-CPU time to zero.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}
257c97ad5cdSakolb 
/*
 * Allocate and initialize cpucap structure.
 *
 * Uses a blocking KM_SLEEP allocation, so this must NOT be called with
 * caps_lock held (see the LOCKS section above). Callers allocate first and
 * attach the structure to its project/zone later, under caps_lock.
 * The returned cap is zeroed, i.e. disabled (cap_value == 0).
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}
271c97ad5cdSakolb 
/*
 * Free cpucap structure.
 * A NULL argument is a no-op, which lets callers free speculatively
 * allocated caps unconditionally (see cap_project_zone_modify_walker()).
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active: it must be off the capped list,
	 * disabled (cap_value == 0) and nobody may hold its usage lock.
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}
293c97ad5cdSakolb 
/*
 * Activate cap - insert it into active list `l' and unblock its wait queue
 * so previously enqueued threads can run again. The cap_value field is set
 * to the value supplied and all statistics (usage, maxusage, above/below
 * counts) are reset. Enabling the very first cap in the system also turns
 * the framework on globally by installing caps_update() as the clock
 * callout.
 *
 * Should be called with caps_lock held.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * Cap can not be already enabled
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		/* First cap in the system - enable the framework globally */
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}
321c97ad5cdSakolb 
/*
 * Deactivate cap
 *   - Block its wait queue. This prevents any new threads from being
 *	enqueued there and moves all enqueued threads to the run queue.
 *   - Remove cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones
 *     left (clears cpucaps_enabled and the clock callout).
 *   - Detach the cap from its project/zone and delete its kstat.
 *
 * The cpucap_t itself is not freed here; it stays attached (disabled) so it
 * can be re-enabled later without reallocation.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}

}
357c97ad5cdSakolb 
/*
 * Enable cap for a project kpj.
 * It is safe to enable already enabled project cap - the CAP_DISABLED()
 * check makes this a no-op in that case.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats. The kstat is virtual: ks_data points at
		 * the shared cap_kstat template and cap_kstat_update() fills
		 * in this cap's values (from ks_private) under cap_kstat_lock.
		 * ks_data_size is grown to cover the zone name string.
		 * kstat creation failure is not fatal - we just run without
		 * observability for this cap.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}
394c97ad5cdSakolb 
/*
 * Disable project cap.
 * It is safe to disable already disabled project cap - the CAP_ENABLED()
 * check makes this a no-op in that case.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}
412c97ad5cdSakolb 
/*
 * Enable cap for a zone.
 * It is safe to enable already enabled zone cap - the CAP_DISABLED() check
 * makes this a no-op in that case.
 * Mirrors cap_project_enable() but inserts into the capped_zones list.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats. Virtual kstat backed by the shared
		 * cap_kstat template; per-cap data is produced by
		 * cap_kstat_update() under cap_kstat_lock. Failure to create
		 * the kstat is not fatal.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}
448c97ad5cdSakolb 
/*
 * Disable zone cap.
 * It is safe to disable already disabled zone cap - the CAP_ENABLED() check
 * makes this a no-op in that case.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}
466c97ad5cdSakolb 
/*
 * Apply specified callback to all caps contained in the list `l'.
 *
 * Every walk is tagged with a generation number that is passed to the
 * callback; cap_project_usage_walker() compares it against the per-zone
 * cap_gen to detect the first project of a zone seen during a given tick.
 * The generation counter is bumped atomically after each walk.
 *
 * NOTE(review): cpucap_walk_gen is uint64_t while the callback parameter is
 * int64_t; the value is only used for equality comparison, so the implicit
 * conversion is benign, but the types could be unified.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
{
	static uint64_t cpucap_walk_gen;
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap, cpucap_walk_gen);
	}

	atomic_inc_64(&cpucap_walk_gen);
}
484c97ad5cdSakolb 
/*
 * If cap limit is not reached, make one thread from wait queue runnable.
 * The waitq_isempty check is performed without the waitq lock. If a new thread
 * is placed on the waitq right after the check, it will be picked up during the
 * next invocation of cap_poke_waitq().
 *
 * Also maintains the cap_above/cap_below tick counters used by the kstats.
 * Used both directly as a cap_walk() callback for zone caps and called from
 * cap_project_usage_walker() for project caps; `gen' is unused here.
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		/* Over the cap - nobody leaves the waitq this tick */
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		/* Release at most one waiting thread per tick */
		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}
508c97ad5cdSakolb 
/*
 * The callback function called for every cap on capped_projects list.
 * Decay cap usage by CAP_DECAY_FACTOR.
 * Add this cap project usage to its zone usage.
 * Kick off a thread from the cap waitq if cap is not reached.
 *
 * `gen' is the cap_walk() generation number; it identifies the current tick
 * so the zone usage can be reset exactly once per tick, when the first of
 * the zone's projects is visited.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;	/* snapshot for this tick */

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Set or clear the CAP_REACHED flag based on the current usage.
	 * Only projects having their own caps are ever marked as CAP_REACHED.
	 */
	cap_poke_waitq(cap, 0);

	/*
	 * Add project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_gen field is used to check
		 * whether this is the first zone's project we see during this
		 * tick or a subsequent one.
		 */
		if (zcap->cap_gen != gen) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_gen = gen;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows - clamp just below MAX_USAGE on wrap */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage by one per cent of the snapshot taken above.
	 * The usage lock keeps this consistent with concurrent charge updates
	 * done by scheduling classes at high PIL.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}
567c97ad5cdSakolb 
568c97ad5cdSakolb /*
569c97ad5cdSakolb  * On every clock tick walk the list of project caps and update the CPU usage.
570c97ad5cdSakolb  * Also walk the list of zone caps checking whether any threads should
571c97ad5cdSakolb  * transition from wait queue to run queue.
572c97ad5cdSakolb  *
573c97ad5cdSakolb  * This function gets called by the clock thread directly when there are any
574c97ad5cdSakolb  * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
575c97ad5cdSakolb  * caps_lock for long periods of time, so there should be almost no contention
576c97ad5cdSakolb  * for it.
577c97ad5cdSakolb  */
578c97ad5cdSakolb static void
579c97ad5cdSakolb caps_update()
580c97ad5cdSakolb {
581c97ad5cdSakolb 	mutex_enter(&caps_lock);
582c97ad5cdSakolb 	cap_walk(&capped_projects, cap_project_usage_walker);
583c97ad5cdSakolb 	cap_walk(&capped_zones, cap_poke_waitq);
584c97ad5cdSakolb 	mutex_exit(&caps_lock);
585c97ad5cdSakolb }
586c97ad5cdSakolb 
/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if zone cap is enabled and disables if the
 * zone cap is disabled and project doesn't have its own cap.
 *
 * For each project that does not have cpucap structure allocated it allocates a
 * new structure and assigns to kpj->cpu_cap. The allocation is performed
 * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
 * held.
 *
 * Always returns 0 so that project_walk_all() continues to the next project.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
	 * and assign the newly allocated cpucap structure to it. If somebody
	 * raced us and installed one already, free our speculative allocation.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without caps
		 * from the capped_projects list. A cap_value of MAX_USAGE
		 * marks a project capped only because of its zone cap.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to capped_projects list with the MAX_USAGE
		 * sentinel value - it is capped via its zone, not by itself.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}
646c97ad5cdSakolb 
647c97ad5cdSakolb /*
648c97ad5cdSakolb  * Set zone cap to cap_val
649c97ad5cdSakolb  * If cap_val is equal to NOCAP, disable zone cap.
650c97ad5cdSakolb  *
651c97ad5cdSakolb  * If this is the first time a cap is set on a zone, allocate cpucap structure
652c97ad5cdSakolb  * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
653c97ad5cdSakolb  */
654c97ad5cdSakolb int
655c97ad5cdSakolb cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
656c97ad5cdSakolb {
657c97ad5cdSakolb 	cpucap_t *cap = NULL;
658c97ad5cdSakolb 	hrtime_t value;
659c97ad5cdSakolb 
660c97ad5cdSakolb 	if (cap_val == 0)
661c97ad5cdSakolb 		return (EINVAL);
662c97ad5cdSakolb 
663c97ad5cdSakolb 	ASSERT(cap_val <= MAXCAP);
664c97ad5cdSakolb 	if (cap_val > MAXCAP)
665c97ad5cdSakolb 		cap_val = MAXCAP;
666c97ad5cdSakolb 
667c97ad5cdSakolb 	/*
668c97ad5cdSakolb 	 * Nothing to do if trying to disable a cap on a zone when caps are off
669c97ad5cdSakolb 	 * or a zone which does not have a cap yet.
670c97ad5cdSakolb 	 */
671c97ad5cdSakolb 	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
672c97ad5cdSakolb 		return (0);
673c97ad5cdSakolb 
674c97ad5cdSakolb 	if (zone->zone_cpucap == NULL)
675c97ad5cdSakolb 		cap = cap_alloc();
676c97ad5cdSakolb 
677c97ad5cdSakolb 	mutex_enter(&caps_lock);
678c97ad5cdSakolb 
679c97ad5cdSakolb 	if (cpucaps_busy) {
680c97ad5cdSakolb 		mutex_exit(&caps_lock);
681c97ad5cdSakolb 		return (EBUSY);
682c97ad5cdSakolb 	}
683c97ad5cdSakolb 
684c97ad5cdSakolb 	/*
685c97ad5cdSakolb 	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
686c97ad5cdSakolb 	 * held. If it is still NULL, assign a newly allocated cpucap to it.
687c97ad5cdSakolb 	 */
688c97ad5cdSakolb 	if (zone->zone_cpucap == NULL) {
689c97ad5cdSakolb 		zone->zone_cpucap = cap;
690c97ad5cdSakolb 	} else if (cap != NULL) {
691c97ad5cdSakolb 		cap_free(cap);
692c97ad5cdSakolb 	}
693c97ad5cdSakolb 
694c97ad5cdSakolb 	cap = zone->zone_cpucap;
695c97ad5cdSakolb 	value = cap_val * cap_tick_cost;
696c97ad5cdSakolb 	if (value < 0)
697c97ad5cdSakolb 		value = MAX_USAGE;
698c97ad5cdSakolb 
699c97ad5cdSakolb 	/* Nothing to do if the value is staying the same */
700c97ad5cdSakolb 	if (value == cap->cap_value) {
701c97ad5cdSakolb 		mutex_exit(&caps_lock);
702c97ad5cdSakolb 		return (0);
703c97ad5cdSakolb 	}
704c97ad5cdSakolb 
705c97ad5cdSakolb 	/*
706c97ad5cdSakolb 	 * Clear cap statistics since the cap value itself changes.
707c97ad5cdSakolb 	 */
708c97ad5cdSakolb 	cap->cap_above = cap->cap_below = 0;
709c97ad5cdSakolb 
710c97ad5cdSakolb 
711c97ad5cdSakolb 	if (cap_val == NOCAP) {
712c97ad5cdSakolb 		if (CAP_ENABLED(cap)) {
713c97ad5cdSakolb 			/*
714c97ad5cdSakolb 			 * Remove cap for the zone
715c97ad5cdSakolb 			 */
716c97ad5cdSakolb 			cap_zone_disable(zone);
717c97ad5cdSakolb 			cpucaps_busy = B_TRUE;
718c97ad5cdSakolb 			mutex_exit(&caps_lock);
719c97ad5cdSakolb 			/*
720c97ad5cdSakolb 			 * Disable caps for all project belonging to this zone
721c97ad5cdSakolb 			 * unless they have their own cap.
722c97ad5cdSakolb 			 */
723c97ad5cdSakolb 			(void) project_walk_all(zone->zone_id,
724c97ad5cdSakolb 			    cap_project_zone_modify_walker, cap);
725c97ad5cdSakolb 
726c97ad5cdSakolb 			mutex_enter(&caps_lock);
727c97ad5cdSakolb 			cpucaps_busy = B_FALSE;
728c97ad5cdSakolb 		}
729c97ad5cdSakolb 	} else if (CAP_DISABLED(cap)) {
730c97ad5cdSakolb 		/*
731c97ad5cdSakolb 		 * Set a cap on a zone which previously was not capped.
732c97ad5cdSakolb 		 */
733c97ad5cdSakolb 		cap_zone_enable(zone, value);
734c97ad5cdSakolb 		cpucaps_busy = B_TRUE;
735c97ad5cdSakolb 		mutex_exit(&caps_lock);
736c97ad5cdSakolb 
737c97ad5cdSakolb 		/*
738c97ad5cdSakolb 		 * Enable cap for all projects belonging to this zone.
739c97ad5cdSakolb 		 */
740c97ad5cdSakolb 		(void) project_walk_all(zone->zone_id,
741c97ad5cdSakolb 		    cap_project_zone_modify_walker, cap);
742c97ad5cdSakolb 
743c97ad5cdSakolb 		mutex_enter(&caps_lock);
744c97ad5cdSakolb 		cpucaps_busy = B_FALSE;
745c97ad5cdSakolb 	} else {
746c97ad5cdSakolb 		/*
747c97ad5cdSakolb 		 * No state transitions, just change the value
748c97ad5cdSakolb 		 */
749c97ad5cdSakolb 		cap->cap_value = value;
750c97ad5cdSakolb 	}
751c97ad5cdSakolb 
752c97ad5cdSakolb 	ASSERT(MUTEX_HELD(&caps_lock));
753c97ad5cdSakolb 	ASSERT(!cpucaps_busy);
754c97ad5cdSakolb 	mutex_exit(&caps_lock);
755c97ad5cdSakolb 
756c97ad5cdSakolb 	return (0);
757c97ad5cdSakolb }
758c97ad5cdSakolb 
759c97ad5cdSakolb /*
760c97ad5cdSakolb  * The project is going away so disable its cap.
761c97ad5cdSakolb  */
762c97ad5cdSakolb void
763c97ad5cdSakolb cpucaps_project_remove(kproject_t *kpj)
764c97ad5cdSakolb {
765c97ad5cdSakolb 	mutex_enter(&caps_lock);
766c97ad5cdSakolb 	if (PROJECT_IS_CAPPED(kpj))
767c97ad5cdSakolb 		cap_project_disable(kpj);
768c97ad5cdSakolb 	if (kpj->kpj_cpucap != NULL) {
769c97ad5cdSakolb 		cap_free(kpj->kpj_cpucap);
770c97ad5cdSakolb 		kpj->kpj_cpucap = NULL;
771c97ad5cdSakolb 	}
772c97ad5cdSakolb 	mutex_exit(&caps_lock);
773c97ad5cdSakolb }
774c97ad5cdSakolb 
775c97ad5cdSakolb /*
776c97ad5cdSakolb  * The zone is going away, so disable its cap.
777c97ad5cdSakolb  */
778c97ad5cdSakolb void
779c97ad5cdSakolb cpucaps_zone_remove(zone_t *zone)
780c97ad5cdSakolb {
781c97ad5cdSakolb 	mutex_enter(&caps_lock);
782c97ad5cdSakolb 	while (ZONE_IS_CAPPED(zone)) {
783c97ad5cdSakolb 		mutex_exit(&caps_lock);
784c97ad5cdSakolb 		(void) cpucaps_zone_set(zone, NOCAP);
785c97ad5cdSakolb 		mutex_enter(&caps_lock);
786c97ad5cdSakolb 	}
787c97ad5cdSakolb 	if (zone->zone_cpucap != NULL) {
788c97ad5cdSakolb 		cap_free(zone->zone_cpucap);
789c97ad5cdSakolb 		zone->zone_cpucap = NULL;
790c97ad5cdSakolb 	}
791c97ad5cdSakolb 	mutex_exit(&caps_lock);
792c97ad5cdSakolb }
793c97ad5cdSakolb 
794c97ad5cdSakolb /*
795c97ad5cdSakolb  * New project was created. It should be put on the capped_projects list if
796c97ad5cdSakolb  * its zone has a cap.
797c97ad5cdSakolb  */
798c97ad5cdSakolb void
799c97ad5cdSakolb cpucaps_project_add(kproject_t *kpj)
800c97ad5cdSakolb {
801c97ad5cdSakolb 	cpucap_t *cap = NULL;
802c97ad5cdSakolb 
803c97ad5cdSakolb 	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
804c97ad5cdSakolb 		return;
805c97ad5cdSakolb 
806c97ad5cdSakolb 	/*
807c97ad5cdSakolb 	 * This project was never capped before, so allocate its cap structure.
808c97ad5cdSakolb 	 */
809c97ad5cdSakolb 	if (kpj->kpj_cpucap == NULL)
810c97ad5cdSakolb 		cap = cap_alloc();
811c97ad5cdSakolb 
812c97ad5cdSakolb 	mutex_enter(&caps_lock);
813c97ad5cdSakolb 	/*
814c97ad5cdSakolb 	 * Double-check with caps_lock held
815c97ad5cdSakolb 	 */
816c97ad5cdSakolb 	if (kpj->kpj_cpucap == NULL) {
817c97ad5cdSakolb 		kpj->kpj_cpucap = cap;
818c97ad5cdSakolb 	} else if (cap != NULL) {
819c97ad5cdSakolb 		cap_free(cap);
820c97ad5cdSakolb 	}
821c97ad5cdSakolb 
822c97ad5cdSakolb 	if (ZONE_IS_CAPPED(kpj->kpj_zone))
823c97ad5cdSakolb 		cap_project_enable(kpj, MAX_USAGE);
824c97ad5cdSakolb 
825c97ad5cdSakolb 	mutex_exit(&caps_lock);
826c97ad5cdSakolb }
827c97ad5cdSakolb 
828c97ad5cdSakolb /*
829c97ad5cdSakolb  * Set project cap to cap_val
830c97ad5cdSakolb  * If cap_val is equal to NOCAP, disable project cap.
831c97ad5cdSakolb  *
832c97ad5cdSakolb  * If this is the first time a cap is set on a project, allocate cpucap
833c97ad5cdSakolb  * structure without holding caps_lock to avoid KM_SLEEP allocation with
834c97ad5cdSakolb  * caps_lock held.
835c97ad5cdSakolb  */
836c97ad5cdSakolb int
837c97ad5cdSakolb cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
838c97ad5cdSakolb {
839c97ad5cdSakolb 	cpucap_t *cap = NULL;
840c97ad5cdSakolb 	hrtime_t value;
841c97ad5cdSakolb 
842c97ad5cdSakolb 	if (cap_val == 0)
843c97ad5cdSakolb 		return (EINVAL);
844c97ad5cdSakolb 
845c97ad5cdSakolb 	ASSERT(cap_val <= MAXCAP);
846c97ad5cdSakolb 	if (cap_val > MAXCAP)
847c97ad5cdSakolb 		cap_val = MAXCAP;
848c97ad5cdSakolb 
849c97ad5cdSakolb 	/*
850c97ad5cdSakolb 	 * Nothing to do if trying to disable project cap and caps are not
851c97ad5cdSakolb 	 * enabled or if trying to disable cap on a project that does not have
852c97ad5cdSakolb 	 * cap enabled.
853c97ad5cdSakolb 	 */
854c97ad5cdSakolb 	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
855c97ad5cdSakolb 		return (0);
856c97ad5cdSakolb 
857c97ad5cdSakolb 	if (kpj->kpj_cpucap == NULL) {
858c97ad5cdSakolb 		/*
859c97ad5cdSakolb 		 * This project was never capped before, so allocate its cap
860c97ad5cdSakolb 		 * structure.
861c97ad5cdSakolb 		 */
862c97ad5cdSakolb 		cap = cap_alloc();
863c97ad5cdSakolb 	}
864c97ad5cdSakolb 
865c97ad5cdSakolb 	mutex_enter(&caps_lock);
866c97ad5cdSakolb 
867c97ad5cdSakolb 	/*
868c97ad5cdSakolb 	 * Double-check with caps_lock held.
869c97ad5cdSakolb 	 */
870c97ad5cdSakolb 	if (kpj->kpj_cpucap == NULL) {
871c97ad5cdSakolb 		kpj->kpj_cpucap = cap;
872c97ad5cdSakolb 	} else if (cap != NULL) {
873c97ad5cdSakolb 		cap_free(cap);
874c97ad5cdSakolb 	}
875c97ad5cdSakolb 
876c97ad5cdSakolb 	/*
877c97ad5cdSakolb 	 * Get the actual pointer to the project cap.
878c97ad5cdSakolb 	 */
879c97ad5cdSakolb 	cap = kpj->kpj_cpucap;
880c97ad5cdSakolb 	value = cap_val * cap_tick_cost;
881c97ad5cdSakolb 	if (value < 0)
882c97ad5cdSakolb 		value = MAX_USAGE;
883c97ad5cdSakolb 
884c97ad5cdSakolb 	/*
885c97ad5cdSakolb 	 * Nothing to do if the value is not changing
886c97ad5cdSakolb 	 */
887c97ad5cdSakolb 	if (value == cap->cap_value) {
888c97ad5cdSakolb 		mutex_exit(&caps_lock);
889c97ad5cdSakolb 		return (0);
890c97ad5cdSakolb 	}
891c97ad5cdSakolb 
892c97ad5cdSakolb 	/*
893c97ad5cdSakolb 	 * Clear cap statistics since the cap value itself changes.
894c97ad5cdSakolb 	 */
895c97ad5cdSakolb 	cap->cap_above = cap->cap_below = 0;
896c97ad5cdSakolb 	cap->cap_maxusage = 0;
897c97ad5cdSakolb 
898c97ad5cdSakolb 	if (cap_val != NOCAP) {
899c97ad5cdSakolb 		/*
900c97ad5cdSakolb 		 * Enable this cap if it is not already enabled.
901c97ad5cdSakolb 		 */
902c97ad5cdSakolb 		if (CAP_DISABLED(cap))
903c97ad5cdSakolb 			cap_project_enable(kpj, value);
904c97ad5cdSakolb 		else
905c97ad5cdSakolb 			cap->cap_value = value;
906c97ad5cdSakolb 	} else if (CAP_ENABLED(cap)) {
907c97ad5cdSakolb 		/*
908c97ad5cdSakolb 		 * User requested to drop a cap on the project. If it is part of
909c97ad5cdSakolb 		 * capped zone, keep the cap and set the value to MAX_USAGE,
910c97ad5cdSakolb 		 * otherwise disable the cap.
911c97ad5cdSakolb 		 */
912c97ad5cdSakolb 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
913c97ad5cdSakolb 			cap->cap_value = MAX_USAGE;
914c97ad5cdSakolb 		} else {
915c97ad5cdSakolb 			cap_project_disable(kpj);
916c97ad5cdSakolb 		}
917c97ad5cdSakolb 	}
918c97ad5cdSakolb 	mutex_exit(&caps_lock);
919c97ad5cdSakolb 
920c97ad5cdSakolb 	return (0);
921c97ad5cdSakolb }
922c97ad5cdSakolb 
923c97ad5cdSakolb /*
924c97ad5cdSakolb  * Get cap usage.
925c97ad5cdSakolb  */
926c97ad5cdSakolb static rctl_qty_t
927c97ad5cdSakolb cap_get(cpucap_t *cap)
928c97ad5cdSakolb {
929c97ad5cdSakolb 	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
930c97ad5cdSakolb }
931c97ad5cdSakolb 
932c97ad5cdSakolb /*
933c97ad5cdSakolb  * Get current project usage.
934c97ad5cdSakolb  */
935c97ad5cdSakolb rctl_qty_t
936c97ad5cdSakolb cpucaps_project_get(kproject_t *kpj)
937c97ad5cdSakolb {
938c97ad5cdSakolb 	return (cap_get(kpj->kpj_cpucap));
939c97ad5cdSakolb }
940c97ad5cdSakolb 
941c97ad5cdSakolb /*
942c97ad5cdSakolb  * Get current zone usage.
943c97ad5cdSakolb  */
944c97ad5cdSakolb rctl_qty_t
945c97ad5cdSakolb cpucaps_zone_get(zone_t *zone)
946c97ad5cdSakolb {
947c97ad5cdSakolb 	return (cap_get(zone->zone_cpucap));
948c97ad5cdSakolb }
949c97ad5cdSakolb 
950c97ad5cdSakolb /*
951c97ad5cdSakolb  * Charge project of thread t the time thread t spent on CPU since previously
952c97ad5cdSakolb  * adjusted.
953c97ad5cdSakolb  *
954c97ad5cdSakolb  * Record the current on-CPU time in the csc structure.
955c97ad5cdSakolb  *
956c97ad5cdSakolb  * Do not adjust for more than one tick worth of time.
957c97ad5cdSakolb  *
9584b175f6fSakolb  * It is possible that the project cap is being disabled while this routine is
9594b175f6fSakolb  * executed. This should not cause any issues since the association between the
9604b175f6fSakolb  * thread and its project is protected by thread lock.
961c97ad5cdSakolb  */
962c97ad5cdSakolb static void
963c97ad5cdSakolb caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
964c97ad5cdSakolb {
965c97ad5cdSakolb 	kproject_t	*kpj = ttoproj(t);
966c97ad5cdSakolb 	hrtime_t	new_usage;
967c97ad5cdSakolb 	hrtime_t	usage_delta;
968c97ad5cdSakolb 
969c97ad5cdSakolb 	ASSERT(THREAD_LOCK_HELD(t));
9704b175f6fSakolb 	ASSERT(kpj->kpj_cpucap != NULL);
971c97ad5cdSakolb 
972c97ad5cdSakolb 	/* Get on-CPU time since birth of a thread */
973c97ad5cdSakolb 	new_usage = mstate_thread_onproc_time(t);
974c97ad5cdSakolb 
975c97ad5cdSakolb 	/* Time spent on CPU since last checked */
976c97ad5cdSakolb 	usage_delta = new_usage - csc->csc_cputime;
977c97ad5cdSakolb 
978c97ad5cdSakolb 	/* Save the accumulated on-CPU time */
979c97ad5cdSakolb 	csc->csc_cputime = new_usage;
980c97ad5cdSakolb 
981c97ad5cdSakolb 	/* Charge at most one tick worth of on-CPU time */
982c97ad5cdSakolb 	if (usage_delta > cap_tick_cost)
983c97ad5cdSakolb 		usage_delta = cap_tick_cost;
984c97ad5cdSakolb 
985c97ad5cdSakolb 	/* Add usage_delta to the project usage value. */
986c97ad5cdSakolb 	if (usage_delta > 0) {
987c97ad5cdSakolb 		cpucap_t *cap = kpj->kpj_cpucap;
988c97ad5cdSakolb 
989c97ad5cdSakolb 		DTRACE_PROBE2(cpucaps__project__charge,
990c97ad5cdSakolb 		    kthread_id_t, t, hrtime_t, usage_delta);
991c97ad5cdSakolb 
992c97ad5cdSakolb 		disp_lock_enter_high(&cap->cap_usagelock);
993c97ad5cdSakolb 		cap->cap_usage += usage_delta;
994c97ad5cdSakolb 
995c97ad5cdSakolb 		/* Check for overflows */
996c97ad5cdSakolb 		if (cap->cap_usage < 0)
997c97ad5cdSakolb 			cap->cap_usage = MAX_USAGE - 1;
998c97ad5cdSakolb 
999c97ad5cdSakolb 		disp_lock_exit_high(&cap->cap_usagelock);
1000c97ad5cdSakolb 
1001c97ad5cdSakolb 		/*
1002c97ad5cdSakolb 		 * cap_maxusage is only kept for observability. Move it outside
1003c97ad5cdSakolb 		 * the lock to reduce the time spent while holding the lock.
1004c97ad5cdSakolb 		 */
1005c97ad5cdSakolb 		if (cap->cap_usage > cap->cap_maxusage)
1006c97ad5cdSakolb 			cap->cap_maxusage = cap->cap_usage;
1007c97ad5cdSakolb 	}
1008c97ad5cdSakolb }
1009c97ad5cdSakolb 
1010c97ad5cdSakolb /*
1011c97ad5cdSakolb  * Charge thread's project and return True if project or zone should be
1012c97ad5cdSakolb  * penalized because its project or zone is exceeding its cap. Also sets
1013c97ad5cdSakolb  * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
10144b175f6fSakolb  *
10154b175f6fSakolb  * It is possible that the project cap is being disabled while this routine is
10164b175f6fSakolb  * executed. This should not cause any issues since the association between the
10174b175f6fSakolb  * thread and its project is protected by thread lock. It will still set
10184b175f6fSakolb  * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
10194b175f6fSakolb  * anything on the blocked wait queue.
10204b175f6fSakolb  *
1021c97ad5cdSakolb  */
1022c97ad5cdSakolb boolean_t
1023c97ad5cdSakolb cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
1024c97ad5cdSakolb {
1025c97ad5cdSakolb 	kproject_t	*kpj = ttoproj(t);
1026c97ad5cdSakolb 	klwp_t		*lwp = t->t_lwp;
1027c97ad5cdSakolb 	zone_t		*zone;
1028c97ad5cdSakolb 	cpucap_t	*project_cap;
1029c97ad5cdSakolb 	boolean_t	rc = B_FALSE;
1030c97ad5cdSakolb 
1031c97ad5cdSakolb 	ASSERT(THREAD_LOCK_HELD(t));
1032c97ad5cdSakolb 
1033c97ad5cdSakolb 	/* Nothing to do for projects that are not capped. */
1034c97ad5cdSakolb 	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
1035c97ad5cdSakolb 		return (B_FALSE);
1036c97ad5cdSakolb 
1037c97ad5cdSakolb 	caps_charge_adjust(t, csc);
1038c97ad5cdSakolb 
1039c97ad5cdSakolb 	/*
1040c97ad5cdSakolb 	 * The caller only requested to charge the project usage, no enforcement
1041c97ad5cdSakolb 	 * part.
1042c97ad5cdSakolb 	 */
1043c97ad5cdSakolb 	if (charge_type == CPUCAPS_CHARGE_ONLY)
1044c97ad5cdSakolb 		return (B_FALSE);
1045c97ad5cdSakolb 
1046c97ad5cdSakolb 	project_cap = kpj->kpj_cpucap;
1047c97ad5cdSakolb 
1048c97ad5cdSakolb 	if (project_cap->cap_usage >= project_cap->cap_value) {
1049c97ad5cdSakolb 		t->t_schedflag |= TS_PROJWAITQ;
1050c97ad5cdSakolb 		rc = B_TRUE;
1051c97ad5cdSakolb 	} else if (t->t_schedflag & TS_PROJWAITQ) {
1052c97ad5cdSakolb 		t->t_schedflag &= ~TS_PROJWAITQ;
1053c97ad5cdSakolb 	}
1054c97ad5cdSakolb 
1055c97ad5cdSakolb 	zone = ttozone(t);
1056c97ad5cdSakolb 	if (!ZONE_IS_CAPPED(zone)) {
1057c97ad5cdSakolb 		if (t->t_schedflag & TS_ZONEWAITQ)
1058c97ad5cdSakolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
1059c97ad5cdSakolb 	} else {
1060c97ad5cdSakolb 		cpucap_t *zone_cap = zone->zone_cpucap;
1061c97ad5cdSakolb 
1062c97ad5cdSakolb 		if (zone_cap->cap_usage >= zone_cap->cap_value) {
1063c97ad5cdSakolb 			t->t_schedflag |= TS_ZONEWAITQ;
1064c97ad5cdSakolb 			rc = B_TRUE;
1065c97ad5cdSakolb 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
1066c97ad5cdSakolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
1067c97ad5cdSakolb 		}
1068c97ad5cdSakolb 	}
1069c97ad5cdSakolb 
1070c97ad5cdSakolb 
1071c97ad5cdSakolb 	return (rc);
1072c97ad5cdSakolb }
1073c97ad5cdSakolb 
1074c97ad5cdSakolb /*
1075c97ad5cdSakolb  * Enforce CPU caps. If got preempted in the user-land, we know that thread does
1076c97ad5cdSakolb  * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
1077c97ad5cdSakolb  *
1078c97ad5cdSakolb  * CPU Caps are only enforced for user threads.
1079c97ad5cdSakolb  *
1080c97ad5cdSakolb  * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
1081c97ad5cdSakolb  * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
1082c97ad5cdSakolb  *
1083c97ad5cdSakolb  * It is possible that by the time we enter cpucaps_enforce() the cap is already
1084c97ad5cdSakolb  * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
1085c97ad5cdSakolb  * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
1086c97ad5cdSakolb  * apply.
1087c97ad5cdSakolb  */
1088c97ad5cdSakolb boolean_t
1089c97ad5cdSakolb cpucaps_enforce(kthread_t *t)
1090c97ad5cdSakolb {
1091c97ad5cdSakolb 	klwp_t *lwp = t->t_lwp;
1092c97ad5cdSakolb 
1093c97ad5cdSakolb 	ASSERT(THREAD_LOCK_HELD(t));
1094c97ad5cdSakolb 
1095c97ad5cdSakolb 	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
1096c97ad5cdSakolb 		if (t->t_schedflag & TS_PROJWAITQ) {
1097c97ad5cdSakolb 			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
1098c97ad5cdSakolb 			t->t_schedflag &= ~TS_ANYWAITQ;
1099c97ad5cdSakolb 			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
1100c97ad5cdSakolb 			    t)) {
1101c97ad5cdSakolb 				return (B_TRUE);
1102c97ad5cdSakolb 			}
1103c97ad5cdSakolb 		}
1104c97ad5cdSakolb 		if (t->t_schedflag & TS_ZONEWAITQ) {
1105c97ad5cdSakolb 			ASSERT(ttozone(t)->zone_cpucap != NULL);
1106c97ad5cdSakolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
1107c97ad5cdSakolb 			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
1108c97ad5cdSakolb 			    t)) {
1109c97ad5cdSakolb 				return (B_TRUE);
1110c97ad5cdSakolb 			}
1111c97ad5cdSakolb 		}
1112c97ad5cdSakolb 	}
1113c97ad5cdSakolb 
1114c97ad5cdSakolb 	/*
1115c97ad5cdSakolb 	 * The thread is not enqueued on the wait queue.
1116c97ad5cdSakolb 	 */
1117c97ad5cdSakolb 	return (B_FALSE);
1118c97ad5cdSakolb }
1119c97ad5cdSakolb 
1120c97ad5cdSakolb /*
1121c97ad5cdSakolb  * Convert internal cap statistics into values exported by cap kstat.
1122c97ad5cdSakolb  */
1123c97ad5cdSakolb static int
1124c97ad5cdSakolb cap_kstat_update(kstat_t *ksp, int rw)
1125c97ad5cdSakolb {
1126c97ad5cdSakolb 	struct cap_kstat *capsp = &cap_kstat;
1127c97ad5cdSakolb 	cpucap_t *cap = ksp->ks_private;
1128c97ad5cdSakolb 	clock_t	tick_sec = SEC_TO_TICK(1);
1129c97ad5cdSakolb 	char *zonename = cap->cap_zone->zone_name;
1130c97ad5cdSakolb 
1131c97ad5cdSakolb 	if (rw == KSTAT_WRITE)
1132c97ad5cdSakolb 		return (EACCES);
1133c97ad5cdSakolb 
1134c97ad5cdSakolb 	capsp->cap_value.value.ui64 =
1135c97ad5cdSakolb 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
1136c97ad5cdSakolb 	capsp->cap_usage.value.ui64 =
1137c97ad5cdSakolb 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
1138c97ad5cdSakolb 	capsp->cap_maxusage.value.ui64 =
1139c97ad5cdSakolb 	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
1140c97ad5cdSakolb 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
1141c97ad5cdSakolb 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
1142c97ad5cdSakolb 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
1143c97ad5cdSakolb 	kstat_named_setstr(&capsp->cap_zonename, zonename);
1144c97ad5cdSakolb 
1145c97ad5cdSakolb 	return (0);
1146c97ad5cdSakolb }
1147