xref: /illumos-gate/usr/src/uts/common/disp/sysdc.c (revision e5803b76927480e8f9b67b22201c484ccf4c2bcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * The System Duty Cycle (SDC) scheduling class
27  * --------------------------------------------
28  *
29  * Background
30  *
31  * Kernel threads in Solaris have traditionally not been large consumers
32  * of CPU time.  They typically wake up, perform a small amount of
33  * work, then go back to sleep waiting for either a timeout or another
34  * signal.  On the assumption that the small amount of work that they do
35  * is important for the behavior of the whole system, these threads are
36  * treated kindly by the dispatcher and the SYS scheduling class: they run
37  * without preemption from anything other than real-time and interrupt
38  * threads; when preempted, they are put at the front of the queue, so they
39  * generally do not migrate between CPUs; and they are allowed to stay
40  * running until they voluntarily give up the CPU.
41  *
42  * As Solaris has evolved, new workloads have emerged which require the
43  * kernel to perform significant amounts of CPU-intensive work.  One
44  * example of such a workload is ZFS's transaction group sync processing.
45  * Each sync operation generates a large batch of I/Os, and each I/O
46  * may need to be compressed and/or checksummed before it is written to
47  * storage.  The taskq threads which perform the compression and checksums
48  * will run nonstop as long as they have work to do; a large sync operation
49  * on a compression-heavy dataset can keep them busy for seconds on end.
50  * This causes human-time-scale dispatch latency bubbles for any other
51  * threads which have the misfortune to share a CPU with the taskq threads.
52  *
53  * The SDC scheduling class is a solution to this problem.
54  *
55  *
56  * Overview
57  *
58  * SDC is centered around the concept of a thread's duty cycle (DC):
59  *
60  *			      ONPROC time
61  *	Duty Cycle =	----------------------
62  *			ONPROC + Runnable time
63  *
64  * This is the ratio of the time that the thread spent running on a CPU
65  * divided by the time it spent running or trying to run.  It is unaffected
66  * by any time the thread spent sleeping, stopped, etc.
67  *
68  * A thread joining the SDC class specifies a "target" DC that it wants
69  * to run at.  To implement this policy, the routine sysdc_update() scans
70  * the list of active SDC threads every few ticks and uses each thread's
71  * microstate data to compute the actual duty cycle that that thread
72  * has experienced recently.  If the thread is under its target DC, its
73  * priority is increased to the maximum available (sysdc_maxpri, which is
74  * 99 by default).  If the thread is over its target DC, its priority is
75  * reduced to the minimum available (sysdc_minpri, 0 by default).  This
76  * is a fairly primitive approach, in that it doesn't use any of the
77  * intermediate priorities, but it's not completely inappropriate.  Even
78  * though threads in the SDC class might take a while to do their job, they
79  * are by some definition important if they're running inside the kernel,
80  * so it is reasonable that they should get to run at priority 99.
81  *
82  * If a thread is running when sysdc_update() calculates its actual duty
83  * cycle, and there are other threads of equal or greater priority on its
84  * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
85  * acknowledges the preemption by calling sysdc_preempt(), which calls
86  * setbackdq(), which gives other threads with the same priority a chance
87  * to run.  This creates a de facto time quantum for threads in the SDC
88  * scheduling class.
89  *
90  * An SDC thread which is assigned priority 0 can continue to run if
91  * nothing else needs to use the CPU that it's running on.  Similarly, an
92  * SDC thread at priority 99 might not get to run as much as it wants to
93  * if there are other priority-99 or higher threads on its CPU.  These
94  * situations would cause the thread to get ahead of or behind its target
95  * DC; the longer the situations lasted, the further ahead or behind the
96  * thread would get.  Rather than condemning a thread to a lifetime of
97  * paying for its youthful indiscretions, SDC keeps "base" values for
98  * ONPROC and Runnable times in each thread's sysdc data, and updates these
99  * values periodically.  The duty cycle is then computed using the elapsed
100  * amount of ONPROC and Runnable times since those base times.
101  *
102  * Since sysdc_update() scans SDC threads fairly frequently, it tries to
103  * keep the list of "active" threads small by pruning out threads which
104  * have been asleep for a brief time.  They are not pruned immediately upon
105  * going to sleep, since some threads may bounce back and forth between
106  * sleeping and being runnable.
107  *
108  *
109  * Interfaces
110  *
111  * void sysdc_thread_enter(t, dc, flags)
112  *
113  *	Moves a kernel thread from the SYS scheduling class to the
114  *	SDC class. t must have an associated LWP (created by calling
115  *	lwp_kernel_create()).  The thread will have a target DC of dc.
116  *	Flags should be either 0 or SYSDC_THREAD_BATCH.  If
117  *	SYSDC_THREAD_BATCH is specified, the thread is expected to be
118  *	doing large amounts of processing.
119  *
120  *
121  * Complications
122  *
123  * - Run queue balancing
124  *
125  *	The Solaris dispatcher is biased towards letting a thread run
126  *	on the same CPU which it last ran on, if no more than 3 ticks
127  *	(i.e. rechoose_interval) have passed since the thread last ran.
128  *	This helps to preserve cache warmth.  On the other hand, it also
129  *	tries to keep the per-CPU run queues fairly balanced; if the CPU
130  *	chosen for a runnable thread has a run queue which is three or
131  *	more threads longer than a neighboring CPU's queue, the runnable
132  *	thread is dispatched onto the neighboring CPU instead.
133  *
134  *	These policies work well for some workloads, but not for many SDC
135  *	threads.  The taskq client of SDC, for example, has many discrete
136  *	units of work to do.  The work units are largely independent, so
137  *	cache warmth is not an important consideration.  It is important
138  *	that the threads fan out quickly to different CPUs, since the
139  *	amount of work these threads have to do (a few seconds worth at a
140  *	time) doesn't leave much time to correct thread placement errors
141  *	(i.e. two SDC threads being dispatched to the same CPU).
142  *
143  *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
144  *	This tells the dispatcher to keep neighboring run queues' lengths
145  *	more evenly matched, which allows SDC threads to migrate more
146  *	easily.
147  *
148  * - LWPs and system processes
149  *
150  *	SDC can only be used for kernel threads.  Since SDC uses microstate
151  *	accounting data to compute each thread's actual duty cycle, all
152  *	threads entering the SDC class must have associated LWPs (which
153  *	store the microstate data).  This means that the threads have to
154  *	be associated with an SSYS process, i.e. one created by newproc().
155  *	If the microstate accounting information is ever moved into the
156  *	kthread_t, this restriction could be lifted.
157  *
158  * - Dealing with oversubscription
159  *
160  *	Since SDC duty cycles are per-thread, it is possible that the
161  *	aggregate requested duty cycle of all SDC threads in a processor
162  *	set could be greater than the total CPU time available in that set.
163  *	The FSS scheduling class has an analogous situation, which it deals
164  *	with by reducing each thread's allotted CPU time proportionally.
165  *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
166  *	solution to the oversubscription problem.
167  *
168  *	sysdc_update() accumulates the amount of time that max-priority SDC
169  *	threads have spent on-CPU in each processor set, and uses that sum
170  *	to create an implied duty cycle for that processor set:
171  *
172  *				accumulated CPU time
173  *	   pset DC =	-----------------------------------
174  *			 (# CPUs) * time since last update
175  *
176  *	If this implied duty cycle is above a maximum pset duty cycle (90%
177  *	by default), sysdc_update() sets the priority of all SDC threads
178  *	in that processor set to sysdc_minpri for a "break" period.  After
179  *	the break period, it waits for a "nobreak" period before trying to
180  *	enforce the pset duty cycle limit again.
181  *
182  * - Processor sets
183  *
184  *	As the above implies, SDC is processor set aware, but it does not
185  *	currently allow threads to change processor sets while in the SDC
186  *	class.  Instead, those threads must join the desired processor set
187  *	before entering SDC. [1]
188  *
189  * - Batch threads
190  *
 *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *	flag.  This flag currently has no effect, but marks threads which
 *	do bulk processing.
194  *
195  * - t_kpri_req
196  *
197  *	The TS and FSS scheduling classes pay attention to t_kpri_req,
198  *	which provides a simple form of priority inheritance for
199  *	synchronization primitives (such as rwlocks held as READER) which
200  *	cannot be traced to a unique thread.  The SDC class does not honor
201  *	t_kpri_req, for a few reasons:
202  *
203  *	1.  t_kpri_req is notoriously inaccurate.  A measure of its
204  *	    inaccuracy is that it needs to be cleared every time a thread
205  *	    returns to user mode, because it is frequently non-zero at that
206  *	    point.  This can happen because "ownership" of synchronization
207  *	    primitives that use t_kpri_req can be silently handed off,
208  *	    leaving no opportunity to will the t_kpri_req inheritance.
209  *
210  *	2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
211  *	    kernel priority.  This means that even if an SDC thread
212  *	    is holding a synchronization primitive and running at low
213  *	    priority, its priority will eventually be raised above 60,
214  *	    allowing it to drive on and release the resource.
215  *
216  *	3.  The first consumer of SDC uses the taskq subsystem, which holds
217  *	    a reader lock for the duration of the task's execution.  This
218  *	    would mean that SDC threads would never drop below kernel
219  *	    priority in practice, which defeats one of the purposes of SDC.
220  *
221  * - Why not FSS?
222  *
223  *	It might seem that the existing FSS scheduling class could solve
224  *	the problems that SDC is attempting to solve.  FSS's more precise
225  *	solution to the oversubscription problem would hardly cause
226  *	trouble, as long as it performed well.  SDC is implemented as
227  *	a separate scheduling class for two main reasons: the initial
228  *	consumer of SDC does not map well onto the "project" abstraction
229  *	that is central to FSS, and FSS does not expect to run at kernel
230  *	priorities.
231  *
232  *
233  * Tunables
234  *
235  * - sysdc_update_interval_msec:  Number of milliseconds between
236  *	consecutive thread priority updates.
237  *
238  * - sysdc_reset_interval_msec:  Number of milliseconds between
239  *	consecutive resets of a thread's base ONPROC and Runnable
240  *	times.
241  *
242  * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
243  *	before a thread is pruned from the active list.
244  *
245  * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
246  *	CPU time which SDC can give to its high-priority threads.
247  *
248  * - sysdc_break_msec:  Number of milliseconds of "break" taken when
249  *	sysdc_max_pset_DC is exceeded.
250  *
251  *
252  * Future work (in SDC and related subsystems)
253  *
254  * - Per-thread rechoose interval (0 for SDC)
255  *
256  *	Allow each thread to specify its own rechoose interval.  SDC
257  *	threads would specify an interval of zero, which would rechoose
258  *	the CPU with the lowest priority once per update.
259  *
260  * - Allow threads to change processor sets after joining the SDC class
261  *
262  * - Thread groups and per-group DC
263  *
264  *	It might be nice to be able to specify a duty cycle which applies
265  *	to a group of threads in aggregate.
266  *
267  * - Per-group DC callback to allow dynamic DC tuning
268  *
269  *	Currently, DCs are assigned when the thread joins SDC.  Some
270  *	workloads could benefit from being able to tune their DC using
271  *	subsystem-specific knowledge about the workload.
272  *
273  * - Finer-grained priority updates
274  *
275  * - More nuanced management of oversubscription
276  *
277  * - Moving other CPU-intensive threads into SDC
278  *
279  * - Move msacct data into kthread_t
280  *
281  *	This would allow kernel threads without LWPs to join SDC.
282  *
283  *
284  * Footnotes
285  *
286  * [1] The details of doing so are left as an exercise for the reader.
287  */
288 
289 #include <sys/types.h>
290 #include <sys/sysdc.h>
291 #include <sys/sysdc_impl.h>
292 
293 #include <sys/class.h>
294 #include <sys/cmn_err.h>
295 #include <sys/cpuvar.h>
296 #include <sys/cpupart.h>
297 #include <sys/debug.h>
298 #include <sys/disp.h>
299 #include <sys/errno.h>
300 #include <sys/inline.h>
301 #include <sys/kmem.h>
302 #include <sys/modctl.h>
303 #include <sys/schedctl.h>
304 #include <sys/sdt.h>
305 #include <sys/sunddi.h>
306 #include <sys/sysmacros.h>
307 #include <sys/systm.h>
308 #include <sys/var.h>
309 
/*
 * Tunables - loaded into the internal state at module load time
 *
 * sysdc_initparam() converts these into the tick / update-count constants
 * declared below; the code outside sysdc_initparam() uses those derived
 * values (sysdc_max_pset_DC is the exception: it is also compared against
 * directly by sysdc_update()).
 */

/* Milliseconds between consecutive thread priority updates. */
uint_t		sysdc_update_interval_msec = 20;

/* Milliseconds between resets of a thread's base ONPROC/Runnable times. */
uint_t		sysdc_reset_interval_msec = 400;

/* Milliseconds of sleeping before a thread is pruned from the active list. */
uint_t		sysdc_prune_interval_msec = 100;

/*
 * Allowable percentage of a processor set's CPU time which SDC can give
 * to its high-priority threads.  Capped at SYSDC_DC_MAX by
 * sysdc_initparam().
 */
uint_t		sysdc_max_pset_DC = 90;

/* Milliseconds of "break" taken when sysdc_max_pset_DC is exceeded. */
uint_t		sysdc_break_msec = 80;
318 
/*
 * Internal state - constants set up by sysdc_initparam()
 *
 * The "updates" values count invocations of sysdc_update(), not clock
 * ticks; only sysdc_update_ticks is expressed in ticks.
 */
static clock_t	sysdc_update_ticks;	/* ticks between updates */
static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
static uint_t	sysdc_reset_updates;	/* # of updates before reset */
static uint_t	sysdc_break_updates;	/* updates to break */
static uint_t	sysdc_nobreak_updates;	/* updates to not check */
static uint_t	sysdc_minDC;		/* minimum allowed DC */
static uint_t	sysdc_maxDC;		/* maximum allowed DC */
static pri_t	sysdc_minpri;		/* minimum allowed priority */
static pri_t	sysdc_maxpri;		/* maximum allowed priority */
331 
/*
 * Internal state
 *
 * sysdc_dummy is a sentinel, not a real thread's data: every active list
 * ends in &sysdc_dummy rather than NULL, so an empty bucket's head is
 * &sysdc_dummy and a sysdc_t with a NULL sdc_next is known not to be on
 * any active list.
 */
static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
static list_t	sysdc_psets;		/* list of psets with SDC threads */
static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
static uint_t	sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
static sysdc_t	sysdc_dummy;		/* used to terminate active lists */
341 
/*
 * Internal state - active hash table
 *
 * Active sysdc_t's are spread over SYSDC_NLISTS buckets.  The bucket is
 * chosen from the address of the sysdc_t itself: SYSDC_HASH() discards the
 * low 6 bits of the pointer and masks the result with (SYSDC_NLISTS - 1),
 * so SYSDC_NLISTS must be a power of two for every bucket to be reachable.
 * SYSDC_LIST() yields a pointer to the bucket for a given sysdc_t.
 */
#define	SYSDC_NLISTS	8
#define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
#define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])
349 
/*
 * Event counters, present only in DEBUG builds.  Each field is named after
 * the function and the event it counts.  SYSDC_INC_STAT(x) increments
 * sysdc_stats.x with a plain (non-atomic) ++; in non-DEBUG builds it
 * expands to a no-op expression.
 */
#ifdef DEBUG
static struct {
	uint64_t	sysdc_update_times_asleep;
	uint64_t	sysdc_update_times_base_ran_backwards;
	uint64_t	sysdc_update_times_already_done;
	uint64_t	sysdc_update_times_cur_ran_backwards;
	uint64_t	sysdc_compute_pri_breaking;
	uint64_t	sysdc_activate_enter;
	uint64_t	sysdc_update_enter;
	uint64_t	sysdc_update_exited;
	uint64_t	sysdc_update_not_sdc;
	uint64_t	sysdc_update_idle;
	uint64_t	sysdc_update_take_break;
	uint64_t	sysdc_update_no_psets;
	uint64_t	sysdc_tick_not_sdc;
	uint64_t	sysdc_tick_quantum_expired;
	uint64_t	sysdc_thread_enter_enter;
} sysdc_stats;

#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
#else
#define	SYSDC_INC_STAT(x)	((void)0)
#endif
373 
/*
 * Helper macros for sysdc_initparam(); both are #undef'd immediately after
 * that function.  (macros are UPPER CASE)
 *
 * HOWMANY(a, b) wraps howmany() -- presumably the round-up division macro
 * from <sys/sysmacros.h>; confirm.
 *
 * MSECTOTICKS(a) converts milliseconds to clock ticks by scaling to
 * microseconds and dividing by usec_per_tick via HOWMANY().  The
 * multiplication by 1000 is done in the type of (a), so it can wrap for
 * very large uint_t arguments.
 */
#define	HOWMANY(a, b)	howmany((a), (b))
#define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)
377 
378 static void
379 sysdc_initparam(void)
380 {
381 	uint_t sysdc_break_ticks;
382 
383 	/* update / prune intervals */
384 	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
385 
386 	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
387 	    sysdc_update_interval_msec);
388 	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
389 	    sysdc_update_interval_msec);
390 
391 	/* We must get at least a little time on CPU. */
392 	sysdc_minDC = 1;
393 	sysdc_maxDC = SYSDC_DC_MAX;
394 	sysdc_minpri = 0;
395 	sysdc_maxpri = maxclsyspri;
396 
397 	/* break parameters */
398 	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
399 		sysdc_max_pset_DC = SYSDC_DC_MAX;
400 	}
401 	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
402 	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
403 
404 	/*
405 	 * We want:
406 	 *
407 	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
408 	 *
409 	 *	==>	  nobreak = sysdc_max_pset_DC * (break + nobreak)
410 	 *
411 	 *			    sysdc_max_pset_DC * break
412 	 *	==>	  nobreak = -------------------------
413 	 *			    1 - sysdc_max_pset_DC
414 	 */
415 	sysdc_nobreak_updates =
416 	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
417 	    (SYSDC_DC_MAX - sysdc_max_pset_DC));
418 
419 	sysdc_param_init = 1;
420 }
421 
/* The conversion helpers are only meant for sysdc_initparam(). */
#undef HOWMANY
#undef MSECTOTICKS

/*
 * Flags accepted by sysdc_update_times(), sysdc_compute_pri() and
 * sysdc_update_pri(), identifying why the priority is being recomputed.
 * A flags value of 0 is also used (by sysdc_setrun()); in that case the
 * caller is only told to change the priority if it actually differs.
 */
#define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
#define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
#define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */
428 
429 /*
430  * Updates the recorded times in the sdc, and returns the elapsed ONPROC
431  * and Runnable times since the last reset.
432  *
433  * newO is the thread's actual ONPROC time; it's used during sysdc_update()
434  * to track processor set usage.
435  */
436 static void
437 sysdc_update_times(sysdc_t *sdc, uint_t flags,
438     hrtime_t *O, hrtime_t *R, hrtime_t *newO)
439 {
440 	kthread_t *const t = sdc->sdc_thread;
441 	const uint_t	initial = (flags & SDC_UPDATE_INITIAL);
442 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
443 	const clock_t	now = ddi_get_lbolt();
444 	uint_t		do_reset;
445 
446 	ASSERT(THREAD_LOCK_HELD(t));
447 
448 	*O = *R = 0;
449 
450 	/* If we've been sleeping, we know we haven't had any ONPROC time. */
451 	if (sdc->sdc_sleep_updates != 0 &&
452 	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
453 		*newO = sdc->sdc_last_base_O;
454 		SYSDC_INC_STAT(sysdc_update_times_asleep);
455 		return;
456 	}
457 
458 	/*
459 	 * If this is our first update, or we've hit the reset point,
460 	 * we need to reset our base_{O,R}.  Once we've updated them, we
461 	 * report O and R for the entire prior interval.
462 	 */
463 	do_reset = initial;
464 	if (update) {
465 		++sdc->sdc_nupdates;
466 		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
467 			do_reset = 1;
468 	}
469 	if (do_reset) {
470 		hrtime_t baseO, baseR;
471 		if (initial) {
472 			/*
473 			 * Start off our cycle count somewhere in the middle,
474 			 * to keep the resets from all happening at once.
475 			 *
476 			 * 4999 is a handy prime much larger than
477 			 * sysdc_reset_updates, so that we don't run into
478 			 * trouble if the resolution is a multiple of
479 			 * sysdc_reset_updates.
480 			 */
481 			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
482 			    sysdc_reset_updates);
483 			baseO = baseR = 0;
484 		} else {
485 			baseO = sdc->sdc_base_O;
486 			baseR = sdc->sdc_base_R;
487 		}
488 
489 		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
490 		*newO = sdc->sdc_base_O;
491 
492 		sdc->sdc_reset = now;
493 		sdc->sdc_pri_check = -1; /* force mismatch below */
494 
495 		/*
496 		 * See below for rationale.
497 		 */
498 		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
499 			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
500 			baseO = sdc->sdc_base_O;
501 			baseR = sdc->sdc_base_R;
502 		}
503 
504 		/* compute based on the entire interval */
505 		*O = (sdc->sdc_base_O - baseO);
506 		*R = (sdc->sdc_base_R - baseR);
507 		return;
508 	}
509 
510 	/*
511 	 * If we're called from sysdc_update(), we *must* return a value
512 	 * for newO, so we always call mstate_systhread_times().
513 	 *
514 	 * Otherwise, if we've already done a pri check this tick,
515 	 * we can skip it.
516 	 */
517 	if (!update && sdc->sdc_pri_check == now) {
518 		SYSDC_INC_STAT(sysdc_update_times_already_done);
519 		return;
520 	}
521 
522 	/* Get the current times from the thread */
523 	sdc->sdc_pri_check = now;
524 	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
525 	*newO = sdc->sdc_cur_O;
526 
527 	/*
528 	 * The updating of microstate accounting is not done under a
529 	 * consistent set of locks, particularly the t_waitrq field.  This
530 	 * can lead to narrow windows in which we account for time in the
531 	 * wrong bucket, which on the next read will be accounted for
532 	 * correctly.
533 	 *
534 	 * If our sdc_base_* fields were affected by one of these blips, we
535 	 * throw away the old data, and pretend this tick didn't happen.
536 	 */
537 	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
538 	    sdc->sdc_cur_R < sdc->sdc_base_R) {
539 
540 		sdc->sdc_base_O = sdc->sdc_cur_O;
541 		sdc->sdc_base_R = sdc->sdc_cur_R;
542 
543 		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
544 		return;
545 	}
546 
547 	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
548 	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
549 }
550 
551 /*
552  * sysdc_compute_pri()
553  *
554  *	Recomputes the priority of the thread, leaving the result in
555  *	sdc->sdc_epri.  Returns 1 if a priority update should occur
556  *	(which will also trigger a cpu_surrender()), otherwise
557  *	returns 0.
558  */
559 static uint_t
560 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
561 {
562 	kthread_t *const t = sdc->sdc_thread;
563 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
564 	const uint_t	tick = (flags & SDC_UPDATE_TICK);
565 
566 	hrtime_t	O, R;
567 	hrtime_t	newO = -1;
568 
569 	ASSERT(THREAD_LOCK_HELD(t));
570 
571 	sysdc_update_times(sdc, flags, &O, &R, &newO);
572 	ASSERT(!update || newO != -1);
573 
574 	/* If we have new data, recompute our priority. */
575 	if ((O + R) != 0) {
576 		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
577 
578 		/* Adjust our priority to move our DC closer to the target. */
579 		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
580 			sdc->sdc_pri = sdc->sdc_maxpri;
581 		else
582 			sdc->sdc_pri = sdc->sdc_minpri;
583 	}
584 
585 	/*
586 	 * If our per-pset duty cycle goes over the max, we will take a break.
587 	 * This forces all sysdc threads in the pset to minimum priority, in
588 	 * order to let everyone else have a chance at the CPU.
589 	 */
590 	if (sdc->sdc_pset->sdp_need_break) {
591 		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
592 		sdc->sdc_epri = sdc->sdc_minpri;
593 	} else {
594 		sdc->sdc_epri = sdc->sdc_pri;
595 	}
596 
597 	DTRACE_PROBE4(sysdc__compute__pri,
598 	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
599 	    uint_t, sdc->sdc_target_DC);
600 
601 	/*
602 	 * For sysdc_update(), we compute the ONPROC time for high-priority
603 	 * threads, which is used to calculate the per-pset duty cycle.  We
604 	 * will always tell our callers to update the thread's priority,
605 	 * since we want to force a cpu_surrender().
606 	 *
607 	 * We reset sdc_update_ticks so that sysdc_tick() will only update
608 	 * the thread's priority if our timeout is delayed by a tick or
609 	 * more.
610 	 */
611 	if (update) {
612 		/* SDC threads are not allowed to change cpupart bindings. */
613 		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
614 
615 		/* If we were at MAXPRI, account for our onproc time. */
616 		if (t->t_pri == sdc->sdc_maxpri &&
617 		    sdc->sdc_last_base_O != 0 &&
618 		    sdc->sdc_last_base_O < newO) {
619 			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
620 			sdc->sdc_pset->sdp_onproc_time +=
621 			    (uint64_t)sdc->sdc_last_O;
622 			sdc->sdc_pset->sdp_onproc_threads++;
623 		} else {
624 			sdc->sdc_last_O = 0;
625 		}
626 		sdc->sdc_last_base_O = newO;
627 
628 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
629 		return (1);
630 	}
631 
632 	/*
633 	 * Like sysdc_update(), sysdc_tick() always wants to update the
634 	 * thread's priority, so that the CPU is surrendered if necessary.
635 	 * We reset sdc_update_ticks so that if the timeout continues to be
636 	 * delayed, we'll update at the regular interval.
637 	 */
638 	if (tick) {
639 		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
640 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
641 		return (1);
642 	}
643 
644 	/*
645 	 * Otherwise, only tell our callers to update the priority if it has
646 	 * changed.
647 	 */
648 	return (sdc->sdc_epri != t->t_pri);
649 }
650 
651 static void
652 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
653 {
654 	kthread_t *t = sdc->sdc_thread;
655 
656 	ASSERT(THREAD_LOCK_HELD(t));
657 
658 	if (sysdc_compute_pri(sdc, flags)) {
659 		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
660 			cpu_surrender(t);
661 		}
662 	}
663 }
664 
665 /*
666  * Add a thread onto the active list.  It will only be removed by
667  * sysdc_update().
668  */
669 static void
670 sysdc_activate(sysdc_t *sdc)
671 {
672 	sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
673 	sysdc_t		*head;
674 	kthread_t	*t = sdc->sdc_thread;
675 
676 	SYSDC_INC_STAT(sysdc_activate_enter);
677 
678 	ASSERT(sdc->sdc_next == NULL);
679 	ASSERT(THREAD_LOCK_HELD(t));
680 
681 	do {
682 		head = *headp;
683 		sdc->sdc_next = head;
684 	} while (atomic_cas_ptr(headp, head, sdc) != head);
685 }
686 
/*
 * sysdc_update() has two jobs:
 *
 *	1. It updates the priorities of all active SDC threads on the system.
 *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
 *
 * It runs as a timeout callback and re-arms itself every
 * sysdc_update_ticks for as long as any pset is on sysdc_psets.  arg is
 * not interpreted here; it is only handed back to timeout() when
 * re-arming.
 *
 * The work is done in three passes:
 *
 *	- under sysdc_pset_lock, latch each pset's break decision into
 *	  sdp_need_break and zero its ONPROC accumulators;
 *	- for every bucket of the active hash table, under that bucket's
 *	  sdl_lock, walk the threads: reap exited ones, prune long
 *	  sleepers, and recompute the priority of the rest (which also
 *	  refills the pset accumulators, see sysdc_compute_pri());
 *	- under sysdc_pset_lock again, turn the accumulated ONPROC time into
 *	  a per-pset duty cycle and decide whether a break is needed.
 */
static void
sysdc_update(void *arg)
{
	int		idx;
	sysdc_t		*freelist = NULL;	/* exited threads' sysdc_t's */
	sysdc_pset_t	*cur;
	hrtime_t	now, diff;
	uint_t		redeploy = 1;		/* re-arm the timeout? */

	SYSDC_INC_STAT(sysdc_update_enter);

	ASSERT(sysdc_update_timeout_started);

	/*
	 * If this is our first time through, diff will be gigantic, and
	 * no breaks will be necessary.
	 */
	now = gethrtime();
	diff = now - sysdc_last_update;
	sysdc_last_update = now;

	/*
	 * Pass 1: publish the break state decided at the end of the
	 * previous update, and start this interval's accounting from zero.
	 */
	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {
		boolean_t breaking = (cur->sdp_should_break != 0);

		/* Fire the probe only when the break state flips. */
		if (cur->sdp_need_break != breaking) {
			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
			    boolean_t, breaking);
		}
		cur->sdp_onproc_time = 0;
		cur->sdp_onproc_threads = 0;
		cur->sdp_need_break = breaking;
	}
	mutex_exit(&sysdc_pset_lock);

	/* Pass 2: visit every active thread, one hash bucket at a time. */
	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
		sysdc_list_t		*sdl = &sysdc_active[idx];
		sysdc_t *volatile	*headp = &sdl->sdl_list;
		sysdc_t			*head, *tail;
		sysdc_t			**prevptr;

		/* An empty bucket is skipped without taking its lock. */
		if (*headp == &sysdc_dummy)
			continue;

		/* Prevent any threads from exiting while we're poking them. */
		mutex_enter(&sdl->sdl_lock);

		/*
		 * Each sdl_list contains a singly-linked list of active
		 * threads. Threads which become active while we are
		 * processing the list will be added to sdl_list.  Since we
		 * don't want that to interfere with our own processing, we
		 * swap in an empty list.  Any newly active threads will
		 * go on to this empty list.  When finished, we'll put any
		 * such threads at the end of the processed list.
		 */
		head = atomic_swap_ptr(headp, &sysdc_dummy);

		/*
		 * prevptr always points at the link (initially 'head'
		 * itself) which refers to the entry being examined, so an
		 * entry is unlinked by overwriting *prevptr and kept by
		 * advancing prevptr to its sdc_next.
		 */
		prevptr = &head;
		while (*prevptr != &sysdc_dummy) {
			sysdc_t		*const	sdc = *prevptr;
			kthread_t	*const	t = sdc->sdc_thread;

			/*
			 * If the thread has exited, move its sysdc_t onto
			 * freelist, to be freed later.
			 */
			if (t == NULL) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_exited);
				sdc->sdc_next = freelist;
				freelist = sdc;
				continue;
			}

			thread_lock(t);
			/*
			 * The thread is no longer in the SDC class: leave
			 * its entry on the list but don't touch it.
			 */
			if (t->t_cid != sysdccid) {
				thread_unlock(t);
				prevptr = &sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_not_sdc);
				continue;
			}
			ASSERT(t->t_cldata == sdc);

			/*
			 * If the thread has been sleeping for longer
			 * than sysdc_prune_interval, make it inactive by
			 * removing it from the list.
			 *
			 * NOTE(review): sdc_sleep_updates holds the value
			 * sdc_nupdates had when sysdc_sleep() ran, so the
			 * subtraction below is "sleep - current" rather
			 * than "current - sleep"; confirm the intended
			 * operand order against the field types in
			 * sysdc_impl.h.
			 */
			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
			    sdc->sdc_sleep_updates != 0 &&
			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
			    sysdc_prune_updates) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_idle);
				/* NULL sdc_next marks it as inactive. */
				sdc->sdc_next = NULL;
				thread_unlock(t);
				continue;
			}
			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
			thread_unlock(t);

			prevptr = &sdc->sdc_next;
		}

		/*
		 * Add our list to the bucket, putting any new entries
		 * added while we were working at the tail of the list.
		 *
		 * At this point *prevptr is the terminating &sysdc_dummy
		 * link of the processed list (or 'head' itself if nothing
		 * survived).  It is pointed at whatever has accumulated in
		 * the bucket, and 'head' is then installed as the bucket
		 * head; the compare-and-swap is retried if another entry
		 * was pushed in between.
		 */
		do {
			tail = *headp;
			*prevptr = tail;
		} while (atomic_cas_ptr(headp, tail, head) != tail);

		mutex_exit(&sdl->sdl_lock);
	}

	/*
	 * Pass 3: compute each pset's duty cycle for the interval which
	 * just ended and decide whether it should take a break.
	 */
	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {

		/* CPU time available to the pset during the interval. */
		cur->sdp_vtime_last_interval =
		    diff * cur->sdp_cpupart->cp_ncpus;
		cur->sdp_DC_last_interval =
		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
		    cur->sdp_vtime_last_interval;

		/*
		 * The limit is only evaluated when neither a break nor the
		 * nobreak period which follows it is still counting down.
		 */
		if (cur->sdp_should_break > 0) {
			cur->sdp_should_break--;	/* breaking */
			continue;
		}
		if (cur->sdp_dont_break > 0) {
			cur->sdp_dont_break--;	/* waiting before checking */
			continue;
		}
		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
			cur->sdp_should_break = sysdc_break_updates;
			cur->sdp_dont_break = sysdc_nobreak_updates;
			SYSDC_INC_STAT(sysdc_update_take_break);
		}
	}

	/*
	 * If there are no sysdc_psets, there can be no threads, so
	 * we can stop doing our timeout.  Since we're holding the
	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
	 * prevent anyone from racing with this and dropping our timeout
	 * on the floor.
	 */
	if (list_is_empty(&sysdc_psets)) {
		SYSDC_INC_STAT(sysdc_update_no_psets);
		ASSERT(sysdc_update_timeout_started);
		sysdc_update_timeout_started = 0;

		redeploy = 0;
	}
	mutex_exit(&sysdc_pset_lock);

	/*
	 * Free the sysdc_t's of exited threads, now that no locks are
	 * held.  (This 'cur' shadows the sysdc_pset_t pointer above.)
	 */
	while (freelist != NULL) {
		sysdc_t *cur = freelist;
		freelist = cur->sdc_next;
		kmem_free(cur, sizeof (*cur));
	}

	/* Schedule the next update unless the last pset went away. */
	if (redeploy) {
		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
	}
}
861 
862 static void
863 sysdc_preempt(kthread_t *t)
864 {
865 	ASSERT(t == curthread);
866 	ASSERT(THREAD_LOCK_HELD(t));
867 
868 	setbackdq(t);		/* give others a chance to run */
869 }
870 
871 static void
872 sysdc_tick(kthread_t *t)
873 {
874 	sysdc_t *sdc;
875 
876 	thread_lock(t);
877 	if (t->t_cid != sysdccid) {
878 		SYSDC_INC_STAT(sysdc_tick_not_sdc);
879 		thread_unlock(t);
880 		return;
881 	}
882 	sdc = t->t_cldata;
883 	if (t->t_state == TS_ONPROC &&
884 	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
885 		cpu_surrender(t);
886 	}
887 
888 	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
889 		ASSERT(sdc->sdc_sleep_updates == 0);
890 	}
891 
892 	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
893 	sdc->sdc_ticks++;
894 	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
895 		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
896 		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
897 		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
898 	}
899 	thread_unlock(t);
900 }
901 
902 static void
903 sysdc_setrun(kthread_t *t)
904 {
905 	sysdc_t *sdc = t->t_cldata;
906 
907 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
908 
909 	sdc->sdc_sleep_updates = 0;
910 
911 	if (sdc->sdc_next == NULL) {
912 		/*
913 		 * Since we're in transition, we don't want to use the
914 		 * full thread_update_pri().
915 		 */
916 		if (sysdc_compute_pri(sdc, 0)) {
917 			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
918 		}
919 		sysdc_activate(sdc);
920 
921 		ASSERT(sdc->sdc_next != NULL);
922 	}
923 
924 	setbackdq(t);
925 }
926 
927 static void
928 sysdc_wakeup(kthread_t *t)
929 {
930 	sysdc_setrun(t);
931 }
932 
933 static void
934 sysdc_sleep(kthread_t *t)
935 {
936 	sysdc_t *sdc = t->t_cldata;
937 
938 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
939 
940 	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
941 }
942 
943 /*ARGSUSED*/
944 static int
945 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
946     void *bufp)
947 {
948 	cpupart_t *const cpupart = t->t_cpupart;
949 	sysdc_t *sdc = bufp;
950 	sysdc_params_t *sdpp = parmsp;
951 	sysdc_pset_t *newpset = sdc->sdc_pset;
952 	sysdc_pset_t *pset;
953 	int start_timeout;
954 
955 	if (t->t_cid != syscid)
956 		return (EPERM);
957 
958 	ASSERT(ttolwp(t) != NULL);
959 	ASSERT(sdpp != NULL);
960 	ASSERT(newpset != NULL);
961 	ASSERT(sysdc_param_init);
962 
963 	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
964 	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
965 	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
966 	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
967 
968 	sdc->sdc_thread = t;
969 	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
970 	sdc->sdc_minpri = sdpp->sdp_minpri;
971 	sdc->sdc_maxpri = sdpp->sdp_maxpri;
972 	sdc->sdc_target_DC = sdpp->sdp_DC;
973 	sdc->sdc_ticks = 0;
974 	sdc->sdc_update_ticks = sysdc_update_ticks + 1;
975 
976 	/* Assign ourselves to the appropriate pset. */
977 	sdc->sdc_pset = NULL;
978 	mutex_enter(&sysdc_pset_lock);
979 	for (pset = list_head(&sysdc_psets); pset != NULL;
980 	    pset = list_next(&sysdc_psets, pset)) {
981 		if (pset->sdp_cpupart == cpupart) {
982 			break;
983 		}
984 	}
985 	if (pset == NULL) {
986 		pset = newpset;
987 		newpset = NULL;
988 		pset->sdp_cpupart = cpupart;
989 		list_insert_tail(&sysdc_psets, pset);
990 	}
991 	pset->sdp_nthreads++;
992 	ASSERT(pset->sdp_nthreads > 0);
993 
994 	sdc->sdc_pset = pset;
995 
996 	start_timeout = (sysdc_update_timeout_started == 0);
997 	sysdc_update_timeout_started = 1;
998 	mutex_exit(&sysdc_pset_lock);
999 
1000 	if (newpset != NULL)
1001 		kmem_free(newpset, sizeof (*newpset));
1002 
1003 	/* Update t's scheduling class and priority. */
1004 	thread_lock(t);
1005 	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1006 	t->t_cid = cid;
1007 	t->t_cldata = sdc;
1008 	t->t_schedflag |= TS_RUNQMATCH;
1009 
1010 	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1011 	thread_unlock(t);
1012 
1013 	/* Kick off the thread timeout if we're the first one in. */
1014 	if (start_timeout) {
1015 		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1016 	}
1017 
1018 	return (0);
1019 }
1020 
1021 static void
1022 sysdc_leave(sysdc_t *sdc)
1023 {
1024 	sysdc_pset_t *sdp = sdc->sdc_pset;
1025 	sysdc_list_t *sdl = SYSDC_LIST(sdc);
1026 	uint_t freedc;
1027 
1028 	mutex_enter(&sdl->sdl_lock);		/* block sysdc_update() */
1029 	sdc->sdc_thread = NULL;
1030 	freedc = (sdc->sdc_next == NULL);
1031 	mutex_exit(&sdl->sdl_lock);
1032 
1033 	mutex_enter(&sysdc_pset_lock);
1034 	ASSERT(sdp != NULL);
1035 	ASSERT(sdp->sdp_nthreads > 0);
1036 	--sdp->sdp_nthreads;
1037 	if (sdp->sdp_nthreads == 0) {
1038 		list_remove(&sysdc_psets, sdp);
1039 	} else {
1040 		sdp = NULL;
1041 	}
1042 	mutex_exit(&sysdc_pset_lock);
1043 
1044 	if (freedc)
1045 		kmem_free(sdc, sizeof (*sdc));
1046 	if (sdp != NULL)
1047 		kmem_free(sdp, sizeof (*sdp));
1048 }
1049 
1050 static void
1051 sysdc_exitclass(void *buf)
1052 {
1053 	sysdc_leave((sysdc_t *)buf);
1054 }
1055 
1056 /*ARGSUSED*/
1057 static int
1058 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1059 {
1060 	/* Threads cannot exit SDC once joined, except in a body bag. */
1061 	return (EPERM);
1062 }
1063 
1064 static void
1065 sysdc_exit(kthread_t *t)
1066 {
1067 	sysdc_t *sdc;
1068 
1069 	/* We're exiting, so we just rejoin the SYS class. */
1070 	thread_lock(t);
1071 	ASSERT(t->t_cid == sysdccid);
1072 	sdc = t->t_cldata;
1073 	t->t_cid = syscid;
1074 	t->t_cldata = NULL;
1075 	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1076 	(void) thread_change_pri(t, maxclsyspri, 0);
1077 	t->t_schedflag &= ~TS_RUNQMATCH;
1078 	thread_unlock_nopreempt(t);
1079 
1080 	/* Unlink the sdc from everything. */
1081 	sysdc_leave(sdc);
1082 }
1083 
1084 /*ARGSUSED*/
1085 static int
1086 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1087 {
1088 	/*
1089 	 * Threads cannot be created with SDC as their class; they must
1090 	 * be created as SYS and then added with sysdc_thread_enter().
1091 	 * Because of this restriction, sysdc_fork() should never be called.
1092 	 */
1093 	panic("sysdc cannot be forked");
1094 
1095 	return (ENOSYS);
1096 }
1097 
1098 /*ARGSUSED*/
1099 static void
1100 sysdc_forkret(kthread_t *t, kthread_t *ct)
1101 {
1102 	/* SDC threads are part of system processes, which never fork. */
1103 	panic("sysdc cannot be forked");
1104 }
1105 
1106 static pri_t
1107 sysdc_globpri(kthread_t *t)
1108 {
1109 	return (t->t_epri);
1110 }
1111 
1112 /*ARGSUSED*/
1113 static pri_t
1114 sysdc_no_swap(kthread_t *t, int flags)
1115 {
1116 	/* SDC threads cannot be swapped. */
1117 	return (-1);
1118 }
1119 
1120 /*
1121  * Get maximum and minimum priorities enjoyed by SDC threads.
1122  */
1123 static int
1124 sysdc_getclpri(pcpri_t *pcprip)
1125 {
1126 	pcprip->pc_clpmax = sysdc_maxpri;
1127 	pcprip->pc_clpmin = sysdc_minpri;
1128 	return (0);
1129 }
1130 
/*
 * Class-info query.  SDC has no class-specific information to report, so
 * the argument is ignored and 0 is returned.
 */
/*ARGSUSED*/
static int
sysdc_getclinfo(void *arg)
{
	return (0);
}
1137 
1138 /*ARGSUSED*/
1139 static int
1140 sysdc_alloc(void **p, int flag)
1141 {
1142 	sysdc_t *new;
1143 
1144 	*p = NULL;
1145 	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1146 		return (ENOMEM);
1147 	}
1148 	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1149 	    NULL) {
1150 		kmem_free(new, sizeof (*new));
1151 		return (ENOMEM);
1152 	}
1153 	*p = new;
1154 	return (0);
1155 }
1156 
1157 static void
1158 sysdc_free(void *p)
1159 {
1160 	sysdc_t *sdc = p;
1161 
1162 	if (sdc != NULL) {
1163 		/*
1164 		 * We must have failed CL_ENTERCLASS(), so our pset should be
1165 		 * there and unused.
1166 		 */
1167 		ASSERT(sdc->sdc_pset != NULL);
1168 		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1169 		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1170 		kmem_free(sdc, sizeof (*sdc));
1171 	}
1172 }
1173 
/*
 * Forward declarations of the stub handlers used to fill unsupported
 * slots in sysdc_classfuncs below.  They are deliberately declared with
 * empty (K&R-style) parameter lists; each stub is stored in several
 * slots, presumably of differing function-pointer types.
 */
static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();
1177 
/*
 * The SDC class operations table, handed back to the caller of
 * sysdc_init() through *clfuncspp.  Entries are positional.  Operations
 * that SDC does not implement are filled with one of the stubs:
 * sysdc_enosys (returns ENOSYS), sysdc_einval (returns EINVAL) or
 * sysdc_nullsys (does nothing); the trailing comment on each stub entry
 * names the operation that slot stands for.
 */
static struct classfuncs sysdc_classfuncs = {
	/* messages to class manager */
	{
		sysdc_enosys,	/* admin */
		sysdc_getclinfo,
		sysdc_enosys,	/* parmsin */
		sysdc_enosys,	/* parmsout */
		sysdc_enosys,	/* vaparmsin */
		sysdc_enosys,	/* vaparmsout */
		sysdc_getclpri,
		sysdc_alloc,
		sysdc_free,
	},
	/* operations on threads */
	{
		sysdc_enterclass,
		sysdc_exitclass,
		sysdc_canexit,
		sysdc_fork,
		sysdc_forkret,
		sysdc_nullsys,	/* parmsget */
		sysdc_enosys,	/* parmsset */
		sysdc_nullsys,	/* stop */
		sysdc_exit,
		sysdc_nullsys,	/* active */
		sysdc_nullsys,	/* inactive */
		sysdc_no_swap,	/* swapin */
		sysdc_no_swap,	/* swapout */
		sysdc_nullsys,	/* trapret */
		sysdc_preempt,
		sysdc_setrun,
		sysdc_sleep,
		sysdc_tick,
		sysdc_wakeup,
		sysdc_einval,	/* donice */
		sysdc_globpri,
		sysdc_nullsys,	/* set_process_group */
		sysdc_nullsys,	/* yield */
		sysdc_einval,	/* doprio */
	}
};
1219 
/*
 * Stub for class operations SDC does not support: takes whatever
 * arguments the slot is called with, ignores them, and fails with ENOSYS.
 */
static int
sysdc_enosys()
{
	return (ENOSYS);
}
1225 
/*
 * Stub for class operations whose request is never valid for SDC:
 * ignores its arguments and fails with EINVAL.
 */
static int
sysdc_einval()
{
	return (EINVAL);
}
1231 
/*
 * Stub for class operations that need no action in SDC: does nothing.
 */
static void
sysdc_nullsys()
{
}
1236 
1237 /*ARGSUSED*/
1238 static pri_t
1239 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1240 {
1241 	int idx;
1242 
1243 	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1244 	    offsetof(sysdc_pset_t, sdp_node));
1245 
1246 	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1247 		sysdc_active[idx].sdl_list = &sysdc_dummy;
1248 	}
1249 
1250 	sysdc_initparam();
1251 
1252 	sysdccid = cid;
1253 	*clfuncspp = &sysdc_classfuncs;
1254 
1255 	return ((pri_t)v.v_maxsyspri);
1256 }
1257 
/*
 * Scheduling-class description for SDC, referenced from modlsched below:
 * the class name followed by the class initialization routine.
 */
static struct sclass csw = {
	"SDC",
	sysdc_init,
	0
};
1263 
/*
 * Module linkage element describing this module as a scheduling class;
 * it ties the description string to the csw structure above.
 */
static struct modlsched modlsched = {
	&mod_schedops, "system duty cycle scheduling class", &csw
};
1267 
/*
 * Top-level module linkage, passed to mod_install() and mod_info() by
 * _init() and _info().  Its NULL-terminated list holds just modlsched.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
1271 
1272 int
1273 _init()
1274 {
1275 	return (mod_install(&modlinkage));
1276 }
1277 
/*
 * Module unload entry point.  Unloading is not supported for now, so the
 * request is always refused with EBUSY.
 */
int
_fini()
{
	return (EBUSY);
}
1283 
1284 int
1285 _info(struct modinfo *modinfop)
1286 {
1287 	return (mod_info(&modlinkage, modinfop));
1288 }
1289 
1290 /* --- consolidation-private interfaces --- */
1291 void
1292 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1293 {
1294 	void *buf = NULL;
1295 	sysdc_params_t sdp;
1296 
1297 	SYSDC_INC_STAT(sysdc_thread_enter_enter);
1298 
1299 	ASSERT(sysdc_param_init);
1300 	ASSERT(sysdccid >= 0);
1301 
1302 	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1303 
1304 	sdp.sdp_minpri = sysdc_minpri;
1305 	sdp.sdp_maxpri = sysdc_maxpri;
1306 	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1307 
1308 	VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
1309 
1310 	ASSERT(t->t_lwp != NULL);
1311 	ASSERT(t->t_cid == syscid);
1312 	ASSERT(t->t_cldata == NULL);
1313 	VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
1314 	VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
1315 	CL_EXITCLASS(syscid, NULL);
1316 }
1317