xref: /illumos-gate/usr/src/uts/common/disp/sysdc.c (revision e7afc443cb8c2e0a379fe48b15a0c7fb61a4b2fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * The System Duty Cycle (SDC) scheduling class
28  * --------------------------------------------
29  *
30  * Background
31  *
32  * Kernel threads in Solaris have traditionally not been large consumers
33  * of CPU time.  They typically wake up, perform a small amount of
34  * work, then go back to sleep waiting for either a timeout or another
35  * signal.  On the assumption that the small amount of work that they do
36  * is important for the behavior of the whole system, these threads are
37  * treated kindly by the dispatcher and the SYS scheduling class: they run
38  * without preemption from anything other than real-time and interrupt
39  * threads; when preempted, they are put at the front of the queue, so they
40  * generally do not migrate between CPUs; and they are allowed to stay
41  * running until they voluntarily give up the CPU.
42  *
43  * As Solaris has evolved, new workloads have emerged which require the
44  * kernel to perform significant amounts of CPU-intensive work.  One
45  * example of such a workload is ZFS's transaction group sync processing.
46  * Each sync operation generates a large batch of I/Os, and each I/O
47  * may need to be compressed and/or checksummed before it is written to
48  * storage.  The taskq threads which perform the compression and checksums
49  * will run nonstop as long as they have work to do; a large sync operation
50  * on a compression-heavy dataset can keep them busy for seconds on end.
51  * This causes human-time-scale dispatch latency bubbles for any other
52  * threads which have the misfortune to share a CPU with the taskq threads.
53  *
54  * The SDC scheduling class is a solution to this problem.
55  *
56  *
57  * Overview
58  *
59  * SDC is centered around the concept of a thread's duty cycle (DC):
60  *
61  *			      ONPROC time
62  *	Duty Cycle =	----------------------
63  *			ONPROC + Runnable time
64  *
65  * This is the ratio of the time that the thread spent running on a CPU
66  * divided by the time it spent running or trying to run.  It is unaffected
67  * by any time the thread spent sleeping, stopped, etc.
68  *
69  * A thread joining the SDC class specifies a "target" DC that it wants
70  * to run at.  To implement this policy, the routine sysdc_update() scans
71  * the list of active SDC threads every few ticks and uses each thread's
72  * microstate data to compute the actual duty cycle that that thread
73  * has experienced recently.  If the thread is under its target DC, its
74  * priority is increased to the maximum available (sysdc_maxpri, which is
75  * 99 by default).  If the thread is over its target DC, its priority is
76  * reduced to the minimum available (sysdc_minpri, 0 by default).  This
77  * is a fairly primitive approach, in that it doesn't use any of the
78  * intermediate priorities, but it's not completely inappropriate.  Even
79  * though threads in the SDC class might take a while to do their job, they
80  * are by some definition important if they're running inside the kernel,
81  * so it is reasonable that they should get to run at priority 99.
82  *
83  * If a thread is running when sysdc_update() calculates its actual duty
84  * cycle, and there are other threads of equal or greater priority on its
85  * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
86  * acknowledges the preemption by calling sysdc_preempt(), which calls
87  * setbackdq(), which gives other threads with the same priority a chance
88  * to run.  This creates a de facto time quantum for threads in the SDC
89  * scheduling class.
90  *
91  * An SDC thread which is assigned priority 0 can continue to run if
92  * nothing else needs to use the CPU that it's running on.  Similarly, an
93  * SDC thread at priority 99 might not get to run as much as it wants to
94  * if there are other priority-99 or higher threads on its CPU.  These
95  * situations would cause the thread to get ahead of or behind its target
96  * DC; the longer the situations lasted, the further ahead or behind the
97  * thread would get.  Rather than condemning a thread to a lifetime of
98  * paying for its youthful indiscretions, SDC keeps "base" values for
99  * ONPROC and Runnable times in each thread's sysdc data, and updates these
100  * values periodically.  The duty cycle is then computed using the elapsed
101  * amount of ONPROC and Runnable times since those base times.
102  *
103  * Since sysdc_update() scans SDC threads fairly frequently, it tries to
104  * keep the list of "active" threads small by pruning out threads which
105  * have been asleep for a brief time.  They are not pruned immediately upon
106  * going to sleep, since some threads may bounce back and forth between
107  * sleeping and being runnable.
108  *
109  *
110  * Interfaces
111  *
112  * void sysdc_thread_enter(t, dc, flags)
113  *
114  *	Moves a kernel thread from the SYS scheduling class to the
115  *	SDC class. t must have an associated LWP (created by calling
116  *	lwp_kernel_create()).  The thread will have a target DC of dc.
117  *	Flags should be either 0 or SYSDC_THREAD_BATCH.  If
118  *	SYSDC_THREAD_BATCH is specified, the thread is expected to be
119  *	doing large amounts of processing.
120  *
121  *
122  * Complications
123  *
124  * - Run queue balancing
125  *
126  *	The Solaris dispatcher is biased towards letting a thread run
127  *	on the same CPU which it last ran on, if no more than 3 ticks
128  *	(i.e. rechoose_interval) have passed since the thread last ran.
129  *	This helps to preserve cache warmth.  On the other hand, it also
130  *	tries to keep the per-CPU run queues fairly balanced; if the CPU
131  *	chosen for a runnable thread has a run queue which is three or
132  *	more threads longer than a neighboring CPU's queue, the runnable
133  *	thread is dispatched onto the neighboring CPU instead.
134  *
135  *	These policies work well for some workloads, but not for many SDC
136  *	threads.  The taskq client of SDC, for example, has many discrete
137  *	units of work to do.  The work units are largely independent, so
138  *	cache warmth is not an important consideration.  It is important
139  *	that the threads fan out quickly to different CPUs, since the
140  *	amount of work these threads have to do (a few seconds worth at a
141  *	time) doesn't leave much time to correct thread placement errors
142  *	(i.e. two SDC threads being dispatched to the same CPU).
143  *
144  *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
145  *	This tells the dispatcher to keep neighboring run queues' lengths
146  *	more evenly matched, which allows SDC threads to migrate more
147  *	easily.
148  *
149  * - LWPs and system processes
150  *
151  *	SDC can only be used for kernel threads.  Since SDC uses microstate
152  *	accounting data to compute each thread's actual duty cycle, all
153  *	threads entering the SDC class must have associated LWPs (which
154  *	store the microstate data).  This means that the threads have to
155  *	be associated with an SSYS process, i.e. one created by newproc().
156  *	If the microstate accounting information is ever moved into the
157  *	kthread_t, this restriction could be lifted.
158  *
159  * - Dealing with oversubscription
160  *
161  *	Since SDC duty cycles are per-thread, it is possible that the
162  *	aggregate requested duty cycle of all SDC threads in a processor
163  *	set could be greater than the total CPU time available in that set.
164  *	The FSS scheduling class has an analogous situation, which it deals
165  *	with by reducing each thread's allotted CPU time proportionally.
166  *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
167  *	solution to the oversubscription problem.
168  *
169  *	sysdc_update() accumulates the amount of time that max-priority SDC
170  *	threads have spent on-CPU in each processor set, and uses that sum
171  *	to create an implied duty cycle for that processor set:
172  *
173  *				accumulated CPU time
174  *	   pset DC =	-----------------------------------
175  *			 (# CPUs) * time since last update
176  *
177  *	If this implied duty cycle is above a maximum pset duty cycle (90%
178  *	by default), sysdc_update() sets the priority of all SDC threads
179  *	in that processor set to sysdc_minpri for a "break" period.  After
180  *	the break period, it waits for a "nobreak" period before trying to
181  *	enforce the pset duty cycle limit again.
182  *
183  * - Processor sets
184  *
185  *	As the above implies, SDC is processor set aware, but it does not
186  *	currently allow threads to change processor sets while in the SDC
187  *	class.  Instead, those threads must join the desired processor set
188  *	before entering SDC. [1]
189  *
190  * - Batch threads
191  *
192  *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
193  *	flag.  This flag currently has no effect, but marks threads which
194  *	do bulk processing.
195  *
196  * - Why not FSS?
197  *
198  *	It might seem that the existing FSS scheduling class could solve
199  *	the problems that SDC is attempting to solve.  FSS's more precise
200  *	solution to the oversubscription problem would hardly cause
201  *	trouble, as long as it performed well.  SDC is implemented as
202  *	a separate scheduling class for two main reasons: the initial
203  *	consumer of SDC does not map well onto the "project" abstraction
204  *	that is central to FSS, and FSS does not expect to run at kernel
205  *	priorities.
206  *
207  *
208  * Tunables
209  *
210  * - sysdc_update_interval_msec:  Number of milliseconds between
211  *	consecutive thread priority updates.
212  *
213  * - sysdc_reset_interval_msec:  Number of milliseconds between
214  *	consecutive resets of a thread's base ONPROC and Runnable
215  *	times.
216  *
217  * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
218  *	before a thread is pruned from the active list.
219  *
220  * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
221  *	CPU time which SDC can give to its high-priority threads.
222  *
223  * - sysdc_break_msec:  Number of milliseconds of "break" taken when
224  *	sysdc_max_pset_DC is exceeded.
225  *
226  *
227  * Future work (in SDC and related subsystems)
228  *
229  * - Per-thread rechoose interval (0 for SDC)
230  *
231  *	Allow each thread to specify its own rechoose interval.  SDC
232  *	threads would specify an interval of zero, which would rechoose
233  *	the CPU with the lowest priority once per update.
234  *
235  * - Allow threads to change processor sets after joining the SDC class
236  *
237  * - Thread groups and per-group DC
238  *
239  *	It might be nice to be able to specify a duty cycle which applies
240  *	to a group of threads in aggregate.
241  *
242  * - Per-group DC callback to allow dynamic DC tuning
243  *
244  *	Currently, DCs are assigned when the thread joins SDC.  Some
245  *	workloads could benefit from being able to tune their DC using
246  *	subsystem-specific knowledge about the workload.
247  *
248  * - Finer-grained priority updates
249  *
250  * - More nuanced management of oversubscription
251  *
252  * - Moving other CPU-intensive threads into SDC
253  *
254  * - Move msacct data into kthread_t
255  *
256  *	This would allow kernel threads without LWPs to join SDC.
257  *
258  *
259  * Footnotes
260  *
261  * [1] The details of doing so are left as an exercise for the reader.
262  */
263 
264 #include <sys/types.h>
265 #include <sys/sysdc.h>
266 #include <sys/sysdc_impl.h>
267 
268 #include <sys/class.h>
269 #include <sys/cmn_err.h>
270 #include <sys/cpuvar.h>
271 #include <sys/cpupart.h>
272 #include <sys/debug.h>
273 #include <sys/disp.h>
274 #include <sys/errno.h>
275 #include <sys/inline.h>
276 #include <sys/kmem.h>
277 #include <sys/modctl.h>
278 #include <sys/schedctl.h>
279 #include <sys/sdt.h>
280 #include <sys/sunddi.h>
281 #include <sys/sysmacros.h>
282 #include <sys/systm.h>
283 #include <sys/var.h>
284 
285 /*
286  * Tunables - loaded into the internal state at module load time
287  */
288 uint_t		sysdc_update_interval_msec = 20;
289 uint_t		sysdc_reset_interval_msec = 400;
290 uint_t		sysdc_prune_interval_msec = 100;
291 uint_t		sysdc_max_pset_DC = 90;
292 uint_t		sysdc_break_msec = 80;
293 
294 /*
295  * Internal state - constants set up by sysdc_initparam()
296  */
297 static clock_t	sysdc_update_ticks;	/* ticks between updates */
298 static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
299 static uint_t	sysdc_reset_updates;	/* # of updates before reset */
300 static uint_t	sysdc_break_updates;	/* updates to break */
301 static uint_t	sysdc_nobreak_updates;	/* updates to not check */
302 static uint_t	sysdc_minDC;		/* minimum allowed DC */
303 static uint_t	sysdc_maxDC;		/* maximum allowed DC */
304 static pri_t	sysdc_minpri;		/* minimum allowed priority */
305 static pri_t	sysdc_maxpri;		/* maximum allowed priority */
306 
307 /*
308  * Internal state
309  */
310 static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
311 static list_t	sysdc_psets;		/* list of psets with SDC threads */
312 static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
313 static uint_t	sysdc_update_timeout_started; /* update timeout is active */
314 static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
315 static sysdc_t	sysdc_dummy;		/* used to terminate active lists */
316 
317 /*
318  * Internal state - active hash table
319  */
320 #define	SYSDC_NLISTS	8
321 #define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
322 static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
323 #define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])
324 
#ifdef DEBUG
/*
 * Event counters, only present in DEBUG builds.  Each field is named after
 * the routine which bumps it followed by the event being counted.
 */
static struct {
	/* bumped by sysdc_update_times() */
	uint64_t	sysdc_update_times_asleep;
	uint64_t	sysdc_update_times_base_ran_backwards;
	uint64_t	sysdc_update_times_already_done;
	uint64_t	sysdc_update_times_cur_ran_backwards;

	/* bumped by sysdc_compute_pri() and sysdc_activate() */
	uint64_t	sysdc_compute_pri_breaking;
	uint64_t	sysdc_activate_enter;

	/* bumped by sysdc_update() */
	uint64_t	sysdc_update_enter;
	uint64_t	sysdc_update_exited;
	uint64_t	sysdc_update_not_sdc;
	uint64_t	sysdc_update_idle;
	uint64_t	sysdc_update_take_break;
	uint64_t	sysdc_update_no_psets;

	/* bumped by sysdc_tick() */
	uint64_t	sysdc_tick_not_sdc;
	uint64_t	sysdc_tick_quantum_expired;

	uint64_t	sysdc_thread_enter_enter;
} sysdc_stats;

/* Count one occurrence of event x. */
#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
#else
/* Statistics are compiled out of non-DEBUG builds. */
#define	SYSDC_INC_STAT(x)	((void)0)
#endif
348 
349 /* macros are UPPER CASE */
350 #define	HOWMANY(a, b)	howmany((a), (b))
351 #define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)
352 
353 static void
354 sysdc_initparam(void)
355 {
356 	uint_t sysdc_break_ticks;
357 
358 	/* update / prune intervals */
359 	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
360 
361 	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
362 	    sysdc_update_interval_msec);
363 	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
364 	    sysdc_update_interval_msec);
365 
366 	/* We must get at least a little time on CPU. */
367 	sysdc_minDC = 1;
368 	sysdc_maxDC = SYSDC_DC_MAX;
369 	sysdc_minpri = 0;
370 	sysdc_maxpri = maxclsyspri - 1;
371 
372 	/* break parameters */
373 	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
374 		sysdc_max_pset_DC = SYSDC_DC_MAX;
375 	}
376 	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
377 	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
378 
379 	/*
380 	 * We want:
381 	 *
382 	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
383 	 *
384 	 *	==>	  nobreak = sysdc_max_pset_DC * (break + nobreak)
385 	 *
386 	 *			    sysdc_max_pset_DC * break
387 	 *	==>	  nobreak = -------------------------
388 	 *			    1 - sysdc_max_pset_DC
389 	 */
390 	sysdc_nobreak_updates =
391 	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
392 	    (SYSDC_DC_MAX - sysdc_max_pset_DC));
393 
394 	sysdc_param_init = 1;
395 }
396 
397 #undef HOWMANY
398 #undef MSECTOTICKS
399 
400 #define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
401 #define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
402 #define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */
403 
404 /*
405  * Updates the recorded times in the sdc, and returns the elapsed ONPROC
406  * and Runnable times since the last reset.
407  *
408  * newO is the thread's actual ONPROC time; it's used during sysdc_update()
409  * to track processor set usage.
410  */
411 static void
412 sysdc_update_times(sysdc_t *sdc, uint_t flags,
413     hrtime_t *O, hrtime_t *R, hrtime_t *newO)
414 {
415 	kthread_t *const t = sdc->sdc_thread;
416 	const uint_t	initial = (flags & SDC_UPDATE_INITIAL);
417 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
418 	const clock_t	now = ddi_get_lbolt();
419 	uint_t		do_reset;
420 
421 	ASSERT(THREAD_LOCK_HELD(t));
422 
423 	*O = *R = 0;
424 
425 	/* If we've been sleeping, we know we haven't had any ONPROC time. */
426 	if (sdc->sdc_sleep_updates != 0 &&
427 	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
428 		*newO = sdc->sdc_last_base_O;
429 		SYSDC_INC_STAT(sysdc_update_times_asleep);
430 		return;
431 	}
432 
433 	/*
434 	 * If this is our first update, or we've hit the reset point,
435 	 * we need to reset our base_{O,R}.  Once we've updated them, we
436 	 * report O and R for the entire prior interval.
437 	 */
438 	do_reset = initial;
439 	if (update) {
440 		++sdc->sdc_nupdates;
441 		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
442 			do_reset = 1;
443 	}
444 	if (do_reset) {
445 		hrtime_t baseO, baseR;
446 		if (initial) {
447 			/*
448 			 * Start off our cycle count somewhere in the middle,
449 			 * to keep the resets from all happening at once.
450 			 *
451 			 * 4999 is a handy prime much larger than
452 			 * sysdc_reset_updates, so that we don't run into
453 			 * trouble if the resolution is a multiple of
454 			 * sysdc_reset_updates.
455 			 */
456 			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
457 			    sysdc_reset_updates);
458 			baseO = baseR = 0;
459 		} else {
460 			baseO = sdc->sdc_base_O;
461 			baseR = sdc->sdc_base_R;
462 		}
463 
464 		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
465 		*newO = sdc->sdc_base_O;
466 
467 		sdc->sdc_reset = now;
468 		sdc->sdc_pri_check = -1; /* force mismatch below */
469 
470 		/*
471 		 * See below for rationale.
472 		 */
473 		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
474 			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
475 			baseO = sdc->sdc_base_O;
476 			baseR = sdc->sdc_base_R;
477 		}
478 
479 		/* compute based on the entire interval */
480 		*O = (sdc->sdc_base_O - baseO);
481 		*R = (sdc->sdc_base_R - baseR);
482 		return;
483 	}
484 
485 	/*
486 	 * If we're called from sysdc_update(), we *must* return a value
487 	 * for newO, so we always call mstate_systhread_times().
488 	 *
489 	 * Otherwise, if we've already done a pri check this tick,
490 	 * we can skip it.
491 	 */
492 	if (!update && sdc->sdc_pri_check == now) {
493 		SYSDC_INC_STAT(sysdc_update_times_already_done);
494 		return;
495 	}
496 
497 	/* Get the current times from the thread */
498 	sdc->sdc_pri_check = now;
499 	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
500 	*newO = sdc->sdc_cur_O;
501 
502 	/*
503 	 * The updating of microstate accounting is not done under a
504 	 * consistent set of locks, particularly the t_waitrq field.  This
505 	 * can lead to narrow windows in which we account for time in the
506 	 * wrong bucket, which on the next read will be accounted for
507 	 * correctly.
508 	 *
509 	 * If our sdc_base_* fields were affected by one of these blips, we
510 	 * throw away the old data, and pretend this tick didn't happen.
511 	 */
512 	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
513 	    sdc->sdc_cur_R < sdc->sdc_base_R) {
514 
515 		sdc->sdc_base_O = sdc->sdc_cur_O;
516 		sdc->sdc_base_R = sdc->sdc_cur_R;
517 
518 		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
519 		return;
520 	}
521 
522 	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
523 	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
524 }
525 
526 /*
527  * sysdc_compute_pri()
528  *
529  *	Recomputes the priority of the thread, leaving the result in
530  *	sdc->sdc_epri.  Returns 1 if a priority update should occur
531  *	(which will also trigger a cpu_surrender()), otherwise
532  *	returns 0.
533  */
534 static uint_t
535 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
536 {
537 	kthread_t *const t = sdc->sdc_thread;
538 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
539 	const uint_t	tick = (flags & SDC_UPDATE_TICK);
540 
541 	hrtime_t	O, R;
542 	hrtime_t	newO = -1;
543 
544 	ASSERT(THREAD_LOCK_HELD(t));
545 
546 	sysdc_update_times(sdc, flags, &O, &R, &newO);
547 	ASSERT(!update || newO != -1);
548 
549 	/* If we have new data, recompute our priority. */
550 	if ((O + R) != 0) {
551 		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
552 
553 		/* Adjust our priority to move our DC closer to the target. */
554 		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
555 			sdc->sdc_pri = sdc->sdc_maxpri;
556 		else
557 			sdc->sdc_pri = sdc->sdc_minpri;
558 	}
559 
560 	/*
561 	 * If our per-pset duty cycle goes over the max, we will take a break.
562 	 * This forces all sysdc threads in the pset to minimum priority, in
563 	 * order to let everyone else have a chance at the CPU.
564 	 */
565 	if (sdc->sdc_pset->sdp_need_break) {
566 		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
567 		sdc->sdc_epri = sdc->sdc_minpri;
568 	} else {
569 		sdc->sdc_epri = sdc->sdc_pri;
570 	}
571 
572 	DTRACE_PROBE4(sysdc__compute__pri,
573 	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
574 	    uint_t, sdc->sdc_target_DC);
575 
576 	/*
577 	 * For sysdc_update(), we compute the ONPROC time for high-priority
578 	 * threads, which is used to calculate the per-pset duty cycle.  We
579 	 * will always tell our callers to update the thread's priority,
580 	 * since we want to force a cpu_surrender().
581 	 *
582 	 * We reset sdc_update_ticks so that sysdc_tick() will only update
583 	 * the thread's priority if our timeout is delayed by a tick or
584 	 * more.
585 	 */
586 	if (update) {
587 		/* SDC threads are not allowed to change cpupart bindings. */
588 		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
589 
590 		/* If we were at MAXPRI, account for our onproc time. */
591 		if (t->t_pri == sdc->sdc_maxpri &&
592 		    sdc->sdc_last_base_O != 0 &&
593 		    sdc->sdc_last_base_O < newO) {
594 			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
595 			sdc->sdc_pset->sdp_onproc_time +=
596 			    (uint64_t)sdc->sdc_last_O;
597 			sdc->sdc_pset->sdp_onproc_threads++;
598 		} else {
599 			sdc->sdc_last_O = 0;
600 		}
601 		sdc->sdc_last_base_O = newO;
602 
603 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
604 		return (1);
605 	}
606 
607 	/*
608 	 * Like sysdc_update(), sysdc_tick() always wants to update the
609 	 * thread's priority, so that the CPU is surrendered if necessary.
610 	 * We reset sdc_update_ticks so that if the timeout continues to be
611 	 * delayed, we'll update at the regular interval.
612 	 */
613 	if (tick) {
614 		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
615 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
616 		return (1);
617 	}
618 
619 	/*
620 	 * Otherwise, only tell our callers to update the priority if it has
621 	 * changed.
622 	 */
623 	return (sdc->sdc_epri != t->t_pri);
624 }
625 
626 static void
627 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
628 {
629 	kthread_t *t = sdc->sdc_thread;
630 
631 	ASSERT(THREAD_LOCK_HELD(t));
632 
633 	if (sysdc_compute_pri(sdc, flags)) {
634 		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
635 			cpu_surrender(t);
636 		}
637 	}
638 }
639 
640 /*
641  * Add a thread onto the active list.  It will only be removed by
642  * sysdc_update().
643  */
644 static void
645 sysdc_activate(sysdc_t *sdc)
646 {
647 	sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
648 	sysdc_t		*head;
649 	kthread_t	*t = sdc->sdc_thread;
650 
651 	SYSDC_INC_STAT(sysdc_activate_enter);
652 
653 	ASSERT(sdc->sdc_next == NULL);
654 	ASSERT(THREAD_LOCK_HELD(t));
655 
656 	do {
657 		head = *headp;
658 		sdc->sdc_next = head;
659 	} while (atomic_cas_ptr(headp, head, sdc) != head);
660 }
661 
662 /*
663  * sysdc_update() has two jobs:
664  *
665  *	1. It updates the priorities of all active SDC threads on the system.
666  *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
667  */
668 static void
669 sysdc_update(void *arg)
670 {
671 	int		idx;
672 	sysdc_t		*freelist = NULL;
673 	sysdc_pset_t	*cur;
674 	hrtime_t	now, diff;
675 	uint_t		redeploy = 1;
676 
677 	SYSDC_INC_STAT(sysdc_update_enter);
678 
679 	ASSERT(sysdc_update_timeout_started);
680 
681 	/*
682 	 * If this is our first time through, diff will be gigantic, and
683 	 * no breaks will be necessary.
684 	 */
685 	now = gethrtime();
686 	diff = now - sysdc_last_update;
687 	sysdc_last_update = now;
688 
689 	mutex_enter(&sysdc_pset_lock);
690 	for (cur = list_head(&sysdc_psets); cur != NULL;
691 	    cur = list_next(&sysdc_psets, cur)) {
692 		boolean_t breaking = (cur->sdp_should_break != 0);
693 
694 		if (cur->sdp_need_break != breaking) {
695 			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
696 			    boolean_t, breaking);
697 		}
698 		cur->sdp_onproc_time = 0;
699 		cur->sdp_onproc_threads = 0;
700 		cur->sdp_need_break = breaking;
701 	}
702 	mutex_exit(&sysdc_pset_lock);
703 
704 	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
705 		sysdc_list_t		*sdl = &sysdc_active[idx];
706 		sysdc_t *volatile	*headp = &sdl->sdl_list;
707 		sysdc_t			*head, *tail;
708 		sysdc_t			**prevptr;
709 
710 		if (*headp == &sysdc_dummy)
711 			continue;
712 
713 		/* Prevent any threads from exiting while we're poking them. */
714 		mutex_enter(&sdl->sdl_lock);
715 
716 		/*
717 		 * Each sdl_list contains a singly-linked list of active
718 		 * threads. Threads which become active while we are
719 		 * processing the list will be added to sdl_list.  Since we
720 		 * don't want that to interfere with our own processing, we
721 		 * swap in an empty list.  Any newly active threads will
722 		 * go on to this empty list.  When finished, we'll put any
723 		 * such threads at the end of the processed list.
724 		 */
725 		head = atomic_swap_ptr(headp, &sysdc_dummy);
726 		prevptr = &head;
727 		while (*prevptr != &sysdc_dummy) {
728 			sysdc_t		*const	sdc = *prevptr;
729 			kthread_t	*const	t = sdc->sdc_thread;
730 
731 			/*
732 			 * If the thread has exited, move its sysdc_t onto
733 			 * freelist, to be freed later.
734 			 */
735 			if (t == NULL) {
736 				*prevptr = sdc->sdc_next;
737 				SYSDC_INC_STAT(sysdc_update_exited);
738 				sdc->sdc_next = freelist;
739 				freelist = sdc;
740 				continue;
741 			}
742 
743 			thread_lock(t);
744 			if (t->t_cid != sysdccid) {
745 				thread_unlock(t);
746 				prevptr = &sdc->sdc_next;
747 				SYSDC_INC_STAT(sysdc_update_not_sdc);
748 				continue;
749 			}
750 			ASSERT(t->t_cldata == sdc);
751 
752 			/*
753 			 * If the thread has been sleeping for longer
754 			 * than sysdc_prune_interval, make it inactive by
755 			 * removing it from the list.
756 			 */
757 			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
758 			    sdc->sdc_sleep_updates != 0 &&
759 			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
760 			    sysdc_prune_updates) {
761 				*prevptr = sdc->sdc_next;
762 				SYSDC_INC_STAT(sysdc_update_idle);
763 				sdc->sdc_next = NULL;
764 				thread_unlock(t);
765 				continue;
766 			}
767 			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
768 			thread_unlock(t);
769 
770 			prevptr = &sdc->sdc_next;
771 		}
772 
773 		/*
774 		 * Add our list to the bucket, putting any new entries
775 		 * added while we were working at the tail of the list.
776 		 */
777 		do {
778 			tail = *headp;
779 			*prevptr = tail;
780 		} while (atomic_cas_ptr(headp, tail, head) != tail);
781 
782 		mutex_exit(&sdl->sdl_lock);
783 	}
784 
785 	mutex_enter(&sysdc_pset_lock);
786 	for (cur = list_head(&sysdc_psets); cur != NULL;
787 	    cur = list_next(&sysdc_psets, cur)) {
788 
789 		cur->sdp_vtime_last_interval =
790 		    diff * cur->sdp_cpupart->cp_ncpus;
791 		cur->sdp_DC_last_interval =
792 		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
793 		    cur->sdp_vtime_last_interval;
794 
795 		if (cur->sdp_should_break > 0) {
796 			cur->sdp_should_break--;	/* breaking */
797 			continue;
798 		}
799 		if (cur->sdp_dont_break > 0) {
800 			cur->sdp_dont_break--;	/* waiting before checking */
801 			continue;
802 		}
803 		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
804 			cur->sdp_should_break = sysdc_break_updates;
805 			cur->sdp_dont_break = sysdc_nobreak_updates;
806 			SYSDC_INC_STAT(sysdc_update_take_break);
807 		}
808 	}
809 
810 	/*
811 	 * If there are no sysdc_psets, there can be no threads, so
812 	 * we can stop doing our timeout.  Since we're holding the
813 	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
814 	 * prevent anyone from racing with this and dropping our timeout
815 	 * on the floor.
816 	 */
817 	if (list_is_empty(&sysdc_psets)) {
818 		SYSDC_INC_STAT(sysdc_update_no_psets);
819 		ASSERT(sysdc_update_timeout_started);
820 		sysdc_update_timeout_started = 0;
821 
822 		redeploy = 0;
823 	}
824 	mutex_exit(&sysdc_pset_lock);
825 
826 	while (freelist != NULL) {
827 		sysdc_t *cur = freelist;
828 		freelist = cur->sdc_next;
829 		kmem_free(cur, sizeof (*cur));
830 	}
831 
832 	if (redeploy) {
833 		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
834 	}
835 }
836 
837 static void
838 sysdc_preempt(kthread_t *t)
839 {
840 	ASSERT(t == curthread);
841 	ASSERT(THREAD_LOCK_HELD(t));
842 
843 	setbackdq(t);		/* give others a chance to run */
844 }
845 
846 static void
847 sysdc_tick(kthread_t *t)
848 {
849 	sysdc_t *sdc;
850 
851 	thread_lock(t);
852 	if (t->t_cid != sysdccid) {
853 		SYSDC_INC_STAT(sysdc_tick_not_sdc);
854 		thread_unlock(t);
855 		return;
856 	}
857 	sdc = t->t_cldata;
858 	if (t->t_state == TS_ONPROC &&
859 	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
860 		cpu_surrender(t);
861 	}
862 
863 	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
864 		ASSERT(sdc->sdc_sleep_updates == 0);
865 	}
866 
867 	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
868 	sdc->sdc_ticks++;
869 	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
870 		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
871 		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
872 		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
873 	}
874 	thread_unlock(t);
875 }
876 
877 static void
878 sysdc_setrun(kthread_t *t)
879 {
880 	sysdc_t *sdc = t->t_cldata;
881 
882 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
883 
884 	sdc->sdc_sleep_updates = 0;
885 
886 	if (sdc->sdc_next == NULL) {
887 		/*
888 		 * Since we're in transition, we don't want to use the
889 		 * full thread_update_pri().
890 		 */
891 		if (sysdc_compute_pri(sdc, 0)) {
892 			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
893 		}
894 		sysdc_activate(sdc);
895 
896 		ASSERT(sdc->sdc_next != NULL);
897 	}
898 
899 	setbackdq(t);
900 }
901 
/*
 * Wakeup operation for the class.  A wakeup is handled exactly like a
 * setrun, so this simply defers to sysdc_setrun().
 */
static void
sysdc_wakeup(kthread_t *t)
{
	sysdc_setrun(t);
}
907 
908 static void
909 sysdc_sleep(kthread_t *t)
910 {
911 	sysdc_t *sdc = t->t_cldata;
912 
913 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
914 
915 	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
916 }
917 
918 /*ARGSUSED*/
919 static int
920 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
921     void *bufp)
922 {
923 	cpupart_t *const cpupart = t->t_cpupart;
924 	sysdc_t *sdc = bufp;
925 	sysdc_params_t *sdpp = parmsp;
926 	sysdc_pset_t *newpset = sdc->sdc_pset;
927 	sysdc_pset_t *pset;
928 	int start_timeout;
929 
930 	if (t->t_cid != syscid)
931 		return (EPERM);
932 
933 	ASSERT(ttolwp(t) != NULL);
934 	ASSERT(sdpp != NULL);
935 	ASSERT(newpset != NULL);
936 	ASSERT(sysdc_param_init);
937 
938 	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
939 	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
940 	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
941 	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
942 
943 	sdc->sdc_thread = t;
944 	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
945 	sdc->sdc_minpri = sdpp->sdp_minpri;
946 	sdc->sdc_maxpri = sdpp->sdp_maxpri;
947 	sdc->sdc_target_DC = sdpp->sdp_DC;
948 	sdc->sdc_ticks = 0;
949 	sdc->sdc_update_ticks = sysdc_update_ticks + 1;
950 
951 	/* Assign ourselves to the appropriate pset. */
952 	sdc->sdc_pset = NULL;
953 	mutex_enter(&sysdc_pset_lock);
954 	for (pset = list_head(&sysdc_psets); pset != NULL;
955 	    pset = list_next(&sysdc_psets, pset)) {
956 		if (pset->sdp_cpupart == cpupart) {
957 			break;
958 		}
959 	}
960 	if (pset == NULL) {
961 		pset = newpset;
962 		newpset = NULL;
963 		pset->sdp_cpupart = cpupart;
964 		list_insert_tail(&sysdc_psets, pset);
965 	}
966 	pset->sdp_nthreads++;
967 	ASSERT(pset->sdp_nthreads > 0);
968 
969 	sdc->sdc_pset = pset;
970 
971 	start_timeout = (sysdc_update_timeout_started == 0);
972 	sysdc_update_timeout_started = 1;
973 	mutex_exit(&sysdc_pset_lock);
974 
975 	if (newpset != NULL)
976 		kmem_free(newpset, sizeof (*newpset));
977 
978 	/* Update t's scheduling class and priority. */
979 	thread_lock(t);
980 	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
981 	t->t_cid = cid;
982 	t->t_cldata = sdc;
983 	t->t_schedflag |= TS_RUNQMATCH;
984 
985 	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
986 	thread_unlock(t);
987 
988 	/* Kick off the thread timeout if we're the first one in. */
989 	if (start_timeout) {
990 		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
991 	}
992 
993 	return (0);
994 }
995 
996 static void
997 sysdc_leave(sysdc_t *sdc)
998 {
999 	sysdc_pset_t *sdp = sdc->sdc_pset;
1000 	sysdc_list_t *sdl = SYSDC_LIST(sdc);
1001 	uint_t freedc;
1002 
1003 	mutex_enter(&sdl->sdl_lock);		/* block sysdc_update() */
1004 	sdc->sdc_thread = NULL;
1005 	freedc = (sdc->sdc_next == NULL);
1006 	mutex_exit(&sdl->sdl_lock);
1007 
1008 	mutex_enter(&sysdc_pset_lock);
1009 	ASSERT(sdp != NULL);
1010 	ASSERT(sdp->sdp_nthreads > 0);
1011 	--sdp->sdp_nthreads;
1012 	if (sdp->sdp_nthreads == 0) {
1013 		list_remove(&sysdc_psets, sdp);
1014 	} else {
1015 		sdp = NULL;
1016 	}
1017 	mutex_exit(&sysdc_pset_lock);
1018 
1019 	if (freedc)
1020 		kmem_free(sdc, sizeof (*sdc));
1021 	if (sdp != NULL)
1022 		kmem_free(sdp, sizeof (*sdp));
1023 }
1024 
/*
 * Exit-class operation.  buf is the departing thread's sysdc_t; all of the
 * teardown is done by sysdc_leave().
 */
static void
sysdc_exitclass(void *buf)
{
	sysdc_leave((sysdc_t *)buf);
}
1030 
/*
 * Can-exit operation: unconditionally refuses with EPERM.  Neither argument
 * is examined.
 */
/*ARGSUSED*/
static int
sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
	/* Threads cannot exit SDC once joined, except in a body bag. */
	return (EPERM);
}
1038 
/*
 * Exit operation: thread t is exiting.
 *
 * Under t's thread lock, t is switched back to the SYS class: its class id
 * and class function vector are reset, its class data pointer is cleared,
 * its priority is set to maxclsyspri via thread_change_pri(), and
 * TS_RUNQMATCH is cleared.  The lock is dropped with
 * thread_unlock_nopreempt(), and only then is the detached sdc handed to
 * sysdc_leave(), which takes mutexes of its own.
 */
static void
sysdc_exit(kthread_t *t)
{
	sysdc_t *sdc;

	/* We're exiting, so we just rejoin the SYS class. */
	thread_lock(t);
	ASSERT(t->t_cid == sysdccid);
	sdc = t->t_cldata;	/* remember it; released after we unlock */
	t->t_cid = syscid;
	t->t_cldata = NULL;
	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
	(void) thread_change_pri(t, maxclsyspri, 0);
	t->t_schedflag &= ~TS_RUNQMATCH;
	thread_unlock_nopreempt(t);

	/* Unlink the sdc from everything. */
	sysdc_leave(sdc);
}
1058 
/*
 * Fork operation.  Not expected to ever run; panics if it does.  None of
 * the arguments are examined.
 */
/*ARGSUSED*/
static int
sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
	/*
	 * Threads cannot be created with SDC as their class; they must
	 * be created as SYS and then added with sysdc_thread_enter().
	 * Because of this restriction, sysdc_fork() should never be called.
	 */
	panic("sysdc cannot be forked");

	/* Presumably only here to satisfy the int return type. */
	return (ENOSYS);
}
1072 
/*
 * Fork-return operation.  Like sysdc_fork(), this is not expected to ever
 * run and panics if it does.  Neither argument is examined.
 */
/*ARGSUSED*/
static void
sysdc_forkret(kthread_t *t, kthread_t *ct)
{
	/* SDC threads are part of system processes, which never fork. */
	panic("sysdc cannot be forked");
}
1080 
/*
 * Global-priority operation: reports the thread's t_epri field as is.
 */
static pri_t
sysdc_globpri(kthread_t *t)
{
	return (t->t_epri);
}
1086 
/*
 * Handler installed in both the swapin and swapout slots of
 * sysdc_classfuncs.  Always returns -1; neither argument is examined.
 */
/*ARGSUSED*/
static pri_t
sysdc_no_swap(kthread_t *t, int flags)
{
	/* SDC threads cannot be swapped. */
	return (-1);
}
1094 
1095 /*
1096  * Get maximum and minimum priorities enjoyed by SDC threads.
1097  */
1098 static int
1099 sysdc_getclpri(pcpri_t *pcprip)
1100 {
1101 	pcprip->pc_clpmax = sysdc_maxpri;
1102 	pcprip->pc_clpmin = sysdc_minpri;
1103 	return (0);
1104 }
1105 
/*
 * Class-info operation.  SDC has nothing to report, so arg is left
 * untouched and 0 is returned.
 */
/*ARGSUSED*/
static int
sysdc_getclinfo(void *arg)
{
	return (0);		/* no class-specific info */
}
1112 
1113 /*ARGSUSED*/
1114 static int
1115 sysdc_alloc(void **p, int flag)
1116 {
1117 	sysdc_t *new;
1118 
1119 	*p = NULL;
1120 	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1121 		return (ENOMEM);
1122 	}
1123 	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1124 	    NULL) {
1125 		kmem_free(new, sizeof (*new));
1126 		return (ENOMEM);
1127 	}
1128 	*p = new;
1129 	return (0);
1130 }
1131 
1132 static void
1133 sysdc_free(void *p)
1134 {
1135 	sysdc_t *sdc = p;
1136 
1137 	if (sdc != NULL) {
1138 		/*
1139 		 * We must have failed CL_ENTERCLASS(), so our pset should be
1140 		 * there and unused.
1141 		 */
1142 		ASSERT(sdc->sdc_pset != NULL);
1143 		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1144 		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1145 		kmem_free(sdc, sizeof (*sdc));
1146 	}
1147 }
1148 
/*
 * Forward declarations of the stub handlers used to fill sysdc_classfuncs
 * slots that SDC does not implement.  They are declared without parameter
 * lists, presumably so that one stub can occupy slots whose handlers take
 * different arguments.
 */
static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();
1152 
/*
 * Operations vector for the SDC class; sysdc_init() hands its address back
 * to the caller.  The initializer is positional, so entry order matters.
 *
 * Slots SDC does not implement are filled with stubs: sysdc_enosys()
 * returns ENOSYS, sysdc_einval() returns EINVAL, and sysdc_nullsys() does
 * nothing.  Both swap slots share sysdc_no_swap().
 */
static struct classfuncs sysdc_classfuncs = {
	/* messages to class manager */
	{
		sysdc_enosys,	/* admin */
		sysdc_getclinfo,
		sysdc_enosys,	/* parmsin */
		sysdc_enosys,	/* parmsout */
		sysdc_enosys,	/* vaparmsin */
		sysdc_enosys,	/* vaparmsout */
		sysdc_getclpri,
		sysdc_alloc,
		sysdc_free,
	},
	/* operations on threads */
	{
		sysdc_enterclass,
		sysdc_exitclass,
		sysdc_canexit,
		sysdc_fork,
		sysdc_forkret,
		sysdc_nullsys,	/* parmsget */
		sysdc_enosys,	/* parmsset */
		sysdc_nullsys,	/* stop */
		sysdc_exit,
		sysdc_nullsys,	/* active */
		sysdc_nullsys,	/* inactive */
		sysdc_no_swap,	/* swapin */
		sysdc_no_swap,	/* swapout */
		sysdc_nullsys,	/* trapret */
		sysdc_preempt,
		sysdc_setrun,
		sysdc_sleep,
		sysdc_tick,
		sysdc_wakeup,
		sysdc_einval,	/* donice */
		sysdc_globpri,
		sysdc_nullsys,	/* set_process_group */
		sysdc_nullsys,	/* yield */
		sysdc_einval,	/* doprio */
	}
};
1194 
/*
 * Stub for unsupported class operations: ignores any arguments and returns
 * ENOSYS.
 */
static int
sysdc_enosys()
{
	return (ENOSYS);
}
1200 
/*
 * Stub for class operations SDC rejects: ignores any arguments and returns
 * EINVAL.
 */
static int
sysdc_einval()
{
	return (EINVAL);
}
1206 
/*
 * Stub for class operations that need no work in SDC: does nothing.
 */
static void
sysdc_nullsys()
{
}
1211 
1212 /*ARGSUSED*/
1213 static pri_t
1214 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1215 {
1216 	int idx;
1217 
1218 	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1219 	    offsetof(sysdc_pset_t, sdp_node));
1220 
1221 	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1222 		sysdc_active[idx].sdl_list = &sysdc_dummy;
1223 	}
1224 
1225 	sysdc_initparam();
1226 
1227 	sysdccid = cid;
1228 	*clfuncspp = &sysdc_classfuncs;
1229 
1230 	return ((pri_t)v.v_maxsyspri);
1231 }
1232 
/*
 * Class description referenced from modlsched: the class name "SDC" and
 * its initialization routine, sysdc_init().  The third member is
 * initialized to 0.
 */
static struct sclass csw = {
	"SDC",
	sysdc_init,
	0
};
1238 
/*
 * Scheduling-class module linkage: ties mod_schedops and the module's
 * description string to the class description in csw.
 */
static struct modlsched modlsched = {
	&mod_schedops, "system duty cycle scheduling class", &csw
};
1242 
/*
 * Top-level module linkage passed to mod_install() and mod_info(): revision
 * MODREV_1 with modlsched as its only linkage element, followed by a NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
1246 
/*
 * Module entry point: installs the module described by modlinkage and
 * returns whatever mod_install() reports.
 */
int
_init()
{
	return (mod_install(&modlinkage));
}
1252 
/*
 * Module entry point for unloading: always refuses with EBUSY.
 */
int
_fini()
{
	return (EBUSY);		/* can't unload for now */
}
1258 
/*
 * Module entry point for information queries: passes modinfop to
 * mod_info() along with modlinkage and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
1264 
1265 /* --- consolidation-private interfaces --- */
/*
 * Move thread t from the SYS class into the SDC class.
 *
 *	t	thread to convert; asserted to have an LWP, to be in the
 *		SYS class, and to carry no class data
 *	dc	requested duty cycle; clamped to the range
 *		[sysdc_minDC, sysdc_maxDC]
 *	flags	only SYSDC_THREAD_BATCH may be set (asserted); the value
 *		is not otherwise consulted here
 *
 * The thread is given the full [sysdc_minpri, sysdc_maxpri] priority
 * range.  Nothing is returned: the allocation uses KM_SLEEP and each class
 * operation that can fail is wrapped in VERIFY0(), so a nonzero result
 * from any of them is not reported to the caller.
 */
void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
	void *buf = NULL;
	sysdc_params_t sdp;

	SYSDC_INC_STAT(sysdc_thread_enter_enter);

	ASSERT(sysdc_param_init);
	ASSERT(sysdccid >= 0);

	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

	sdp.sdp_minpri = sysdc_minpri;
	sdp.sdp_maxpri = sysdc_maxpri;
	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

	/* Get the SDC class data that CL_ENTERCLASS() will consume. */
	VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

	ASSERT(t->t_lwp != NULL);
	ASSERT(t->t_cid == syscid);
	ASSERT(t->t_cldata == NULL);
	VERIFY0(CL_CANEXIT(t, NULL));
	VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
	/* t had no SYS class data (asserted above), hence the NULL. */
	CL_EXITCLASS(syscid, NULL);
}
1292