1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * The System Duty Cycle (SDC) scheduling class
28  * --------------------------------------------
29  *
30  * Background
31  *
32  * Kernel threads in Solaris have traditionally not been large consumers
33  * of CPU time.  They typically wake up, perform a small amount of
34  * work, then go back to sleep waiting for either a timeout or another
35  * signal.  On the assumption that the small amount of work that they do
36  * is important for the behavior of the whole system, these threads are
37  * treated kindly by the dispatcher and the SYS scheduling class: they run
38  * without preemption from anything other than real-time and interrupt
39  * threads; when preempted, they are put at the front of the queue, so they
40  * generally do not migrate between CPUs; and they are allowed to stay
41  * running until they voluntarily give up the CPU.
42  *
43  * As Solaris has evolved, new workloads have emerged which require the
44  * kernel to perform significant amounts of CPU-intensive work.  One
45  * example of such a workload is ZFS's transaction group sync processing.
46  * Each sync operation generates a large batch of I/Os, and each I/O
47  * may need to be compressed and/or checksummed before it is written to
48  * storage.  The taskq threads which perform the compression and checksums
49  * will run nonstop as long as they have work to do; a large sync operation
50  * on a compression-heavy dataset can keep them busy for seconds on end.
51  * This causes human-time-scale dispatch latency bubbles for any other
52  * threads which have the misfortune to share a CPU with the taskq threads.
53  *
54  * The SDC scheduling class is a solution to this problem.
55  *
56  *
57  * Overview
58  *
59  * SDC is centered around the concept of a thread's duty cycle (DC):
60  *
61  *			      ONPROC time
62  *	Duty Cycle =	----------------------
63  *			ONPROC + Runnable time
64  *
65  * This is the time that the thread spent running on a CPU divided by
66  * the time it spent running or trying to run.  It is unaffected
67  * by any time the thread spent sleeping, stopped, etc.
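 *
 * For example, a thread which accumulates 30ms of ONPROC time and 70ms
 * of Runnable time over a measurement interval has a duty cycle of
 * 30 / (30 + 70) = 30%, no matter how long it also spent sleeping
 * during that interval.  (The figures are purely illustrative.)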
68  *
69  * A thread joining the SDC class specifies a "target" DC that it wants
70  * to run at.  To implement this policy, the routine sysdc_update() scans
71  * the list of active SDC threads every few ticks and uses each thread's
72  * microstate data to compute the actual duty cycle that that thread
73  * has experienced recently.  If the thread is under its target DC, its
74  * priority is increased to the maximum available (sysdc_maxpri, which is
75  * 99 by default).  If the thread is over its target DC, its priority is
76  * reduced to the minimum available (sysdc_minpri, 0 by default).  This
77  * is a fairly primitive approach, in that it doesn't use any of the
78  * intermediate priorities, but it's not completely inappropriate.  Even
79  * though threads in the SDC class might take a while to do their job, they
80  * are by some definition important if they're running inside the kernel,
81  * so it is reasonable that they should get to run at priority 99.
82  *
83  * If a thread is running when sysdc_update() calculates its actual duty
84  * cycle, and there are other threads of equal or greater priority on its
85  * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
86  * acknowledges the preemption by calling sysdc_preempt(), which calls
87  * setbackdq(), which gives other threads with the same priority a chance
88  * to run.  This creates a de facto time quantum for threads in the SDC
89  * scheduling class.
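 * With the default update interval of 20 milliseconds (see "Tunables"
 * below), that quantum is on the order of 20ms.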
90  *
91  * An SDC thread which is assigned priority 0 can continue to run if
92  * nothing else needs to use the CPU that it's running on.  Similarly, an
93  * SDC thread at priority 99 might not get to run as much as it wants to
94  * if there are other priority-99 or higher threads on its CPU.  These
95  * situations would cause the thread to get ahead of or behind its target
96  * DC; the longer the situations lasted, the further ahead or behind the
97  * thread would get.  Rather than condemning a thread to a lifetime of
98  * paying for its youthful indiscretions, SDC keeps "base" values for
99  * ONPROC and Runnable times in each thread's sysdc data, and updates these
100  * values periodically.  The duty cycle is then computed using the elapsed
101  * amount of ONPROC and Runnable times since those base times.
102  *
103  * Since sysdc_update() scans SDC threads fairly frequently, it tries to
104  * keep the list of "active" threads small by pruning out threads which
105  * have been asleep for a brief time.  They are not pruned immediately upon
106  * going to sleep, since some threads may bounce back and forth between
107  * sleeping and being runnable.
108  *
109  *
110  * Interfaces
111  *
112  * void sysdc_thread_enter(t, dc, flags)
113  *
114  *	Moves a kernel thread from the SYS scheduling class to the
115  *	SDC class. t must have an associated LWP (created by calling
116  *	lwp_kernel_create()).  The thread will have a target DC of dc.
117  *	Flags should be either 0 or SYSDC_THREAD_BATCH.  If
118  *	SYSDC_THREAD_BATCH is specified, the thread will run with a
119  *	slightly lower priority (see "Batch threads", below).
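 *
 *	As a purely illustrative sketch (the 50% target is a made-up
 *	value, and duty cycles are assumed to be on the same percentage
 *	scale as sysdc_max_pset_DC): once a subsystem has created an
 *	LWP-backed kernel thread t with lwp_kernel_create(), it could
 *	request a 50% duty cycle with
 *
 *		sysdc_thread_enter(t, 50, 0);
 *
 *	or mark a longer-running, batch-oriented worker with
 *
 *		sysdc_thread_enter(t, 50, SYSDC_THREAD_BATCH);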
120  *
121  *
122  * Complications
123  *
124  * - Run queue balancing
125  *
126  *	The Solaris dispatcher is biased towards letting a thread run
127  *	on the same CPU which it last ran on, if no more than 3 ticks
128  *	(i.e. rechoose_interval) have passed since the thread last ran.
129  *	This helps to preserve cache warmth.  On the other hand, it also
130  *	tries to keep the per-CPU run queues fairly balanced; if the CPU
131  *	chosen for a runnable thread has a run queue which is three or
132  *	more threads longer than a neighboring CPU's queue, the runnable
133  *	thread is dispatched onto the neighboring CPU instead.
134  *
135  *	These policies work well for some workloads, but not for many SDC
136  *	threads.  The taskq client of SDC, for example, has many discrete
137  *	units of work to do.  The work units are largely independent, so
138  *	cache warmth is not an important consideration.  It is important
139  *	that the threads fan out quickly to different CPUs, since the
140  *	amount of work these threads have to do (a few seconds worth at a
141  *	time) doesn't leave much time to correct thread placement errors
142  *	(i.e. two SDC threads being dispatched to the same CPU).
143  *
144  *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
145  *	This tells the dispatcher to keep neighboring run queues' lengths
146  *	more evenly matched, which allows SDC threads to migrate more
147  *	easily.
148  *
149  * - LWPs and system processes
150  *
151  *	SDC can only be used for kernel threads.  Since SDC uses microstate
152  *	accounting data to compute each thread's actual duty cycle, all
153  *	threads entering the SDC class must have associated LWPs (which
154  *	store the microstate data).  This means that the threads have to
155  *	be associated with an SSYS process, i.e. one created by newproc().
156  *	If the microstate accounting information is ever moved into the
157  *	kthread_t, this restriction could be lifted.
158  *
159  * - Dealing with oversubscription
160  *
161  *	Since SDC duty cycles are per-thread, it is possible that the
162  *	aggregate requested duty cycle of all SDC threads in a processor
163  *	set could be greater than the total CPU time available in that set.
164  *	The FSS scheduling class has an analogous situation, which it deals
165  *	with by reducing each thread's allotted CPU time proportionally.
166  *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
167  *	solution to the oversubscription problem.
168  *
169  *	sysdc_update() accumulates the amount of time that max-priority SDC
170  *	threads have spent on-CPU in each processor set, and uses that sum
171  *	to create an implied duty cycle for that processor set:
172  *
173  *				accumulated CPU time
174  *	   pset DC =	-----------------------------------
175  *			 (# CPUs) * time since last update
176  *
177  *	If this implied duty cycle is above a maximum pset duty cycle (90%
178  *	by default), sysdc_update() sets the priority of all SDC threads
179  *	in that processor set to sysdc_minpri for a "break" period.  After
180  *	the break period, it waits for a "nobreak" period before trying to
181  *	enforce the pset duty cycle limit again.
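 *
 *	For example (with invented numbers): a 4-CPU pset has
 *	4 * 20ms = 80ms of CPU time available per 20ms update interval.
 *	If max-priority SDC threads accumulated 75ms of ONPROC time in
 *	that interval, the implied pset duty cycle is 75 / 80 ~= 94%,
 *	which exceeds the default 90% limit and triggers a break.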
182  *
183  * - Processor sets
184  *
185  *	As the above implies, SDC is processor set aware, but it does not
186  *	currently allow threads to change processor sets while in the SDC
187  *	class.  Instead, those threads must join the desired processor set
188  *	before entering SDC. [1]
189  *
190  * - Batch threads
191  *
192  *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
193  *	flag.  This flag causes the maximum priority for that thread to be
194  *	reduced (by default, the maximum is reduced by 1).  This allows
195  *	longer-running, batch-oriented SDC threads to be interrupted by
196  *	more immediate, higher-priority work.
197  *
198  * - t_kpri_req
199  *
200  *	The TS and FSS scheduling classes pay attention to t_kpri_req,
201  *	which provides a simple form of priority inheritance for
202  *	synchronization primitives (such as rwlocks held as READER) which
203  *	cannot be traced to a unique thread.  The SDC class does not honor
204  *	t_kpri_req, for a few reasons:
205  *
206  *	1.  t_kpri_req is notoriously inaccurate.  A measure of its
207  *	    inaccuracy is that it needs to be cleared every time a thread
208  *	    returns to user mode, because it is frequently non-zero at that
209  *	    point.  This can happen because "ownership" of synchronization
210  *	    primitives that use t_kpri_req can be silently handed off,
211  *	    leaving no opportunity to will the t_kpri_req inheritance.
212  *
213  *	2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
214  *	    kernel priority.  This means that even if an SDC thread
215  *	    is holding a synchronization primitive and running at low
216  *	    priority, its priority will eventually be raised above 60,
217  *	    allowing it to drive on and release the resource.
218  *
219  *	3.  The first consumer of SDC uses the taskq subsystem, which holds
220  *	    a reader lock for the duration of the task's execution.  This
221  *	    would mean that SDC threads would never drop below kernel
222  *	    priority in practice, which defeats one of the purposes of SDC.
223  *
224  * - Why not FSS?
225  *
226  *	It might seem that the existing FSS scheduling class could solve
227  *	the problems that SDC is attempting to solve.  FSS's more precise
228  *	solution to the oversubscription problem would hardly cause
229  *	trouble, as long as it performed well.  SDC is implemented as
230  *	a separate scheduling class for two main reasons: the initial
231  *	consumer of SDC does not map well onto the "project" abstraction
232  *	that is central to FSS, and FSS does not expect to run at kernel
233  *	priorities.
234  *
235  *
236  * Tunables
237  *
238  * - sysdc_batch_niceness:  The amount below sysdc_maxpri that
239  *	SYSDC_THREAD_BATCH threads should use as their per-thread
240  *	maximum priority.
241  *
242  * - sysdc_update_interval_msec:  Number of milliseconds between
243  *	consecutive thread priority updates.
244  *
245  * - sysdc_reset_interval_msec:  Number of milliseconds between
246  *	consecutive resets of a thread's base ONPROC and Runnable
247  *	times.
248  *
249  * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
250  *	before a thread is pruned from the active list.
251  *
252  * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
253  *	CPU time which SDC can give to its high-priority threads.
254  *
255  * - sysdc_break_msec:  Number of milliseconds of "break" taken when
256  *	sysdc_max_pset_DC is exceeded.
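 *
 * Taken together, the defaults above mean that priorities are
 * recomputed every 20ms, each thread's base ONPROC/Runnable times are
 * reset about every 400ms (every 20 updates), a thread asleep for more
 * than roughly 100ms is pruned from the active list, and a pset which
 * exceeds the 90% limit is put on an 80ms break.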
257  *
258  *
259  * Future work (in SDC and related subsystems)
260  *
261  * - Per-thread rechoose interval (0 for SDC)
262  *
263  *	Allow each thread to specify its own rechoose interval.  SDC
264  *	threads would specify an interval of zero, which would rechoose
265  *	the CPU with the lowest priority once per update.
266  *
267  * - Allow threads to change processor sets after joining the SDC class
268  *
269  * - Thread groups and per-group DC
270  *
271  *	It might be nice to be able to specify a duty cycle which applies
272  *	to a group of threads in aggregate.
273  *
274  * - Per-group DC callback to allow dynamic DC tuning
275  *
276  *	Currently, DCs are assigned when the thread joins SDC.  Some
277  *	workloads could benefit from being able to tune their DC using
278  *	subsystem-specific knowledge about the workload.
279  *
280  * - Finer-grained priority updates
281  *
282  * - More nuanced management of oversubscription
283  *
284  * - Moving other CPU-intensive threads into SDC
285  *
286  * - Move msacct data into kthread_t
287  *
288  *	This would allow kernel threads without LWPs to join SDC.
289  *
290  *
291  * Footnotes
292  *
293  * [1] The details of doing so are left as an exercise for the reader.
294  */
295 
296 #include <sys/types.h>
297 #include <sys/sysdc.h>
298 #include <sys/sysdc_impl.h>
299 
300 #include <sys/class.h>
301 #include <sys/cmn_err.h>
302 #include <sys/cpuvar.h>
303 #include <sys/cpupart.h>
304 #include <sys/debug.h>
305 #include <sys/disp.h>
306 #include <sys/errno.h>
307 #include <sys/inline.h>
308 #include <sys/kmem.h>
309 #include <sys/modctl.h>
310 #include <sys/schedctl.h>
311 #include <sys/sdt.h>
312 #include <sys/sunddi.h>
313 #include <sys/sysmacros.h>
314 #include <sys/systm.h>
315 #include <sys/var.h>
316 
317 /*
318  * Tunables - loaded into the internal state at module load time
319  */
320 uint_t		sysdc_update_interval_msec = 20;
321 uint_t		sysdc_reset_interval_msec = 400;
322 uint_t		sysdc_prune_interval_msec = 100;
323 uint_t		sysdc_max_pset_DC = 90;
324 uint_t		sysdc_break_msec = 80;
325 pri_t		sysdc_batch_niceness = 1;
326 
327 /*
328  * Internal state - constants set up by sysdc_initparam()
329  */
330 static clock_t	sysdc_update_ticks;	/* ticks between updates */
331 static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
332 static uint_t	sysdc_reset_updates;	/* # of updates before reset */
333 static uint_t	sysdc_break_updates;	/* updates to break */
334 static uint_t	sysdc_nobreak_updates;	/* updates to not check */
335 static uint_t	sysdc_minDC;		/* minimum allowed DC */
336 static uint_t	sysdc_maxDC;		/* maximum allowed DC */
337 static pri_t	sysdc_minpri;		/* minimum allowed priority */
338 static pri_t	sysdc_maxpri;		/* maximum allowed priority */
339 
340 /*
341  * Internal state
342  */
343 static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
344 static list_t	sysdc_psets;		/* list of psets with SDC threads */
345 static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
346 static uint_t	sysdc_update_timeout_started; /* update timeout is active */
347 static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
348 static sysdc_t	sysdc_dummy;		/* used to terminate active lists */
349 
350 /*
351  * Internal state - active hash table
352  */
353 #define	SYSDC_NLISTS	8
354 #define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
355 static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
356 #define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])
357 
358 #ifdef DEBUG
359 static struct {
360 	uint64_t	sysdc_update_times_asleep;
361 	uint64_t	sysdc_update_times_base_ran_backwards;
362 	uint64_t	sysdc_update_times_already_done;
363 	uint64_t	sysdc_update_times_cur_ran_backwards;
364 	uint64_t	sysdc_compute_pri_breaking;
365 	uint64_t	sysdc_activate_enter;
366 	uint64_t	sysdc_update_enter;
367 	uint64_t	sysdc_update_exited;
368 	uint64_t	sysdc_update_not_sdc;
369 	uint64_t	sysdc_update_idle;
370 	uint64_t	sysdc_update_take_break;
371 	uint64_t	sysdc_update_no_psets;
372 	uint64_t	sysdc_tick_not_sdc;
373 	uint64_t	sysdc_tick_quantum_expired;
374 	uint64_t	sysdc_thread_enter_enter;
375 } sysdc_stats;
376 
377 #define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
378 #else
379 #define	SYSDC_INC_STAT(x)	((void)0)
380 #endif
381 
382 /* macros are UPPER CASE */
383 #define	HOWMANY(a, b)	howmany((a), (b))
384 #define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)
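
/*
 * For example, with the default 20ms update interval and an assumed 100Hz
 * clock (usec_per_tick == 10000), MSECTOTICKS(20) is howmany(20000, 10000),
 * or 2 ticks.  Since howmany() rounds up, even an interval shorter than one
 * tick maps to at least one tick.
 */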
385 
386 static void
387 sysdc_initparam(void)
388 {
389 	uint_t sysdc_break_ticks;
390 
391 	/* update / prune intervals */
392 	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
393 
394 	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
395 	    sysdc_update_interval_msec);
396 	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
397 	    sysdc_update_interval_msec);
398 
399 	/* We must get at least a little time on CPU. */
400 	sysdc_minDC = 1;
401 	sysdc_maxDC = SYSDC_DC_MAX;
402 	sysdc_minpri = 0;
403 	sysdc_maxpri = maxclsyspri;
404 
405 	/* break parameters */
406 	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
407 		sysdc_max_pset_DC = SYSDC_DC_MAX;
408 	}
409 	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
410 	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
411 
412 	/*
413 	 * We want:
414 	 *
415 	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
416 	 *
417 	 *	==>	  nobreak = sysdc_max_pset_DC * (break + nobreak)
418 	 *
419 	 *			    sysdc_max_pset_DC * break
420 	 *	==>	  nobreak = -------------------------
421 	 *			    1 - sysdc_max_pset_DC
422 	 */
423 	sysdc_nobreak_updates =
424 	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
425 	    (SYSDC_DC_MAX - sysdc_max_pset_DC));
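
	/*
	 * Worked example with illustrative assumptions (100Hz clock,
	 * SYSDC_DC_MAX == 100): the default 80ms break and 20ms update
	 * interval give sysdc_break_updates = howmany(8, 2) = 4, and
	 * sysdc_nobreak_updates = howmany(4 * 90, 100 - 90) = 36, so
	 * 36 / (4 + 36) = 90% = sysdc_max_pset_DC, as intended.
	 */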
426 
427 	sysdc_param_init = 1;
428 }
429 
430 #undef HOWMANY
431 #undef MSECTOTICKS
432 
433 #define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
434 #define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
435 #define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */
436 
437 /*
438  * Updates the recorded times in the sdc, and returns the elapsed ONPROC
439  * and Runnable times since the last reset.
440  *
441  * newO is the thread's actual ONPROC time; it's used during sysdc_update()
442  * to track processor set usage.
443  */
444 static void
445 sysdc_update_times(sysdc_t *sdc, uint_t flags,
446     hrtime_t *O, hrtime_t *R, hrtime_t *newO)
447 {
448 	kthread_t *const t = sdc->sdc_thread;
449 	const uint_t	initial = (flags & SDC_UPDATE_INITIAL);
450 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
451 	const clock_t	now = ddi_get_lbolt();
452 	uint_t		do_reset;
453 
454 	ASSERT(THREAD_LOCK_HELD(t));
455 
456 	*O = *R = 0;
457 
458 	/* If we've been sleeping, we know we haven't had any ONPROC time. */
459 	if (sdc->sdc_sleep_updates != 0 &&
460 	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
461 		*newO = sdc->sdc_last_base_O;
462 		SYSDC_INC_STAT(sysdc_update_times_asleep);
463 		return;
464 	}
465 
466 	/*
467 	 * If this is our first update, or we've hit the reset point,
468 	 * we need to reset our base_{O,R}.  Once we've updated them, we
469 	 * report O and R for the entire prior interval.
470 	 */
471 	do_reset = initial;
472 	if (update) {
473 		++sdc->sdc_nupdates;
474 		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
475 			do_reset = 1;
476 	}
477 	if (do_reset) {
478 		hrtime_t baseO, baseR;
479 		if (initial) {
480 			/*
481 			 * Start off our cycle count somewhere in the middle,
482 			 * to keep the resets from all happening at once.
483 			 *
484 			 * 4999 is a handy prime much larger than
485 			 * sysdc_reset_updates, so that we don't run into
486 			 * trouble if the resolution is a multiple of
487 			 * sysdc_reset_updates.
488 			 */
489 			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
490 			    sysdc_reset_updates);
491 			baseO = baseR = 0;
492 		} else {
493 			baseO = sdc->sdc_base_O;
494 			baseR = sdc->sdc_base_R;
495 		}
496 
497 		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
498 		*newO = sdc->sdc_base_O;
499 
500 		sdc->sdc_reset = now;
501 		sdc->sdc_pri_check = -1; /* force mismatch below */
502 
503 		/*
504 		 * See below for rationale.
505 		 */
506 		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
507 			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
508 			baseO = sdc->sdc_base_O;
509 			baseR = sdc->sdc_base_R;
510 		}
511 
512 		/* compute based on the entire interval */
513 		*O = (sdc->sdc_base_O - baseO);
514 		*R = (sdc->sdc_base_R - baseR);
515 		return;
516 	}
517 
518 	/*
519 	 * If we're called from sysdc_update(), we *must* return a value
520 	 * for newO, so we always call mstate_systhread_times().
521 	 *
522 	 * Otherwise, if we've already done a pri check this tick,
523 	 * we can skip it.
524 	 */
525 	if (!update && sdc->sdc_pri_check == now) {
526 		SYSDC_INC_STAT(sysdc_update_times_already_done);
527 		return;
528 	}
529 
530 	/* Get the current times from the thread */
531 	sdc->sdc_pri_check = now;
532 	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
533 	*newO = sdc->sdc_cur_O;
534 
535 	/*
536 	 * The updating of microstate accounting is not done under a
537 	 * consistent set of locks, particularly the t_waitrq field.  This
538 	 * can lead to narrow windows in which we account for time in the
539 	 * wrong bucket, which on the next read will be accounted for
540 	 * correctly.
541 	 *
542 	 * If our sdc_base_* fields were affected by one of these blips, we
543 	 * throw away the old data, and pretend this tick didn't happen.
544 	 */
545 	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
546 	    sdc->sdc_cur_R < sdc->sdc_base_R) {
547 
548 		sdc->sdc_base_O = sdc->sdc_cur_O;
549 		sdc->sdc_base_R = sdc->sdc_cur_R;
550 
551 		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
552 		return;
553 	}
554 
555 	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
556 	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
557 }
558 
559 /*
560  * sysdc_compute_pri()
561  *
562  *	Recomputes the priority of the thread, leaving the result in
563  *	sdc->sdc_epri.  Returns 1 if a priority update should occur
564  *	(which will also trigger a cpu_surrender()), otherwise
565  *	returns 0.
566  */
567 static uint_t
568 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
569 {
570 	kthread_t *const t = sdc->sdc_thread;
571 	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
572 	const uint_t	tick = (flags & SDC_UPDATE_TICK);
573 
574 	hrtime_t	O, R;
575 	hrtime_t	newO = -1;
576 
577 	ASSERT(THREAD_LOCK_HELD(t));
578 
579 	sysdc_update_times(sdc, flags, &O, &R, &newO);
580 	ASSERT(!update || newO != -1);
581 
582 	/* If we have new data, recompute our priority. */
583 	if ((O + R) != 0) {
584 		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
585 
586 		/* Adjust our priority to move our DC closer to the target. */
587 		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
588 			sdc->sdc_pri = sdc->sdc_maxpri;
589 		else
590 			sdc->sdc_pri = sdc->sdc_minpri;
591 	}
592 
593 	/*
594 	 * If our per-pset duty cycle goes over the max, we will take a break.
595 	 * This forces all sysdc threads in the pset to minimum priority, in
596 	 * order to let everyone else have a chance at the CPU.
597 	 */
598 	if (sdc->sdc_pset->sdp_need_break) {
599 		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
600 		sdc->sdc_epri = sdc->sdc_minpri;
601 	} else {
602 		sdc->sdc_epri = sdc->sdc_pri;
603 	}
604 
605 	DTRACE_PROBE4(sysdc__compute__pri,
606 	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
607 	    uint_t, sdc->sdc_target_DC);
608 
609 	/*
610 	 * For sysdc_update(), we compute the ONPROC time for high-priority
611 	 * threads, which is used to calculate the per-pset duty cycle.  We
612 	 * will always tell our callers to update the thread's priority,
613 	 * since we want to force a cpu_surrender().
614 	 *
615 	 * We reset sdc_update_ticks so that sysdc_tick() will only update
616 	 * the thread's priority if our timeout is delayed by a tick or
617 	 * more.
618 	 */
619 	if (update) {
620 		/* SDC threads are not allowed to change cpupart bindings. */
621 		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
622 
623 		/* If we were at MAXPRI, account for our onproc time. */
624 		if (t->t_pri == sdc->sdc_maxpri &&
625 		    sdc->sdc_last_base_O != 0 &&
626 		    sdc->sdc_last_base_O < newO) {
627 			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
628 			sdc->sdc_pset->sdp_onproc_time +=
629 			    (uint64_t)sdc->sdc_last_O;
630 			sdc->sdc_pset->sdp_onproc_threads++;
631 		} else {
632 			sdc->sdc_last_O = 0;
633 		}
634 		sdc->sdc_last_base_O = newO;
635 
636 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
637 		return (1);
638 	}
639 
640 	/*
641 	 * Like sysdc_update(), sysdc_tick() always wants to update the
642 	 * thread's priority, so that the CPU is surrendered if necessary.
643 	 * We reset sdc_update_ticks so that if the timeout continues to be
644 	 * delayed, we'll update at the regular interval.
645 	 */
646 	if (tick) {
647 		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
648 		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
649 		return (1);
650 	}
651 
652 	/*
653 	 * Otherwise, only tell our callers to update the priority if it has
654 	 * changed.
655 	 */
656 	return (sdc->sdc_epri != t->t_pri);
657 }
658 
659 static void
660 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
661 {
662 	kthread_t *t = sdc->sdc_thread;
663 
664 	ASSERT(THREAD_LOCK_HELD(t));
665 
666 	if (sysdc_compute_pri(sdc, flags)) {
667 		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
668 			cpu_surrender(t);
669 		}
670 	}
671 }
672 
673 /*
674  * Add a thread onto the active list.  It will only be removed by
675  * sysdc_update().
676  */
677 static void
678 sysdc_activate(sysdc_t *sdc)
679 {
680 	sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
681 	sysdc_t		*head;
682 	kthread_t	*t = sdc->sdc_thread;
683 
684 	SYSDC_INC_STAT(sysdc_activate_enter);
685 
686 	ASSERT(sdc->sdc_next == NULL);
687 	ASSERT(THREAD_LOCK_HELD(t));
688 
689 	do {
690 		head = *headp;
691 		sdc->sdc_next = head;
692 	} while (atomic_cas_ptr(headp, head, sdc) != head);
693 }
694 
695 /*
696  * sysdc_update() has two jobs:
697  *
698  *	1. It updates the priorities of all active SDC threads on the system.
699  *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
700  */
701 static void
702 sysdc_update(void *arg)
703 {
704 	int		idx;
705 	sysdc_t		*freelist = NULL;
706 	sysdc_pset_t	*cur;
707 	hrtime_t	now, diff;
708 	uint_t		redeploy = 1;
709 
710 	SYSDC_INC_STAT(sysdc_update_enter);
711 
712 	ASSERT(sysdc_update_timeout_started);
713 
714 	/*
715 	 * If this is our first time through, diff will be gigantic, and
716 	 * no breaks will be necessary.
717 	 */
718 	now = gethrtime();
719 	diff = now - sysdc_last_update;
720 	sysdc_last_update = now;
721 
722 	mutex_enter(&sysdc_pset_lock);
723 	for (cur = list_head(&sysdc_psets); cur != NULL;
724 	    cur = list_next(&sysdc_psets, cur)) {
725 		boolean_t breaking = (cur->sdp_should_break != 0);
726 
727 		if (cur->sdp_need_break != breaking) {
728 			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
729 			    boolean_t, breaking);
730 		}
731 		cur->sdp_onproc_time = 0;
732 		cur->sdp_onproc_threads = 0;
733 		cur->sdp_need_break = breaking;
734 	}
735 	mutex_exit(&sysdc_pset_lock);
736 
737 	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
738 		sysdc_list_t		*sdl = &sysdc_active[idx];
739 		sysdc_t *volatile	*headp = &sdl->sdl_list;
740 		sysdc_t			*head, *tail;
741 		sysdc_t			**prevptr;
742 
743 		if (*headp == &sysdc_dummy)
744 			continue;
745 
746 		/* Prevent any threads from exiting while we're poking them. */
747 		mutex_enter(&sdl->sdl_lock);
748 
749 		/*
750 		 * Each sdl_list contains a singly-linked list of active
751 		 * threads. Threads which become active while we are
752 		 * processing the list will be added to sdl_list.  Since we
753 		 * don't want that to interfere with our own processing, we
754 		 * swap in an empty list.  Any newly active threads will
755 		 * go on to this empty list.  When finished, we'll put any
756 		 * such threads at the end of the processed list.
757 		 */
758 		head = atomic_swap_ptr(headp, &sysdc_dummy);
759 		prevptr = &head;
760 		while (*prevptr != &sysdc_dummy) {
761 			sysdc_t		*const	sdc = *prevptr;
762 			kthread_t	*const	t = sdc->sdc_thread;
763 
764 			/*
765 			 * If the thread has exited, move its sysdc_t onto
766 			 * freelist, to be freed later.
767 			 */
768 			if (t == NULL) {
769 				*prevptr = sdc->sdc_next;
770 				SYSDC_INC_STAT(sysdc_update_exited);
771 				sdc->sdc_next = freelist;
772 				freelist = sdc;
773 				continue;
774 			}
775 
776 			thread_lock(t);
777 			if (t->t_cid != sysdccid) {
778 				thread_unlock(t);
779 				prevptr = &sdc->sdc_next;
780 				SYSDC_INC_STAT(sysdc_update_not_sdc);
781 				continue;
782 			}
783 			ASSERT(t->t_cldata == sdc);
784 
785 			/*
786 			 * If the thread has been sleeping for longer
787 			 * than sysdc_prune_interval, make it inactive by
788 			 * removing it from the list.
789 			 */
790 			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
791 			    sdc->sdc_sleep_updates != 0 &&
792 			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
793 			    sysdc_prune_updates) {
794 				*prevptr = sdc->sdc_next;
795 				SYSDC_INC_STAT(sysdc_update_idle);
796 				sdc->sdc_next = NULL;
797 				thread_unlock(t);
798 				continue;
799 			}
800 			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
801 			thread_unlock(t);
802 
803 			prevptr = &sdc->sdc_next;
804 		}
805 
806 		/*
807 		 * Add our list to the bucket, putting any new entries
808 		 * added while we were working at the tail of the list.
809 		 */
810 		do {
811 			tail = *headp;
812 			*prevptr = tail;
813 		} while (atomic_cas_ptr(headp, tail, head) != tail);
814 
815 		mutex_exit(&sdl->sdl_lock);
816 	}
817 
818 	mutex_enter(&sysdc_pset_lock);
819 	for (cur = list_head(&sysdc_psets); cur != NULL;
820 	    cur = list_next(&sysdc_psets, cur)) {
821 
822 		cur->sdp_vtime_last_interval =
823 		    diff * cur->sdp_cpupart->cp_ncpus;
824 		cur->sdp_DC_last_interval =
825 		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
826 		    cur->sdp_vtime_last_interval;
827 
828 		if (cur->sdp_should_break > 0) {
829 			cur->sdp_should_break--;	/* breaking */
830 			continue;
831 		}
832 		if (cur->sdp_dont_break > 0) {
833 			cur->sdp_dont_break--;	/* waiting before checking */
834 			continue;
835 		}
836 		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
837 			cur->sdp_should_break = sysdc_break_updates;
838 			cur->sdp_dont_break = sysdc_nobreak_updates;
839 			SYSDC_INC_STAT(sysdc_update_take_break);
840 		}
841 	}
842 
843 	/*
844 	 * If there are no sysdc_psets, there can be no threads, so
845 	 * we can stop doing our timeout.  Since we're holding the
846 	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
847 	 * prevent anyone from racing with this and dropping our timeout
848 	 * on the floor.
849 	 */
850 	if (list_is_empty(&sysdc_psets)) {
851 		SYSDC_INC_STAT(sysdc_update_no_psets);
852 		ASSERT(sysdc_update_timeout_started);
853 		sysdc_update_timeout_started = 0;
854 
855 		redeploy = 0;
856 	}
857 	mutex_exit(&sysdc_pset_lock);
858 
859 	while (freelist != NULL) {
860 		sysdc_t *cur = freelist;
861 		freelist = cur->sdc_next;
862 		kmem_free(cur, sizeof (*cur));
863 	}
864 
865 	if (redeploy) {
866 		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
867 	}
868 }
869 
870 static void
871 sysdc_preempt(kthread_t *t)
872 {
873 	ASSERT(t == curthread);
874 	ASSERT(THREAD_LOCK_HELD(t));
875 
876 	setbackdq(t);		/* give others a chance to run */
877 }
878 
879 static void
880 sysdc_tick(kthread_t *t)
881 {
882 	sysdc_t *sdc;
883 
884 	thread_lock(t);
885 	if (t->t_cid != sysdccid) {
886 		SYSDC_INC_STAT(sysdc_tick_not_sdc);
887 		thread_unlock(t);
888 		return;
889 	}
890 	sdc = t->t_cldata;
891 	if (t->t_state == TS_ONPROC &&
892 	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
893 		cpu_surrender(t);
894 	}
895 
896 	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
897 		ASSERT(sdc->sdc_sleep_updates == 0);
898 	}
899 
900 	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
901 	sdc->sdc_ticks++;
902 	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
903 		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
904 		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
905 		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
906 	}
907 	thread_unlock(t);
908 }
909 
910 static void
911 sysdc_setrun(kthread_t *t)
912 {
913 	sysdc_t *sdc = t->t_cldata;
914 
915 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
916 
917 	sdc->sdc_sleep_updates = 0;
918 
919 	if (sdc->sdc_next == NULL) {
920 		/*
921 		 * Since we're in transition, we don't want to use the
922 		 * full thread_update_pri().
923 		 */
924 		if (sysdc_compute_pri(sdc, 0)) {
925 			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
926 		}
927 		sysdc_activate(sdc);
928 
929 		ASSERT(sdc->sdc_next != NULL);
930 	}
931 
932 	setbackdq(t);
933 }
934 
935 static void
936 sysdc_wakeup(kthread_t *t)
937 {
938 	sysdc_setrun(t);
939 }
940 
941 static void
942 sysdc_sleep(kthread_t *t)
943 {
944 	sysdc_t *sdc = t->t_cldata;
945 
946 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
947 
948 	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
949 }
950 
951 /*ARGSUSED*/
952 static int
953 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
954     void *bufp)
955 {
956 	cpupart_t *const cpupart = t->t_cpupart;
957 	sysdc_t *sdc = bufp;
958 	sysdc_params_t *sdpp = parmsp;
959 	sysdc_pset_t *newpset = sdc->sdc_pset;
960 	sysdc_pset_t *pset;
961 	int start_timeout;
962 
963 	if (t->t_cid != syscid)
964 		return (EPERM);
965 
966 	ASSERT(ttolwp(t) != NULL);
967 	ASSERT(sdpp != NULL);
968 	ASSERT(newpset != NULL);
969 	ASSERT(sysdc_param_init);
970 
971 	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
972 	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
973 	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
974 	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
975 
976 	sdc->sdc_thread = t;
977 	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
978 	sdc->sdc_minpri = sdpp->sdp_minpri;
979 	sdc->sdc_maxpri = sdpp->sdp_maxpri;
980 	sdc->sdc_target_DC = sdpp->sdp_DC;
981 	sdc->sdc_ticks = 0;
982 	sdc->sdc_update_ticks = sysdc_update_ticks + 1;
983 
984 	/* Assign ourselves to the appropriate pset. */
985 	sdc->sdc_pset = NULL;
986 	mutex_enter(&sysdc_pset_lock);
987 	for (pset = list_head(&sysdc_psets); pset != NULL;
988 	    pset = list_next(&sysdc_psets, pset)) {
989 		if (pset->sdp_cpupart == cpupart) {
990 			break;
991 		}
992 	}
993 	if (pset == NULL) {
994 		pset = newpset;
995 		newpset = NULL;
996 		pset->sdp_cpupart = cpupart;
997 		list_insert_tail(&sysdc_psets, pset);
998 	}
999 	pset->sdp_nthreads++;
1000 	ASSERT(pset->sdp_nthreads > 0);
1001 
1002 	sdc->sdc_pset = pset;
1003 
1004 	start_timeout = (sysdc_update_timeout_started == 0);
1005 	sysdc_update_timeout_started = 1;
1006 	mutex_exit(&sysdc_pset_lock);
1007 
1008 	if (newpset != NULL)
1009 		kmem_free(newpset, sizeof (*newpset));
1010 
1011 	/* Update t's scheduling class and priority. */
1012 	thread_lock(t);
1013 	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1014 	t->t_cid = cid;
1015 	t->t_cldata = sdc;
1016 	t->t_schedflag |= TS_RUNQMATCH;
1017 
1018 	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1019 	thread_unlock(t);
1020 
1021 	/* Kick off the update timeout if we're the first one in. */
1022 	if (start_timeout) {
1023 		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1024 	}
1025 
1026 	return (0);
1027 }
1028 
1029 static void
1030 sysdc_leave(sysdc_t *sdc)
1031 {
1032 	sysdc_pset_t *sdp = sdc->sdc_pset;
1033 	sysdc_list_t *sdl = SYSDC_LIST(sdc);
1034 	uint_t freedc;
1035 
1036 	mutex_enter(&sdl->sdl_lock);		/* block sysdc_update() */
1037 	sdc->sdc_thread = NULL;
1038 	freedc = (sdc->sdc_next == NULL);
1039 	mutex_exit(&sdl->sdl_lock);
1040 
1041 	mutex_enter(&sysdc_pset_lock);
1042 	sdp = sdc->sdc_pset;
1043 	ASSERT(sdp != NULL);
1044 	ASSERT(sdp->sdp_nthreads > 0);
1045 	--sdp->sdp_nthreads;
1046 	if (sdp->sdp_nthreads == 0) {
1047 		list_remove(&sysdc_psets, sdp);
1048 	} else {
1049 		sdp = NULL;
1050 	}
1051 	mutex_exit(&sysdc_pset_lock);
1052 
1053 	if (freedc)
1054 		kmem_free(sdc, sizeof (*sdc));
1055 	if (sdp != NULL)
1056 		kmem_free(sdp, sizeof (*sdp));
1057 }
1058 
1059 static void
1060 sysdc_exitclass(void *buf)
1061 {
1062 	sysdc_leave((sysdc_t *)buf);
1063 }
1064 
1065 /*ARGSUSED*/
1066 static int
1067 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1068 {
1069 	/* Threads cannot exit SDC once joined, except in a body bag. */
1070 	return (EPERM);
1071 }
1072 
1073 static void
1074 sysdc_exit(kthread_t *t)
1075 {
1076 	sysdc_t *sdc;
1077 
1078 	/* We're exiting, so we just rejoin the SYS class. */
1079 	thread_lock(t);
1080 	ASSERT(t->t_cid == sysdccid);
1081 	sdc = t->t_cldata;
1082 	t->t_cid = syscid;
1083 	t->t_cldata = NULL;
1084 	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1085 	(void) thread_change_pri(t, maxclsyspri, 0);
1086 	t->t_schedflag &= ~TS_RUNQMATCH;
1087 	thread_unlock_nopreempt(t);
1088 
1089 	/* Unlink the sdc from everything. */
1090 	sysdc_leave(sdc);
1091 }
1092 
1093 /*ARGSUSED*/
1094 static int
1095 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1096 {
1097 	/*
1098 	 * Threads cannot be created with SDC as their class; they must
1099 	 * be created as SYS and then added with sysdc_thread_enter().
1100 	 * Because of this restriction, sysdc_fork() should never be called.
1101 	 */
1102 	panic("sysdc cannot be forked");
1103 
1104 	return (ENOSYS);
1105 }
1106 
1107 /*ARGSUSED*/
1108 static void
1109 sysdc_forkret(kthread_t *t, kthread_t *ct)
1110 {
1111 	/* SDC threads are part of system processes, which never fork. */
1112 	panic("sysdc cannot be forked");
1113 }
1114 
1115 static pri_t
1116 sysdc_globpri(kthread_t *t)
1117 {
1118 	return (t->t_epri);
1119 }
1120 
1121 /*ARGSUSED*/
1122 static pri_t
1123 sysdc_no_swap(kthread_t *t, int flags)
1124 {
1125 	/* SDC threads cannot be swapped. */
1126 	return (-1);
1127 }
1128 
1129 /*
1130  * Get maximum and minimum priorities enjoyed by SDC threads.
1131  */
1132 static int
1133 sysdc_getclpri(pcpri_t *pcprip)
1134 {
1135 	pcprip->pc_clpmax = sysdc_maxpri;
1136 	pcprip->pc_clpmin = sysdc_minpri;
1137 	return (0);
1138 }
1139 
1140 /*ARGSUSED*/
1141 static int
1142 sysdc_getclinfo(void *arg)
1143 {
1144 	return (0);		/* no class-specific info */
1145 }
1146 
1147 /*ARGSUSED*/
1148 static int
1149 sysdc_alloc(void **p, int flag)
1150 {
1151 	sysdc_t *new;
1152 
1153 	*p = NULL;
1154 	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1155 		return (ENOMEM);
1156 	}
1157 	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1158 	    NULL) {
1159 		kmem_free(new, sizeof (*new));
1160 		return (ENOMEM);
1161 	}
1162 	*p = new;
1163 	return (0);
1164 }
1165 
1166 static void
1167 sysdc_free(void *p)
1168 {
1169 	sysdc_t *sdc = p;
1170 
1171 	if (sdc != NULL) {
1172 		/*
1173 		 * We must have failed CL_ENTERCLASS(), so our pset should be
1174 		 * there and unused.
1175 		 */
1176 		ASSERT(sdc->sdc_pset != NULL);
1177 		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1178 		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1179 		kmem_free(sdc, sizeof (*sdc));
1180 	}
1181 }
1182 
1183 static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
1184 static int sysdc_einval();
1185 static void sysdc_nullsys();
1186 
1187 static struct classfuncs sysdc_classfuncs = {
1188 	/* messages to class manager */
1189 	{
1190 		sysdc_enosys,	/* admin */
1191 		sysdc_getclinfo,
1192 		sysdc_enosys,	/* parmsin */
1193 		sysdc_enosys,	/* parmsout */
1194 		sysdc_enosys,	/* vaparmsin */
1195 		sysdc_enosys,	/* vaparmsout */
1196 		sysdc_getclpri,
1197 		sysdc_alloc,
1198 		sysdc_free,
1199 	},
1200 	/* operations on threads */
1201 	{
1202 		sysdc_enterclass,
1203 		sysdc_exitclass,
1204 		sysdc_canexit,
1205 		sysdc_fork,
1206 		sysdc_forkret,
1207 		sysdc_nullsys,	/* parmsget */
1208 		sysdc_enosys,	/* parmsset */
1209 		sysdc_nullsys,	/* stop */
1210 		sysdc_exit,
1211 		sysdc_nullsys,	/* active */
1212 		sysdc_nullsys,	/* inactive */
1213 		sysdc_no_swap,	/* swapin */
1214 		sysdc_no_swap,	/* swapout */
1215 		sysdc_nullsys,	/* trapret */
1216 		sysdc_preempt,
1217 		sysdc_setrun,
1218 		sysdc_sleep,
1219 		sysdc_tick,
1220 		sysdc_wakeup,
1221 		sysdc_einval,	/* donice */
1222 		sysdc_globpri,
1223 		sysdc_nullsys,	/* set_process_group */
1224 		sysdc_nullsys,	/* yield */
1225 		sysdc_einval,	/* doprio */
1226 	}
1227 };
1228 
1229 static int
1230 sysdc_enosys()
1231 {
1232 	return (ENOSYS);
1233 }
1234 
1235 static int
1236 sysdc_einval()
1237 {
1238 	return (EINVAL);
1239 }
1240 
1241 static void
1242 sysdc_nullsys()
1243 {
1244 }
1245 
1246 /*ARGSUSED*/
1247 static pri_t
1248 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1249 {
1250 	int idx;
1251 
1252 	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1253 	    offsetof(sysdc_pset_t, sdp_node));
1254 
1255 	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1256 		sysdc_active[idx].sdl_list = &sysdc_dummy;
1257 	}
1258 
1259 	sysdc_initparam();
1260 
1261 	sysdccid = cid;
1262 	*clfuncspp = &sysdc_classfuncs;
1263 
1264 	return ((pri_t)v.v_maxsyspri);
1265 }
1266 
1267 static struct sclass csw = {
1268 	"SDC",
1269 	sysdc_init,
1270 	0
1271 };
1272 
1273 static struct modlsched modlsched = {
1274 	&mod_schedops, "system duty cycle scheduling class", &csw
1275 };
1276 
1277 static struct modlinkage modlinkage = {
1278 	MODREV_1, (void *)&modlsched, NULL
1279 };
1280 
1281 int
1282 _init()
1283 {
1284 	return (mod_install(&modlinkage));
1285 }
1286 
1287 int
1288 _fini()
1289 {
1290 	return (EBUSY);		/* can't unload for now */
1291 }
1292 
1293 int
1294 _info(struct modinfo *modinfop)
1295 {
1296 	return (mod_info(&modlinkage, modinfop));
1297 }
1298 
1299 /* --- consolidation-private interfaces --- */
1300 void
1301 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1302 {
1303 	void *buf = NULL;
1304 	sysdc_params_t sdp;
1305 
1306 	SYSDC_INC_STAT(sysdc_thread_enter_enter);
1307 
1308 	ASSERT(sysdc_param_init);
1309 	ASSERT(sysdccid >= 0);
1310 
1311 	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1312 
1313 	sdp.sdp_minpri = sysdc_minpri;
1314 	sdp.sdp_maxpri = sysdc_maxpri;
1315 	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1316 
1317 	if (flags & SYSDC_THREAD_BATCH)
1318 		sdp.sdp_maxpri -= sysdc_batch_niceness;
1319 
1320 	VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
1321 
1322 	ASSERT(t->t_lwp != NULL);
1323 	ASSERT(t->t_cid == syscid);
1324 	ASSERT(t->t_cldata == NULL);
1325 	VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
1326 	VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
1327 	CL_EXITCLASS(syscid, NULL);
1328 }
1329