/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The System Duty Cycle (SDC) scheduling class
 * --------------------------------------------
 *
 * Background
 *
 * Kernel threads in Solaris have traditionally not been large consumers
 * of CPU time.  They typically wake up, perform a small amount of
 * work, then go back to sleep waiting for either a timeout or another
 * signal.  On the assumption that the small amount of work that they do
 * is important for the behavior of the whole system, these threads are
 * treated kindly by the dispatcher and the SYS scheduling class: they run
 * without preemption from anything other than real-time and interrupt
 * threads; when preempted, they are put at the front of the queue, so they
 * generally do not migrate between CPUs; and they are allowed to stay
 * running until they voluntarily give up the CPU.
 *
 * As Solaris has evolved, new workloads have emerged which require the
 * kernel to perform significant amounts of CPU-intensive work.  One
 * example of such a workload is ZFS's transaction group sync processing.
 * Each sync operation generates a large batch of I/Os, and each I/O
 * may need to be compressed and/or checksummed before it is written to
 * storage.  The taskq threads which perform the compression and checksums
 * will run nonstop as long as they have work to do; a large sync operation
 * on a compression-heavy dataset can keep them busy for seconds on end.
 * This causes human-time-scale dispatch latency bubbles for any other
 * threads which have the misfortune to share a CPU with the taskq threads.
 *
 * The SDC scheduling class is a solution to this problem.
 *
 *
 * Overview
 *
 * SDC is centered around the concept of a thread's duty cycle (DC):
 *
 *			      ONPROC time
 *	Duty Cycle =	----------------------
 *			ONPROC + Runnable time
 *
 * This is the ratio of the time that the thread spent running on a CPU
 * divided by the time it spent running or trying to run.  It is unaffected
 * by any time the thread spent sleeping, stopped, etc.
 *
 * A thread joining the SDC class specifies a "target" DC that it wants
 * to run at.  To implement this policy, the routine sysdc_update() scans
 * the list of active SDC threads every few ticks and uses each thread's
 * microstate data to compute the actual duty cycle that that thread
 * has experienced recently.  If the thread is under its target DC, its
 * priority is increased to the maximum available (sysdc_maxpri, which is
 * 99 by default).  If the thread is over its target DC, its priority is
 * reduced to the minimum available (sysdc_minpri, 0 by default).  This
 * is a fairly primitive approach, in that it doesn't use any of the
 * intermediate priorities, but it's not completely inappropriate.
 * Even though threads in the SDC class might take a while to do their
 * job, they are by some definition important if they're running inside
 * the kernel, so it is reasonable that they should get to run at
 * priority 99.
 *
 * If a thread is running when sysdc_update() calculates its actual duty
 * cycle, and there are other threads of equal or greater priority on its
 * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
 * acknowledges the preemption by calling sysdc_preempt(), which calls
 * setbackdq(), which gives other threads with the same priority a chance
 * to run.  This creates a de facto time quantum for threads in the SDC
 * scheduling class.
 *
 * An SDC thread which is assigned priority 0 can continue to run if
 * nothing else needs to use the CPU that it's running on.  Similarly, an
 * SDC thread at priority 99 might not get to run as much as it wants to
 * if there are other priority-99 or higher threads on its CPU.  These
 * situations would cause the thread to get ahead of or behind its target
 * DC; the longer the situations lasted, the further ahead or behind the
 * thread would get.  Rather than condemning a thread to a lifetime of
 * paying for its youthful indiscretions, SDC keeps "base" values for
 * ONPROC and Runnable times in each thread's sysdc data, and updates these
 * values periodically.  The duty cycle is then computed using the elapsed
 * amount of ONPROC and Runnable times since those base times.
 *
 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 * keep the list of "active" threads small by pruning out threads which
 * have been asleep for a brief time.  They are not pruned immediately upon
 * going to sleep, since some threads may bounce back and forth between
 * sleeping and being runnable.
 *
 *
 * Interfaces
 *
 *	void sysdc_thread_enter(t, dc, flags)
 *
 *		Moves a kernel thread from the SYS scheduling class to the
 *		SDC class.  t must have an associated LWP (created by
 *		calling lwp_kernel_create()).  The thread will have a
 *		target DC of dc.  Flags should be either 0 or
 *		SYSDC_THREAD_BATCH.  If SYSDC_THREAD_BATCH is specified,
 *		the thread will run with a slightly lower priority (see
 *		"Batch threads", below).
 *
 *
 * Complications
 *
 * - Run queue balancing
 *
 *	The Solaris dispatcher is biased towards letting a thread run
 *	on the same CPU which it last ran on, if no more than 3 ticks
 *	(i.e. rechoose_interval) have passed since the thread last ran.
 *	This helps to preserve cache warmth.  On the other hand, it also
 *	tries to keep the per-CPU run queues fairly balanced; if the CPU
 *	chosen for a runnable thread has a run queue which is three or
 *	more threads longer than a neighboring CPU's queue, the runnable
 *	thread is dispatched onto the neighboring CPU instead.
 *
 *	These policies work well for some workloads, but not for many SDC
 *	threads.  The taskq client of SDC, for example, has many discrete
 *	units of work to do.  The work units are largely independent, so
 *	cache warmth is not an important consideration.  It is important
 *	that the threads fan out quickly to different CPUs, since the
 *	amount of work these threads have to do (a few seconds worth at a
 *	time) doesn't leave much time to correct thread placement errors
 *	(i.e. two SDC threads being dispatched to the same CPU).
 *
 *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 *	This tells the dispatcher to keep neighboring run queues' lengths
 *	more evenly matched, which allows SDC threads to migrate more
 *	easily.
 *
 * - LWPs and system processes
 *
 *	SDC can only be used for kernel threads.  Since SDC uses microstate
 *	accounting data to compute each thread's actual duty cycle, all
 *	threads entering the SDC class must have associated LWPs (which
 *	store the microstate data).  This means that the threads have to
 *	be associated with an SSYS process, i.e. one created by newproc().
 *	If the microstate accounting information is ever moved into the
 *	kthread_t, this restriction could be lifted.
 *
 * - Dealing with oversubscription
 *
 *	Since SDC duty cycles are per-thread, it is possible that the
 *	aggregate requested duty cycle of all SDC threads in a processor
 *	set could be greater than the total CPU time available in that set.
 *	The FSS scheduling class has an analogous situation, which it deals
 *	with by reducing each thread's allotted CPU time proportionally.
 *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
 *	solution to the oversubscription problem.
 *
 *	sysdc_update() accumulates the amount of time that max-priority SDC
 *	threads have spent on-CPU in each processor set, and uses that sum
 *	to create an implied duty cycle for that processor set:
 *
 *			      accumulated CPU time
 *	pset DC =	-----------------------------------
 *			(# CPUs) * time since last update
 *
 *	If this implied duty cycle is above a maximum pset duty cycle (90%
 *	by default), sysdc_update() sets the priority of all SDC threads
 *	in that processor set to sysdc_minpri for a "break" period.  After
 *	the break period, it waits for a "nobreak" period before trying to
 *	enforce the pset duty cycle limit again.
 *
 * - Processor sets
 *
 *	As the above implies, SDC is processor set aware, but it does not
 *	currently allow threads to change processor sets while in the SDC
 *	class.  Instead, those threads must join the desired processor set
 *	before entering SDC. [1]
 *
 * - Batch threads
 *
 *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *	flag.  This flag causes the maximum priority for that thread to be
 *	reduced (by default, the maximum is reduced by 1).  This allows
 *	longer-running, batch-oriented SDC threads to be interrupted by
 *	more immediate, higher-priority work.
 *
 * - t_kpri_req
 *
 *	The TS and FSS scheduling classes pay attention to t_kpri_req,
 *	which provides a simple form of priority inheritance for
 *	synchronization primitives (such as rwlocks held as READER) which
 *	cannot be traced to a unique thread.  The SDC class does not honor
 *	t_kpri_req, for a few reasons:
 *
 *	1.  t_kpri_req is notoriously inaccurate.  A measure of its
 *	    inaccuracy is that it needs to be cleared every time a thread
 *	    returns to user mode, because it is frequently non-zero at that
 *	    point.  This can happen because "ownership" of synchronization
 *	    primitives that use t_kpri_req can be silently handed off,
 *	    leaving no opportunity to will the t_kpri_req inheritance.
 *
 *	2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
 *	    kernel priority.  This means that even if an SDC thread
 *	    is holding a synchronization primitive and running at low
 *	    priority, its priority will eventually be raised above 60,
 *	    allowing it to drive on and release the resource.
 *
 *	3.  The first consumer of SDC uses the taskq subsystem, which holds
 *	    a reader lock for the duration of the task's execution.  This
 *	    would mean that SDC threads would never drop below kernel
 *	    priority in practice, which defeats one of the purposes of SDC.
 *
 * - Why not FSS?
 *
 *	It might seem that the existing FSS scheduling class could solve
 *	the problems that SDC is attempting to solve.  FSS's more precise
 *	solution to the oversubscription problem would hardly cause
 *	trouble, as long as it performed well.  SDC is implemented as
 *	a separate scheduling class for two main reasons: the initial
 *	consumer of SDC does not map well onto the "project" abstraction
 *	that is central to FSS, and FSS does not expect to run at kernel
 *	priorities.
 *
 *
 * Tunables
 *
 * - sysdc_batch_niceness:  The amount below sysdc_maxpri that
 *	SYSDC_THREAD_BATCH threads should use as their per-thread
 *	maximum priority.
 *
 * - sysdc_update_interval_msec:  Number of milliseconds between
 *	consecutive thread priority updates.
 *
 * - sysdc_reset_interval_msec:  Number of milliseconds between
 *	consecutive resets of a thread's base ONPROC and Runnable
 *	times.
 *
 * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 *	before a thread is pruned from the active list.
 *
 * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
 *	CPU time which SDC can give to its high-priority threads.
 *
 * - sysdc_break_msec:  Number of milliseconds of "break" taken when
 *	sysdc_max_pset_DC is exceeded.
 *
 *
 * Future work (in SDC and related subsystems)
 *
 * - Per-thread rechoose interval (0 for SDC)
 *
 *	Allow each thread to specify its own rechoose interval.  SDC
 *	threads would specify an interval of zero, which would rechoose
 *	the CPU with the lowest priority once per update.
 *
 * - Allow threads to change processor sets after joining the SDC class
 *
 * - Thread groups and per-group DC
 *
 *	It might be nice to be able to specify a duty cycle which applies
 *	to a group of threads in aggregate.
 *
 * - Per-group DC callback to allow dynamic DC tuning
 *
 *	Currently, DCs are assigned when the thread joins SDC.  Some
 *	workloads could benefit from being able to tune their DC using
 *	subsystem-specific knowledge about the workload.
 *
 * - Finer-grained priority updates
 *
 * - More nuanced management of oversubscription
 *
 * - Moving other CPU-intensive threads into SDC
 *
 * - Move msacct data into kthread_t
 *
 *	This would allow kernel threads without LWPs to join SDC.
 *
 *
 * Footnotes
 *
 * [1] The details of doing so are left as an exercise for the reader.
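 *
 *
 * Example
 *
 * An illustrative sketch (not a requirement imposed by this file) of how
 * a client might use the interface described above; the 80% target duty
 * cycle is a made-up value, and the lwp_kernel_create() arguments are
 * whatever the client needs:
 *
 *	kthread_t *t;
 *
 *	t = lwp_kernel_create(...);		LWP-backed thread in an
 *						SSYS process
 *	sysdc_thread_enter(t, 80, 0);		run t at ~80% duty cycle
 *
 * or, for batch-oriented work which should defer to more immediate SDC
 * threads:
 *
 *	sysdc_thread_enter(t, 80, SYSDC_THREAD_BATCH);
 *
 * Once a thread has entered SDC it cannot change scheduling class again;
 * it stays in SDC until it exits.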
 */

#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>
#include <sys/atomic.h>
#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/list.h>
#include <sys/modctl.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/systm.h>
#include <sys/thread.h>

/*
 * Tunables - loaded into the internal state at module load time
 */
uint_t		sysdc_update_interval_msec = 20;
uint_t		sysdc_reset_interval_msec = 400;
uint_t		sysdc_prune_interval_msec = 100;
uint_t		sysdc_max_pset_DC = 90;
uint_t		sysdc_break_msec = 80;
pri_t		sysdc_batch_niceness = 1;

/*
 * Internal state - constants set up by sysdc_initparam()
 */
static clock_t	sysdc_update_ticks;	/* ticks between updates */
static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
static uint_t	sysdc_reset_updates;	/* # of updates before reset */
static uint_t	sysdc_break_updates;	/* updates to break */
static uint_t	sysdc_nobreak_updates;	/* updates to not check */
static uint_t	sysdc_minDC;		/* minimum allowed DC */
static uint_t	sysdc_maxDC;		/* maximum allowed DC */
static pri_t	sysdc_minpri;		/* minimum allowed priority */
static pri_t	sysdc_maxpri;		/* maximum allowed priority */

/*
 * Internal state
 */
static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
static list_t	sysdc_psets;		/* list of psets with SDC threads */
static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
static uint_t	sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
static sysdc_t	sysdc_dummy;		/* used to terminate active lists */

/*
 * Internal state - active hash table
 */
#define	SYSDC_NLISTS	8
#define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t	sysdc_active[SYSDC_NLISTS];

#define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])

#ifdef DEBUG
static struct {
	uint64_t	sysdc_update_times_asleep;
	uint64_t	sysdc_update_times_base_ran_backwards;
	uint64_t	sysdc_update_times_already_done;
	uint64_t	sysdc_update_times_cur_ran_backwards;
	uint64_t	sysdc_compute_pri_breaking;
	uint64_t	sysdc_activate_enter;
	uint64_t	sysdc_update_enter;
	uint64_t	sysdc_update_exited;
	uint64_t	sysdc_update_not_sdc;
	uint64_t	sysdc_update_idle;
	uint64_t	sysdc_update_take_break;
	uint64_t	sysdc_update_no_psets;
	uint64_t	sysdc_tick_not_sdc;
	uint64_t	sysdc_tick_quantum_expired;
	uint64_t	sysdc_thread_enter_enter;
} sysdc_stats;

#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
#else
#define	SYSDC_INC_STAT(x)	((void)0)
#endif

/* macros are UPPER CASE */
#define	HOWMANY(a, b)	howmany((a), (b))
#define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)

static void
sysdc_initparam(void)
{
	uint_t sysdc_break_ticks;

	/* update / prune intervals */
	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);

	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
	    sysdc_update_interval_msec);
	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
	    sysdc_update_interval_msec);

	/* We must get at least a little time on CPU.
*/ sysdc_minDC = 1; sysdc_maxDC = SYSDC_DC_MAX; sysdc_minpri = 0; sysdc_maxpri = maxclsyspri; /* break parameters */ if (sysdc_max_pset_DC > SYSDC_DC_MAX) { sysdc_max_pset_DC = SYSDC_DC_MAX; } sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec); sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks); /* * We want: * * sysdc_max_pset_DC = (nobreak / (break + nobreak)) * * ==> nobreak = sysdc_max_pset_DC * (break + nobreak) * * sysdc_max_pset_DC * break * ==> nobreak = ------------------------- * 1 - sysdc_max_pset_DC */ sysdc_nobreak_updates = HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC, (SYSDC_DC_MAX - sysdc_max_pset_DC)); sysdc_param_init = 1; } #undef HOWMANY #undef MSECTOTICKS #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */ #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */ #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */ /* * Updates the recorded times in the sdc, and returns the elapsed ONPROC * and Runnable times since the last reset. * * newO is the thread's actual ONPROC time; it's used during sysdc_update() * to track processor set usage. */ static void sysdc_update_times(sysdc_t *sdc, uint_t flags, hrtime_t *O, hrtime_t *R, hrtime_t *newO) { kthread_t *const t = sdc->sdc_thread; const uint_t initial = (flags & SDC_UPDATE_INITIAL); const uint_t update = (flags & SDC_UPDATE_TIMEOUT); const clock_t now = ddi_get_lbolt(); uint_t do_reset; ASSERT(THREAD_LOCK_HELD(t)); *O = *R = 0; /* If we've been sleeping, we know we haven't had any ONPROC time. */ if (sdc->sdc_sleep_updates != 0 && sdc->sdc_sleep_updates != sdc->sdc_nupdates) { *newO = sdc->sdc_last_base_O; SYSDC_INC_STAT(sysdc_update_times_asleep); return; } /* * If this is our first update, or we've hit the reset point, * we need to reset our base_{O,R}. Once we've updated them, we * report O and R for the entire prior interval. */ do_reset = initial; if (update) { ++sdc->sdc_nupdates; if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0) do_reset = 1; } if (do_reset) { hrtime_t baseO, baseR; if (initial) { /* * Start off our cycle count somewhere in the middle, * to keep the resets from all happening at once. * * 4999 is a handy prime much larger than * sysdc_reset_updates, so that we don't run into * trouble if the resolution is a multiple of * sysdc_reset_updates. */ sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) % sysdc_reset_updates); baseO = baseR = 0; } else { baseO = sdc->sdc_base_O; baseR = sdc->sdc_base_R; } mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R); *newO = sdc->sdc_base_O; sdc->sdc_reset = now; sdc->sdc_pri_check = -1; /* force mismatch below */ /* * See below for rationale. */ if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) { SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards); baseO = sdc->sdc_base_O; baseR = sdc->sdc_base_R; } /* compute based on the entire interval */ *O = (sdc->sdc_base_O - baseO); *R = (sdc->sdc_base_R - baseR); return; } /* * If we're called from sysdc_update(), we *must* return a value * for newO, so we always call mstate_systhread_times(). * * Otherwise, if we've already done a pri check this tick, * we can skip it. */ if (!update && sdc->sdc_pri_check == now) { SYSDC_INC_STAT(sysdc_update_times_already_done); return; } /* Get the current times from the thread */ sdc->sdc_pri_check = now; mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R); *newO = sdc->sdc_cur_O; /* * The updating of microstate accounting is not done under a * consistent set of locks, particularly the t_waitrq field. 
This * can lead to narrow windows in which we account for time in the * wrong bucket, which on the next read will be accounted for * correctly. * * If our sdc_base_* fields were affected by one of these blips, we * throw away the old data, and pretend this tick didn't happen. */ if (sdc->sdc_cur_O < sdc->sdc_base_O || sdc->sdc_cur_R < sdc->sdc_base_R) { sdc->sdc_base_O = sdc->sdc_cur_O; sdc->sdc_base_R = sdc->sdc_cur_R; SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards); return; } *O = sdc->sdc_cur_O - sdc->sdc_base_O; *R = sdc->sdc_cur_R - sdc->sdc_base_R; } /* * sysdc_compute_pri() * * Recomputes the priority of the thread, leaving the result in * sdc->sdc_epri. Returns 1 if a priority update should occur * (which will also trigger a cpu_surrender()), otherwise * returns 0. */ static uint_t sysdc_compute_pri(sysdc_t *sdc, uint_t flags) { kthread_t *const t = sdc->sdc_thread; const uint_t update = (flags & SDC_UPDATE_TIMEOUT); const uint_t tick = (flags & SDC_UPDATE_TICK); hrtime_t O, R; hrtime_t newO = -1; ASSERT(THREAD_LOCK_HELD(t)); sysdc_update_times(sdc, flags, &O, &R, &newO); ASSERT(!update || newO != -1); /* If we have new data, recompute our priority. */ if ((O + R) != 0) { sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R); /* Adjust our priority to move our DC closer to the target. */ if (sdc->sdc_cur_DC < sdc->sdc_target_DC) sdc->sdc_pri = sdc->sdc_maxpri; else sdc->sdc_pri = sdc->sdc_minpri; } /* * If our per-pset duty cycle goes over the max, we will take a break. * This forces all sysdc threads in the pset to minimum priority, in * order to let everyone else have a chance at the CPU. */ if (sdc->sdc_pset->sdp_need_break) { SYSDC_INC_STAT(sysdc_compute_pri_breaking); sdc->sdc_epri = sdc->sdc_minpri; } else { sdc->sdc_epri = sdc->sdc_pri; } DTRACE_PROBE4(sysdc__compute__pri, kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC, uint_t, sdc->sdc_target_DC); /* * For sysdc_update(), we compute the ONPROC time for high-priority * threads, which is used to calculate the per-pset duty cycle. We * will always tell our callers to update the thread's priority, * since we want to force a cpu_surrender(). * * We reset sdc_update_ticks so that sysdc_tick() will only update * the thread's priority if our timeout is delayed by a tick or * more. */ if (update) { /* SDC threads are not allowed to change cpupart bindings. */ ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart); /* If we were at MAXPRI, account for our onproc time. */ if (t->t_pri == sdc->sdc_maxpri && sdc->sdc_last_base_O != 0 && sdc->sdc_last_base_O < newO) { sdc->sdc_last_O = newO - sdc->sdc_last_base_O; sdc->sdc_pset->sdp_onproc_time += (uint64_t)sdc->sdc_last_O; sdc->sdc_pset->sdp_onproc_threads++; } else { sdc->sdc_last_O = 0; } sdc->sdc_last_base_O = newO; sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1; return (1); } /* * Like sysdc_update(), sysdc_tick() always wants to update the * thread's priority, so that the CPU is surrendered if necessary. * We reset sdc_update_ticks so that if the timeout continues to be * delayed, we'll update at the regular interval. */ if (tick) { ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks); sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks; return (1); } /* * Otherwise, only tell our callers to update the priority if it has * changed. 
*/ return (sdc->sdc_epri != t->t_pri); } static void sysdc_update_pri(sysdc_t *sdc, uint_t flags) { kthread_t *t = sdc->sdc_thread; ASSERT(THREAD_LOCK_HELD(t)); if (sysdc_compute_pri(sdc, flags)) { if (!thread_change_pri(t, sdc->sdc_epri, 0)) { cpu_surrender(t); } } } /* * Add a thread onto the active list. It will only be removed by * sysdc_update(). */ static void sysdc_activate(sysdc_t *sdc) { sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list; sysdc_t *head; kthread_t *t = sdc->sdc_thread; SYSDC_INC_STAT(sysdc_activate_enter); ASSERT(sdc->sdc_next == NULL); ASSERT(THREAD_LOCK_HELD(t)); do { head = *headp; sdc->sdc_next = head; } while (atomic_cas_ptr(headp, head, sdc) != head); } /* * sysdc_update() has two jobs: * * 1. It updates the priorities of all active SDC threads on the system. * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC. */ static void sysdc_update(void *arg) { int idx; sysdc_t *freelist = NULL; sysdc_pset_t *cur; hrtime_t now, diff; uint_t redeploy = 1; SYSDC_INC_STAT(sysdc_update_enter); ASSERT(sysdc_update_timeout_started); /* * If this is our first time through, diff will be gigantic, and * no breaks will be necessary. */ now = gethrtime(); diff = now - sysdc_last_update; sysdc_last_update = now; mutex_enter(&sysdc_pset_lock); for (cur = list_head(&sysdc_psets); cur != NULL; cur = list_next(&sysdc_psets, cur)) { boolean_t breaking = (cur->sdp_should_break != 0); if (cur->sdp_need_break != breaking) { DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur, boolean_t, breaking); } cur->sdp_onproc_time = 0; cur->sdp_onproc_threads = 0; cur->sdp_need_break = breaking; } mutex_exit(&sysdc_pset_lock); for (idx = 0; idx < SYSDC_NLISTS; idx++) { sysdc_list_t *sdl = &sysdc_active[idx]; sysdc_t *volatile *headp = &sdl->sdl_list; sysdc_t *head, *tail; sysdc_t **prevptr; if (*headp == &sysdc_dummy) continue; /* Prevent any threads from exiting while we're poking them. */ mutex_enter(&sdl->sdl_lock); /* * Each sdl_list contains a singly-linked list of active * threads. Threads which become active while we are * processing the list will be added to sdl_list. Since we * don't want that to interfere with our own processing, we * swap in an empty list. Any newly active threads will * go on to this empty list. When finished, we'll put any * such threads at the end of the processed list. */ head = atomic_swap_ptr(headp, &sysdc_dummy); prevptr = &head; while (*prevptr != &sysdc_dummy) { sysdc_t *const sdc = *prevptr; kthread_t *const t = sdc->sdc_thread; /* * If the thread has exited, move its sysdc_t onto * freelist, to be freed later. */ if (t == NULL) { *prevptr = sdc->sdc_next; SYSDC_INC_STAT(sysdc_update_exited); sdc->sdc_next = freelist; freelist = sdc; continue; } thread_lock(t); if (t->t_cid != sysdccid) { thread_unlock(t); prevptr = &sdc->sdc_next; SYSDC_INC_STAT(sysdc_update_not_sdc); continue; } ASSERT(t->t_cldata == sdc); /* * If the thread has been sleeping for longer * than sysdc_prune_interval, make it inactive by * removing it from the list. */ if (!(t->t_state & (TS_RUN | TS_ONPROC)) && sdc->sdc_sleep_updates != 0 && (sdc->sdc_sleep_updates - sdc->sdc_nupdates) > sysdc_prune_updates) { *prevptr = sdc->sdc_next; SYSDC_INC_STAT(sysdc_update_idle); sdc->sdc_next = NULL; thread_unlock(t); continue; } sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT); thread_unlock(t); prevptr = &sdc->sdc_next; } /* * Add our list to the bucket, putting any new entries * added while we were working at the tail of the list. 
*/ do { tail = *headp; *prevptr = tail; } while (atomic_cas_ptr(headp, tail, head) != tail); mutex_exit(&sdl->sdl_lock); } mutex_enter(&sysdc_pset_lock); for (cur = list_head(&sysdc_psets); cur != NULL; cur = list_next(&sysdc_psets, cur)) { cur->sdp_vtime_last_interval = diff * cur->sdp_cpupart->cp_ncpus; cur->sdp_DC_last_interval = (cur->sdp_onproc_time * SYSDC_DC_MAX) / cur->sdp_vtime_last_interval; if (cur->sdp_should_break > 0) { cur->sdp_should_break--; /* breaking */ continue; } if (cur->sdp_dont_break > 0) { cur->sdp_dont_break--; /* waiting before checking */ continue; } if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) { cur->sdp_should_break = sysdc_break_updates; cur->sdp_dont_break = sysdc_nobreak_updates; SYSDC_INC_STAT(sysdc_update_take_break); } } /* * If there are no sysdc_psets, there can be no threads, so * we can stop doing our timeout. Since we're holding the * sysdc_pset_lock, no new sysdc_psets can come in, which will * prevent anyone from racing with this and dropping our timeout * on the floor. */ if (list_is_empty(&sysdc_psets)) { SYSDC_INC_STAT(sysdc_update_no_psets); ASSERT(sysdc_update_timeout_started); sysdc_update_timeout_started = 0; redeploy = 0; } mutex_exit(&sysdc_pset_lock); while (freelist != NULL) { sysdc_t *cur = freelist; freelist = cur->sdc_next; kmem_free(cur, sizeof (*cur)); } if (redeploy) { (void) timeout(sysdc_update, arg, sysdc_update_ticks); } } static void sysdc_preempt(kthread_t *t) { ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(t)); setbackdq(t); /* give others a chance to run */ } static void sysdc_tick(kthread_t *t) { sysdc_t *sdc; thread_lock(t); if (t->t_cid != sysdccid) { SYSDC_INC_STAT(sysdc_tick_not_sdc); thread_unlock(t); return; } sdc = t->t_cldata; if (t->t_state == TS_ONPROC && t->t_pri < t->t_disp_queue->disp_maxrunpri) { cpu_surrender(t); } if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) { ASSERT(sdc->sdc_sleep_updates == 0); } ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks); sdc->sdc_ticks++; if (sdc->sdc_ticks == sdc->sdc_update_ticks) { SYSDC_INC_STAT(sysdc_tick_quantum_expired); sysdc_update_pri(sdc, SDC_UPDATE_TICK); ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks); } thread_unlock(t); } static void sysdc_setrun(kthread_t *t) { sysdc_t *sdc = t->t_cldata; ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */ sdc->sdc_sleep_updates = 0; if (sdc->sdc_next == NULL) { /* * Since we're in transition, we don't want to use the * full thread_update_pri(). 
*/ if (sysdc_compute_pri(sdc, 0)) { THREAD_CHANGE_PRI(t, sdc->sdc_epri); } sysdc_activate(sdc); ASSERT(sdc->sdc_next != NULL); } setbackdq(t); } static void sysdc_wakeup(kthread_t *t) { sysdc_setrun(t); } static void sysdc_sleep(kthread_t *t) { sysdc_t *sdc = t->t_cldata; ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */ sdc->sdc_sleep_updates = sdc->sdc_nupdates; } /*ARGSUSED*/ static int sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, void *bufp) { cpupart_t *const cpupart = t->t_cpupart; sysdc_t *sdc = bufp; sysdc_params_t *sdpp = parmsp; sysdc_pset_t *newpset = sdc->sdc_pset; sysdc_pset_t *pset; int start_timeout; if (t->t_cid != syscid) return (EPERM); ASSERT(ttolwp(t) != NULL); ASSERT(sdpp != NULL); ASSERT(newpset != NULL); ASSERT(sysdc_param_init); ASSERT(sdpp->sdp_minpri >= sysdc_minpri); ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri); ASSERT(sdpp->sdp_DC >= sysdc_minDC); ASSERT(sdpp->sdp_DC <= sysdc_maxDC); sdc->sdc_thread = t; sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */ sdc->sdc_minpri = sdpp->sdp_minpri; sdc->sdc_maxpri = sdpp->sdp_maxpri; sdc->sdc_target_DC = sdpp->sdp_DC; sdc->sdc_ticks = 0; sdc->sdc_update_ticks = sysdc_update_ticks + 1; /* Assign ourselves to the appropriate pset. */ sdc->sdc_pset = NULL; mutex_enter(&sysdc_pset_lock); for (pset = list_head(&sysdc_psets); pset != NULL; pset = list_next(&sysdc_psets, pset)) { if (pset->sdp_cpupart == cpupart) { break; } } if (pset == NULL) { pset = newpset; newpset = NULL; pset->sdp_cpupart = cpupart; list_insert_tail(&sysdc_psets, pset); } pset->sdp_nthreads++; ASSERT(pset->sdp_nthreads > 0); sdc->sdc_pset = pset; start_timeout = (sysdc_update_timeout_started == 0); sysdc_update_timeout_started = 1; mutex_exit(&sysdc_pset_lock); if (newpset != NULL) kmem_free(newpset, sizeof (*newpset)); /* Update t's scheduling class and priority. */ thread_lock(t); t->t_clfuncs = &(sclass[cid].cl_funcs->thread); t->t_cid = cid; t->t_cldata = sdc; t->t_schedflag |= TS_RUNQMATCH; sysdc_update_pri(sdc, SDC_UPDATE_INITIAL); thread_unlock(t); /* Kick off the thread timeout if we're the first one in. */ if (start_timeout) { (void) timeout(sysdc_update, NULL, sysdc_update_ticks); } return (0); } static void sysdc_leave(sysdc_t *sdc) { sysdc_pset_t *sdp = sdc->sdc_pset; sysdc_list_t *sdl = SYSDC_LIST(sdc); uint_t freedc; mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */ sdc->sdc_thread = NULL; freedc = (sdc->sdc_next == NULL); mutex_exit(&sdl->sdl_lock); mutex_enter(&sysdc_pset_lock); sdp = sdc->sdc_pset; ASSERT(sdp != NULL); ASSERT(sdp->sdp_nthreads > 0); --sdp->sdp_nthreads; if (sdp->sdp_nthreads == 0) { list_remove(&sysdc_psets, sdp); } else { sdp = NULL; } mutex_exit(&sysdc_pset_lock); if (freedc) kmem_free(sdc, sizeof (*sdc)); if (sdp != NULL) kmem_free(sdp, sizeof (*sdp)); } static void sysdc_exitclass(void *buf) { sysdc_leave((sysdc_t *)buf); } /*ARGSUSED*/ static int sysdc_canexit(kthread_t *t, cred_t *reqpcredp) { /* Threads cannot exit SDC once joined, except in a body bag. */ return (EPERM); } static void sysdc_exit(kthread_t *t) { sysdc_t *sdc; /* We're exiting, so we just rejoin the SYS class. */ thread_lock(t); ASSERT(t->t_cid == sysdccid); sdc = t->t_cldata; t->t_cid = syscid; t->t_cldata = NULL; t->t_clfuncs = &(sclass[syscid].cl_funcs->thread); (void) thread_change_pri(t, maxclsyspri, 0); t->t_schedflag &= ~TS_RUNQMATCH; thread_unlock_nopreempt(t); /* Unlink the sdc from everything. 
*/ sysdc_leave(sdc); } /*ARGSUSED*/ static int sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp) { /* * Threads cannot be created with SDC as their class; they must * be created as SYS and then added with sysdc_thread_enter(). * Because of this restriction, sysdc_fork() should never be called. */ panic("sysdc cannot be forked"); return (ENOSYS); } /*ARGSUSED*/ static void sysdc_forkret(kthread_t *t, kthread_t *ct) { /* SDC threads are part of system processes, which never fork. */ panic("sysdc cannot be forked"); } static pri_t sysdc_globpri(kthread_t *t) { return (t->t_epri); } /*ARGSUSED*/ static pri_t sysdc_no_swap(kthread_t *t, int flags) { /* SDC threads cannot be swapped. */ return (-1); } /* * Get maximum and minimum priorities enjoyed by SDC threads. */ static int sysdc_getclpri(pcpri_t *pcprip) { pcprip->pc_clpmax = sysdc_maxpri; pcprip->pc_clpmin = sysdc_minpri; return (0); } /*ARGSUSED*/ static int sysdc_getclinfo(void *arg) { return (0); /* no class-specific info */ } /*ARGSUSED*/ static int sysdc_alloc(void **p, int flag) { sysdc_t *new; *p = NULL; if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) { return (ENOMEM); } if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) == NULL) { kmem_free(new, sizeof (*new)); return (ENOMEM); } *p = new; return (0); } static void sysdc_free(void *p) { sysdc_t *sdc = p; if (sdc != NULL) { /* * We must have failed CL_ENTERCLASS(), so our pset should be * there and unused. */ ASSERT(sdc->sdc_pset != NULL); ASSERT(sdc->sdc_pset->sdp_cpupart == NULL); kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset)); kmem_free(sdc, sizeof (*sdc)); } } static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */ static int sysdc_einval(); static void sysdc_nullsys(); static struct classfuncs sysdc_classfuncs = { /* messages to class manager */ { sysdc_enosys, /* admin */ sysdc_getclinfo, sysdc_enosys, /* parmsin */ sysdc_enosys, /* parmsout */ sysdc_enosys, /* vaparmsin */ sysdc_enosys, /* vaparmsout */ sysdc_getclpri, sysdc_alloc, sysdc_free, }, /* operations on threads */ { sysdc_enterclass, sysdc_exitclass, sysdc_canexit, sysdc_fork, sysdc_forkret, sysdc_nullsys, /* parmsget */ sysdc_enosys, /* parmsset */ sysdc_nullsys, /* stop */ sysdc_exit, sysdc_nullsys, /* active */ sysdc_nullsys, /* inactive */ sysdc_no_swap, /* swapin */ sysdc_no_swap, /* swapout */ sysdc_nullsys, /* trapret */ sysdc_preempt, sysdc_setrun, sysdc_sleep, sysdc_tick, sysdc_wakeup, sysdc_einval, /* donice */ sysdc_globpri, sysdc_nullsys, /* set_process_group */ sysdc_nullsys, /* yield */ sysdc_einval, /* doprio */ } }; static int sysdc_enosys() { return (ENOSYS); } static int sysdc_einval() { return (EINVAL); } static void sysdc_nullsys() { } /*ARGSUSED*/ static pri_t sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) { int idx; list_create(&sysdc_psets, sizeof (sysdc_pset_t), offsetof(sysdc_pset_t, sdp_node)); for (idx = 0; idx < SYSDC_NLISTS; idx++) { sysdc_active[idx].sdl_list = &sysdc_dummy; } sysdc_initparam(); sysdccid = cid; *clfuncspp = &sysdc_classfuncs; return ((pri_t)v.v_maxsyspri); } static struct sclass csw = { "SDC", sysdc_init, 0 }; static struct modlsched modlsched = { &mod_schedops, "system duty cycle scheduling class", &csw }; static struct modlinkage modlinkage = { MODREV_1, (void *)&modlsched, NULL }; int _init() { return (mod_install(&modlinkage)); } int _fini() { return (EBUSY); /* can't unload for now */ } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } /* --- consolidation-private interfaces 
--- */

void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
	void		*buf = NULL;
	sysdc_params_t	sdp;

	SYSDC_INC_STAT(sysdc_thread_enter_enter);

	ASSERT(sysdc_param_init);
	ASSERT(sysdccid >= 0);
	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

	sdp.sdp_minpri = sysdc_minpri;
	sdp.sdp_maxpri = sysdc_maxpri;
	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

	if (flags & SYSDC_THREAD_BATCH)
		sdp.sdp_maxpri -= sysdc_batch_niceness;

	VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);

	ASSERT(t->t_lwp != NULL);
	ASSERT(t->t_cid == syscid);
	ASSERT(t->t_cldata == NULL);

	VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
	VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
	CL_EXITCLASS(syscid, NULL);
}
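
/*
 * A worked example of what sysdc_initparam() computes from the default
 * tunables (illustrative only; it assumes the common 100Hz clock tick and
 * duty cycles expressed as percentages, i.e. a SYSDC_DC_MAX of 100):
 *
 *	sysdc_update_interval_msec = 20	  ==>  sysdc_update_ticks = 2
 *	sysdc_reset_interval_msec = 400	  ==>  sysdc_reset_updates = 20
 *	sysdc_prune_interval_msec = 100	  ==>  sysdc_prune_updates = 5
 *	sysdc_break_msec = 80		  ==>  sysdc_break_updates = 4
 *
 *	sysdc_nobreak_updates = ceil((4 * 90) / (100 - 90)) = 36
 *
 * so thread priorities are re-evaluated every 20ms, base ONPROC/Runnable
 * times are reset every 400ms, a thread asleep for more than ~100ms is
 * pruned from the active list, and a processor set which exceeds
 * sysdc_max_pset_DC (90%) takes an 80ms break, then waits 720ms before
 * the limit is checked again.
 */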