/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the implementation of clock tick accounting for threads. * Every tick, user threads running on various CPUs are located and charged * with a tick to account for their use of CPU time. * * Every tick, the clock() handler calls clock_tick_schedule() to perform tick * accounting for all the threads in the system. Tick accounting is done in * two phases: * * Tick scheduling Done in clock_tick_schedule(). In this phase, cross * calls are scheduled to multiple CPUs to perform * multi-threaded tick accounting. The CPUs are chosen * on a rotational basis so as to distribute the tick * accounting load evenly across all CPUs. * * Tick execution Done in clock_tick_execute(). In this phase, tick * accounting is actually performed by softint handlers * on multiple CPUs. * * This implementation gives us a multi-threaded tick processing facility that * is suitable for configurations with a large number of CPUs. On smaller * configurations it may be desirable to let the processing be single-threaded * and just allow clock() to do it as it has been done traditionally. To * facilitate this, a variable, clock_tick_threshold, is defined. Platforms * that desire multi-threading should set this variable to something * appropriate. A recommended value may be found in clock_tick.h. At boot time, * if the number of CPUs is greater than clock_tick_threshold, multi-threading * kicks in. Note that this is a decision made at boot time. If more CPUs * are dynamically added later on to exceed the threshold, no attempt is made * to switch to multi-threaded. Similarly, if CPUs are removed dynamically * no attempt is made to switch to single-threaded. This is to keep the * implementation simple. Also note that the threshold can be changed for a * specific customer configuration via /etc/system. * * The boot time decision is reflected in clock_tick_single_threaded. */ /* * clock_tick_threshold * If the number of CPUs at boot time exceeds this threshold, * multi-threaded tick accounting kicks in. * * clock_tick_ncpus * The number of CPUs in a set. Each set is scheduled for tick execution * on a separate processor. * * clock_tick_single_threaded * Indicates whether or not tick accounting is single threaded. * * clock_tick_total_cpus * Total number of online CPUs. * * clock_tick_cpus * Array of online CPU pointers. * * clock_tick_cpu * Per-CPU, cache-aligned data structures to facilitate multi-threading. * * clock_tick_active * Counter that indicates the number of active tick processing softints * in the system. * * clock_tick_pending * Number of pending ticks that need to be accounted by the softint * handlers. * * clock_tick_lock * Mutex to synchronize between clock_tick_schedule() and * CPU online/offline. * * clock_cpu_id * CPU id of the clock() CPU. Used to detect when the clock CPU * is offlined. * * clock_tick_online_cpuset * CPU set of all online processors that can be X-called. * * clock_tick_proc_max * Each process is allowed to accumulate a few ticks before checking * for the task CPU time resource limit. We lower the number of calls * to rctl_test() to make tick accounting more scalable. The tradeoff * is that the limit may not get enforced in a timely manner. This is * typically not a problem. * * clock_tick_set * Per-set structures. Each structure contains the range of CPUs * to be processed for the set. * * clock_tick_nsets; * Number of sets. * * clock_tick_scan * Where to begin the scan for single-threaded mode. In multi-threaded, * the clock_tick_set itself contains a field for this. */ int clock_tick_threshold; int clock_tick_ncpus; int clock_tick_single_threaded; int clock_tick_total_cpus; cpu_t *clock_tick_cpus[NCPU]; clock_tick_cpu_t *clock_tick_cpu[NCPU]; ulong_t clock_tick_active; int clock_tick_pending; kmutex_t clock_tick_lock; processorid_t clock_cpu_id; cpuset_t clock_tick_online_cpuset; clock_t clock_tick_proc_max; clock_tick_set_t *clock_tick_set; int clock_tick_nsets; int clock_tick_scan; ulong_t clock_tick_intr; static uint_t clock_tick_execute(caddr_t, caddr_t); static void clock_tick_execute_common(int, int, int, clock_t, int); /* * Clock tick initialization is done in two phases: * * 1. Before clock_init() is called, clock_tick_init_pre() is called to set * up single-threading so the clock() can begin to do its job. * * 2. After the slave CPUs are initialized at boot time, we know the number * of CPUs. clock_tick_init_post() is called to set up multi-threading if * required. */ void clock_tick_init_pre(void) { clock_tick_cpu_t *ctp; int i, n; clock_tick_set_t *csp; uintptr_t abuf, buf; size_t size; clock_tick_single_threaded = 1; /* * We will not free this memory, but to avoid the false sharing, * align to cache line size. */ size = P2ROUNDUP(sizeof (clock_tick_cpu_t), _CACHE_LINE_SIZE); abuf = (uintptr_t)kmem_zalloc(size * NCPU + _CACHE_LINE_SIZE, KM_SLEEP); buf = P2ROUNDUP(abuf, _CACHE_LINE_SIZE); /* * Perform initialization in case multi-threading is chosen later. */ if (&create_softint != NULL) { clock_tick_intr = create_softint(LOCK_LEVEL, clock_tick_execute, (caddr_t)NULL); } for (i = 0; i < NCPU; i++, buf += size) { ctp = (clock_tick_cpu_t *)buf; clock_tick_cpu[i] = ctp; mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL); if (&create_softint != NULL) { ctp->ct_intr = clock_tick_intr; } ctp->ct_pending = 0; } mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL); /* * Compute clock_tick_ncpus here. We need it to compute the * maximum number of tick sets we need to support. */ ASSERT(clock_tick_ncpus >= 0); if (clock_tick_ncpus == 0) clock_tick_ncpus = CLOCK_TICK_NCPUS; if (clock_tick_ncpus > max_ncpus) clock_tick_ncpus = max_ncpus; /* * Allocate and initialize the tick sets. */ n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus; clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP); for (i = 0; i < n; i++) { csp = &clock_tick_set[i]; csp->ct_start = i * clock_tick_ncpus; csp->ct_scan = csp->ct_start; csp->ct_end = csp->ct_start; } } void clock_tick_init_post(void) { /* * If a platform does not provide create_softint() and invoke_softint(), * then we assume single threaded. */ if (&invoke_softint == NULL) clock_tick_threshold = 0; ASSERT(clock_tick_threshold >= 0); if (clock_tick_threshold == 0) clock_tick_threshold = max_ncpus; /* * If a platform does not specify a threshold or if the number of CPUs * at boot time does not exceed the threshold, tick accounting remains * single-threaded. */ if (ncpus <= clock_tick_threshold) { clock_tick_ncpus = max_ncpus; clock_tick_proc_max = 1; return; } /* * OK. Multi-thread tick processing. If a platform has not specified * the CPU set size for multi-threading, then use the default value. * This value has been arrived through measurements on large * configuration systems. */ clock_tick_single_threaded = 0; if (clock_tick_proc_max == 0) { clock_tick_proc_max = CLOCK_TICK_PROC_MAX; if (hires_tick) clock_tick_proc_max *= 10; } } static void clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid) { clock_tick_cpu_t *ctp; ASSERT(&invoke_softint != NULL); atomic_inc_ulong(&clock_tick_active); /* * Schedule tick accounting for a set of CPUs. */ ctp = clock_tick_cpu[cid]; mutex_enter(&ctp->ct_lock); ctp->ct_lbolt = LBOLT_NO_ACCOUNT; ctp->ct_pending += pending; ctp->ct_start = csp->ct_start; ctp->ct_end = csp->ct_end; ctp->ct_scan = csp->ct_scan; mutex_exit(&ctp->ct_lock); invoke_softint(cid, ctp->ct_intr); /* * Return without waiting for the softint to finish. */ } static void clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending) { kthread_t *t; kmutex_t *plockp; int notick, intr; klwp_id_t lwp; /* * The locking here is rather tricky. thread_free_prevent() * prevents the thread returned from being freed while we * are looking at it. We can then check if the thread * is exiting and get the appropriate p_lock if it * is not. We have to be careful, though, because * the _process_ can still be freed while we've * prevented thread free. To avoid touching the * proc structure we put a pointer to the p_lock in the * thread structure. The p_lock is persistent so we * can acquire it even if the process is gone. At that * point we can check (again) if the thread is exiting * and either drop the lock or do the tick processing. */ t = cp->cpu_thread; /* Current running thread */ if (CPU == cp) { /* * 't' will be the tick processing thread on this * CPU. Use the pinned thread (if any) on this CPU * as the target of the clock tick. */ if (t->t_intr != NULL) t = t->t_intr; } /* * We use thread_free_prevent to keep the currently running * thread from being freed or recycled while we're * looking at it. */ thread_free_prevent(t); /* * We cannot hold the cpu_lock to prevent the * cpu_active from changing in the clock interrupt. * As long as we don't block (or don't get pre-empted) * the cpu_list will not change (all threads are paused * before list modification). */ if (CLOCK_TICK_CPU_OFFLINE(cp)) { thread_free_allow(t); return; } /* * Make sure the thread is still on the CPU. */ if ((t != cp->cpu_thread) && ((cp != CPU) || (t != cp->cpu_thread->t_intr))) { /* * We could not locate the thread. Skip this CPU. Race * conditions while performing these checks are benign. * These checks are not perfect and they don't need * to be. */ thread_free_allow(t); return; } intr = t->t_flag & T_INTR_THREAD; lwp = ttolwp(t); if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) { /* * Thread is exiting (or uninteresting) so don't * do tick processing. */ thread_free_allow(t); return; } /* * OK, try to grab the process lock. See * comments above for why we're not using * ttoproc(t)->p_lockp here. */ plockp = t->t_plockp; mutex_enter(plockp); /* See above comment. */ if (CLOCK_TICK_CPU_OFFLINE(cp)) { mutex_exit(plockp); thread_free_allow(t); return; } /* * The thread may have exited between when we * checked above, and when we got the p_lock. */ if (t->t_proc_flag & TP_LWPEXIT) { mutex_exit(plockp); thread_free_allow(t); return; } /* * Either we have the p_lock for the thread's process, * or we don't care about the thread structure any more. * Either way we can allow thread free. */ thread_free_allow(t); /* * If we haven't done tick processing for this * lwp, then do it now. Since we don't hold the * lwp down on a CPU it can migrate and show up * more than once, hence the lbolt check. mylbolt * is copied at the time of tick scheduling to prevent * lbolt mismatches. * * Also, make sure that it's okay to perform the * tick processing before calling clock_tick. * Setting notick to a TRUE value (ie. not 0) * results in tick processing not being performed for * that thread. */ notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) || (cp->cpu_dispthread == cp->cpu_idle_thread)); if ((!notick) && (t->t_lbolt < mylbolt)) { t->t_lbolt = mylbolt; clock_tick(t, pending); } mutex_exit(plockp); } void clock_tick_schedule(int one_sec) { ulong_t active; int i, end; clock_tick_set_t *csp; cpu_t *cp; if (clock_cpu_id != CPU->cpu_id) clock_cpu_id = CPU->cpu_id; if (clock_tick_single_threaded) { /* * Each tick cycle, start the scan from a different * CPU for the sake of fairness. */ end = clock_tick_total_cpus; clock_tick_scan++; if (clock_tick_scan >= end) clock_tick_scan = 0; clock_tick_execute_common(0, clock_tick_scan, end, LBOLT_NO_ACCOUNT, 1); return; } /* * If the previous invocation of handlers is not yet finished, then * simply increment a pending count and return. Eventually when they * finish, the pending count is passed down to the next set of * handlers to process. This way, ticks that have already elapsed * in the past are handled as quickly as possible to minimize the * chances of threads getting away before their pending ticks are * accounted. The other benefit is that if the pending count is * more than one, it can be handled by a single invocation of * clock_tick(). This is a good optimization for large configuration * busy systems where tick accounting can get backed up for various * reasons. */ clock_tick_pending++; active = clock_tick_active; active = atomic_cas_ulong(&clock_tick_active, active, active); if (active) return; /* * We want to handle the clock CPU here. If we * scheduled the accounting for the clock CPU to another * processor, that processor will find only the clock() thread * running and not account for any user thread below it. Also, * we want to handle this before we block on anything and allow * the pinned thread below the current thread to escape. */ clock_tick_process(CPU, LBOLT_NO_ACCOUNT, clock_tick_pending); mutex_enter(&clock_tick_lock); /* * Schedule each set on a separate processor. */ cp = clock_cpu_list; for (i = 0; i < clock_tick_nsets; i++) { csp = &clock_tick_set[i]; /* * Pick the next online CPU in list for scheduling tick * accounting. The clock_tick_lock is held by the caller. * So, CPU online/offline cannot muck with this while * we are picking our CPU to X-call. */ if (cp == CPU) cp = cp->cpu_next_onln; /* * Each tick cycle, start the scan from a different * CPU for the sake of fairness. */ csp->ct_scan++; if (csp->ct_scan >= csp->ct_end) csp->ct_scan = csp->ct_start; clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id); cp = cp->cpu_next_onln; } if (one_sec) { /* * Move the CPU pointer around every second. This is so * all the CPUs can be X-called in a round-robin fashion * to evenly distribute the X-calls. We don't do this * at a faster rate than this because we don't want * to affect cache performance negatively. */ clock_cpu_list = clock_cpu_list->cpu_next_onln; } mutex_exit(&clock_tick_lock); clock_tick_pending = 0; } static void clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt, int pending) { cpu_t *cp; int i; ASSERT((start <= scan) && (scan <= end)); /* * Handle the thread on current CPU first. This is to prevent a * pinned thread from escaping if we ever block on something. * Note that in the single-threaded mode, this handles the clock * CPU. */ clock_tick_process(CPU, mylbolt, pending); /* * Perform tick accounting for the threads running on * the scheduled CPUs. */ for (i = scan; i < end; i++) { cp = clock_tick_cpus[i]; if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id)) continue; clock_tick_process(cp, mylbolt, pending); } for (i = start; i < scan; i++) { cp = clock_tick_cpus[i]; if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id)) continue; clock_tick_process(cp, mylbolt, pending); } } /*ARGSUSED*/ static uint_t clock_tick_execute(caddr_t arg1, caddr_t arg2) { clock_tick_cpu_t *ctp; int start, scan, end, pending; clock_t mylbolt; /* * We could have raced with cpu offline. We don't want to * process anything on an offlined CPU. If we got blocked * on anything, we may not get scheduled when we wakeup * later on. */ if (!CLOCK_TICK_XCALL_SAFE(CPU)) goto out; ctp = clock_tick_cpu[CPU->cpu_id]; mutex_enter(&ctp->ct_lock); pending = ctp->ct_pending; if (pending == 0) { /* * If a CPU is busy at LOCK_LEVEL, then an invocation * of this softint may be queued for some time. In that case, * clock_tick_active will not be incremented. * clock_tick_schedule() will then assume that the previous * invocation is done and post a new softint. The first one * that gets in will reset the pending count so the * second one is a noop. */ mutex_exit(&ctp->ct_lock); goto out; } ctp->ct_pending = 0; start = ctp->ct_start; end = ctp->ct_end; scan = ctp->ct_scan; mylbolt = ctp->ct_lbolt; mutex_exit(&ctp->ct_lock); clock_tick_execute_common(start, scan, end, mylbolt, pending); out: /* * Signal completion to the clock handler. */ atomic_dec_ulong(&clock_tick_active); return (1); } /*ARGSUSED*/ static int clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg) { cpu_t *cp, *ncp; int i, set; clock_tick_set_t *csp; /* * This function performs some computations at CPU offline/online * time. The computed values are used during tick scheduling and * execution phases. This avoids having to compute things on * an every tick basis. The other benefit is that we perform the * computations only for onlined CPUs (not offlined ones). As a * result, no tick processing is attempted for offlined CPUs. * * Also, cpu_offline() calls this function before checking for * active interrupt threads. This allows us to avoid posting * cross calls to CPUs that are being offlined. */ cp = cpu[cid]; mutex_enter(&clock_tick_lock); switch (what) { case CPU_ON: clock_tick_cpus[clock_tick_total_cpus] = cp; set = clock_tick_total_cpus / clock_tick_ncpus; csp = &clock_tick_set[set]; csp->ct_end++; clock_tick_total_cpus++; clock_tick_nsets = (clock_tick_total_cpus + clock_tick_ncpus - 1) / clock_tick_ncpus; CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id); membar_sync(); break; case CPU_OFF: if (&sync_softint != NULL) sync_softint(clock_tick_online_cpuset); CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id); clock_tick_total_cpus--; clock_tick_cpus[clock_tick_total_cpus] = NULL; clock_tick_nsets = (clock_tick_total_cpus + clock_tick_ncpus - 1) / clock_tick_ncpus; set = clock_tick_total_cpus / clock_tick_ncpus; csp = &clock_tick_set[set]; csp->ct_end--; i = 0; ncp = cpu_active; do { if (cp == ncp) continue; clock_tick_cpus[i] = ncp; i++; } while ((ncp = ncp->cpu_next_onln) != cpu_active); ASSERT(i == clock_tick_total_cpus); membar_sync(); break; default: break; } mutex_exit(&clock_tick_lock); return (0); } void clock_tick_mp_init(void) { cpu_t *cp; mutex_enter(&cpu_lock); cp = cpu_active; do { (void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL); } while ((cp = cp->cpu_next_onln) != cpu_active); register_cpu_setup_func(clock_tick_cpu_setup, NULL); mutex_exit(&cpu_lock); }