/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * Architecture-independent CPU control functions. 
*/

/*
 * NOTE(review): the #include directives below carry no header names in
 * this copy of the file; the operands appear to have been lost.  Confirm
 * against the upstream source before attempting to build.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include	/* to set per-cpu kmem_cache offset */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Platform (machine-dependent) hooks, defined outside this file.
 */
extern int	mp_cpu_start(cpu_t *);
extern int	mp_cpu_stop(cpu_t *);
extern int	mp_cpu_poweron(cpu_t *);
extern int	mp_cpu_poweroff(cpu_t *);
extern int	mp_cpu_configure(int);
extern int	mp_cpu_unconfigure(int);
extern void	mp_cpu_faulted_enter(cpu_t *);
extern void	mp_cpu_faulted_exit(cpu_t *);

extern int cmp_cpu_to_chip(processorid_t cpuid);
#ifdef __sparcv9
extern char *cpu_fru_fmri(cpu_t *cp);
#endif

/*
 * Forward declarations for file-local helpers.
 */
static void cpu_add_active_internal(cpu_t *cp);
static void cpu_remove_active(cpu_t *cp);
static void cpu_info_kstat_create(cpu_t *cp);
static void cpu_info_kstat_destroy(cpu_t *cp);
static void cpu_stats_kstat_create(cpu_t *cp);
static void cpu_stats_kstat_destroy(cpu_t *cp);

static int cpu_sys_stats_ks_update(kstat_t *ksp, int rw);
static int cpu_vm_stats_ks_update(kstat_t *ksp, int rw);
static int cpu_stat_ks_update(kstat_t *ksp, int rw);

static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t);

/*
 * cpu_lock protects ncpus, ncpus_online, cpu_flag, cpu_list, cpu_active,
 * and dispatch queue reallocations.  The lock ordering with respect to
 * related locks is:
 *
 *	cpu_lock --> thread_free_lock  --->  p_lock  --->  thread_lock()
 *
 * Warning:  Certain sections of code do not use the cpu_lock when
 * traversing the cpu_list (e.g. mutex_vector_enter(), clock()).  Since
 * all cpus are paused during modifications to this list, a solution
 * to protect the list is to either disable kernel preemption while
 * walking the list, *or* recheck the cpu_next pointer at each
 * iteration in the loop.  Note that in no cases can any cached
 * copies of the cpu pointers be kept as they may become invalid.
 */
kmutex_t	cpu_lock;
cpu_t		*cpu_list;		/* list of all CPUs */
cpu_t		*cpu_active;		/* list of active CPUs */
static cpuset_t	cpu_available;		/* set of available CPUs */
cpuset_t	cpu_seqid_inuse;	/* which cpu_seqids are in use */

/*
 * max_ncpus keeps the max cpus the system can have. Initially
 * it's NCPU, but since most archs scan the devtree for cpus
 * fairly early on during boot, the real max can be known before
 * ncpus is set (useful for early NCPU based allocations).
 */
int max_ncpus = NCPU;
/*
 * Platforms that set max_ncpus to the maximum number of cpus that can be
 * dynamically added will set boot_max_ncpus to the number of cpus found
 * at device tree scan time during boot.
 */
int boot_max_ncpus = -1;
/*
 * Maximum possible CPU id.  This can never be >= NCPU since NCPU is
 * used to size arrays that are indexed by CPU id.
 */
processorid_t max_cpuid = NCPU - 1;

int ncpus = 1;
int ncpus_online = 1;

/*
 * CPU that we're trying to offline.  Protected by cpu_lock.
 */
cpu_t *cpu_inmotion;

/*
 * Can be raised to suppress further weakbindings, which are instead
 * satisfied by disabling preemption.  Must be raised/lowered under cpu_lock,
 * while individual thread weakbinding synchronisation is done under thread
 * lock.
 */
int weakbindingbarrier;

/*
 * values for safe_list.  Pause state that CPUs are in.
 */
#define	PAUSE_IDLE	0		/* normal state */
#define	PAUSE_READY	1		/* paused thread ready to spl */
#define	PAUSE_WAIT	2		/* paused thread is spl-ed high */
#define	PAUSE_DIE	3		/* tell pause thread to leave */
#define	PAUSE_DEAD	4		/* pause thread has left */

/*
 * Variables used in pause_cpus().
*/

/*
 * Per-CPU pause state, indexed by CPU id; holds one of the PAUSE_* values.
 * Each pause thread is handed a pointer to its own CPU's entry
 * (see cpu_pause_alloc()).
 */
static volatile char safe_list[NCPU];

/*
 * State shared between the thread calling pause_cpus() and the per-CPU
 * pause threads running cpu_pause().
 */
static struct _cpu_pause_info {
	int		cp_spl;		/* spl saved in pause_cpus() */
	volatile int	cp_go;		/* Go signal sent after all ready */
	int		cp_count;	/* # of CPUs to pause */
	ksema_t		cp_sem;		/* synch pause_cpus & cpu_pause */
	kthread_id_t	cp_paused;	/* set to curthread by pause_cpus(), */
					/* cleared to NULL by start_cpus() */
} cpu_pause_info;

/*
 * cpu_pause_free() sleeps on pause_free_cv (under pause_free_mutex) until
 * the exiting pause thread has marked its safe_list entry PAUSE_DEAD.
 */
static kmutex_t pause_free_mutex;
static kcondvar_t pause_free_cv;

/*
 * Layout of the named kstat carrying per-CPU system statistics.  The
 * template initializer that follows supplies, member for member and in the
 * same order, the kstat name and data type (all KSTAT_DATA_UINT64); keep
 * the two lists in step when adding or removing a statistic.
 */
static struct cpu_sys_stats_ks_data {
	kstat_named_t cpu_ticks_idle;
	kstat_named_t cpu_ticks_user;
	kstat_named_t cpu_ticks_kernel;
	kstat_named_t cpu_ticks_wait;
	kstat_named_t cpu_nsec_idle;
	kstat_named_t cpu_nsec_user;
	kstat_named_t cpu_nsec_kernel;
	kstat_named_t wait_ticks_io;
	kstat_named_t bread;
	kstat_named_t bwrite;
	kstat_named_t lread;
	kstat_named_t lwrite;
	kstat_named_t phread;
	kstat_named_t phwrite;
	kstat_named_t pswitch;
	kstat_named_t trap;
	kstat_named_t intr;
	kstat_named_t syscall;
	kstat_named_t sysread;
	kstat_named_t syswrite;
	kstat_named_t sysfork;
	kstat_named_t sysvfork;
	kstat_named_t sysexec;
	kstat_named_t readch;
	kstat_named_t writech;
	kstat_named_t rcvint;
	kstat_named_t xmtint;
	kstat_named_t mdmint;
	kstat_named_t rawch;
	kstat_named_t canch;
	kstat_named_t outch;
	kstat_named_t msg;
	kstat_named_t sema;
	kstat_named_t namei;
	kstat_named_t ufsiget;
	kstat_named_t ufsdirblk;
	kstat_named_t ufsipage;
	kstat_named_t ufsinopage;
	kstat_named_t procovf;
	kstat_named_t intrthread;
	kstat_named_t intrblk;
	kstat_named_t intrunpin;
	kstat_named_t idlethread;
	kstat_named_t inv_swtch;
	kstat_named_t nthreads;
	kstat_named_t cpumigrate;
	kstat_named_t xcalls;
	kstat_named_t mutex_adenters;
	kstat_named_t rw_rdfails;
	kstat_named_t rw_wrfails;
	kstat_named_t modload;
	kstat_named_t modunload;
	kstat_named_t bawrite;
	kstat_named_t iowait;
} cpu_sys_stats_ks_data_template = {
	{ "cpu_ticks_idle", 	KSTAT_DATA_UINT64 },
	{ "cpu_ticks_user", 	KSTAT_DATA_UINT64 },
	{ "cpu_ticks_kernel", 	KSTAT_DATA_UINT64 },
	{ "cpu_ticks_wait", 	KSTAT_DATA_UINT64 },
	{ "cpu_nsec_idle",	KSTAT_DATA_UINT64 },
	{ "cpu_nsec_user",	KSTAT_DATA_UINT64 },
	{ "cpu_nsec_kernel",	KSTAT_DATA_UINT64 },
	{ "wait_ticks_io", 	KSTAT_DATA_UINT64 },
	{ "bread", 		KSTAT_DATA_UINT64 },
	{ "bwrite", 		KSTAT_DATA_UINT64 },
	{ "lread", 		KSTAT_DATA_UINT64 },
	{ "lwrite", 		KSTAT_DATA_UINT64 },
	{ "phread", 		KSTAT_DATA_UINT64 },
	{ "phwrite", 		KSTAT_DATA_UINT64 },
	{ "pswitch", 		KSTAT_DATA_UINT64 },
	{ "trap", 		KSTAT_DATA_UINT64 },
	{ "intr", 		KSTAT_DATA_UINT64 },
	{ "syscall", 		KSTAT_DATA_UINT64 },
	{ "sysread", 		KSTAT_DATA_UINT64 },
	{ "syswrite", 		KSTAT_DATA_UINT64 },
	{ "sysfork", 		KSTAT_DATA_UINT64 },
	{ "sysvfork", 		KSTAT_DATA_UINT64 },
	{ "sysexec", 		KSTAT_DATA_UINT64 },
	{ "readch", 		KSTAT_DATA_UINT64 },
	{ "writech", 		KSTAT_DATA_UINT64 },
	{ "rcvint", 		KSTAT_DATA_UINT64 },
	{ "xmtint", 		KSTAT_DATA_UINT64 },
	{ "mdmint", 		KSTAT_DATA_UINT64 },
	{ "rawch", 		KSTAT_DATA_UINT64 },
	{ "canch", 		KSTAT_DATA_UINT64 },
	{ "outch", 		KSTAT_DATA_UINT64 },
	{ "msg", 		KSTAT_DATA_UINT64 },
	{ "sema", 		KSTAT_DATA_UINT64 },
	{ "namei", 		KSTAT_DATA_UINT64 },
	{ "ufsiget", 		KSTAT_DATA_UINT64 },
	{ "ufsdirblk", 		KSTAT_DATA_UINT64 },
	{ "ufsipage", 		KSTAT_DATA_UINT64 },
	{ "ufsinopage", 	KSTAT_DATA_UINT64 },
	{ "procovf", 		KSTAT_DATA_UINT64 },
	{ "intrthread", 	KSTAT_DATA_UINT64 },
	{ "intrblk", 		KSTAT_DATA_UINT64 },
	{ "intrunpin",		KSTAT_DATA_UINT64 },
	{ "idlethread", 	KSTAT_DATA_UINT64 },
	{ "inv_swtch", 		KSTAT_DATA_UINT64 },
	{ "nthreads", 		KSTAT_DATA_UINT64 },
	{ "cpumigrate", 	KSTAT_DATA_UINT64 },
	{ "xcalls", 		KSTAT_DATA_UINT64 },
	{ "mutex_adenters", 	KSTAT_DATA_UINT64 },
	{ "rw_rdfails", 	KSTAT_DATA_UINT64 },
	{ "rw_wrfails", 	KSTAT_DATA_UINT64 },
	{ "modload", 		KSTAT_DATA_UINT64 },
	{ "modunload", 		KSTAT_DATA_UINT64 },
	{ "bawrite", 		KSTAT_DATA_UINT64 },
	{ "iowait",		KSTAT_DATA_UINT64 },
};

/*
 * Layout of the named kstat carrying per-CPU virtual memory statistics.
 * As with the system statistics above, the template initializer lists the
 * kstat name and data type for each member in declaration order.
 */
static struct cpu_vm_stats_ks_data {
	kstat_named_t pgrec;
	kstat_named_t pgfrec;
	kstat_named_t pgin;
	kstat_named_t pgpgin;
	kstat_named_t pgout;
	kstat_named_t pgpgout;
	kstat_named_t swapin;
	kstat_named_t pgswapin;
	kstat_named_t swapout;
	kstat_named_t pgswapout;
	kstat_named_t zfod;
	kstat_named_t dfree;
	kstat_named_t scan;
	kstat_named_t rev;
	kstat_named_t hat_fault;
	kstat_named_t as_fault;
	kstat_named_t maj_fault;
	kstat_named_t cow_fault;
	kstat_named_t prot_fault;
	kstat_named_t softlock;
	kstat_named_t kernel_asflt;
	kstat_named_t pgrrun;
	kstat_named_t execpgin;
	kstat_named_t execpgout;
	kstat_named_t execfree;
	kstat_named_t anonpgin;
	kstat_named_t anonpgout;
	kstat_named_t anonfree;
	kstat_named_t fspgin;
	kstat_named_t fspgout;
	kstat_named_t fsfree;
} cpu_vm_stats_ks_data_template = {
	{ "pgrec",		KSTAT_DATA_UINT64 },
	{ "pgfrec",		KSTAT_DATA_UINT64 },
	{ "pgin",		KSTAT_DATA_UINT64 },
	{ "pgpgin",		KSTAT_DATA_UINT64 },
	{ "pgout",		KSTAT_DATA_UINT64 },
	{ "pgpgout",		KSTAT_DATA_UINT64 },
	{ "swapin",		KSTAT_DATA_UINT64 },
	{ "pgswapin",		KSTAT_DATA_UINT64 },
	{ "swapout",		KSTAT_DATA_UINT64 },
	{ "pgswapout",		KSTAT_DATA_UINT64 },
	{ "zfod",		KSTAT_DATA_UINT64 },
	{ "dfree",		KSTAT_DATA_UINT64 },
	{ "scan",		KSTAT_DATA_UINT64 },
	{ "rev",		KSTAT_DATA_UINT64 },
	{ "hat_fault",		KSTAT_DATA_UINT64 },
	{ "as_fault",		KSTAT_DATA_UINT64 },
	{ "maj_fault",		KSTAT_DATA_UINT64 },
	{ "cow_fault",		KSTAT_DATA_UINT64 },
	{ "prot_fault",		KSTAT_DATA_UINT64 },
	{ "softlock",		KSTAT_DATA_UINT64 },
	{ "kernel_asflt",	KSTAT_DATA_UINT64 },
	{ "pgrrun",		KSTAT_DATA_UINT64 },
	{ "execpgin",		KSTAT_DATA_UINT64 },
	{ "execpgout",		KSTAT_DATA_UINT64 },
	{ "execfree",		KSTAT_DATA_UINT64 },
	{ "anonpgin",		KSTAT_DATA_UINT64 },
	{ "anonpgout",		KSTAT_DATA_UINT64 },
	{ "anonfree",		KSTAT_DATA_UINT64 },
	{ "fspgin",		KSTAT_DATA_UINT64 },
	{ "fspgout",		KSTAT_DATA_UINT64 },
	{ "fsfree",		KSTAT_DATA_UINT64 },
};

/*
 * Force the specified thread to migrate to the appropriate processor.
 * Called with thread lock held, returns with it dropped.
*/
static void
force_thread_migrate(kthread_id_t tp)
{
	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp != curthread) {
		/*
		 * Some other thread: kick it off its cpu if it is running,
		 * or requeue it if it is waiting on a dispatch queue.  Any
		 * other state needs no action here.
		 */
		switch (tp->t_state) {
		case TS_ONPROC:
			cpu_surrender(tp);
			break;
		case TS_RUN:
			(void) dispdeq(tp);
			setbackdq(tp);
			break;
		default:
			break;
		}
		thread_unlock(tp);
		return;
	}

	/*
	 * Migrating ourselves: make ourselves runnable again and switch
	 * away so the dispatcher can pick where we run next.
	 */
	THREAD_TRANSITION(tp);
	CL_SETRUN(tp);
	thread_unlock_nopreempt(tp);
	swtch();
}

/*
 * Establish (hard) affinity of thread t for the CPU named by cpu_id.
 *
 * Affinity requests nest: each call bumps t_affinitycnt, and the binding
 * stays in force until thread_affinity_clear() has brought the count back
 * down to zero.
 *
 * The caller is responsible for keeping cpu_id valid, typically by holding
 * cpu_lock across the call.  The exception is CPU_CURRENT: in that case
 * this routine takes cpu_lock itself and binds to CPU->cpu_id.
 *
 * A request that conflicts with an affinity already in force panics.
 */
void
thread_affinity_set(kthread_id_t t, int cpu_id)
{
	const int	use_current = (cpu_id == CPU_CURRENT);
	cpu_t		*target;

	ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));

	if (use_current) {
		mutex_enter(&cpu_lock);
		cpu_id = CPU->cpu_id;
	}

	/*
	 * We should be asserting that cpu_lock is held here, but
	 * the NCA code doesn't acquire it.  The following assert
	 * should be uncommented when the NCA code is fixed.
	 *
	 * ASSERT(MUTEX_HELD(&cpu_lock));
	 */
	ASSERT((cpu_id >= 0) && (cpu_id < NCPU));
	target = cpu[cpu_id];
	ASSERT(target != NULL);	/* user must provide a good cpu_id */

	thread_lock(t);

	/* An existing hard affinity to a different cpu is fatal. */
	if (t->t_affinitycnt > 0 && t->t_bound_cpu != target) {
		panic("affinity_set: setting %p but already bound to %p",
		    (void *)target, (void *)t->t_bound_cpu);
	}
	t->t_affinitycnt++;
	t->t_bound_cpu = target;

	/*
	 * Nothing more to do only when we are binding ourselves to the
	 * cpu we are already on; otherwise push the thread through the
	 * dispatcher.
	 */
	if (t == curthread && t->t_cpu == target) {
		thread_unlock(t);
	} else {
		force_thread_migrate(t);	/* drops thread lock */
	}

	if (use_current)
		mutex_exit(&cpu_lock);
}

/*
 * Wrapper for backward compatibility.
*/ void affinity_set(int cpu_id) { thread_affinity_set(curthread, cpu_id); } /* * Decrement the affinity reservation count and if it becomes zero, * clear the CPU affinity for the current thread, or set it to the user's * software binding request. */ void thread_affinity_clear(kthread_id_t t) { register processorid_t binding; thread_lock(t); if (--t->t_affinitycnt == 0) { if ((binding = t->t_bind_cpu) == PBIND_NONE) { /* * Adjust disp_max_unbound_pri if necessary. */ disp_adjust_unbound_pri(t); t->t_bound_cpu = NULL; if (t->t_cpu->cpu_part != t->t_cpupart) { force_thread_migrate(t); return; } } else { t->t_bound_cpu = cpu[binding]; /* * Make sure the thread is running on the bound CPU. */ if (t->t_cpu != t->t_bound_cpu) { force_thread_migrate(t); return; /* already dropped lock */ } } } thread_unlock(t); } /* * Wrapper for backward compatibility. */ void affinity_clear(void) { thread_affinity_clear(curthread); } /* * Weak cpu affinity. Bind to the "current" cpu for short periods * of time during which the thread must not block (but may be preempted). * Use this instead of kpreempt_disable() when it is only "no migration" * rather than "no preemption" semantics that are required - disabling * preemption holds higher priority threads off of cpu and if the * operation that is protected is more than momentary this is not good * for realtime etc. * * Weakly bound threads will not prevent a cpu from being offlined - * we'll only run them on the cpu to which they are weakly bound but * (because they do not block) we'll always be able to move them on to * another cpu at offline time if we give them just a short moment to * run during which they will unbind. To give a cpu a chance of offlining, * however, we require a barrier to weak bindings that may be raised for a * given cpu (offline/move code may set this and then wait a short time for * existing weak bindings to drop); the cpu_inmotion pointer is that barrier. 
*
 * There are few restrictions on the calling context of thread_nomigrate.
 * The caller must not hold the thread lock.  Calls may be nested.
 *
 * After weakbinding a thread must not perform actions that may block.
 * In particular it must not call thread_affinity_set; calling that when
 * already weakbound is nonsensical anyway.
 *
 * If curthread is prevented from migrating for other reasons
 * (kernel preemption disabled; high pil; strongly bound; interrupt thread)
 * then the weak binding will succeed even if this cpu is the target of an
 * offline/move request.
 *
 * Bookkeeping used by the code below: a positive t_nomigrate counts nested
 * weak bindings (with t_weakbound_cpu naming the cpu); a negative
 * t_nomigrate counts nested requests that were satisfied by leaving kernel
 * preemption disabled instead.  thread_allowmigrate() undoes one level of
 * either form.
 */
void
thread_nomigrate(void)
{
	cpu_t *cp;
	kthread_id_t t = curthread;

again:
	/* Preemption stays off until one of the exits below re-enables it. */
	kpreempt_disable();
	cp = CPU;

	/*
	 * A highlevel interrupt must not modify t_nomigrate or
	 * t_weakbound_cpu of the thread it has interrupted.  A lowlevel
	 * interrupt thread cannot migrate and we can avoid the
	 * thread_lock call below by short-circuiting here.  In either
	 * case we can just return since no migration is possible and
	 * the condition will persist (ie, when we test for these again
	 * in thread_allowmigrate they can't have changed).   Migration
	 * is also impossible if we're at or above DISP_LEVEL pil.
	 */
	if (CPU_ON_INTR(cp) || t->t_flag & T_INTR_THREAD ||
	    getpil() >= DISP_LEVEL) {
		kpreempt_enable();
		return;
	}

	/*
	 * We must be consistent with existing weak bindings.  Since we
	 * may be interrupted between the increment of t_nomigrate and
	 * the store to t_weakbound_cpu below we cannot assume that
	 * t_weakbound_cpu will be set if t_nomigrate is.  Note that we
	 * cannot assert t_weakbound_cpu == t_bind_cpu since that is not
	 * always the case.
	 */
	if (t->t_nomigrate && t->t_weakbound_cpu && t->t_weakbound_cpu != cp) {
		/* Skip the panic if the system is already panicking. */
		if (!panicstr)
			panic("thread_nomigrate: binding to %p but already "
			    "bound to %p", (void *)cp,
			    (void *)t->t_weakbound_cpu);
	}

	/*
	 * At this point we have preemption disabled and we don't yet hold
	 * the thread lock.  So it's possible that somebody else could
	 * set t_bind_cpu here and not be able to force us across to the
	 * new cpu (since we have preemption disabled).
	 */
	thread_lock(curthread);

	/*
	 * If further weak bindings are being (temporarily) suppressed then
	 * we'll settle for disabling kernel preemption (which assures
	 * no migration provided the thread does not block which it is
	 * not allowed to if using thread_nomigrate).  We must remember
	 * this disposition so we can take appropriate action in
	 * thread_allowmigrate.  If this is a nested call and the
	 * thread is already weakbound then fall through as normal.
	 * We remember the decision to settle for kpreempt_disable through
	 * negative nesting counting in t_nomigrate.  Once a thread has had one
	 * weakbinding request satisfied in this way any further (nested)
	 * requests will continue to be satisfied in the same way,
	 * even if weak bindings have recommenced.
	 */
	if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) {
		--t->t_nomigrate;
		thread_unlock(curthread);
		return;		/* with kpreempt_disable still active */
	}

	/*
	 * We hold thread_lock so t_bind_cpu cannot change.  We could,
	 * however, be running on a different cpu to which we are t_bound_cpu
	 * to (as explained above).  If we grant the weak binding request
	 * in that case then the dispatcher must favour our weak binding
	 * over our strong (in which case, just as when preemption is
	 * disabled, we can continue to run on a cpu other than the one to
	 * which we are strongbound; the difference in this case is that
	 * this thread can be preempted and so can appear on the dispatch
	 * queues of a cpu other than the one it is strongbound to).
	 *
	 * If the cpu we are running on does not appear to be a current
	 * offline target (we check cpu_inmotion to determine this - since
	 * we don't hold cpu_lock we may not see a recent store to that,
	 * so it's possible that we at times can grant a weak binding to a
	 * cpu that is an offline target, but that one request will not
	 * prevent the offline from succeeding) then we will always grant
	 * the weak binding request.  This includes the case above where
	 * we grant a weakbinding not commensurate with our strong binding.
	 *
	 * If our cpu does appear to be an offline target then we're inclined
	 * not to grant the weakbinding request just yet - we'd prefer to
	 * migrate to another cpu and grant the request there.  The
	 * exceptions are those cases where going through preemption code
	 * will not result in us changing cpu:
	 *
	 *	. interrupts have already bypassed this case (see above)
	 *	. we are already weakbound to this cpu (dispatcher code will
	 *	  always return us to the weakbound cpu)
	 *	. preemption was disabled even before we disabled it above
	 *	. we are strongbound to this cpu (if we're strongbound to
	 *	  another and not yet running there the trip through the
	 *	  dispatcher will move us to the strongbound cpu and we
	 *	  will grant the weak binding there)
	 */
	if (cp != cpu_inmotion || t->t_nomigrate > 0 || t->t_preempt > 1 ||
	    t->t_bound_cpu == cp) {
		/*
		 * Don't be tempted to store to t_weakbound_cpu only on
		 * the first nested bind request - if we're interrupted
		 * after the increment of t_nomigrate and before the
		 * store to t_weakbound_cpu and the interrupt calls
		 * thread_nomigrate then the assertion in thread_allowmigrate
		 * would fail.
		 */
		t->t_nomigrate++;
		t->t_weakbound_cpu = cp;
		membar_producer();
		thread_unlock(curthread);
		/*
		 * Now that we have dropped the thread_lock another thread
		 * can set our t_weakbound_cpu, and will try to migrate us
		 * to the strongbound cpu (which will not be prevented by
		 * preemption being disabled since we're about to enable
		 * preemption).  We have granted the weakbinding to the current
		 * cpu, so again we are in the position that it is possible
		 * that our weak and strong bindings differ.  Again this
		 * is catered for by dispatcher code which will favour our
		 * weak binding.
		 *
		 * NOTE(review): "t_weakbound_cpu" in the first sentence
		 * presumably should read "t_bound_cpu" - in this file only
		 * the owning thread stores to t_weakbound_cpu, whereas
		 * thread_affinity_set() stores another thread's
		 * t_bound_cpu.  Confirm.
		 */
		kpreempt_enable();
	} else {
		/*
		 * Move to another cpu before granting the request by
		 * forcing this thread through preemption code.  When we
		 * get to set{front,back}dq called from CL_PREEMPT()
		 * cpu_choose() will be used to select a cpu to queue
		 * us on - that will see cpu_inmotion and take
		 * steps to avoid returning us to this cpu.
		 */
		cp->cpu_kprunrun = 1;
		thread_unlock(curthread);
		kpreempt_enable();	/* will call preempt() */
		goto again;
	}
}

/*
 * Undo one level of thread_nomigrate() for the calling thread.  The cases
 * that thread_nomigrate() returned from without recording anything
 * (interrupt context, interrupt thread, pil at or above DISP_LEVEL) are
 * recognised by the same tests here and are likewise no-ops.
 */
void
thread_allowmigrate(void)
{
	kthread_id_t t = curthread;

	ASSERT(t->t_weakbound_cpu == CPU ||
	    (t->t_nomigrate < 0 && t->t_preempt > 0) ||
	    CPU_ON_INTR(CPU) || t->t_flag & T_INTR_THREAD ||
	    getpil() >= DISP_LEVEL);

	if (CPU_ON_INTR(CPU) || (t->t_flag & T_INTR_THREAD) ||
	    getpil() >= DISP_LEVEL)
		return;

	if (t->t_nomigrate < 0) {
		/*
		 * This thread was granted "weak binding" in the
		 * stronger form of kernel preemption disabling.
		 * Undo a level of nesting for both t_nomigrate
		 * and t_preempt.
		 */
		++t->t_nomigrate;
		kpreempt_enable();
	} else if (--t->t_nomigrate == 0) {
		/*
		 * Time to drop the weak binding.  We need to cater
		 * for the case where we're weakbound to a different
		 * cpu than that to which we're strongbound (a very
		 * temporary arrangement that must only persist until
		 * weak binding drops).  We don't acquire thread_lock
		 * here so even as this code executes t_bound_cpu
		 * may be changing.  So we disable preemption and
		 * a) in the case that t_bound_cpu changes while we
		 * have preemption disabled kprunrun will be set
		 * asynchronously, and b) if before disabling
		 * preemption we were already on a different cpu to
		 * our t_bound_cpu then we set kprunrun ourselves
		 * to force a trip through the dispatcher when
		 * preemption is enabled.
*/
		kpreempt_disable();
		/* Case (b) above: weak and strong bindings disagree. */
		if (t->t_bound_cpu &&
		    t->t_weakbound_cpu != t->t_bound_cpu)
			CPU->cpu_kprunrun = 1;
		t->t_weakbound_cpu = NULL;
		membar_producer();
		kpreempt_enable();
	}
}

/*
 * weakbinding_stop can be used to temporarily cause weakbindings made
 * with thread_nomigrate to be satisfied through the stronger action of
 * kpreempt_disable.  weakbinding_start recommences normal weakbinding.
 *
 * Both must be called with cpu_lock held.
 */

void
weakbinding_stop(void)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	weakbindingbarrier = 1;
	membar_producer();	/* make visible before subsequent thread_lock */
}

void
weakbinding_start(void)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	weakbindingbarrier = 0;
}

/*
 * This routine is called to place the CPUs in a safe place so that
 * one of them can be taken off line or placed on line.  What we are
 * trying to do here is prevent a thread from traversing the list
 * of active CPUs while we are changing it or from getting placed on
 * the run queue of a CPU that has just gone off line.  We do this by
 * creating a thread with the highest possible prio for each CPU and
 * having it call this routine.  The advantage of this method is that
 * we can eliminate all checks for CPU_ACTIVE in the disp routines.
 * This makes disp faster at the expense of making p_online() slower
 * which is a good trade off.
 *
 * "safe" points at this CPU's safe_list entry (cpu_pause_alloc() passes
 * &safe_list[cpu_id]).  Each pass of the loop announces PAUSE_READY, waits
 * for cp_go, raises spl and announces PAUSE_WAIT, then spins until the
 * entry is set back to PAUSE_IDLE before switching away.  The loop ends,
 * and the thread reports PAUSE_DEAD, once the entry is found to be
 * PAUSE_DIE.
 */
static void
cpu_pause(volatile char *safe)
{
	int s;
	struct _cpu_pause_info *cpi = &cpu_pause_info;

	ASSERT((curthread->t_bound_cpu != NULL) || (*safe == PAUSE_DIE));

	while (*safe != PAUSE_DIE) {
		*safe = PAUSE_READY;
		membar_enter();		/* make sure stores are flushed */
		sema_v(&cpi->cp_sem);	/* signal requesting thread */

		/*
		 * Wait here until all pause threads are running.  That
		 * indicates that it's safe to do the spl.  Until
		 * cpu_pause_info.cp_go is set, we don't want to spl
		 * because that might block clock interrupts needed
		 * to preempt threads on other CPUs.
*/ while (cpi->cp_go == 0) ; /* * Even though we are at the highest disp prio, we need * to block out all interrupts below LOCK_LEVEL so that * an intr doesn't come in, wake up a thread, and call * setbackdq/setfrontdq. */ s = splhigh(); /* * This cpu is now safe. */ *safe = PAUSE_WAIT; membar_enter(); /* make sure stores are flushed */ /* * Now we wait. When we are allowed to continue, safe will * be set to PAUSE_IDLE. */ while (*safe != PAUSE_IDLE) ; splx(s); /* * Waiting is at an end. Switch out of cpu_pause * loop and resume useful work. */ swtch(); } mutex_enter(&pause_free_mutex); *safe = PAUSE_DEAD; cv_broadcast(&pause_free_cv); mutex_exit(&pause_free_mutex); } /* * Allow the cpus to start running again. */ void start_cpus() { int i; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(cpu_pause_info.cp_paused); cpu_pause_info.cp_paused = NULL; for (i = 0; i < NCPU; i++) safe_list[i] = PAUSE_IDLE; membar_enter(); /* make sure stores are flushed */ affinity_clear(); splx(cpu_pause_info.cp_spl); kpreempt_enable(); } /* * Allocate a pause thread for a CPU. */ static void cpu_pause_alloc(cpu_t *cp) { kthread_id_t t; int cpun = cp->cpu_id; /* * Note, v.v_nglobpris will not change value as long as I hold * cpu_lock. */ t = thread_create(NULL, 0, cpu_pause, (caddr_t)&safe_list[cpun], 0, &p0, TS_STOPPED, v.v_nglobpris - 1); thread_lock(t); t->t_bound_cpu = cp; t->t_disp_queue = cp->cpu_disp; t->t_affinitycnt = 1; t->t_preempt = 1; thread_unlock(t); cp->cpu_pause_thread = t; /* * Registering a thread in the callback table is usually done * in the initialization code of the thread. In this * case, we do it right after thread creation because the * thread itself may never run, and we need to register the * fact that it is safe for cpr suspend. */ CALLB_CPR_INIT_SAFE(t, "cpu_pause"); } /* * Free a pause thread for a CPU. */ static void cpu_pause_free(cpu_t *cp) { kthread_id_t t; int cpun = cp->cpu_id; ASSERT(MUTEX_HELD(&cpu_lock)); /* * We have to get the thread and tell him to die. 
*/ if ((t = cp->cpu_pause_thread) == NULL) { ASSERT(safe_list[cpun] == PAUSE_IDLE); return; } thread_lock(t); t->t_cpu = CPU; /* disp gets upset if last cpu is quiesced. */ t->t_bound_cpu = NULL; /* Must un-bind; cpu may not be running. */ t->t_pri = v.v_nglobpris - 1; ASSERT(safe_list[cpun] == PAUSE_IDLE); safe_list[cpun] = PAUSE_DIE; THREAD_TRANSITION(t); setbackdq(t); thread_unlock_nopreempt(t); /* * If we don't wait for the thread to actually die, it may try to * run on the wrong cpu as part of an actual call to pause_cpus(). */ mutex_enter(&pause_free_mutex); while (safe_list[cpun] != PAUSE_DEAD) { cv_wait(&pause_free_cv, &pause_free_mutex); } mutex_exit(&pause_free_mutex); safe_list[cpun] = PAUSE_IDLE; cp->cpu_pause_thread = NULL; } /* * Initialize basic structures for pausing CPUs. */ void cpu_pause_init() { sema_init(&cpu_pause_info.cp_sem, 0, NULL, SEMA_DEFAULT, NULL); /* * Create initial CPU pause thread. */ cpu_pause_alloc(CPU); } /* * Start the threads used to pause another CPU. */ static int cpu_pause_start(processorid_t cpu_id) { int i; int cpu_count = 0; for (i = 0; i < NCPU; i++) { cpu_t *cp; kthread_id_t t; cp = cpu[i]; if (!CPU_IN_SET(cpu_available, i) || (i == cpu_id)) { safe_list[i] = PAUSE_WAIT; continue; } /* * Skip CPU if it is quiesced or not yet started. */ if ((cp->cpu_flags & (CPU_QUIESCED | CPU_READY)) != CPU_READY) { safe_list[i] = PAUSE_WAIT; continue; } /* * Start this CPU's pause thread. */ t = cp->cpu_pause_thread; thread_lock(t); /* * Reset the priority, since nglobpris may have * changed since the thread was created, if someone * has loaded the RT (or some other) scheduling * class. */ t->t_pri = v.v_nglobpris - 1; THREAD_TRANSITION(t); setbackdq(t); thread_unlock_nopreempt(t); ++cpu_count; } return (cpu_count); } /* * Pause all of the CPUs except the one we are on by creating a high * priority thread bound to those CPUs. * * Note that one must be extremely careful regarding code * executed while CPUs are paused. 
Since a CPU may be paused * while a thread scheduling on that CPU is holding an adaptive * lock, code executed with CPUs paused must not acquire adaptive * (or low-level spin) locks. Also, such code must not block, * since the thread that is supposed to initiate the wakeup may * never run. * * With a few exceptions, the restrictions on code executed with CPUs * paused match those for code executed at high-level interrupt * context. */ void pause_cpus(cpu_t *off_cp) { processorid_t cpu_id; int i; struct _cpu_pause_info *cpi = &cpu_pause_info; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(cpi->cp_paused == NULL); cpi->cp_count = 0; cpi->cp_go = 0; for (i = 0; i < NCPU; i++) safe_list[i] = PAUSE_IDLE; kpreempt_disable(); /* * If running on the cpu that is going offline, get off it. * This is so that it won't be necessary to rechoose a CPU * when done. */ if (CPU == off_cp) cpu_id = off_cp->cpu_next_part->cpu_id; else cpu_id = CPU->cpu_id; affinity_set(cpu_id); /* * Start the pause threads and record how many were started */ cpi->cp_count = cpu_pause_start(cpu_id); /* * Now wait for all CPUs to be running the pause thread. */ while (cpi->cp_count > 0) { /* * Spin reading the count without grabbing the disp * lock to make sure we don't prevent the pause * threads from getting the lock. */ while (sema_held(&cpi->cp_sem)) ; if (sema_tryp(&cpi->cp_sem)) --cpi->cp_count; } cpi->cp_go = 1; /* all have reached cpu_pause */ /* * Now wait for all CPUs to spl. (Transition from PAUSE_READY * to PAUSE_WAIT.) 
*/ for (i = 0; i < NCPU; i++) { while (safe_list[i] != PAUSE_WAIT) ; } cpi->cp_spl = splhigh(); /* block dispatcher on this CPU */ cpi->cp_paused = curthread; } /* * Check whether the current thread has CPUs paused */ int cpus_paused(void) { if (cpu_pause_info.cp_paused != NULL) { ASSERT(cpu_pause_info.cp_paused == curthread); return (1); } return (0); } static cpu_t * cpu_get_all(processorid_t cpun) { ASSERT(MUTEX_HELD(&cpu_lock)); if (cpun >= NCPU || cpun < 0 || !CPU_IN_SET(cpu_available, cpun)) return (NULL); return (cpu[cpun]); } /* * Check whether cpun is a valid processor id and whether it should be * visible from the current zone. If it is, return a pointer to the * associated CPU structure. */ cpu_t * cpu_get(processorid_t cpun) { cpu_t *c; ASSERT(MUTEX_HELD(&cpu_lock)); c = cpu_get_all(cpun); if (c != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() && zone_pset_get(curproc->p_zone) != cpupart_query_cpu(c)) return (NULL); return (c); } /* * The following functions should be used to check CPU states in the kernel. * They should be invoked with cpu_lock held. Kernel subsystems interested * in CPU states should *not* use cpu_get_state() and various P_ONLINE/etc * states. Those are for user-land (and system call) use only. */ /* * Determine whether the CPU is online and handling interrupts. */ int cpu_is_online(cpu_t *cpu) { ASSERT(MUTEX_HELD(&cpu_lock)); return (cpu_flagged_online(cpu->cpu_flags)); } /* * Determine whether the CPU is offline (this includes spare and faulted). */ int cpu_is_offline(cpu_t *cpu) { ASSERT(MUTEX_HELD(&cpu_lock)); return (cpu_flagged_offline(cpu->cpu_flags)); } /* * Determine whether the CPU is powered off. */ int cpu_is_poweredoff(cpu_t *cpu) { ASSERT(MUTEX_HELD(&cpu_lock)); return (cpu_flagged_poweredoff(cpu->cpu_flags)); } /* * Determine whether the CPU is handling interrupts. 
*/
int
cpu_is_nointr(cpu_t *cpu)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	/* True when the CPU is active but CPU_ENABLE is clear. */
	return (cpu_flagged_nointr(cpu->cpu_flags));
}

/*
 * Determine whether the CPU is active (scheduling threads).
 * The caller must hold cpu_lock.
 */
int
cpu_is_active(cpu_t *cpu)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	return (cpu_flagged_active(cpu->cpu_flags));
}

/*
 * Same as above, but these require cpu_flags instead of cpu_t pointers.
 * None of them asserts cpu_lock; they only examine the flags value that
 * is passed in.
 */

/*
 * Online: active (see cpu_flagged_active()) with CPU_ENABLE set.
 */
int
cpu_flagged_online(cpu_flag_t cpu_flags)
{
	return (cpu_flagged_active(cpu_flags) &&
	    (cpu_flags & CPU_ENABLE));
}

/*
 * Offline: CPU_POWEROFF is clear and the CPU_READY/CPU_OFFLINE pair is
 * anything other than "ready and not offline".
 */
int
cpu_flagged_offline(cpu_flag_t cpu_flags)
{
	return (((cpu_flags & CPU_POWEROFF) == 0) &&
	    ((cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY));
}

/*
 * Powered off: CPU_POWEROFF is set.
 */
int
cpu_flagged_poweredoff(cpu_flag_t cpu_flags)
{
	return ((cpu_flags & CPU_POWEROFF) == CPU_POWEROFF);
}

/*
 * No-intr: active (see cpu_flagged_active()) with CPU_ENABLE clear.
 */
int
cpu_flagged_nointr(cpu_flag_t cpu_flags)
{
	return (cpu_flagged_active(cpu_flags) &&
	    (cpu_flags & CPU_ENABLE) == 0);
}

/*
 * Active: none of CPU_POWEROFF, CPU_FAULTED or CPU_SPARE is set, and the
 * CPU is ready and not offline.
 */
int
cpu_flagged_active(cpu_flag_t cpu_flags)
{
	return (((cpu_flags & (CPU_POWEROFF | CPU_FAULTED | CPU_SPARE)) == 0) &&
	    ((cpu_flags & (CPU_READY | CPU_OFFLINE)) == CPU_READY));
}

/*
 * Bring the indicated CPU online.
 *
 * The caller must hold cpu_lock.  Returns 0 on success, or the error
 * returned by the architecture-dependent mp_cpu_start() hook, in which
 * case nothing else has been changed.
 */
int
cpu_online(cpu_t *cp)
{
	int	error = 0;

	/*
	 * Handle on-line request.
	 *	This code must put the new CPU on the active list before
	 *	starting it because it will not be paused, and will start
	 * 	using the active list immediately.  The real start occurs
	 *	when the CPU_QUIESCED flag is turned off.
	 */

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Put all the cpus into a known safe place.
	 * No mutexes can be entered while CPUs are paused.
	 * (The pause itself happens below, only once the arch-dependent
	 * start hook has succeeded.)
	 */
	error = mp_cpu_start(cp);	/* arch-dep hook */
	if (error == 0) {
		pause_cpus(NULL);
		cpu_add_active_internal(cp);
		/* A faulted CPU leaves the faulted state when onlined. */
		if (cp->cpu_flags & CPU_FAULTED) {
			cp->cpu_flags &= ~CPU_FAULTED;
			mp_cpu_faulted_exit(cp);
		}
		cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN |
		    CPU_SPARE);
		start_cpus();
		/* The remaining setup is done with the other CPUs running. */
		cpu_stats_kstat_create(cp);
		cpu_create_intrstat(cp);
		lgrp_kstat_create(cp);
		cpu_state_change_notify(cp->cpu_id, CPU_ON);
		cpu_intr_enable(cp);	/* arch-dep hook */
		cyclic_online(cp);
		poke_cpu(cp->cpu_id);
	}

	return (error);
}

/*
 * Take the indicated CPU offline.
 *
 * cp	 - CPU to take offline.
 * flags - if CPU_FORCED is set, threads bound to the CPU are first
 *	   unbound via cpu_unbind().
 *
 * The caller must hold cpu_lock.  Returns 0 on success.  Returns EBUSY
 * when the request would remove the last online CPU (system-wide or in
 * the partition), when cpu_intr_count() is below 2, when the calling
 * thread is bound to the CPU, when bound threads do not drain, or when
 * the cyclic subsystem refuses to go offline.  Errors from cpu_unbind()
 * and mp_cpu_stop() are passed through.  On failure, interrupts and
 * cyclics are re-enabled if they had been disabled here, and a CPU_ON
 * notification is sent to undo the earlier CPU_OFF notification.
 */
int
cpu_offline(cpu_t *cp, int flags)
{
	cpupart_t *pp;
	int	error = 0;
	cpu_t	*ncp;
	int	intr_enable;
	int	cyclic_off = 0;
	int	loop_count;
	int	no_quiesce = 0;
	int	(*bound_func)(struct cpu *, int);
	kthread_t *t;
	lpl_t	*cpu_lpl;
	proc_t	*p;
	int	lgrp_diff_lpl;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * If we're going from faulted or spare to offline, just
	 * clear these flags and update CPU state.
	 */
	if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) {
		if (cp->cpu_flags & CPU_FAULTED) {
			cp->cpu_flags &= ~CPU_FAULTED;
			mp_cpu_faulted_exit(cp);
		}
		cp->cpu_flags &= ~CPU_SPARE;
		cpu_set_state(cp);
		return (0);
	}

	/*
	 * Handle off-line request.
	 */
	pp = cp->cpu_part;
	/*
	 * Don't offline last online CPU in partition
	 */
	if (ncpus_online <= 1 || pp->cp_ncpus <= 1 || cpu_intr_count(cp) < 2)
		return (EBUSY);
	/*
	 * Unbind all threads bound to our CPU if we were asked to.
	 */
	if (flags & CPU_FORCED && (error = cpu_unbind(cp->cpu_id)) != 0)
		return (error);
	/*
	 * We shouldn't be bound to this CPU ourselves.
	 */
	if (curthread->t_bound_cpu == cp)
		return (EBUSY);

	/*
	 * Tell interested parties that this CPU is going offline.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_OFF);

	/*
	 * Take the CPU out of interrupt participation so we won't find
	 * bound kernel threads.  If the architecture cannot completely
	 * shut off interrupts on the CPU, don't quiesce it, but don't
	 * run anything but interrupt thread... this is indicated by
	 * the CPU_OFFLINE flag being on but the CPU_QUIESCED flag being
	 * off.
	 */
	intr_enable = cp->cpu_flags & CPU_ENABLE;
	if (intr_enable)
		no_quiesce = cpu_intr_disable(cp);

	/*
	 * Record that we are aiming to offline this cpu.  This acts as
	 * a barrier to further weak binding requests in thread_nomigrate
	 * and also causes cpu_choose, disp_lowpri_cpu and setfrontdq to
	 * lean away from this cpu.  Further strong bindings are already
	 * avoided since we hold cpu_lock.  Since threads that are set
	 * runnable around now and others coming off the target cpu are
	 * directed away from the target, existing strong and weak bindings
	 * (especially the latter) to the target cpu stand maximum chance of
	 * being able to unbind during the short delay loop below (if other
	 * unbound threads compete they may not see cpu in time to unbind
	 * even if they would do so immediately).
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Check for kernel threads (strong or weak) bound to that CPU.
	 * Strongly bound threads may not unbind, and we'll have to return
	 * EBUSY.  Weakly bound threads should always disappear - we've
	 * stopped more weak binding with cpu_inmotion and existing
	 * bindings will drain imminently (they may not block).  Nonetheless
	 * we will wait for a fixed period for all bound threads to disappear.
	 * Inactive interrupt threads are OK (they'll be in TS_FREE
	 * state).  If test finds some bound threads, wait a few ticks
	 * to give short-lived threads (such as interrupts) chance to
	 * complete.  Note that if no_quiesce is set, i.e. this cpu
	 * is required to service interrupts, then we take the route
	 * that permits interrupt threads to be active (or bypassed).
	 */
	bound_func = no_quiesce ? disp_bound_threads : disp_bound_anythreads;

	/*
	 * Re-entered from below (via "goto again") when a check made with
	 * the CPUs paused shows that the CPU is still not drained.
	 */
again:	for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
		if (loop_count >= 5) {
			error = EBUSY;	/* some threads still bound */
			break;
		}

		/*
		 * If some threads were assigned, give them
		 * a chance to complete or move.
		 *
		 * This assumes that the clock_thread is not bound
		 * to any CPU, because the clock_thread is needed to
		 * do the delay(hz/100).
		 *
		 * Note: we still hold the cpu_lock while waiting for
		 * the next clock tick.  This is OK since it isn't
		 * needed for anything else except processor_bind(2),
		 * and system initialization.  If we drop the lock,
		 * we would risk another p_online disabling the last
		 * processor.
		 */
		delay(hz/100);
	}

	/* Offline the cyclic subsystem once only, even if we loop again. */
	if (error == 0 && cyclic_off == 0) {
		if (!cyclic_offline(cp)) {
			/*
			 * We must have bound cyclics...
			 */
			error = EBUSY;
			goto out;
		}
		cyclic_off = 1;
	}

	/*
	 * Call mp_cpu_stop() to perform any special operations
	 * needed for this machine architecture to offline a CPU.
	 */
	if (error == 0)
		error = mp_cpu_stop(cp);	/* arch-dep hook */

	/*
	 * If that all worked, take the CPU offline and decrement
	 * ncpus_online.
	 */
	if (error == 0) {
		/*
		 * Put all the cpus into a known safe place.
		 * No mutexes can be entered while CPUs are paused.
		 */
		pause_cpus(cp);
		/*
		 * Repeat the operation, if necessary, to make sure that
		 * all outstanding low-level interrupts run to completion
		 * before we set the CPU_QUIESCED flag.  It's also possible
		 * that a thread has weak bound to the cpu despite our raising
		 * cpu_inmotion above since it may have loaded that
		 * value before the barrier became visible (this would have
		 * to be the thread that was on the target cpu at the time
		 * we raised the barrier).
		 */
		if ((!no_quiesce && cp->cpu_intr_actv != 0) ||
		    (*bound_func)(cp, 1)) {
			start_cpus();
			/* Undo the mp_cpu_stop() above before retrying. */
			(void) mp_cpu_start(cp);
			goto again;
		}
		/*
		 * Remember the next CPU in the partition and this CPU's
		 * lpl before the CPU is unlinked; both are used by the
		 * thread walks below.
		 */
		ncp = cp->cpu_next_part;
		cpu_lpl = cp->cpu_lpl;
		ASSERT(cpu_lpl != NULL);

		/*
		 * Remove the CPU from the list of active CPUs.
		 */
		cpu_remove_active(cp);

		/*
		 * Walk the active process list and look for threads
		 * whose home lgroup needs to be updated, or
		 * the last CPU they run on is the one being offlined now.
		 */

		ASSERT(curthread->t_cpu != cp);
		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {
				ASSERT(t->t_lpl != NULL);
				/*
				 * Taking last CPU in lpl offline
				 * Rehome thread if it is in this lpl
				 * Otherwise, update the count of how many
				 * threads are in this CPU's lgroup but have
				 * a different lpl.
				 */

				if (cpu_lpl->lpl_ncpu == 0) {
					if (t->t_lpl == cpu_lpl)
						lgrp_move_thread(t,
						    lgrp_choose(t,
							t->t_cpupart), 0);
					else if (t->t_lpl->lpl_lgrpid ==
					    cpu_lpl->lpl_lgrpid)
						lgrp_diff_lpl++;
				}
				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/*
				 * Update CPU last ran on if it was this CPU
				 */
				if (t->t_cpu == cp && t->t_bound_cpu != cp)
				    t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
					t->t_pri, NULL);
				ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
				    t->t_weakbound_cpu == cp);

				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, cpu_lpl->lpl_lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;
		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * Rehome threads with same lpl as this CPU when this
			 * is the last CPU in the lpl.
			 */

			if ((cpu_lpl->lpl_ncpu == 0) && (t->t_lpl == cpu_lpl))
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/*
			 * Update CPU last ran on if it was this CPU
			 */

			if (t->t_cpu == cp && t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp,
				    t->t_lpl, t->t_pri, NULL);
			}
			ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
			    t->t_weakbound_cpu == cp);
			t = t->t_next;

		} while (t != curthread);
		ASSERT((cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) == 0);
		cp->cpu_flags |= CPU_OFFLINE;
		disp_cpu_inactive(cp);
		/* Only mark the CPU quiesced if interrupts were shut off. */
		if (!no_quiesce)
			cp->cpu_flags |= CPU_QUIESCED;
		ncpus_online--;
		cpu_set_state(cp);
		cpu_inmotion = NULL;
		start_cpus();
		/* Tear down per-CPU kstats with the other CPUs running. */
		cpu_stats_kstat_destroy(cp);
		cpu_delete_intrstat(cp);
		lgrp_kstat_destroy(cp);
	}

out:
	cpu_inmotion = NULL;

	/*
	 * If we failed, re-enable interrupts.
	 * Do this even if cpu_intr_disable returned an error, because
	 * it may have partially disabled interrupts.
	 */
	if (error && intr_enable)
		cpu_intr_enable(cp);

	/*
	 * If we failed, but managed to offline the cyclic subsystem on this
	 * CPU, bring it back online.
	 */
	if (error && cyclic_off)
		cyclic_online(cp);

	/*
	 * If we failed, we need to notify everyone that this CPU is back on.
	 */
	if (error != 0)
		cpu_state_change_notify(cp->cpu_id, CPU_ON);

	return (error);
}

/*
 * Mark the indicated CPU as faulted, taking it offline.
 *
 * The caller must hold cpu_lock and the CPU must not be powered off.
 * flags is passed through to cpu_offline().  Returns 0 on success or the
 * error from cpu_offline(), in which case the CPU is not marked faulted.
 */
int
cpu_faulted(cpu_t *cp, int flags)
{
	int	error = 0;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(!cpu_is_poweredoff(cp));

	/* Already offline: just switch from spare (if set) to faulted. */
	if (cpu_is_offline(cp)) {
		cp->cpu_flags &= ~CPU_SPARE;
		cp->cpu_flags |= CPU_FAULTED;
		mp_cpu_faulted_enter(cp);
		cpu_set_state(cp);
		return (0);
	}

	if ((error = cpu_offline(cp, flags)) == 0) {
		cp->cpu_flags |= CPU_FAULTED;
		mp_cpu_faulted_enter(cp);
		cpu_set_state(cp);
	}

	return (error);
}

/*
 * Mark the indicated CPU as a spare, taking it offline.
*/
int
cpu_spare(cpu_t *cp, int flags)
{
	int	error = 0;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(!cpu_is_poweredoff(cp));

	/* Already offline: leave the faulted state (if set) and mark spare. */
	if (cpu_is_offline(cp)) {
		if (cp->cpu_flags & CPU_FAULTED) {
			cp->cpu_flags &= ~CPU_FAULTED;
			mp_cpu_faulted_exit(cp);
		}
		cp->cpu_flags |= CPU_SPARE;
		cpu_set_state(cp);
		return (0);
	}

	/* Otherwise offline it first; flags is passed to cpu_offline(). */
	if ((error = cpu_offline(cp, flags)) == 0) {
		cp->cpu_flags |= CPU_SPARE;
		cpu_set_state(cp);
	}

	return (error);
}

/*
 * Take the indicated CPU from poweroff to offline.
 *
 * The caller must hold cpu_lock and the CPU must be powered off.
 * Returns the result of the mp_cpu_poweron() hook; the recorded CPU
 * state is refreshed only on success.
 */
int
cpu_poweron(cpu_t *cp)
{
	int	error = ENOTSUP;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(cpu_is_poweredoff(cp));

	error = mp_cpu_poweron(cp);	/* arch-dep hook */
	if (error == 0)
		cpu_set_state(cp);
	return (error);
}

/*
 * Take the indicated CPU from any inactive state to powered off.
 *
 * The caller must hold cpu_lock and the CPU must be offline.  Returns
 * EBUSY if the CPU is not quiesced, otherwise the result of the
 * mp_cpu_poweroff() hook; the recorded CPU state is refreshed only on
 * success.
 */
int
cpu_poweroff(cpu_t *cp)
{
	int	error = ENOTSUP;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(cpu_is_offline(cp));

	if (!(cp->cpu_flags & CPU_QUIESCED))
		return (EBUSY);		/* not completely idle */

	error = mp_cpu_poweroff(cp);	/* arch-dep hook */
	if (error == 0)
		cpu_set_state(cp);
	return (error);
}

/*
 * Initialize the CPU lists for the first CPU.
 *
 * Makes cp the sole member of the cpu_list, cpu_active and default
 * partition (cp_default) lists, gives it sequence id 0, and adds it to
 * cpu_available.
 */
void
cpu_list_init(cpu_t *cp)
{
	cp->cpu_next = cp;
	cp->cpu_prev = cp;
	cpu_list = cp;

	cp->cpu_next_onln = cp;
	cp->cpu_prev_onln = cp;
	cpu_active = cp;

	cp->cpu_seqid = 0;
	CPUSET_ADD(cpu_seqid_inuse, 0);
	cp->cpu_cache_offset = KMEM_CACHE_SIZE(cp->cpu_seqid);
	cp_default.cp_cpulist = cp;
	cp_default.cp_ncpus = 1;
	cp->cpu_next_part = cp;
	cp->cpu_prev_part = cp;
	cp->cpu_part = &cp_default;

	CPUSET_ADD(cpu_available, cp->cpu_id);
}

/*
 * Insert a CPU into the list of available CPUs.
 *
 * The caller must hold cpu_lock, and cpu_list_init() must already have
 * been called.  The CPU is linked into cpu_list only; its online and
 * partition links are left pointing at itself.
 */
void
cpu_add_unit(cpu_t *cp)
{
	int seqid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(cpu_list != NULL);	/* list started in cpu_list_init */

	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)cp, 0);

	/*
	 * Note: most users of the cpu_list will grab the
	 * cpu_lock to ensure that it isn't modified.  However,
	 * certain users can't or won't do that.  To allow this
	 * we pause the other cpus.  Users who walk the list
	 * without cpu_lock, must disable kernel preemption
	 * to ensure that the list isn't modified underneath
	 * them.  Also, any cached pointers to cpu structures
	 * must be revalidated by checking to see if the
	 * cpu_next pointer points to itself.  This check must
	 * be done with the cpu_lock held or kernel preemption
	 * disabled.  This check relies upon the fact that
	 * old cpu structures are not freed or cleared after
	 * they are removed from the cpu_list.
	 *
	 * NOTE(review): cpu_del_unit() below marks a deleted cpu by
	 * setting cpu_next to NULL rather than pointing it at itself;
	 * confirm which check lockless walkers are expected to use.
	 *
	 * Note that the clock code walks the cpu list dereferencing
	 * the cpu_part pointer, so we need to initialize it before
	 * adding the cpu to the list.
	 */
	cp->cpu_part = &cp_default;
	(void) pause_cpus(NULL);
	/* Link cp in just before cpu_list, i.e. at the tail of the ring. */
	cp->cpu_next = cpu_list;
	cp->cpu_prev = cpu_list->cpu_prev;
	cpu_list->cpu_prev->cpu_next = cp;
	cpu_list->cpu_prev = cp;
	start_cpus();

	/* Claim the lowest sequence id that is not already in use. */
	for (seqid = 0; CPU_IN_SET(cpu_seqid_inuse, seqid); seqid++)
		continue;
	CPUSET_ADD(cpu_seqid_inuse, seqid);
	cp->cpu_seqid = seqid;
	ASSERT(ncpus < max_ncpus);
	ncpus++;
	cp->cpu_cache_offset = KMEM_CACHE_SIZE(cp->cpu_seqid);
	cpu[cp->cpu_id] = cp;
	CPUSET_ADD(cpu_available, cp->cpu_id);

	/*
	 * allocate a pause thread for this CPU.
	 */
	cpu_pause_alloc(cp);

	/*
	 * So that new CPUs won't have NULL prev_onln and next_onln pointers,
	 * link them into a list of just that CPU.
	 * This is so that disp_lowpri_cpu will work for thread_create in
	 * pause_cpus() when called from the startup thread in a new CPU.
	 */
	cp->cpu_next_onln = cp;
	cp->cpu_prev_onln = cp;
	cpu_info_kstat_create(cp);
	cp->cpu_next_part = cp;
	cp->cpu_prev_part = cp;

	init_cpu_mstate(cp, CMS_SYSTEM);

	pool_pset_mod = gethrtime();
}

/*
 * Do the opposite of cpu_add_unit().
 *
 * The caller must hold cpu_lock.  The CPU identified by cpuid must
 * exist and must already be off the online and partition lists (its
 * links there are asserted to point at itself).
 */
void
cpu_del_unit(int cpuid)
{
	struct cpu	*cp, *cpnext;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpu[cpuid];
	ASSERT(cp != NULL);

	ASSERT(cp->cpu_next_onln == cp);
	ASSERT(cp->cpu_prev_onln == cp);
	ASSERT(cp->cpu_next_part == cp);
	ASSERT(cp->cpu_prev_part == cp);

	chip_cpu_fini(cp);

	/*
	 * Destroy kstat stuff.
	 */
	cpu_info_kstat_destroy(cp);
	term_cpu_mstate(cp);
	/*
	 * Free up pause thread.
	 */
	cpu_pause_free(cp);
	CPUSET_DEL(cpu_available, cp->cpu_id);
	cpu[cp->cpu_id] = NULL;
	/*
	 * The clock thread and mutex_vector_enter cannot hold the
	 * cpu_lock while traversing the cpu list, therefore we pause
	 * all other threads by pausing the other cpus. These, and any
	 * other routines holding cpu pointers while possibly sleeping
	 * must be sure to call kpreempt_disable before processing the
	 * list and be sure to check that the cpu has not been deleted
	 * after any sleeps (check cp->cpu_next != NULL). We guarantee
	 * to keep the deleted cpu structure around.
	 *
	 * Note that this MUST be done AFTER cpu_available
	 * has been updated so that we don't waste time
	 * trying to pause the cpu we're trying to delete.
	 */
	(void) pause_cpus(NULL);

	/* Unlink cp from the cpu_list ring, advancing the head if needed. */
	cpnext = cp->cpu_next;
	cp->cpu_prev->cpu_next = cp->cpu_next;
	cp->cpu_next->cpu_prev = cp->cpu_prev;
	if (cp == cpu_list)
	    cpu_list = cpnext;

	/*
	 * Signals that the cpu has been deleted (see above).
	 */
	cp->cpu_next = NULL;
	cp->cpu_prev = NULL;

	start_cpus();

	/* Release the sequence id so cpu_add_unit() can reuse it. */
	CPUSET_DEL(cpu_seqid_inuse, cp->cpu_seqid);
	ncpus--;
	lgrp_config(LGRP_CONFIG_CPU_DEL, (uintptr_t)cp, 0);

	pool_pset_mod = gethrtime();
}

/*
 * Add a CPU to the list of active CPUs.
 *	This routine must not get any locks, because other CPUs are paused.
*/
static void
cpu_add_active_internal(cpu_t *cp)
{
	cpupart_t	*pp = cp->cpu_part;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(cpu_list != NULL);	/* list started in cpu_list_init */

	ncpus_online++;
	cpu_set_state(cp);
	/* Link cp in just before cpu_active on the online ring. */
	cp->cpu_next_onln = cpu_active;
	cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
	cpu_active->cpu_prev_onln->cpu_next_onln = cp;
	cpu_active->cpu_prev_onln = cp;

	if (pp->cp_cpulist) {
		/* Partition already has CPUs: link cp in before its head. */
		cp->cpu_next_part = pp->cp_cpulist;
		cp->cpu_prev_part = pp->cp_cpulist->cpu_prev_part;
		pp->cp_cpulist->cpu_prev_part->cpu_next_part = cp;
		pp->cp_cpulist->cpu_prev_part = cp;
	} else {
		/* First CPU in the partition: a ring of one. */
		ASSERT(pp->cp_ncpus == 0);
		pp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
	}
	pp->cp_ncpus++;
	if (pp->cp_ncpus == 1) {
		/* The partition just became non-empty. */
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	}

	chip_cpu_assign(cp);

	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);

	/* Start the CPU with a cleared load average. */
	bzero(&cp->cpu_loadavg, sizeof (cp->cpu_loadavg));
}

/*
 * Add a CPU to the list of active CPUs.
 *	This is called from machine-dependent layers when a new CPU is started.
 *
 * The list update is done with the other CPUs paused; the kstats are
 * created and the CPU_INIT notification is sent after they have been
 * restarted.
 */
void
cpu_add_active(cpu_t *cp)
{
	pause_cpus(NULL);
	cpu_add_active_internal(cp);
	start_cpus();
	cpu_stats_kstat_create(cp);
	cpu_create_intrstat(cp);
	lgrp_kstat_create(cp);
	cpu_state_change_notify(cp->cpu_id, CPU_INIT);
}


/*
 * Remove a CPU from the list of active CPUs.
 *	This routine must not get any locks, because other CPUs are paused.
*/ /* ARGSUSED */ static void cpu_remove_active(cpu_t *cp) { cpupart_t *pp = cp->cpu_part; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(cp->cpu_next_onln != cp); /* not the last one */ ASSERT(cp->cpu_prev_onln != cp); /* not the last one */ chip_cpu_unassign(cp); lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0); cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln; cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln; if (cpu_active == cp) { cpu_active = cp->cpu_next_onln; } cp->cpu_next_onln = cp; cp->cpu_prev_onln = cp; cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; if (pp->cp_cpulist == cp) { pp->cp_cpulist = cp->cpu_next_part; ASSERT(pp->cp_cpulist != cp); } cp->cpu_next_part = cp; cp->cpu_prev_part = cp; pp->cp_ncpus--; if (pp->cp_ncpus == 0) { cp_numparts_nonempty--; ASSERT(cp_numparts_nonempty != 0); } } /* * Routine used to setup a newly inserted CPU in preparation for starting * it running code. */ int cpu_configure(int cpuid) { int retval = 0; ASSERT(MUTEX_HELD(&cpu_lock)); /* * Some structures are statically allocated based upon * the maximum number of cpus the system supports. Do not * try to add anything beyond this limit. */ if (cpuid < 0 || cpuid >= NCPU) { return (EINVAL); } if ((cpu[cpuid] != NULL) && (cpu[cpuid]->cpu_flags != 0)) { return (EALREADY); } if ((retval = mp_cpu_configure(cpuid)) != 0) { return (retval); } cpu[cpuid]->cpu_flags = CPU_QUIESCED | CPU_OFFLINE | CPU_POWEROFF; cpu_set_state(cpu[cpuid]); retval = cpu_state_change_hooks(cpuid, CPU_CONFIG, CPU_UNCONFIG); if (retval != 0) (void) mp_cpu_unconfigure(cpuid); return (retval); } /* * Routine used to cleanup a CPU that has been powered off. This will * destroy all per-cpu information related to this cpu. 
*/ int cpu_unconfigure(int cpuid) { int error; ASSERT(MUTEX_HELD(&cpu_lock)); if (cpu[cpuid] == NULL) { return (ENODEV); } if (cpu[cpuid]->cpu_flags == 0) { return (EALREADY); } if ((cpu[cpuid]->cpu_flags & CPU_POWEROFF) == 0) { return (EBUSY); } if (cpu[cpuid]->cpu_props != NULL) { (void) nvlist_free(cpu[cpuid]->cpu_props); cpu[cpuid]->cpu_props = NULL; } error = cpu_state_change_hooks(cpuid, CPU_UNCONFIG, CPU_CONFIG); if (error != 0) return (error); return (mp_cpu_unconfigure(cpuid)); } /* * Routines for registering and de-registering cpu_setup callback functions. * * Caller's context * These routines must not be called from a driver's attach(9E) or * detach(9E) entry point. * * NOTE: CPU callbacks should not block. They are called with cpu_lock held. */ /* * Ideally, these would be dynamically allocated and put into a linked * list; however that is not feasible because the registration routine * has to be available before the kmem allocator is working (in fact, * it is called by the kmem allocator init code). In any case, there * are quite a few extra entries for future users. */ #define NCPU_SETUPS 10 struct cpu_setup { cpu_setup_func_t *func; void *arg; } cpu_setups[NCPU_SETUPS]; void register_cpu_setup_func(cpu_setup_func_t *func, void *arg) { int i; ASSERT(MUTEX_HELD(&cpu_lock)); for (i = 0; i < NCPU_SETUPS; i++) if (cpu_setups[i].func == NULL) break; if (i >= NCPU_SETUPS) cmn_err(CE_PANIC, "Ran out of cpu_setup callback entries"); cpu_setups[i].func = func; cpu_setups[i].arg = arg; } void unregister_cpu_setup_func(cpu_setup_func_t *func, void *arg) { int i; ASSERT(MUTEX_HELD(&cpu_lock)); for (i = 0; i < NCPU_SETUPS; i++) if ((cpu_setups[i].func == func) && (cpu_setups[i].arg == arg)) break; if (i >= NCPU_SETUPS) cmn_err(CE_PANIC, "Could not find cpu_setup callback to " "deregister"); cpu_setups[i].func = NULL; cpu_setups[i].arg = 0; } /* * Call any state change hooks for this CPU, ignore any errors. 
*/ void cpu_state_change_notify(int id, cpu_setup_t what) { int i; ASSERT(MUTEX_HELD(&cpu_lock)); for (i = 0; i < NCPU_SETUPS; i++) { if (cpu_setups[i].func != NULL) { cpu_setups[i].func(what, id, cpu_setups[i].arg); } } } /* * Call any state change hooks for this CPU, undo it if error found. */ static int cpu_state_change_hooks(int id, cpu_setup_t what, cpu_setup_t undo) { int i; int retval = 0; ASSERT(MUTEX_HELD(&cpu_lock)); for (i = 0; i < NCPU_SETUPS; i++) { if (cpu_setups[i].func != NULL) { retval = cpu_setups[i].func(what, id, cpu_setups[i].arg); if (retval) { for (i--; i >= 0; i--) { if (cpu_setups[i].func != NULL) cpu_setups[i].func(undo, id, cpu_setups[i].arg); } break; } } } return (retval); } /* * Export information about this CPU via the kstat mechanism. */ static struct { kstat_named_t ci_state; kstat_named_t ci_state_begin; kstat_named_t ci_cpu_type; kstat_named_t ci_fpu_type; kstat_named_t ci_clock_MHz; kstat_named_t ci_chip_id; kstat_named_t ci_implementation; #ifdef __sparcv9 kstat_named_t ci_device_ID; kstat_named_t ci_cpu_fru; #endif kstat_named_t ci_brandstr; } cpu_info_template = { { "state", KSTAT_DATA_CHAR }, { "state_begin", KSTAT_DATA_LONG }, { "cpu_type", KSTAT_DATA_CHAR }, { "fpu_type", KSTAT_DATA_CHAR }, { "clock_MHz", KSTAT_DATA_LONG }, { "chip_id", KSTAT_DATA_LONG }, { "implementation", KSTAT_DATA_STRING }, #ifdef __sparcv9 { "device_ID", KSTAT_DATA_UINT64 }, { "cpu_fru", KSTAT_DATA_STRING }, #endif { "brand", KSTAT_DATA_STRING }, }; static kmutex_t cpu_info_template_lock; static int cpu_info_kstat_update(kstat_t *ksp, int rw) { cpu_t *cp = ksp->ks_private; const char *pi_state; if (rw == KSTAT_WRITE) return (EACCES); switch (cp->cpu_type_info.pi_state) { case P_ONLINE: pi_state = PS_ONLINE; break; case P_POWEROFF: pi_state = PS_POWEROFF; break; case P_NOINTR: pi_state = PS_NOINTR; break; case P_FAULTED: pi_state = PS_FAULTED; break; case P_SPARE: pi_state = PS_SPARE; break; case P_OFFLINE: pi_state = PS_OFFLINE; break; default: 
pi_state = "unknown"; } (void) strcpy(cpu_info_template.ci_state.value.c, pi_state); cpu_info_template.ci_state_begin.value.l = cp->cpu_state_begin; (void) strncpy(cpu_info_template.ci_cpu_type.value.c, cp->cpu_type_info.pi_processor_type, 15); (void) strncpy(cpu_info_template.ci_fpu_type.value.c, cp->cpu_type_info.pi_fputypes, 15); cpu_info_template.ci_clock_MHz.value.l = cp->cpu_type_info.pi_clock; cpu_info_template.ci_chip_id.value.l = chip_plat_get_chipid(cp); kstat_named_setstr(&cpu_info_template.ci_implementation, cp->cpu_idstr); #ifdef __sparcv9 cpu_info_template.ci_device_ID.value.ui64 = cpunodes[cp->cpu_id].device_id; kstat_named_setstr(&cpu_info_template.ci_cpu_fru, cpu_fru_fmri(cp)); #endif kstat_named_setstr(&cpu_info_template.ci_brandstr, cp->cpu_brandstr); return (0); } static void cpu_info_kstat_create(cpu_t *cp) { zoneid_t zoneid; ASSERT(MUTEX_HELD(&cpu_lock)); if (pool_pset_enabled()) zoneid = GLOBAL_ZONEID; else zoneid = ALL_ZONES; if ((cp->cpu_info_kstat = kstat_create_zone("cpu_info", cp->cpu_id, NULL, "misc", KSTAT_TYPE_NAMED, sizeof (cpu_info_template) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL, zoneid)) != NULL) { cp->cpu_info_kstat->ks_data_size += 2 * CPU_IDSTRLEN; #ifdef __sparcv9 cp->cpu_info_kstat->ks_data_size += strlen(cpu_fru_fmri(cp)) + 1; #endif cp->cpu_info_kstat->ks_lock = &cpu_info_template_lock; cp->cpu_info_kstat->ks_data = &cpu_info_template; cp->cpu_info_kstat->ks_private = cp; cp->cpu_info_kstat->ks_update = cpu_info_kstat_update; kstat_install(cp->cpu_info_kstat); } } static void cpu_info_kstat_destroy(cpu_t *cp) { ASSERT(MUTEX_HELD(&cpu_lock)); kstat_delete(cp->cpu_info_kstat); cp->cpu_info_kstat = NULL; } /* * Create and install kstats for the boot CPU. 
*/ void cpu_kstat_init(cpu_t *cp) { mutex_enter(&cpu_lock); cpu_info_kstat_create(cp); cpu_stats_kstat_create(cp); cpu_create_intrstat(cp); chip_kstat_create(cp->cpu_chip); cpu_set_state(cp); mutex_exit(&cpu_lock); } /* * Make visible to the zone that subset of the cpu information that would be * initialized when a cpu is configured (but still offline). */ void cpu_visibility_configure(cpu_t *cp, zone_t *zone) { zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(pool_pset_enabled()); ASSERT(cp != NULL); if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) { zone->zone_ncpus++; ASSERT(zone->zone_ncpus <= ncpus); } if (cp->cpu_info_kstat != NULL) kstat_zone_add(cp->cpu_info_kstat, zoneid); } /* * Make visible to the zone that subset of the cpu information that would be * initialized when a previously configured cpu is onlined. */ void cpu_visibility_online(cpu_t *cp, zone_t *zone) { kstat_t *ksp; char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */ zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES; processorid_t cpun; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(pool_pset_enabled()); ASSERT(cp != NULL); ASSERT(cpu_is_active(cp)); cpun = cp->cpu_id; if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) { zone->zone_ncpus_online++; ASSERT(zone->zone_ncpus_online <= ncpus_online); } (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun); if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES)) != NULL) { kstat_zone_add(ksp, zoneid); kstat_rele(ksp); } if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) { kstat_zone_add(ksp, zoneid); kstat_rele(ksp); } if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) { kstat_zone_add(ksp, zoneid); kstat_rele(ksp); } if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) != NULL) { kstat_zone_add(ksp, zoneid); kstat_rele(ksp); } } /* * Update relevant kstats such that cpu is now visible to processes * executing in specified zone. 
*/ void cpu_visibility_add(cpu_t *cp, zone_t *zone) { cpu_visibility_configure(cp, zone); if (cpu_is_active(cp)) cpu_visibility_online(cp, zone); } /* * Make invisible to the zone that subset of the cpu information that would be * torn down when a previously offlined cpu is unconfigured. */ void cpu_visibility_unconfigure(cpu_t *cp, zone_t *zone) { zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(pool_pset_enabled()); ASSERT(cp != NULL); if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) { ASSERT(zone->zone_ncpus != 0); zone->zone_ncpus--; } if (cp->cpu_info_kstat) kstat_zone_remove(cp->cpu_info_kstat, zoneid); } /* * Make invisible to the zone that subset of the cpu information that would be * torn down when a cpu is offlined (but still configured). */ void cpu_visibility_offline(cpu_t *cp, zone_t *zone) { kstat_t *ksp; char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */ zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES; processorid_t cpun; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(pool_pset_enabled()); ASSERT(cp != NULL); ASSERT(cpu_is_active(cp)); cpun = cp->cpu_id; if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) { ASSERT(zone->zone_ncpus_online != 0); zone->zone_ncpus_online--; } if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) != NULL) { kstat_zone_remove(ksp, zoneid); kstat_rele(ksp); } if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) { kstat_zone_remove(ksp, zoneid); kstat_rele(ksp); } if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) { kstat_zone_remove(ksp, zoneid); kstat_rele(ksp); } (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun); if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES)) != NULL) { kstat_zone_remove(ksp, zoneid); kstat_rele(ksp); } } /* * Update relevant kstats such that cpu is no longer visible to processes * executing in specified zone. 
*/ void cpu_visibility_remove(cpu_t *cp, zone_t *zone) { if (cpu_is_active(cp)) cpu_visibility_offline(cp, zone); cpu_visibility_unconfigure(cp, zone); } /* * Bind a thread to a CPU as requested. */ int cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind, int *error) { processorid_t binding; cpu_t *cp; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); thread_lock(tp); /* * Record old binding, but change the obind, which was initialized * to PBIND_NONE, only if this thread has a binding. This avoids * reporting PBIND_NONE for a process when some LWPs are bound. */ binding = tp->t_bind_cpu; if (binding != PBIND_NONE) *obind = binding; /* record old binding */ if (bind == PBIND_QUERY) { thread_unlock(tp); return (0); } /* * If this thread/LWP cannot be bound because of permission * problems, just note that and return success so that the * other threads/LWPs will be bound. This is the way * processor_bind() is defined to work. * * Binding will get EPERM if the thread is of system class * or hasprocperm() fails. */ if (tp->t_cid == 0 || !hasprocperm(tp->t_cred, CRED())) { *error = EPERM; thread_unlock(tp); return (0); } binding = bind; if (binding != PBIND_NONE) { cp = cpu[binding]; /* * Make sure binding is in right partition. */ if (tp->t_cpupart != cp->cpu_part) { *error = EINVAL; thread_unlock(tp); return (0); } } tp->t_bind_cpu = binding; /* set new binding */ /* * If there is no system-set reason for affinity, set * the t_bound_cpu field to reflect the binding. */ if (tp->t_affinitycnt == 0) { if (binding == PBIND_NONE) { /* * We may need to adjust disp_max_unbound_pri * since we're becoming unbound. 
*/ disp_adjust_unbound_pri(tp); tp->t_bound_cpu = NULL; /* set new binding */ /* * Move thread to lgroup with strongest affinity * after unbinding */ if (tp->t_lgrp_affinity) lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart), 1); if (tp->t_state == TS_ONPROC && tp->t_cpu->cpu_part != tp->t_cpupart) cpu_surrender(tp); } else { lpl_t *lpl; tp->t_bound_cpu = cp; ASSERT(cp->cpu_lpl != NULL); /* * Set home to lgroup with most affinity containing CPU * that thread is being bound or minimum bounding * lgroup if no affinities set */ if (tp->t_lgrp_affinity) lpl = lgrp_affinity_best(tp, tp->t_cpupart, 0); else lpl = cp->cpu_lpl; if (tp->t_lpl != lpl) { /* can't grab cpu_lock */ lgrp_move_thread(tp, lpl, 1); } /* * Make the thread switch to the bound CPU. * If the thread is runnable, we need to * requeue it even if t_cpu is already set * to the right CPU, since it may be on a * kpreempt queue and need to move to a local * queue. We could check t_disp_queue to * avoid unnecessary overhead if it's already * on the right queue, but since this isn't * a performance-critical operation it doesn't * seem worth the extra code and complexity. * * If the thread is weakbound to the cpu then it will * resist the new binding request until the weak * binding drops. The cpu_surrender or requeueing * below could be skipped in such cases (since it * will have no effect), but that would require * thread_allowmigrate to acquire thread_lock so * we'll take the very occasional hit here instead. */ if (tp->t_state == TS_ONPROC) { cpu_surrender(tp); } else if (tp->t_state == TS_RUN) { cpu_t *ocp = tp->t_cpu; (void) dispdeq(tp); setbackdq(tp); /* * Either on the bound CPU's disp queue now, * or swapped out or on the swap queue. */ ASSERT(tp->t_disp_queue == cp->cpu_disp || tp->t_weakbound_cpu == ocp || (tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD); } } } /* * Our binding has changed; set TP_CHANGEBIND. 
*/ tp->t_proc_flag |= TP_CHANGEBIND; aston(tp); thread_unlock(tp); return (0); } #if CPUSET_WORDS > 1 /* * Functions for implementing cpuset operations when a cpuset is more * than one word. On platforms where a cpuset is a single word these * are implemented as macros in cpuvar.h. */ void cpuset_all(cpuset_t *s) { int i; for (i = 0; i < CPUSET_WORDS; i++) s->cpub[i] = ~0UL; } void cpuset_all_but(cpuset_t *s, uint_t cpu) { cpuset_all(s); CPUSET_DEL(*s, cpu); } void cpuset_only(cpuset_t *s, uint_t cpu) { CPUSET_ZERO(*s); CPUSET_ADD(*s, cpu); } int cpuset_isnull(cpuset_t *s) { int i; for (i = 0; i < CPUSET_WORDS; i++) if (s->cpub[i] != 0) return (0); return (1); } int cpuset_cmp(cpuset_t *s1, cpuset_t *s2) { int i; for (i = 0; i < CPUSET_WORDS; i++) if (s1->cpub[i] != s2->cpub[i]) return (0); return (1); } uint_t cpuset_find(cpuset_t *s) { uint_t i; uint_t cpu = (uint_t)-1; /* * Find a cpu in the cpuset */ for (i = 0; (i < CPUSET_WORDS && cpu == (uint_t)-1); i++) cpu = (uint_t)(lowbit(s->cpub[i]) - 1); return (cpu); } #endif /* CPUSET_WORDS */ /* * Unbind all user threads bound to a given CPU. */ int cpu_unbind(processorid_t cpu) { processorid_t obind; kthread_t *tp; int ret = 0; proc_t *pp; int err, berr = 0; ASSERT(MUTEX_HELD(&cpu_lock)); mutex_enter(&pidlock); for (pp = practive; pp != NULL; pp = pp->p_next) { mutex_enter(&pp->p_lock); tp = pp->p_tlist; /* * Skip zombies, kernel processes, and processes in * other zones, if called from a non-global zone. */ if (tp == NULL || (pp->p_flag & SSYS) || !HASZONEACCESS(curproc, pp->p_zone->zone_id)) { mutex_exit(&pp->p_lock); continue; } do { if (tp->t_bind_cpu != cpu) continue; err = cpu_bind_thread(tp, PBIND_NONE, &obind, &berr); if (ret == 0) ret = err; } while ((tp = tp->t_forw) != pp->p_tlist); mutex_exit(&pp->p_lock); } mutex_exit(&pidlock); if (ret == 0) ret = berr; return (ret); } /* * Destroy all remaining bound threads on a cpu. 
*/
void
cpu_destroy_bound_threads(cpu_t *cp)
{
	extern id_t syscid;
	kthread_id_t thr, next, reap;

	/*
	 * Every thread still bound to this cpu is torn down here; the
	 * original authors note that this should include both the
	 * interrupt threads and the idle thread.
	 *
	 * The thread list has to be walked with pidlock held, but
	 * thread_free also takes pidlock.  The work is therefore done in
	 * two passes: first the doomed threads are unlinked and chained
	 * onto a private "reap" list under pidlock, then pidlock is
	 * dropped and thread_free is called on each entry of that list.
	 * This depends on thread_free leaving the t_next and t_prev links
	 * of the thread alone.
	 */
	thr = curthread;
	if (thr == NULL)
		return;

	reap = NULL;
	mutex_enter(&pidlock);
	for (;;) {
		/* Capture the successor before the links are rewritten. */
		next = thr->t_next;

		if (thr->t_bound_cpu == cp) {
			/*
			 * Bound thread: splice it out of the thread list
			 * and push it onto the reap list.  curthread (the
			 * thread running this code) is "known" not to need
			 * unlinking.
			 */
			next->t_prev = thr->t_prev;
			thr->t_prev->t_next = next;
			thr->t_next = reap;
			reap = thr;
			ASSERT(thr->t_cid == syscid);

			/* Release anybody sleeping in thread_join. */
			cv_broadcast(&thr->t_joincv);

			/*
			 * Interrupt threads set t_lwp and never clear it.
			 */
			thr->t_lwp = NULL;

			/*
			 * Pause and idle threads always carry TS_ONPROC
			 * in t_state; mark the thread free instead.
			 */
			thr->t_state = TS_FREE;
			thr->t_prev = NULL;	/* Just in case */
		}

		thr = next;
		if (thr == curthread)
			break;
	}
	mutex_exit(&pidlock);

	/* Second pass: pidlock is dropped, so thread_free is safe now. */
	while (reap != NULL) {
		next = reap->t_next;
		thread_free(reap);
		reap = next;
	}
}

/*
 * processor_info(2) and p_online(2) status support functions
 *   The constants returned by cpu_get_state() and cpu_get_state_str() are
 *   for use in communicating processor state information to userland.  Kernel
 *   subsystems should only be using the cpu_flags value directly.  Subsystems
 *   modifying cpu_flags should record the state change via a call to
 *   cpu_set_state().
 */

/*
 * Update the pi_state of this CPU.
This function provides the CPU status for
 * the information returned by processor_info(2).
 *
 * The caller must hold cpu_lock.  Besides refreshing pi_state, the time
 * of the state change is recorded in cpu_state_begin and pool_cpu_mod
 * is stamped with the current high-resolution time.
 */
void
cpu_set_state(cpu_t *cpu)
{
	int state;

	ASSERT(MUTEX_HELD(&cpu_lock));

	state = cpu_get_state(cpu);
	cpu->cpu_type_info.pi_state = state;
	cpu->cpu_state_begin = gethrestime_sec();
	pool_cpu_mod = gethrtime();
}

/*
 * Translate cpu_flags into the P_* status constant reported to user
 * applications.  This is meant only for communication with userland;
 * cpu_flags itself is the in-kernel interface.
 *
 * The checks are ordered by precedence: powered off, faulted, spare,
 * offline (anything that is not "ready and not offline"), and finally
 * online with or without interrupts enabled.  The caller must hold
 * cpu_lock.
 */
int
cpu_get_state(cpu_t *cpu)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cpu->cpu_flags & CPU_POWEROFF)
		return (P_POWEROFF);

	if (cpu->cpu_flags & CPU_FAULTED)
		return (P_FAULTED);

	if (cpu->cpu_flags & CPU_SPARE)
		return (P_SPARE);

	if ((cpu->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY)
		return (P_OFFLINE);

	if (cpu->cpu_flags & CPU_ENABLE)
		return (P_ONLINE);

	return (P_NOINTR);
}

/*
 * Return the processor_info(2) state of the CPU as a string.  States
 * that have no PS_* name are reported as "unknown".
 */
const char *
cpu_get_state_str(cpu_t *cpu)
{
	switch (cpu_get_state(cpu)) {
	case P_ONLINE:
		return (PS_ONLINE);
	case P_POWEROFF:
		return (PS_POWEROFF);
	case P_NOINTR:
		return (PS_NOINTR);
	case P_SPARE:
		return (PS_SPARE);
	case P_FAULTED:
		return (PS_FAULTED);
	case P_OFFLINE:
		return (PS_OFFLINE);
	default:
		break;
	}

	return ("unknown");
}

/*
 * Export this CPU's statistics (cpu_stat_t and cpu_stats_t) as raw and named
 * kstats, respectively.  This is done when a CPU is initialized or placed
 * online via p_online(2).
*/ static void cpu_stats_kstat_create(cpu_t *cp) { int instance = cp->cpu_id; char *module = "cpu"; char *class = "misc"; kstat_t *ksp; zoneid_t zoneid; ASSERT(MUTEX_HELD(&cpu_lock)); if (pool_pset_enabled()) zoneid = GLOBAL_ZONEID; else zoneid = ALL_ZONES; /* * Create named kstats */ #define CPU_STATS_KS_CREATE(name, tsize, update_func) \ ksp = kstat_create_zone(module, instance, (name), class, \ KSTAT_TYPE_NAMED, (tsize) / sizeof (kstat_named_t), 0, \ zoneid); \ if (ksp != NULL) { \ ksp->ks_private = cp; \ ksp->ks_update = (update_func); \ kstat_install(ksp); \ } else \ cmn_err(CE_WARN, "cpu: unable to create %s:%d:%s kstat", \ module, instance, (name)); CPU_STATS_KS_CREATE("sys", sizeof (cpu_sys_stats_ks_data_template), cpu_sys_stats_ks_update); CPU_STATS_KS_CREATE("vm", sizeof (cpu_vm_stats_ks_data_template), cpu_vm_stats_ks_update); /* * Export the familiar cpu_stat_t KSTAT_TYPE_RAW kstat. */ ksp = kstat_create_zone("cpu_stat", cp->cpu_id, NULL, "misc", KSTAT_TYPE_RAW, sizeof (cpu_stat_t), 0, zoneid); if (ksp != NULL) { ksp->ks_update = cpu_stat_ks_update; ksp->ks_private = cp; kstat_install(ksp); } } static void cpu_stats_kstat_destroy(cpu_t *cp) { char ks_name[KSTAT_STRLEN]; (void) sprintf(ks_name, "cpu_stat%d", cp->cpu_id); kstat_delete_byname("cpu_stat", cp->cpu_id, ks_name); kstat_delete_byname("cpu", cp->cpu_id, "sys"); kstat_delete_byname("cpu", cp->cpu_id, "vm"); } static int cpu_sys_stats_ks_update(kstat_t *ksp, int rw) { cpu_t *cp = (cpu_t *)ksp->ks_private; struct cpu_sys_stats_ks_data *csskd; cpu_sys_stats_t *css; hrtime_t msnsecs[NCMSTATES]; int i; if (rw == KSTAT_WRITE) return (EACCES); csskd = ksp->ks_data; css = &cp->cpu_stats.sys; /* * Read CPU mstate, but compare with the last values we * received to make sure that the returned kstats never * decrease. 
*/ get_cpu_mstate(cp, msnsecs); if (csskd->cpu_nsec_idle.value.ui64 > msnsecs[CMS_IDLE]) msnsecs[CMS_IDLE] = csskd->cpu_nsec_idle.value.ui64; if (csskd->cpu_nsec_user.value.ui64 > msnsecs[CMS_USER]) msnsecs[CMS_USER] = csskd->cpu_nsec_user.value.ui64; if (csskd->cpu_nsec_kernel.value.ui64 > msnsecs[CMS_SYSTEM]) msnsecs[CMS_SYSTEM] = csskd->cpu_nsec_kernel.value.ui64; bcopy(&cpu_sys_stats_ks_data_template, ksp->ks_data, sizeof (cpu_sys_stats_ks_data_template)); csskd->cpu_ticks_wait.value.ui64 = 0; csskd->wait_ticks_io.value.ui64 = 0; csskd->cpu_nsec_idle.value.ui64 = msnsecs[CMS_IDLE]; csskd->cpu_nsec_user.value.ui64 = msnsecs[CMS_USER]; csskd->cpu_nsec_kernel.value.ui64 = msnsecs[CMS_SYSTEM]; csskd->cpu_ticks_idle.value.ui64 = NSEC_TO_TICK(csskd->cpu_nsec_idle.value.ui64); csskd->cpu_ticks_user.value.ui64 = NSEC_TO_TICK(csskd->cpu_nsec_user.value.ui64); csskd->cpu_ticks_kernel.value.ui64 = NSEC_TO_TICK(csskd->cpu_nsec_kernel.value.ui64); csskd->bread.value.ui64 = css->bread; csskd->bwrite.value.ui64 = css->bwrite; csskd->lread.value.ui64 = css->lread; csskd->lwrite.value.ui64 = css->lwrite; csskd->phread.value.ui64 = css->phread; csskd->phwrite.value.ui64 = css->phwrite; csskd->pswitch.value.ui64 = css->pswitch; csskd->trap.value.ui64 = css->trap; csskd->intr.value.ui64 = 0; for (i = 0; i < PIL_MAX; i++) csskd->intr.value.ui64 += css->intr[i]; csskd->syscall.value.ui64 = css->syscall; csskd->sysread.value.ui64 = css->sysread; csskd->syswrite.value.ui64 = css->syswrite; csskd->sysfork.value.ui64 = css->sysfork; csskd->sysvfork.value.ui64 = css->sysvfork; csskd->sysexec.value.ui64 = css->sysexec; csskd->readch.value.ui64 = css->readch; csskd->writech.value.ui64 = css->writech; csskd->rcvint.value.ui64 = css->rcvint; csskd->xmtint.value.ui64 = css->xmtint; csskd->mdmint.value.ui64 = css->mdmint; csskd->rawch.value.ui64 = css->rawch; csskd->canch.value.ui64 = css->canch; csskd->outch.value.ui64 = css->outch; csskd->msg.value.ui64 = css->msg; csskd->sema.value.ui64 = 
css->sema; csskd->namei.value.ui64 = css->namei; csskd->ufsiget.value.ui64 = css->ufsiget; csskd->ufsdirblk.value.ui64 = css->ufsdirblk; csskd->ufsipage.value.ui64 = css->ufsipage; csskd->ufsinopage.value.ui64 = css->ufsinopage; csskd->procovf.value.ui64 = css->procovf; csskd->intrthread.value.ui64 = 0; for (i = 0; i < LOCK_LEVEL; i++) csskd->intrthread.value.ui64 += css->intr[i]; csskd->intrblk.value.ui64 = css->intrblk; csskd->intrunpin.value.ui64 = css->intrunpin; csskd->idlethread.value.ui64 = css->idlethread; csskd->inv_swtch.value.ui64 = css->inv_swtch; csskd->nthreads.value.ui64 = css->nthreads; csskd->cpumigrate.value.ui64 = css->cpumigrate; csskd->xcalls.value.ui64 = css->xcalls; csskd->mutex_adenters.value.ui64 = css->mutex_adenters; csskd->rw_rdfails.value.ui64 = css->rw_rdfails; csskd->rw_wrfails.value.ui64 = css->rw_wrfails; csskd->modload.value.ui64 = css->modload; csskd->modunload.value.ui64 = css->modunload; csskd->bawrite.value.ui64 = css->bawrite; csskd->iowait.value.ui64 = 0; return (0); } static int cpu_vm_stats_ks_update(kstat_t *ksp, int rw) { cpu_t *cp = (cpu_t *)ksp->ks_private; struct cpu_vm_stats_ks_data *cvskd; cpu_vm_stats_t *cvs; if (rw == KSTAT_WRITE) return (EACCES); cvs = &cp->cpu_stats.vm; cvskd = ksp->ks_data; bcopy(&cpu_vm_stats_ks_data_template, ksp->ks_data, sizeof (cpu_vm_stats_ks_data_template)); cvskd->pgrec.value.ui64 = cvs->pgrec; cvskd->pgfrec.value.ui64 = cvs->pgfrec; cvskd->pgin.value.ui64 = cvs->pgin; cvskd->pgpgin.value.ui64 = cvs->pgpgin; cvskd->pgout.value.ui64 = cvs->pgout; cvskd->pgpgout.value.ui64 = cvs->pgpgout; cvskd->swapin.value.ui64 = cvs->swapin; cvskd->pgswapin.value.ui64 = cvs->pgswapin; cvskd->swapout.value.ui64 = cvs->swapout; cvskd->pgswapout.value.ui64 = cvs->pgswapout; cvskd->zfod.value.ui64 = cvs->zfod; cvskd->dfree.value.ui64 = cvs->dfree; cvskd->scan.value.ui64 = cvs->scan; cvskd->rev.value.ui64 = cvs->rev; cvskd->hat_fault.value.ui64 = cvs->hat_fault; cvskd->as_fault.value.ui64 = cvs->as_fault; 
cvskd->maj_fault.value.ui64 = cvs->maj_fault; cvskd->cow_fault.value.ui64 = cvs->cow_fault; cvskd->prot_fault.value.ui64 = cvs->prot_fault; cvskd->softlock.value.ui64 = cvs->softlock; cvskd->kernel_asflt.value.ui64 = cvs->kernel_asflt; cvskd->pgrrun.value.ui64 = cvs->pgrrun; cvskd->execpgin.value.ui64 = cvs->execpgin; cvskd->execpgout.value.ui64 = cvs->execpgout; cvskd->execfree.value.ui64 = cvs->execfree; cvskd->anonpgin.value.ui64 = cvs->anonpgin; cvskd->anonpgout.value.ui64 = cvs->anonpgout; cvskd->anonfree.value.ui64 = cvs->anonfree; cvskd->fspgin.value.ui64 = cvs->fspgin; cvskd->fspgout.value.ui64 = cvs->fspgout; cvskd->fsfree.value.ui64 = cvs->fsfree; return (0); } static int cpu_stat_ks_update(kstat_t *ksp, int rw) { cpu_stat_t *cso; cpu_t *cp; int i; hrtime_t msnsecs[NCMSTATES]; cso = (cpu_stat_t *)ksp->ks_data; cp = (cpu_t *)ksp->ks_private; if (rw == KSTAT_WRITE) return (EACCES); /* * Read CPU mstate, but compare with the last values we * received to make sure that the returned kstats never * decrease. 
*/ get_cpu_mstate(cp, msnsecs); msnsecs[CMS_IDLE] = NSEC_TO_TICK(msnsecs[CMS_IDLE]); msnsecs[CMS_USER] = NSEC_TO_TICK(msnsecs[CMS_USER]); msnsecs[CMS_SYSTEM] = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); if (cso->cpu_sysinfo.cpu[CPU_IDLE] < msnsecs[CMS_IDLE]) cso->cpu_sysinfo.cpu[CPU_IDLE] = msnsecs[CMS_IDLE]; if (cso->cpu_sysinfo.cpu[CPU_USER] < msnsecs[CMS_USER]) cso->cpu_sysinfo.cpu[CPU_USER] = msnsecs[CMS_USER]; if (cso->cpu_sysinfo.cpu[CPU_KERNEL] < msnsecs[CMS_SYSTEM]) cso->cpu_sysinfo.cpu[CPU_KERNEL] = msnsecs[CMS_SYSTEM]; cso->cpu_sysinfo.cpu[CPU_WAIT] = 0; cso->cpu_sysinfo.wait[W_IO] = 0; cso->cpu_sysinfo.wait[W_SWAP] = 0; cso->cpu_sysinfo.wait[W_PIO] = 0; cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread); cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite); cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread); cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite); cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread); cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite); cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch); cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap); cso->cpu_sysinfo.intr = 0; for (i = 0; i < PIL_MAX; i++) cso->cpu_sysinfo.intr += CPU_STATS(cp, sys.intr[i]); cso->cpu_sysinfo.syscall = CPU_STATS(cp, sys.syscall); cso->cpu_sysinfo.sysread = CPU_STATS(cp, sys.sysread); cso->cpu_sysinfo.syswrite = CPU_STATS(cp, sys.syswrite); cso->cpu_sysinfo.sysfork = CPU_STATS(cp, sys.sysfork); cso->cpu_sysinfo.sysvfork = CPU_STATS(cp, sys.sysvfork); cso->cpu_sysinfo.sysexec = CPU_STATS(cp, sys.sysexec); cso->cpu_sysinfo.readch = CPU_STATS(cp, sys.readch); cso->cpu_sysinfo.writech = CPU_STATS(cp, sys.writech); cso->cpu_sysinfo.rcvint = CPU_STATS(cp, sys.rcvint); cso->cpu_sysinfo.xmtint = CPU_STATS(cp, sys.xmtint); cso->cpu_sysinfo.mdmint = CPU_STATS(cp, sys.mdmint); cso->cpu_sysinfo.rawch = CPU_STATS(cp, sys.rawch); cso->cpu_sysinfo.canch = CPU_STATS(cp, sys.canch); cso->cpu_sysinfo.outch = CPU_STATS(cp, sys.outch); cso->cpu_sysinfo.msg = CPU_STATS(cp, sys.msg); 
cso->cpu_sysinfo.sema = CPU_STATS(cp, sys.sema); cso->cpu_sysinfo.namei = CPU_STATS(cp, sys.namei); cso->cpu_sysinfo.ufsiget = CPU_STATS(cp, sys.ufsiget); cso->cpu_sysinfo.ufsdirblk = CPU_STATS(cp, sys.ufsdirblk); cso->cpu_sysinfo.ufsipage = CPU_STATS(cp, sys.ufsipage); cso->cpu_sysinfo.ufsinopage = CPU_STATS(cp, sys.ufsinopage); cso->cpu_sysinfo.inodeovf = 0; cso->cpu_sysinfo.fileovf = 0; cso->cpu_sysinfo.procovf = CPU_STATS(cp, sys.procovf); cso->cpu_sysinfo.intrthread = 0; for (i = 0; i < LOCK_LEVEL; i++) cso->cpu_sysinfo.intrthread += CPU_STATS(cp, sys.intr[i]); cso->cpu_sysinfo.intrblk = CPU_STATS(cp, sys.intrblk); cso->cpu_sysinfo.idlethread = CPU_STATS(cp, sys.idlethread); cso->cpu_sysinfo.inv_swtch = CPU_STATS(cp, sys.inv_swtch); cso->cpu_sysinfo.nthreads = CPU_STATS(cp, sys.nthreads); cso->cpu_sysinfo.cpumigrate = CPU_STATS(cp, sys.cpumigrate); cso->cpu_sysinfo.xcalls = CPU_STATS(cp, sys.xcalls); cso->cpu_sysinfo.mutex_adenters = CPU_STATS(cp, sys.mutex_adenters); cso->cpu_sysinfo.rw_rdfails = CPU_STATS(cp, sys.rw_rdfails); cso->cpu_sysinfo.rw_wrfails = CPU_STATS(cp, sys.rw_wrfails); cso->cpu_sysinfo.modload = CPU_STATS(cp, sys.modload); cso->cpu_sysinfo.modunload = CPU_STATS(cp, sys.modunload); cso->cpu_sysinfo.bawrite = CPU_STATS(cp, sys.bawrite); cso->cpu_sysinfo.rw_enters = 0; cso->cpu_sysinfo.win_uo_cnt = 0; cso->cpu_sysinfo.win_uu_cnt = 0; cso->cpu_sysinfo.win_so_cnt = 0; cso->cpu_sysinfo.win_su_cnt = 0; cso->cpu_sysinfo.win_suo_cnt = 0; cso->cpu_syswait.iowait = 0; cso->cpu_syswait.swap = 0; cso->cpu_syswait.physio = 0; cso->cpu_vminfo.pgrec = CPU_STATS(cp, vm.pgrec); cso->cpu_vminfo.pgfrec = CPU_STATS(cp, vm.pgfrec); cso->cpu_vminfo.pgin = CPU_STATS(cp, vm.pgin); cso->cpu_vminfo.pgpgin = CPU_STATS(cp, vm.pgpgin); cso->cpu_vminfo.pgout = CPU_STATS(cp, vm.pgout); cso->cpu_vminfo.pgpgout = CPU_STATS(cp, vm.pgpgout); cso->cpu_vminfo.swapin = CPU_STATS(cp, vm.swapin); cso->cpu_vminfo.pgswapin = CPU_STATS(cp, vm.pgswapin); cso->cpu_vminfo.swapout = 
CPU_STATS(cp, vm.swapout); cso->cpu_vminfo.pgswapout = CPU_STATS(cp, vm.pgswapout); cso->cpu_vminfo.zfod = CPU_STATS(cp, vm.zfod); cso->cpu_vminfo.dfree = CPU_STATS(cp, vm.dfree); cso->cpu_vminfo.scan = CPU_STATS(cp, vm.scan); cso->cpu_vminfo.rev = CPU_STATS(cp, vm.rev); cso->cpu_vminfo.hat_fault = CPU_STATS(cp, vm.hat_fault); cso->cpu_vminfo.as_fault = CPU_STATS(cp, vm.as_fault); cso->cpu_vminfo.maj_fault = CPU_STATS(cp, vm.maj_fault); cso->cpu_vminfo.cow_fault = CPU_STATS(cp, vm.cow_fault); cso->cpu_vminfo.prot_fault = CPU_STATS(cp, vm.prot_fault); cso->cpu_vminfo.softlock = CPU_STATS(cp, vm.softlock); cso->cpu_vminfo.kernel_asflt = CPU_STATS(cp, vm.kernel_asflt); cso->cpu_vminfo.pgrrun = CPU_STATS(cp, vm.pgrrun); cso->cpu_vminfo.execpgin = CPU_STATS(cp, vm.execpgin); cso->cpu_vminfo.execpgout = CPU_STATS(cp, vm.execpgout); cso->cpu_vminfo.execfree = CPU_STATS(cp, vm.execfree); cso->cpu_vminfo.anonpgin = CPU_STATS(cp, vm.anonpgin); cso->cpu_vminfo.anonpgout = CPU_STATS(cp, vm.anonpgout); cso->cpu_vminfo.anonfree = CPU_STATS(cp, vm.anonfree); cso->cpu_vminfo.fspgin = CPU_STATS(cp, vm.fspgin); cso->cpu_vminfo.fspgout = CPU_STATS(cp, vm.fspgout); cso->cpu_vminfo.fsfree = CPU_STATS(cp, vm.fsfree); return (0); }