/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem has
 * been initialized.  This saves some memory, since the space allocated
 * reflects the actual number of lgroups supported by the platform.  The lgrp
 * facility provides temporary space to hold lpl information during system
 * bootstrap.
 */

/* Head of the circular, doubly-linked (cp_next/cp_prev) partition list. */
cpupart_t		*cp_list_head;
/* The statically allocated default partition (ID CP_DEFAULT). */
cpupart_t		cp_default;
/*
 * NOTE(review): presumably the machine-dependent state backing
 * cp_default.cp_mach -- confirm where it is wired up.
 */
struct mach_cpupart	cp_default_mach;
/* Next partition ID to try when cpupart_create() assigns an ID. */
static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including the default partition. */
uint_t			cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs it is set
 * to max_ncpus * 2 + 1; a non-zero value (e.g. from /etc/system) is left
 * alone.  cpupart_create() fails with ENOMEM once cp_numparts reaches it.
 */
static uint_t		cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
/* Processor set ID -> partition ID: PS_NONE maps to CP_DEFAULT. */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID: CP_DEFAULT maps to PS_NONE. */
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/*
 * Look up the CPU partition that backs processor set `psid', without
 * applying any zone visibility rules.
 *
 * Returns NULL if psid equals CP_DEFAULT (the default partition is never
 * exposed as a processor set) or if no partition on the list carries the
 * matching ID.  PS_MYID resolves to the calling thread's partition.
 * The caller must hold cpu_lock.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupartid_t	wanted = PSTOCP(psid);
	cpupart_t	*pp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (psid == CP_DEFAULT) {
		/* default partition not visible as a processor set */
		return (NULL);
	}

	if (psid == PS_MYID) {
		return (curthread->t_cpupart);
	}

	/* The list is circular: stop once we are back at the head. */
	pp = cp_list_head;
	for (;;) {
		if (pp->cp_id == wanted) {
			return (pp);
		}
		pp = pp->cp_next;
		if (pp == cp_list_head) {
			break;
		}
	}

	return (NULL);
}

/*
 * Zone-aware lookup of the partition behind processor set `psid'.
 *
 * Behaves like cpupart_find_all(), except that a caller outside the global
 * zone, with pools-managed psets enabled, only sees the partition matching
 * its own zone's pset; for any other partition NULL is returned.
 * The caller must hold cpu_lock.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *pp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find_all(psid);
	if (pp == NULL)
		return (NULL);

	/* No filtering for the global zone or when pools psets are off. */
	if (INGLOBALZONE(curproc) || !pool_pset_enabled())
		return (pp);

	if (zone_pset_get(curproc->p_zone) != CPTOPS(pp->cp_id))
		return (NULL);

	return (pp);
}

/*
 * kstat update callback for a partition's "pset" kstat.  Writes are
 * refused with EACCES; on read, the named values are refreshed from the
 * partition stored in ks_private.
 */
static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	/* cp_hp_avenrun[] is scaled down by this many bits for export */
	const int	shift = 16 - FSHIFT;
	cpupart_kstat_t	*stats;
	cpupart_t	*pp;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	pp = (cpupart_t *)ksp->ks_private;
	stats = ksp->ks_data;

	stats->cpk_updates.value.ui64 = pp->cp_updates;
	stats->cpk_runnable.value.ui64 = pp->cp_nrunnable_cum;
	stats->cpk_waiting.value.ui64 = pp->cp_nwaiting_cum;
	stats->cpk_ncpus.value.ui32 = pp->cp_ncpus;

	stats->cpk_avenrun_1min.value.ui32 = pp->cp_hp_avenrun[0] >> shift;
	stats->cpk_avenrun_5min.value.ui32 = pp->cp_hp_avenrun[1] >> shift;
	stats->cpk_avenrun_15min.value.ui32 = pp->cp_hp_avenrun[2] >> shift;

	return (0);
}

/*
 * Create and install the named "pset" kstat for partition `cp', recording
 * the result (NULL on failure) in cp->cp_kstat.  The caller must hold
 * cpu_lock.
 */
static void
cpupart_kstat_create(cpupart_t *cp)
{
	cpupart_kstat_t	*stats;
	kstat_t		*ksp;
	zoneid_t	zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * This runs for CP_DEFAULT before the pools framework has been
	 * initialized, so cp_default is special-cased: the pools state is
	 * only consulted for other partitions.
	 */
	zoneid = (cp != &cp_default && pool_pset_enabled()) ?
	    GLOBAL_ZONEID : ALL_ZONES;

	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp == NULL) {
		cp->cp_kstat = NULL;
		return;
	}

	stats = ksp->ks_data;

	/* 64-bit counters */
	kstat_named_init(&stats->cpk_updates, "updates", KSTAT_DATA_UINT64);
	kstat_named_init(&stats->cpk_runnable, "runnable", KSTAT_DATA_UINT64);
	kstat_named_init(&stats->cpk_waiting, "waiting", KSTAT_DATA_UINT64);

	/* 32-bit values */
	kstat_named_init(&stats->cpk_ncpus, "ncpus", KSTAT_DATA_UINT32);
	kstat_named_init(&stats->cpk_avenrun_1min, "avenrun_1min",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stats->cpk_avenrun_5min, "avenrun_5min",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stats->cpk_avenrun_15min, "avenrun_15min",
	    KSTAT_DATA_UINT32);

	ksp->ks_private = cp;
	ksp->ks_update = cpupart_kstat_update;

	kstat_install(ksp);
	cp->cp_kstat = ksp;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
	    cp_default.cp_nlgrploads, KM_SLEEP);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap. It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
	}
	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
}


/*
 * Move CPU `cp' out of its current partition (cp->cpu_part) and into
 * `newpp'.  The caller must hold cpu_lock.
 *
 * If `forced' is non-zero, cpu_unbind() is called on the CPU first and any
 * error it reports is returned.
 *
 * Returns 0 on success (including when the CPU is already in newpp), or
 * EBUSY when:
 *   - disp_bound_partition() reports bound threads and this is the last
 *     CPU in the old partition,
 *   - disp_bound_threads() still reports threads bound to the CPU after
 *     the retry loop, or
 *   - cyclic_move_out() refuses to let the CPU go.
 * On every failure path after CPU_CPUPART_OUT has been announced, a
 * matching CPU_CPUPART_IN notification is sent, and the PG notifications
 * are reversed if they had already been made.
 */
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t	*ncp, *newlist;
	kthread_t *t;
	int	move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t 	*p;
	int lgrp_diff_lpl;
	lpl_t	*cpu_lpl;
	int	ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/* Threads would be left with no CPU in their partition. */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (EBUSY);
	}

	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions. This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				/* Give up and undo the notifications above. */
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	/* remember the CPU's lpl in the old partition for the walk below */
	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		/* unlink cp; ncp is its former successor in the old list */
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		/* cp was the last CPU: the old partition is now empty */
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		/* first CPU in the new partition: one-element circular list */
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		/* insert cp just before the current list head */
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
					t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 *
			 * NOTE(review): the test below fires when the count
			 * is non-zero, which reads as the opposite of the
			 * sentence above -- confirm which one is intended.
			 */

			if (lgrp_diff_lpl)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	/* allow weak binding to this CPU again, then resume the other CPUs */
	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Decide whether thread `tp' may be moved into partition `cp'.  Called by
 * cpupart_move_thread() and pset_bind_start().
 *
 * Returns EBUSY when the thread is bound (strongly, or failing that
 * weakly) to a CPU that belongs to a different partition, and 0 otherwise.
 * A non-zero `ignore' skips the binding check.  The caller must hold
 * cpu_lock, the owning process's p_lock and the thread lock.
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	cpu_t *bound;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	if (ignore)
		return (0);

	/* A strong binding takes precedence over a weak one. */
	bound = tp->t_bound_cpu;
	if (bound == NULL)
		bound = tp->t_weakbound_cpu;

	/* CPU-bound threads can't be moved to another partition. */
	if (bound == NULL || bound->cpu_part == cp)
		return (0);

	return (EBUSY);
}

/*
 * Switch thread `tp' over to partition `newpp'.
 *
 * Fails with EINVAL if newpp has no CPUs, or with whatever
 * cpupart_movable_thread() reports.  A non-zero `ignore' tells that check
 * to disregard CPU bindings (used when a partition is being destroyed).
 * On success the thread is rehomed to an lgroup load (lpl) belonging to
 * newpp, flagged with TP_CHANGEBIND, and FSS is told about the change via
 * fss_changepset() using the caller-supplied projbuf/zonebuf.
 * The caller must hold cpu_lock, pidlock and the process's p_lock.
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t	*frompp = tp->t_cpupart;
	int		error;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Validate the move before touching any state.
	 */
	thread_lock(tp);
	error = cpupart_movable_thread(tp, newpp, ignore);
	if (error != 0) {
		thread_unlock(tp);
		return (error);
	}

	if (frompp != newpp) {
		lpl_t *home;

		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);

		if (!LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) ||
		    tp->t_lgrp_affinity != NULL) {
			/*
			 * Either the thread's lgroup has no CPUs in the new
			 * partition, or the thread has lgroup affinities:
			 * pick the best lgroup for it.
			 */
			home = lgrp_choose(tp, tp->t_cpupart);
		} else {
			/*
			 * The thread can keep its lgroup; use that lgroup's
			 * lpl_t within the new partition.
			 */
			home = &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid];
		}
		lgrp_move_thread(tp, home, 1);

		/*
		 * The lpl must now lie inside the new partition's array.
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
			tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		switch (tp->t_state) {
		case TS_ONPROC:
			/* running: have it give up its CPU */
			cpu_surrender(tp);
			break;
		case TS_RUN:
			/* runnable: requeue it */
			(void) dispdeq(tp);
			setbackdq(tp);
			break;
		default:
			break;
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);
}


/*
 * Bind thread `tp' to the partition behind processor set `psid'; PS_NONE
 * selects the default partition.  Returns EINVAL if the set cannot be
 * found (or is not visible to the caller), otherwise the result of
 * cpupart_move_thread().
 *
 * The pool lock, cpu_lock, pidlock and the p_lock of the containing
 * process must all be held.  A non-zero `ignore' means CPU bindings are
 * disregarded (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t	*target;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	target = (psid == PS_NONE) ? &cp_default : cpupart_find(psid);
	if (target == NULL)
		return (EINVAL);

	return (cpupart_move_thread(tp, target, ignore, projbuf, zonebuf));
}


/*
 * Create a new, empty partition and hand its processor set ID back through
 * *psid.  A kpreempt dispatch queue and a kstat are set up for it as well.
 *
 * Returns 0 on success, or ENOMEM when the number of partitions has
 * already reached cp_max_numparts.  The pool lock must be held; cpu_lock
 * is taken and dropped internally.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*newpp;
	size_t		lplsize;
	lgrp_id_t	lgrp;

	ASSERT(pool_lock_held());

	/* Do all of the sleeping allocations before taking cpu_lock. */
	newpp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	newpp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
	newpp->cp_nlgrploads = lgrp_plat_max_lgrps();
	lplsize = sizeof (lpl_t) * newpp->cp_nlgrploads;
	newpp->cp_lgrploads = kmem_zalloc(lplsize, KM_SLEEP);

	mutex_enter(&cpu_lock);

	if (cp_numparts == cp_max_numparts) {
		/* At the limit: back out everything allocated above. */
		mutex_exit(&cpu_lock);
		kmem_free(newpp->cp_lgrploads, lplsize);
		newpp->cp_lgrploads = NULL;
		kmem_free(newpp->cp_mach, sizeof (struct mach_cpupart));
		kmem_free(newpp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;

	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	newpp->cp_id = cp_id_next++;
	*psid = CPTOPS(newpp->cp_id);

	/* The partition starts out with no CPUs and no attributes. */
	newpp->cp_ncpus = 0;
	newpp->cp_cpulist = NULL;
	newpp->cp_attr = 0;
	newpp->cp_gen = 0;
	klgrpset_clear(newpp->cp_lgrpset);
	CPUSET_ZERO(newpp->cp_mach->mc_haltset);

	/* Empty kpreempt queue. */
	newpp->cp_kp_queue.disp_maxrunpri = -1;
	newpp->cp_kp_queue.disp_max_unbound_pri = -1;
	newpp->cp_kp_queue.disp_cpu = NULL;
	DISP_LOCK_INIT(&newpp->cp_kp_queue.disp_lock);
	disp_kp_alloc(&newpp->cp_kp_queue, v.v_nglobpris);

	cpupart_kstat_create(newpp);

	/* Each lpl slot records the lgroup ID it is indexed by. */
	for (lgrp = 0; lgrp < newpp->cp_nlgrploads; lgrp++)
		newpp->cp_lgrploads[lgrp].lpl_lgrpid = lgrp;

	bitset_init(&newpp->cp_cmt_pgs);

	/*
	 * The clock thread walks the partition list without cpu_lock, so
	 * all CPUs are paused while the new partition is linked in at the
	 * tail of the list.
	 */
	pause_cpus(NULL);
	newpp->cp_next = cp_list_head;
	newpp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = newpp;
	cp_list_head->cp_prev = newpp;
	start_cpus();

	mutex_exit(&cpu_lock);

	return (0);
}


/*
 * Destroy a partition.
 *
 * Every thread whose t_bind_pset is `psid' is rebound to the default
 * partition, every CPU in the partition is moved to the default partition,
 * and the partition is then unlinked from the list and freed.
 *
 * Returns EINVAL if psid does not resolve to a partition (or resolves to
 * cp_default), or the error from cpupart_bind_thread() or
 * cpupart_move_cpu() if either fails; in those failure cases the partition
 * is left in place, although threads and CPUs processed before the failure
 * have already been moved.  Returns 0 on success.
 *
 * The pool lock must be held; cpu_lock is taken and dropped internally.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;
	void 	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	/*
	 * First need to unbind all the threads currently bound to the
	 * partition.  Then do the actual destroy (which moves the CPUs).
	 */
	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}
			/* ignore == 1: CPU bindings do not block the move */
			err = cpupart_bind_thread(t, PS_NONE, 1,
			    projbuf, zonebuf);
			if (err) {
				/* drop locks in reverse order of acquisition */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				mutex_exit(&cpu_lock);
				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
				return (err);
			}
			t->t_bind_pset = PS_NONE;
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);

	/* Drain the partition: each pass moves the current head CPU out. */
	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs now that
	 * all of the CPUs have left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	/* Let the freed ID be the first one tried by the next create. */
	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	/* Release everything cpupart_create() allocated. */
	disp_kp_free(&pp->cp_kp_queue);
	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
	pp->cp_lgrploads = NULL;
	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Report which processor set CPU `cp' belongs to, i.e. the processor set
 * ID corresponding to its partition (PS_NONE for the default partition).
 * The caller must hold cpu_lock.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	cpupart_t *pp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cp->cpu_part;
	return (CPTOPS(pp->cp_id));
}


/*
 * Move processor `cp' into the existing partition behind `psid'.
 *
 * Returns EINVAL if the partition cannot be found or the CPU is offline;
 * otherwise returns whatever cpupart_move_cpu() reports.  `forced' is
 * passed through to cpupart_move_cpu().  The pool lock and cpu_lock must
 * be held.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t	*target;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	target = cpupart_find(psid);
	if (target == NULL || (cp->cpu_flags & CPU_OFFLINE))
		return (EINVAL);

	return (cpupart_move_cpu(cp, target, forced));
}

/*
 * Report the CPUs that make up a partition.
 *
 *   numcpus == NULL                  only validates the partition
 *   numcpus != NULL, cpulist == NULL stores the CPU count in *numcpus
 *   both non-NULL                    additionally fills cpulist[] with CPU
 *                                    IDs, limited to the capacity that was
 *                                    passed in via *numcpus
 *
 * In every successful case *psid is rewritten with the partition's real
 * processor set ID, which matters when the caller passed PS_MYID.
 * Returns EINVAL if the partition cannot be found, 0 otherwise.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t	*pp;

	mutex_enter(&cpu_lock);

	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);

	if (numcpus != NULL) {
		uint_t	actual = pp->cp_ncpus;
		uint_t	capacity = *numcpus;
		uint_t	ncopy;

		/*
		 * Copy no more than the caller has room for, but always
		 * pass back the real number of CPUs.
		 */
		ncopy = (actual > capacity) ? capacity : actual;
		*numcpus = actual;

		if (cpulist != NULL) {
			cpu_t	*c = pp->cp_cpulist;
			uint_t	n = 0;

			while (n < ncopy) {
				ASSERT(c != NULL);
				cpulist[n] = c->cpu_id;
				c = c->cpu_next_part;
				n++;
			}
		}
	}

	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Call disp_kp_alloc() with `npri' priorities on the kpreempt queue of
 * every CPU partition.  Called from disp_setup when a newly loaded
 * scheduling class increases the number of priorities in the system.
 * The caller must hold cpu_lock.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *pp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* Visit every partition on the circular list exactly once. */
	pp = cp_list_head;
	for (;;) {
		disp_kp_alloc(&pp->cp_kp_queue, npri);
		pp = pp->cp_next;
		if (pp == cp_list_head)
			break;
	}
}

/*
 * Copy the first `nelem' load averages of the partition behind `psid'
 * into buf[], each scaled down by (16 - FSHIFT) bits.  Returns EINVAL if
 * the partition cannot be found, 0 otherwise.  nelem must lie in
 * [0, LOADAVG_NSTATS] and cpu_lock must be held.
 */
int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t	*pp;
	int		idx;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((pp = cpupart_find(psid)) == NULL)
		return (EINVAL);

	idx = 0;
	while (idx < nelem) {
		buf[idx] = pp->cp_hp_avenrun[idx] >> (16 - FSHIFT);
		idx++;
	}

	return (0);
}


/*
 * Enumerate processor sets.
 *
 * When `list' is non-NULL, up to `nelem' processor set IDs are stored in
 * it: with CP_ALL every partition except the default one, with CP_NONEMPTY
 * every partition that has at least one CPU.  The return value does not
 * depend on how many IDs were stored: it is cp_numparts - 1 for CP_ALL and
 * cp_numparts_nonempty for CP_NONEMPTY.  The caller must hold cpu_lock.
 */
uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t		stored = 0;
	cpupart_t	*pp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		pp = cp_list_head;
		do {
			int wanted;

			if (flag == CP_ALL)
				wanted = (pp != &cp_default);
			else if (flag == CP_NONEMPTY)
				wanted = (pp->cp_ncpus != 0);
			else
				wanted = 0;

			if (wanted) {
				/* stop as soon as the caller's array is full */
				if (stored == nelem)
					break;
				list[stored] = CPTOPS(pp->cp_id);
				stored++;
			}
			pp = pp->cp_next;
		} while (pp != cp_list_head);
	}

	ASSERT(stored < cp_numparts);

	if (flag == CP_ALL) {
		/* leave out default partition */
		return (cp_numparts - 1);
	}
	if (flag == CP_NONEMPTY)
		return (cp_numparts_nonempty);

	return (stored);
}

/*
 * Replace the attribute word of the partition behind `psid' with `attr'.
 *
 * Returns EINVAL if the partition cannot be found, or if the request would
 * clear PSET_NOESCAPE on the default partition; 0 otherwise.  The pool
 * lock must be held; cpu_lock is taken and dropped internally.
 */
int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t	*pp;
	int		error = 0;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	pp = cpupart_find(psid);
	if (pp == NULL) {
		error = EINVAL;
	} else if (pp == &cp_default && (attr & PSET_NOESCAPE) == 0) {
		/*
		 * PSET_NOESCAPE attribute for default cpu partition is
		 * always set
		 */
		error = EINVAL;
	} else {
		pp->cp_attr = attr;
	}
	mutex_exit(&cpu_lock);

	return (error);
}

/*
 * Fetch the attribute word of the partition behind `psid' into *attrp.
 * Returns EINVAL (leaving *attrp untouched) if the partition cannot be
 * found, 0 otherwise.  cpu_lock is taken and dropped internally.
 */
int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t	*pp;
	int		error = 0;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(psid);
	if (pp != NULL)
		*attrp = pp->cp_attr;
	else
		error = EINVAL;
	mutex_exit(&cpu_lock);

	return (error);
}