/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * CMT scheduler / dispatcher support * * This file implements CMT scheduler support using Processor Groups. * The CMT processor group class creates and maintains the CMT class * specific processor group pg_cmt_t. * * ---------------------------- <-- pg_cmt_t * * | pghw_t | * ---------------------------- * | CMT class specific data | * | - hierarchy linkage | * | - CMT load balancing data| * | - active CPU group/bitset| * ---------------------------- * * The scheduler/dispatcher leverages knowledge of the performance * relevant CMT sharing relationships existing between cpus to implement * optimized affinity, load balancing, and coalescence policies. * * Load balancing policy seeks to improve performance by minimizing * contention over shared processor resources / facilities, Affinity * policies seek to improve cache and TLB utilization. Coalescence * policies improve resource utilization and ultimately power efficiency. 
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children(s) resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems,
 * multiple top level groups are instantiated, where the top level balancing
 * begins by balancing across the CMT PGs within their respective
 * (per lgroup) top level groups.
 */

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,		/* lineage passed validation */
	CMT_LINEAGE_NON_CONCENTRIC,	/* subset invariant violated */
	CMT_LINEAGE_PG_SPANS_LGRPS,	/* PG crosses lgroup boundaries */
	CMT_LINEAGE_NON_PROMOTABLE,	/* hierarchy cannot be repaired */
	CMT_LINEAGE_REPAIRED,		/* problem found, fixed by pruning */
	CMT_LINEAGE_UNRECOVERABLE	/* cannot proceed with this lineage */
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

/*
 * Forward declarations for the CMT class callbacks and internal helpers.
 */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *);

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
*/ void pg_cmt_cpu_startup(cpu_t *cp) { pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, cp->cpu_thread); } /* * Return non-zero if thread can migrate between "from" and "to" * without a performance penalty */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { if (from->cpu_physid->cpu_cacheid == to->cpu_physid->cpu_cacheid) return (1); return (0); } /* * CMT class specific PG allocation */ static pg_t * pg_cmt_alloc(void) { return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); } /* * Class specific PG de-allocation */ static void pg_cmt_free(pg_t *pg) { ASSERT(pg != NULL); ASSERT(IS_CMT_PG(pg)); kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); } /* * Given a hardware sharing relationship, return which dispatcher * policies should be implemented to optimize performance and efficiency */ static pg_cmt_policy_t pg_cmt_policy(pghw_type_t hw) { pg_cmt_policy_t p; /* * Give the platform a chance to override the default */ if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) return (p); switch (hw) { case PGHW_IPIPE: case PGHW_FPU: case PGHW_CHIP: return (CMT_BALANCE); case PGHW_CACHE: return (CMT_AFFINITY); case PGHW_POW_ACTIVE: case PGHW_POW_IDLE: return (CMT_BALANCE); default: return (CMT_NO_POLICY); } } /* * Rank the importance of optimizing for the pg1 relationship vs. * the pg2 relationship. */ static pg_cmt_t * pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) { pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; /* * A power domain is only important if CPUPM is enabled. 
 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		/* CPUPM disabled: prefer whichever PG is NOT a PM domain */
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		/* Active power domains get power-aware event callbacks */
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs
 * than its parent.
 */
static void
cmt_hier_promote(pg_cmt_t *pg)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		group_t		*pgs;
		pg_cmt_t	*cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		pgs = &cpu->cpu_pg->pgs;
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		pgs = &cpu->cpu_pg->cmt_pgs;
		if ((idx = group_find(pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
*/ ASSERT(GROUP_SIZE(pgs) == 0); continue; } ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent); ASSERT(idx > 0); /* * Have the child and the parent swap places in the CPU's * lineage */ group_remove_at(pgs, idx); group_remove_at(pgs, idx - 1); err = group_add_at(pgs, parent, idx); ASSERT(err == 0); err = group_add_at(pgs, pg, idx - 1); ASSERT(err == 0); } /* * Update the parent references for PG and it's parent */ pg->cmt_parent = parent->cmt_parent; parent->cmt_parent = pg; start_cpus(); } /* * CMT class callback for a new CPU entering the system */ static void pg_cmt_cpu_init(cpu_t *cp) { pg_cmt_t *pg; group_t *cmt_pgs; int levels, level; pghw_type_t hw; pg_t *pg_cache = NULL; pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS]; lgrp_handle_t lgrp_handle; cmt_lgrp_t *lgrp; cmt_lineage_validation_t lineage_status; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; /* * A new CPU is coming into the system. * Interrogate the platform to see if the CPU * has any performance or efficiency relevant * sharing relationships */ cmt_pgs = &cp->cpu_pg->cmt_pgs; cp->cpu_pg->cmt_lineage = NULL; bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier)); levels = 0; for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) { pg_cmt_policy_t policy; /* * We're only interested in the hw sharing relationships * for which we know how to optimize. */ policy = pg_cmt_policy(hw); if (policy == CMT_NO_POLICY || pg_plat_hw_shared(cp, hw) == 0) continue; /* * Continue if the hardware sharing relationship has been * blacklisted. */ if (cmt_hw_blacklisted[hw]) { continue; } /* * Find (or create) the PG associated with * the hw sharing relationship in which cp * belongs. * * Determine if a suitable PG already * exists, or if one needs to be created. */ pg = (pg_cmt_t *)pghw_place_cpu(cp, hw); if (pg == NULL) { /* * Create a new one. * Initialize the common... */ pg = (pg_cmt_t *)pg_create(pg_cmt_class_id); /* ... physical ... */ pghw_init((pghw_t *)pg, cp, hw); /* * ... 
			 * and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	/* Lazily create the root lgroup's CMT state on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED))
		return;

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg);
			reorg++;
		}
		/* After any promotion, re-verify from the top down */
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top of the lineage: siblings are the lgrp's PGs */
			pg->cmt_parent = NULL;
			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);

	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
	 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);
			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			/* Another active CPU of the PG remains in oldpp */
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			/* Another active CPU of the PG is in this partition */
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU
belongs if, given the nature of the hardware sharing * relationship represented by the PG, the CPU has that * relationship with some other CPU already in the PG */ if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw)) return (1); return (0); } /* * Sort the CPUs CMT hierarchy, where "size" is the number of levels. */ static void pg_cmt_hier_sort(pg_cmt_t **hier, int size) { int i, j, inc; pg_t *tmp; pg_t **h = (pg_t **)hier; /* * First sort by number of CPUs */ inc = size / 2; while (inc > 0) { for (i = inc; i < size; i++) { j = i; tmp = h[i]; while ((j >= inc) && (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { h[j] = h[j - inc]; j = j - inc; } h[j] = tmp; } if (inc == 2) inc = 1; else inc = (inc * 5) / 11; } /* * Break ties by asking the platform. * Determine if h[i] outranks h[i + 1] and if so, swap them. */ for (i = 0; i < size - 1; i++) { if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) && pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) { tmp = h[i]; h[i] = h[i + 1]; h[i + 1] = tmp; } } } /* * Return a cmt_lgrp_t * given an lgroup handle. */ static cmt_lgrp_t * pg_cmt_find_lgrp(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = cmt_lgrps; while (lgrp != NULL) { if (lgrp->cl_hand == hand) break; lgrp = lgrp->cl_next; } return (lgrp); } /* * Create a cmt_lgrp_t with the specified handle. */ static cmt_lgrp_t * pg_cmt_lgrp_create(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); lgrp->cl_hand = hand; lgrp->cl_npgs = 0; lgrp->cl_next = cmt_lgrps; cmt_lgrps = lgrp; group_create(&lgrp->cl_pgs); return (lgrp); } /* * Interfaces to enable and disable power aware dispatching * The caller must be holding cpu_lock. * * Return 0 on success and -1 on failure. 
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg);
		}
	}

	return (0);
}

/*
 * Disable power aware dispatching for the given power domain type,
 * reverting the domains to plain load balancing.
 */
int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/*
 * Default thread switch event callback: track the PG's utilization by
 * counting transitions to and from the idle thread.
 */
/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
		    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

/*
 * Power-aware thread switch event callback: in addition to utilization
 * tracking, notify the CPU power manager of busy/idle domain transitions.
 */
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/*
 * Thread remain event callback for power domains: the domain stays busy.
 */
/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the PG hierarchy.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg.
	 * But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg,
			    GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			group_t		*pgs;
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			pgs = &cpu->cpu_pg->pgs;
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 */
			pgs = &cpu->cpu_pg->cmt_pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
			pgs = &cpu->cpu_pg->pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs).
 * In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	/*
	 * Re-read the (possibly pruned) lineage size on every pass; a
	 * successful pg_cmt_prune() shrinks *sz and jumps back here.
	 */
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has less
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status =
				    CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status =
				    CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed to
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
		/* FALLTHROUGH */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}