/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * CMT scheduler / dispatcher support * * This file implements CMT scheduler support using Processor Groups. * The CMT processor group class creates and maintains the CMT class * specific processor group pg_cmt_t. * * ---------------------------- <-- pg_cmt_t * * | pghw_t | * ---------------------------- * | CMT class specific data | * | - hierarchy linkage | * | - CMT load balancing data| * | - active CPU group/bitset| * ---------------------------- * * The scheduler/dispatcher leverages knowledge of the performance * relevant CMT sharing relationships existing between cpus to implement * optimized affinity, load balancing, and coalescence policies. * * Load balancing policy seeks to improve performance by minimizing * contention over shared processor resources / facilities, Affinity * policies seek to improve cache and TLB utilization. Coalescence * policies improve resource utilization and ultimately power efficiency. 
* * The CMT PGs created by this class are already arranged into a * hierarchy (which is done in the pghw layer). To implement the top-down * CMT load balancing algorithm, the CMT PGs additionally maintain * parent, child and sibling hierarchy relationships. * Parent PGs always contain a superset of their children(s) resources, * each PG can have at most one parent, and siblings are the group of PGs * sharing the same parent. * * On UMA based systems, the CMT load balancing algorithm begins by balancing * load across the group of top level PGs in the system hierarchy. * On NUMA systems, the CMT load balancing algorithm balances load across the * group of top level PGs in each leaf lgroup...but for root homed threads, * is willing to balance against all the top level PGs in the system. * * Groups of top level PGs are maintained to implement the above, one for each * leaf lgroup (containing the top level PGs in that lgroup), and one (for the * root lgroup) that contains all the top level PGs in the system. */ static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */ static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */ /* used for null_proc_lpa */ cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ static int is_cpu0 = 1; /* true if this is boot CPU context */ /* * Array of hardware sharing relationships that are blacklisted. * CMT scheduling optimizations won't be performed for blacklisted sharing * relationships. 
*/ static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS]; /* * Set this to non-zero to disable CMT scheduling * This must be done via kmdb -d, as /etc/system will be too late */ int cmt_sched_disabled = 0; /* * Status codes for CMT lineage validation * See pg_cmt_lineage_validate() below */ typedef enum cmt_lineage_validation { CMT_LINEAGE_VALID, CMT_LINEAGE_NON_CONCENTRIC, CMT_LINEAGE_PG_SPANS_LGRPS, CMT_LINEAGE_NON_PROMOTABLE, CMT_LINEAGE_REPAIRED, CMT_LINEAGE_UNRECOVERABLE } cmt_lineage_validation_t; /* * Status of the current lineage under construction. * One must be holding cpu_lock to change this. */ cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID; /* * Power domain definitions (on x86) are defined by ACPI, and * therefore may be subject to BIOS bugs. */ #define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw) /* * Macro to test if PG is managed by the CMT PG class */ #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) static pg_cid_t pg_cmt_class_id; /* PG class id */ static pg_t *pg_cmt_alloc(); static void pg_cmt_free(pg_t *); static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *); static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *); static void pg_cmt_cpu_active(cpu_t *); static void pg_cmt_cpu_inactive(cpu_t *); static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *); static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *); static char *pg_cmt_policy_name(pg_t *); static void pg_cmt_hier_sort(pg_cmt_t **, int); static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *); static int pg_cmt_cpu_belongs(pg_t *, cpu_t *); static int pg_cmt_hw(pghw_type_t); static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t); static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t); static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t, kthread_t *, kthread_t *); static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t, kthread_t *, kthread_t *); static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); static cmt_lineage_validation_t 
pg_cmt_lineage_validate(pg_cmt_t **, int *, cpu_pg_t *); /* * CMT PG ops */ struct pg_ops pg_ops_cmt = { pg_cmt_alloc, pg_cmt_free, pg_cmt_cpu_init, pg_cmt_cpu_fini, pg_cmt_cpu_active, pg_cmt_cpu_inactive, pg_cmt_cpupart_in, NULL, /* cpupart_out */ pg_cmt_cpupart_move, pg_cmt_cpu_belongs, pg_cmt_policy_name, }; /* * Initialize the CMT PG class */ void pg_cmt_class_init(void) { if (cmt_sched_disabled) return; pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL); } /* * Called to indicate a new CPU has started up so * that either t0 or the slave startup thread can * be accounted for. */ void pg_cmt_cpu_startup(cpu_t *cp) { pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, cp->cpu_thread); } /* * Return non-zero if thread can migrate between "from" and "to" * without a performance penalty */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { if (from->cpu_physid->cpu_cacheid == to->cpu_physid->cpu_cacheid) return (1); return (0); } /* * CMT class specific PG allocation */ static pg_t * pg_cmt_alloc(void) { return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); } /* * Class specific PG de-allocation */ static void pg_cmt_free(pg_t *pg) { ASSERT(pg != NULL); ASSERT(IS_CMT_PG(pg)); kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); } /* * Given a hardware sharing relationship, return which dispatcher * policies should be implemented to optimize performance and efficiency */ static pg_cmt_policy_t pg_cmt_policy(pghw_type_t hw) { pg_cmt_policy_t p; /* * Give the platform a chance to override the default */ if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) return (p); switch (hw) { case PGHW_IPIPE: case PGHW_FPU: case PGHW_PROCNODE: case PGHW_CHIP: return (CMT_BALANCE); case PGHW_CACHE: return (CMT_AFFINITY); case PGHW_POW_ACTIVE: case PGHW_POW_IDLE: return (CMT_BALANCE); default: return (CMT_NO_POLICY); } } /* * Rank the importance of optimizing for the pg1 relationship vs. * the pg2 relationship. 
*/
/*
 * Returns either pg1 or pg2 (never NULL).
 *
 * When cpupm_get_policy() reports CPUPM_POLICY_DISABLED and exactly one of
 * the two PGs is a power domain (PGHW_IS_PM_DOMAIN), the PG that is not a
 * power domain is returned.  In every other case the result follows
 * pg_plat_hw_rank(): pg1 if it returns pg1's hardware type, pg2 otherwise.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 *
 * PGs with CMT_NO_POLICY are left untouched.  PGHW_POW_ACTIVE PGs get the
 * power aware thread_swtch and thread_remain callbacks; every other
 * hardware type gets only the plain thread_swtch callback.
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there isn't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * The caller must hold cpu_lock.  CPUs are paused for the duration of the
 * hierarchy update and restarted before returning.  If PG has no parent,
 * the routine returns without pausing anything.
 *
 * NOTE(review): for a CPU for which pg_cpu_is_bootstrapped() is true, the
 * per-CPU data used is "pgdata"; callers in this file that pass NULL
 * (cmt_pad_enable()/cmt_pad_disable()) therefore rely on no CPU in the PG
 * being bootstrapped at that time -- confirm.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 *
	 * PG is only added when the parent was actually found (and removed)
	 * in that set.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 *
	 * PG inherits the parent's old sibling set; the parent's siblings
	 * become PG's (just swapped in) children set.
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		/* The parent is expected to sit immediately before PG */
		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 *
		 * Both entries are removed (higher index first), then the
		 * parent is re-added at idx and PG at idx - 1.
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 *
	 * This is done last: the loop above relies on pg->cmt_parent and
	 * parent->cmt_parent still holding their pre-promotion values.
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
*/
/*
 * Outline of pg_cmt_cpu_init():
 *   1. For every hardware sharing relationship that has a policy and that
 *      the platform reports as shared for cp, find or create the PG and
 *      add cp to it.  PGs whose policy includes CMT_BALANCE or
 *      CMT_COALESCE are collected in cpu_cmt_hier[].
 *   2. Look up (or create) the cmt_lgrp_t for the CPU's lgroup handle.
 *   3. Sort and validate the collected lineage; return early if the
 *      validation result is neither VALID nor REPAIRED.
 *   4. Promote already linked PGs whose parent no longer matches the
 *      sorted lineage.
 *   5. Record the lineage in pgdata->cmt_pgs (top-down) and link new PGs
 *      to their parent and sibling group.
 *   6. Cache chip/core/cache ids in cp->cpu_physid, if present.
 *
 * The caller must hold cpu_lock, and cp must still be bootstrapped.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 *
			 * NOTE(review): the result of pg_create() is used
			 * without a NULL check -- confirm it cannot fail.
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Membership is about to change; bump the generation */
		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	/* The root cmt_lgrp_t is created on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 *
	 * Whenever a promotion happened the scan restarts from the top.
	 *
	 * NOTE(review): cpu_cmt_hier[level + 1] is read with
	 * level == levels - 1, i.e. one slot past the last used entry.
	 * That slot is inside the (zeroed) array only while
	 * levels < PGHW_NUM_COMPONENTS -- confirm that always holds.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* cpu_cmt_hier[] is bottom-up; cmt_pgs is filled top-down */
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top of the lineage: siblings live in the lgrp */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		/*
		 * Only capacity is reserved here; the PG itself is added to
		 * these groups in pg_cmt_cpu_active().
		 */
		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup.
This is done by * deconfiguring it from both lgroups and processor * groups), and then later reconfiguring it back in. This * call to pg_cmt_cpu_fini() is part of that deconfiguration. * * This special case is detected by noting that the platform * has changed the CPU's lgrp affiliation (since it now * belongs in the root). In this case, use the cmt_lgrp_t * cached for the boot CPU, since this is what needs to be * torn down. */ lgrp = cpu0_lgrp; } ASSERT(lgrp != NULL); /* * First, clean up anything load balancing specific for each of * the CPU's PGs that participated in CMT load balancing */ pg = (pg_cmt_t *)pgdata->cmt_lineage; while (pg != NULL) { ((pghw_t *)pg)->pghw_generation++; /* * Remove the PG from the CPU's load balancing lineage */ (void) group_remove(cmt_pgs, pg, GRP_RESIZE); /* * If it's about to become empty, destroy it's children * group, and remove it's reference from it's siblings. * This is done here (rather than below) to avoid removing * our reference from a PG that we just eliminated. */ if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) { if (pg->cmt_children != NULL) group_destroy(pg->cmt_children); if (pg->cmt_siblings != NULL) { if (pg->cmt_siblings == &lgrp->cl_pgs) lgrp->cl_npgs--; else pg->cmt_parent->cmt_nchildren--; } } pg = pg->cmt_parent; } ASSERT(GROUP_SIZE(cmt_pgs) == 0); /* * Now that the load balancing lineage updates have happened, * remove the CPU from all it's PGs (destroying any that become * empty). */ group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; pg_cpu_delete((pg_t *)pg, cp, pgdata); /* * Deleting the CPU from the PG changes the CPU's * PG group over which we are actively iterating * Re-initialize the iteration */ group_iter_init(&i); if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) { /* * The PG has become zero sized, so destroy it. 
*/ group_destroy(&pg->cmt_cpus_actv); bitset_fini(&pg->cmt_cpus_actv_set); pghw_fini((pghw_t *)pg); pg_destroy((pg_t *)pg); } } } /* * Class callback when a CPU is entering a cpu partition */ static void pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp) { group_t *pgs; pg_t *pg; group_iter_t i; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; /* * Ensure that the new partition's PG bitset * is large enough for all CMT PG's to which cp * belongs */ group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id) bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1); } } /* * Class callback when a CPU is actually moving partitions */ static void pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp) { cpu_t *cpp; group_t *pgs; pg_t *pg; group_iter_t pg_iter; pg_cpu_itr_t cpu_iter; boolean_t found; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&pg_iter); /* * Iterate over the CPUs CMT PGs */ while ((pg = group_iterate(pgs, &pg_iter)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; /* * Add the PG to the bitset in the new partition. */ bitset_add(&newpp->cp_cmt_pgs, pg->pg_id); /* * Remove the PG from the bitset in the old partition * if the last of the PG's CPUs have left. 
*/ found = B_FALSE; PG_CPU_ITR_INIT(pg, cpu_iter); while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) { if (cpp == cp) continue; if (CPU_ACTIVE(cpp) && cpp->cpu_part->cp_id == oldpp->cp_id) { found = B_TRUE; break; } } if (!found) bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id); } } /* * Class callback when a CPU becomes active (online) * * This is called in a context where CPUs are paused */ static void pg_cmt_cpu_active(cpu_t *cp) { int err; group_iter_t i; pg_cmt_t *pg; group_t *pgs; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&i); /* * Iterate over the CPU's PGs */ while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; /* * Move to the next generation since topology is changing */ ((pghw_t *)pg)->pghw_generation++; err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); /* * If this is the first active CPU in the PG, and it * represents a hardware sharing relationship over which * CMT load balancing is performed, add it as a candidate * for balancing with it's siblings. */ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 && (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); /* * If this is a top level PG, add it as a balancing * candidate when balancing within the root lgroup. */ if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); } } /* * Notate the CPU in the PGs active CPU bitset. 
* Also notate the PG as being active in it's associated * partition */ bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid); bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); } } /* * Class callback when a CPU goes inactive (offline) * * This is called in a context where CPUs are paused */ static void pg_cmt_cpu_inactive(cpu_t *cp) { int err; group_t *pgs; pg_cmt_t *pg; cpu_t *cpp; group_iter_t i; pg_cpu_itr_t cpu_itr; boolean_t found; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; /* * Move to the next generation since topology is changing */ ((pghw_t *)pg)->pghw_generation++; /* * Remove the CPU from the CMT PGs active CPU group * bitmap */ err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid); /* * If there are no more active CPUs in this PG over which * load was balanced, remove it as a balancing candidate. 
*/ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 && (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_remove(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); } } /* * Assert the number of active CPUs does not exceed * the total number of CPUs in the PG */ ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <= GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); /* * Update the PG bitset in the CPU's old partition */ found = B_FALSE; PG_CPU_ITR_INIT(pg, cpu_itr); while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) { if (cpp == cp) continue; if (CPU_ACTIVE(cpp) && cpp->cpu_part->cp_id == cp->cpu_part->cp_id) { found = B_TRUE; break; } } if (!found) { bitset_del(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); } } } /* * Return non-zero if the CPU belongs in the given PG */ static int pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp) { cpu_t *pg_cpu; pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0); ASSERT(pg_cpu != NULL); /* * The CPU belongs if, given the nature of the hardware sharing * relationship represented by the PG, the CPU has that * relationship with some other CPU already in the PG */ if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw)) return (1); return (0); } /* * Sort the CPUs CMT hierarchy, where "size" is the number of levels. */ static void pg_cmt_hier_sort(pg_cmt_t **hier, int size) { int i, j, inc, sz; int start, end; pg_t *tmp; pg_t **h = (pg_t **)hier; /* * First sort by number of CPUs */ inc = size / 2; while (inc > 0) { for (i = inc; i < size; i++) { j = i; tmp = h[i]; while ((j >= inc) && (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { h[j] = h[j - inc]; j = j - inc; } h[j] = tmp; } if (inc == 2) inc = 1; else inc = (inc * 5) / 11; } /* * Break ties by asking the platform. * Determine if h[i] outranks h[i + 1] and if so, swap them. 
*/ for (start = 0; start < size; start++) { /* * Find various contiguous sets of elements, * in the array, with the same number of cpus */ end = start; sz = PG_NUM_CPUS(h[start]); while ((end < size) && (sz == PG_NUM_CPUS(h[end]))) end++; /* * Sort each such set of the array by rank */ for (i = start + 1; i < end; i++) { j = i - 1; tmp = h[i]; while (j >= start && pg_cmt_hier_rank(hier[j], (pg_cmt_t *)tmp) == hier[j]) { h[j + 1] = h[j]; j--; } h[j + 1] = tmp; } } } /* * Return a cmt_lgrp_t * given an lgroup handle. */ static cmt_lgrp_t * pg_cmt_find_lgrp(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = cmt_lgrps; while (lgrp != NULL) { if (lgrp->cl_hand == hand) break; lgrp = lgrp->cl_next; } return (lgrp); } /* * Create a cmt_lgrp_t with the specified handle. */ static cmt_lgrp_t * pg_cmt_lgrp_create(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); lgrp->cl_hand = hand; lgrp->cl_npgs = 0; lgrp->cl_next = cmt_lgrps; cmt_lgrps = lgrp; group_create(&lgrp->cl_pgs); return (lgrp); } /* * Interfaces to enable and disable power aware dispatching * The caller must be holding cpu_lock. * * Return 0 on success and -1 on failure. */ int cmt_pad_enable(pghw_type_t type) { group_t *hwset; group_iter_t iter; pg_cmt_t *pg; ASSERT(PGHW_IS_PM_DOMAIN(type)); ASSERT(MUTEX_HELD(&cpu_lock)); if ((hwset = pghw_set_lookup(type)) == NULL || cmt_hw_blacklisted[type]) { /* * Unable to find any instances of the specified type * of power domain, or the power domains have been blacklisted. */ return (-1); } /* * Iterate over the power domains, setting the default dispatcher * policy for power/performance optimization. * * Simply setting the policy isn't enough in the case where the power * domain is an only child of another PG. Because the dispatcher walks * the PG hierarchy in a top down fashion, the higher up PG's policy * will dominate. 
So promote the power domain above it's parent if both * PG and it's parent have the same CPUs to ensure it's policy * dominates. */ group_iter_init(&iter); while ((pg = group_iterate(hwset, &iter)) != NULL) { /* * If the power domain is an only child to a parent * not implementing the same policy, promote the child * above the parent to activate the policy. */ pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); while ((pg->cmt_parent != NULL) && (pg->cmt_parent->cmt_policy != pg->cmt_policy) && (PG_NUM_CPUS((pg_t *)pg) == PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { cmt_hier_promote(pg, NULL); } } return (0); } int cmt_pad_disable(pghw_type_t type) { group_t *hwset; group_iter_t iter; pg_cmt_t *pg; pg_cmt_t *child; ASSERT(PGHW_IS_PM_DOMAIN(type)); ASSERT(MUTEX_HELD(&cpu_lock)); if ((hwset = pghw_set_lookup(type)) == NULL) { /* * Unable to find any instances of the specified type of * power domain. */ return (-1); } /* * Iterate over the power domains, setting the default dispatcher * policy for performance optimization (load balancing). */ group_iter_init(&iter); while ((pg = group_iterate(hwset, &iter)) != NULL) { /* * If the power domain has an only child that implements * policy other than load balancing, promote the child * above the power domain to ensure it's policy dominates. */ if (pg->cmt_children != NULL && GROUP_SIZE(pg->cmt_children) == 1) { child = GROUP_ACCESS(pg->cmt_children, 0); if ((child->cmt_policy & CMT_BALANCE) == 0) { cmt_hier_promote(child, NULL); } } pg->cmt_policy = CMT_BALANCE; } return (0); } /* ARGSUSED */ static void cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, kthread_t *new) { pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; if (old == cp->cpu_idle_thread) { atomic_add_32(&cmt_pg->cmt_utilization, 1); } else if (new == cp->cpu_idle_thread) { atomic_add_32(&cmt_pg->cmt_utilization, -1); } } /* * Macro to test whether a thread is currently runnable on a CPU in a PG. 
*/ #define THREAD_RUNNABLE_IN_PG(t, pg) \ ((t)->t_state == TS_RUN && \ (t)->t_disp_queue->disp_cpu && \ bitset_in_set(&(pg)->cmt_cpus_actv_set, \ (t)->t_disp_queue->disp_cpu->cpu_seqid)) static void cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, kthread_t *new) { pg_cmt_t *cmt = (pg_cmt_t *)pg; cpupm_domain_t *dom; uint32_t u; if (old == cp->cpu_idle_thread) { ASSERT(new != cp->cpu_idle_thread); u = atomic_add_32_nv(&cmt->cmt_utilization, 1); if (u == 1) { /* * Notify the CPU power manager that the domain * is non-idle. */ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE); } } else if (new == cp->cpu_idle_thread) { ASSERT(old != cp->cpu_idle_thread); u = atomic_add_32_nv(&cmt->cmt_utilization, -1); if (u == 0) { /* * The domain is idle, notify the CPU power * manager. * * Avoid notifying if the thread is simply migrating * between CPUs in the domain. */ if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY); } } } } /* ARGSUSED */ static void cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) { pg_cmt_t *cmt = (pg_cmt_t *)pg; cpupm_domain_t *dom; dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); } /* * Return the name of the CMT scheduling policy * being implemented across this PG */ static char * pg_cmt_policy_name(pg_t *pg) { pg_cmt_policy_t policy; policy = ((pg_cmt_t *)pg)->cmt_policy; if (policy & CMT_AFFINITY) { if (policy & CMT_BALANCE) return ("Load Balancing & Affinity"); else if (policy & CMT_COALESCE) return ("Load Coalescence & Affinity"); else return ("Affinity"); } else { if (policy & CMT_BALANCE) return ("Load Balancing"); else if (policy & CMT_COALESCE) return ("Load Coalescence"); else return ("None"); } } /* * Prune PG, and all other instances of PG's hardware sharing relationship * 
from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * Arguments:
 *   pg_bad  - PG whose hardware sharing relationship type is being pruned
 *   lineage - array of PGs; pg_bad is removed from it if present, and the
 *             entries after it are shifted down by one
 *   sz      - in/out number of entries in lineage; decremented only when
 *             pg_bad was found in the array
 *   pgdata  - see above
 *
 * The caller must hold cpu_lock (asserted). CPUs are paused while the
 * hierarchy is being modified and restarted before returning.
 *
 * Always returns 0.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			/*
			 * PG itself leaves the siblings group, so only
			 * (children - 1) additional slots are requested.
			 */
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					/*
					 * GRP_NORESIZE: the groups were sized
					 * before the CPUs were paused. The
					 * result "r" is only checked by
					 * ASSERT.
					 */
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is still under construction
			 * references the bootstrap CPU PG data structure,
			 * so use the caller supplied "pgdata" for it instead.
			 *
			 * NOTE(review): cpd is dereferenced unconditionally
			 * below; presumably pgdata is never NULL when a
			 * bootstrapped CPU is found here -- confirm against
			 * the callers.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 *
 * With all CPUs paused, empty the cmt_pgs group of every CPU on cpu_list
 * that has PG data, and set cmt_sched_disabled. A note is logged once the
 * CPUs have been restarted. The caller must hold cpu_lock (asserted).
 */
static void
pg_cmt_disable(void)
{
	cpu_t		*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	/* cpu_list is walked as a circular list via cpu_next. */
	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage.
This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the CMT
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * Note that this routine never stores CMT_LINEAGE_VALID itself: when no
 * problem is detected, the value already held in cmt_lineage_status is what
 * gets returned.
 *
 * The second argument, "sz", is the number of entries in the lineage. It is
 * passed on to pg_cmt_prune() and re-read at the start of every validation
 * pass.
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Every successful prune jumps back here so that the (now shorter)
	 * lineage is checked again from the beginning.
	 */
revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPUs PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 *
			 * "lgrp" is only reset at the revalidate label, not
			 * per PG, so every CPU visited during this pass is
			 * compared against the lgroup of the first CPU seen.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

	/*
	 * This label is also reached by falling out of the loop above when
	 * no problem was detected. When reached via goto, "pg" and "pg_next"
	 * still hold the values from the iteration that detected the problem.
	 */
handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 *
		 * When both PGs have the same number of CPUs and both are
		 * suspect, the checks below select pg_next.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}