/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The original include directives lost their header names; the set below is
 * a best-guess reconstruction covering the kernel interfaces referenced in
 * this file (PG/PGHW/CMT, groups, bitsets, lgroups, CPU power management,
 * and general kernel facilities).
 */
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/group.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>
#include <sys/pg.h>
#include <sys/pghw.h>
#include <sys/cmn_err.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 *	----------------------------  <-- pg_cmt_t
 *	| pghw_t                   |
 *	----------------------------
 *	| CMT class specific data  |
 *	| - hierarchy linkage      |
 *	| - CMT load balancing data|
 *	| - active CPU group/bitset|
 *	----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities.  Affinity
 * policies seek to improve cache and TLB utilization.  Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer).  To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups.  On UMA based systems, there
 * exists a top level group of PGs to balance across.  On NUMA systems,
 * multiple top level groups are instantiated, where the top level
 * balancing begins by balancing across the CMT PGs within their
 * respective (per lgroup) top level groups.
 */

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
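 * An entry is set at runtime when pg_cmt_prune() removes a suspect grouping
 * from the hierarchy (see pg_cmt_lineage_validate()), which prevents PGs for
 * that hardware sharing relationship from being created again as later CPUs
 * come in through pg_cmt_cpu_init().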
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 */
int			cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static int		pg_cmt_lineage_validate(pg_cmt_t **, int *);

static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
static cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
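 * The notification takes the form of a thread switch event from the CPU's
 * idle thread to the thread currently running on it, so the utilization
 * accounting of the CPU's CMT PGs (see cmt_ev_thread_swtch()) starts out
 * reflecting a busy CPU.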
*/ void pg_cmt_cpu_startup(cpu_t *cp) { pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, cp->cpu_thread); } /* * Return non-zero if thread can migrate between "from" and "to" * without a performance penalty */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { if (from->cpu_physid->cpu_cacheid == to->cpu_physid->cpu_cacheid) return (1); return (0); } /* * CMT class specific PG allocation */ static pg_t * pg_cmt_alloc(void) { return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); } /* * Class specific PG de-allocation */ static void pg_cmt_free(pg_t *pg) { ASSERT(pg != NULL); ASSERT(IS_CMT_PG(pg)); kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); } /* * Given a hardware sharing relationship, return which dispatcher * policies should be implemented to optimize performance and efficiency */ static pg_cmt_policy_t pg_cmt_policy(pghw_type_t hw) { pg_cmt_policy_t p; /* * Give the platform a chance to override the default */ if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) return (p); switch (hw) { case PGHW_IPIPE: case PGHW_FPU: case PGHW_CHIP: return (CMT_BALANCE); case PGHW_CACHE: return (CMT_AFFINITY); case PGHW_POW_ACTIVE: case PGHW_POW_IDLE: return (CMT_BALANCE); default: return (CMT_NO_POLICY); } } /* * Rank the importance of optimizing for the pg1 relationship vs. * the pg2 relationship. */ static pg_cmt_t * pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) { pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; /* * A power domain is only important if CPUPM is enabled. */ if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) { if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2)) return (pg2); if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1)) return (pg1); } /* * Otherwise, ask the platform */ if (pg_plat_hw_rank(hw1, hw2) == hw1) return (pg1); else return (pg2); } /* * Initialize CMT callbacks for the given PG */ static void cmt_callback_init(pg_t *pg) { switch (((pghw_t *)pg)->pghw_hw) { case PGHW_POW_ACTIVE: pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr; pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr; break; default: pg->pg_cb.thread_swtch = cmt_ev_thread_swtch; } } /* * Promote PG above it's current parent. * This is only legal if PG has an equal or greater number of CPUs * than it's parent. */ static void cmt_hier_promote(pg_cmt_t *pg) { pg_cmt_t *parent; group_t *children; cpu_t *cpu; group_iter_t iter; pg_cpu_itr_t cpu_iter; int r; int err; ASSERT(MUTEX_HELD(&cpu_lock)); parent = pg->cmt_parent; if (parent == NULL) { /* * Nothing to do */ return; } ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent)); /* * We're changing around the hierarchy, which is actively traversed * by the dispatcher. Pause CPUS to ensure exclusivity. */ pause_cpus(NULL); /* * If necessary, update the parent's sibling set, replacing parent * with PG. */ if (parent->cmt_siblings) { if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE) != -1) { r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE); ASSERT(r != -1); } } /* * If the parent is at the top of the hierarchy, replace it's entry * in the root lgroup's group of top level PGs. */ if (parent->cmt_parent == NULL && parent->cmt_siblings != &cmt_root->cl_pgs) { if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE) != -1) { r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(r != -1); } } /* * We assume (and therefore assert) that the PG being promoted is an * only child of it's parent. 
Update the parent's children set * replacing PG's entry with the parent (since the parent is becoming * the child). Then have PG and the parent swap children sets. */ ASSERT(GROUP_SIZE(parent->cmt_children) <= 1); if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) { r = group_add(parent->cmt_children, parent, GRP_NORESIZE); ASSERT(r != -1); } children = pg->cmt_children; pg->cmt_children = parent->cmt_children; parent->cmt_children = children; /* * Update the sibling references for PG and it's parent */ pg->cmt_siblings = parent->cmt_siblings; parent->cmt_siblings = pg->cmt_children; /* * Update any cached lineages in the per CPU pg data. */ PG_CPU_ITR_INIT(pg, cpu_iter); while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { int idx; group_t *pgs; pg_cmt_t *cpu_pg; /* * Iterate over the CPU's PGs updating the children * of the PG being promoted, since they have a new parent. */ pgs = &cpu->cpu_pg->pgs; group_iter_init(&iter); while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) { if (cpu_pg->cmt_parent == pg) { cpu_pg->cmt_parent = parent; } } /* * Update the CMT load balancing lineage */ pgs = &cpu->cpu_pg->cmt_pgs; if ((idx = group_find(pgs, (void *)pg)) == -1) { /* * Unless this is the CPU who's lineage is being * constructed, the PG being promoted should be * in the lineage. */ ASSERT(GROUP_SIZE(pgs) == 0); continue; } ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent); ASSERT(idx > 0); /* * Have the child and the parent swap places in the CPU's * lineage */ group_remove_at(pgs, idx); group_remove_at(pgs, idx - 1); err = group_add_at(pgs, parent, idx); ASSERT(err == 0); err = group_add_at(pgs, pg, idx - 1); ASSERT(err == 0); } /* * Update the parent references for PG and it's parent */ pg->cmt_parent = parent->cmt_parent; parent->cmt_parent = pg; start_cpus(); } /* * CMT class callback for a new CPU entering the system */ static void pg_cmt_cpu_init(cpu_t *cp) { pg_cmt_t *pg; group_t *cmt_pgs; int levels, level; pghw_type_t hw; pg_t *pg_cache = NULL; pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS]; lgrp_handle_t lgrp_handle; cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; /* * A new CPU is coming into the system. * Interrogate the platform to see if the CPU * has any performance or efficiency relevant * sharing relationships */ cmt_pgs = &cp->cpu_pg->cmt_pgs; cp->cpu_pg->cmt_lineage = NULL; bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier)); levels = 0; for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) { pg_cmt_policy_t policy; /* * We're only interested in the hw sharing relationships * for which we know how to optimize. */ policy = pg_cmt_policy(hw); if (policy == CMT_NO_POLICY || pg_plat_hw_shared(cp, hw) == 0) continue; /* * Continue if the hardware sharing relationship has been * blacklisted. */ if (cmt_hw_blacklisted[hw]) { continue; } /* * Find (or create) the PG associated with * the hw sharing relationship in which cp * belongs. * * Determine if a suitable PG already * exists, or if one needs to be created. */ pg = (pg_cmt_t *)pghw_place_cpu(cp, hw); if (pg == NULL) { /* * Create a new one. * Initialize the common... */ pg = (pg_cmt_t *)pg_create(pg_cmt_class_id); /* ... physical ... */ pghw_init((pghw_t *)pg, cp, hw); /* * ... and CMT specific portions of the * structure. 
*/ pg->cmt_policy = policy; /* CMT event callbacks */ cmt_callback_init((pg_t *)pg); bitset_init(&pg->cmt_cpus_actv_set); group_create(&pg->cmt_cpus_actv); } else { ASSERT(IS_CMT_PG(pg)); } /* Add the CPU to the PG */ pg_cpu_add((pg_t *)pg, cp); /* * Ensure capacity of the active CPU group/bitset */ group_expand(&pg->cmt_cpus_actv, GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); if (cp->cpu_seqid >= bitset_capacity(&pg->cmt_cpus_actv_set)) { bitset_resize(&pg->cmt_cpus_actv_set, cp->cpu_seqid + 1); } /* * Build a lineage of CMT PGs for load balancing / coalescence */ if (policy & (CMT_BALANCE | CMT_COALESCE)) { cpu_cmt_hier[levels++] = pg; } /* Cache this for later */ if (hw == PGHW_CACHE) pg_cache = (pg_t *)pg; } group_expand(cmt_pgs, levels); if (cmt_root == NULL) cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand()); /* * Find the lgrp that encapsulates this CPU's CMT hierarchy */ lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL) lgrp = pg_cmt_lgrp_create(lgrp_handle); /* * Ascendingly sort the PGs in the lineage by number of CPUs */ pg_cmt_hier_sort(cpu_cmt_hier, levels); /* * Examine the lineage and validate it. * This routine will also try to fix the lineage along with the * rest of the PG hierarchy should it detect an issue. * * If it returns -1, an unrecoverable error has happened and we * need to return. */ if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0) return; /* * For existing PGs in the lineage, verify that the parent is * correct, as the generation in the lineage may have changed * as a result of the sorting. Start the traversal at the top * of the lineage, moving down. */ for (level = levels - 1; level >= 0; ) { int reorg; reorg = 0; pg = cpu_cmt_hier[level]; /* * Promote PGs at an incorrect generation into place. */ while (pg->cmt_parent && pg->cmt_parent != cpu_cmt_hier[level + 1]) { cmt_hier_promote(pg); reorg++; } if (reorg > 0) level = levels - 1; else level--; } /* * For each of the PGs in the CPU's lineage: * - Add an entry in the CPU sorted CMT PG group * which is used for top down CMT load balancing * - Tie the PG into the CMT hierarchy by connecting * it to it's parent and siblings. */ for (level = 0; level < levels; level++) { uint_t children; int err; pg = cpu_cmt_hier[level]; err = group_add_at(cmt_pgs, pg, levels - level - 1); ASSERT(err == 0); if (level == 0) cp->cpu_pg->cmt_lineage = (pg_t *)pg; if (pg->cmt_siblings != NULL) { /* Already initialized */ ASSERT(pg->cmt_parent == NULL || pg->cmt_parent == cpu_cmt_hier[level + 1]); ASSERT(pg->cmt_siblings == &lgrp->cl_pgs || ((pg->cmt_parent != NULL) && pg->cmt_siblings == pg->cmt_parent->cmt_children)); continue; } if ((level + 1) == levels) { pg->cmt_parent = NULL; pg->cmt_siblings = &lgrp->cl_pgs; children = ++lgrp->cl_npgs; if (cmt_root != lgrp) cmt_root->cl_npgs++; } else { pg->cmt_parent = cpu_cmt_hier[level + 1]; /* * A good parent keeps track of their children. * The parent's children group is also the PG's * siblings. */ if (pg->cmt_parent->cmt_children == NULL) { pg->cmt_parent->cmt_children = kmem_zalloc(sizeof (group_t), KM_SLEEP); group_create(pg->cmt_parent->cmt_children); } pg->cmt_siblings = pg->cmt_parent->cmt_children; children = ++pg->cmt_parent->cmt_nchildren; } group_expand(pg->cmt_siblings, children); group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs); } /* * Cache the chip and core IDs in the cpu_t->cpu_physid structure * for fast lookups later. 
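	 * In particular, cpu_cacheid (set below from the shared cache PG when
	 * one exists) is what pg_cmt_can_migrate() compares to decide whether
	 * a thread can move between two CPUs without a performance penalty.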
*/ if (cp->cpu_physid) { cp->cpu_physid->cpu_chipid = pg_plat_hw_instance_id(cp, PGHW_CHIP); cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp); /* * If this cpu has a PG representing shared cache, then set * cpu_cacheid to that PG's logical id */ if (pg_cache) cp->cpu_physid->cpu_cacheid = pg_cache->pg_id; } /* CPU0 only initialization */ if (is_cpu0) { pg_cmt_cpu_startup(cp); is_cpu0 = 0; cpu0_lgrp = lgrp; } } /* * Class callback when a CPU is leaving the system (deletion) */ static void pg_cmt_cpu_fini(cpu_t *cp) { group_iter_t i; pg_cmt_t *pg; group_t *pgs, *cmt_pgs; lgrp_handle_t lgrp_handle; cmt_lgrp_t *lgrp; if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; cmt_pgs = &cp->cpu_pg->cmt_pgs; /* * Find the lgroup that encapsulates this CPU's CMT hierarchy */ lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); lgrp = pg_cmt_find_lgrp(lgrp_handle); if (ncpus == 1 && lgrp != cpu0_lgrp) { /* * One might wonder how we could be deconfiguring the * only CPU in the system. * * On Starcat systems when null_proc_lpa is detected, * the boot CPU (which is already configured into a leaf * lgroup), is moved into the root lgroup. This is done by * deconfiguring it from both lgroups and processor * groups), and then later reconfiguring it back in. This * call to pg_cmt_cpu_fini() is part of that deconfiguration. * * This special case is detected by noting that the platform * has changed the CPU's lgrp affiliation (since it now * belongs in the root). In this case, use the cmt_lgrp_t * cached for the boot CPU, since this is what needs to be * torn down. */ lgrp = cpu0_lgrp; } ASSERT(lgrp != NULL); /* * First, clean up anything load balancing specific for each of * the CPU's PGs that participated in CMT load balancing */ pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage; while (pg != NULL) { /* * Remove the PG from the CPU's load balancing lineage */ (void) group_remove(cmt_pgs, pg, GRP_RESIZE); /* * If it's about to become empty, destroy it's children * group, and remove it's reference from it's siblings. * This is done here (rather than below) to avoid removing * our reference from a PG that we just eliminated. */ if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) { if (pg->cmt_children != NULL) group_destroy(pg->cmt_children); if (pg->cmt_siblings != NULL) { if (pg->cmt_siblings == &lgrp->cl_pgs) lgrp->cl_npgs--; else pg->cmt_parent->cmt_nchildren--; } } pg = pg->cmt_parent; } ASSERT(GROUP_SIZE(cmt_pgs) == 0); /* * Now that the load balancing lineage updates have happened, * remove the CPU from all it's PGs (destroying any that become * empty). */ group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; pg_cpu_delete((pg_t *)pg, cp); /* * Deleting the CPU from the PG changes the CPU's * PG group over which we are actively iterating * Re-initialize the iteration */ group_iter_init(&i); if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) { /* * The PG has become zero sized, so destroy it. 
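			 * The CMT class specific state (the active CPU group
			 * and bitset) is released first, followed by the
			 * physical (pghw) and common (pg) layers, the reverse
			 * of the construction order in pg_cmt_cpu_init().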
*/ group_destroy(&pg->cmt_cpus_actv); bitset_fini(&pg->cmt_cpus_actv_set); pghw_fini((pghw_t *)pg); pg_destroy((pg_t *)pg); } } } /* * Class callback when a CPU is entering a cpu partition */ static void pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp) { group_t *pgs; pg_t *pg; group_iter_t i; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; /* * Ensure that the new partition's PG bitset * is large enough for all CMT PG's to which cp * belongs */ group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id) bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1); } } /* * Class callback when a CPU is actually moving partitions */ static void pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp) { cpu_t *cpp; group_t *pgs; pg_t *pg; group_iter_t pg_iter; pg_cpu_itr_t cpu_iter; boolean_t found; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&pg_iter); /* * Iterate over the CPUs CMT PGs */ while ((pg = group_iterate(pgs, &pg_iter)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; /* * Add the PG to the bitset in the new partition. */ bitset_add(&newpp->cp_cmt_pgs, pg->pg_id); /* * Remove the PG from the bitset in the old partition * if the last of the PG's CPUs have left. */ found = B_FALSE; PG_CPU_ITR_INIT(pg, cpu_iter); while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) { if (cpp == cp) continue; if (CPU_ACTIVE(cpp) && cpp->cpu_part->cp_id == oldpp->cp_id) { found = B_TRUE; break; } } if (!found) bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id); } } /* * Class callback when a CPU becomes active (online) * * This is called in a context where CPUs are paused */ static void pg_cmt_cpu_active(cpu_t *cp) { int err; group_iter_t i; pg_cmt_t *pg; group_t *pgs; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&i); /* * Iterate over the CPU's PGs */ while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); /* * If this is the first active CPU in the PG, and it * represents a hardware sharing relationship over which * CMT load balancing is performed, add it as a candidate * for balancing with it's siblings. */ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 && (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); /* * If this is a top level PG, add it as a balancing * candidate when balancing within the root lgroup. */ if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); } } /* * Notate the CPU in the PGs active CPU bitset. 
* Also notate the PG as being active in it's associated * partition */ bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid); bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); } } /* * Class callback when a CPU goes inactive (offline) * * This is called in a context where CPUs are paused */ static void pg_cmt_cpu_inactive(cpu_t *cp) { int err; group_t *pgs; pg_cmt_t *pg; cpu_t *cpp; group_iter_t i; pg_cpu_itr_t cpu_itr; boolean_t found; ASSERT(MUTEX_HELD(&cpu_lock)); if (cmt_sched_disabled) return; pgs = &cp->cpu_pg->pgs; group_iter_init(&i); while ((pg = group_iterate(pgs, &i)) != NULL) { if (IS_CMT_PG(pg) == 0) continue; /* * Remove the CPU from the CMT PGs active CPU group * bitmap */ err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid); /* * If there are no more active CPUs in this PG over which * load was balanced, remove it as a balancing candidate. */ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 && (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_remove(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); } } /* * Assert the number of active CPUs does not exceed * the total number of CPUs in the PG */ ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <= GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); /* * Update the PG bitset in the CPU's old partition */ found = B_FALSE; PG_CPU_ITR_INIT(pg, cpu_itr); while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) { if (cpp == cp) continue; if (CPU_ACTIVE(cpp) && cpp->cpu_part->cp_id == cp->cpu_part->cp_id) { found = B_TRUE; break; } } if (!found) { bitset_del(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); } } } /* * Return non-zero if the CPU belongs in the given PG */ static int pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp) { cpu_t *pg_cpu; pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0); ASSERT(pg_cpu != NULL); /* * The CPU belongs if, given the nature of the hardware sharing * relationship represented by the PG, the CPU has that * relationship with some other CPU already in the PG */ if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw)) return (1); return (0); } /* * Sort the CPUs CMT hierarchy, where "size" is the number of levels. */ static void pg_cmt_hier_sort(pg_cmt_t **hier, int size) { int i, j, inc; pg_t *tmp; pg_t **h = (pg_t **)hier; /* * First sort by number of CPUs */ inc = size / 2; while (inc > 0) { for (i = inc; i < size; i++) { j = i; tmp = h[i]; while ((j >= inc) && (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { h[j] = h[j - inc]; j = j - inc; } h[j] = tmp; } if (inc == 2) inc = 1; else inc = (inc * 5) / 11; } /* * Break ties by asking the platform. * Determine if h[i] outranks h[i + 1] and if so, swap them. */ for (i = 0; i < size - 1; i++) { if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) && pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) { tmp = h[i]; h[i] = h[i + 1]; h[i + 1] = tmp; } } } /* * Return a cmt_lgrp_t * given an lgroup handle. */ static cmt_lgrp_t * pg_cmt_find_lgrp(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = cmt_lgrps; while (lgrp != NULL) { if (lgrp->cl_hand == hand) break; lgrp = lgrp->cl_next; } return (lgrp); } /* * Create a cmt_lgrp_t with the specified handle. 
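 * The caller must be holding cpu_lock.  The new cmt_lgrp_t is linked onto
 * the global cmt_lgrps list that pg_cmt_find_lgrp() searches.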
*/ static cmt_lgrp_t * pg_cmt_lgrp_create(lgrp_handle_t hand) { cmt_lgrp_t *lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); lgrp->cl_hand = hand; lgrp->cl_npgs = 0; lgrp->cl_next = cmt_lgrps; cmt_lgrps = lgrp; group_create(&lgrp->cl_pgs); return (lgrp); } /* * Interfaces to enable and disable power aware dispatching * The caller must be holding cpu_lock. * * Return 0 on success and -1 on failure. */ int cmt_pad_enable(pghw_type_t type) { group_t *hwset; group_iter_t iter; pg_cmt_t *pg; ASSERT(PGHW_IS_PM_DOMAIN(type)); ASSERT(MUTEX_HELD(&cpu_lock)); if ((hwset = pghw_set_lookup(type)) == NULL || cmt_hw_blacklisted[type]) { /* * Unable to find any instances of the specified type * of power domain, or the power domains have been blacklisted. */ return (-1); } /* * Iterate over the power domains, setting the default dispatcher * policy for power/performance optimization. * * Simply setting the policy isn't enough in the case where the power * domain is an only child of another PG. Because the dispatcher walks * the PG hierarchy in a top down fashion, the higher up PG's policy * will dominate. So promote the power domain above it's parent if both * PG and it's parent have the same CPUs to ensure it's policy * dominates. */ group_iter_init(&iter); while ((pg = group_iterate(hwset, &iter)) != NULL) { /* * If the power domain is an only child to a parent * not implementing the same policy, promote the child * above the parent to activate the policy. */ pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); while ((pg->cmt_parent != NULL) && (pg->cmt_parent->cmt_policy != pg->cmt_policy) && (PG_NUM_CPUS((pg_t *)pg) == PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { cmt_hier_promote(pg); } } return (0); } int cmt_pad_disable(pghw_type_t type) { group_t *hwset; group_iter_t iter; pg_cmt_t *pg; pg_cmt_t *child; ASSERT(PGHW_IS_PM_DOMAIN(type)); ASSERT(MUTEX_HELD(&cpu_lock)); if ((hwset = pghw_set_lookup(type)) == NULL) { /* * Unable to find any instances of the specified type of * power domain. */ return (-1); } /* * Iterate over the power domains, setting the default dispatcher * policy for performance optimization (load balancing). */ group_iter_init(&iter); while ((pg = group_iterate(hwset, &iter)) != NULL) { /* * If the power domain has an only child that implements * policy other than load balancing, promote the child * above the power domain to ensure it's policy dominates. */ if (pg->cmt_children != NULL && GROUP_SIZE(pg->cmt_children) == 1) { child = GROUP_ACCESS(pg->cmt_children, 0); if ((child->cmt_policy & CMT_BALANCE) == 0) { cmt_hier_promote(child); } } pg->cmt_policy = CMT_BALANCE; } return (0); } /* ARGSUSED */ static void cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, kthread_t *new) { pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; if (old == cp->cpu_idle_thread) { atomic_add_32(&cmt_pg->cmt_utilization, 1); } else if (new == cp->cpu_idle_thread) { atomic_add_32(&cmt_pg->cmt_utilization, -1); } } /* * Macro to test whether a thread is currently runnable on a CPU in a PG. 
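 * It is used by cmt_ev_thread_swtch_pwr() below to avoid notifying the CPU
 * power manager that a domain has gone idle when the outgoing thread is
 * simply migrating to another CPU within the same domain.
 */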
*/ #define THREAD_RUNNABLE_IN_PG(t, pg) \ ((t)->t_state == TS_RUN && \ (t)->t_disp_queue->disp_cpu && \ bitset_in_set(&(pg)->cmt_cpus_actv_set, \ (t)->t_disp_queue->disp_cpu->cpu_seqid)) static void cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, kthread_t *new) { pg_cmt_t *cmt = (pg_cmt_t *)pg; cpupm_domain_t *dom; uint32_t u; if (old == cp->cpu_idle_thread) { ASSERT(new != cp->cpu_idle_thread); u = atomic_add_32_nv(&cmt->cmt_utilization, 1); if (u == 1) { /* * Notify the CPU power manager that the domain * is non-idle. */ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE); } } else if (new == cp->cpu_idle_thread) { ASSERT(old != cp->cpu_idle_thread); u = atomic_add_32_nv(&cmt->cmt_utilization, -1); if (u == 0) { /* * The domain is idle, notify the CPU power * manager. * * Avoid notifying if the thread is simply migrating * between CPUs in the domain. */ if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY); } } } } /* ARGSUSED */ static void cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) { pg_cmt_t *cmt = (pg_cmt_t *)pg; cpupm_domain_t *dom; dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); } /* * Return the name of the CMT scheduling policy * being implemented across this PG */ static char * pg_cmt_policy_name(pg_t *pg) { pg_cmt_policy_t policy; policy = ((pg_cmt_t *)pg)->cmt_policy; if (policy & CMT_AFFINITY) { if (policy & CMT_BALANCE) return ("Load Balancing & Affinity"); else if (policy & CMT_COALESCE) return ("Load Coalescence & Affinity"); else return ("Affinity"); } else { if (policy & CMT_BALANCE) return ("Load Balancing"); else if (policy & CMT_COALESCE) return ("Load Coalescence"); else return ("None"); } } /* * Prune PG, and all other instances of PG's hardware sharing relationship * from the PG hierarchy. */ static int pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz) { group_t *hwset, *children; int i, j, r, size = *sz; group_iter_t hw_iter, child_iter; pg_cpu_itr_t cpu_iter; pg_cmt_t *pg, *child; cpu_t *cpu; int cap_needed; pghw_type_t hw; ASSERT(MUTEX_HELD(&cpu_lock)); hw = ((pghw_t *)pg_bad)->pghw_hw; if (hw == PGHW_POW_ACTIVE) { cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. " "Event Based CPUPM Unavailable"); } else if (hw == PGHW_POW_IDLE) { cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. " "Dispatcher assisted CPUPM disabled."); } /* * Find and eliminate the PG from the lineage. */ for (i = 0; i < size; i++) { if (lineage[i] == pg_bad) { for (j = i; j < size - 1; j++) lineage[j] = lineage[j + 1]; *sz = size - 1; break; } } /* * We'll prune all instances of the hardware sharing relationship * represented by pg. But before we do that (and pause CPUs) we need * to ensure the hierarchy's groups are properly sized. */ hwset = pghw_set_lookup(hw); /* * Blacklist the hardware so that future groups won't be created. */ cmt_hw_blacklisted[hw] = 1; /* * For each of the PGs being pruned, ensure sufficient capacity in * the siblings set for the PG's children */ group_iter_init(&hw_iter); while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { /* * PG is being pruned, but if it is bringing up more than * one child, ask for more capacity in the siblings group. 
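		 * The extra capacity is reserved here, before CPUs are
		 * paused, because the transplant below adds the children to
		 * the siblings set with GRP_NORESIZE and so cannot grow the
		 * group at that point.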
*/ cap_needed = 0; if (pg->cmt_children && GROUP_SIZE(pg->cmt_children) > 1) { cap_needed = GROUP_SIZE(pg->cmt_children) - 1; group_expand(pg->cmt_siblings, GROUP_SIZE(pg->cmt_siblings) + cap_needed); /* * If this is a top level group, also ensure the * capacity in the root lgrp level CMT grouping. */ if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { group_expand(&cmt_root->cl_pgs, GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed); } } } /* * We're operating on the PG hierarchy. Pause CPUs to ensure * exclusivity with respect to the dispatcher. */ pause_cpus(NULL); /* * Prune all PG instances of the hardware sharing relationship * represented by pg. */ group_iter_init(&hw_iter); while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { /* * Remove PG from it's group of siblings, if it's there. */ if (pg->cmt_siblings) { (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); } if (pg->cmt_parent == NULL && pg->cmt_siblings != &cmt_root->cl_pgs) { (void) group_remove(&cmt_root->cl_pgs, pg, GRP_NORESIZE); } /* * Add PGs children to it's group of siblings. */ if (pg->cmt_children != NULL) { children = pg->cmt_children; group_iter_init(&child_iter); while ((child = group_iterate(children, &child_iter)) != NULL) { /* * Transplant child from it's siblings set to * PGs. */ if (pg->cmt_siblings != NULL && child->cmt_siblings != NULL && group_remove(child->cmt_siblings, child, GRP_NORESIZE) != -1) { r = group_add(pg->cmt_siblings, child, GRP_NORESIZE); ASSERT(r == 0); } } } /* * Reset the callbacks to the defaults */ pg_callback_set_defaults((pg_t *)pg); /* * Update all the CPU lineages in each of PG's CPUs */ PG_CPU_ITR_INIT(pg, cpu_iter); while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { group_t *pgs; pg_cmt_t *cpu_pg; group_iter_t liter; /* Iterator for the lineage */ /* * Iterate over the CPU's PGs updating the children * of the PG being promoted, since they have a new * parent and siblings set. */ pgs = &cpu->cpu_pg->pgs; group_iter_init(&liter); while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) { if (cpu_pg->cmt_parent == pg) { cpu_pg->cmt_parent = pg->cmt_parent; cpu_pg->cmt_siblings = pg->cmt_siblings; } } /* * Update the CPU's lineages */ pgs = &cpu->cpu_pg->cmt_pgs; (void) group_remove(pgs, pg, GRP_NORESIZE); pgs = &cpu->cpu_pg->pgs; (void) group_remove(pgs, pg, GRP_NORESIZE); } } start_cpus(); return (0); } /* * Disable CMT scheduling */ static void pg_cmt_disable(void) { cpu_t *cpu; pause_cpus(NULL); cpu = cpu_list; do { if (cpu->cpu_pg) group_empty(&cpu->cpu_pg->cmt_pgs); } while ((cpu = cpu->cpu_next) != cpu_list); cmt_sched_disabled = 1; start_cpus(); cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable"); } static int pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz) { int i, size; pg_cmt_t *pg, *parent, *pg_bad; cpu_t *cp; pg_cpu_itr_t cpu_iter; ASSERT(MUTEX_HELD(&cpu_lock)); revalidate: size = *sz; pg_bad = NULL; for (i = 0; i < size - 1; i++) { pg = lineage[i]; parent = lineage[i + 1]; /* * We assume that the lineage has already been sorted * by the number of CPUs. In fact, we depend on it. */ ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent)); /* * Walk each of the CPUs in the PGs group, and verify that * the next larger PG contains at least the CPUs in this one. 
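		 * In other words, each PG's CPU set must be a subset of its
		 * parent's; a lineage for which this does not hold is
		 * non-concentric.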
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}
		}
	}

handle_error:
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage.
		 *
		 * This can happen when some of the CPU grouping information
		 * is derived from buggy sources (for example, incorrect ACPI
		 * tables on x86 systems).
		 *
		 * We attempt to recover from this by pruning out the
		 * illegal groupings from the PG hierarchy, which means that
		 * we won't optimize for those levels, but we will for the
		 * remaining ones.
		 *
		 * If a given level has CPUs not found in its parent, then
		 * we examine the PG and its parent to see if either grouping
		 * is enumerated from potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it.  If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)parent)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
				pg_bad = parent;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*FALLTHROUGH*/
	default:
		/*
		 * If we're here, something has gone wrong in trying to
		 * recover from an illegal PG hierarchy, or we've encountered
		 * a validation error for which we don't know how to recover.
		 * In this case, disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		return (-1);
	}
	return (0);
}