/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Basic NUMA support in terms of locality groups * * Solaris needs to know which CPUs, memory, etc. are near each other to * provide good performance on NUMA machines by optimizing for locality. * In order to do this, a new abstraction called a "locality group (lgroup)" * has been introduced to keep track of which CPU-like and memory-like hardware * resources are close to each other. Currently, latency is the only measure * used to determine how to group hardware resources into lgroups, but this * does not limit the groupings to be based solely on latency. Other factors * may be used to determine the groupings in the future. * * Lgroups are organized into a hierarchy or topology that represents the * latency topology of the machine. There is always at least a root lgroup in * the system. It represents all the hardware resources in the machine at a * latency big enough that any hardware resource can at least access any other * hardware resource within that latency. A Uniform Memory Access (UMA) * machine is represented with one lgroup (the root). 
In contrast, a NUMA * machine is represented at least by the root lgroup and some number of leaf * lgroups where the leaf lgroups contain the hardware resources within the * least latency of each other and the root lgroup still contains all the * resources in the machine. Some number of intermediate lgroups may exist * which represent more levels of locality than just the local latency of the * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups * (eg. root and intermediate lgroups) contain the next nearest resources to * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup * to the root lgroup shows the hardware resources from closest to farthest * from the leaf lgroup such that each successive ancestor lgroup contains * the next nearest resources at the next level of locality from the previous. * * The kernel uses the lgroup abstraction to know how to allocate resources * near a given process/thread. At fork() and lwp/thread_create() time, a * "home" lgroup is chosen for a thread. This is done by picking the lgroup * with the lowest load average. Binding to a processor or processor set will * change the home lgroup for a thread. The scheduler has been modified to try * to dispatch a thread on a CPU in its home lgroup. Physical memory * allocation is lgroup aware too, so memory will be allocated from the current * thread's home lgroup if possible. If the desired resources are not * available, the kernel traverses the lgroup hierarchy going to the parent * lgroup to find resources at the next level of locality until it reaches the * root lgroup. 
*/

/*
 * NOTE(review): the header names appear to have been stripped from the
 * #include directives below; restore them from the upstream file.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

lgrp_gen_t lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

/* Statically allocated storage for the root lgroup (see lgrp_root_init()) */
static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory. Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory. Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy. For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		-1
#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
#define	LPL_TOPO_LGRP_MISMATCH			-3
#define	LPL_TOPO_MISSING_PARENT			-4
#define	LPL_TOPO_PARENT_MISMATCH		-5
#define	LPL_TOPO_BAD_CPUCNT			-6
#define	LPL_TOPO_RSET_MISMATCH			-7
#define	LPL_TOPO_LPL_ORPHANED			-8
#define	LPL_TOPO_LPL_BAD_NCPU			-9
#define LPL_TOPO_RSET_MSSNG_LF -10 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 #define LPL_TOPO_NONLEAF_HAS_CPUS -12 #define LPL_TOPO_LGRP_NOT_LEAF -13 #define LPL_TOPO_BAD_RSETCNT -14 /* * Return whether lgroup optimizations should be enabled on this system */ int lgrp_optimizations(void) { /* * System must have more than 2 lgroups to enable lgroup optimizations * * XXX This assumes that a 2 lgroup system has an empty root lgroup * with one child lgroup containing all the resources. A 2 lgroup * system with a root lgroup directly containing CPUs or memory might * need lgroup optimizations with its child lgroup, but there * isn't such a machine for now.... */ if (nlgrps > 2) return (1); return (0); } /* * Build full lgroup topology */ static void lgrp_root_init(void) { lgrp_handle_t hand; int i; lgrp_id_t id; /* * Create the "root" lgroup */ ASSERT(nlgrps == 0); id = nlgrps++; lgrp_root = &lroot; lgrp_root->lgrp_cpu = NULL; lgrp_root->lgrp_mnodes = 0; lgrp_root->lgrp_nmnodes = 0; hand = lgrp_plat_root_hand(); lgrp_root->lgrp_plathand = hand; lgrp_root->lgrp_id = id; lgrp_root->lgrp_cpucnt = 0; lgrp_root->lgrp_childcnt = 0; klgrpset_clear(lgrp_root->lgrp_children); klgrpset_clear(lgrp_root->lgrp_leaves); lgrp_root->lgrp_parent = NULL; lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); for (i = 0; i < LGRP_RSRC_COUNT; i++) klgrpset_clear(lgrp_root->lgrp_set[i]); lgrp_root->lgrp_kstat = NULL; lgrp_table[id] = lgrp_root; /* * Setup initial lpl list for CPU0 and initial t0 home. * The only lpl space we have so far is lpl_bootstrap. It is used for * all topology operations until cp_default is initialized at which * point t0.t_lpl will be updated. 
*/ lpl_bootstrap = lpl_bootstrap_list; t0.t_lpl = lpl_bootstrap; cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; lpl_bootstrap_list[1].lpl_lgrpid = 1; /* * Set up the bootstrap rset * Since the bootstrap toplogy has just the root, and a leaf, * the rset contains just the leaf, and both lpls can use the same rset */ lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1]; lpl_bootstrap_list[0].lpl_rset_sz = 1; lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; lpl_bootstrap_list[1].lpl_rset_sz = 1; lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset; lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset; cp_default.cp_lgrploads = lpl_bootstrap; } /* * Initialize the lgroup framework and allow the platform to do the same */ void lgrp_init(void) { /* * Initialize the platform */ lgrp_plat_init(); /* * Set max number of lgroups supported on this platform which must be * less than the max number of lgroups supported by the common lgroup * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) */ nlgrpsmax = lgrp_plat_max_lgrps(); ASSERT(nlgrpsmax <= NLGRPS_MAX); } /* * Create the root and cpu0's lgroup, and set t0's home. */ void lgrp_setup(void) { /* * Setup the root lgroup */ lgrp_root_init(); /* * Add cpu0 to an lgroup */ lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); } /* * Lgroup initialization is split in two parts. The first part * (lgrp_main_init()) is called right before start_other_cpus() in main. The * second part (lgrp_main_mp_init()) is called right after start_other_cpus() * when all CPUs are brought online and all distance information is available. * * When lgrp_main_init() is complete it sets lgrp_initialized. The * lgrp_main_mp_init() sets lgrp_topo_initialized. */ /* * true when lgrp initialization has been completed. */ int lgrp_initialized = 0; /* * True when lgrp topology is constructed. 
*/ int lgrp_topo_initialized = 0; /* * Init routine called after startup(), /etc/system has been processed, * and cpu0 has been added to an lgroup. */ void lgrp_main_init(void) { cpu_t *cp = CPU; lgrp_id_t lgrpid; int i; extern void pg_cpu0_reinit(); /* * Enforce a valid lgrp_mem_default_policy */ if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) || (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG)) lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; /* * See if mpo should be disabled. * This may happen in the case of null proc LPA on Starcat. * The platform won't be able to detect null proc LPA until after * cpu0 and memory have already been added to lgroups. * When and if it is detected, the Starcat platform will return * a different platform handle for cpu0 which is what we check for * here. If mpo should be disabled move cpu0 to it's rightful place * (the root), and destroy the remaining lgroups. This effectively * provides an UMA lgroup topology. */ lgrpid = cp->cpu_lpl->lpl_lgrpid; if (lgrp_table[lgrpid]->lgrp_plathand != lgrp_plat_cpu_to_hand(cp->cpu_id)) { lgrp_part_del_cpu(cp); lgrp_cpu_fini(cp, lgrpid); lgrp_cpu_init(cp); lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); /* * Notify the PG subsystem that the CPU's lgrp * association has changed */ pg_cpu0_reinit(); /* * Destroy all lgroups except for root */ for (i = 0; i <= lgrp_alloc_max; i++) { if (LGRP_EXISTS(lgrp_table[i]) && lgrp_table[i] != lgrp_root) lgrp_destroy(lgrp_table[i]); } /* * Fix up root to point at itself for leaves and resources * and not have any children */ lgrp_root->lgrp_childcnt = 0; klgrpset_clear(lgrp_root->lgrp_children); klgrpset_clear(lgrp_root->lgrp_leaves); klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); } /* * Initialize kstats framework. 
*/ lgrp_kstat_init(); /* * cpu0 is finally where it should be, so create it's lgroup's kstats */ mutex_enter(&cpu_lock); lgrp_kstat_create(cp); mutex_exit(&cpu_lock); lgrp_plat_main_init(); lgrp_initialized = 1; } /* * Finish lgrp initialization after all CPUS are brought on-line. * This routine is called after start_other_cpus(). */ void lgrp_main_mp_init(void) { klgrpset_t changed; /* * Update lgroup topology (if necessary) */ klgrpset_clear(changed); (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); lgrp_topo_initialized = 1; } /* * Change latency of lgroup with specified lgroup platform handle (if one is * given) or change all lgroups with old latency to new latency */ void lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, u_longlong_t newtime) { lgrp_t *lgrp; int i; for (i = 0; i <= lgrp_alloc_max; i++) { lgrp = lgrp_table[i]; if (!LGRP_EXISTS(lgrp)) continue; if ((hand == LGRP_NULL_HANDLE && lgrp->lgrp_latency == oldtime) || (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) lgrp->lgrp_latency = (int)newtime; } } /* * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) */ void lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) { klgrpset_t changed; cpu_t *cp; lgrp_id_t id; int rc; switch (event) { /* * The following (re)configuration events are common code * initiated. lgrp_plat_config() is called here to inform the * platform of the reconfiguration event. */ case LGRP_CONFIG_CPU_ADD: cp = (cpu_t *)resource; /* * Initialize the new CPU's lgrp related next/prev * links, and give it a bootstrap lpl so that it can * survive should it need to enter the dispatcher. 
*/ cp->cpu_next_lpl = cp; cp->cpu_prev_lpl = cp; cp->cpu_next_lgrp = cp; cp->cpu_prev_lgrp = cp; cp->cpu_lpl = lpl_bootstrap; lgrp_plat_config(event, resource); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_CPU_DEL: lgrp_plat_config(event, resource); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_CPU_ONLINE: cp = (cpu_t *)resource; lgrp_cpu_init(cp); lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); rc = lpl_topo_verify(cp->cpu_part); if (rc != LPL_TOPO_CORRECT) { panic("lpl_topo_verify failed: %d", rc); } lgrp_plat_config(event, resource); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_CPU_OFFLINE: cp = (cpu_t *)resource; id = cp->cpu_lpl->lpl_lgrpid; lgrp_part_del_cpu(cp); lgrp_cpu_fini(cp, id); rc = lpl_topo_verify(cp->cpu_part); if (rc != LPL_TOPO_CORRECT) { panic("lpl_topo_verify failed: %d", rc); } lgrp_plat_config(event, resource); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_CPUPART_ADD: cp = (cpu_t *)resource; lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); rc = lpl_topo_verify(cp->cpu_part); if (rc != LPL_TOPO_CORRECT) { panic("lpl_topo_verify failed: %d", rc); } lgrp_plat_config(event, resource); break; case LGRP_CONFIG_CPUPART_DEL: cp = (cpu_t *)resource; lgrp_part_del_cpu((cpu_t *)resource); rc = lpl_topo_verify(cp->cpu_part); if (rc != LPL_TOPO_CORRECT) { panic("lpl_topo_verify failed: %d", rc); } lgrp_plat_config(event, resource); break; /* * The following events are initiated by the memnode * subsystem. 
*/ case LGRP_CONFIG_MEM_ADD: lgrp_mem_init((int)resource, where, B_FALSE); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_MEM_DEL: lgrp_mem_fini((int)resource, where, B_FALSE); atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_MEM_RENAME: { lgrp_config_mem_rename_t *ren_arg = (lgrp_config_mem_rename_t *)where; lgrp_mem_rename((int)resource, ren_arg->lmem_rename_from, ren_arg->lmem_rename_to); atomic_add_32(&lgrp_gen, 1); break; } case LGRP_CONFIG_GEN_UPDATE: atomic_add_32(&lgrp_gen, 1); break; case LGRP_CONFIG_FLATTEN: if (where == 0) lgrp_topo_levels = (int)resource; else (void) lgrp_topo_flatten(resource, lgrp_table, lgrp_alloc_max, &changed); break; /* * Update any lgroups with old latency to new latency */ case LGRP_CONFIG_LAT_CHANGE_ALL: lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, (u_longlong_t)where); break; /* * Update lgroup with specified lgroup platform handle to have * new latency */ case LGRP_CONFIG_LAT_CHANGE: lgrp_latency_change((lgrp_handle_t)resource, 0, (u_longlong_t)where); break; case LGRP_CONFIG_NOP: break; default: break; } } /* * Called to add lgrp info into cpu structure from cpu_add_unit; * do not assume cpu is in cpu[] yet! * * CPUs are brought online with all other CPUs paused so we can't * allocate memory or we could deadlock the system, so we rely on * the platform to statically allocate as much space as we need * for the lgrp structs and stats. */ static void lgrp_cpu_init(struct cpu *cp) { klgrpset_t changed; int count; lgrp_handle_t hand; int first_cpu; lgrp_t *my_lgrp; lgrp_id_t lgrpid; struct cpu *cptr; /* * This is the first time through if the resource set * for the root lgroup is empty. After cpu0 has been * initially added to an lgroup, the root's CPU resource * set can never be empty, since the system's last CPU * cannot be offlined. */ if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { /* * First time through. 
*/ first_cpu = 1; } else { /* * If cpu0 needs to move lgroups, we may come * through here again, at which time cpu_lock won't * be held, and lgrp_initialized will be false. */ ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); ASSERT(cp->cpu_part != NULL); first_cpu = 0; } hand = lgrp_plat_cpu_to_hand(cp->cpu_id); my_lgrp = lgrp_hand_to_lgrp(hand); if (my_lgrp == NULL) { /* * Create new lgrp and add it to lgroup topology */ my_lgrp = lgrp_create(); my_lgrp->lgrp_plathand = hand; my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); lgrpid = my_lgrp->lgrp_id; klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); count = 0; klgrpset_clear(changed); count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); /* * May have added new intermediate lgroups, so need to add * resources other than CPUs which are added below */ (void) lgrp_mnode_update(changed, NULL); } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) > 0) { /* * Leaf lgroup was created, but latency wasn't available * then. So, set latency for it and fill in rest of lgroup * topology now that we know how far it is from other leaf * lgroups. 
*/ lgrpid = my_lgrp->lgrp_id; klgrpset_clear(changed); if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid)) klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); /* * May have added new intermediate lgroups, so need to add * resources other than CPUs which are added below */ (void) lgrp_mnode_update(changed, NULL); } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], my_lgrp->lgrp_id)) { int i; /* * Update existing lgroup and lgroups containing it with CPU * resource */ lgrpid = my_lgrp->lgrp_id; klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_t *lgrp; lgrp = lgrp_table[i]; if (!LGRP_EXISTS(lgrp) || !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) continue; klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); } } lgrpid = my_lgrp->lgrp_id; cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; /* * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will * end up in lpl for lgroup 0 whether it is supposed to be in there or * not since none of lgroup IDs in the lpl's have been set yet. 
*/ if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) cp->cpu_lpl->lpl_lgrpid = lgrpid; /* * link the CPU into the lgrp's CPU list */ if (my_lgrp->lgrp_cpucnt == 0) { my_lgrp->lgrp_cpu = cp; cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; } else { cptr = my_lgrp->lgrp_cpu; cp->cpu_next_lgrp = cptr; cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; cptr->cpu_prev_lgrp = cp; } my_lgrp->lgrp_cpucnt++; } lgrp_t * lgrp_create(void) { lgrp_t *my_lgrp; lgrp_id_t lgrpid; int i; ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); /* * Find an open slot in the lgroup table and recycle unused lgroup * left there if any */ my_lgrp = NULL; if (lgrp_alloc_hint == -1) /* * Allocate from end when hint not set yet because no lgroups * have been deleted yet */ lgrpid = nlgrps++; else { /* * Start looking for next open slot from hint and leave hint * at slot allocated */ for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { my_lgrp = lgrp_table[i]; if (!LGRP_EXISTS(my_lgrp)) { lgrpid = i; nlgrps++; break; } } lgrp_alloc_hint = lgrpid; } /* * Keep track of max lgroup ID allocated so far to cut down on searches */ if (lgrpid > lgrp_alloc_max) lgrp_alloc_max = lgrpid; /* * Need to allocate new lgroup if next open slot didn't have one * for recycling */ if (my_lgrp == NULL) my_lgrp = lgrp_plat_alloc(lgrpid); if (nlgrps > nlgrpsmax || my_lgrp == NULL) panic("Too many lgrps for platform (%d)", nlgrps); my_lgrp->lgrp_id = lgrpid; my_lgrp->lgrp_latency = 0; my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; my_lgrp->lgrp_parent = NULL; my_lgrp->lgrp_childcnt = 0; my_lgrp->lgrp_mnodes = (mnodeset_t)0; my_lgrp->lgrp_nmnodes = 0; klgrpset_clear(my_lgrp->lgrp_children); klgrpset_clear(my_lgrp->lgrp_leaves); for (i = 0; i < LGRP_RSRC_COUNT; i++) klgrpset_clear(my_lgrp->lgrp_set[i]); my_lgrp->lgrp_cpu = NULL; my_lgrp->lgrp_cpucnt = 0; if (my_lgrp->lgrp_kstat != NULL) lgrp_kstat_reset(lgrpid); lgrp_table[my_lgrp->lgrp_id] = my_lgrp; return (my_lgrp); } void 
lgrp_destroy(lgrp_t *lgrp) { int i; /* * Unless this lgroup is being destroyed on behalf of * the boot CPU, cpu_lock must be held */ ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); if (nlgrps == 1) cmn_err(CE_PANIC, "Can't destroy only lgroup!"); if (!LGRP_EXISTS(lgrp)) return; /* * Set hint to lgroup being deleted and try to keep lower numbered * hints to facilitate finding empty slots */ if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) lgrp_alloc_hint = lgrp->lgrp_id; /* * Mark this lgroup to be recycled by setting its lgroup ID to * LGRP_NONE and clear relevant fields */ lgrp->lgrp_id = LGRP_NONE; lgrp->lgrp_latency = 0; lgrp->lgrp_plathand = LGRP_NULL_HANDLE; lgrp->lgrp_parent = NULL; lgrp->lgrp_childcnt = 0; klgrpset_clear(lgrp->lgrp_children); klgrpset_clear(lgrp->lgrp_leaves); for (i = 0; i < LGRP_RSRC_COUNT; i++) klgrpset_clear(lgrp->lgrp_set[i]); lgrp->lgrp_mnodes = (mnodeset_t)0; lgrp->lgrp_nmnodes = 0; lgrp->lgrp_cpu = NULL; lgrp->lgrp_cpucnt = 0; nlgrps--; } /* * Initialize kstat data. Called from lgrp intialization code. */ static void lgrp_kstat_init(void) { lgrp_stat_t stat; mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); for (stat = 0; stat < LGRP_NUM_STATS; stat++) kstat_named_init(&lgrp_kstat_data[stat], lgrp_kstat_names[stat], KSTAT_DATA_INT64); } /* * initialize an lgrp's kstats if needed * called with cpu_lock held but not with cpus paused. * we don't tear these down now because we don't know about * memory leaving the lgrp yet... 
*/ void lgrp_kstat_create(cpu_t *cp) { kstat_t *lgrp_kstat; lgrp_id_t lgrpid; lgrp_t *my_lgrp; ASSERT(MUTEX_HELD(&cpu_lock)); lgrpid = cp->cpu_lpl->lpl_lgrpid; my_lgrp = lgrp_table[lgrpid]; if (my_lgrp->lgrp_kstat != NULL) return; /* already initialized */ lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", KSTAT_TYPE_NAMED, LGRP_NUM_STATS, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); if (lgrp_kstat != NULL) { lgrp_kstat->ks_lock = &lgrp_kstat_mutex; lgrp_kstat->ks_private = my_lgrp; lgrp_kstat->ks_data = &lgrp_kstat_data; lgrp_kstat->ks_update = lgrp_kstat_extract; my_lgrp->lgrp_kstat = lgrp_kstat; kstat_install(lgrp_kstat); } } /* * this will do something when we manage to remove now unused lgrps */ /* ARGSUSED */ void lgrp_kstat_destroy(cpu_t *cp) { ASSERT(MUTEX_HELD(&cpu_lock)); } /* * Called when a CPU is off-lined. */ static void lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) { lgrp_t *my_lgrp; struct cpu *prev; struct cpu *next; ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); prev = cp->cpu_prev_lgrp; next = cp->cpu_next_lgrp; prev->cpu_next_lgrp = next; next->cpu_prev_lgrp = prev; /* * just because I'm paranoid doesn't mean... 
*/ cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; my_lgrp = lgrp_table[lgrpid]; my_lgrp->lgrp_cpucnt--; /* * Removing last CPU in lgroup, so update lgroup topology */ if (my_lgrp->lgrp_cpucnt == 0) { klgrpset_t changed; int count; int i; my_lgrp->lgrp_cpu = NULL; /* * Remove this lgroup from its lgroup CPU resources and remove * lgroup from lgroup topology if it doesn't have any more * resources in it now */ klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { count = 0; klgrpset_clear(changed); count += lgrp_leaf_delete(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); return; } /* * This lgroup isn't empty, so just remove it from CPU * resources of any lgroups that contain it as such */ for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_t *lgrp; lgrp = lgrp_table[i]; if (!LGRP_EXISTS(lgrp) || !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid)) continue; klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); } return; } if (my_lgrp->lgrp_cpu == cp) my_lgrp->lgrp_cpu = next; } /* * Update memory nodes in target lgroups and return ones that get changed */ int lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) { int count; int i; int j; lgrp_t *lgrp; lgrp_t *lgrp_rsrc; count = 0; if (changed) klgrpset_clear(*changed); if (klgrpset_isempty(target)) return (0); /* * Find each lgroup in target lgroups */ for (i = 0; i <= lgrp_alloc_max; i++) { /* * Skip any lgroups that don't exist or aren't in target group */ lgrp = lgrp_table[i]; if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { continue; } /* * Initialize memnodes for intermediate lgroups to 0 * and update them from scratch since they may have completely * changed */ if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { lgrp->lgrp_mnodes = (mnodeset_t)0; lgrp->lgrp_nmnodes = 0; } /* * Update memory nodes of of target lgroup with memory nodes * from each lgroup in its lgroup memory resource set */ for (j = 0; j <= lgrp_alloc_max; j++) { int k; /* * Skip any 
lgroups that don't exist or aren't in * memory resources of target lgroup */ lgrp_rsrc = lgrp_table[j]; if (!LGRP_EXISTS(lgrp_rsrc) || !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], j)) continue; /* * Update target lgroup's memnodes to include memnodes * of this lgroup */ for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { mnodeset_t mnode_mask; mnode_mask = (mnodeset_t)1 << k; if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && !(lgrp->lgrp_mnodes & mnode_mask)) { lgrp->lgrp_mnodes |= mnode_mask; lgrp->lgrp_nmnodes++; } } count++; if (changed) klgrpset_add(*changed, lgrp->lgrp_id); } } return (count); } /* * Memory copy-rename. Called when the "mnode" containing the kernel cage memory * is moved from one board to another. The "from" and "to" arguments specify the * source and the destination of the move. * * See plat_lgrp_config() for a detailed description of the copy-rename * semantics. * * The lgrp_mem_rename() is called by the platform copy-rename code to update * the lgroup topology which is changing as memory moves from one lgroup to * another. It removes the mnode from the source lgroup and re-inserts it in the * target lgroup. * * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR * copy-rename operation. * * There is one case which requires special handling. If the system contains * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by * lgrp_mem_init), but there is a window when the system has no memory in the * lgroup hierarchy. If another thread tries to allocate memory during this * window, the allocation will fail, although the system has physical memory. * This may cause a system panic or a deadlock (some sleeping memory allocations * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting * the mnode back). 
* * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the * lgrp with non-empty lgrp_mnodes. To deal with the special case above, * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes, * but it updates the rest of the lgroup topology as if the mnode was actually * removed. The lgrp_mem_init() function recognizes that the mnode being * inserted represents such a special case and updates the topology * appropriately. */ void lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to) { /* * Remove the memory from the source node and add it to the destination * node. */ lgrp_mem_fini(mnode, from, B_TRUE); lgrp_mem_init(mnode, to, B_TRUE); } /* * Called to indicate that the lgrp with platform handle "hand" now * contains the memory identified by "mnode". * * LOCKING for this routine is a bit tricky. Usually it is called without * cpu_lock and it must must grab cpu_lock here to prevent racing with other * callers. During DR of the board containing the caged memory it may be called * with cpu_lock already held and CPUs paused. * * If the insertion is part of the DR copy-rename and the inserted mnode (and * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are * dealing with the special case of DR copy-rename described in * lgrp_mem_rename(). */ void lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) { klgrpset_t changed; int count; int i; lgrp_t *my_lgrp; lgrp_id_t lgrpid; mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); boolean_t drop_lock = B_FALSE; boolean_t need_synch = B_FALSE; /* * Grab CPU lock (if we haven't already) */ if (!MUTEX_HELD(&cpu_lock)) { mutex_enter(&cpu_lock); drop_lock = B_TRUE; } /* * This routine may be called from a context where we already * hold cpu_lock, and have already paused cpus. */ if (!cpus_paused()) need_synch = B_TRUE; /* * Check if this mnode is already configured and return immediately if * it is. 
* * NOTE: in special case of copy-rename of the only remaining mnode, * lgrp_mem_fini() refuses to remove the last mnode from the root, so we * recognize this case and continue as usual, but skip the update to * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency * in topology, temporarily introduced by lgrp_mem_fini(). */ if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && lgrp_root->lgrp_mnodes & mnodes_mask) { if (drop_lock) mutex_exit(&cpu_lock); return; } /* * Update lgroup topology with new memory resources, keeping track of * which lgroups change */ count = 0; klgrpset_clear(changed); my_lgrp = lgrp_hand_to_lgrp(hand); if (my_lgrp == NULL) { /* new lgrp */ my_lgrp = lgrp_create(); lgrpid = my_lgrp->lgrp_id; my_lgrp->lgrp_plathand = hand; my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); if (need_synch) pause_cpus(NULL); count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); if (need_synch) start_cpus(); } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) > 0) { /* * Leaf lgroup was created, but latency wasn't available * then. So, set latency for it and fill in rest of lgroup * topology now that we know how far it is from other leaf * lgroups. 
*/ klgrpset_clear(changed); lgrpid = my_lgrp->lgrp_id; if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); if (need_synch) pause_cpus(NULL); count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); if (need_synch) start_cpus(); } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], my_lgrp->lgrp_id)) { /* * Add new lgroup memory resource to existing lgroup */ lgrpid = my_lgrp->lgrp_id; klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); klgrpset_add(changed, lgrpid); count++; for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_t *lgrp; lgrp = lgrp_table[i]; if (!LGRP_EXISTS(lgrp) || !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) continue; klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); klgrpset_add(changed, lgrp->lgrp_id); count++; } } /* * Add memory node to lgroup and remove lgroup from ones that need * to be updated */ if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { my_lgrp->lgrp_mnodes |= mnodes_mask; my_lgrp->lgrp_nmnodes++; } klgrpset_del(changed, lgrpid); /* * Update memory node information for all lgroups that changed and * contain new memory node as a resource */ if (count) (void) lgrp_mnode_update(changed, NULL); if (drop_lock) mutex_exit(&cpu_lock); } /* * Called to indicate that the lgroup associated with the platform * handle "hand" no longer contains given memory node * * LOCKING for this routine is a bit tricky. Usually it is called without * cpu_lock and it must must grab cpu_lock here to prevent racing with other * callers. During DR of the board containing the caged memory it may be called * with cpu_lock already held and CPUs paused. * * If the deletion is part of the DR copy-rename and the deleted mnode is the * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert * the same mnode back into the topology. 
See lgrp_mem_rename() and * lgrp_mem_init() for additional details. */ void lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) { klgrpset_t changed; int count; int i; lgrp_t *my_lgrp; lgrp_id_t lgrpid; mnodeset_t mnodes_mask; boolean_t drop_lock = B_FALSE; boolean_t need_synch = B_FALSE; /* * Grab CPU lock (if we haven't already) */ if (!MUTEX_HELD(&cpu_lock)) { mutex_enter(&cpu_lock); drop_lock = B_TRUE; } /* * This routine may be called from a context where we already * hold cpu_lock and have already paused cpus. */ if (!cpus_paused()) need_synch = B_TRUE; my_lgrp = lgrp_hand_to_lgrp(hand); /* * The lgrp *must* be pre-existing */ ASSERT(my_lgrp != NULL); /* * Delete memory node from lgroups which contain it */ mnodes_mask = ((mnodeset_t)1 << mnode); for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_t *lgrp = lgrp_table[i]; /* * Skip any non-existent lgroups and any lgroups that don't * contain leaf lgroup of memory as a memory resource */ if (!LGRP_EXISTS(lgrp) || !(lgrp->lgrp_mnodes & mnodes_mask)) continue; /* * Avoid removing the last mnode from the root in the DR * copy-rename case. See lgrp_mem_rename() for details. */ if (is_copy_rename && (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) continue; /* * Remove memory node from lgroup. */ lgrp->lgrp_mnodes &= ~mnodes_mask; lgrp->lgrp_nmnodes--; ASSERT(lgrp->lgrp_nmnodes >= 0); } ASSERT(lgrp_root->lgrp_nmnodes > 0); /* * Don't need to update lgroup topology if this lgroup still has memory. * * In the special case of DR copy-rename with the only mnode being * removed, the lgrp_mnodes for the root is always non-zero, but we * still need to update the lgroup topology. 
*/ if ((my_lgrp->lgrp_nmnodes > 0) && !(is_copy_rename && (my_lgrp == lgrp_root) && (my_lgrp->lgrp_mnodes == mnodes_mask))) { if (drop_lock) mutex_exit(&cpu_lock); return; } /* * This lgroup does not contain any memory now */ klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); /* * Remove this lgroup from lgroup topology if it does not contain any * resources now */ lgrpid = my_lgrp->lgrp_id; count = 0; klgrpset_clear(changed); if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { /* * Delete lgroup when no more resources */ if (need_synch) pause_cpus(NULL); count = lgrp_leaf_delete(my_lgrp, lgrp_table, lgrp_alloc_max + 1, &changed); ASSERT(count > 0); if (need_synch) start_cpus(); } else { /* * Remove lgroup from memory resources of any lgroups that * contain it as such */ for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_t *lgrp; lgrp = lgrp_table[i]; if (!LGRP_EXISTS(lgrp) || !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) continue; klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); } } if (drop_lock) mutex_exit(&cpu_lock); } /* * Return lgroup with given platform handle */ lgrp_t * lgrp_hand_to_lgrp(lgrp_handle_t hand) { int i; lgrp_t *lgrp; if (hand == LGRP_NULL_HANDLE) return (NULL); for (i = 0; i <= lgrp_alloc_max; i++) { lgrp = lgrp_table[i]; if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) return (lgrp); } return (NULL); } /* * Return the home lgroup of the current thread. * We must do this with kernel preemption disabled, since we don't want our * thread to be re-homed while we're poking around with its lpl, and the lpl * should never be NULL. * * NOTE: Can't guarantee that lgroup will be valid once kernel preemption * is enabled because of DR. Callers can use disable kernel preemption * around this call to guarantee that the lgroup will be valid beyond this * routine, since kernel preemption can be recursive. 
*/ lgrp_t * lgrp_home_lgrp(void) { lgrp_t *lgrp; lpl_t *lpl; kpreempt_disable(); lpl = curthread->t_lpl; ASSERT(lpl != NULL); ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); lgrp = lgrp_table[lpl->lpl_lgrpid]; kpreempt_enable(); return (lgrp); } /* * Return ID of home lgroup for given thread * (See comments for lgrp_home_lgrp() for special care and handling * instructions) */ lgrp_id_t lgrp_home_id(kthread_t *t) { lgrp_id_t lgrp; lpl_t *lpl; ASSERT(t != NULL); /* * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we * cannot since the HAT layer can call into this routine to * determine the locality for its data structures in the context * of a page fault. */ kpreempt_disable(); lpl = t->t_lpl; ASSERT(lpl != NULL); ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); lgrp = lpl->lpl_lgrpid; kpreempt_enable(); return (lgrp); } /* * Return lgroup containing the physical memory for the given page frame number */ lgrp_t * lgrp_pfn_to_lgrp(pfn_t pfn) { lgrp_handle_t hand; int i; lgrp_t *lgrp; hand = lgrp_plat_pfn_to_hand(pfn); if (hand != LGRP_NULL_HANDLE) for (i = 0; i <= lgrp_alloc_max; i++) { lgrp = lgrp_table[i]; if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) return (lgrp); } return (NULL); } /* * Return lgroup containing the physical memory for the given page frame number */ lgrp_t * lgrp_phys_to_lgrp(u_longlong_t physaddr) { lgrp_handle_t hand; int i; lgrp_t *lgrp; pfn_t pfn; pfn = btop(physaddr); hand = lgrp_plat_pfn_to_hand(pfn); if (hand != LGRP_NULL_HANDLE) for (i = 0; i <= lgrp_alloc_max; i++) { lgrp = lgrp_table[i]; if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) return (lgrp); } return (NULL); } /* * Return the leaf lgroup containing the given CPU * * The caller needs to take precautions necessary to prevent * "cpu", and it's lpl from going away across a call to this function. 
* hint: kpreempt_disable()/kpreempt_enable() */ static lgrp_t * lgrp_cpu_to_lgrp(cpu_t *cpu) { return (cpu->cpu_lpl->lpl_lgrp); } /* * Return the sum of the partition loads in an lgrp divided by * the number of CPUs in the lgrp. This is our best approximation * of an 'lgroup load average' for a useful per-lgroup kstat. */ static uint64_t lgrp_sum_loadavgs(lgrp_t *lgrp) { cpu_t *cpu; int ncpu; uint64_t loads = 0; mutex_enter(&cpu_lock); cpu = lgrp->lgrp_cpu; ncpu = lgrp->lgrp_cpucnt; if (cpu == NULL || ncpu == 0) { mutex_exit(&cpu_lock); return (0ull); } do { loads += cpu->cpu_lpl->lpl_loadavg; cpu = cpu->cpu_next_lgrp; } while (cpu != lgrp->lgrp_cpu); mutex_exit(&cpu_lock); return (loads / ncpu); } void lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) { struct lgrp_stats *pstats; /* * Verify that the caller isn't trying to add to * a statistic for an lgroup that has gone away */ if (lgrpid < 0 || lgrpid > lgrp_alloc_max) return; pstats = &lgrp_stats[lgrpid]; atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); } int64_t lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) { uint64_t val; struct lgrp_stats *pstats; if (lgrpid < 0 || lgrpid > lgrp_alloc_max) return ((int64_t)0); pstats = &lgrp_stats[lgrpid]; LGRP_STAT_READ(pstats, stat, val); return (val); } /* * Reset all kstats for lgrp specified by its lgrpid. */ static void lgrp_kstat_reset(lgrp_id_t lgrpid) { lgrp_stat_t stat; if (lgrpid < 0 || lgrpid > lgrp_alloc_max) return; for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); } } /* * Collect all per-lgrp statistics for the lgrp associated with this * kstat, and store them in the ks_data array. * * The superuser can reset all the running counter statistics for an * lgrp by writing to any of the lgrp's stats. 
*/ static int lgrp_kstat_extract(kstat_t *ksp, int rw) { lgrp_stat_t stat; struct kstat_named *ksd; lgrp_t *lgrp; lgrp_id_t lgrpid; lgrp = (lgrp_t *)ksp->ks_private; ksd = (struct kstat_named *)ksp->ks_data; ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); lgrpid = lgrp->lgrp_id; if (lgrpid == LGRP_NONE) { /* * Return all zeroes as stats for freed lgrp. */ for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { ksd[stat].value.i64 = 0; } ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; ksd[stat + LGRP_LOADAVG].value.i64 = 0; } else if (rw != KSTAT_WRITE) { /* * Handle counter stats */ for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); } /* * Handle kernel data snapshot stats */ ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); ksd[stat + LGRP_NUM_PG_FREE].value.i64 = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = lgrp_loadavg_max_effect; } else { lgrp_kstat_reset(lgrpid); } return (0); } int lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) { cpu_t *cp; mutex_enter(&cpu_lock); if ((cp = cpu_get(id)) == NULL) { mutex_exit(&cpu_lock); return (EINVAL); } if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { mutex_exit(&cpu_lock); return (EINVAL); } ASSERT(cp->cpu_lpl != NULL); *lp = cp->cpu_lpl->lpl_lgrpid; mutex_exit(&cpu_lock); return (0); } int lgrp_query_load(processorid_t id, lgrp_load_t *lp) { cpu_t *cp; mutex_enter(&cpu_lock); if ((cp = cpu_get(id)) == NULL) { mutex_exit(&cpu_lock); return (EINVAL); } ASSERT(cp->cpu_lpl != NULL); *lp = cp->cpu_lpl->lpl_loadavg; mutex_exit(&cpu_lock); return (0); } /* 
* Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL. (This list is required to be NULL
 * terminated, too). This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order. We hope this
 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
 */
void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int	slot;
	int	pos;

	/*
	 * Find the insertion point: the first entry whose lgroup ID is
	 * larger than the leaf's. If the leaf shows up before that point
	 * there is nothing to add.
	 */
	for (slot = 0; slot < lpl_target->lpl_nrset; slot++) {
		lpl_t	*entry = lpl_target->lpl_rset[slot];

		if (entry == lpl_leaf)
			return;

		if (entry->lpl_lgrpid > lpl_leaf->lpl_lgrpid)
			break;
	}

	/*
	 * Make room by moving every entry from the old end of the array down
	 * to the insertion point up by one, keeping the lgrp id => rset index
	 * mapping in step. The ordering of the existing entries is unchanged.
	 */
	pos = lpl_target->lpl_nrset++;
	for (pos = pos - 1; pos >= slot; pos--) {
		lpl_t	*moved = lpl_target->lpl_rset[pos];

		lpl_target->lpl_rset[pos + 1] = moved;
		lpl_target->lpl_id2rset[moved->lpl_lgrpid] = pos + 1;
	}

	/* drop the leaf into the gap and account for its cpus */
	lpl_target->lpl_rset[slot] = lpl_leaf;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = slot;

	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}

/*
 * Update each of lpl_parent's children with a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 * This should be called after any potential change in lpl_parent's
 * rset.
*/ static void lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) { klgrpset_t children; int i; children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; if (klgrpset_isempty(children)) return; /* nothing to do */ for (i = 0; i <= lgrp_alloc_max; i++) { if (klgrpset_ismember(children, i)) { /* * (Re)set the parent. It may be incorrect if * lpl_parent is new in the topology. */ cp->cp_lgrploads[i].lpl_parent = lpl_parent; } } } /* * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. * * This routine also adjusts ncpu and nrset if the call succeeds in deleting a * resource. The values are adjusted here, as this is the only place that we can * be certain a resource was successfully deleted. */ void lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) { int i; lpl_t *leaf; if (lpl_target->lpl_nrset == 0) return; /* find leaf in intermediate node */ for (i = 0; i < lpl_target->lpl_nrset; i++) { if (lpl_target->lpl_rset[i] == lpl_leaf) break; } /* return if leaf not found */ if (lpl_target->lpl_rset[i] != lpl_leaf) return; /* prune leaf, compress array */ lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1; lpl_target->lpl_ncpu--; do { lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; /* * Update the lgrp id <=> rset mapping */ if ((leaf = lpl_target->lpl_rset[i]) != NULL) { lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i; } } while (i++ < lpl_target->lpl_nrset); } /* * Check to see if the resource set of the target lpl contains the * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. */ int lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) { int i; for (i = 0; i < lpl_target->lpl_nrset; i++) { if (lpl_target->lpl_rset[i] == lpl_leaf) return (1); } return (0); } /* * Called when we change cpu lpl membership. This increments or decrements the * per-cpu counter in every lpl in which our leaf appears. 
*/ void lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) { cpupart_t *cpupart; lgrp_t *lgrp_leaf; lgrp_t *lgrp_cur; lpl_t *lpl_leaf; lpl_t *lpl_cur; int i; ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); cpupart = cp->cpu_part; lpl_leaf = cp->cpu_lpl; lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_cur = lgrp_table[i]; /* * Don't adjust if the lgrp isn't there, if we're the leaf lpl * for the cpu in question, or if the current lgrp and leaf * don't share the same resources. */ if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) continue; lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; if (lpl_cur->lpl_nrset > 0) { if (act == LPL_INCREMENT) { lpl_cur->lpl_ncpu++; } else if (act == LPL_DECREMENT) { lpl_cur->lpl_ncpu--; } } } } /* * Initialize lpl with given resources and specified lgrp */ void lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) { lpl->lpl_lgrpid = lgrp->lgrp_id; lpl->lpl_loadavg = 0; if (lpl == lpl_leaf) lpl->lpl_ncpu = 1; else lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; lpl->lpl_nrset = 1; lpl->lpl_rset[0] = lpl_leaf; lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0; lpl->lpl_lgrp = lgrp; lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ } /* * Clear an unused lpl */ void lpl_clear(lpl_t *lpl) { /* * Clear out all fields in the lpl except: * lpl_lgrpid - to facilitate debugging * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size * * Note that the lpl's rset and id2rset mapping are cleared as well. 
*/ lpl->lpl_loadavg = 0; lpl->lpl_ncpu = 0; lpl->lpl_lgrp = NULL; lpl->lpl_parent = NULL; lpl->lpl_cpus = NULL; lpl->lpl_nrset = 0; lpl->lpl_homed_time = 0; bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz); bzero(lpl->lpl_id2rset, sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz); } /* * Given a CPU-partition, verify that the lpl topology in the CPU-partition * is in sync with the lgroup toplogy in the system. The lpl topology may not * make full use of all of the lgroup topology, but this checks to make sure * that for the parts that it does use, it has correctly understood the * relationships that exist. This function returns * 0 if the topology is correct, and a non-zero error code, for non-debug * kernels if incorrect. Asserts are spread throughout the code to aid in * debugging on a DEBUG kernel. */ int lpl_topo_verify(cpupart_t *cpupart) { lgrp_t *lgrp; lpl_t *lpl; klgrpset_t rset; klgrpset_t cset; cpu_t *cpu; cpu_t *cp_start; int i; int j; int sum; /* topology can't be incorrect if it doesn't exist */ if (!lgrp_topo_initialized || !lgrp_initialized) return (LPL_TOPO_CORRECT); ASSERT(cpupart != NULL); for (i = 0; i <= lgrp_alloc_max; i++) { lgrp = lgrp_table[i]; lpl = NULL; /* make sure lpls are allocated */ ASSERT(cpupart->cp_lgrploads); if (!cpupart->cp_lgrploads) return (LPL_TOPO_PART_HAS_NO_LPL); lpl = &cpupart->cp_lgrploads[i]; /* make sure our index is good */ ASSERT(i < cpupart->cp_nlgrploads); /* if lgroup doesn't exist, make sure lpl is empty */ if (!LGRP_EXISTS(lgrp)) { ASSERT(lpl->lpl_ncpu == 0); if (lpl->lpl_ncpu > 0) { return (LPL_TOPO_CPUS_NOT_EMPTY); } else { continue; } } /* verify that lgroup and lpl are identically numbered */ ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); /* if lgroup isn't in our partition, make sure lpl is empty */ if (!klgrpset_intersects(lgrp->lgrp_leaves, cpupart->cp_lgrpset)) { ASSERT(lpl->lpl_ncpu == 0); if (lpl->lpl_ncpu > 0) { return (LPL_TOPO_CPUS_NOT_EMPTY); } /* * lpl is empty, and lgroup isn't in 
partition. verify * that lpl doesn't show up in anyone else's rsets (in * this partition, anyway) */ for (j = 0; j < cpupart->cp_nlgrploads; j++) { lpl_t *i_lpl; /* lpl we're iterating over */ i_lpl = &cpupart->cp_lgrploads[j]; ASSERT(!lpl_rset_contains(i_lpl, lpl)); if (lpl_rset_contains(i_lpl, lpl)) { return (LPL_TOPO_LPL_ORPHANED); } } /* lgroup is empty, and everything is ok. continue */ continue; } /* lgroup is in this partition, now check it against lpl */ /* do both have matching lgrps? */ ASSERT(lgrp == lpl->lpl_lgrp); if (lgrp != lpl->lpl_lgrp) { return (LPL_TOPO_LGRP_MISMATCH); } /* do the parent lgroups exist and do they match? */ if (lgrp->lgrp_parent) { ASSERT(lpl->lpl_parent); ASSERT(lgrp->lgrp_parent->lgrp_id == lpl->lpl_parent->lpl_lgrpid); if (!lpl->lpl_parent) { return (LPL_TOPO_MISSING_PARENT); } else if (lgrp->lgrp_parent->lgrp_id != lpl->lpl_parent->lpl_lgrpid) { return (LPL_TOPO_PARENT_MISMATCH); } } /* only leaf lgroups keep a cpucnt, only check leaves */ if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { /* verify that lgrp is also a leaf */ ASSERT((lgrp->lgrp_childcnt == 0) && (klgrpset_ismember(lgrp->lgrp_leaves, lpl->lpl_lgrpid))); if ((lgrp->lgrp_childcnt > 0) || (!klgrpset_ismember(lgrp->lgrp_leaves, lpl->lpl_lgrpid))) { return (LPL_TOPO_LGRP_NOT_LEAF); } ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && (lpl->lpl_ncpu > 0)); if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || (lpl->lpl_ncpu <= 0)) { return (LPL_TOPO_BAD_CPUCNT); } /* * Check that lpl_ncpu also matches the number of * cpus in the lpl's linked list. This only exists in * leaves, but they should always match. 
*/ j = 0; cpu = cp_start = lpl->lpl_cpus; while (cpu != NULL) { j++; /* check to make sure cpu's lpl is leaf lpl */ ASSERT(cpu->cpu_lpl == lpl); if (cpu->cpu_lpl != lpl) { return (LPL_TOPO_CPU_HAS_BAD_LPL); } /* check next cpu */ if ((cpu = cpu->cpu_next_lpl) != cp_start) { continue; } else { cpu = NULL; } } ASSERT(j == lpl->lpl_ncpu); if (j != lpl->lpl_ncpu) { return (LPL_TOPO_LPL_BAD_NCPU); } /* * Also, check that leaf lpl is contained in all * intermediate lpls that name the leaf as a descendant */ for (j = 0; j <= lgrp_alloc_max; j++) { klgrpset_t intersect; lgrp_t *lgrp_cand; lpl_t *lpl_cand; lgrp_cand = lgrp_table[j]; intersect = klgrpset_intersects( lgrp_cand->lgrp_set[LGRP_RSRC_CPU], cpupart->cp_lgrpset); if (!LGRP_EXISTS(lgrp_cand) || !klgrpset_intersects(lgrp_cand->lgrp_leaves, cpupart->cp_lgrpset) || (intersect == 0)) continue; lpl_cand = &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; if (klgrpset_ismember(intersect, lgrp->lgrp_id)) { ASSERT(lpl_rset_contains(lpl_cand, lpl)); if (!lpl_rset_contains(lpl_cand, lpl)) { return (LPL_TOPO_RSET_MSSNG_LF); } } } } else { /* non-leaf specific checks */ /* * Non-leaf lpls should have lpl_cpus == NULL * verify that this is so */ ASSERT(lpl->lpl_cpus == NULL); if (lpl->lpl_cpus != NULL) { return (LPL_TOPO_NONLEAF_HAS_CPUS); } /* * verify that the sum of the cpus in the leaf resources * is equal to the total ncpu in the intermediate */ for (j = sum = 0; j < lpl->lpl_nrset; j++) { sum += lpl->lpl_rset[j]->lpl_ncpu; } ASSERT(sum == lpl->lpl_ncpu); if (sum != lpl->lpl_ncpu) { return (LPL_TOPO_LPL_BAD_NCPU); } } /* * Check the rset of the lpl in question. Make sure that each * rset contains a subset of the resources in * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes * sure that each rset doesn't include resources that are * outside of that set. (Which would be resources somehow not * accounted for). 
*/ klgrpset_clear(rset); for (j = 0; j < lpl->lpl_nrset; j++) { klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); } klgrpset_copy(cset, rset); /* make sure lpl rset matches lgrp rset */ klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); /* make sure rset is contained with in partition, too */ klgrpset_diff(cset, cpupart->cp_lgrpset); ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset)); if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) { return (LPL_TOPO_RSET_MISMATCH); } /* * check to make sure lpl_nrset matches the number of rsets * contained in the lpl */ for (j = 0; j < lpl->lpl_nrset; j++) { if (lpl->lpl_rset[j] == NULL) break; } ASSERT(j == lpl->lpl_nrset); if (j != lpl->lpl_nrset) { return (LPL_TOPO_BAD_RSETCNT); } } return (LPL_TOPO_CORRECT); } /* * Flatten lpl topology to given number of levels. This is presently only * implemented for a flatten to 2 levels, which will prune out the intermediates * and home the leaf lpls to the root lpl. */ int lpl_topo_flatten(int levels) { int i; uint_t sum; lgrp_t *lgrp_cur; lpl_t *lpl_cur; lpl_t *lpl_root; cpupart_t *cp; if (levels != 2) return (0); /* called w/ cpus paused - grab no locks! */ ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || !lgrp_initialized); cp = cp_list_head; do { lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); for (i = 0; i <= lgrp_alloc_max; i++) { lgrp_cur = lgrp_table[i]; lpl_cur = &cp->cp_lgrploads[i]; if ((lgrp_cur == lgrp_root) || (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu == 0))) continue; if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { /* * this should be a deleted intermediate, so * clear it */ lpl_clear(lpl_cur); } else if ((lpl_cur->lpl_nrset == 1) && (lpl_cur->lpl_rset[0] == lpl_cur) && ((lpl_cur->lpl_parent->lpl_ncpu == 0) || (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { /* * this is a leaf whose parent was deleted, or * whose parent had their lgrp deleted. 
(And * whose parent will soon be deleted). Point * this guy back to the root lpl. */ lpl_cur->lpl_parent = lpl_root; lpl_rset_add(lpl_root, lpl_cur); } } /* * Now that we're done, make sure the count on the root lpl is * correct, and update the hints of the children for the sake of * thoroughness */ for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { sum += lpl_root->lpl_rset[i]->lpl_ncpu; } lpl_root->lpl_ncpu = sum; lpl_child_update(lpl_root, cp); cp = cp->cp_next; } while (cp != cp_list_head); return (levels); } /* * Insert a lpl into the resource hierarchy and create any additional lpls that * are necessary to represent the varying states of locality for the cpu * resoruces newly added to the partition. * * This routine is clever enough that it can correctly add resources from the * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, * those for which the lpl is a leaf as opposed to simply a named equally local * resource). The one special case that needs additional processing is when a * new intermediate lpl is introduced. Since the main loop only traverses * looking to add the leaf resource where it does not yet exist, additional work * is necessary to add other leaf resources that may need to exist in the newly * created intermediate. This is performed by the second inner loop, and is * only done when the check for more than one overlapping resource succeeds. 
*/
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/* skip lgroups that aren't there */
		if (!LGRP_EXISTS(lgrp_cur))
			continue;

		/* skip lgroups that don't contain the leaf as a cpu resource */
		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid))
			continue;

		/* skip lgroups without any leaves in this partition */
		if (!klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		/*
		 * The lpl's parent mirrors the lgroup's parent; an lgroup
		 * without a parent gets a NULL parent pointer.
		 */
		if (lgrp_cur->lgrp_parent == NULL) {
			lpl_parent = NULL;
		} else {
			lpl_parent = &cpupart->cp_lgrploads[
			    lgrp_cur->lgrp_parent->lgrp_id];
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * The leaf itself was set up elsewhere; only its
			 * parent pointer remains to be filled in.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		/* (re)build this lpl around the new leaf */
		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * More than one resource overlaps: add the lpl of
			 * every existing lgroup named in the intersection.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (LGRP_EXISTS(lgrp_cand) &&
				    klgrpset_ismember(rset_intersect,
				    lgrp_cand->lgrp_id)) {
					lpl_cand = &cpupart->cp_lgrploads[
					    lgrp_cand->lgrp_id];
					lpl_rset_add(lpl_cur, lpl_cand);
				}
			}
		}
		/*
		 * This lpl's rset has changed. Update the hint in its
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}

/*
 * remove a lpl from the hierarchy of resources, clearing its state when
 * finished. If the lpls at the intermediate levels of the hierarchy have no
 * remaining resources, or no longer name a leaf resource in the cpu-partition,
 * delete them as well.
 */
void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	klgrpset_t	leaf_intersect;	/* intersection of leaves */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/* lgroups that aren't there have nothing to remove from */
		if (!LGRP_EXISTS(lgrp_cur))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		/* lgroups that don't contain our leaf are left alone */
		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid))
			continue;

		/* the leaf itself is cleared once the loop is done */
		if (lpl_cur == lpl_leaf)
			continue;

		/*
		 * This is a slightly sleazy simplification in that we have
		 * already marked the cp_lgrpset as no longer containing the
		 * leaf we've deleted. Any lpls that pass the above checks
		 * based upon lgrp membership but not necessarily cpu-part
		 * membership also get cleared by the checks below. Currently
		 * this is harmless, as the lpls should be empty anyway.
		 *
		 * In particular, we want to preserve lpls that have additional
		 * leaf resources, even though we don't yet have a processor
		 * architecture that represents resources this way.
		 */
		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset);

		lpl_rset_del(lpl_cur, lpl_leaf);
		if ((lpl_cur->lpl_nrset != 0) && leaf_intersect) {
			/*
			 * Still in use: update this lpl's children
			 */
			lpl_child_update(lpl_cur, cpupart);
		} else {
			lpl_clear(lpl_cur);
		}
	}
	lpl_clear(lpl_leaf);
}

/*
 * add a cpu to a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
* There are two general cases for cpu addition:
 *
 * 1. A lpl structure that contains resources already in the hierarchy tree.
 * In this case, all of the associated lpl relationships have been defined, and
 * all that is necessary is that we link the new cpu into the per-lpl list of
 * cpus, and increment the ncpu count of all places where this cpu resource will
 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
 * pushing is accomplished by this routine.
 *
 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
 * not exist yet. In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
 * resources, if they should exist. The leaf structure is initialized by this
 * routine, as is the cpu-partition state for the lgrp membership. This routine
 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 * and builds all of the "ancestral" state necessary to identify resources at
 * differing levels of locality.
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lpl_t		*lpl_leaf;
	cpu_t		*head;
	cpu_t		*tail;
	int		first_cpu;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	cpupart = cp->cpu_part;
	lgrp_leaf = lgrp_table[lgrpid];

	/* don't add non-existent lgrp */
	ASSERT(LGRP_EXISTS(lgrp_leaf));
	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
	cp->cpu_lpl = lpl_leaf;

	/* only leaf lpls contain cpus */
	first_cpu = (lpl_leaf->lpl_ncpu == 0);
	lpl_leaf->lpl_ncpu++;

	if (first_cpu) {
		/*
		 * The leaf isn't in this partition yet: set it up, mark the
		 * lgroup in the partition's set and build its ancestors.
		 */
		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
		lpl_leaf_insert(lpl_leaf, cpupart);
	} else {
		/*
		 * the lpl should already exist in the parent, so just update
		 * the count of available CPUs
		 */
		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
	}

	/* link cpu into list of cpus in lpl */
	head = lpl_leaf->lpl_cpus;
	if (head == NULL) {
		/*
		 * We increment ncpu immediately after we create a new leaf
		 * lpl, so assert that ncpu == 1 for the case where we don't
		 * have any cpu pointers yet.
		 */
		ASSERT(lpl_leaf->lpl_ncpu == 1);
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lpl = cp;
		lpl_leaf->lpl_cpus = cp;
	} else {
		/* insert in front of the head, i.e. behind the old tail */
		tail = head->cpu_prev_lpl;
		cp->cpu_next_lpl = head;
		cp->cpu_prev_lpl = tail;
		tail->cpu_next_lpl = cp;
		head->cpu_prev_lpl = cp;
	}
}

/*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf. (Another cpu still exists at this level of locality). In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the per-cpu lpl list.
 *
 * 2. Removal of the resource results in the lpl containing no resources. (It's
 * empty) In this case, all of what has occurred for the first step must take
 * place; however, additionally we must remove the lpl structure itself, prune
 * out any stranded lpls that do not directly name a leaf resource, and mark the
 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted. Cpu-partition changes are handled by this
 * method, but the lpl_leaf_remove function deals with the details of pruning
 * out the empty lpl and any of its orphaned direct ancestors.
 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
	lpl_t	*leaf_lpl;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	leaf_lpl = cp->cpu_lpl;

	/* don't delete a leaf that isn't there */
	ASSERT(LGRP_EXISTS(leaf_lpl->lpl_lgrp));

	/* no double-deletes */
	ASSERT(leaf_lpl->lpl_ncpu);

	leaf_lpl->lpl_ncpu--;
	if (leaf_lpl->lpl_ncpu != 0) {
		/* unlink cpu from lists of cpus in lpl */
		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
		if (leaf_lpl->lpl_cpus == cp) {
			leaf_lpl->lpl_cpus = cp->cpu_next_lpl;
		}

		/*
		 * Update the cpu count in the lpls associated with parent
		 * lgroups.
		 */
		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
	} else {
		/*
		 * This was the last cpu in this lgroup for this partition,
		 * clear its bit in the partition's lgroup bitmask
		 */
		klgrpset_del(cp->cpu_part->cp_lgrpset, leaf_lpl->lpl_lgrpid);

		/* eliminate remaining lpl link pointers in cpu, lpl */
		cp->cpu_prev_lpl = NULL;
		cp->cpu_next_lpl = NULL;
		leaf_lpl->lpl_cpus = NULL;

		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
	}

	/* clear cpu's lpl ptr when we're all done */
	cp->cpu_lpl = NULL;
}

/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request).
Since the clock thread can not be preempted (since it * runs at highest priority), we know that cpu partitions can not change * (since doing so would require either the repartition requester or the * cpu_pause thread to run on this cpu), so we can update the cpu's load * without grabbing cpu_lock. */ void lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) { uint_t ncpu; int64_t old, new, f; /* * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... */ static short expval[] = { 0, 3196, 1618, 1083, 814, 652, 543, 466, 408, 363, 326, 297, 272, 251, 233, 218, 204, 192, 181, 172, 163, 155, 148, 142, 136, 130, 125, 121, 116, 112, 109, 105 }; /* ASSERT (called from clock level) */ if ((lpl == NULL) || /* we're booting - this is easiest for now */ ((ncpu = lpl->lpl_ncpu) == 0)) { return; } for (;;) { if (ncpu >= sizeof (expval) / sizeof (expval[0])) f = expval[1]/ncpu; /* good approx. for large ncpu */ else f = expval[ncpu]; /* * Modify the load average atomically to avoid losing * anticipatory load updates (see lgrp_move_thread()). */ if (ageflag) { /* * We're supposed to both update and age the load. * This happens 10 times/sec. per cpu. We do a * little hoop-jumping to avoid integer overflow. */ int64_t q, r; do { old = new = lpl->lpl_loadavg; q = (old >> 16) << 7; r = (old & 0xffff) << 7; new += ((long long)(nrcpus - q) * f - ((r * f) >> 16)) >> 7; /* * Check for overflow */ if (new > LGRP_LOADAVG_MAX) new = LGRP_LOADAVG_MAX; else if (new < 0) new = 0; } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, new) != old); } else { /* * We're supposed to update the load, but not age it. * This option is used to update the load (which either * has already been aged in this 1/10 sec. interval or * soon will be) to account for a remotely executing * thread. 
*/ do { old = new = lpl->lpl_loadavg; new += f; /* * Check for overflow * Underflow not possible here */ if (new < old) new = LGRP_LOADAVG_MAX; } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, new) != old); } /* * Do the same for this lpl's parent */ if ((lpl = lpl->lpl_parent) == NULL) break; ncpu = lpl->lpl_ncpu; } } /* * Initialize lpl topology in the target based on topology currently present in * lpl_bootstrap. * * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to * initialize cp_default list of lpls. Up to this point all topology operations * were performed using lpl_bootstrap. Now cp_default has its own list of lpls * and all subsequent lpl operations should use it instead of lpl_bootstrap. The * `target' points to the list of lpls in cp_default and `size' is the size of * this list. * * This function walks the lpl topology in lpl_bootstrap and does for things: * * 1) Copies all fields from lpl_bootstrap to the target. * * 2) Sets CPU0 lpl pointer to the correct element of the target list. * * 3) Updates lpl_parent pointers to point to the lpls in the target list * instead of lpl_bootstrap. * * 4) Updates pointers in the resource list of the target to point to the lpls * in the target list instead of lpl_bootstrap. * * After lpl_topo_bootstrap() completes, target contains the same information * that would be present there if it were used during boot instead of * lpl_bootstrap. There is no need in information in lpl_bootstrap after this * and it is bzeroed. */ void lpl_topo_bootstrap(lpl_t *target, int size) { lpl_t *lpl = lpl_bootstrap; lpl_t *target_lpl = target; lpl_t **rset; int *id2rset; int sz; int howmany; int id; int i; /* * The only target that should be passed here is cp_default lpl list. 
*/ ASSERT(target == cp_default.cp_lgrploads); ASSERT(size == cp_default.cp_nlgrploads); ASSERT(!lgrp_topo_initialized); ASSERT(ncpus == 1); howmany = MIN(LPL_BOOTSTRAP_SIZE, size); for (i = 0; i < howmany; i++, lpl++, target_lpl++) { /* * Copy all fields from lpl, except for the rset, * lgrp id <=> rset mapping storage, * and amount of storage */ rset = target_lpl->lpl_rset; id2rset = target_lpl->lpl_id2rset; sz = target_lpl->lpl_rset_sz; *target_lpl = *lpl; target_lpl->lpl_rset_sz = sz; target_lpl->lpl_rset = rset; target_lpl->lpl_id2rset = id2rset; /* * Substitute CPU0 lpl pointer with one relative to target. */ if (lpl->lpl_cpus == CPU) { ASSERT(CPU->cpu_lpl == lpl); CPU->cpu_lpl = target_lpl; } /* * Substitute parent information with parent relative to target. */ if (lpl->lpl_parent != NULL) target_lpl->lpl_parent = (lpl_t *) (((uintptr_t)lpl->lpl_parent - (uintptr_t)lpl_bootstrap) + (uintptr_t)target); /* * Walk over resource set substituting pointers relative to * lpl_bootstrap's rset to pointers relative to target's */ ASSERT(lpl->lpl_nrset <= 1); for (id = 0; id < lpl->lpl_nrset; id++) { if (lpl->lpl_rset[id] != NULL) { target_lpl->lpl_rset[id] = (lpl_t *) (((uintptr_t)lpl->lpl_rset[id] - (uintptr_t)lpl_bootstrap) + (uintptr_t)target); } target_lpl->lpl_id2rset[id] = lpl->lpl_id2rset[id]; } } /* * Clean up the bootstrap lpls since we have switched over to the * actual lpl array in the default cpu partition. * * We still need to keep one empty lpl around for newly starting * slave CPUs to reference should they need to make it through the * dispatcher prior to their lgrp/lpl initialization. * * The lpl related dispatcher code has been designed to work properly * (and without extra checks) for this special case of a zero'ed * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl * with lgrpid 0 and an empty resource set. Iteration over the rset * array by the dispatcher is also NULL terminated for this reason. 
* * This provides the desired behaviour for an uninitialized CPU. * It shouldn't see any other CPU to either dispatch to or steal * from until it is properly initialized. */ bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset)); bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset)); lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; } /* * If the lowest load among the lgroups a process' threads are currently * spread across is greater than lgrp_expand_proc_thresh, we'll consider * expanding the process to a new lgroup. */ #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; #define LGRP_EXPAND_PROC_THRESH(ncpu) \ ((lgrp_expand_proc_thresh) / (ncpu)) /* * A process will be expanded to a new lgroup only if the difference between * the lowest load on the lgroups the process' thread's are currently spread * across and the lowest load on the other lgroups in the process' partition * is greater than lgrp_expand_proc_diff. */ #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; #define LGRP_EXPAND_PROC_DIFF(ncpu) \ ((lgrp_expand_proc_diff) / (ncpu)) /* * The loadavg tolerance accounts for "noise" inherent in the load, which may * be present due to impreciseness of the load average decay algorithm. * * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable * tolerance is scaled by the number of cpus in the lgroup just like * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads * of: 0x10000 / 4 => 0x4000 or greater to be significant. 
*/ uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; #define LGRP_LOADAVG_TOLERANCE(ncpu) \ ((lgrp_loadavg_tolerance) / ncpu) /* * lgrp_choose() will choose root lgroup as home when lowest lgroup load * average is above this threshold */ uint32_t lgrp_load_thresh = UINT32_MAX; /* * lgrp_choose() will try to skip any lgroups with less memory * than this free when choosing a home lgroup */ pgcnt_t lgrp_mem_free_thresh = 0; /* * When choosing between similarly loaded lgroups, lgrp_choose() will pick * one based on one of the following policies: * - Random selection * - Pseudo round robin placement * - Longest time since a thread was last placed */ #define LGRP_CHOOSE_RANDOM 1 #define LGRP_CHOOSE_RR 2 #define LGRP_CHOOSE_TIME 3 int lgrp_choose_policy = LGRP_CHOOSE_TIME; /* * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to * be bound to a CPU or processor set. * * Arguments: * t The thread * cpupart The partition the thread belongs to. * * NOTE: Should at least be called with the cpu_lock held, kernel preemption * disabled, or thread_lock held (at splhigh) to protect against the CPU * partitions changing out from under us and assumes that given thread is * protected. Also, called sometimes w/ cpus paused or kernel preemption * disabled, so don't grab any locks because we should never block under * those conditions. 
*/ lpl_t * lgrp_choose(kthread_t *t, cpupart_t *cpupart) { lgrp_load_t bestload, bestrload; int lgrpid_offset, lgrp_count; lgrp_id_t lgrpid, lgrpid_start; lpl_t *lpl, *bestlpl, *bestrlpl; klgrpset_t lgrpset; proc_t *p; ASSERT(t != NULL); ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || THREAD_LOCK_HELD(t)); ASSERT(cpupart != NULL); p = t->t_procp; /* A process should always be in an active partition */ ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); bestlpl = bestrlpl = NULL; bestload = bestrload = LGRP_LOADAVG_MAX; lgrpset = cpupart->cp_lgrpset; switch (lgrp_choose_policy) { case LGRP_CHOOSE_RR: lgrpid = cpupart->cp_lgrp_hint; do { if (++lgrpid > lgrp_alloc_max) lgrpid = 0; } while (!klgrpset_ismember(lgrpset, lgrpid)); break; default: case LGRP_CHOOSE_TIME: case LGRP_CHOOSE_RANDOM: klgrpset_nlgrps(lgrpset, lgrp_count); lgrpid_offset = (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; for (lgrpid = 0; ; lgrpid++) { if (klgrpset_ismember(lgrpset, lgrpid)) { if (--lgrpid_offset == 0) break; } } break; } lgrpid_start = lgrpid; DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, lgrp_id_t, cpupart->cp_lgrp_hint); /* * Use lgroup affinities (if any) to choose best lgroup * * NOTE: Assumes that thread is protected from going away and its * lgroup affinities won't change (ie. 
p_lock, or * thread_lock() being held and/or CPUs paused) */ if (t->t_lgrp_affinity) { lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); if (lpl != NULL) return (lpl); } ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); do { pgcnt_t npgs; /* * Skip any lgroups outside of thread's pset */ if (!klgrpset_ismember(lgrpset, lgrpid)) { if (++lgrpid > lgrp_alloc_max) lgrpid = 0; /* wrap the search */ continue; } /* * Skip any non-leaf lgroups */ if (lgrp_table[lgrpid]->lgrp_childcnt != 0) continue; /* * Skip any lgroups without enough free memory * (when threshold set to nonzero positive value) */ if (lgrp_mem_free_thresh > 0) { npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); if (npgs < lgrp_mem_free_thresh) { if (++lgrpid > lgrp_alloc_max) lgrpid = 0; /* wrap the search */ continue; } } lpl = &cpupart->cp_lgrploads[lgrpid]; if (klgrpset_isempty(p->p_lgrpset) || klgrpset_ismember(p->p_lgrpset, lgrpid)) { /* * Either this is a new process or the process already * has threads on this lgrp, so this is a preferred * lgroup for the thread. */ if (bestlpl == NULL || lpl_pick(lpl, bestlpl)) { bestload = lpl->lpl_loadavg; bestlpl = lpl; } } else { /* * The process doesn't have any threads on this lgrp, * but we're willing to consider this lgrp if the load * difference is big enough to justify splitting up * the process' threads. 
*/ if (bestrlpl == NULL || lpl_pick(lpl, bestrlpl)) { bestrload = lpl->lpl_loadavg; bestrlpl = lpl; } } if (++lgrpid > lgrp_alloc_max) lgrpid = 0; /* wrap the search */ } while (lgrpid != lgrpid_start); /* * Return root lgroup if threshold isn't set to maximum value and * lowest lgroup load average more than a certain threshold */ if (lgrp_load_thresh != UINT32_MAX && bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); /* * If all the lgroups over which the thread's process is spread are * heavily loaded, or otherwise undesirable, we'll consider placing * the thread on one of the other leaf lgroups in the thread's * partition. */ if ((bestlpl == NULL) || ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && (bestrload < bestload) && /* paranoid about wraparound */ (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < bestload))) { bestlpl = bestrlpl; } if (bestlpl == NULL) { /* * No lgroup looked particularly good, but we still * have to pick something. Go with the randomly selected * legal lgroup we started with above. */ bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; } cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; bestlpl->lpl_homed_time = gethrtime_unscaled(); ASSERT(bestlpl->lpl_ncpu > 0); return (bestlpl); } /* * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. */ static int lpl_pick(lpl_t *lpl1, lpl_t *lpl2) { lgrp_load_t l1, l2; lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); l1 = lpl1->lpl_loadavg; l2 = lpl2->lpl_loadavg; if ((l1 + tolerance < l2) && (l1 < l2)) { /* lpl1 is significantly less loaded than lpl2 */ return (1); } if (lgrp_choose_policy == LGRP_CHOOSE_TIME && l1 + tolerance >= l2 && l1 < l2 && lpl1->lpl_homed_time < lpl2->lpl_homed_time) { /* * lpl1's load is within the tolerance of lpl2. 
We're * willing to consider it be to better however if * it has been longer since we last homed a thread there */ return (1); } return (0); } /* * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a * process that uses text replication changed home lgrp. This info is used by * segvn asyncronous thread to detect if it needs to recheck what lgrps * should be used for text replication. */ static uint64_t lgrp_trthr_moves = 0; uint64_t lgrp_get_trthr_migrations(void) { return (lgrp_trthr_moves); } void lgrp_update_trthr_migrations(uint64_t incr) { atomic_add_64(&lgrp_trthr_moves, incr); } /* * An LWP is expected to be assigned to an lgroup for at least this long * for its anticipatory load to be justified. NOTE that this value should * not be set extremely huge (say, larger than 100 years), to avoid problems * with overflow in the calculation that uses it. */ #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; /* * Routine to change a thread's lgroup affiliation. This routine updates * the thread's kthread_t struct and its process' proc_t struct to note the * thread's new lgroup affiliation, and its lgroup affinities. * * Note that this is the only routine that modifies a thread's t_lpl field, * and that adds in or removes anticipatory load. * * If the thread is exiting, newlpl is NULL. * * Locking: * The following lock must be held on entry: * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp * doesn't get removed from t's partition * * This routine is not allowed to grab any locks, since it may be called * with cpus paused (such as from cpu_offline). 
*/ void lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) { proc_t *p; lpl_t *lpl, *oldlpl; lgrp_id_t oldid; kthread_t *tp; uint_t ncpu; lgrp_load_t old, new; ASSERT(t); ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || THREAD_LOCK_HELD(t)); /* * If not changing lpls, just return */ if ((oldlpl = t->t_lpl) == newlpl) return; /* * Make sure the thread's lwp hasn't exited (if so, this thread is now * associated with process 0 rather than with its original process). */ if (t->t_proc_flag & TP_LWPEXIT) { if (newlpl != NULL) { t->t_lpl = newlpl; } return; } p = ttoproc(t); /* * If the thread had a previous lgroup, update its process' p_lgrpset * to account for it being moved from its old lgroup. */ if ((oldlpl != NULL) && /* thread had a previous lgroup */ (p->p_tlist != NULL)) { oldid = oldlpl->lpl_lgrpid; if (newlpl != NULL) lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); if ((do_lgrpset_delete) && (klgrpset_ismember(p->p_lgrpset, oldid))) { for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { /* * Check if a thread other than the thread * that's moving is assigned to the same * lgroup as the thread that's moving. Note * that we have to compare lgroup IDs, rather * than simply comparing t_lpl's, since the * threads may belong to different partitions * but be assigned to the same lgroup. */ ASSERT(tp->t_lpl != NULL); if ((tp != t) && (tp->t_lpl->lpl_lgrpid == oldid)) { /* * Another thread is assigned to the * same lgroup as the thread that's * moving, p_lgrpset doesn't change. */ break; } else if (tp == p->p_tlist) { /* * No other thread is assigned to the * same lgroup as the exiting thread, * clear the lgroup's bit in p_lgrpset. */ klgrpset_del(p->p_lgrpset, oldid); break; } } } /* * If this thread was assigned to its old lgroup for such a * short amount of time that the anticipatory load that was * added on its behalf has aged very little, remove that * anticipatory load. 
*/ if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && ((ncpu = oldlpl->lpl_ncpu) > 0)) { lpl = oldlpl; for (;;) { do { old = new = lpl->lpl_loadavg; new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); if (new > old) { /* * this can happen if the load * average was aged since we * added in the anticipatory * load */ new = 0; } } while (cas32( (lgrp_load_t *)&lpl->lpl_loadavg, old, new) != old); lpl = lpl->lpl_parent; if (lpl == NULL) break; ncpu = lpl->lpl_ncpu; ASSERT(ncpu > 0); } } } /* * If the thread has a new lgroup (i.e. it's not exiting), update its * t_lpl and its process' p_lgrpset, and apply an anticipatory load * to its new lgroup to account for its move to its new lgroup. */ if (newlpl != NULL) { /* * This thread is moving to a new lgroup */ t->t_lpl = newlpl; if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) { p->p_t1_lgrpid = newlpl->lpl_lgrpid; membar_producer(); if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) { lgrp_update_trthr_migrations(1); } } /* * Reflect move in load average of new lgroup * unless it is root lgroup */ if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) return; if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); } /* * It'll take some time for the load on the new lgroup * to reflect this thread's placement on it. We'd * like not, however, to have all threads between now * and then also piling on to this lgroup. To avoid * this pileup, we anticipate the load this thread * will generate on its new lgroup. The goal is to * make the lgroup's load appear as though the thread * had been there all along. We're very conservative * in calculating this anticipatory load, we assume * the worst case case (100% CPU-bound thread). This * may be modified in the future to be more accurate. 
*/ lpl = newlpl; for (;;) { ncpu = lpl->lpl_ncpu; ASSERT(ncpu > 0); do { old = new = lpl->lpl_loadavg; new += LGRP_LOADAVG_MAX_EFFECT(ncpu); /* * Check for overflow * Underflow not possible here */ if (new < old) new = UINT32_MAX; } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, new) != old); lpl = lpl->lpl_parent; if (lpl == NULL) break; } t->t_anttime = gethrtime(); } } /* * Return lgroup memory allocation policy given advice from madvise(3C) */ lgrp_mem_policy_t lgrp_madv_to_policy(uchar_t advice, size_t size, int type) { switch (advice) { case MADV_ACCESS_LWP: return (LGRP_MEM_POLICY_NEXT); case MADV_ACCESS_MANY: return (LGRP_MEM_POLICY_RANDOM); default: return (lgrp_mem_policy_default(size, type)); } } /* * Figure out default policy */ lgrp_mem_policy_t lgrp_mem_policy_default(size_t size, int type) { cpupart_t *cp; lgrp_mem_policy_t policy; size_t pset_mem_size; /* * Randomly allocate memory across lgroups for shared memory * beyond a certain threshold */ if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { /* * Get total memory size of current thread's pset */ kpreempt_disable(); cp = curthread->t_cpupart; klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); kpreempt_enable(); /* * Choose policy to randomly allocate memory across * lgroups in pset if it will fit and is not default * partition. Otherwise, allocate memory randomly * across machine. */ if (lgrp_mem_pset_aware && size < pset_mem_size) policy = LGRP_MEM_POLICY_RANDOM_PSET; else policy = LGRP_MEM_POLICY_RANDOM; } else /* * Apply default policy for private memory and * shared memory under the respective random * threshold. 
*/ policy = lgrp_mem_default_policy; return (policy); } /* * Get memory allocation policy for this segment */ lgrp_mem_policy_info_t * lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) { lgrp_mem_policy_info_t *policy_info; extern struct seg_ops segspt_ops; extern struct seg_ops segspt_shmops; /* * This is for binary compatibility to protect against third party * segment drivers which haven't recompiled to allow for * SEGOP_GETPOLICY() */ if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && seg->s_ops != &segspt_shmops) return (NULL); policy_info = NULL; if (seg->s_ops->getpolicy != NULL) policy_info = SEGOP_GETPOLICY(seg, vaddr); return (policy_info); } /* * Set policy for allocating private memory given desired policy, policy info, * size in bytes of memory that policy is being applied. * Return 0 if policy wasn't set already and 1 if policy was set already */ int lgrp_privm_policy_set(lgrp_mem_policy_t policy, lgrp_mem_policy_info_t *policy_info, size_t size) { ASSERT(policy_info != NULL); if (policy == LGRP_MEM_POLICY_DEFAULT) policy = lgrp_mem_policy_default(size, MAP_PRIVATE); /* * Policy set already? 
*/ if (policy == policy_info->mem_policy) return (1); /* * Set policy */ policy_info->mem_policy = policy; policy_info->mem_lgrpid = LGRP_NONE; return (0); } /* * Get shared memory allocation policy with given tree and offset */ lgrp_mem_policy_info_t * lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, u_offset_t vn_off) { u_offset_t off; lgrp_mem_policy_info_t *policy_info; lgrp_shm_policy_seg_t *policy_seg; lgrp_shm_locality_t *shm_locality; avl_tree_t *tree; avl_index_t where; /* * Get policy segment tree from anon_map or vnode and use specified * anon index or vnode offset as offset * * Assume that no lock needs to be held on anon_map or vnode, since * they should be protected by their reference count which must be * nonzero for an existing segment */ if (amp) { ASSERT(amp->refcnt != 0); shm_locality = amp->locality; if (shm_locality == NULL) return (NULL); tree = shm_locality->loc_tree; off = ptob(anon_index); } else if (vp) { shm_locality = vp->v_locality; if (shm_locality == NULL) return (NULL); ASSERT(shm_locality->loc_count != 0); tree = shm_locality->loc_tree; off = vn_off; } if (tree == NULL) return (NULL); /* * Lookup policy segment for offset into shared object and return * policy info */ rw_enter(&shm_locality->loc_lock, RW_READER); policy_info = NULL; policy_seg = avl_find(tree, &off, &where); if (policy_seg) policy_info = &policy_seg->shm_policy; rw_exit(&shm_locality->loc_lock); return (policy_info); } /* * Default memory allocation policy for kernel segmap pages */ lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; /* * Return lgroup to use for allocating memory * given the segment and address * * There isn't any mutual exclusion that exists between calls * to this routine and DR, so this routine and whomever calls it * should be mindful of the possibility that the lgrp returned * may be deleted. 
If this happens, dereferences of the lgrp * pointer will still be safe, but the resources in the lgrp will * be gone, and LGRP_EXISTS() will no longer be true. */ lgrp_t * lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) { int i; lgrp_t *lgrp; klgrpset_t lgrpset; int lgrps_spanned; unsigned long off; lgrp_mem_policy_t policy; lgrp_mem_policy_info_t *policy_info; ushort_t random; int stat = 0; extern struct seg *segkmap; /* * Just return null if the lgrp framework hasn't finished * initializing or if this is a UMA machine. */ if (nlgrps == 1 || !lgrp_initialized) return (lgrp_root); /* * Get memory allocation policy for this segment */ policy = lgrp_mem_default_policy; if (seg != NULL) { if (seg->s_as == &kas) { if (seg == segkmap) policy = lgrp_segmap_default_policy; if (policy == LGRP_MEM_POLICY_RANDOM_PROC || policy == LGRP_MEM_POLICY_RANDOM_PSET) policy = LGRP_MEM_POLICY_RANDOM; } else { policy_info = lgrp_mem_policy_get(seg, vaddr); if (policy_info != NULL) { policy = policy_info->mem_policy; if (policy == LGRP_MEM_POLICY_NEXT_SEG) { lgrp_id_t id = policy_info->mem_lgrpid; ASSERT(id != LGRP_NONE); ASSERT(id < NLGRPS_MAX); lgrp = lgrp_table[id]; if (!LGRP_EXISTS(lgrp)) { policy = LGRP_MEM_POLICY_NEXT; } else { lgrp_stat_add(id, LGRP_NUM_NEXT_SEG, 1); return (lgrp); } } } } } lgrpset = 0; /* * Initialize lgroup to home by default */ lgrp = lgrp_home_lgrp(); /* * When homing threads on root lgrp, override default memory * allocation policies with root lgroup memory allocation policy */ if (lgrp == lgrp_root) policy = lgrp_mem_policy_root; /* * Implement policy */ switch (policy) { case LGRP_MEM_POLICY_NEXT_CPU: /* * Return lgroup of current CPU which faulted on memory * If the CPU isn't currently in an lgrp, then opt to * allocate from the root. * * Kernel preemption needs to be disabled here to prevent * the current CPU from going away before lgrp is found. 
*/ if (LGRP_CPU_HAS_NO_LGRP(CPU)) { lgrp = lgrp_root; } else { kpreempt_disable(); lgrp = lgrp_cpu_to_lgrp(CPU); kpreempt_enable(); } break; case LGRP_MEM_POLICY_NEXT: case LGRP_MEM_POLICY_DEFAULT: default: /* * Just return current thread's home lgroup * for default policy (next touch) * If the thread is homed to the root, * then the default policy is random across lgroups. * Fallthrough to the random case. */ if (lgrp != lgrp_root) { if (policy == LGRP_MEM_POLICY_NEXT) lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); else lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_DEFAULT, 1); break; } /* LINTED fallthrough on case statement */ case LGRP_MEM_POLICY_RANDOM: /* * Return a random leaf lgroup with memory */ lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; /* * Count how many lgroups are spanned */ klgrpset_nlgrps(lgrpset, lgrps_spanned); /* * There may be no memnodes in the root lgroup during DR copy * rename on a system with only two boards (memnodes) * configured. In this case just return the root lgrp. 
*/ if (lgrps_spanned == 0) { lgrp = lgrp_root; break; } /* * Pick a random offset within lgroups spanned * and return lgroup at that offset */ random = (ushort_t)gethrtime() >> 4; off = random % lgrps_spanned; ASSERT(off <= lgrp_alloc_max); for (i = 0; i <= lgrp_alloc_max; i++) { if (!klgrpset_ismember(lgrpset, i)) continue; if (off) off--; else { lgrp = lgrp_table[i]; lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 1); break; } } break; case LGRP_MEM_POLICY_RANDOM_PROC: /* * Grab copy of bitmask of lgroups spanned by * this process */ klgrpset_copy(lgrpset, curproc->p_lgrpset); stat = LGRP_NUM_RANDOM_PROC; /* LINTED fallthrough on case statement */ case LGRP_MEM_POLICY_RANDOM_PSET: if (!stat) stat = LGRP_NUM_RANDOM_PSET; if (klgrpset_isempty(lgrpset)) { /* * Grab copy of bitmask of lgroups spanned by * this processor set */ kpreempt_disable(); klgrpset_copy(lgrpset, curthread->t_cpupart->cp_lgrpset); kpreempt_enable(); } /* * Count how many lgroups are spanned */ klgrpset_nlgrps(lgrpset, lgrps_spanned); ASSERT(lgrps_spanned <= nlgrps); /* * Probably lgrps_spanned should be always non-zero, but to be * on the safe side we return lgrp_root if it is empty. 
*/ if (lgrps_spanned == 0) { lgrp = lgrp_root; break; } /* * Pick a random offset within lgroups spanned * and return lgroup at that offset */ random = (ushort_t)gethrtime() >> 4; off = random % lgrps_spanned; ASSERT(off <= lgrp_alloc_max); for (i = 0; i <= lgrp_alloc_max; i++) { if (!klgrpset_ismember(lgrpset, i)) continue; if (off) off--; else { lgrp = lgrp_table[i]; lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 1); break; } } break; case LGRP_MEM_POLICY_ROUNDROBIN: /* * Use offset within segment to determine * offset from home lgroup to choose for * next lgroup to allocate memory from */ off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % (lgrp_alloc_max + 1); kpreempt_disable(); lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; i = lgrp->lgrp_id; kpreempt_enable(); while (off > 0) { i = (i + 1) % (lgrp_alloc_max + 1); lgrp = lgrp_table[i]; if (klgrpset_ismember(lgrpset, i)) off--; } lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); break; } ASSERT(lgrp != NULL); return (lgrp); } /* * Return the number of pages in an lgroup * * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics * could cause tests that rely on the numat driver to fail.... */ pgcnt_t lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) { lgrp_t *lgrp; lgrp = lgrp_table[lgrpid]; if (!LGRP_EXISTS(lgrp) || klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) return (0); return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); } /* * Initialize lgroup shared memory allocation policy support */ void lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) { lgrp_shm_locality_t *shm_locality; /* * Initialize locality field in anon_map * Don't need any locks because this is called when anon_map is * allocated, but not used anywhere yet. 
*/ if (amp) { ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); if (amp->locality == NULL) { /* * Allocate and initialize shared memory locality info * and set anon_map locality pointer to it * Drop lock across kmem_alloc(KM_SLEEP) */ ANON_LOCK_EXIT(&->a_rwlock); shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); shm_locality->loc_count = 1; /* not used for amp */ shm_locality->loc_tree = NULL; /* * Reacquire lock and check to see whether anyone beat * us to initializing the locality info */ ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); if (amp->locality != NULL) { rw_destroy(&shm_locality->loc_lock); kmem_free(shm_locality, sizeof (*shm_locality)); } else amp->locality = shm_locality; } ANON_LOCK_EXIT(&->a_rwlock); return; } /* * Allocate shared vnode policy info if vnode is not locality aware yet */ mutex_enter(&vp->v_lock); if ((vp->v_flag & V_LOCALITY) == 0) { /* * Allocate and initialize shared memory locality info */ mutex_exit(&vp->v_lock); shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); shm_locality->loc_count = 1; shm_locality->loc_tree = NULL; /* * Point vnode locality field at shared vnode policy info * and set locality aware flag in vnode */ mutex_enter(&vp->v_lock); if ((vp->v_flag & V_LOCALITY) == 0) { vp->v_locality = shm_locality; vp->v_flag |= V_LOCALITY; } else { /* * Lost race so free locality info and increment count. 
*/ rw_destroy(&shm_locality->loc_lock); kmem_free(shm_locality, sizeof (*shm_locality)); shm_locality = vp->v_locality; shm_locality->loc_count++; } mutex_exit(&vp->v_lock); return; } /* * Increment reference count of number of segments mapping this vnode * shared */ shm_locality = vp->v_locality; shm_locality->loc_count++; mutex_exit(&vp->v_lock); } /* * Destroy the given shared memory policy segment tree */ void lgrp_shm_policy_tree_destroy(avl_tree_t *tree) { lgrp_shm_policy_seg_t *cur; lgrp_shm_policy_seg_t *next; if (tree == NULL) return; cur = (lgrp_shm_policy_seg_t *)avl_first(tree); while (cur != NULL) { next = AVL_NEXT(tree, cur); avl_remove(tree, cur); kmem_free(cur, sizeof (*cur)); cur = next; } kmem_free(tree, sizeof (avl_tree_t)); } /* * Uninitialize lgroup shared memory allocation policy support */ void lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) { lgrp_shm_locality_t *shm_locality; /* * For anon_map, deallocate shared memory policy tree and * zero locality field * Don't need any locks because anon_map is being freed */ if (amp) { if (amp->locality == NULL) return; shm_locality = amp->locality; shm_locality->loc_count = 0; /* not really used for amp */ rw_destroy(&shm_locality->loc_lock); lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); kmem_free(shm_locality, sizeof (*shm_locality)); amp->locality = 0; return; } /* * For vnode, decrement reference count of segments mapping this vnode * shared and delete locality info if reference count drops to 0 */ mutex_enter(&vp->v_lock); shm_locality = vp->v_locality; shm_locality->loc_count--; if (shm_locality->loc_count == 0) { rw_destroy(&shm_locality->loc_lock); lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); kmem_free(shm_locality, sizeof (*shm_locality)); vp->v_locality = 0; vp->v_flag &= ~V_LOCALITY; } mutex_exit(&vp->v_lock); } /* * Compare two shared memory policy segments * Used by AVL tree code for searching */ int lgrp_shm_policy_compar(const void *x, const void *y) { 
lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; if (a->shm_off < b->shm_off) return (-1); if (a->shm_off >= b->shm_off + b->shm_size) return (1); return (0); } /* * Concatenate seg1 with seg2 and remove seg2 */ static int lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, lgrp_shm_policy_seg_t *seg2) { if (!seg1 || !seg2 || seg1->shm_off + seg1->shm_size != seg2->shm_off || seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) return (-1); seg1->shm_size += seg2->shm_size; avl_remove(tree, seg2); kmem_free(seg2, sizeof (*seg2)); return (0); } /* * Split segment at given offset and return rightmost (uppermost) segment * Assumes that there are no overlapping segments */ static lgrp_shm_policy_seg_t * lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, u_offset_t off) { lgrp_shm_policy_seg_t *newseg; avl_index_t where; ASSERT(seg != NULL); ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); if (!seg || off < seg->shm_off || off > seg->shm_off + seg->shm_size) return (NULL); if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) return (seg); /* * Adjust size of left segment and allocate new (right) segment */ newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); newseg->shm_policy = seg->shm_policy; newseg->shm_off = off; newseg->shm_size = seg->shm_size - (off - seg->shm_off); seg->shm_size = off - seg->shm_off; /* * Find where to insert new segment in AVL tree and insert it */ (void) avl_find(tree, &off, &where); avl_insert(tree, newseg, where); return (newseg); } /* * Set shared memory allocation policy on specified shared object at given * offset and length * * Return 0 if policy wasn't set already, 1 if policy was set already, and * -1 if can't set policy. 
*/ int lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) { u_offset_t eoff; lgrp_shm_policy_seg_t *next; lgrp_shm_policy_seg_t *newseg; u_offset_t off; u_offset_t oldeoff; lgrp_shm_policy_seg_t *prev; int retval; lgrp_shm_policy_seg_t *seg; lgrp_shm_locality_t *shm_locality; avl_tree_t *tree; avl_index_t where; ASSERT(amp || vp); ASSERT((len & PAGEOFFSET) == 0); if (len == 0) return (-1); retval = 0; /* * Get locality info and starting offset into shared object * Try anon map first and then vnode * Assume that no locks need to be held on anon_map or vnode, since * it should be protected by its reference count which must be nonzero * for an existing segment. */ if (amp) { /* * Get policy info from anon_map * */ ASSERT(amp->refcnt != 0); if (amp->locality == NULL) lgrp_shm_policy_init(amp, NULL); shm_locality = amp->locality; off = ptob(anon_index); } else if (vp) { /* * Get policy info from vnode */ if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) lgrp_shm_policy_init(NULL, vp); shm_locality = vp->v_locality; ASSERT(shm_locality->loc_count != 0); off = vn_off; } else return (-1); ASSERT((off & PAGEOFFSET) == 0); /* * Figure out default policy */ if (policy == LGRP_MEM_POLICY_DEFAULT) policy = lgrp_mem_policy_default(len, MAP_SHARED); /* * Create AVL tree if there isn't one yet * and set locality field to point at it */ rw_enter(&shm_locality->loc_lock, RW_WRITER); tree = shm_locality->loc_tree; if (!tree) { rw_exit(&shm_locality->loc_lock); tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); rw_enter(&shm_locality->loc_lock, RW_WRITER); if (shm_locality->loc_tree == NULL) { avl_create(tree, lgrp_shm_policy_compar, sizeof (lgrp_shm_policy_seg_t), offsetof(lgrp_shm_policy_seg_t, shm_tree)); shm_locality->loc_tree = tree; } else { /* * Another thread managed to set up the tree * before we could. Free the tree we allocated * and use the one that's already there. 
*/ kmem_free(tree, sizeof (*tree)); tree = shm_locality->loc_tree; } } /* * Set policy * * Need to maintain hold on writer's lock to keep tree from * changing out from under us */ while (len != 0) { /* * Find policy segment for specified offset into shared object */ seg = avl_find(tree, &off, &where); /* * Didn't find any existing segment that contains specified * offset, so allocate new segment, insert it, and concatenate * with adjacent segments if possible */ if (seg == NULL) { newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); newseg->shm_policy.mem_policy = policy; newseg->shm_policy.mem_lgrpid = LGRP_NONE; newseg->shm_off = off; avl_insert(tree, newseg, where); /* * Check to see whether new segment overlaps with next * one, set length of new segment accordingly, and * calculate remaining length and next offset */ seg = AVL_NEXT(tree, newseg); if (seg == NULL || off + len <= seg->shm_off) { newseg->shm_size = len; len = 0; } else { newseg->shm_size = seg->shm_off - off; off = seg->shm_off; len -= newseg->shm_size; } /* * Try to concatenate new segment with next and * previous ones, since they might have the same policy * now. Grab previous and next segments first because * they will change on concatenation. */ prev = AVL_PREV(tree, newseg); next = AVL_NEXT(tree, newseg); (void) lgrp_shm_policy_concat(tree, newseg, next); (void) lgrp_shm_policy_concat(tree, prev, newseg); continue; } eoff = off + len; oldeoff = seg->shm_off + seg->shm_size; /* * Policy set already? 
*/ if (policy == seg->shm_policy.mem_policy) { /* * Nothing left to do if offset and length * fall within this segment */ if (eoff <= oldeoff) { retval = 1; break; } else { len = eoff - oldeoff; off = oldeoff; continue; } } /* * Specified offset and length match existing segment exactly */ if (off == seg->shm_off && len == seg->shm_size) { /* * Set policy and update current length */ seg->shm_policy.mem_policy = policy; seg->shm_policy.mem_lgrpid = LGRP_NONE; len = 0; /* * Try concatenating new segment with previous and next * segments, since they might have the same policy now. * Grab previous and next segments first because they * will change on concatenation. */ prev = AVL_PREV(tree, seg); next = AVL_NEXT(tree, seg); (void) lgrp_shm_policy_concat(tree, seg, next); (void) lgrp_shm_policy_concat(tree, prev, seg); } else { /* * Specified offset and length only apply to part of * existing segment */ /* * New segment starts in middle of old one, so split * new one off near beginning of old one */ newseg = NULL; if (off > seg->shm_off) { newseg = lgrp_shm_policy_split(tree, seg, off); /* * New segment ends where old one did, so try * to concatenate with next segment */ if (eoff == oldeoff) { newseg->shm_policy.mem_policy = policy; newseg->shm_policy.mem_lgrpid = LGRP_NONE; (void) lgrp_shm_policy_concat(tree, newseg, AVL_NEXT(tree, newseg)); break; } } /* * New segment ends before old one, so split off end of * old one */ if (eoff < oldeoff) { if (newseg) { (void) lgrp_shm_policy_split(tree, newseg, eoff); newseg->shm_policy.mem_policy = policy; newseg->shm_policy.mem_lgrpid = LGRP_NONE; } else { (void) lgrp_shm_policy_split(tree, seg, eoff); seg->shm_policy.mem_policy = policy; seg->shm_policy.mem_lgrpid = LGRP_NONE; } if (off == seg->shm_off) (void) lgrp_shm_policy_concat(tree, AVL_PREV(tree, seg), seg); break; } /* * Calculate remaining length and next offset */ len = eoff - oldeoff; off = oldeoff; } } rw_exit(&shm_locality->loc_lock); return (retval); } /* * Return 
the best memnode from which to allocate memory given * an lgroup. * * "c" is for cookie, which is good enough for me. * It references a cookie struct that should be zero'ed to initialize. * The cookie should live on the caller's stack. * * The routine returns -1 when: * - traverse is 0, and all the memnodes in "lgrp" have been returned. * - traverse is 1, and all the memnodes in the system have been * returned. */ int lgrp_memnode_choose(lgrp_mnode_cookie_t *c) { lgrp_t *lp = c->lmc_lgrp; mnodeset_t nodes = c->lmc_nodes; int cnt = c->lmc_cnt; int offset, mnode; extern int max_mem_nodes; /* * If the set is empty, and the caller is willing, traverse * up the hierarchy until we find a non-empty set. */ while (nodes == (mnodeset_t)0 || cnt <= 0) { if (c->lmc_scope == LGRP_SRCH_LOCAL || ((lp = lp->lgrp_parent) == NULL)) return (-1); nodes = lp->lgrp_mnodes & ~(c->lmc_tried); cnt = lp->lgrp_nmnodes - c->lmc_ntried; } /* * Select a memnode by picking one at a "random" offset. * Because of DR, memnodes can come and go at any time. * This code must be able to cope with the possibility * that the nodes count "cnt" is inconsistent with respect * to the number of elements actually in "nodes", and * therefore that the offset chosen could be greater than * the number of elements in the set (some memnodes may * have dissapeared just before cnt was read). * If this happens, the search simply wraps back to the * beginning of the set. */ ASSERT(nodes != (mnodeset_t)0 && cnt > 0); offset = c->lmc_rand % cnt; do { for (mnode = 0; mnode < max_mem_nodes; mnode++) if (nodes & ((mnodeset_t)1 << mnode)) if (!offset--) break; } while (mnode >= max_mem_nodes); /* Found a node. Store state before returning. */ c->lmc_lgrp = lp; c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); c->lmc_cnt = cnt - 1; c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); c->lmc_ntried++; return (mnode); }