xref: /titanic_53/usr/src/uts/common/os/lgrp.c (revision 611ffe8a3112495ac3288bbe1f81f9f09a61dc9e)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5ab761399Sesaxe  * Common Development and Distribution License (the "License").
6ab761399Sesaxe  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22ab761399Sesaxe  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
277c478bd9Sstevel@tonic-gate 
287c478bd9Sstevel@tonic-gate /*
297c478bd9Sstevel@tonic-gate  * Basic NUMA support in terms of locality groups
307c478bd9Sstevel@tonic-gate  *
317c478bd9Sstevel@tonic-gate  * Solaris needs to know which CPUs, memory, etc. are near each other to
327c478bd9Sstevel@tonic-gate  * provide good performance on NUMA machines by optimizing for locality.
337c478bd9Sstevel@tonic-gate  * In order to do this, a new abstraction called a "locality group (lgroup)"
347c478bd9Sstevel@tonic-gate  * has been introduced to keep track of which CPU-like and memory-like hardware
357c478bd9Sstevel@tonic-gate  * resources are close to each other.  Currently, latency is the only measure
367c478bd9Sstevel@tonic-gate  * used to determine how to group hardware resources into lgroups, but this
377c478bd9Sstevel@tonic-gate  * does not limit the groupings to be based solely on latency.  Other factors
387c478bd9Sstevel@tonic-gate  * may be used to determine the groupings in the future.
397c478bd9Sstevel@tonic-gate  *
 * Lgroups are organized into a hierarchy or topology that represents the
417c478bd9Sstevel@tonic-gate  * latency topology of the machine.  There is always at least a root lgroup in
427c478bd9Sstevel@tonic-gate  * the system.  It represents all the hardware resources in the machine at a
437c478bd9Sstevel@tonic-gate  * latency big enough that any hardware resource can at least access any other
447c478bd9Sstevel@tonic-gate  * hardware resource within that latency.  A Uniform Memory Access (UMA)
457c478bd9Sstevel@tonic-gate  * machine is represented with one lgroup (the root).  In contrast, a NUMA
467c478bd9Sstevel@tonic-gate  * machine is represented at least by the root lgroup and some number of leaf
477c478bd9Sstevel@tonic-gate  * lgroups where the leaf lgroups contain the hardware resources within the
487c478bd9Sstevel@tonic-gate  * least latency of each other and the root lgroup still contains all the
497c478bd9Sstevel@tonic-gate  * resources in the machine.  Some number of intermediate lgroups may exist
507c478bd9Sstevel@tonic-gate  * which represent more levels of locality than just the local latency of the
517c478bd9Sstevel@tonic-gate  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
527c478bd9Sstevel@tonic-gate  * (eg. root and intermediate lgroups) contain the next nearest resources to
537c478bd9Sstevel@tonic-gate  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
547c478bd9Sstevel@tonic-gate  * to the root lgroup shows the hardware resources from closest to farthest
557c478bd9Sstevel@tonic-gate  * from the leaf lgroup such that each successive ancestor lgroup contains
567c478bd9Sstevel@tonic-gate  * the next nearest resources at the next level of locality from the previous.
577c478bd9Sstevel@tonic-gate  *
587c478bd9Sstevel@tonic-gate  * The kernel uses the lgroup abstraction to know how to allocate resources
597c478bd9Sstevel@tonic-gate  * near a given process/thread.  At fork() and lwp/thread_create() time, a
607c478bd9Sstevel@tonic-gate  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
617c478bd9Sstevel@tonic-gate  * with the lowest load average.  Binding to a processor or processor set will
627c478bd9Sstevel@tonic-gate  * change the home lgroup for a thread.  The scheduler has been modified to try
637c478bd9Sstevel@tonic-gate  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
647c478bd9Sstevel@tonic-gate  * allocation is lgroup aware too, so memory will be allocated from the current
657c478bd9Sstevel@tonic-gate  * thread's home lgroup if possible.  If the desired resources are not
667c478bd9Sstevel@tonic-gate  * available, the kernel traverses the lgroup hierarchy going to the parent
677c478bd9Sstevel@tonic-gate  * lgroup to find resources at the next level of locality until it reaches the
687c478bd9Sstevel@tonic-gate  * root lgroup.
697c478bd9Sstevel@tonic-gate  */
707c478bd9Sstevel@tonic-gate 
717c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
727c478bd9Sstevel@tonic-gate #include <sys/lgrp_user.h>
737c478bd9Sstevel@tonic-gate #include <sys/types.h>
747c478bd9Sstevel@tonic-gate #include <sys/mman.h>
757c478bd9Sstevel@tonic-gate #include <sys/param.h>
767c478bd9Sstevel@tonic-gate #include <sys/var.h>
777c478bd9Sstevel@tonic-gate #include <sys/thread.h>
787c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
797c478bd9Sstevel@tonic-gate #include <sys/cpupart.h>
807c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
817c478bd9Sstevel@tonic-gate #include <vm/seg.h>
827c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
837c478bd9Sstevel@tonic-gate #include <vm/seg_spt.h>
847c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
857c478bd9Sstevel@tonic-gate #include <vm/as.h>
867c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
877c478bd9Sstevel@tonic-gate #include <sys/systm.h>
887c478bd9Sstevel@tonic-gate #include <sys/errno.h>
897c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
907c478bd9Sstevel@tonic-gate #include <sys/kstat.h>
917c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
927c478bd9Sstevel@tonic-gate #include <sys/chip.h>
937c478bd9Sstevel@tonic-gate #include <sys/promif.h>
947c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
957c478bd9Sstevel@tonic-gate 
967c478bd9Sstevel@tonic-gate lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
977c478bd9Sstevel@tonic-gate lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
987c478bd9Sstevel@tonic-gate 				/* indexed by lgrp_id */
997c478bd9Sstevel@tonic-gate int	nlgrps;			/* number of lgroups in machine */
1007c478bd9Sstevel@tonic-gate int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
1017c478bd9Sstevel@tonic-gate int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
1027c478bd9Sstevel@tonic-gate 
1037c478bd9Sstevel@tonic-gate /*
1047c478bd9Sstevel@tonic-gate  * Kstat data for lgroups.
1057c478bd9Sstevel@tonic-gate  *
1067c478bd9Sstevel@tonic-gate  * Actual kstat data is collected in lgrp_stats array.
1077c478bd9Sstevel@tonic-gate  * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from parallel
1097c478bd9Sstevel@tonic-gate  * modifications by lgrp_kstat_mutex. This may cause some contention when
1107c478bd9Sstevel@tonic-gate  * several kstat commands run in parallel but this is not the
1117c478bd9Sstevel@tonic-gate  * performance-critical path.
1127c478bd9Sstevel@tonic-gate  */
1137c478bd9Sstevel@tonic-gate extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
1147c478bd9Sstevel@tonic-gate 
1157c478bd9Sstevel@tonic-gate /*
1167c478bd9Sstevel@tonic-gate  * Declare kstat names statically for enums as defined in the header file.
1177c478bd9Sstevel@tonic-gate  */
1187c478bd9Sstevel@tonic-gate LGRP_KSTAT_NAMES;
1197c478bd9Sstevel@tonic-gate 
1207c478bd9Sstevel@tonic-gate static void	lgrp_kstat_init(void);
1217c478bd9Sstevel@tonic-gate static int	lgrp_kstat_extract(kstat_t *, int);
1227c478bd9Sstevel@tonic-gate static void	lgrp_kstat_reset(lgrp_id_t);
1237c478bd9Sstevel@tonic-gate 
1247c478bd9Sstevel@tonic-gate static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
1257c478bd9Sstevel@tonic-gate static kmutex_t lgrp_kstat_mutex;
1267c478bd9Sstevel@tonic-gate 
1277c478bd9Sstevel@tonic-gate 
1287c478bd9Sstevel@tonic-gate /*
1297c478bd9Sstevel@tonic-gate  * max number of lgroups supported by the platform
1307c478bd9Sstevel@tonic-gate  */
1317c478bd9Sstevel@tonic-gate int	nlgrpsmax = 0;
1327c478bd9Sstevel@tonic-gate 
1337c478bd9Sstevel@tonic-gate /*
1347c478bd9Sstevel@tonic-gate  * The root lgroup. Represents the set of resources at the system wide
1357c478bd9Sstevel@tonic-gate  * level of locality.
1367c478bd9Sstevel@tonic-gate  */
1377c478bd9Sstevel@tonic-gate lgrp_t		*lgrp_root = NULL;
1387c478bd9Sstevel@tonic-gate 
1397c478bd9Sstevel@tonic-gate /*
1407c478bd9Sstevel@tonic-gate  * During system bootstrap cp_default does not contain the list of lgrp load
1417c478bd9Sstevel@tonic-gate  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
1427c478bd9Sstevel@tonic-gate  * on-line when cp_default is initialized by cpupart_initialize_default().
1437c478bd9Sstevel@tonic-gate  * Configuring CPU0 may create a two-level topology with root and one leaf node
1447c478bd9Sstevel@tonic-gate  * containing CPU0. This topology is initially constructed in a special
1457c478bd9Sstevel@tonic-gate  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
1467c478bd9Sstevel@tonic-gate  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
1477c478bd9Sstevel@tonic-gate  * for all lpl operations until cp_default is fully constructed.
1487c478bd9Sstevel@tonic-gate  *
1497c478bd9Sstevel@tonic-gate  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
1507c478bd9Sstevel@tonic-gate  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
1517c478bd9Sstevel@tonic-gate  * the first element of lpl_bootstrap_list.
152394b433dSesaxe  *
153394b433dSesaxe  * CPUs that are added to the system, but have not yet been assigned to an
154394b433dSesaxe  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155394b433dSesaxe  * on some architectures (x86) it's possible for the slave CPU startup thread
156394b433dSesaxe  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
1577c478bd9Sstevel@tonic-gate  */
1587c478bd9Sstevel@tonic-gate #define	LPL_BOOTSTRAP_SIZE 2
1597c478bd9Sstevel@tonic-gate static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
1607c478bd9Sstevel@tonic-gate lpl_t		*lpl_bootstrap;
1617c478bd9Sstevel@tonic-gate 
162394b433dSesaxe /*
163394b433dSesaxe  * If cp still references the bootstrap lpl, it has not yet been added to
164394b433dSesaxe  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165394b433dSesaxe  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166394b433dSesaxe  */
167394b433dSesaxe #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168394b433dSesaxe 
1697c478bd9Sstevel@tonic-gate static lgrp_t	lroot;
1707c478bd9Sstevel@tonic-gate 
1717c478bd9Sstevel@tonic-gate 
1727c478bd9Sstevel@tonic-gate /*
1737c478bd9Sstevel@tonic-gate  * Size, in bytes, beyond which random memory allocation policy is applied
1747c478bd9Sstevel@tonic-gate  * to non-shared memory.  Default is the maximum size, so random memory
1757c478bd9Sstevel@tonic-gate  * allocation won't be used for non-shared memory by default.
1767c478bd9Sstevel@tonic-gate  */
1777c478bd9Sstevel@tonic-gate size_t	lgrp_privm_random_thresh = (size_t)(-1);
1787c478bd9Sstevel@tonic-gate 
1797c478bd9Sstevel@tonic-gate /*
1807c478bd9Sstevel@tonic-gate  * Size, in bytes, beyond which random memory allocation policy is applied to
1817c478bd9Sstevel@tonic-gate  * shared memory.  Default is 8MB (2 ISM pages).
1827c478bd9Sstevel@tonic-gate  */
1837c478bd9Sstevel@tonic-gate size_t	lgrp_shm_random_thresh = 8*1024*1024;
1847c478bd9Sstevel@tonic-gate 
1857c478bd9Sstevel@tonic-gate /*
1867c478bd9Sstevel@tonic-gate  * Whether to do processor set aware memory allocation by default
1877c478bd9Sstevel@tonic-gate  */
1887c478bd9Sstevel@tonic-gate int	lgrp_mem_pset_aware = 0;
1897c478bd9Sstevel@tonic-gate 
1907c478bd9Sstevel@tonic-gate /*
1917c478bd9Sstevel@tonic-gate  * Set the default memory allocation policy for root lgroup
1927c478bd9Sstevel@tonic-gate  */
1937c478bd9Sstevel@tonic-gate lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
1947c478bd9Sstevel@tonic-gate 
1957c478bd9Sstevel@tonic-gate /*
1967c478bd9Sstevel@tonic-gate  * Set the default memory allocation policy.  For most platforms,
1977c478bd9Sstevel@tonic-gate  * next touch is sufficient, but some platforms may wish to override
1987c478bd9Sstevel@tonic-gate  * this.
1997c478bd9Sstevel@tonic-gate  */
2007c478bd9Sstevel@tonic-gate lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
2017c478bd9Sstevel@tonic-gate 
2027c478bd9Sstevel@tonic-gate 
2037c478bd9Sstevel@tonic-gate /*
2047c478bd9Sstevel@tonic-gate  * lgroup CPU event handlers
2057c478bd9Sstevel@tonic-gate  */
2067c478bd9Sstevel@tonic-gate static void	lgrp_cpu_init(struct cpu *);
2077c478bd9Sstevel@tonic-gate static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
2087c478bd9Sstevel@tonic-gate static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
2097c478bd9Sstevel@tonic-gate 
2107c478bd9Sstevel@tonic-gate static void	lgrp_latency_change(u_longlong_t, u_longlong_t);
2117c478bd9Sstevel@tonic-gate 
2127c478bd9Sstevel@tonic-gate /*
2137c478bd9Sstevel@tonic-gate  * lgroup memory event handlers
2147c478bd9Sstevel@tonic-gate  */
2157c478bd9Sstevel@tonic-gate static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
2167c478bd9Sstevel@tonic-gate static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
2177c478bd9Sstevel@tonic-gate static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
2187c478bd9Sstevel@tonic-gate 
2197c478bd9Sstevel@tonic-gate /*
2207c478bd9Sstevel@tonic-gate  * lgroup CPU partition event handlers
2217c478bd9Sstevel@tonic-gate  */
2227c478bd9Sstevel@tonic-gate static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
2237c478bd9Sstevel@tonic-gate static void	lgrp_part_del_cpu(struct cpu *);
2247c478bd9Sstevel@tonic-gate 
2257c478bd9Sstevel@tonic-gate static void	lgrp_root_init(void);
2267c478bd9Sstevel@tonic-gate 
2277c478bd9Sstevel@tonic-gate /*
2287c478bd9Sstevel@tonic-gate  * lpl topology
2297c478bd9Sstevel@tonic-gate  */
2307c478bd9Sstevel@tonic-gate static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
2317c478bd9Sstevel@tonic-gate static void	lpl_clear(lpl_t *);
2327c478bd9Sstevel@tonic-gate static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
2337c478bd9Sstevel@tonic-gate static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
2347c478bd9Sstevel@tonic-gate static void	lpl_rset_add(lpl_t *, lpl_t *);
2357c478bd9Sstevel@tonic-gate static void	lpl_rset_del(lpl_t *, lpl_t *);
2367c478bd9Sstevel@tonic-gate static int	lpl_rset_contains(lpl_t *, lpl_t *);
2377c478bd9Sstevel@tonic-gate static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
2387c478bd9Sstevel@tonic-gate static void	lpl_child_update(lpl_t *, struct cpupart *);
2397c478bd9Sstevel@tonic-gate static int	lpl_pick(lpl_t *, lpl_t *);
2407c478bd9Sstevel@tonic-gate static void	lpl_verify_wrapper(struct cpupart *);
2417c478bd9Sstevel@tonic-gate 
2427c478bd9Sstevel@tonic-gate /*
2437c478bd9Sstevel@tonic-gate  * defines for lpl topology verifier return codes
2447c478bd9Sstevel@tonic-gate  */
2457c478bd9Sstevel@tonic-gate 
2467c478bd9Sstevel@tonic-gate #define	LPL_TOPO_CORRECT			0
2477c478bd9Sstevel@tonic-gate #define	LPL_TOPO_PART_HAS_NO_LPL		-1
2487c478bd9Sstevel@tonic-gate #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
2497c478bd9Sstevel@tonic-gate #define	LPL_TOPO_LGRP_MISMATCH			-3
2507c478bd9Sstevel@tonic-gate #define	LPL_TOPO_MISSING_PARENT			-4
2517c478bd9Sstevel@tonic-gate #define	LPL_TOPO_PARENT_MISMATCH		-5
2527c478bd9Sstevel@tonic-gate #define	LPL_TOPO_BAD_CPUCNT			-6
2537c478bd9Sstevel@tonic-gate #define	LPL_TOPO_RSET_MISMATCH			-7
2547c478bd9Sstevel@tonic-gate #define	LPL_TOPO_LPL_ORPHANED			-8
2557c478bd9Sstevel@tonic-gate #define	LPL_TOPO_LPL_BAD_NCPU			-9
2567c478bd9Sstevel@tonic-gate #define	LPL_TOPO_RSET_MSSNG_LF			-10
2577c478bd9Sstevel@tonic-gate #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
2587c478bd9Sstevel@tonic-gate #define	LPL_TOPO_BOGUS_HINT			-12
2597c478bd9Sstevel@tonic-gate #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
2607c478bd9Sstevel@tonic-gate #define	LPL_TOPO_LGRP_NOT_LEAF			-14
2617c478bd9Sstevel@tonic-gate #define	LPL_TOPO_BAD_RSETCNT			-15
2627c478bd9Sstevel@tonic-gate 
2637c478bd9Sstevel@tonic-gate /*
2647c478bd9Sstevel@tonic-gate  * Return whether lgroup optimizations should be enabled on this system
2657c478bd9Sstevel@tonic-gate  */
2667c478bd9Sstevel@tonic-gate int
2677c478bd9Sstevel@tonic-gate lgrp_optimizations(void)
2687c478bd9Sstevel@tonic-gate {
2697c478bd9Sstevel@tonic-gate 	/*
2707c478bd9Sstevel@tonic-gate 	 * System must have more than 2 lgroups to enable lgroup optimizations
2717c478bd9Sstevel@tonic-gate 	 *
2727c478bd9Sstevel@tonic-gate 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
2737c478bd9Sstevel@tonic-gate 	 * with one child lgroup containing all the resources. A 2 lgroup
2747c478bd9Sstevel@tonic-gate 	 * system with a root lgroup directly containing CPUs or memory might
2757c478bd9Sstevel@tonic-gate 	 * need lgroup optimizations with its child lgroup, but there
2767c478bd9Sstevel@tonic-gate 	 * isn't such a machine for now....
2777c478bd9Sstevel@tonic-gate 	 */
2787c478bd9Sstevel@tonic-gate 	if (nlgrps > 2)
2797c478bd9Sstevel@tonic-gate 		return (1);
2807c478bd9Sstevel@tonic-gate 
2817c478bd9Sstevel@tonic-gate 	return (0);
2827c478bd9Sstevel@tonic-gate }
2837c478bd9Sstevel@tonic-gate 
/*
 * Create and initialize the root lgroup, which represents all of the
 * hardware resources in the machine at the system-wide level of locality,
 * and set up the bootstrap lpl list used for all lpl operations until
 * cp_default is fully constructed (see lpl_bootstrap comments above).
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;		/* root always gets the first lgroup ID */

	lgrp_root = &lroot;	/* statically allocated; no kmem yet at boot */

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;	/* root has no parent by definition */
	lgrp_root->lgrp_chips = NULL;
	lgrp_root->lgrp_chipcnt = 0;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	/* No resources are associated with the root lgroup yet */
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}
3377c478bd9Sstevel@tonic-gate 
/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform-specific lgroup support first
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}
3577c478bd9Sstevel@tonic-gate 
/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup and bring it online, which gives t0 a
	 * home lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}
3757c478bd9Sstevel@tonic-gate 
3767c478bd9Sstevel@tonic-gate /*
3777c478bd9Sstevel@tonic-gate  * Lgroup initialization is split in two parts. The first part
3787c478bd9Sstevel@tonic-gate  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
3797c478bd9Sstevel@tonic-gate  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
3807c478bd9Sstevel@tonic-gate  * when all CPUs are brought online and all distance information is available.
3817c478bd9Sstevel@tonic-gate  *
3827c478bd9Sstevel@tonic-gate  * When lgrp_main_init() is complete it sets lgrp_initialized. The
3837c478bd9Sstevel@tonic-gate  * lgrp_main_mp_init() sets lgrp_topo_initialized.
3847c478bd9Sstevel@tonic-gate  */
3857c478bd9Sstevel@tonic-gate 
3867c478bd9Sstevel@tonic-gate /*
3877c478bd9Sstevel@tonic-gate  * true when lgrp initialization has been completed.
3887c478bd9Sstevel@tonic-gate  */
3897c478bd9Sstevel@tonic-gate int	lgrp_initialized = 0;
3907c478bd9Sstevel@tonic-gate 
3917c478bd9Sstevel@tonic-gate /*
3927c478bd9Sstevel@tonic-gate  * True when lgrp topology is constructed.
3937c478bd9Sstevel@tonic-gate  */
3947c478bd9Sstevel@tonic-gate int	lgrp_topo_initialized = 0;
3957c478bd9Sstevel@tonic-gate 
/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	/*
	 * Enforce a valid lgrp_mem_default_policy (it may have been set to
	 * a bogus value via /etc/system)
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides an UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		/*
		 * Pull cpu0 out of its current partition and lgroup, then
		 * re-add it so it is homed on the root lgroup
		 */
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);	/* kstat creation is done under cpu_lock */
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}
4707c478bd9Sstevel@tonic-gate 
/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary).  Now that all CPUs are
	 * online, complete distance information is available to do so.
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}
4877c478bd9Sstevel@tonic-gate 
4887c478bd9Sstevel@tonic-gate /*
4897c478bd9Sstevel@tonic-gate  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
4907c478bd9Sstevel@tonic-gate  */
4917c478bd9Sstevel@tonic-gate void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
		 */
		cp->cpu_next_lpl = cp;
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lgrp = cp;
		cp->cpu_prev_lgrp = cp;
		cp->cpu_lpl = lpl_bootstrap;

		lgrp_plat_config(event, resource);
		/* Bump generation count so consumers can detect the change */
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_DEL:
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_ONLINE:
		/*
		 * Hook the CPU into its lgroup and partition lpl, then
		 * sanity check the resulting lpl topology.
		 */
		cp = (cpu_t *)resource;
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_OFFLINE:
		/*
		 * Remember the CPU's lgroup ID before unlinking it, since
		 * lgrp_cpu_fini() needs it after lgrp_part_del_cpu().
		 */
		cp = (cpu_t *)resource;
		id = cp->cpu_lpl->lpl_lgrpid;
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, id);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPUPART_ADD:
		/* "where" carries the lgroup ID within the new partition */
		cp = (cpu_t *)resource;
		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	case LGRP_CONFIG_CPUPART_DEL:
		cp = (cpu_t *)resource;
		lgrp_part_del_cpu((cpu_t *)resource);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	/*
	 * The following events are initiated by the memnode
	 * subsystem.
	 */
	case LGRP_CONFIG_MEM_ADD:
		/* resource is the memory node, where is the lgroup handle */
		lgrp_mem_init((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_DEL:
		lgrp_mem_fini((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_RENAME: {
		/* "where" points at the rename (from, to) handle pair */
		lgrp_config_mem_rename_t *ren_arg =
		    (lgrp_config_mem_rename_t *)where;

		lgrp_mem_rename((int)resource,
		    ren_arg->lmem_rename_from,
		    ren_arg->lmem_rename_to);
		atomic_add_32(&lgrp_gen, 1);

		break;
	}
	case LGRP_CONFIG_GEN_UPDATE:
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_FLATTEN:
		/*
		 * where == 0 just sets the number of topology levels;
		 * otherwise actually flatten the topology to the number
		 * of levels given by "resource".
		 */
		if (where == 0)
			lgrp_topo_levels = (int)resource;
		else
			(void) lgrp_topo_flatten(resource,
			    lgrp_table, lgrp_alloc_max, &changed);

		break;
	/*
	 * Initiated by platform latency probing code
	 */
	case LGRP_CONFIG_LATENCY_CHANGE:
		lgrp_latency_change((u_longlong_t)resource,
		    (u_longlong_t)where);

		break;
	case LGRP_CONFIG_NOP:

		break;
	default:
		break;
	}

}
6267c478bd9Sstevel@tonic-gate 
6277c478bd9Sstevel@tonic-gate /*
6287c478bd9Sstevel@tonic-gate  * Called to add lgrp info into cpu structure from cpu_add_unit;
6297c478bd9Sstevel@tonic-gate  * do not assume cpu is in cpu[] yet!
6307c478bd9Sstevel@tonic-gate  *
6317c478bd9Sstevel@tonic-gate  * CPUs are brought online with all other CPUs paused so we can't
6327c478bd9Sstevel@tonic-gate  * allocate memory or we could deadlock the system, so we rely on
6337c478bd9Sstevel@tonic-gate  * the platform to statically allocate as much space as we need
6347c478bd9Sstevel@tonic-gate  * for the lgrp structs and stats.
6357c478bd9Sstevel@tonic-gate  */
6367c478bd9Sstevel@tonic-gate static void
6377c478bd9Sstevel@tonic-gate lgrp_cpu_init(struct cpu *cp)
6387c478bd9Sstevel@tonic-gate {
6397c478bd9Sstevel@tonic-gate 	klgrpset_t	changed;
6407c478bd9Sstevel@tonic-gate 	int		count;
6417c478bd9Sstevel@tonic-gate 	lgrp_handle_t	hand;
6427c478bd9Sstevel@tonic-gate 	int		first_cpu;
6437c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
6447c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
6457c478bd9Sstevel@tonic-gate 	struct cpu	*cptr;
6467c478bd9Sstevel@tonic-gate 	struct chip	*chp;
6477c478bd9Sstevel@tonic-gate 
6487c478bd9Sstevel@tonic-gate 	/*
6497c478bd9Sstevel@tonic-gate 	 * This is the first time through if the resource set
6507c478bd9Sstevel@tonic-gate 	 * for the root lgroup is empty. After cpu0 has been
6517c478bd9Sstevel@tonic-gate 	 * initially added to an lgroup, the root's CPU resource
6527c478bd9Sstevel@tonic-gate 	 * set can never be empty, since the system's last CPU
6537c478bd9Sstevel@tonic-gate 	 * cannot be offlined.
6547c478bd9Sstevel@tonic-gate 	 */
6557c478bd9Sstevel@tonic-gate 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
6567c478bd9Sstevel@tonic-gate 		/*
6577c478bd9Sstevel@tonic-gate 		 * First time through.
6587c478bd9Sstevel@tonic-gate 		 */
6597c478bd9Sstevel@tonic-gate 		first_cpu = 1;
6607c478bd9Sstevel@tonic-gate 	} else {
6617c478bd9Sstevel@tonic-gate 		/*
6627c478bd9Sstevel@tonic-gate 		 * If cpu0 needs to move lgroups, we may come
6637c478bd9Sstevel@tonic-gate 		 * through here again, at which time cpu_lock won't
6647c478bd9Sstevel@tonic-gate 		 * be held, and lgrp_initialized will be false.
6657c478bd9Sstevel@tonic-gate 		 */
6667c478bd9Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
6677c478bd9Sstevel@tonic-gate 		ASSERT(cp->cpu_part != NULL);
6687c478bd9Sstevel@tonic-gate 		first_cpu = 0;
6697c478bd9Sstevel@tonic-gate 	}
6707c478bd9Sstevel@tonic-gate 
6717c478bd9Sstevel@tonic-gate 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
6727c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_hand_to_lgrp(hand);
6737c478bd9Sstevel@tonic-gate 
6747c478bd9Sstevel@tonic-gate 	if (my_lgrp == NULL) {
6757c478bd9Sstevel@tonic-gate 		/*
6767c478bd9Sstevel@tonic-gate 		 * Create new lgrp and add it to lgroup topology
6777c478bd9Sstevel@tonic-gate 		 */
6787c478bd9Sstevel@tonic-gate 		my_lgrp = lgrp_create();
6797c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_plathand = hand;
6807c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
6817c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
6827c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
6837c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
6847c478bd9Sstevel@tonic-gate 
6857c478bd9Sstevel@tonic-gate 		count = 0;
6867c478bd9Sstevel@tonic-gate 		klgrpset_clear(changed);
6877c478bd9Sstevel@tonic-gate 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
6887c478bd9Sstevel@tonic-gate 		    &changed);
6892dae3fb5Sjjc 		/*
6902dae3fb5Sjjc 		 * May have added new intermediate lgroups, so need to add
6912dae3fb5Sjjc 		 * resources other than CPUs which are added below
6922dae3fb5Sjjc 		 */
6932dae3fb5Sjjc 		(void) lgrp_mnode_update(changed, NULL);
6947c478bd9Sstevel@tonic-gate 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
6957c478bd9Sstevel@tonic-gate 	    > 0) {
6967c478bd9Sstevel@tonic-gate 		/*
6977c478bd9Sstevel@tonic-gate 		 * Leaf lgroup was created, but latency wasn't available
6987c478bd9Sstevel@tonic-gate 		 * then.  So, set latency for it and fill in rest of lgroup
6997c478bd9Sstevel@tonic-gate 		 * topology  now that we know how far it is from other leaf
7007c478bd9Sstevel@tonic-gate 		 * lgroups.
7017c478bd9Sstevel@tonic-gate 		 */
7027c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
7037c478bd9Sstevel@tonic-gate 		klgrpset_clear(changed);
7047c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
7057c478bd9Sstevel@tonic-gate 		    lgrpid))
7067c478bd9Sstevel@tonic-gate 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
7077c478bd9Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
7087c478bd9Sstevel@tonic-gate 		    &changed);
7097c478bd9Sstevel@tonic-gate 
7107c478bd9Sstevel@tonic-gate 		/*
7117c478bd9Sstevel@tonic-gate 		 * May have added new intermediate lgroups, so need to add
7127c478bd9Sstevel@tonic-gate 		 * resources other than CPUs which are added below
7137c478bd9Sstevel@tonic-gate 		 */
7147c478bd9Sstevel@tonic-gate 		(void) lgrp_mnode_update(changed, NULL);
7157c478bd9Sstevel@tonic-gate 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
7167c478bd9Sstevel@tonic-gate 	    my_lgrp->lgrp_id)) {
7177c478bd9Sstevel@tonic-gate 		int	i;
7187c478bd9Sstevel@tonic-gate 
7197c478bd9Sstevel@tonic-gate 		/*
7207c478bd9Sstevel@tonic-gate 		 * Update existing lgroup and lgroups containing it with CPU
7217c478bd9Sstevel@tonic-gate 		 * resource
7227c478bd9Sstevel@tonic-gate 		 */
7237c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
7247c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
7257c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
7267c478bd9Sstevel@tonic-gate 			lgrp_t		*lgrp;
7277c478bd9Sstevel@tonic-gate 
7287c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
7297c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
7307c478bd9Sstevel@tonic-gate 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
7317c478bd9Sstevel@tonic-gate 				continue;
7327c478bd9Sstevel@tonic-gate 
7337c478bd9Sstevel@tonic-gate 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
7347c478bd9Sstevel@tonic-gate 		}
7357c478bd9Sstevel@tonic-gate 	}
7367c478bd9Sstevel@tonic-gate 
7377c478bd9Sstevel@tonic-gate 	lgrpid = my_lgrp->lgrp_id;
7387c478bd9Sstevel@tonic-gate 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
7397c478bd9Sstevel@tonic-gate 
7407c478bd9Sstevel@tonic-gate 	/*
7417c478bd9Sstevel@tonic-gate 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
7427c478bd9Sstevel@tonic-gate 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
7437c478bd9Sstevel@tonic-gate 	 * not since none of lgroup IDs in the lpl's have been set yet.
7447c478bd9Sstevel@tonic-gate 	 */
7457c478bd9Sstevel@tonic-gate 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
7467c478bd9Sstevel@tonic-gate 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
7477c478bd9Sstevel@tonic-gate 
7487c478bd9Sstevel@tonic-gate 	/*
7497c478bd9Sstevel@tonic-gate 	 * link the CPU into the lgrp's CPU list
7507c478bd9Sstevel@tonic-gate 	 */
7517c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_cpucnt == 0) {
7527c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_cpu = cp;
7537c478bd9Sstevel@tonic-gate 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
7547c478bd9Sstevel@tonic-gate 	} else {
7557c478bd9Sstevel@tonic-gate 		cptr = my_lgrp->lgrp_cpu;
7567c478bd9Sstevel@tonic-gate 		cp->cpu_next_lgrp = cptr;
7577c478bd9Sstevel@tonic-gate 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
7587c478bd9Sstevel@tonic-gate 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
7597c478bd9Sstevel@tonic-gate 		cptr->cpu_prev_lgrp = cp;
7607c478bd9Sstevel@tonic-gate 	}
7617c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_cpucnt++;
7627c478bd9Sstevel@tonic-gate 
7637c478bd9Sstevel@tonic-gate 	/*
7647c478bd9Sstevel@tonic-gate 	 * Add this cpu's chip to the per lgroup list
7657c478bd9Sstevel@tonic-gate 	 * if necessary
7667c478bd9Sstevel@tonic-gate 	 */
7677c478bd9Sstevel@tonic-gate 	if (cp->cpu_chip->chip_lgrp == NULL) {
7687c478bd9Sstevel@tonic-gate 		struct chip *lcpr;
7697c478bd9Sstevel@tonic-gate 
7707c478bd9Sstevel@tonic-gate 		chp = cp->cpu_chip;
7717c478bd9Sstevel@tonic-gate 
7727c478bd9Sstevel@tonic-gate 		if (my_lgrp->lgrp_chipcnt == 0) {
7737c478bd9Sstevel@tonic-gate 			my_lgrp->lgrp_chips = chp;
7747c478bd9Sstevel@tonic-gate 			chp->chip_next_lgrp =
7757c478bd9Sstevel@tonic-gate 			    chp->chip_prev_lgrp = chp;
7767c478bd9Sstevel@tonic-gate 		} else {
7777c478bd9Sstevel@tonic-gate 			lcpr = my_lgrp->lgrp_chips;
7787c478bd9Sstevel@tonic-gate 			chp->chip_next_lgrp = lcpr;
7797c478bd9Sstevel@tonic-gate 			chp->chip_prev_lgrp =
7807c478bd9Sstevel@tonic-gate 			    lcpr->chip_prev_lgrp;
7817c478bd9Sstevel@tonic-gate 			lcpr->chip_prev_lgrp->chip_next_lgrp =
7827c478bd9Sstevel@tonic-gate 			    chp;
7837c478bd9Sstevel@tonic-gate 			lcpr->chip_prev_lgrp = chp;
7847c478bd9Sstevel@tonic-gate 		}
7857c478bd9Sstevel@tonic-gate 		chp->chip_lgrp = my_lgrp;
7867c478bd9Sstevel@tonic-gate 		chp->chip_balance = chp->chip_next_lgrp;
7877c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_chipcnt++;
7887c478bd9Sstevel@tonic-gate 	}
7897c478bd9Sstevel@tonic-gate }
7907c478bd9Sstevel@tonic-gate 
7917c478bd9Sstevel@tonic-gate lgrp_t *
7927c478bd9Sstevel@tonic-gate lgrp_create(void)
7937c478bd9Sstevel@tonic-gate {
7947c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
7957c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
7967c478bd9Sstevel@tonic-gate 	int		i;
7977c478bd9Sstevel@tonic-gate 
7987c478bd9Sstevel@tonic-gate 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
7997c478bd9Sstevel@tonic-gate 
8007c478bd9Sstevel@tonic-gate 	/*
8017c478bd9Sstevel@tonic-gate 	 * Find an open slot in the lgroup table and recycle unused lgroup
8027c478bd9Sstevel@tonic-gate 	 * left there if any
8037c478bd9Sstevel@tonic-gate 	 */
8047c478bd9Sstevel@tonic-gate 	my_lgrp = NULL;
8057c478bd9Sstevel@tonic-gate 	if (lgrp_alloc_hint == -1)
8067c478bd9Sstevel@tonic-gate 		/*
8077c478bd9Sstevel@tonic-gate 		 * Allocate from end when hint not set yet because no lgroups
8087c478bd9Sstevel@tonic-gate 		 * have been deleted yet
8097c478bd9Sstevel@tonic-gate 		 */
8107c478bd9Sstevel@tonic-gate 		lgrpid = nlgrps++;
8117c478bd9Sstevel@tonic-gate 	else {
8127c478bd9Sstevel@tonic-gate 		/*
8137c478bd9Sstevel@tonic-gate 		 * Start looking for next open slot from hint and leave hint
8147c478bd9Sstevel@tonic-gate 		 * at slot allocated
8157c478bd9Sstevel@tonic-gate 		 */
8167c478bd9Sstevel@tonic-gate 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
8177c478bd9Sstevel@tonic-gate 			my_lgrp = lgrp_table[i];
8187c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(my_lgrp)) {
8197c478bd9Sstevel@tonic-gate 				lgrpid = i;
8207c478bd9Sstevel@tonic-gate 				nlgrps++;
8217c478bd9Sstevel@tonic-gate 				break;
8227c478bd9Sstevel@tonic-gate 			}
8237c478bd9Sstevel@tonic-gate 		}
8247c478bd9Sstevel@tonic-gate 		lgrp_alloc_hint = lgrpid;
8257c478bd9Sstevel@tonic-gate 	}
8267c478bd9Sstevel@tonic-gate 
8277c478bd9Sstevel@tonic-gate 	/*
8287c478bd9Sstevel@tonic-gate 	 * Keep track of max lgroup ID allocated so far to cut down on searches
8297c478bd9Sstevel@tonic-gate 	 */
8307c478bd9Sstevel@tonic-gate 	if (lgrpid > lgrp_alloc_max)
8317c478bd9Sstevel@tonic-gate 		lgrp_alloc_max = lgrpid;
8327c478bd9Sstevel@tonic-gate 
8337c478bd9Sstevel@tonic-gate 	/*
8347c478bd9Sstevel@tonic-gate 	 * Need to allocate new lgroup if next open slot didn't have one
8357c478bd9Sstevel@tonic-gate 	 * for recycling
8367c478bd9Sstevel@tonic-gate 	 */
8377c478bd9Sstevel@tonic-gate 	if (my_lgrp == NULL)
8387c478bd9Sstevel@tonic-gate 		my_lgrp = lgrp_plat_alloc(lgrpid);
8397c478bd9Sstevel@tonic-gate 
8407c478bd9Sstevel@tonic-gate 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
8417c478bd9Sstevel@tonic-gate 		panic("Too many lgrps for platform (%d)", nlgrps);
8427c478bd9Sstevel@tonic-gate 
8437c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_id = lgrpid;
8447c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_latency = 0;
8457c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
8467c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_parent = NULL;
8477c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_childcnt = 0;
8487c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
8497c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_nmnodes = 0;
8507c478bd9Sstevel@tonic-gate 	klgrpset_clear(my_lgrp->lgrp_children);
8517c478bd9Sstevel@tonic-gate 	klgrpset_clear(my_lgrp->lgrp_leaves);
8527c478bd9Sstevel@tonic-gate 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
8537c478bd9Sstevel@tonic-gate 		klgrpset_clear(my_lgrp->lgrp_set[i]);
8547c478bd9Sstevel@tonic-gate 
8557c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_cpu = NULL;
8567c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_cpucnt = 0;
8577c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_chips = NULL;
8587c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_chipcnt = 0;
8597c478bd9Sstevel@tonic-gate 
8607c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_kstat != NULL)
8617c478bd9Sstevel@tonic-gate 		lgrp_kstat_reset(lgrpid);
8627c478bd9Sstevel@tonic-gate 
8637c478bd9Sstevel@tonic-gate 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
8647c478bd9Sstevel@tonic-gate 
8657c478bd9Sstevel@tonic-gate 	return (my_lgrp);
8667c478bd9Sstevel@tonic-gate }
8677c478bd9Sstevel@tonic-gate 
8687c478bd9Sstevel@tonic-gate void
8697c478bd9Sstevel@tonic-gate lgrp_destroy(lgrp_t *lgrp)
8707c478bd9Sstevel@tonic-gate {
8717c478bd9Sstevel@tonic-gate 	int		i;
8727c478bd9Sstevel@tonic-gate 
8737c478bd9Sstevel@tonic-gate 	/*
8747c478bd9Sstevel@tonic-gate 	 * Unless this lgroup is being destroyed on behalf of
8757c478bd9Sstevel@tonic-gate 	 * the boot CPU, cpu_lock must be held
8767c478bd9Sstevel@tonic-gate 	 */
8777c478bd9Sstevel@tonic-gate 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
8787c478bd9Sstevel@tonic-gate 
8797c478bd9Sstevel@tonic-gate 	if (nlgrps == 1)
8807c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
8817c478bd9Sstevel@tonic-gate 
8827c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
8837c478bd9Sstevel@tonic-gate 		return;
8847c478bd9Sstevel@tonic-gate 
8857c478bd9Sstevel@tonic-gate 	/*
8867c478bd9Sstevel@tonic-gate 	 * Set hint to lgroup being deleted and try to keep lower numbered
8877c478bd9Sstevel@tonic-gate 	 * hints to facilitate finding empty slots
8887c478bd9Sstevel@tonic-gate 	 */
8897c478bd9Sstevel@tonic-gate 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
8907c478bd9Sstevel@tonic-gate 		lgrp_alloc_hint = lgrp->lgrp_id;
8917c478bd9Sstevel@tonic-gate 
8927c478bd9Sstevel@tonic-gate 	/*
8937c478bd9Sstevel@tonic-gate 	 * Mark this lgroup to be recycled by setting its lgroup ID to
8947c478bd9Sstevel@tonic-gate 	 * LGRP_NONE and clear relevant fields
8957c478bd9Sstevel@tonic-gate 	 */
8967c478bd9Sstevel@tonic-gate 	lgrp->lgrp_id = LGRP_NONE;
8977c478bd9Sstevel@tonic-gate 	lgrp->lgrp_latency = 0;
8987c478bd9Sstevel@tonic-gate 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
8997c478bd9Sstevel@tonic-gate 	lgrp->lgrp_parent = NULL;
9007c478bd9Sstevel@tonic-gate 	lgrp->lgrp_childcnt = 0;
9017c478bd9Sstevel@tonic-gate 
9027c478bd9Sstevel@tonic-gate 	klgrpset_clear(lgrp->lgrp_children);
9037c478bd9Sstevel@tonic-gate 	klgrpset_clear(lgrp->lgrp_leaves);
9047c478bd9Sstevel@tonic-gate 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
9057c478bd9Sstevel@tonic-gate 		klgrpset_clear(lgrp->lgrp_set[i]);
9067c478bd9Sstevel@tonic-gate 
9077c478bd9Sstevel@tonic-gate 	lgrp->lgrp_mnodes = (mnodeset_t)0;
9087c478bd9Sstevel@tonic-gate 	lgrp->lgrp_nmnodes = 0;
9097c478bd9Sstevel@tonic-gate 
9107c478bd9Sstevel@tonic-gate 	lgrp->lgrp_cpu = NULL;
9117c478bd9Sstevel@tonic-gate 	lgrp->lgrp_cpucnt = 0;
9127c478bd9Sstevel@tonic-gate 	lgrp->lgrp_chipcnt = 0;
9137c478bd9Sstevel@tonic-gate 	lgrp->lgrp_chips = NULL;
9147c478bd9Sstevel@tonic-gate 
9157c478bd9Sstevel@tonic-gate 	nlgrps--;
9167c478bd9Sstevel@tonic-gate }
9177c478bd9Sstevel@tonic-gate 
9187c478bd9Sstevel@tonic-gate /*
9197c478bd9Sstevel@tonic-gate  * Initialize kstat data. Called from lgrp intialization code.
9207c478bd9Sstevel@tonic-gate  */
9217c478bd9Sstevel@tonic-gate static void
9227c478bd9Sstevel@tonic-gate lgrp_kstat_init(void)
9237c478bd9Sstevel@tonic-gate {
9247c478bd9Sstevel@tonic-gate 	lgrp_stat_t	stat;
9257c478bd9Sstevel@tonic-gate 
9267c478bd9Sstevel@tonic-gate 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
9277c478bd9Sstevel@tonic-gate 
9287c478bd9Sstevel@tonic-gate 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
9297c478bd9Sstevel@tonic-gate 		kstat_named_init(&lgrp_kstat_data[stat],
9307c478bd9Sstevel@tonic-gate 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
9317c478bd9Sstevel@tonic-gate }
9327c478bd9Sstevel@tonic-gate 
9337c478bd9Sstevel@tonic-gate /*
9347c478bd9Sstevel@tonic-gate  * initialize an lgrp's kstats if needed
9357c478bd9Sstevel@tonic-gate  * called with cpu_lock held but not with cpus paused.
9367c478bd9Sstevel@tonic-gate  * we don't tear these down now because we don't know about
9377c478bd9Sstevel@tonic-gate  * memory leaving the lgrp yet...
9387c478bd9Sstevel@tonic-gate  */
9397c478bd9Sstevel@tonic-gate 
9407c478bd9Sstevel@tonic-gate void
9417c478bd9Sstevel@tonic-gate lgrp_kstat_create(cpu_t *cp)
9427c478bd9Sstevel@tonic-gate {
9437c478bd9Sstevel@tonic-gate 	kstat_t		*lgrp_kstat;
9447c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
9457c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
9467c478bd9Sstevel@tonic-gate 
9477c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
9487c478bd9Sstevel@tonic-gate 
9497c478bd9Sstevel@tonic-gate 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
9507c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_table[lgrpid];
9517c478bd9Sstevel@tonic-gate 
9527c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_kstat != NULL)
9537c478bd9Sstevel@tonic-gate 		return; /* already initialized */
9547c478bd9Sstevel@tonic-gate 
9557c478bd9Sstevel@tonic-gate 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
9567c478bd9Sstevel@tonic-gate 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
9577c478bd9Sstevel@tonic-gate 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
9587c478bd9Sstevel@tonic-gate 
9597c478bd9Sstevel@tonic-gate 	if (lgrp_kstat != NULL) {
9607c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
9617c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_private = my_lgrp;
9627c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_data = &lgrp_kstat_data;
9637c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_update = lgrp_kstat_extract;
9647c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_kstat = lgrp_kstat;
9657c478bd9Sstevel@tonic-gate 		kstat_install(lgrp_kstat);
9667c478bd9Sstevel@tonic-gate 	}
9677c478bd9Sstevel@tonic-gate }
9687c478bd9Sstevel@tonic-gate 
9697c478bd9Sstevel@tonic-gate /*
9707c478bd9Sstevel@tonic-gate  * this will do something when we manage to remove now unused lgrps
9717c478bd9Sstevel@tonic-gate  */
9727c478bd9Sstevel@tonic-gate 
/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	/*
	 * Intentionally a no-op for now: per the note above, lgroup
	 * kstats are not torn down until unused lgroups can be removed.
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));
}
9797c478bd9Sstevel@tonic-gate 
9807c478bd9Sstevel@tonic-gate /*
9817c478bd9Sstevel@tonic-gate  * Called when a CPU is off-lined.
9827c478bd9Sstevel@tonic-gate  */
9837c478bd9Sstevel@tonic-gate static void
9847c478bd9Sstevel@tonic-gate lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
9857c478bd9Sstevel@tonic-gate {
9867c478bd9Sstevel@tonic-gate 	lgrp_t *my_lgrp;
9877c478bd9Sstevel@tonic-gate 	struct cpu *prev;
9887c478bd9Sstevel@tonic-gate 	struct cpu *next;
9897c478bd9Sstevel@tonic-gate 	chip_t  *chp;
9907c478bd9Sstevel@tonic-gate 
9917c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
9927c478bd9Sstevel@tonic-gate 
9937c478bd9Sstevel@tonic-gate 	prev = cp->cpu_prev_lgrp;
9947c478bd9Sstevel@tonic-gate 	next = cp->cpu_next_lgrp;
9957c478bd9Sstevel@tonic-gate 
9967c478bd9Sstevel@tonic-gate 	prev->cpu_next_lgrp = next;
9977c478bd9Sstevel@tonic-gate 	next->cpu_prev_lgrp = prev;
9987c478bd9Sstevel@tonic-gate 
9997c478bd9Sstevel@tonic-gate 	/*
10007c478bd9Sstevel@tonic-gate 	 * just because I'm paranoid doesn't mean...
10017c478bd9Sstevel@tonic-gate 	 */
10027c478bd9Sstevel@tonic-gate 
10037c478bd9Sstevel@tonic-gate 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
10047c478bd9Sstevel@tonic-gate 
10057c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_table[lgrpid];
10067c478bd9Sstevel@tonic-gate 	my_lgrp->lgrp_cpucnt--;
10077c478bd9Sstevel@tonic-gate 
10087c478bd9Sstevel@tonic-gate 	/*
10097c478bd9Sstevel@tonic-gate 	 * If the last CPU on it's chip is being offlined
10107c478bd9Sstevel@tonic-gate 	 * then remove this chip from the per lgroup list.
10117c478bd9Sstevel@tonic-gate 	 *
10127c478bd9Sstevel@tonic-gate 	 * This is also done for the boot CPU when it needs
10137c478bd9Sstevel@tonic-gate 	 * to move between lgroups as a consequence of
10147c478bd9Sstevel@tonic-gate 	 * null proc lpa.
10157c478bd9Sstevel@tonic-gate 	 */
10167c478bd9Sstevel@tonic-gate 	chp = cp->cpu_chip;
10177c478bd9Sstevel@tonic-gate 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
10187c478bd9Sstevel@tonic-gate 
10197c478bd9Sstevel@tonic-gate 		chip_t	*chpp;
10207c478bd9Sstevel@tonic-gate 
10217c478bd9Sstevel@tonic-gate 		if (--my_lgrp->lgrp_chipcnt == 0)
10227c478bd9Sstevel@tonic-gate 			my_lgrp->lgrp_chips = NULL;
10237c478bd9Sstevel@tonic-gate 		else if (my_lgrp->lgrp_chips == chp)
10247c478bd9Sstevel@tonic-gate 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
10257c478bd9Sstevel@tonic-gate 
10267c478bd9Sstevel@tonic-gate 		/*
10277c478bd9Sstevel@tonic-gate 		 * Walk this lgroup's chip list looking for chips that
10287c478bd9Sstevel@tonic-gate 		 * may try to balance against the one that's leaving
10297c478bd9Sstevel@tonic-gate 		 */
10307c478bd9Sstevel@tonic-gate 		for (chpp = chp->chip_next_lgrp; chpp != chp;
10317c478bd9Sstevel@tonic-gate 		    chpp = chpp->chip_next_lgrp) {
10327c478bd9Sstevel@tonic-gate 			if (chpp->chip_balance == chp)
10337c478bd9Sstevel@tonic-gate 				chpp->chip_balance = chp->chip_next_lgrp;
10347c478bd9Sstevel@tonic-gate 		}
10357c478bd9Sstevel@tonic-gate 
10367c478bd9Sstevel@tonic-gate 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
10377c478bd9Sstevel@tonic-gate 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
10387c478bd9Sstevel@tonic-gate 
10397c478bd9Sstevel@tonic-gate 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
10407c478bd9Sstevel@tonic-gate 		chp->chip_lgrp = NULL;
10417c478bd9Sstevel@tonic-gate 		chp->chip_balance = NULL;
10427c478bd9Sstevel@tonic-gate 	}
10437c478bd9Sstevel@tonic-gate 
10447c478bd9Sstevel@tonic-gate 	/*
10457c478bd9Sstevel@tonic-gate 	 * Removing last CPU in lgroup, so update lgroup topology
10467c478bd9Sstevel@tonic-gate 	 */
10477c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_cpucnt == 0) {
10487c478bd9Sstevel@tonic-gate 		klgrpset_t	changed;
10497c478bd9Sstevel@tonic-gate 		int		count;
10507c478bd9Sstevel@tonic-gate 		int		i;
10517c478bd9Sstevel@tonic-gate 
10527c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_cpu = NULL;
10537c478bd9Sstevel@tonic-gate 
10547c478bd9Sstevel@tonic-gate 		/*
10557c478bd9Sstevel@tonic-gate 		 * Remove this lgroup from its lgroup CPU resources and remove
10567c478bd9Sstevel@tonic-gate 		 * lgroup from lgroup topology if it doesn't have any more
10577c478bd9Sstevel@tonic-gate 		 * resources in it now
10587c478bd9Sstevel@tonic-gate 		 */
10597c478bd9Sstevel@tonic-gate 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
10607c478bd9Sstevel@tonic-gate 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
10617c478bd9Sstevel@tonic-gate 			count = 0;
10627c478bd9Sstevel@tonic-gate 			klgrpset_clear(changed);
10637c478bd9Sstevel@tonic-gate 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
10647c478bd9Sstevel@tonic-gate 			    lgrp_alloc_max + 1, &changed);
10657c478bd9Sstevel@tonic-gate 			return;
10667c478bd9Sstevel@tonic-gate 		}
10677c478bd9Sstevel@tonic-gate 
10687c478bd9Sstevel@tonic-gate 		/*
10697c478bd9Sstevel@tonic-gate 		 * This lgroup isn't empty, so just remove it from CPU
10707c478bd9Sstevel@tonic-gate 		 * resources of any lgroups that contain it as such
10717c478bd9Sstevel@tonic-gate 		 */
10727c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
10737c478bd9Sstevel@tonic-gate 			lgrp_t		*lgrp;
10747c478bd9Sstevel@tonic-gate 
10757c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
10767c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
10777c478bd9Sstevel@tonic-gate 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
10787c478bd9Sstevel@tonic-gate 			    lgrpid))
10797c478bd9Sstevel@tonic-gate 				continue;
10807c478bd9Sstevel@tonic-gate 
10817c478bd9Sstevel@tonic-gate 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
10827c478bd9Sstevel@tonic-gate 		}
10837c478bd9Sstevel@tonic-gate 		return;
10847c478bd9Sstevel@tonic-gate 	}
10857c478bd9Sstevel@tonic-gate 
10867c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_cpu == cp)
10877c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_cpu = next;
10887c478bd9Sstevel@tonic-gate 
10897c478bd9Sstevel@tonic-gate }
10907c478bd9Sstevel@tonic-gate 
10917c478bd9Sstevel@tonic-gate /*
10927c478bd9Sstevel@tonic-gate  * Update memory nodes in target lgroups and return ones that get changed
10937c478bd9Sstevel@tonic-gate  */
10947c478bd9Sstevel@tonic-gate int
10957c478bd9Sstevel@tonic-gate lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
10967c478bd9Sstevel@tonic-gate {
10977c478bd9Sstevel@tonic-gate 	int	count;
10987c478bd9Sstevel@tonic-gate 	int	i;
10997c478bd9Sstevel@tonic-gate 	int	j;
11007c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp;
11017c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp_rsrc;
11027c478bd9Sstevel@tonic-gate 
11037c478bd9Sstevel@tonic-gate 	count = 0;
11047c478bd9Sstevel@tonic-gate 	if (changed)
11057c478bd9Sstevel@tonic-gate 		klgrpset_clear(*changed);
11067c478bd9Sstevel@tonic-gate 
11077c478bd9Sstevel@tonic-gate 	if (klgrpset_isempty(target))
11087c478bd9Sstevel@tonic-gate 		return (0);
11097c478bd9Sstevel@tonic-gate 
11107c478bd9Sstevel@tonic-gate 	/*
11117c478bd9Sstevel@tonic-gate 	 * Find each lgroup in target lgroups
11127c478bd9Sstevel@tonic-gate 	 */
11137c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
11147c478bd9Sstevel@tonic-gate 		/*
11157c478bd9Sstevel@tonic-gate 		 * Skip any lgroups that don't exist or aren't in target group
11167c478bd9Sstevel@tonic-gate 		 */
11177c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
11187c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
11197c478bd9Sstevel@tonic-gate 			continue;
11207c478bd9Sstevel@tonic-gate 		}
11217c478bd9Sstevel@tonic-gate 
11227c478bd9Sstevel@tonic-gate 		/*
11237c478bd9Sstevel@tonic-gate 		 * Initialize memnodes for intermediate lgroups to 0
11247c478bd9Sstevel@tonic-gate 		 * and update them from scratch since they may have completely
11257c478bd9Sstevel@tonic-gate 		 * changed
11267c478bd9Sstevel@tonic-gate 		 */
11277c478bd9Sstevel@tonic-gate 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
11287c478bd9Sstevel@tonic-gate 			lgrp->lgrp_mnodes = (mnodeset_t)0;
11297c478bd9Sstevel@tonic-gate 			lgrp->lgrp_nmnodes = 0;
11307c478bd9Sstevel@tonic-gate 		}
11317c478bd9Sstevel@tonic-gate 
11327c478bd9Sstevel@tonic-gate 		/*
11337c478bd9Sstevel@tonic-gate 		 * Update memory nodes of of target lgroup with memory nodes
11347c478bd9Sstevel@tonic-gate 		 * from each lgroup in its lgroup memory resource set
11357c478bd9Sstevel@tonic-gate 		 */
11367c478bd9Sstevel@tonic-gate 		for (j = 0; j <= lgrp_alloc_max; j++) {
11377c478bd9Sstevel@tonic-gate 			int	k;
11387c478bd9Sstevel@tonic-gate 
11397c478bd9Sstevel@tonic-gate 			/*
11407c478bd9Sstevel@tonic-gate 			 * Skip any lgroups that don't exist or aren't in
11417c478bd9Sstevel@tonic-gate 			 * memory resources of target lgroup
11427c478bd9Sstevel@tonic-gate 			 */
11437c478bd9Sstevel@tonic-gate 			lgrp_rsrc = lgrp_table[j];
11447c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp_rsrc) ||
11457c478bd9Sstevel@tonic-gate 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
11467c478bd9Sstevel@tonic-gate 			    j))
11477c478bd9Sstevel@tonic-gate 				continue;
11487c478bd9Sstevel@tonic-gate 
11497c478bd9Sstevel@tonic-gate 			/*
11507c478bd9Sstevel@tonic-gate 			 * Update target lgroup's memnodes to include memnodes
11517c478bd9Sstevel@tonic-gate 			 * of this lgroup
11527c478bd9Sstevel@tonic-gate 			 */
11537c478bd9Sstevel@tonic-gate 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
11547c478bd9Sstevel@tonic-gate 				mnodeset_t	mnode_mask;
11557c478bd9Sstevel@tonic-gate 
11567c478bd9Sstevel@tonic-gate 				mnode_mask = (mnodeset_t)1 << k;
11577c478bd9Sstevel@tonic-gate 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
11587c478bd9Sstevel@tonic-gate 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
11597c478bd9Sstevel@tonic-gate 					lgrp->lgrp_mnodes |= mnode_mask;
11607c478bd9Sstevel@tonic-gate 					lgrp->lgrp_nmnodes++;
11617c478bd9Sstevel@tonic-gate 				}
11627c478bd9Sstevel@tonic-gate 			}
11637c478bd9Sstevel@tonic-gate 			count++;
11647c478bd9Sstevel@tonic-gate 			if (changed)
11657c478bd9Sstevel@tonic-gate 				klgrpset_add(*changed, lgrp->lgrp_id);
11667c478bd9Sstevel@tonic-gate 		}
11677c478bd9Sstevel@tonic-gate 	}
11687c478bd9Sstevel@tonic-gate 
11697c478bd9Sstevel@tonic-gate 	return (count);
11707c478bd9Sstevel@tonic-gate }
11717c478bd9Sstevel@tonic-gate 
11727c478bd9Sstevel@tonic-gate /*
11737c478bd9Sstevel@tonic-gate  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
11747c478bd9Sstevel@tonic-gate  * is moved from one board to another. The "from" and "to" arguments specify the
11757c478bd9Sstevel@tonic-gate  * source and the destination of the move.
11767c478bd9Sstevel@tonic-gate  *
11777c478bd9Sstevel@tonic-gate  * See plat_lgrp_config() for a detailed description of the copy-rename
11787c478bd9Sstevel@tonic-gate  * semantics.
11797c478bd9Sstevel@tonic-gate  *
11807c478bd9Sstevel@tonic-gate  * The lgrp_mem_rename() is called by the platform copy-rename code to update
11817c478bd9Sstevel@tonic-gate  * the lgroup topology which is changing as memory moves from one lgroup to
11827c478bd9Sstevel@tonic-gate  * another. It removes the mnode from the source lgroup and re-inserts it in the
11837c478bd9Sstevel@tonic-gate  * target lgroup.
11847c478bd9Sstevel@tonic-gate  *
11857c478bd9Sstevel@tonic-gate  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
11867c478bd9Sstevel@tonic-gate  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
11877c478bd9Sstevel@tonic-gate  * copy-rename operation.
11887c478bd9Sstevel@tonic-gate  *
11897c478bd9Sstevel@tonic-gate  * There is one case which requires special handling. If the system contains
11907c478bd9Sstevel@tonic-gate  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
11917c478bd9Sstevel@tonic-gate  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
11927c478bd9Sstevel@tonic-gate  * lgrp_mem_init), but there is a window when the system has no memory in the
11937c478bd9Sstevel@tonic-gate  * lgroup hierarchy. If another thread tries to allocate memory during this
11947c478bd9Sstevel@tonic-gate  * window, the allocation will fail, although the system has physical memory.
11957c478bd9Sstevel@tonic-gate  * This may cause a system panic or a deadlock (some sleeping memory allocations
11967c478bd9Sstevel@tonic-gate  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
11977c478bd9Sstevel@tonic-gate  * the mnode back).
11987c478bd9Sstevel@tonic-gate  *
11997c478bd9Sstevel@tonic-gate  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
12007c478bd9Sstevel@tonic-gate  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
12017c478bd9Sstevel@tonic-gate  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
12027c478bd9Sstevel@tonic-gate  * but it updates the rest of the lgroup topology as if the mnode was actually
12037c478bd9Sstevel@tonic-gate  * removed. The lgrp_mem_init() function recognizes that the mnode being
12047c478bd9Sstevel@tonic-gate  * inserted represents such a special case and updates the topology
12057c478bd9Sstevel@tonic-gate  * appropriately.
12067c478bd9Sstevel@tonic-gate  */
12077c478bd9Sstevel@tonic-gate void
12087c478bd9Sstevel@tonic-gate lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
12097c478bd9Sstevel@tonic-gate {
12107c478bd9Sstevel@tonic-gate 	/*
12117c478bd9Sstevel@tonic-gate 	 * Remove the memory from the source node and add it to the destination
12127c478bd9Sstevel@tonic-gate 	 * node.
12137c478bd9Sstevel@tonic-gate 	 */
12147c478bd9Sstevel@tonic-gate 	lgrp_mem_fini(mnode, from, B_TRUE);
12157c478bd9Sstevel@tonic-gate 	lgrp_mem_init(mnode, to, B_TRUE);
12167c478bd9Sstevel@tonic-gate }
12177c478bd9Sstevel@tonic-gate 
12187c478bd9Sstevel@tonic-gate /*
12197c478bd9Sstevel@tonic-gate  * Called to indicate that the lgrp with platform handle "hand" now
12207c478bd9Sstevel@tonic-gate  * contains the memory identified by "mnode".
12217c478bd9Sstevel@tonic-gate  *
12227c478bd9Sstevel@tonic-gate  * LOCKING for this routine is a bit tricky. Usually it is called without
12237c478bd9Sstevel@tonic-gate  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
12247c478bd9Sstevel@tonic-gate  * callers. During DR of the board containing the caged memory it may be called
12257c478bd9Sstevel@tonic-gate  * with cpu_lock already held and CPUs paused.
12267c478bd9Sstevel@tonic-gate  *
12277c478bd9Sstevel@tonic-gate  * If the insertion is part of the DR copy-rename and the inserted mnode (and
12287c478bd9Sstevel@tonic-gate  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
12297c478bd9Sstevel@tonic-gate  * dealing with the special case of DR copy-rename described in
12307c478bd9Sstevel@tonic-gate  * lgrp_mem_rename().
12317c478bd9Sstevel@tonic-gate  */
12327c478bd9Sstevel@tonic-gate void
12337c478bd9Sstevel@tonic-gate lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
12347c478bd9Sstevel@tonic-gate {
12357c478bd9Sstevel@tonic-gate 	klgrpset_t	changed;
12367c478bd9Sstevel@tonic-gate 	int		count;
12377c478bd9Sstevel@tonic-gate 	int		i;
12387c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
12397c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
12407c478bd9Sstevel@tonic-gate 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
12417c478bd9Sstevel@tonic-gate 	boolean_t	drop_lock = B_FALSE;
12427c478bd9Sstevel@tonic-gate 	boolean_t	need_synch = B_FALSE;
12437c478bd9Sstevel@tonic-gate 
12447c478bd9Sstevel@tonic-gate 	/*
12457c478bd9Sstevel@tonic-gate 	 * Grab CPU lock (if we haven't already)
12467c478bd9Sstevel@tonic-gate 	 */
12477c478bd9Sstevel@tonic-gate 	if (!MUTEX_HELD(&cpu_lock)) {
12487c478bd9Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
12497c478bd9Sstevel@tonic-gate 		drop_lock = B_TRUE;
12507c478bd9Sstevel@tonic-gate 	}
12517c478bd9Sstevel@tonic-gate 
12527c478bd9Sstevel@tonic-gate 	/*
12537c478bd9Sstevel@tonic-gate 	 * This routine may be called from a context where we already
12547c478bd9Sstevel@tonic-gate 	 * hold cpu_lock, and have already paused cpus.
12557c478bd9Sstevel@tonic-gate 	 */
12567c478bd9Sstevel@tonic-gate 	if (!cpus_paused())
12577c478bd9Sstevel@tonic-gate 		need_synch = B_TRUE;
12587c478bd9Sstevel@tonic-gate 
12597c478bd9Sstevel@tonic-gate 	/*
12607c478bd9Sstevel@tonic-gate 	 * Check if this mnode is already configured and return immediately if
12617c478bd9Sstevel@tonic-gate 	 * it is.
12627c478bd9Sstevel@tonic-gate 	 *
12637c478bd9Sstevel@tonic-gate 	 * NOTE: in special case of copy-rename of the only remaining mnode,
12647c478bd9Sstevel@tonic-gate 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
12657c478bd9Sstevel@tonic-gate 	 * recognize this case and continue as usual, but skip the update to
12667c478bd9Sstevel@tonic-gate 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
12677c478bd9Sstevel@tonic-gate 	 * in topology, temporarily introduced by lgrp_mem_fini().
12687c478bd9Sstevel@tonic-gate 	 */
12697c478bd9Sstevel@tonic-gate 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
12707c478bd9Sstevel@tonic-gate 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
12717c478bd9Sstevel@tonic-gate 		if (drop_lock)
12727c478bd9Sstevel@tonic-gate 			mutex_exit(&cpu_lock);
12737c478bd9Sstevel@tonic-gate 		return;
12747c478bd9Sstevel@tonic-gate 	}
12757c478bd9Sstevel@tonic-gate 
12767c478bd9Sstevel@tonic-gate 	/*
12777c478bd9Sstevel@tonic-gate 	 * Update lgroup topology with new memory resources, keeping track of
12787c478bd9Sstevel@tonic-gate 	 * which lgroups change
12797c478bd9Sstevel@tonic-gate 	 */
12807c478bd9Sstevel@tonic-gate 	count = 0;
12817c478bd9Sstevel@tonic-gate 	klgrpset_clear(changed);
12827c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_hand_to_lgrp(hand);
12837c478bd9Sstevel@tonic-gate 	if (my_lgrp == NULL) {
12847c478bd9Sstevel@tonic-gate 		/* new lgrp */
12857c478bd9Sstevel@tonic-gate 		my_lgrp = lgrp_create();
12867c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
12877c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_plathand = hand;
12887c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
12897c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
12907c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
12917c478bd9Sstevel@tonic-gate 
12927c478bd9Sstevel@tonic-gate 		if (need_synch)
12937c478bd9Sstevel@tonic-gate 			pause_cpus(NULL);
12947c478bd9Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
12957c478bd9Sstevel@tonic-gate 		    &changed);
12967c478bd9Sstevel@tonic-gate 		if (need_synch)
12977c478bd9Sstevel@tonic-gate 			start_cpus();
12987c478bd9Sstevel@tonic-gate 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
12997c478bd9Sstevel@tonic-gate 	    > 0) {
13007c478bd9Sstevel@tonic-gate 		/*
13017c478bd9Sstevel@tonic-gate 		 * Leaf lgroup was created, but latency wasn't available
13027c478bd9Sstevel@tonic-gate 		 * then.  So, set latency for it and fill in rest of lgroup
13037c478bd9Sstevel@tonic-gate 		 * topology  now that we know how far it is from other leaf
13047c478bd9Sstevel@tonic-gate 		 * lgroups.
13057c478bd9Sstevel@tonic-gate 		 */
13067c478bd9Sstevel@tonic-gate 		klgrpset_clear(changed);
13077c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
13087c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
13097c478bd9Sstevel@tonic-gate 		    lgrpid))
13107c478bd9Sstevel@tonic-gate 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13117c478bd9Sstevel@tonic-gate 		if (need_synch)
13127c478bd9Sstevel@tonic-gate 			pause_cpus(NULL);
13137c478bd9Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
13147c478bd9Sstevel@tonic-gate 		    &changed);
13157c478bd9Sstevel@tonic-gate 		if (need_synch)
13167c478bd9Sstevel@tonic-gate 			start_cpus();
13177c478bd9Sstevel@tonic-gate 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
13187c478bd9Sstevel@tonic-gate 	    my_lgrp->lgrp_id)) {
13192dae3fb5Sjjc 		/*
13202dae3fb5Sjjc 		 * Add new lgroup memory resource to existing lgroup
13212dae3fb5Sjjc 		 */
13227c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
13237c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13247c478bd9Sstevel@tonic-gate 		klgrpset_add(changed, lgrpid);
13257c478bd9Sstevel@tonic-gate 		count++;
13267c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
13277c478bd9Sstevel@tonic-gate 			lgrp_t		*lgrp;
13287c478bd9Sstevel@tonic-gate 
13297c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
13307c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
13317c478bd9Sstevel@tonic-gate 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
13327c478bd9Sstevel@tonic-gate 				continue;
13337c478bd9Sstevel@tonic-gate 
13347c478bd9Sstevel@tonic-gate 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13357c478bd9Sstevel@tonic-gate 			klgrpset_add(changed, lgrp->lgrp_id);
13367c478bd9Sstevel@tonic-gate 			count++;
13377c478bd9Sstevel@tonic-gate 		}
13387c478bd9Sstevel@tonic-gate 	}
13397c478bd9Sstevel@tonic-gate 
13407c478bd9Sstevel@tonic-gate 	/*
13417c478bd9Sstevel@tonic-gate 	 * Add memory node to lgroup and remove lgroup from ones that need
13427c478bd9Sstevel@tonic-gate 	 * to be updated
13437c478bd9Sstevel@tonic-gate 	 */
13447c478bd9Sstevel@tonic-gate 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
13457c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_mnodes |= mnodes_mask;
13467c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_nmnodes++;
13477c478bd9Sstevel@tonic-gate 	}
13487c478bd9Sstevel@tonic-gate 	klgrpset_del(changed, lgrpid);
13497c478bd9Sstevel@tonic-gate 
13507c478bd9Sstevel@tonic-gate 	/*
13517c478bd9Sstevel@tonic-gate 	 * Update memory node information for all lgroups that changed and
13527c478bd9Sstevel@tonic-gate 	 * contain new memory node as a resource
13537c478bd9Sstevel@tonic-gate 	 */
13547c478bd9Sstevel@tonic-gate 	if (count)
13557c478bd9Sstevel@tonic-gate 		(void) lgrp_mnode_update(changed, NULL);
13567c478bd9Sstevel@tonic-gate 
13577c478bd9Sstevel@tonic-gate 	if (drop_lock)
13587c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
13597c478bd9Sstevel@tonic-gate }
13607c478bd9Sstevel@tonic-gate 
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;	/* lgroups whose topology changed */
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;	/* did we take cpu_lock? */
	boolean_t	need_synch = B_FALSE;	/* must we pause CPUs? */

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	/* The root must always retain some memory (see block comment above) */
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename &&
		(my_lgrp == lgrp_root) &&
		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources.  CPUs must be paused
		 * (if they aren't already) while the leaf is unlinked from
		 * the topology.
		 */
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}
15007c478bd9Sstevel@tonic-gate 
15017c478bd9Sstevel@tonic-gate /*
15027c478bd9Sstevel@tonic-gate  * Return lgroup with given platform handle
15037c478bd9Sstevel@tonic-gate  */
15047c478bd9Sstevel@tonic-gate lgrp_t *
15057c478bd9Sstevel@tonic-gate lgrp_hand_to_lgrp(lgrp_handle_t hand)
15067c478bd9Sstevel@tonic-gate {
15077c478bd9Sstevel@tonic-gate 	int	i;
15087c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp;
15097c478bd9Sstevel@tonic-gate 
15107c478bd9Sstevel@tonic-gate 	if (hand == LGRP_NULL_HANDLE)
15117c478bd9Sstevel@tonic-gate 		return (NULL);
15127c478bd9Sstevel@tonic-gate 
15137c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
15147c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
15157c478bd9Sstevel@tonic-gate 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
15167c478bd9Sstevel@tonic-gate 			return (lgrp);
15177c478bd9Sstevel@tonic-gate 	}
15187c478bd9Sstevel@tonic-gate 	return (NULL);
15197c478bd9Sstevel@tonic-gate }
15207c478bd9Sstevel@tonic-gate 
15217c478bd9Sstevel@tonic-gate /*
15227c478bd9Sstevel@tonic-gate  * Return the home lgroup of the current thread.
15237c478bd9Sstevel@tonic-gate  * We must do this with kernel preemption disabled, since we don't want our
15247c478bd9Sstevel@tonic-gate  * thread to be re-homed while we're poking around with its lpl, and the lpl
15257c478bd9Sstevel@tonic-gate  * should never be NULL.
15267c478bd9Sstevel@tonic-gate  *
15277c478bd9Sstevel@tonic-gate  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
15287c478bd9Sstevel@tonic-gate  * is enabled because of DR.  Callers can use disable kernel preemption
15297c478bd9Sstevel@tonic-gate  * around this call to guarantee that the lgroup will be valid beyond this
15307c478bd9Sstevel@tonic-gate  * routine, since kernel preemption can be recursive.
15317c478bd9Sstevel@tonic-gate  */
15327c478bd9Sstevel@tonic-gate lgrp_t *
15337c478bd9Sstevel@tonic-gate lgrp_home_lgrp(void)
15347c478bd9Sstevel@tonic-gate {
15357c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp;
15367c478bd9Sstevel@tonic-gate 	lpl_t	*lpl;
15377c478bd9Sstevel@tonic-gate 
15387c478bd9Sstevel@tonic-gate 	kpreempt_disable();
15397c478bd9Sstevel@tonic-gate 
15407c478bd9Sstevel@tonic-gate 	lpl = curthread->t_lpl;
15417c478bd9Sstevel@tonic-gate 	ASSERT(lpl != NULL);
15427c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
15437c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
15447c478bd9Sstevel@tonic-gate 	lgrp = lgrp_table[lpl->lpl_lgrpid];
15457c478bd9Sstevel@tonic-gate 
15467c478bd9Sstevel@tonic-gate 	kpreempt_enable();
15477c478bd9Sstevel@tonic-gate 
15487c478bd9Sstevel@tonic-gate 	return (lgrp);
15497c478bd9Sstevel@tonic-gate }
15507c478bd9Sstevel@tonic-gate 
15517c478bd9Sstevel@tonic-gate /*
15527c478bd9Sstevel@tonic-gate  * Return ID of home lgroup for given thread
15537c478bd9Sstevel@tonic-gate  * (See comments for lgrp_home_lgrp() for special care and handling
15547c478bd9Sstevel@tonic-gate  * instructions)
15557c478bd9Sstevel@tonic-gate  */
15567c478bd9Sstevel@tonic-gate lgrp_id_t
15577c478bd9Sstevel@tonic-gate lgrp_home_id(kthread_t *t)
15587c478bd9Sstevel@tonic-gate {
15597c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrp;
15607c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
15617c478bd9Sstevel@tonic-gate 
15627c478bd9Sstevel@tonic-gate 	ASSERT(t != NULL);
15637c478bd9Sstevel@tonic-gate 	/*
15647c478bd9Sstevel@tonic-gate 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
15657c478bd9Sstevel@tonic-gate 	 * cannot since the HAT layer can call into this routine to
15667c478bd9Sstevel@tonic-gate 	 * determine the locality for its data structures in the context
15677c478bd9Sstevel@tonic-gate 	 * of a page fault.
15687c478bd9Sstevel@tonic-gate 	 */
15697c478bd9Sstevel@tonic-gate 
15707c478bd9Sstevel@tonic-gate 	kpreempt_disable();
15717c478bd9Sstevel@tonic-gate 
15727c478bd9Sstevel@tonic-gate 	lpl = t->t_lpl;
15737c478bd9Sstevel@tonic-gate 	ASSERT(lpl != NULL);
15747c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
15757c478bd9Sstevel@tonic-gate 	lgrp = lpl->lpl_lgrpid;
15767c478bd9Sstevel@tonic-gate 
15777c478bd9Sstevel@tonic-gate 	kpreempt_enable();
15787c478bd9Sstevel@tonic-gate 
15797c478bd9Sstevel@tonic-gate 	return (lgrp);
15807c478bd9Sstevel@tonic-gate }
15817c478bd9Sstevel@tonic-gate 
15827c478bd9Sstevel@tonic-gate /*
15837c478bd9Sstevel@tonic-gate  * Return lgroup containing the physical memory for the given page frame number
15847c478bd9Sstevel@tonic-gate  */
15857c478bd9Sstevel@tonic-gate lgrp_t *
15867c478bd9Sstevel@tonic-gate lgrp_pfn_to_lgrp(pfn_t pfn)
15877c478bd9Sstevel@tonic-gate {
15887c478bd9Sstevel@tonic-gate 	lgrp_handle_t	hand;
15897c478bd9Sstevel@tonic-gate 	int		i;
15907c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
15917c478bd9Sstevel@tonic-gate 
15927c478bd9Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
15937c478bd9Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
15947c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
15957c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
15967c478bd9Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
15977c478bd9Sstevel@tonic-gate 				return (lgrp);
15987c478bd9Sstevel@tonic-gate 		}
15997c478bd9Sstevel@tonic-gate 	return (NULL);
16007c478bd9Sstevel@tonic-gate }
16017c478bd9Sstevel@tonic-gate 
16027c478bd9Sstevel@tonic-gate /*
16037c478bd9Sstevel@tonic-gate  * Return lgroup containing the physical memory for the given page frame number
16047c478bd9Sstevel@tonic-gate  */
16057c478bd9Sstevel@tonic-gate lgrp_t *
16067c478bd9Sstevel@tonic-gate lgrp_phys_to_lgrp(u_longlong_t physaddr)
16077c478bd9Sstevel@tonic-gate {
16087c478bd9Sstevel@tonic-gate 	lgrp_handle_t	hand;
16097c478bd9Sstevel@tonic-gate 	int		i;
16107c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
16117c478bd9Sstevel@tonic-gate 	pfn_t		pfn;
16127c478bd9Sstevel@tonic-gate 
16137c478bd9Sstevel@tonic-gate 	pfn = btop(physaddr);
16147c478bd9Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
16157c478bd9Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
16167c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
16177c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
16187c478bd9Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
16197c478bd9Sstevel@tonic-gate 				return (lgrp);
16207c478bd9Sstevel@tonic-gate 		}
16217c478bd9Sstevel@tonic-gate 	return (NULL);
16227c478bd9Sstevel@tonic-gate }
16237c478bd9Sstevel@tonic-gate 
16247c478bd9Sstevel@tonic-gate /*
16257c478bd9Sstevel@tonic-gate  * Return the leaf lgroup containing the given CPU
1626394b433dSesaxe  *
1627394b433dSesaxe  * The caller needs to take precautions necessary to prevent
1628394b433dSesaxe  * "cpu" from going away across a call to this function.
1629394b433dSesaxe  * hint: kpreempt_disable()/kpreempt_enable()
16307c478bd9Sstevel@tonic-gate  */
16317c478bd9Sstevel@tonic-gate static lgrp_t *
16327c478bd9Sstevel@tonic-gate lgrp_cpu_to_lgrp(cpu_t *cpu)
16337c478bd9Sstevel@tonic-gate {
1634ab761399Sesaxe 	return (cpu->cpu_lpl->lpl_lgrp);
16357c478bd9Sstevel@tonic-gate }
16367c478bd9Sstevel@tonic-gate 
16377c478bd9Sstevel@tonic-gate /*
16387c478bd9Sstevel@tonic-gate  * Return the sum of the partition loads in an lgrp divided by
16397c478bd9Sstevel@tonic-gate  * the number of CPUs in the lgrp.  This is our best approximation
16407c478bd9Sstevel@tonic-gate  * of an 'lgroup load average' for a useful per-lgroup kstat.
16417c478bd9Sstevel@tonic-gate  */
16427c478bd9Sstevel@tonic-gate static uint64_t
16437c478bd9Sstevel@tonic-gate lgrp_sum_loadavgs(lgrp_t *lgrp)
16447c478bd9Sstevel@tonic-gate {
16457c478bd9Sstevel@tonic-gate 	cpu_t *cpu;
16467c478bd9Sstevel@tonic-gate 	int ncpu;
16477c478bd9Sstevel@tonic-gate 	uint64_t loads = 0;
16487c478bd9Sstevel@tonic-gate 
16497c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
16507c478bd9Sstevel@tonic-gate 
16517c478bd9Sstevel@tonic-gate 	cpu = lgrp->lgrp_cpu;
16527c478bd9Sstevel@tonic-gate 	ncpu = lgrp->lgrp_cpucnt;
16537c478bd9Sstevel@tonic-gate 
16547c478bd9Sstevel@tonic-gate 	if (cpu == NULL || ncpu == 0) {
16557c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
16567c478bd9Sstevel@tonic-gate 		return (0ull);
16577c478bd9Sstevel@tonic-gate 	}
16587c478bd9Sstevel@tonic-gate 
16597c478bd9Sstevel@tonic-gate 	do {
16607c478bd9Sstevel@tonic-gate 		loads += cpu->cpu_lpl->lpl_loadavg;
16617c478bd9Sstevel@tonic-gate 		cpu = cpu->cpu_next_lgrp;
16627c478bd9Sstevel@tonic-gate 	} while (cpu != lgrp->lgrp_cpu);
16637c478bd9Sstevel@tonic-gate 
16647c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
16657c478bd9Sstevel@tonic-gate 
16667c478bd9Sstevel@tonic-gate 	return (loads / ncpu);
16677c478bd9Sstevel@tonic-gate }
16687c478bd9Sstevel@tonic-gate 
16697c478bd9Sstevel@tonic-gate void
16707c478bd9Sstevel@tonic-gate lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
16717c478bd9Sstevel@tonic-gate {
16727c478bd9Sstevel@tonic-gate 	struct lgrp_stats *pstats;
16737c478bd9Sstevel@tonic-gate 
16747c478bd9Sstevel@tonic-gate 	/*
16757c478bd9Sstevel@tonic-gate 	 * Verify that the caller isn't trying to add to
16767c478bd9Sstevel@tonic-gate 	 * a statistic for an lgroup that has gone away
16777c478bd9Sstevel@tonic-gate 	 */
16787c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
16797c478bd9Sstevel@tonic-gate 		return;
16807c478bd9Sstevel@tonic-gate 
16817c478bd9Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
16827c478bd9Sstevel@tonic-gate 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
16837c478bd9Sstevel@tonic-gate }
16847c478bd9Sstevel@tonic-gate 
16857c478bd9Sstevel@tonic-gate int64_t
16867c478bd9Sstevel@tonic-gate lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
16877c478bd9Sstevel@tonic-gate {
16887c478bd9Sstevel@tonic-gate 	uint64_t val;
16897c478bd9Sstevel@tonic-gate 	struct lgrp_stats *pstats;
16907c478bd9Sstevel@tonic-gate 
16917c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
16927c478bd9Sstevel@tonic-gate 		return ((int64_t)0);
16937c478bd9Sstevel@tonic-gate 
16947c478bd9Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
16957c478bd9Sstevel@tonic-gate 	LGRP_STAT_READ(pstats, stat, val);
16967c478bd9Sstevel@tonic-gate 	return (val);
16977c478bd9Sstevel@tonic-gate }
16987c478bd9Sstevel@tonic-gate 
16997c478bd9Sstevel@tonic-gate /*
17007c478bd9Sstevel@tonic-gate  * Reset all kstats for lgrp specified by its lgrpid.
17017c478bd9Sstevel@tonic-gate  */
17027c478bd9Sstevel@tonic-gate static void
17037c478bd9Sstevel@tonic-gate lgrp_kstat_reset(lgrp_id_t lgrpid)
17047c478bd9Sstevel@tonic-gate {
17057c478bd9Sstevel@tonic-gate 	lgrp_stat_t stat;
17067c478bd9Sstevel@tonic-gate 
17077c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
17087c478bd9Sstevel@tonic-gate 		return;
17097c478bd9Sstevel@tonic-gate 
17107c478bd9Sstevel@tonic-gate 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
17117c478bd9Sstevel@tonic-gate 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
17127c478bd9Sstevel@tonic-gate 	}
17137c478bd9Sstevel@tonic-gate }
17147c478bd9Sstevel@tonic-gate 
/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 *
 * Layout of ks_data: the first LGRP_NUM_COUNTER_STATS entries are the
 * running counters; the kernel-data snapshot entries follow, indexed
 * relative to the end of the counter section (hence the "stat + ..."
 * arithmetic below, which relies on stat holding LGRP_NUM_COUNTER_STATS
 * once the counter loop completes).
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t		stat;
	struct kstat_named	*ksd;
	lgrp_t			*lgrp;
	lgrp_id_t		lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		/* stat == LGRP_NUM_COUNTER_STATS here; zero the snapshots */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
	} else {
		/* KSTAT_WRITE on a live lgrp: reset all of its counters */
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}
17747c478bd9Sstevel@tonic-gate 
17757c478bd9Sstevel@tonic-gate int
17767c478bd9Sstevel@tonic-gate lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
17777c478bd9Sstevel@tonic-gate {
17787c478bd9Sstevel@tonic-gate 	cpu_t	*cp;
17797c478bd9Sstevel@tonic-gate 
17807c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
17817c478bd9Sstevel@tonic-gate 
17827c478bd9Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
17837c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
17847c478bd9Sstevel@tonic-gate 		return (EINVAL);
17857c478bd9Sstevel@tonic-gate 	}
17867c478bd9Sstevel@tonic-gate 
17877c478bd9Sstevel@tonic-gate 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
17887c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
17897c478bd9Sstevel@tonic-gate 		return (EINVAL);
17907c478bd9Sstevel@tonic-gate 	}
17917c478bd9Sstevel@tonic-gate 
17927c478bd9Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
17937c478bd9Sstevel@tonic-gate 
17947c478bd9Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_lgrpid;
17957c478bd9Sstevel@tonic-gate 
17967c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
17977c478bd9Sstevel@tonic-gate 
17987c478bd9Sstevel@tonic-gate 	return (0);
17997c478bd9Sstevel@tonic-gate }
18007c478bd9Sstevel@tonic-gate 
18017c478bd9Sstevel@tonic-gate int
18027c478bd9Sstevel@tonic-gate lgrp_query_load(processorid_t id, lgrp_load_t *lp)
18037c478bd9Sstevel@tonic-gate {
18047c478bd9Sstevel@tonic-gate 	cpu_t *cp;
18057c478bd9Sstevel@tonic-gate 
18067c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
18077c478bd9Sstevel@tonic-gate 
18087c478bd9Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
18097c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
18107c478bd9Sstevel@tonic-gate 		return (EINVAL);
18117c478bd9Sstevel@tonic-gate 	}
18127c478bd9Sstevel@tonic-gate 
18137c478bd9Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
18147c478bd9Sstevel@tonic-gate 
18157c478bd9Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_loadavg;
18167c478bd9Sstevel@tonic-gate 
18177c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
18187c478bd9Sstevel@tonic-gate 
18197c478bd9Sstevel@tonic-gate 	return (0);
18207c478bd9Sstevel@tonic-gate }
18217c478bd9Sstevel@tonic-gate 
18227c478bd9Sstevel@tonic-gate void
18237c478bd9Sstevel@tonic-gate lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
18247c478bd9Sstevel@tonic-gate {
18257c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
18267c478bd9Sstevel@tonic-gate 	int		i;
18277c478bd9Sstevel@tonic-gate 
18287c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
18297c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
18307c478bd9Sstevel@tonic-gate 
18317c478bd9Sstevel@tonic-gate 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
18327c478bd9Sstevel@tonic-gate 			lgrp->lgrp_latency = (int)newtime;
18337c478bd9Sstevel@tonic-gate 	}
18347c478bd9Sstevel@tonic-gate }
18357c478bd9Sstevel@tonic-gate 
/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL.  (This list is required to be NULL
 * terminated, too).  This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order.  We hope this
 * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/*
	 * Return if leaf is already present; otherwise stop at the first
	 * entry with a larger lgroup id, which is the sorted insertion slot.
	 */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	/* i now indexes one past the last valid entry (the old nrset) */
	i = lpl_target->lpl_nrset++;
	/* overflow check runs before any rset slot is written below */
	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
		panic("More leaf lgrps in system than are supported!\n");
	}

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 */

	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	/* account for every CPU the new leaf contributes */
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}
18897c478bd9Sstevel@tonic-gate 
18907c478bd9Sstevel@tonic-gate /*
18917c478bd9Sstevel@tonic-gate  * Update each of lpl_parent's children with a proper hint and
18927c478bd9Sstevel@tonic-gate  * a reference to their parent.
18937c478bd9Sstevel@tonic-gate  * The lgrp topology is used as the reference since it is fully
18947c478bd9Sstevel@tonic-gate  * consistent and correct at this point.
18957c478bd9Sstevel@tonic-gate  *
18967c478bd9Sstevel@tonic-gate  * Each child's hint will reference an element in lpl_parent's
18977c478bd9Sstevel@tonic-gate  * rset that designates where the child should start searching
18987c478bd9Sstevel@tonic-gate  * for CPU resources. The hint selected is the highest order leaf present
18997c478bd9Sstevel@tonic-gate  * in the child's lineage.
19007c478bd9Sstevel@tonic-gate  *
19017c478bd9Sstevel@tonic-gate  * This should be called after any potential change in lpl_parent's
19027c478bd9Sstevel@tonic-gate  * rset.
19037c478bd9Sstevel@tonic-gate  */
19047c478bd9Sstevel@tonic-gate static void
19057c478bd9Sstevel@tonic-gate lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
19067c478bd9Sstevel@tonic-gate {
19077c478bd9Sstevel@tonic-gate 	klgrpset_t	children, leaves;
19087c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
19097c478bd9Sstevel@tonic-gate 	int		hint;
19107c478bd9Sstevel@tonic-gate 	int		i, j;
19117c478bd9Sstevel@tonic-gate 
19127c478bd9Sstevel@tonic-gate 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
19137c478bd9Sstevel@tonic-gate 	if (klgrpset_isempty(children))
19147c478bd9Sstevel@tonic-gate 		return; /* nothing to do */
19157c478bd9Sstevel@tonic-gate 
19167c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
19177c478bd9Sstevel@tonic-gate 		if (klgrpset_ismember(children, i)) {
19187c478bd9Sstevel@tonic-gate 
19197c478bd9Sstevel@tonic-gate 			/*
19207c478bd9Sstevel@tonic-gate 			 * Given the set of leaves in this child's lineage,
19217c478bd9Sstevel@tonic-gate 			 * find the highest order leaf present in the parent's
19227c478bd9Sstevel@tonic-gate 			 * rset. Select this as the hint for the child.
19237c478bd9Sstevel@tonic-gate 			 */
19247c478bd9Sstevel@tonic-gate 			leaves = lgrp_table[i]->lgrp_leaves;
19257c478bd9Sstevel@tonic-gate 			hint = 0;
19267c478bd9Sstevel@tonic-gate 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
19277c478bd9Sstevel@tonic-gate 				lpl = lpl_parent->lpl_rset[j];
19287c478bd9Sstevel@tonic-gate 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
19297c478bd9Sstevel@tonic-gate 					hint = j;
19307c478bd9Sstevel@tonic-gate 			}
19317c478bd9Sstevel@tonic-gate 			cp->cp_lgrploads[i].lpl_hint = hint;
19327c478bd9Sstevel@tonic-gate 
19337c478bd9Sstevel@tonic-gate 			/*
19347c478bd9Sstevel@tonic-gate 			 * (Re)set the parent. It may be incorrect if
19357c478bd9Sstevel@tonic-gate 			 * lpl_parent is new in the topology.
19367c478bd9Sstevel@tonic-gate 			 */
19377c478bd9Sstevel@tonic-gate 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
19387c478bd9Sstevel@tonic-gate 		}
19397c478bd9Sstevel@tonic-gate 	}
19407c478bd9Sstevel@tonic-gate }
19417c478bd9Sstevel@tonic-gate 
/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/*
	 * Return if leaf not found.  If the loop ran off the end, i equals
	 * nrset and this reads the rset's NULL terminator slot, which can
	 * never match lpl_leaf.
	 */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	/*
	 * NOTE(review): ncpu is decremented by exactly one here, whereas
	 * lpl_rset_add() adds the leaf's full lpl_ncpu.  Presumably a leaf
	 * is only removed once it is down to its last CPU — confirm with
	 * callers before relying on this.
	 */
	lpl_target->lpl_ncpu--;
	/*
	 * Shift the remaining entries (including the NULL terminator) down
	 * one slot; the loop body runs once for i == the new nrset, which
	 * copies the terminator into place.
	 */
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}
19727c478bd9Sstevel@tonic-gate 
19737c478bd9Sstevel@tonic-gate /*
19747c478bd9Sstevel@tonic-gate  * Check to see if the resource set of the target lpl contains the
19757c478bd9Sstevel@tonic-gate  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
19767c478bd9Sstevel@tonic-gate  */
19777c478bd9Sstevel@tonic-gate 
19787c478bd9Sstevel@tonic-gate int
19797c478bd9Sstevel@tonic-gate lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
19807c478bd9Sstevel@tonic-gate {
19817c478bd9Sstevel@tonic-gate 	int i;
19827c478bd9Sstevel@tonic-gate 
19837c478bd9Sstevel@tonic-gate 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
19847c478bd9Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i] == lpl_leaf)
19857c478bd9Sstevel@tonic-gate 			return (1);
19867c478bd9Sstevel@tonic-gate 	}
19877c478bd9Sstevel@tonic-gate 
19887c478bd9Sstevel@tonic-gate 	return (0);
19897c478bd9Sstevel@tonic-gate }
19907c478bd9Sstevel@tonic-gate 
19917c478bd9Sstevel@tonic-gate /*
19927c478bd9Sstevel@tonic-gate  * Called when we change cpu lpl membership.  This increments or decrements the
19937c478bd9Sstevel@tonic-gate  * per-cpu counter in every lpl in which our leaf appears.
19947c478bd9Sstevel@tonic-gate  */
19957c478bd9Sstevel@tonic-gate void
19967c478bd9Sstevel@tonic-gate lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
19977c478bd9Sstevel@tonic-gate {
19987c478bd9Sstevel@tonic-gate 	cpupart_t	*cpupart;
19997c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
20007c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
20017c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
20027c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
20037c478bd9Sstevel@tonic-gate 	int		i;
20047c478bd9Sstevel@tonic-gate 
20057c478bd9Sstevel@tonic-gate 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
20067c478bd9Sstevel@tonic-gate 
20077c478bd9Sstevel@tonic-gate 	cpupart = cp->cpu_part;
20087c478bd9Sstevel@tonic-gate 	lpl_leaf = cp->cpu_lpl;
20097c478bd9Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
20107c478bd9Sstevel@tonic-gate 
20117c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
20127c478bd9Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
20137c478bd9Sstevel@tonic-gate 
20147c478bd9Sstevel@tonic-gate 		/*
20157c478bd9Sstevel@tonic-gate 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
20167c478bd9Sstevel@tonic-gate 		 * for the cpu in question, or if the current lgrp and leaf
20177c478bd9Sstevel@tonic-gate 		 * don't share the same resources.
20187c478bd9Sstevel@tonic-gate 		 */
20197c478bd9Sstevel@tonic-gate 
20207c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
20217c478bd9Sstevel@tonic-gate 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
20227c478bd9Sstevel@tonic-gate 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
20237c478bd9Sstevel@tonic-gate 			continue;
20247c478bd9Sstevel@tonic-gate 
20257c478bd9Sstevel@tonic-gate 
20267c478bd9Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
20277c478bd9Sstevel@tonic-gate 
20287c478bd9Sstevel@tonic-gate 		if (lpl_cur->lpl_nrset > 0) {
20297c478bd9Sstevel@tonic-gate 			if (act == LPL_INCREMENT) {
20307c478bd9Sstevel@tonic-gate 				lpl_cur->lpl_ncpu++;
20317c478bd9Sstevel@tonic-gate 			} else if (act == LPL_DECREMENT) {
20327c478bd9Sstevel@tonic-gate 				lpl_cur->lpl_ncpu--;
20337c478bd9Sstevel@tonic-gate 			}
20347c478bd9Sstevel@tonic-gate 		}
20357c478bd9Sstevel@tonic-gate 	}
20367c478bd9Sstevel@tonic-gate }
20377c478bd9Sstevel@tonic-gate 
20387c478bd9Sstevel@tonic-gate /*
20397c478bd9Sstevel@tonic-gate  * Initialize lpl with given resources and specified lgrp
20407c478bd9Sstevel@tonic-gate  */
20417c478bd9Sstevel@tonic-gate 
20427c478bd9Sstevel@tonic-gate void
20437c478bd9Sstevel@tonic-gate lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
20447c478bd9Sstevel@tonic-gate {
20457c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrpid = lgrp->lgrp_id;
20467c478bd9Sstevel@tonic-gate 	lpl->lpl_loadavg = 0;
20477c478bd9Sstevel@tonic-gate 	if (lpl == lpl_leaf)
20487c478bd9Sstevel@tonic-gate 		lpl->lpl_ncpu = 1;
20497c478bd9Sstevel@tonic-gate 	else
20507c478bd9Sstevel@tonic-gate 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
20517c478bd9Sstevel@tonic-gate 	lpl->lpl_nrset = 1;
20527c478bd9Sstevel@tonic-gate 	lpl->lpl_rset[0] = lpl_leaf;
20537c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrp = lgrp;
20547c478bd9Sstevel@tonic-gate 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
20557c478bd9Sstevel@tonic-gate 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
20567c478bd9Sstevel@tonic-gate }
20577c478bd9Sstevel@tonic-gate 
20587c478bd9Sstevel@tonic-gate /*
20597c478bd9Sstevel@tonic-gate  * Clear an unused lpl
20607c478bd9Sstevel@tonic-gate  */
20617c478bd9Sstevel@tonic-gate 
20627c478bd9Sstevel@tonic-gate void
20637c478bd9Sstevel@tonic-gate lpl_clear(lpl_t *lpl)
20647c478bd9Sstevel@tonic-gate {
2065ab761399Sesaxe 	lgrp_id_t	lid;
20667c478bd9Sstevel@tonic-gate 
20677c478bd9Sstevel@tonic-gate 	/* save lid for debugging purposes */
20687c478bd9Sstevel@tonic-gate 	lid = lpl->lpl_lgrpid;
20697c478bd9Sstevel@tonic-gate 	bzero(lpl, sizeof (lpl_t));
20707c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrpid = lid;
20717c478bd9Sstevel@tonic-gate }
20727c478bd9Sstevel@tonic-gate 
20737c478bd9Sstevel@tonic-gate /*
20747c478bd9Sstevel@tonic-gate  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
20757c478bd9Sstevel@tonic-gate  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
20767c478bd9Sstevel@tonic-gate  * make full use of all of the lgroup topology, but this checks to make sure
20777c478bd9Sstevel@tonic-gate  * that for the parts that it does use, it has correctly understood the
20787c478bd9Sstevel@tonic-gate  * relationships that exist. This function returns
20797c478bd9Sstevel@tonic-gate  * 0 if the topology is correct, and a non-zero error code, for non-debug
20807c478bd9Sstevel@tonic-gate  * kernels if incorrect.  Asserts are spread throughout the code to aid in
20817c478bd9Sstevel@tonic-gate  * debugging on a DEBUG kernel.
20827c478bd9Sstevel@tonic-gate  */
20837c478bd9Sstevel@tonic-gate int
20847c478bd9Sstevel@tonic-gate lpl_topo_verify(cpupart_t *cpupart)
20857c478bd9Sstevel@tonic-gate {
20867c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
20877c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
20887c478bd9Sstevel@tonic-gate 	klgrpset_t	rset;
20897c478bd9Sstevel@tonic-gate 	klgrpset_t	cset;
20907c478bd9Sstevel@tonic-gate 	cpu_t		*cpu;
20917c478bd9Sstevel@tonic-gate 	cpu_t		*cp_start;
20927c478bd9Sstevel@tonic-gate 	int		i;
20937c478bd9Sstevel@tonic-gate 	int		j;
20947c478bd9Sstevel@tonic-gate 	int		sum;
20957c478bd9Sstevel@tonic-gate 
20967c478bd9Sstevel@tonic-gate 	/* topology can't be incorrect if it doesn't exist */
20977c478bd9Sstevel@tonic-gate 	if (!lgrp_topo_initialized || !lgrp_initialized)
20987c478bd9Sstevel@tonic-gate 		return (LPL_TOPO_CORRECT);
20997c478bd9Sstevel@tonic-gate 
21007c478bd9Sstevel@tonic-gate 	ASSERT(cpupart != NULL);
21017c478bd9Sstevel@tonic-gate 
21027c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
21037c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
21047c478bd9Sstevel@tonic-gate 		lpl = NULL;
21057c478bd9Sstevel@tonic-gate 		/* make sure lpls are allocated */
21067c478bd9Sstevel@tonic-gate 		ASSERT(cpupart->cp_lgrploads);
21077c478bd9Sstevel@tonic-gate 		if (!cpupart->cp_lgrploads)
21087c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_PART_HAS_NO_LPL);
21097c478bd9Sstevel@tonic-gate 
21107c478bd9Sstevel@tonic-gate 		lpl = &cpupart->cp_lgrploads[i];
21117c478bd9Sstevel@tonic-gate 		/* make sure our index is good */
21127c478bd9Sstevel@tonic-gate 		ASSERT(i < cpupart->cp_nlgrploads);
21137c478bd9Sstevel@tonic-gate 
21147c478bd9Sstevel@tonic-gate 		/* if lgroup doesn't exist, make sure lpl is empty */
21157c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp)) {
21167c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
21177c478bd9Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
21187c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
21197c478bd9Sstevel@tonic-gate 			} else {
21207c478bd9Sstevel@tonic-gate 				continue;
21217c478bd9Sstevel@tonic-gate 			}
21227c478bd9Sstevel@tonic-gate 		}
21237c478bd9Sstevel@tonic-gate 
21247c478bd9Sstevel@tonic-gate 		/* verify that lgroup and lpl are identically numbered */
21257c478bd9Sstevel@tonic-gate 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
21267c478bd9Sstevel@tonic-gate 
21277c478bd9Sstevel@tonic-gate 		/* if lgroup isn't in our partition, make sure lpl is empty */
21287c478bd9Sstevel@tonic-gate 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
21297c478bd9Sstevel@tonic-gate 		    cpupart->cp_lgrpset)) {
21307c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
21317c478bd9Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
21327c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
21337c478bd9Sstevel@tonic-gate 			}
21347c478bd9Sstevel@tonic-gate 			/*
21357c478bd9Sstevel@tonic-gate 			 * lpl is empty, and lgroup isn't in partition.  verify
21367c478bd9Sstevel@tonic-gate 			 * that lpl doesn't show up in anyone else's rsets (in
21377c478bd9Sstevel@tonic-gate 			 * this partition, anyway)
21387c478bd9Sstevel@tonic-gate 			 */
21397c478bd9Sstevel@tonic-gate 
21407c478bd9Sstevel@tonic-gate 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
21417c478bd9Sstevel@tonic-gate 				lpl_t *i_lpl; /* lpl we're iterating over */
21427c478bd9Sstevel@tonic-gate 
21437c478bd9Sstevel@tonic-gate 				i_lpl = &cpupart->cp_lgrploads[j];
21447c478bd9Sstevel@tonic-gate 
21457c478bd9Sstevel@tonic-gate 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
21467c478bd9Sstevel@tonic-gate 				if (lpl_rset_contains(i_lpl, lpl)) {
21477c478bd9Sstevel@tonic-gate 					return (LPL_TOPO_LPL_ORPHANED);
21487c478bd9Sstevel@tonic-gate 				}
21497c478bd9Sstevel@tonic-gate 			}
21507c478bd9Sstevel@tonic-gate 			/* lgroup is empty, and everything is ok. continue */
21517c478bd9Sstevel@tonic-gate 			continue;
21527c478bd9Sstevel@tonic-gate 		}
21537c478bd9Sstevel@tonic-gate 
21547c478bd9Sstevel@tonic-gate 
21557c478bd9Sstevel@tonic-gate 		/* lgroup is in this partition, now check it against lpl */
21567c478bd9Sstevel@tonic-gate 
21577c478bd9Sstevel@tonic-gate 		/* do both have matching lgrps? */
21587c478bd9Sstevel@tonic-gate 		ASSERT(lgrp == lpl->lpl_lgrp);
21597c478bd9Sstevel@tonic-gate 		if (lgrp != lpl->lpl_lgrp) {
21607c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_LGRP_MISMATCH);
21617c478bd9Sstevel@tonic-gate 		}
21627c478bd9Sstevel@tonic-gate 
21637c478bd9Sstevel@tonic-gate 		/* do the parent lgroups exist and do they match? */
21647c478bd9Sstevel@tonic-gate 		if (lgrp->lgrp_parent) {
21657c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_parent);
21667c478bd9Sstevel@tonic-gate 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
21677c478bd9Sstevel@tonic-gate 				    lpl->lpl_parent->lpl_lgrpid);
21687c478bd9Sstevel@tonic-gate 
21697c478bd9Sstevel@tonic-gate 			if (!lpl->lpl_parent) {
21707c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_MISSING_PARENT);
21717c478bd9Sstevel@tonic-gate 			} else if (lgrp->lgrp_parent->lgrp_id !=
21727c478bd9Sstevel@tonic-gate 			    lpl->lpl_parent->lpl_lgrpid) {
21737c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_PARENT_MISMATCH);
21747c478bd9Sstevel@tonic-gate 			}
21757c478bd9Sstevel@tonic-gate 		}
21767c478bd9Sstevel@tonic-gate 
21777c478bd9Sstevel@tonic-gate 		/* only leaf lgroups keep a cpucnt, only check leaves */
21787c478bd9Sstevel@tonic-gate 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
21797c478bd9Sstevel@tonic-gate 
21807c478bd9Sstevel@tonic-gate 			/* verify that lgrp is also a leaf */
21817c478bd9Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_childcnt == 0) &&
21827c478bd9Sstevel@tonic-gate 			    (klgrpset_ismember(lgrp->lgrp_leaves,
21837c478bd9Sstevel@tonic-gate 			    lpl->lpl_lgrpid)));
21847c478bd9Sstevel@tonic-gate 
21857c478bd9Sstevel@tonic-gate 			if ((lgrp->lgrp_childcnt > 0) ||
21867c478bd9Sstevel@tonic-gate 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
21877c478bd9Sstevel@tonic-gate 			    lpl->lpl_lgrpid))) {
21887c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LGRP_NOT_LEAF);
21897c478bd9Sstevel@tonic-gate 			}
21907c478bd9Sstevel@tonic-gate 
21917c478bd9Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
21927c478bd9Sstevel@tonic-gate 			    (lpl->lpl_ncpu > 0));
21937c478bd9Sstevel@tonic-gate 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
21947c478bd9Sstevel@tonic-gate 				(lpl->lpl_ncpu <= 0)) {
21957c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BAD_CPUCNT);
21967c478bd9Sstevel@tonic-gate 			}
21977c478bd9Sstevel@tonic-gate 
21987c478bd9Sstevel@tonic-gate 			/*
21997c478bd9Sstevel@tonic-gate 			 * Check that lpl_ncpu also matches the number of
22007c478bd9Sstevel@tonic-gate 			 * cpus in the lpl's linked list.  This only exists in
22017c478bd9Sstevel@tonic-gate 			 * leaves, but they should always match.
22027c478bd9Sstevel@tonic-gate 			 */
22037c478bd9Sstevel@tonic-gate 			j = 0;
22047c478bd9Sstevel@tonic-gate 			cpu = cp_start = lpl->lpl_cpus;
22057c478bd9Sstevel@tonic-gate 			while (cpu != NULL) {
22067c478bd9Sstevel@tonic-gate 				j++;
22077c478bd9Sstevel@tonic-gate 
22087c478bd9Sstevel@tonic-gate 				/* check to make sure cpu's lpl is leaf lpl */
22097c478bd9Sstevel@tonic-gate 				ASSERT(cpu->cpu_lpl == lpl);
22107c478bd9Sstevel@tonic-gate 				if (cpu->cpu_lpl != lpl) {
22117c478bd9Sstevel@tonic-gate 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
22127c478bd9Sstevel@tonic-gate 				}
22137c478bd9Sstevel@tonic-gate 
22147c478bd9Sstevel@tonic-gate 				/* check next cpu */
22157c478bd9Sstevel@tonic-gate 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
22167c478bd9Sstevel@tonic-gate 					continue;
22177c478bd9Sstevel@tonic-gate 				} else {
22187c478bd9Sstevel@tonic-gate 					cpu = NULL;
22197c478bd9Sstevel@tonic-gate 				}
22207c478bd9Sstevel@tonic-gate 			}
22217c478bd9Sstevel@tonic-gate 
22227c478bd9Sstevel@tonic-gate 			ASSERT(j == lpl->lpl_ncpu);
22237c478bd9Sstevel@tonic-gate 			if (j != lpl->lpl_ncpu) {
22247c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
22257c478bd9Sstevel@tonic-gate 			}
22267c478bd9Sstevel@tonic-gate 
22277c478bd9Sstevel@tonic-gate 			/*
22287c478bd9Sstevel@tonic-gate 			 * Also, check that leaf lpl is contained in all
22297c478bd9Sstevel@tonic-gate 			 * intermediate lpls that name the leaf as a descendant
22307c478bd9Sstevel@tonic-gate 			 */
22317c478bd9Sstevel@tonic-gate 
22327c478bd9Sstevel@tonic-gate 			for (j = 0; j <= lgrp_alloc_max; j++) {
22337c478bd9Sstevel@tonic-gate 				klgrpset_t intersect;
22347c478bd9Sstevel@tonic-gate 				lgrp_t *lgrp_cand;
22357c478bd9Sstevel@tonic-gate 				lpl_t *lpl_cand;
22367c478bd9Sstevel@tonic-gate 
22377c478bd9Sstevel@tonic-gate 				lgrp_cand = lgrp_table[j];
22387c478bd9Sstevel@tonic-gate 				intersect = klgrpset_intersects(
22397c478bd9Sstevel@tonic-gate 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
22407c478bd9Sstevel@tonic-gate 				    cpupart->cp_lgrpset);
22417c478bd9Sstevel@tonic-gate 
22427c478bd9Sstevel@tonic-gate 				if (!LGRP_EXISTS(lgrp_cand) ||
22437c478bd9Sstevel@tonic-gate 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
22447c478bd9Sstevel@tonic-gate 				    cpupart->cp_lgrpset) ||
22457c478bd9Sstevel@tonic-gate 				    (intersect == 0))
22467c478bd9Sstevel@tonic-gate 					continue;
22477c478bd9Sstevel@tonic-gate 
22487c478bd9Sstevel@tonic-gate 				lpl_cand =
22497c478bd9Sstevel@tonic-gate 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
22507c478bd9Sstevel@tonic-gate 
22517c478bd9Sstevel@tonic-gate 				if (klgrpset_ismember(intersect,
22527c478bd9Sstevel@tonic-gate 				    lgrp->lgrp_id)) {
22537c478bd9Sstevel@tonic-gate 					ASSERT(lpl_rset_contains(lpl_cand,
22547c478bd9Sstevel@tonic-gate 					    lpl));
22557c478bd9Sstevel@tonic-gate 
22567c478bd9Sstevel@tonic-gate 					if (!lpl_rset_contains(lpl_cand, lpl)) {
22577c478bd9Sstevel@tonic-gate 						return (LPL_TOPO_RSET_MSSNG_LF);
22587c478bd9Sstevel@tonic-gate 					}
22597c478bd9Sstevel@tonic-gate 				}
22607c478bd9Sstevel@tonic-gate 			}
22617c478bd9Sstevel@tonic-gate 
22627c478bd9Sstevel@tonic-gate 		} else { /* non-leaf specific checks */
22637c478bd9Sstevel@tonic-gate 
22647c478bd9Sstevel@tonic-gate 			/*
22657c478bd9Sstevel@tonic-gate 			 * Non-leaf lpls should have lpl_cpus == NULL
22667c478bd9Sstevel@tonic-gate 			 * verify that this is so
22677c478bd9Sstevel@tonic-gate 			 */
22687c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_cpus == NULL);
22697c478bd9Sstevel@tonic-gate 			if (lpl->lpl_cpus != NULL) {
22707c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
22717c478bd9Sstevel@tonic-gate 			}
22727c478bd9Sstevel@tonic-gate 
22737c478bd9Sstevel@tonic-gate 			/*
22747c478bd9Sstevel@tonic-gate 			 * verify that the sum of the cpus in the leaf resources
22757c478bd9Sstevel@tonic-gate 			 * is equal to the total ncpu in the intermediate
22767c478bd9Sstevel@tonic-gate 			 */
22777c478bd9Sstevel@tonic-gate 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
22787c478bd9Sstevel@tonic-gate 				sum += lpl->lpl_rset[j]->lpl_ncpu;
22797c478bd9Sstevel@tonic-gate 			}
22807c478bd9Sstevel@tonic-gate 
22817c478bd9Sstevel@tonic-gate 			ASSERT(sum == lpl->lpl_ncpu);
22827c478bd9Sstevel@tonic-gate 			if (sum != lpl->lpl_ncpu) {
22837c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
22847c478bd9Sstevel@tonic-gate 			}
22857c478bd9Sstevel@tonic-gate 		}
22867c478bd9Sstevel@tonic-gate 
22877c478bd9Sstevel@tonic-gate 		/*
22887c478bd9Sstevel@tonic-gate 		 * check on lpl_hint. Don't check root, since it has no parent.
22897c478bd9Sstevel@tonic-gate 		 */
22907c478bd9Sstevel@tonic-gate 		if (lpl->lpl_parent != NULL) {
22917c478bd9Sstevel@tonic-gate 			int hint;
22927c478bd9Sstevel@tonic-gate 			lpl_t *hint_lpl;
22937c478bd9Sstevel@tonic-gate 
22947c478bd9Sstevel@tonic-gate 			/* make sure hint is within limits of nrset */
22957c478bd9Sstevel@tonic-gate 			hint = lpl->lpl_hint;
22967c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
22977c478bd9Sstevel@tonic-gate 			if (lpl->lpl_parent->lpl_nrset < hint) {
22987c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BOGUS_HINT);
22997c478bd9Sstevel@tonic-gate 			}
23007c478bd9Sstevel@tonic-gate 
23017c478bd9Sstevel@tonic-gate 			/* make sure hint points to valid lpl */
23027c478bd9Sstevel@tonic-gate 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
23037c478bd9Sstevel@tonic-gate 			ASSERT(hint_lpl->lpl_ncpu > 0);
23047c478bd9Sstevel@tonic-gate 			if (hint_lpl->lpl_ncpu <= 0) {
23057c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BOGUS_HINT);
23067c478bd9Sstevel@tonic-gate 			}
23077c478bd9Sstevel@tonic-gate 		}
23087c478bd9Sstevel@tonic-gate 
23097c478bd9Sstevel@tonic-gate 		/*
23107c478bd9Sstevel@tonic-gate 		 * Check the rset of the lpl in question.  Make sure that each
23117c478bd9Sstevel@tonic-gate 		 * rset contains a subset of the resources in
23127c478bd9Sstevel@tonic-gate 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
23137c478bd9Sstevel@tonic-gate 		 * sure that each rset doesn't include resources that are
23147c478bd9Sstevel@tonic-gate 		 * outside of that set.  (Which would be resources somehow not
23157c478bd9Sstevel@tonic-gate 		 * accounted for).
23167c478bd9Sstevel@tonic-gate 		 */
23177c478bd9Sstevel@tonic-gate 
23187c478bd9Sstevel@tonic-gate 		klgrpset_clear(rset);
23197c478bd9Sstevel@tonic-gate 		for (j = 0; j < lpl->lpl_nrset; j++) {
23207c478bd9Sstevel@tonic-gate 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
23217c478bd9Sstevel@tonic-gate 		}
23227c478bd9Sstevel@tonic-gate 		klgrpset_copy(cset, rset);
23237c478bd9Sstevel@tonic-gate 		/* make sure lpl rset matches lgrp rset */
23247c478bd9Sstevel@tonic-gate 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
23257c478bd9Sstevel@tonic-gate 		/* make sure rset is contained with in partition, too */
23267c478bd9Sstevel@tonic-gate 		klgrpset_diff(cset, cpupart->cp_lgrpset);
23277c478bd9Sstevel@tonic-gate 
23287c478bd9Sstevel@tonic-gate 		ASSERT(klgrpset_isempty(rset) &&
23297c478bd9Sstevel@tonic-gate 			    klgrpset_isempty(cset));
23307c478bd9Sstevel@tonic-gate 		if (!klgrpset_isempty(rset) ||
23317c478bd9Sstevel@tonic-gate 		    !klgrpset_isempty(cset)) {
23327c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_RSET_MISMATCH);
23337c478bd9Sstevel@tonic-gate 		}
23347c478bd9Sstevel@tonic-gate 
23357c478bd9Sstevel@tonic-gate 		/*
23367c478bd9Sstevel@tonic-gate 		 * check to make sure lpl_nrset matches the number of rsets
23377c478bd9Sstevel@tonic-gate 		 * contained in the lpl
23387c478bd9Sstevel@tonic-gate 		 */
23397c478bd9Sstevel@tonic-gate 
23407c478bd9Sstevel@tonic-gate 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
23417c478bd9Sstevel@tonic-gate 		    j++);
23427c478bd9Sstevel@tonic-gate 
23437c478bd9Sstevel@tonic-gate 		ASSERT(j == lpl->lpl_nrset);
23447c478bd9Sstevel@tonic-gate 		if (j != lpl->lpl_nrset) {
23457c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_BAD_RSETCNT);
23467c478bd9Sstevel@tonic-gate 		}
23477c478bd9Sstevel@tonic-gate 
23487c478bd9Sstevel@tonic-gate 	}
23497c478bd9Sstevel@tonic-gate 	return (LPL_TOPO_CORRECT);
23507c478bd9Sstevel@tonic-gate }
23517c478bd9Sstevel@tonic-gate 
23527c478bd9Sstevel@tonic-gate /*
23537c478bd9Sstevel@tonic-gate  * Flatten lpl topology to given number of levels.  This is presently only
23547c478bd9Sstevel@tonic-gate  * implemented for a flatten to 2 levels, which will prune out the intermediates
23557c478bd9Sstevel@tonic-gate  * and home the leaf lpls to the root lpl.
23567c478bd9Sstevel@tonic-gate  */
23577c478bd9Sstevel@tonic-gate int
23587c478bd9Sstevel@tonic-gate lpl_topo_flatten(int levels)
23597c478bd9Sstevel@tonic-gate {
23607c478bd9Sstevel@tonic-gate 	int		i;
23617c478bd9Sstevel@tonic-gate 	uint_t		sum;
23627c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
23637c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
23647c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_root;
23657c478bd9Sstevel@tonic-gate 	cpupart_t	*cp;
23667c478bd9Sstevel@tonic-gate 
23677c478bd9Sstevel@tonic-gate 	if (levels != 2)
23687c478bd9Sstevel@tonic-gate 		return (0);
23697c478bd9Sstevel@tonic-gate 
23707c478bd9Sstevel@tonic-gate 	/* called w/ cpus paused - grab no locks! */
23717c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
23727c478bd9Sstevel@tonic-gate 	    !lgrp_initialized);
23737c478bd9Sstevel@tonic-gate 
23747c478bd9Sstevel@tonic-gate 	cp = cp_list_head;
23757c478bd9Sstevel@tonic-gate 	do {
23767c478bd9Sstevel@tonic-gate 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
23777c478bd9Sstevel@tonic-gate 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
23787c478bd9Sstevel@tonic-gate 
23797c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
23807c478bd9Sstevel@tonic-gate 			lgrp_cur = lgrp_table[i];
23817c478bd9Sstevel@tonic-gate 			lpl_cur = &cp->cp_lgrploads[i];
23827c478bd9Sstevel@tonic-gate 
23837c478bd9Sstevel@tonic-gate 			if ((lgrp_cur == lgrp_root) ||
23847c478bd9Sstevel@tonic-gate 			    (!LGRP_EXISTS(lgrp_cur) &&
23857c478bd9Sstevel@tonic-gate 			    (lpl_cur->lpl_ncpu == 0)))
23867c478bd9Sstevel@tonic-gate 				continue;
23877c478bd9Sstevel@tonic-gate 
23887c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
23897c478bd9Sstevel@tonic-gate 				/*
23907c478bd9Sstevel@tonic-gate 				 * this should be a deleted intermediate, so
23917c478bd9Sstevel@tonic-gate 				 * clear it
23927c478bd9Sstevel@tonic-gate 				 */
23937c478bd9Sstevel@tonic-gate 				lpl_clear(lpl_cur);
23947c478bd9Sstevel@tonic-gate 			} else if ((lpl_cur->lpl_nrset == 1) &&
23957c478bd9Sstevel@tonic-gate 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
23967c478bd9Sstevel@tonic-gate 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
23977c478bd9Sstevel@tonic-gate 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
23987c478bd9Sstevel@tonic-gate 				/*
23997c478bd9Sstevel@tonic-gate 				 * this is a leaf whose parent was deleted, or
24007c478bd9Sstevel@tonic-gate 				 * whose parent had their lgrp deleted.  (And
24017c478bd9Sstevel@tonic-gate 				 * whose parent will soon be deleted).  Point
24027c478bd9Sstevel@tonic-gate 				 * this guy back to the root lpl.
24037c478bd9Sstevel@tonic-gate 				 */
24047c478bd9Sstevel@tonic-gate 				lpl_cur->lpl_parent = lpl_root;
24057c478bd9Sstevel@tonic-gate 				lpl_rset_add(lpl_root, lpl_cur);
24067c478bd9Sstevel@tonic-gate 			}
24077c478bd9Sstevel@tonic-gate 
24087c478bd9Sstevel@tonic-gate 		}
24097c478bd9Sstevel@tonic-gate 
24107c478bd9Sstevel@tonic-gate 		/*
24117c478bd9Sstevel@tonic-gate 		 * Now that we're done, make sure the count on the root lpl is
24127c478bd9Sstevel@tonic-gate 		 * correct, and update the hints of the children for the sake of
24137c478bd9Sstevel@tonic-gate 		 * thoroughness
24147c478bd9Sstevel@tonic-gate 		 */
24157c478bd9Sstevel@tonic-gate 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
24167c478bd9Sstevel@tonic-gate 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
24177c478bd9Sstevel@tonic-gate 		}
24187c478bd9Sstevel@tonic-gate 		lpl_root->lpl_ncpu = sum;
24197c478bd9Sstevel@tonic-gate 		lpl_child_update(lpl_root, cp);
24207c478bd9Sstevel@tonic-gate 
24217c478bd9Sstevel@tonic-gate 		cp = cp->cp_next;
24227c478bd9Sstevel@tonic-gate 	} while (cp != cp_list_head);
24237c478bd9Sstevel@tonic-gate 
24247c478bd9Sstevel@tonic-gate 	return (levels);
24257c478bd9Sstevel@tonic-gate }
24267c478bd9Sstevel@tonic-gate 
24277c478bd9Sstevel@tonic-gate /*
24287c478bd9Sstevel@tonic-gate  * Insert a lpl into the resource hierarchy and create any additional lpls that
24297c478bd9Sstevel@tonic-gate  * are necessary to represent the varying states of locality for the cpu
24307c478bd9Sstevel@tonic-gate  * resoruces newly added to the partition.
24317c478bd9Sstevel@tonic-gate  *
24327c478bd9Sstevel@tonic-gate  * This routine is clever enough that it can correctly add resources from the
24337c478bd9Sstevel@tonic-gate  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
24347c478bd9Sstevel@tonic-gate  * those for which the lpl is a leaf as opposed to simply a named equally local
24357c478bd9Sstevel@tonic-gate  * resource).  The one special case that needs additional processing is when a
24367c478bd9Sstevel@tonic-gate  * new intermediate lpl is introduced.  Since the main loop only traverses
24377c478bd9Sstevel@tonic-gate  * looking to add the leaf resource where it does not yet exist, additional work
24387c478bd9Sstevel@tonic-gate  * is necessary to add other leaf resources that may need to exist in the newly
24397c478bd9Sstevel@tonic-gate  * created intermediate.  This is performed by the second inner loop, and is
24407c478bd9Sstevel@tonic-gate  * only done when the check for more than one overlapping resource succeeds.
24417c478bd9Sstevel@tonic-gate  */
24427c478bd9Sstevel@tonic-gate 
24437c478bd9Sstevel@tonic-gate void
24447c478bd9Sstevel@tonic-gate lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
24457c478bd9Sstevel@tonic-gate {
24467c478bd9Sstevel@tonic-gate 	int		i;
24477c478bd9Sstevel@tonic-gate 	int		j;
24487c478bd9Sstevel@tonic-gate 	int		hint;
24497c478bd9Sstevel@tonic-gate 	int		rset_num_intersect;
24507c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
24517c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
24527c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_parent;
2453ab761399Sesaxe 	lgrp_id_t	parent_id;
24547c478bd9Sstevel@tonic-gate 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
24557c478bd9Sstevel@tonic-gate 
24567c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
24577c478bd9Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
24587c478bd9Sstevel@tonic-gate 
24597c478bd9Sstevel@tonic-gate 		/*
24607c478bd9Sstevel@tonic-gate 		 * Don't insert if the lgrp isn't there, if the leaf isn't
24617c478bd9Sstevel@tonic-gate 		 * contained within the current lgrp, or if the current lgrp has
24627c478bd9Sstevel@tonic-gate 		 * no leaves in this partition
24637c478bd9Sstevel@tonic-gate 		 */
24647c478bd9Sstevel@tonic-gate 
24657c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur) ||
24667c478bd9Sstevel@tonic-gate 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
24677c478bd9Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) ||
24687c478bd9Sstevel@tonic-gate 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
24697c478bd9Sstevel@tonic-gate 		    cpupart->cp_lgrpset))
24707c478bd9Sstevel@tonic-gate 			continue;
24717c478bd9Sstevel@tonic-gate 
24727c478bd9Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
24737c478bd9Sstevel@tonic-gate 		if (lgrp_cur->lgrp_parent != NULL) {
24747c478bd9Sstevel@tonic-gate 			/* if lgrp has a parent, assign it properly */
24757c478bd9Sstevel@tonic-gate 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
24767c478bd9Sstevel@tonic-gate 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
24777c478bd9Sstevel@tonic-gate 		} else {
24787c478bd9Sstevel@tonic-gate 			/* if not, make sure parent ptr gets set to null */
24797c478bd9Sstevel@tonic-gate 			lpl_parent = NULL;
24807c478bd9Sstevel@tonic-gate 		}
24817c478bd9Sstevel@tonic-gate 
24827c478bd9Sstevel@tonic-gate 		if (lpl_cur == lpl_leaf) {
24837c478bd9Sstevel@tonic-gate 			/*
24847c478bd9Sstevel@tonic-gate 			 * Almost all leaf state was initialized elsewhere.  The
24857c478bd9Sstevel@tonic-gate 			 * only thing left to do is to set the parent.
24867c478bd9Sstevel@tonic-gate 			 */
24877c478bd9Sstevel@tonic-gate 			lpl_cur->lpl_parent = lpl_parent;
24887c478bd9Sstevel@tonic-gate 			continue;
24897c478bd9Sstevel@tonic-gate 		}
24907c478bd9Sstevel@tonic-gate 
24917c478bd9Sstevel@tonic-gate 		/*
24927c478bd9Sstevel@tonic-gate 		 * Initialize intermediate lpl
24937c478bd9Sstevel@tonic-gate 		 * Save this lpl's hint though. Since we're changing this
24947c478bd9Sstevel@tonic-gate 		 * lpl's resources, we need to update the hint in this lpl's
24957c478bd9Sstevel@tonic-gate 		 * children, but the hint in this lpl is unaffected and
24967c478bd9Sstevel@tonic-gate 		 * should be preserved.
24977c478bd9Sstevel@tonic-gate 		 */
24987c478bd9Sstevel@tonic-gate 		hint = lpl_cur->lpl_hint;
24997c478bd9Sstevel@tonic-gate 
25007c478bd9Sstevel@tonic-gate 		lpl_clear(lpl_cur);
25017c478bd9Sstevel@tonic-gate 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
25027c478bd9Sstevel@tonic-gate 
25037c478bd9Sstevel@tonic-gate 		lpl_cur->lpl_hint = hint;
25047c478bd9Sstevel@tonic-gate 		lpl_cur->lpl_parent = lpl_parent;
25057c478bd9Sstevel@tonic-gate 
25067c478bd9Sstevel@tonic-gate 		/* does new lpl need to be populated with other resources? */
25077c478bd9Sstevel@tonic-gate 		rset_intersect =
25087c478bd9Sstevel@tonic-gate 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
25097c478bd9Sstevel@tonic-gate 			cpupart->cp_lgrpset);
25107c478bd9Sstevel@tonic-gate 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
25117c478bd9Sstevel@tonic-gate 
25127c478bd9Sstevel@tonic-gate 		if (rset_num_intersect > 1) {
25137c478bd9Sstevel@tonic-gate 			/*
25147c478bd9Sstevel@tonic-gate 			 * If so, figure out what lpls have resources that
25157c478bd9Sstevel@tonic-gate 			 * intersect this one, and add them.
25167c478bd9Sstevel@tonic-gate 			 */
25177c478bd9Sstevel@tonic-gate 			for (j = 0; j <= lgrp_alloc_max; j++) {
25187c478bd9Sstevel@tonic-gate 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
25197c478bd9Sstevel@tonic-gate 				lpl_t	*lpl_cand;	/* candidate lpl */
25207c478bd9Sstevel@tonic-gate 
25217c478bd9Sstevel@tonic-gate 				lgrp_cand = lgrp_table[j];
25227c478bd9Sstevel@tonic-gate 				if (!LGRP_EXISTS(lgrp_cand) ||
25237c478bd9Sstevel@tonic-gate 				    !klgrpset_ismember(rset_intersect,
25247c478bd9Sstevel@tonic-gate 					lgrp_cand->lgrp_id))
25257c478bd9Sstevel@tonic-gate 					continue;
25267c478bd9Sstevel@tonic-gate 				lpl_cand =
25277c478bd9Sstevel@tonic-gate 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
25287c478bd9Sstevel@tonic-gate 				lpl_rset_add(lpl_cur, lpl_cand);
25297c478bd9Sstevel@tonic-gate 			}
25307c478bd9Sstevel@tonic-gate 		}
25317c478bd9Sstevel@tonic-gate 		/*
25327c478bd9Sstevel@tonic-gate 		 * This lpl's rset has changed. Update the hint in it's
25337c478bd9Sstevel@tonic-gate 		 * children.
25347c478bd9Sstevel@tonic-gate 		 */
25357c478bd9Sstevel@tonic-gate 		lpl_child_update(lpl_cur, cpupart);
25367c478bd9Sstevel@tonic-gate 	}
25377c478bd9Sstevel@tonic-gate }
25387c478bd9Sstevel@tonic-gate 
25397c478bd9Sstevel@tonic-gate /*
25407c478bd9Sstevel@tonic-gate  * remove a lpl from the hierarchy of resources, clearing its state when
25417c478bd9Sstevel@tonic-gate  * finished.  If the lpls at the intermediate levels of the hierarchy have no
25427c478bd9Sstevel@tonic-gate  * remaining resources, or no longer name a leaf resource in the cpu-partition,
25437c478bd9Sstevel@tonic-gate  * delete them as well.
25447c478bd9Sstevel@tonic-gate  */
25457c478bd9Sstevel@tonic-gate 
25467c478bd9Sstevel@tonic-gate void
25477c478bd9Sstevel@tonic-gate lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
25487c478bd9Sstevel@tonic-gate {
25497c478bd9Sstevel@tonic-gate 	int		i;
25507c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
25517c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
25527c478bd9Sstevel@tonic-gate 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
25537c478bd9Sstevel@tonic-gate 
25547c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
25557c478bd9Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
25567c478bd9Sstevel@tonic-gate 
25577c478bd9Sstevel@tonic-gate 		/*
25587c478bd9Sstevel@tonic-gate 		 * Don't attempt to remove from lgrps that aren't there, that
25597c478bd9Sstevel@tonic-gate 		 * don't contain our leaf, or from the leaf itself. (We do that
25607c478bd9Sstevel@tonic-gate 		 * later)
25617c478bd9Sstevel@tonic-gate 		 */
25627c478bd9Sstevel@tonic-gate 
25637c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur))
25647c478bd9Sstevel@tonic-gate 			continue;
25657c478bd9Sstevel@tonic-gate 
25667c478bd9Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
25677c478bd9Sstevel@tonic-gate 
25687c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
25697c478bd9Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) ||
25707c478bd9Sstevel@tonic-gate 		    (lpl_cur == lpl_leaf)) {
25717c478bd9Sstevel@tonic-gate 			continue;
25727c478bd9Sstevel@tonic-gate 		}
25737c478bd9Sstevel@tonic-gate 
25747c478bd9Sstevel@tonic-gate 		/*
25757c478bd9Sstevel@tonic-gate 		 * This is a slightly sleazy simplification in that we have
25767c478bd9Sstevel@tonic-gate 		 * already marked the cp_lgrpset as no longer containing the
25777c478bd9Sstevel@tonic-gate 		 * leaf we've deleted.  Any lpls that pass the above checks
25787c478bd9Sstevel@tonic-gate 		 * based upon lgrp membership but not necessarily cpu-part
25797c478bd9Sstevel@tonic-gate 		 * membership also get cleared by the checks below.  Currently
25807c478bd9Sstevel@tonic-gate 		 * this is harmless, as the lpls should be empty anyway.
25817c478bd9Sstevel@tonic-gate 		 *
25827c478bd9Sstevel@tonic-gate 		 * In particular, we want to preserve lpls that have additional
25837c478bd9Sstevel@tonic-gate 		 * leaf resources, even though we don't yet have a processor
25847c478bd9Sstevel@tonic-gate 		 * architecture that represents resources this way.
25857c478bd9Sstevel@tonic-gate 		 */
25867c478bd9Sstevel@tonic-gate 
25877c478bd9Sstevel@tonic-gate 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
25887c478bd9Sstevel@tonic-gate 		    cpupart->cp_lgrpset);
25897c478bd9Sstevel@tonic-gate 
25907c478bd9Sstevel@tonic-gate 		lpl_rset_del(lpl_cur, lpl_leaf);
25917c478bd9Sstevel@tonic-gate 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
25927c478bd9Sstevel@tonic-gate 			lpl_clear(lpl_cur);
25937c478bd9Sstevel@tonic-gate 		} else {
25947c478bd9Sstevel@tonic-gate 			/*
25957c478bd9Sstevel@tonic-gate 			 * Update this lpl's children
25967c478bd9Sstevel@tonic-gate 			 */
25977c478bd9Sstevel@tonic-gate 			lpl_child_update(lpl_cur, cpupart);
25987c478bd9Sstevel@tonic-gate 		}
25997c478bd9Sstevel@tonic-gate 	}
26007c478bd9Sstevel@tonic-gate 	lpl_clear(lpl_leaf);
26017c478bd9Sstevel@tonic-gate }
26027c478bd9Sstevel@tonic-gate 
26037c478bd9Sstevel@tonic-gate /*
26047c478bd9Sstevel@tonic-gate  * add a cpu to a partition in terms of lgrp load avg bookeeping
26057c478bd9Sstevel@tonic-gate  *
26067c478bd9Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
26077c478bd9Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
26087c478bd9Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
26097c478bd9Sstevel@tonic-gate  * There are two general cases for cpu additon:
26107c478bd9Sstevel@tonic-gate  *
26117c478bd9Sstevel@tonic-gate  * 1. A lpl structure that contains resources already in the hierarchy tree.
26127c478bd9Sstevel@tonic-gate  * In this case, all of the associated lpl relationships have been defined, and
26137c478bd9Sstevel@tonic-gate  * all that is necessary is that we link the new cpu into the per-lpl list of
26147c478bd9Sstevel@tonic-gate  * cpus, and increment the ncpu count of all places where this cpu resource will
26157c478bd9Sstevel@tonic-gate  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
26167c478bd9Sstevel@tonic-gate  * pushing is accomplished by this routine.
26177c478bd9Sstevel@tonic-gate  *
26187c478bd9Sstevel@tonic-gate  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
26197c478bd9Sstevel@tonic-gate  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
26207c478bd9Sstevel@tonic-gate  * construct the hierarchy of state necessary to name it's more distant
26217c478bd9Sstevel@tonic-gate  * resources, if they should exist.  The leaf structure is initialized by this
26227c478bd9Sstevel@tonic-gate  * routine, as is the cpu-partition state for the lgrp membership.  This routine
26237c478bd9Sstevel@tonic-gate  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
26247c478bd9Sstevel@tonic-gate  * and builds all of the "ancestoral" state necessary to identify resources at
26257c478bd9Sstevel@tonic-gate  * differing levels of locality.
26267c478bd9Sstevel@tonic-gate  */
26277c478bd9Sstevel@tonic-gate void
26287c478bd9Sstevel@tonic-gate lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
26297c478bd9Sstevel@tonic-gate {
26307c478bd9Sstevel@tonic-gate 	cpupart_t	*cpupart;
26317c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
26327c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
26337c478bd9Sstevel@tonic-gate 
26347c478bd9Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
26357c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
26367c478bd9Sstevel@tonic-gate 
26377c478bd9Sstevel@tonic-gate 	cpupart = cp->cpu_part;
26387c478bd9Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lgrpid];
26397c478bd9Sstevel@tonic-gate 
26407c478bd9Sstevel@tonic-gate 	/* don't add non-existent lgrp */
26417c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
26427c478bd9Sstevel@tonic-gate 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
26437c478bd9Sstevel@tonic-gate 	cp->cpu_lpl = lpl_leaf;
26447c478bd9Sstevel@tonic-gate 
26457c478bd9Sstevel@tonic-gate 	/* only leaf lpls contain cpus */
26467c478bd9Sstevel@tonic-gate 
26477c478bd9Sstevel@tonic-gate 	if (lpl_leaf->lpl_ncpu++ == 0) {
26487c478bd9Sstevel@tonic-gate 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
26497c478bd9Sstevel@tonic-gate 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
26507c478bd9Sstevel@tonic-gate 		lpl_leaf_insert(lpl_leaf, cpupart);
26517c478bd9Sstevel@tonic-gate 	} else {
26527c478bd9Sstevel@tonic-gate 		/*
26537c478bd9Sstevel@tonic-gate 		 * the lpl should already exist in the parent, so just update
26547c478bd9Sstevel@tonic-gate 		 * the count of available CPUs
26557c478bd9Sstevel@tonic-gate 		 */
26567c478bd9Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
26577c478bd9Sstevel@tonic-gate 	}
26587c478bd9Sstevel@tonic-gate 
26597c478bd9Sstevel@tonic-gate 	/* link cpu into list of cpus in lpl */
26607c478bd9Sstevel@tonic-gate 
26617c478bd9Sstevel@tonic-gate 	if (lpl_leaf->lpl_cpus) {
26627c478bd9Sstevel@tonic-gate 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
26637c478bd9Sstevel@tonic-gate 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
26647c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
26657c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
26667c478bd9Sstevel@tonic-gate 	} else {
26677c478bd9Sstevel@tonic-gate 		/*
26687c478bd9Sstevel@tonic-gate 		 * We increment ncpu immediately after we create a new leaf
26697c478bd9Sstevel@tonic-gate 		 * lpl, so assert that ncpu == 1 for the case where we don't
26707c478bd9Sstevel@tonic-gate 		 * have any cpu pointers yet.
26717c478bd9Sstevel@tonic-gate 		 */
26727c478bd9Sstevel@tonic-gate 		ASSERT(lpl_leaf->lpl_ncpu == 1);
26737c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
26747c478bd9Sstevel@tonic-gate 	}
26757c478bd9Sstevel@tonic-gate 
26767c478bd9Sstevel@tonic-gate }
26777c478bd9Sstevel@tonic-gate 
26787c478bd9Sstevel@tonic-gate 
26797c478bd9Sstevel@tonic-gate /*
26807c478bd9Sstevel@tonic-gate  * remove a cpu from a partition in terms of lgrp load avg bookeeping
26817c478bd9Sstevel@tonic-gate  *
26827c478bd9Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
26837c478bd9Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
26847c478bd9Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
26857c478bd9Sstevel@tonic-gate  * There are two removal cases in question:
26867c478bd9Sstevel@tonic-gate  *
26877c478bd9Sstevel@tonic-gate  * 1. Removal of the resource in the leaf leaves other resources remaining in
26887c478bd9Sstevel@tonic-gate  * that leaf.  (Another cpu still exists at this level of locality).  In this
26897c478bd9Sstevel@tonic-gate  * case, the count of available cpus is decremented in all assocated lpls by
26907c478bd9Sstevel@tonic-gate  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
26917c478bd9Sstevel@tonic-gate  * from the per-cpu lpl list.
26927c478bd9Sstevel@tonic-gate  *
26937c478bd9Sstevel@tonic-gate  * 2. Removal of the resource results in the lpl containing no resources.  (It's
26947c478bd9Sstevel@tonic-gate  * empty)  In this case, all of what has occurred for the first step must take
26957c478bd9Sstevel@tonic-gate  * place; however, additionally we must remove the lpl structure itself, prune
26967c478bd9Sstevel@tonic-gate  * out any stranded lpls that do not directly name a leaf resource, and mark the
26977c478bd9Sstevel@tonic-gate  * cpu partition in question as no longer containing resources from the lgrp of
26987c478bd9Sstevel@tonic-gate  * the lpl that has been delted.  Cpu-partition changes are handled by this
26997c478bd9Sstevel@tonic-gate  * method, but the lpl_leaf_remove function deals with the details of pruning
27007c478bd9Sstevel@tonic-gate  * out the empty lpl and any of its orphaned direct ancestors.
27017c478bd9Sstevel@tonic-gate  */
27027c478bd9Sstevel@tonic-gate void
27037c478bd9Sstevel@tonic-gate lgrp_part_del_cpu(cpu_t *cp)
27047c478bd9Sstevel@tonic-gate {
27057c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
27067c478bd9Sstevel@tonic-gate 	lpl_t		*leaf_lpl;
27077c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
27087c478bd9Sstevel@tonic-gate 
27097c478bd9Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
27107c478bd9Sstevel@tonic-gate 
27117c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
27127c478bd9Sstevel@tonic-gate 
27137c478bd9Sstevel@tonic-gate 	lpl = leaf_lpl = cp->cpu_lpl;
27147c478bd9Sstevel@tonic-gate 	lgrp_leaf = leaf_lpl->lpl_lgrp;
27157c478bd9Sstevel@tonic-gate 
27167c478bd9Sstevel@tonic-gate 	/* don't delete a leaf that isn't there */
27177c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
27187c478bd9Sstevel@tonic-gate 
27197c478bd9Sstevel@tonic-gate 	/* no double-deletes */
27207c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_ncpu);
27217c478bd9Sstevel@tonic-gate 	if (--lpl->lpl_ncpu == 0) {
27227c478bd9Sstevel@tonic-gate 		/*
27237c478bd9Sstevel@tonic-gate 		 * This was the last cpu in this lgroup for this partition,
27247c478bd9Sstevel@tonic-gate 		 * clear its bit in the partition's lgroup bitmask
27257c478bd9Sstevel@tonic-gate 		 */
27267c478bd9Sstevel@tonic-gate 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
27277c478bd9Sstevel@tonic-gate 
27287c478bd9Sstevel@tonic-gate 		/* eliminate remaning lpl link pointers in cpu, lpl */
27297c478bd9Sstevel@tonic-gate 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
27307c478bd9Sstevel@tonic-gate 
27317c478bd9Sstevel@tonic-gate 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
27327c478bd9Sstevel@tonic-gate 	} else {
27337c478bd9Sstevel@tonic-gate 
27347c478bd9Sstevel@tonic-gate 		/* unlink cpu from lists of cpus in lpl */
27357c478bd9Sstevel@tonic-gate 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
27367c478bd9Sstevel@tonic-gate 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
27377c478bd9Sstevel@tonic-gate 		if (lpl->lpl_cpus == cp) {
27387c478bd9Sstevel@tonic-gate 			lpl->lpl_cpus = cp->cpu_next_lpl;
27397c478bd9Sstevel@tonic-gate 		}
27407c478bd9Sstevel@tonic-gate 
27417c478bd9Sstevel@tonic-gate 		/*
27427c478bd9Sstevel@tonic-gate 		 * Update the cpu count in the lpls associated with parent
27437c478bd9Sstevel@tonic-gate 		 * lgroups.
27447c478bd9Sstevel@tonic-gate 		 */
27457c478bd9Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
27467c478bd9Sstevel@tonic-gate 
27477c478bd9Sstevel@tonic-gate 	}
27487c478bd9Sstevel@tonic-gate 	/* clear cpu's lpl ptr when we're all done */
27497c478bd9Sstevel@tonic-gate 	cp->cpu_lpl = NULL;
27507c478bd9Sstevel@tonic-gate }
27517c478bd9Sstevel@tonic-gate 
27527c478bd9Sstevel@tonic-gate /*
27537c478bd9Sstevel@tonic-gate  * Recompute load average for the specified partition/lgrp fragment.
27547c478bd9Sstevel@tonic-gate  *
27557c478bd9Sstevel@tonic-gate  * We rely on the fact that this routine is called from the clock thread
27567c478bd9Sstevel@tonic-gate  * at a point before the clock thread can block (i.e. before its first
27577c478bd9Sstevel@tonic-gate  * lock request).  Since the clock thread can not be preempted (since it
27587c478bd9Sstevel@tonic-gate  * runs at highest priority), we know that cpu partitions can not change
27597c478bd9Sstevel@tonic-gate  * (since doing so would require either the repartition requester or the
27607c478bd9Sstevel@tonic-gate  * cpu_pause thread to run on this cpu), so we can update the cpu's load
27617c478bd9Sstevel@tonic-gate  * without grabbing cpu_lock.
27627c478bd9Sstevel@tonic-gate  */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t		ncpu;
	int64_t		old, new, f;

	/*
	 * Per-cpu-count decay coefficients, in fixed point.
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	/*
	 * Apply the update to this lpl, then walk up through each ancestor
	 * lpl in turn (loop bottom), folding the same load into every level
	 * of the hierarchy.
	 */
	for (;;) {

		/*
		 * Pick the decay factor for this lpl's cpu count; past the
		 * end of the table, expval[1]/ncpu approximates the curve.
		 */
		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu.  We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			/*
			 * Split the old load into high (q) and low (r)
			 * halves so the decay multiply by f can't overflow,
			 * then fold in the current runnable count (nrcpus)
			 * and clamp the result into [0, LGRP_LOADAVG_MAX].
			 * cas32 retries if another updater raced us.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				q = (old  >> 16) << 7;
				r = (old  & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}
28547c478bd9Sstevel@tonic-gate 
28557c478bd9Sstevel@tonic-gate /*
28567c478bd9Sstevel@tonic-gate  * Initialize lpl topology in the target based on topology currently present in
28577c478bd9Sstevel@tonic-gate  * lpl_bootstrap.
28587c478bd9Sstevel@tonic-gate  *
28597c478bd9Sstevel@tonic-gate  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
28607c478bd9Sstevel@tonic-gate  * initialize cp_default list of lpls. Up to this point all topology operations
28617c478bd9Sstevel@tonic-gate  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
28627c478bd9Sstevel@tonic-gate  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
28637c478bd9Sstevel@tonic-gate  * `target' points to the list of lpls in cp_default and `size' is the size of
28647c478bd9Sstevel@tonic-gate  * this list.
28657c478bd9Sstevel@tonic-gate  *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
28677c478bd9Sstevel@tonic-gate  *
28687c478bd9Sstevel@tonic-gate  * 1) Copies all fields from lpl_bootstrap to the target.
28697c478bd9Sstevel@tonic-gate  *
28707c478bd9Sstevel@tonic-gate  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
28717c478bd9Sstevel@tonic-gate  *
28727c478bd9Sstevel@tonic-gate  * 3) Updates lpl_parent pointers to point to the lpls in the target list
28737c478bd9Sstevel@tonic-gate  *    instead of lpl_bootstrap.
28747c478bd9Sstevel@tonic-gate  *
28757c478bd9Sstevel@tonic-gate  * 4) Updates pointers in the resource list of the target to point to the lpls
28767c478bd9Sstevel@tonic-gate  *    in the target list instead of lpl_bootstrap.
28777c478bd9Sstevel@tonic-gate  *
28787c478bd9Sstevel@tonic-gate  * After lpl_topo_bootstrap() completes, target contains the same information
28797c478bd9Sstevel@tonic-gate  * that would be present there if it were used during boot instead of
 * lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
 * this, so it is bzeroed.
28827c478bd9Sstevel@tonic-gate  */
28837c478bd9Sstevel@tonic-gate void
28847c478bd9Sstevel@tonic-gate lpl_topo_bootstrap(lpl_t *target, int size)
28857c478bd9Sstevel@tonic-gate {
28867c478bd9Sstevel@tonic-gate 	lpl_t	*lpl = lpl_bootstrap;
28877c478bd9Sstevel@tonic-gate 	lpl_t	*target_lpl = target;
28887c478bd9Sstevel@tonic-gate 	int	howmany;
28897c478bd9Sstevel@tonic-gate 	int	id;
28907c478bd9Sstevel@tonic-gate 	int	i;
28917c478bd9Sstevel@tonic-gate 
28927c478bd9Sstevel@tonic-gate 	/*
28937c478bd9Sstevel@tonic-gate 	 * The only target that should be passed here is cp_default lpl list.
28947c478bd9Sstevel@tonic-gate 	 */
28957c478bd9Sstevel@tonic-gate 	ASSERT(target == cp_default.cp_lgrploads);
28967c478bd9Sstevel@tonic-gate 	ASSERT(size == cp_default.cp_nlgrploads);
28977c478bd9Sstevel@tonic-gate 	ASSERT(!lgrp_topo_initialized);
28987c478bd9Sstevel@tonic-gate 	ASSERT(ncpus == 1);
28997c478bd9Sstevel@tonic-gate 
29007c478bd9Sstevel@tonic-gate 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
29017c478bd9Sstevel@tonic-gate 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
29027c478bd9Sstevel@tonic-gate 		/*
29037c478bd9Sstevel@tonic-gate 		 * Copy all fields from lpl.
29047c478bd9Sstevel@tonic-gate 		 */
29057c478bd9Sstevel@tonic-gate 
29067c478bd9Sstevel@tonic-gate 		*target_lpl = *lpl;
29077c478bd9Sstevel@tonic-gate 
29087c478bd9Sstevel@tonic-gate 		/*
29097c478bd9Sstevel@tonic-gate 		 * Substitute CPU0 lpl pointer with one relative to target.
29107c478bd9Sstevel@tonic-gate 		 */
29117c478bd9Sstevel@tonic-gate 		if (lpl->lpl_cpus == CPU) {
29127c478bd9Sstevel@tonic-gate 			ASSERT(CPU->cpu_lpl == lpl);
29137c478bd9Sstevel@tonic-gate 			CPU->cpu_lpl = target_lpl;
29147c478bd9Sstevel@tonic-gate 		}
29157c478bd9Sstevel@tonic-gate 
29167c478bd9Sstevel@tonic-gate 		/*
29177c478bd9Sstevel@tonic-gate 		 * Substitute parent information with parent relative to target.
29187c478bd9Sstevel@tonic-gate 		 */
29197c478bd9Sstevel@tonic-gate 		if (lpl->lpl_parent != NULL)
29207c478bd9Sstevel@tonic-gate 			target_lpl->lpl_parent = (lpl_t *)
29217c478bd9Sstevel@tonic-gate 			    (((uintptr_t)lpl->lpl_parent -
29227c478bd9Sstevel@tonic-gate 				(uintptr_t)lpl_bootstrap) +
29237c478bd9Sstevel@tonic-gate 				(uintptr_t)target);
29247c478bd9Sstevel@tonic-gate 
29257c478bd9Sstevel@tonic-gate 		/*
29267c478bd9Sstevel@tonic-gate 		 * Walk over resource set substituting pointers relative to
29277c478bd9Sstevel@tonic-gate 		 * lpl_bootstrap to pointers relative to target.
29287c478bd9Sstevel@tonic-gate 		 */
29297c478bd9Sstevel@tonic-gate 		ASSERT(lpl->lpl_nrset <= 1);
29307c478bd9Sstevel@tonic-gate 
29317c478bd9Sstevel@tonic-gate 		for (id = 0; id < lpl->lpl_nrset; id++) {
29327c478bd9Sstevel@tonic-gate 			if (lpl->lpl_rset[id] != NULL) {
29337c478bd9Sstevel@tonic-gate 				target_lpl->lpl_rset[id] =
29347c478bd9Sstevel@tonic-gate 				    (lpl_t *)
29357c478bd9Sstevel@tonic-gate 				    (((uintptr_t)lpl->lpl_rset[id] -
29367c478bd9Sstevel@tonic-gate 					(uintptr_t)lpl_bootstrap) +
29377c478bd9Sstevel@tonic-gate 					(uintptr_t)target);
29387c478bd9Sstevel@tonic-gate 			}
29397c478bd9Sstevel@tonic-gate 		}
29407c478bd9Sstevel@tonic-gate 	}
29417c478bd9Sstevel@tonic-gate 
29427c478bd9Sstevel@tonic-gate 	/*
29437c478bd9Sstevel@tonic-gate 	 * Topology information in lpl_bootstrap is no longer needed.
29447c478bd9Sstevel@tonic-gate 	 */
29457c478bd9Sstevel@tonic-gate 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
29467c478bd9Sstevel@tonic-gate }
29477c478bd9Sstevel@tonic-gate 
/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

/* Per-lgroup threshold, scaled down by the lgroup's cpu count */
#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

/* Per-lgroup difference, scaled down by the lgroup's cpu count */
#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / ncpu)

/*
 * lgrp_choose() will choose the root lgroup as home when the lowest lgroup
 * load average is above this threshold
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

/* current placement policy; see the LGRP_CHOOSE_* values above */
int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
30147c478bd9Sstevel@tonic-gate 
30157c478bd9Sstevel@tonic-gate /*
30167c478bd9Sstevel@tonic-gate  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
30177c478bd9Sstevel@tonic-gate  * be bound to a CPU or processor set.
30187c478bd9Sstevel@tonic-gate  *
30197c478bd9Sstevel@tonic-gate  * Arguments:
30207c478bd9Sstevel@tonic-gate  *	t		The thread
30217c478bd9Sstevel@tonic-gate  *	cpupart		The partition the thread belongs to.
30227c478bd9Sstevel@tonic-gate  *
30237c478bd9Sstevel@tonic-gate  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
30247c478bd9Sstevel@tonic-gate  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
30257c478bd9Sstevel@tonic-gate  *	 partitions changing out from under us and assumes that given thread is
30267c478bd9Sstevel@tonic-gate  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
30277c478bd9Sstevel@tonic-gate  *	 disabled, so don't grab any locks because we should never block under
30287c478bd9Sstevel@tonic-gate  *	 those conditions.
30297c478bd9Sstevel@tonic-gate  */
30307c478bd9Sstevel@tonic-gate lpl_t *
30317c478bd9Sstevel@tonic-gate lgrp_choose(kthread_t *t, cpupart_t *cpupart)
30327c478bd9Sstevel@tonic-gate {
30337c478bd9Sstevel@tonic-gate 	lgrp_load_t	bestload, bestrload;
30347c478bd9Sstevel@tonic-gate 	int		lgrpid_offset, lgrp_count;
30357c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid, lgrpid_start;
30367c478bd9Sstevel@tonic-gate 	lpl_t		*lpl, *bestlpl, *bestrlpl;
30377c478bd9Sstevel@tonic-gate 	klgrpset_t	lgrpset;
30387c478bd9Sstevel@tonic-gate 	proc_t		*p;
30397c478bd9Sstevel@tonic-gate 
30407c478bd9Sstevel@tonic-gate 	ASSERT(t != NULL);
30417c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
30427c478bd9Sstevel@tonic-gate 	    THREAD_LOCK_HELD(t));
30437c478bd9Sstevel@tonic-gate 	ASSERT(cpupart != NULL);
30447c478bd9Sstevel@tonic-gate 
30457c478bd9Sstevel@tonic-gate 	p = t->t_procp;
30467c478bd9Sstevel@tonic-gate 
30477c478bd9Sstevel@tonic-gate 	/* A process should always be in an active partition */
30487c478bd9Sstevel@tonic-gate 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
30497c478bd9Sstevel@tonic-gate 
30507c478bd9Sstevel@tonic-gate 	bestlpl = bestrlpl = NULL;
30517c478bd9Sstevel@tonic-gate 	bestload = bestrload = LGRP_LOADAVG_MAX;
30527c478bd9Sstevel@tonic-gate 	lgrpset = cpupart->cp_lgrpset;
30537c478bd9Sstevel@tonic-gate 
30547c478bd9Sstevel@tonic-gate 	switch (lgrp_choose_policy) {
30557c478bd9Sstevel@tonic-gate 	case LGRP_CHOOSE_RR:
30567c478bd9Sstevel@tonic-gate 		lgrpid = cpupart->cp_lgrp_hint;
30577c478bd9Sstevel@tonic-gate 		do {
30587c478bd9Sstevel@tonic-gate 			if (++lgrpid > lgrp_alloc_max)
30597c478bd9Sstevel@tonic-gate 				lgrpid = 0;
30607c478bd9Sstevel@tonic-gate 		} while (!klgrpset_ismember(lgrpset, lgrpid));
30617c478bd9Sstevel@tonic-gate 
30627c478bd9Sstevel@tonic-gate 		break;
30637c478bd9Sstevel@tonic-gate 	default:
30647c478bd9Sstevel@tonic-gate 	case LGRP_CHOOSE_TIME:
30657c478bd9Sstevel@tonic-gate 	case LGRP_CHOOSE_RANDOM:
30667c478bd9Sstevel@tonic-gate 		klgrpset_nlgrps(lgrpset, lgrp_count);
30677c478bd9Sstevel@tonic-gate 		lgrpid_offset =
30687c478bd9Sstevel@tonic-gate 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
30697c478bd9Sstevel@tonic-gate 		for (lgrpid = 0; ; lgrpid++) {
30707c478bd9Sstevel@tonic-gate 			if (klgrpset_ismember(lgrpset, lgrpid)) {
30717c478bd9Sstevel@tonic-gate 				if (--lgrpid_offset == 0)
30727c478bd9Sstevel@tonic-gate 					break;
30737c478bd9Sstevel@tonic-gate 			}
30747c478bd9Sstevel@tonic-gate 		}
30757c478bd9Sstevel@tonic-gate 		break;
30767c478bd9Sstevel@tonic-gate 	}
30777c478bd9Sstevel@tonic-gate 
30787c478bd9Sstevel@tonic-gate 	lgrpid_start = lgrpid;
30797c478bd9Sstevel@tonic-gate 
30807c478bd9Sstevel@tonic-gate 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
30817c478bd9Sstevel@tonic-gate 	    lgrp_id_t, cpupart->cp_lgrp_hint);
30827c478bd9Sstevel@tonic-gate 
30837c478bd9Sstevel@tonic-gate 	/*
30847c478bd9Sstevel@tonic-gate 	 * Use lgroup affinities (if any) to choose best lgroup
30857c478bd9Sstevel@tonic-gate 	 *
30867c478bd9Sstevel@tonic-gate 	 * NOTE: Assumes that thread is protected from going away and its
30877c478bd9Sstevel@tonic-gate 	 *	 lgroup affinities won't change (ie. p_lock, or
30887c478bd9Sstevel@tonic-gate 	 *	 thread_lock() being held and/or CPUs paused)
30897c478bd9Sstevel@tonic-gate 	 */
30907c478bd9Sstevel@tonic-gate 	if (t->t_lgrp_affinity) {
30917c478bd9Sstevel@tonic-gate 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
30927c478bd9Sstevel@tonic-gate 		if (lpl != NULL)
30937c478bd9Sstevel@tonic-gate 			return (lpl);
30947c478bd9Sstevel@tonic-gate 	}
30957c478bd9Sstevel@tonic-gate 
30967c478bd9Sstevel@tonic-gate 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
30977c478bd9Sstevel@tonic-gate 
30987c478bd9Sstevel@tonic-gate 	do {
30997c478bd9Sstevel@tonic-gate 		pgcnt_t	npgs;
31007c478bd9Sstevel@tonic-gate 
31017c478bd9Sstevel@tonic-gate 		/*
31027c478bd9Sstevel@tonic-gate 		 * Skip any lgroups outside of thread's pset
31037c478bd9Sstevel@tonic-gate 		 */
31047c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
31057c478bd9Sstevel@tonic-gate 			if (++lgrpid > lgrp_alloc_max)
31067c478bd9Sstevel@tonic-gate 				lgrpid = 0;	/* wrap the search */
31077c478bd9Sstevel@tonic-gate 			continue;
31087c478bd9Sstevel@tonic-gate 		}
31097c478bd9Sstevel@tonic-gate 
31107c478bd9Sstevel@tonic-gate 		/*
31117c478bd9Sstevel@tonic-gate 		 * Skip any non-leaf lgroups
31127c478bd9Sstevel@tonic-gate 		 */
31137c478bd9Sstevel@tonic-gate 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
31147c478bd9Sstevel@tonic-gate 			continue;
31157c478bd9Sstevel@tonic-gate 
31167c478bd9Sstevel@tonic-gate 		/*
31177c478bd9Sstevel@tonic-gate 		 * Skip any lgroups without enough free memory
31187c478bd9Sstevel@tonic-gate 		 * (when threshold set to nonzero positive value)
31197c478bd9Sstevel@tonic-gate 		 */
31207c478bd9Sstevel@tonic-gate 		if (lgrp_mem_free_thresh > 0) {
31217c478bd9Sstevel@tonic-gate 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
31227c478bd9Sstevel@tonic-gate 			if (npgs < lgrp_mem_free_thresh) {
31237c478bd9Sstevel@tonic-gate 				if (++lgrpid > lgrp_alloc_max)
31247c478bd9Sstevel@tonic-gate 					lgrpid = 0;	/* wrap the search */
31257c478bd9Sstevel@tonic-gate 				continue;
31267c478bd9Sstevel@tonic-gate 			}
31277c478bd9Sstevel@tonic-gate 		}
31287c478bd9Sstevel@tonic-gate 
31297c478bd9Sstevel@tonic-gate 		lpl = &cpupart->cp_lgrploads[lgrpid];
31307c478bd9Sstevel@tonic-gate 		if (klgrpset_isempty(p->p_lgrpset) ||
31317c478bd9Sstevel@tonic-gate 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
31327c478bd9Sstevel@tonic-gate 			/*
31337c478bd9Sstevel@tonic-gate 			 * Either this is a new process or the process already
31347c478bd9Sstevel@tonic-gate 			 * has threads on this lgrp, so this is a preferred
31357c478bd9Sstevel@tonic-gate 			 * lgroup for the thread.
31367c478bd9Sstevel@tonic-gate 			 */
3137ab761399Sesaxe 			if (bestlpl == NULL ||
3138ab761399Sesaxe 			    lpl_pick(lpl, bestlpl)) {
31397c478bd9Sstevel@tonic-gate 				bestload = lpl->lpl_loadavg;
31407c478bd9Sstevel@tonic-gate 				bestlpl = lpl;
31417c478bd9Sstevel@tonic-gate 			}
31427c478bd9Sstevel@tonic-gate 		} else {
31437c478bd9Sstevel@tonic-gate 			/*
31447c478bd9Sstevel@tonic-gate 			 * The process doesn't have any threads on this lgrp,
31457c478bd9Sstevel@tonic-gate 			 * but we're willing to consider this lgrp if the load
31467c478bd9Sstevel@tonic-gate 			 * difference is big enough to justify splitting up
31477c478bd9Sstevel@tonic-gate 			 * the process' threads.
31487c478bd9Sstevel@tonic-gate 			 */
3149ab761399Sesaxe 			if (bestrlpl == NULL ||
3150ab761399Sesaxe 			    lpl_pick(lpl, bestrlpl)) {
31517c478bd9Sstevel@tonic-gate 				bestrload = lpl->lpl_loadavg;
31527c478bd9Sstevel@tonic-gate 				bestrlpl = lpl;
31537c478bd9Sstevel@tonic-gate 			}
31547c478bd9Sstevel@tonic-gate 		}
31557c478bd9Sstevel@tonic-gate 		if (++lgrpid > lgrp_alloc_max)
31567c478bd9Sstevel@tonic-gate 			lgrpid = 0;	/* wrap the search */
31577c478bd9Sstevel@tonic-gate 	} while (lgrpid != lgrpid_start);
31587c478bd9Sstevel@tonic-gate 
31597c478bd9Sstevel@tonic-gate 	/*
31607c478bd9Sstevel@tonic-gate 	 * Return root lgroup if threshold isn't set to maximum value and
31617c478bd9Sstevel@tonic-gate 	 * lowest lgroup load average more than a certain threshold
31627c478bd9Sstevel@tonic-gate 	 */
31637c478bd9Sstevel@tonic-gate 	if (lgrp_load_thresh != UINT32_MAX &&
31647c478bd9Sstevel@tonic-gate 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
31657c478bd9Sstevel@tonic-gate 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
31667c478bd9Sstevel@tonic-gate 
31677c478bd9Sstevel@tonic-gate 	/*
31687c478bd9Sstevel@tonic-gate 	 * If all the lgroups over which the thread's process is spread are
3169ab761399Sesaxe 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3170ab761399Sesaxe 	 * the thread on one of the other leaf lgroups in the thread's
3171ab761399Sesaxe 	 * partition.
31727c478bd9Sstevel@tonic-gate 	 */
3173ab761399Sesaxe 	if ((bestlpl == NULL) ||
3174ab761399Sesaxe 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
31757c478bd9Sstevel@tonic-gate 	    (bestrload < bestload) &&	/* paranoid about wraparound */
31767c478bd9Sstevel@tonic-gate 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3177ab761399Sesaxe 	    bestload))) {
31787c478bd9Sstevel@tonic-gate 		bestlpl = bestrlpl;
31797c478bd9Sstevel@tonic-gate 	}
31807c478bd9Sstevel@tonic-gate 
3181ab761399Sesaxe 	if (bestlpl == NULL) {
3182ab761399Sesaxe 		/*
3183ab761399Sesaxe 		 * No lgroup looked particularly good, but we still
3184ab761399Sesaxe 		 * have to pick something. Go with the randomly selected
3185ab761399Sesaxe 		 * legal lgroup we started with above.
3186ab761399Sesaxe 		 */
3187ab761399Sesaxe 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3188ab761399Sesaxe 	}
3189ab761399Sesaxe 
31907c478bd9Sstevel@tonic-gate 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
31917c478bd9Sstevel@tonic-gate 	bestlpl->lpl_homed_time = gethrtime_unscaled();
31927c478bd9Sstevel@tonic-gate 
31937c478bd9Sstevel@tonic-gate 	ASSERT(bestlpl->lpl_ncpu > 0);
31947c478bd9Sstevel@tonic-gate 	return (bestlpl);
31957c478bd9Sstevel@tonic-gate }
31967c478bd9Sstevel@tonic-gate 
31977c478bd9Sstevel@tonic-gate /*
3198ab761399Sesaxe  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3199ab761399Sesaxe  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
32007c478bd9Sstevel@tonic-gate  */
32017c478bd9Sstevel@tonic-gate static int
32027c478bd9Sstevel@tonic-gate lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
32037c478bd9Sstevel@tonic-gate {
32047c478bd9Sstevel@tonic-gate 	lgrp_load_t	l1, l2;
32057c478bd9Sstevel@tonic-gate 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
32067c478bd9Sstevel@tonic-gate 
32077c478bd9Sstevel@tonic-gate 	l1 = lpl1->lpl_loadavg;
32087c478bd9Sstevel@tonic-gate 	l2 = lpl2->lpl_loadavg;
32097c478bd9Sstevel@tonic-gate 
32107c478bd9Sstevel@tonic-gate 	if ((l1 + tolerance < l2) && (l1 < l2)) {
32117c478bd9Sstevel@tonic-gate 		/* lpl1 is significantly less loaded than lpl2 */
32127c478bd9Sstevel@tonic-gate 		return (1);
32137c478bd9Sstevel@tonic-gate 	}
32147c478bd9Sstevel@tonic-gate 
32157c478bd9Sstevel@tonic-gate 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
32167c478bd9Sstevel@tonic-gate 	    l1 + tolerance >= l2 && l1 < l2 &&
32177c478bd9Sstevel@tonic-gate 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
32187c478bd9Sstevel@tonic-gate 		/*
32197c478bd9Sstevel@tonic-gate 		 * lpl1's load is within the tolerance of lpl2. We're
32207c478bd9Sstevel@tonic-gate 		 * willing to consider it be to better however if
32217c478bd9Sstevel@tonic-gate 		 * it has been longer since we last homed a thread there
32227c478bd9Sstevel@tonic-gate 		 */
32237c478bd9Sstevel@tonic-gate 		return (1);
32247c478bd9Sstevel@tonic-gate 	}
32257c478bd9Sstevel@tonic-gate 
32267c478bd9Sstevel@tonic-gate 	return (0);
32277c478bd9Sstevel@tonic-gate }
32287c478bd9Sstevel@tonic-gate 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
/* tunable copy of LGRP_MIN_NSEC; consulted by lgrp_move_thread() */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
32377c478bd9Sstevel@tonic-gate 
32387c478bd9Sstevel@tonic-gate /*
32397c478bd9Sstevel@tonic-gate  * Routine to change a thread's lgroup affiliation.  This routine updates
32407c478bd9Sstevel@tonic-gate  * the thread's kthread_t struct and its process' proc_t struct to note the
32417c478bd9Sstevel@tonic-gate  * thread's new lgroup affiliation, and its lgroup affinities.
32427c478bd9Sstevel@tonic-gate  *
32437c478bd9Sstevel@tonic-gate  * Note that this is the only routine that modifies a thread's t_lpl field,
32447c478bd9Sstevel@tonic-gate  * and that adds in or removes anticipatory load.
32457c478bd9Sstevel@tonic-gate  *
32467c478bd9Sstevel@tonic-gate  * If the thread is exiting, newlpl is NULL.
32477c478bd9Sstevel@tonic-gate  *
32487c478bd9Sstevel@tonic-gate  * Locking:
32497c478bd9Sstevel@tonic-gate  * The following lock must be held on entry:
32507c478bd9Sstevel@tonic-gate  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
32517c478bd9Sstevel@tonic-gate  *		doesn't get removed from t's partition
32527c478bd9Sstevel@tonic-gate  *
32537c478bd9Sstevel@tonic-gate  * This routine is not allowed to grab any locks, since it may be called
32547c478bd9Sstevel@tonic-gate  * with cpus paused (such as from cpu_offline).
32557c478bd9Sstevel@tonic-gate  */
32567c478bd9Sstevel@tonic-gate void
32577c478bd9Sstevel@tonic-gate lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
32587c478bd9Sstevel@tonic-gate {
32597c478bd9Sstevel@tonic-gate 	proc_t		*p;
32607c478bd9Sstevel@tonic-gate 	lpl_t		*lpl, *oldlpl;
32617c478bd9Sstevel@tonic-gate 	lgrp_id_t	oldid;
32627c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
32637c478bd9Sstevel@tonic-gate 	uint_t		ncpu;
32647c478bd9Sstevel@tonic-gate 	lgrp_load_t	old, new;
32657c478bd9Sstevel@tonic-gate 
32667c478bd9Sstevel@tonic-gate 	ASSERT(t);
32677c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
32687c478bd9Sstevel@tonic-gate 	    THREAD_LOCK_HELD(t));
32697c478bd9Sstevel@tonic-gate 
32707c478bd9Sstevel@tonic-gate 	/*
32717c478bd9Sstevel@tonic-gate 	 * If not changing lpls, just return
32727c478bd9Sstevel@tonic-gate 	 */
32737c478bd9Sstevel@tonic-gate 	if ((oldlpl = t->t_lpl) == newlpl)
32747c478bd9Sstevel@tonic-gate 		return;
32757c478bd9Sstevel@tonic-gate 
32767c478bd9Sstevel@tonic-gate 	/*
32777c478bd9Sstevel@tonic-gate 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
32787c478bd9Sstevel@tonic-gate 	 * associated with process 0 rather than with its original process).
32797c478bd9Sstevel@tonic-gate 	 */
32807c478bd9Sstevel@tonic-gate 	if (t->t_proc_flag & TP_LWPEXIT) {
32817c478bd9Sstevel@tonic-gate 		if (newlpl != NULL) {
32827c478bd9Sstevel@tonic-gate 			t->t_lpl = newlpl;
32837c478bd9Sstevel@tonic-gate 		}
32847c478bd9Sstevel@tonic-gate 		return;
32857c478bd9Sstevel@tonic-gate 	}
32867c478bd9Sstevel@tonic-gate 
32877c478bd9Sstevel@tonic-gate 	p = ttoproc(t);
32887c478bd9Sstevel@tonic-gate 
32897c478bd9Sstevel@tonic-gate 	/*
32907c478bd9Sstevel@tonic-gate 	 * If the thread had a previous lgroup, update its process' p_lgrpset
32917c478bd9Sstevel@tonic-gate 	 * to account for it being moved from its old lgroup.
32927c478bd9Sstevel@tonic-gate 	 */
32937c478bd9Sstevel@tonic-gate 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
32947c478bd9Sstevel@tonic-gate 	    (p->p_tlist != NULL)) {
32957c478bd9Sstevel@tonic-gate 		oldid = oldlpl->lpl_lgrpid;
32967c478bd9Sstevel@tonic-gate 
32977c478bd9Sstevel@tonic-gate 		if (newlpl != NULL)
32987c478bd9Sstevel@tonic-gate 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
32997c478bd9Sstevel@tonic-gate 
33007c478bd9Sstevel@tonic-gate 		if ((do_lgrpset_delete) &&
33017c478bd9Sstevel@tonic-gate 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
33027c478bd9Sstevel@tonic-gate 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
33037c478bd9Sstevel@tonic-gate 				/*
33047c478bd9Sstevel@tonic-gate 				 * Check if a thread other than the thread
33057c478bd9Sstevel@tonic-gate 				 * that's moving is assigned to the same
33067c478bd9Sstevel@tonic-gate 				 * lgroup as the thread that's moving.  Note
33077c478bd9Sstevel@tonic-gate 				 * that we have to compare lgroup IDs, rather
33087c478bd9Sstevel@tonic-gate 				 * than simply comparing t_lpl's, since the
33097c478bd9Sstevel@tonic-gate 				 * threads may belong to different partitions
33107c478bd9Sstevel@tonic-gate 				 * but be assigned to the same lgroup.
33117c478bd9Sstevel@tonic-gate 				 */
33127c478bd9Sstevel@tonic-gate 				ASSERT(tp->t_lpl != NULL);
33137c478bd9Sstevel@tonic-gate 
33147c478bd9Sstevel@tonic-gate 				if ((tp != t) &&
33157c478bd9Sstevel@tonic-gate 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
33167c478bd9Sstevel@tonic-gate 					/*
33177c478bd9Sstevel@tonic-gate 					 * Another thread is assigned to the
33187c478bd9Sstevel@tonic-gate 					 * same lgroup as the thread that's
33197c478bd9Sstevel@tonic-gate 					 * moving, p_lgrpset doesn't change.
33207c478bd9Sstevel@tonic-gate 					 */
33217c478bd9Sstevel@tonic-gate 					break;
33227c478bd9Sstevel@tonic-gate 				} else if (tp == p->p_tlist) {
33237c478bd9Sstevel@tonic-gate 					/*
33247c478bd9Sstevel@tonic-gate 					 * No other thread is assigned to the
33257c478bd9Sstevel@tonic-gate 					 * same lgroup as the exiting thread,
33267c478bd9Sstevel@tonic-gate 					 * clear the lgroup's bit in p_lgrpset.
33277c478bd9Sstevel@tonic-gate 					 */
33287c478bd9Sstevel@tonic-gate 					klgrpset_del(p->p_lgrpset, oldid);
33297c478bd9Sstevel@tonic-gate 					break;
33307c478bd9Sstevel@tonic-gate 				}
33317c478bd9Sstevel@tonic-gate 			}
33327c478bd9Sstevel@tonic-gate 		}
33337c478bd9Sstevel@tonic-gate 
33347c478bd9Sstevel@tonic-gate 		/*
33357c478bd9Sstevel@tonic-gate 		 * If this thread was assigned to its old lgroup for such a
33367c478bd9Sstevel@tonic-gate 		 * short amount of time that the anticipatory load that was
33377c478bd9Sstevel@tonic-gate 		 * added on its behalf has aged very little, remove that
33387c478bd9Sstevel@tonic-gate 		 * anticipatory load.
33397c478bd9Sstevel@tonic-gate 		 */
33407c478bd9Sstevel@tonic-gate 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
33417c478bd9Sstevel@tonic-gate 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
33427c478bd9Sstevel@tonic-gate 			lpl = oldlpl;
33437c478bd9Sstevel@tonic-gate 			for (;;) {
33447c478bd9Sstevel@tonic-gate 				do {
33457c478bd9Sstevel@tonic-gate 					old = new = lpl->lpl_loadavg;
33467c478bd9Sstevel@tonic-gate 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
33477c478bd9Sstevel@tonic-gate 					if (new > old) {
33487c478bd9Sstevel@tonic-gate 						/*
33497c478bd9Sstevel@tonic-gate 						 * this can happen if the load
33507c478bd9Sstevel@tonic-gate 						 * average was aged since we
33517c478bd9Sstevel@tonic-gate 						 * added in the anticipatory
33527c478bd9Sstevel@tonic-gate 						 * load
33537c478bd9Sstevel@tonic-gate 						 */
33547c478bd9Sstevel@tonic-gate 						new = 0;
33557c478bd9Sstevel@tonic-gate 					}
33567c478bd9Sstevel@tonic-gate 				} while (cas32(
33577c478bd9Sstevel@tonic-gate 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
33587c478bd9Sstevel@tonic-gate 					    new) != old);
33597c478bd9Sstevel@tonic-gate 
33607c478bd9Sstevel@tonic-gate 				lpl = lpl->lpl_parent;
33617c478bd9Sstevel@tonic-gate 				if (lpl == NULL)
33627c478bd9Sstevel@tonic-gate 					break;
33637c478bd9Sstevel@tonic-gate 
33647c478bd9Sstevel@tonic-gate 				ncpu = lpl->lpl_ncpu;
33657c478bd9Sstevel@tonic-gate 				ASSERT(ncpu > 0);
33667c478bd9Sstevel@tonic-gate 			}
33677c478bd9Sstevel@tonic-gate 		}
33687c478bd9Sstevel@tonic-gate 	}
33697c478bd9Sstevel@tonic-gate 	/*
33707c478bd9Sstevel@tonic-gate 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
33717c478bd9Sstevel@tonic-gate 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
33727c478bd9Sstevel@tonic-gate 	 * to its new lgroup to account for its move to its new lgroup.
33737c478bd9Sstevel@tonic-gate 	 */
33747c478bd9Sstevel@tonic-gate 	if (newlpl != NULL) {
33757c478bd9Sstevel@tonic-gate 		/*
33767c478bd9Sstevel@tonic-gate 		 * This thread is moving to a new lgroup
33777c478bd9Sstevel@tonic-gate 		 */
33787c478bd9Sstevel@tonic-gate 		t->t_lpl = newlpl;
33797c478bd9Sstevel@tonic-gate 
33807c478bd9Sstevel@tonic-gate 		/*
33817c478bd9Sstevel@tonic-gate 		 * Reflect move in load average of new lgroup
33827c478bd9Sstevel@tonic-gate 		 * unless it is root lgroup
33837c478bd9Sstevel@tonic-gate 		 */
33847c478bd9Sstevel@tonic-gate 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
33857c478bd9Sstevel@tonic-gate 			return;
33867c478bd9Sstevel@tonic-gate 
33877c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
33887c478bd9Sstevel@tonic-gate 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
33897c478bd9Sstevel@tonic-gate 		}
33907c478bd9Sstevel@tonic-gate 
33917c478bd9Sstevel@tonic-gate 		/*
33927c478bd9Sstevel@tonic-gate 		 * It'll take some time for the load on the new lgroup
33937c478bd9Sstevel@tonic-gate 		 * to reflect this thread's placement on it.  We'd
33947c478bd9Sstevel@tonic-gate 		 * like not, however, to have all threads between now
33957c478bd9Sstevel@tonic-gate 		 * and then also piling on to this lgroup.  To avoid
33967c478bd9Sstevel@tonic-gate 		 * this pileup, we anticipate the load this thread
33977c478bd9Sstevel@tonic-gate 		 * will generate on its new lgroup.  The goal is to
33987c478bd9Sstevel@tonic-gate 		 * make the lgroup's load appear as though the thread
33997c478bd9Sstevel@tonic-gate 		 * had been there all along.  We're very conservative
34007c478bd9Sstevel@tonic-gate 		 * in calculating this anticipatory load, we assume
34017c478bd9Sstevel@tonic-gate 		 * the worst case case (100% CPU-bound thread).  This
34027c478bd9Sstevel@tonic-gate 		 * may be modified in the future to be more accurate.
34037c478bd9Sstevel@tonic-gate 		 */
34047c478bd9Sstevel@tonic-gate 		lpl = newlpl;
34057c478bd9Sstevel@tonic-gate 		for (;;) {
34067c478bd9Sstevel@tonic-gate 			ncpu = lpl->lpl_ncpu;
34077c478bd9Sstevel@tonic-gate 			ASSERT(ncpu > 0);
34087c478bd9Sstevel@tonic-gate 			do {
34097c478bd9Sstevel@tonic-gate 				old = new = lpl->lpl_loadavg;
34107c478bd9Sstevel@tonic-gate 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
34117c478bd9Sstevel@tonic-gate 				/*
34127c478bd9Sstevel@tonic-gate 				 * Check for overflow
34137c478bd9Sstevel@tonic-gate 				 * Underflow not possible here
34147c478bd9Sstevel@tonic-gate 				 */
34157c478bd9Sstevel@tonic-gate 				if (new < old)
34167c478bd9Sstevel@tonic-gate 					new = UINT32_MAX;
34177c478bd9Sstevel@tonic-gate 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
34187c478bd9Sstevel@tonic-gate 			    new) != old);
34197c478bd9Sstevel@tonic-gate 
34207c478bd9Sstevel@tonic-gate 			lpl = lpl->lpl_parent;
34217c478bd9Sstevel@tonic-gate 			if (lpl == NULL)
34227c478bd9Sstevel@tonic-gate 				break;
34237c478bd9Sstevel@tonic-gate 		}
34247c478bd9Sstevel@tonic-gate 		t->t_anttime = gethrtime();
34257c478bd9Sstevel@tonic-gate 	}
34267c478bd9Sstevel@tonic-gate }
34277c478bd9Sstevel@tonic-gate 
34287c478bd9Sstevel@tonic-gate /*
34297c478bd9Sstevel@tonic-gate  * Return lgroup memory allocation policy given advice from madvise(3C)
34307c478bd9Sstevel@tonic-gate  */
34317c478bd9Sstevel@tonic-gate lgrp_mem_policy_t
34327c478bd9Sstevel@tonic-gate lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
34337c478bd9Sstevel@tonic-gate {
34347c478bd9Sstevel@tonic-gate 	switch (advice) {
34357c478bd9Sstevel@tonic-gate 	case MADV_ACCESS_LWP:
34367c478bd9Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_NEXT);
34377c478bd9Sstevel@tonic-gate 	case MADV_ACCESS_MANY:
34387c478bd9Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_RANDOM);
34397c478bd9Sstevel@tonic-gate 	default:
34407c478bd9Sstevel@tonic-gate 		return (lgrp_mem_policy_default(size, type));
34417c478bd9Sstevel@tonic-gate 	}
34427c478bd9Sstevel@tonic-gate }
34437c478bd9Sstevel@tonic-gate 
34447c478bd9Sstevel@tonic-gate /*
34457c478bd9Sstevel@tonic-gate  * Figure out default policy
34467c478bd9Sstevel@tonic-gate  */
34477c478bd9Sstevel@tonic-gate lgrp_mem_policy_t
34487c478bd9Sstevel@tonic-gate lgrp_mem_policy_default(size_t size, int type)
34497c478bd9Sstevel@tonic-gate {
34507c478bd9Sstevel@tonic-gate 	cpupart_t		*cp;
34517c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_t	policy;
34527c478bd9Sstevel@tonic-gate 	size_t			pset_mem_size;
34537c478bd9Sstevel@tonic-gate 
34547c478bd9Sstevel@tonic-gate 	/*
34557c478bd9Sstevel@tonic-gate 	 * Randomly allocate memory across lgroups for shared memory
34567c478bd9Sstevel@tonic-gate 	 * beyond a certain threshold
34577c478bd9Sstevel@tonic-gate 	 */
34587c478bd9Sstevel@tonic-gate 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
34597c478bd9Sstevel@tonic-gate 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
34607c478bd9Sstevel@tonic-gate 		/*
34617c478bd9Sstevel@tonic-gate 		 * Get total memory size of current thread's pset
34627c478bd9Sstevel@tonic-gate 		 */
34637c478bd9Sstevel@tonic-gate 		kpreempt_disable();
34647c478bd9Sstevel@tonic-gate 		cp = curthread->t_cpupart;
34657c478bd9Sstevel@tonic-gate 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
34667c478bd9Sstevel@tonic-gate 		kpreempt_enable();
34677c478bd9Sstevel@tonic-gate 
34687c478bd9Sstevel@tonic-gate 		/*
34697c478bd9Sstevel@tonic-gate 		 * Choose policy to randomly allocate memory across
34707c478bd9Sstevel@tonic-gate 		 * lgroups in pset if it will fit and is not default
34717c478bd9Sstevel@tonic-gate 		 * partition.  Otherwise, allocate memory randomly
34727c478bd9Sstevel@tonic-gate 		 * across machine.
34737c478bd9Sstevel@tonic-gate 		 */
34747c478bd9Sstevel@tonic-gate 		if (lgrp_mem_pset_aware && size < pset_mem_size)
34757c478bd9Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
34767c478bd9Sstevel@tonic-gate 		else
34777c478bd9Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM;
34787c478bd9Sstevel@tonic-gate 	} else
34797c478bd9Sstevel@tonic-gate 		/*
34807c478bd9Sstevel@tonic-gate 		 * Apply default policy for private memory and
34817c478bd9Sstevel@tonic-gate 		 * shared memory under the respective random
34827c478bd9Sstevel@tonic-gate 		 * threshold.
34837c478bd9Sstevel@tonic-gate 		 */
34847c478bd9Sstevel@tonic-gate 		policy = lgrp_mem_default_policy;
34857c478bd9Sstevel@tonic-gate 
34867c478bd9Sstevel@tonic-gate 	return (policy);
34877c478bd9Sstevel@tonic-gate }
34887c478bd9Sstevel@tonic-gate 
34897c478bd9Sstevel@tonic-gate /*
34907c478bd9Sstevel@tonic-gate  * Get memory allocation policy for this segment
34917c478bd9Sstevel@tonic-gate  */
34927c478bd9Sstevel@tonic-gate lgrp_mem_policy_info_t *
34937c478bd9Sstevel@tonic-gate lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
34947c478bd9Sstevel@tonic-gate {
34957c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
34967c478bd9Sstevel@tonic-gate 	extern struct seg_ops	segspt_ops;
34977c478bd9Sstevel@tonic-gate 	extern struct seg_ops	segspt_shmops;
34987c478bd9Sstevel@tonic-gate 
34997c478bd9Sstevel@tonic-gate 	/*
35007c478bd9Sstevel@tonic-gate 	 * This is for binary compatibility to protect against third party
35017c478bd9Sstevel@tonic-gate 	 * segment drivers which haven't recompiled to allow for
35027c478bd9Sstevel@tonic-gate 	 * SEGOP_GETPOLICY()
35037c478bd9Sstevel@tonic-gate 	 */
35047c478bd9Sstevel@tonic-gate 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
35057c478bd9Sstevel@tonic-gate 	    seg->s_ops != &segspt_shmops)
35067c478bd9Sstevel@tonic-gate 		return (NULL);
35077c478bd9Sstevel@tonic-gate 
35087c478bd9Sstevel@tonic-gate 	policy_info = NULL;
35097c478bd9Sstevel@tonic-gate 	if (seg->s_ops->getpolicy != NULL)
35107c478bd9Sstevel@tonic-gate 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
35117c478bd9Sstevel@tonic-gate 
35127c478bd9Sstevel@tonic-gate 	return (policy_info);
35137c478bd9Sstevel@tonic-gate }
35147c478bd9Sstevel@tonic-gate 
35157c478bd9Sstevel@tonic-gate /*
35167c478bd9Sstevel@tonic-gate  * Set policy for allocating private memory given desired policy, policy info,
35177c478bd9Sstevel@tonic-gate  * size in bytes of memory that policy is being applied.
35187c478bd9Sstevel@tonic-gate  * Return 0 if policy wasn't set already and 1 if policy was set already
35197c478bd9Sstevel@tonic-gate  */
35207c478bd9Sstevel@tonic-gate int
35217c478bd9Sstevel@tonic-gate lgrp_privm_policy_set(lgrp_mem_policy_t policy,
35227c478bd9Sstevel@tonic-gate     lgrp_mem_policy_info_t *policy_info, size_t size)
35237c478bd9Sstevel@tonic-gate {
35247c478bd9Sstevel@tonic-gate 
35257c478bd9Sstevel@tonic-gate 	ASSERT(policy_info != NULL);
35267c478bd9Sstevel@tonic-gate 
35277c478bd9Sstevel@tonic-gate 	if (policy == LGRP_MEM_POLICY_DEFAULT)
35287c478bd9Sstevel@tonic-gate 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
35297c478bd9Sstevel@tonic-gate 
35307c478bd9Sstevel@tonic-gate 	/*
35317c478bd9Sstevel@tonic-gate 	 * Policy set already?
35327c478bd9Sstevel@tonic-gate 	 */
35337c478bd9Sstevel@tonic-gate 	if (policy == policy_info->mem_policy)
35347c478bd9Sstevel@tonic-gate 		return (1);
35357c478bd9Sstevel@tonic-gate 
35367c478bd9Sstevel@tonic-gate 	/*
35377c478bd9Sstevel@tonic-gate 	 * Set policy
35387c478bd9Sstevel@tonic-gate 	 */
35397c478bd9Sstevel@tonic-gate 	policy_info->mem_policy = policy;
35407c478bd9Sstevel@tonic-gate 	policy_info->mem_reserved = 0;
35417c478bd9Sstevel@tonic-gate 
35427c478bd9Sstevel@tonic-gate 	return (0);
35437c478bd9Sstevel@tonic-gate }
35447c478bd9Sstevel@tonic-gate 
35457c478bd9Sstevel@tonic-gate 
35467c478bd9Sstevel@tonic-gate /*
35477c478bd9Sstevel@tonic-gate  * Get shared memory allocation policy with given tree and offset
35487c478bd9Sstevel@tonic-gate  */
35497c478bd9Sstevel@tonic-gate lgrp_mem_policy_info_t *
35507c478bd9Sstevel@tonic-gate lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
35517c478bd9Sstevel@tonic-gate     u_offset_t vn_off)
35527c478bd9Sstevel@tonic-gate {
35537c478bd9Sstevel@tonic-gate 	u_offset_t		off;
35547c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
35557c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*policy_seg;
35567c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
35577c478bd9Sstevel@tonic-gate 	avl_tree_t		*tree;
35587c478bd9Sstevel@tonic-gate 	avl_index_t		where;
35597c478bd9Sstevel@tonic-gate 
35607c478bd9Sstevel@tonic-gate 	/*
35617c478bd9Sstevel@tonic-gate 	 * Get policy segment tree from anon_map or vnode and use specified
35627c478bd9Sstevel@tonic-gate 	 * anon index or vnode offset as offset
35637c478bd9Sstevel@tonic-gate 	 *
35647c478bd9Sstevel@tonic-gate 	 * Assume that no lock needs to be held on anon_map or vnode, since
35657c478bd9Sstevel@tonic-gate 	 * they should be protected by their reference count which must be
35667c478bd9Sstevel@tonic-gate 	 * nonzero for an existing segment
35677c478bd9Sstevel@tonic-gate 	 */
35687c478bd9Sstevel@tonic-gate 	if (amp) {
35697c478bd9Sstevel@tonic-gate 		ASSERT(amp->refcnt != 0);
35707c478bd9Sstevel@tonic-gate 		shm_locality = amp->locality;
35717c478bd9Sstevel@tonic-gate 		if (shm_locality == NULL)
35727c478bd9Sstevel@tonic-gate 			return (NULL);
35737c478bd9Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
35747c478bd9Sstevel@tonic-gate 		off = ptob(anon_index);
35757c478bd9Sstevel@tonic-gate 	} else if (vp) {
35767c478bd9Sstevel@tonic-gate 		shm_locality = vp->v_locality;
35777c478bd9Sstevel@tonic-gate 		if (shm_locality == NULL)
35787c478bd9Sstevel@tonic-gate 			return (NULL);
35797c478bd9Sstevel@tonic-gate 		ASSERT(shm_locality->loc_count != 0);
35807c478bd9Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
35817c478bd9Sstevel@tonic-gate 		off = vn_off;
35827c478bd9Sstevel@tonic-gate 	}
35837c478bd9Sstevel@tonic-gate 
35847c478bd9Sstevel@tonic-gate 	if (tree == NULL)
35857c478bd9Sstevel@tonic-gate 		return (NULL);
35867c478bd9Sstevel@tonic-gate 
35877c478bd9Sstevel@tonic-gate 	/*
35887c478bd9Sstevel@tonic-gate 	 * Lookup policy segment for offset into shared object and return
35897c478bd9Sstevel@tonic-gate 	 * policy info
35907c478bd9Sstevel@tonic-gate 	 */
35917c478bd9Sstevel@tonic-gate 	rw_enter(&shm_locality->loc_lock, RW_READER);
35927c478bd9Sstevel@tonic-gate 	policy_info = NULL;
35937c478bd9Sstevel@tonic-gate 	policy_seg = avl_find(tree, &off, &where);
35947c478bd9Sstevel@tonic-gate 	if (policy_seg)
35957c478bd9Sstevel@tonic-gate 		policy_info = &policy_seg->shm_policy;
35967c478bd9Sstevel@tonic-gate 	rw_exit(&shm_locality->loc_lock);
35977c478bd9Sstevel@tonic-gate 
35987c478bd9Sstevel@tonic-gate 	return (policy_info);
35997c478bd9Sstevel@tonic-gate }
36007c478bd9Sstevel@tonic-gate 
/*
 * Default memory allocation policy for kernel segmap pages.
 * Applied in lgrp_mem_choose() when the faulting segment is segkmap.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3605*611ffe8aSesaxe 
/*
 * Return lgroup to use for allocating memory
 * given the segment and address
 *
 * There isn't any mutual exclusion that exists between calls
 * to this routine and DR, so this routine and whomever calls it
 * should be mindful of the possibility that the lgrp returned
 * may be deleted. If this happens, dereferences of the lgrp
 * pointer will still be safe, but the resources in the lgrp will
 * be gone, and LGRP_EXISTS() will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int			i;
	lgrp_t			*lgrp;
	klgrpset_t		lgrpset;
	int			lgrps_spanned;
	unsigned long		off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t		random;
	int			stat = 0;
	extern struct seg	*segkmap;

	/*
	 * Just return the root lgroup if the lgrp framework hasn't
	 * finished initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			/*
			 * Kernel address space: segmap pages get their own
			 * (tunable) default policy, and any per-process or
			 * per-pset random policy is reduced to plain random
			 * placement.
			 */
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL)
				policy = policy_info->mem_policy;
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		/*
		 * Walk the lgroup table, skipping IDs that aren't in the
		 * set, until the randomly chosen member is reached.
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		/*
		 * stat is nonzero only when we fell through from the
		 * RANDOM_PROC case above; otherwise account this as a
		 * pset-random allocation.
		 */
		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		/*
		 * Advance 'off' members forward through the set of
		 * lgroups with memory, wrapping around the table.
		 */
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}
38477c478bd9Sstevel@tonic-gate 
38487c478bd9Sstevel@tonic-gate /*
38497c478bd9Sstevel@tonic-gate  * Return the number of pages in an lgroup
38507c478bd9Sstevel@tonic-gate  *
38517c478bd9Sstevel@tonic-gate  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
38527c478bd9Sstevel@tonic-gate  *	 could cause tests that rely on the numat driver to fail....
38537c478bd9Sstevel@tonic-gate  */
38547c478bd9Sstevel@tonic-gate pgcnt_t
38557c478bd9Sstevel@tonic-gate lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
38567c478bd9Sstevel@tonic-gate {
38577c478bd9Sstevel@tonic-gate 	lgrp_t *lgrp;
38587c478bd9Sstevel@tonic-gate 
38597c478bd9Sstevel@tonic-gate 	lgrp = lgrp_table[lgrpid];
38607c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp) ||
38617c478bd9Sstevel@tonic-gate 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
38627c478bd9Sstevel@tonic-gate 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
38637c478bd9Sstevel@tonic-gate 		return (0);
38647c478bd9Sstevel@tonic-gate 
38657c478bd9Sstevel@tonic-gate 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
38667c478bd9Sstevel@tonic-gate }
38677c478bd9Sstevel@tonic-gate 
38687c478bd9Sstevel@tonic-gate /*
38697c478bd9Sstevel@tonic-gate  * Initialize lgroup shared memory allocation policy support
38707c478bd9Sstevel@tonic-gate  */
38717c478bd9Sstevel@tonic-gate void
38727c478bd9Sstevel@tonic-gate lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
38737c478bd9Sstevel@tonic-gate {
38747c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
38757c478bd9Sstevel@tonic-gate 
38767c478bd9Sstevel@tonic-gate 	/*
38777c478bd9Sstevel@tonic-gate 	 * Initialize locality field in anon_map
38787c478bd9Sstevel@tonic-gate 	 * Don't need any locks because this is called when anon_map is
38797c478bd9Sstevel@tonic-gate 	 * allocated, but not used anywhere yet.
38807c478bd9Sstevel@tonic-gate 	 */
38817c478bd9Sstevel@tonic-gate 	if (amp) {
38827c478bd9Sstevel@tonic-gate 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
38837c478bd9Sstevel@tonic-gate 		if (amp->locality == NULL) {
38847c478bd9Sstevel@tonic-gate 			/*
38857c478bd9Sstevel@tonic-gate 			 * Allocate and initialize shared memory locality info
38867c478bd9Sstevel@tonic-gate 			 * and set anon_map locality pointer to it
38877c478bd9Sstevel@tonic-gate 			 * Drop lock across kmem_alloc(KM_SLEEP)
38887c478bd9Sstevel@tonic-gate 			 */
38897c478bd9Sstevel@tonic-gate 			ANON_LOCK_EXIT(&amp->a_rwlock);
38907c478bd9Sstevel@tonic-gate 			shm_locality = kmem_alloc(sizeof (*shm_locality),
38917c478bd9Sstevel@tonic-gate 			    KM_SLEEP);
38927c478bd9Sstevel@tonic-gate 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
38937c478bd9Sstevel@tonic-gate 			    NULL);
38947c478bd9Sstevel@tonic-gate 			shm_locality->loc_count = 1;	/* not used for amp */
38957c478bd9Sstevel@tonic-gate 			shm_locality->loc_tree = NULL;
38967c478bd9Sstevel@tonic-gate 
38977c478bd9Sstevel@tonic-gate 			/*
38987c478bd9Sstevel@tonic-gate 			 * Reacquire lock and check to see whether anyone beat
38997c478bd9Sstevel@tonic-gate 			 * us to initializing the locality info
39007c478bd9Sstevel@tonic-gate 			 */
39017c478bd9Sstevel@tonic-gate 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
39027c478bd9Sstevel@tonic-gate 			if (amp->locality != NULL) {
39037c478bd9Sstevel@tonic-gate 				rw_destroy(&shm_locality->loc_lock);
39047c478bd9Sstevel@tonic-gate 				kmem_free(shm_locality,
39057c478bd9Sstevel@tonic-gate 				    sizeof (*shm_locality));
39067c478bd9Sstevel@tonic-gate 			} else
39077c478bd9Sstevel@tonic-gate 				amp->locality = shm_locality;
39087c478bd9Sstevel@tonic-gate 		}
39097c478bd9Sstevel@tonic-gate 		ANON_LOCK_EXIT(&amp->a_rwlock);
39107c478bd9Sstevel@tonic-gate 		return;
39117c478bd9Sstevel@tonic-gate 	}
39127c478bd9Sstevel@tonic-gate 
39137c478bd9Sstevel@tonic-gate 	/*
39147c478bd9Sstevel@tonic-gate 	 * Allocate shared vnode policy info if vnode is not locality aware yet
39157c478bd9Sstevel@tonic-gate 	 */
39167c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
39177c478bd9Sstevel@tonic-gate 	if ((vp->v_flag & V_LOCALITY) == 0) {
39187c478bd9Sstevel@tonic-gate 		/*
39197c478bd9Sstevel@tonic-gate 		 * Allocate and initialize shared memory locality info
39207c478bd9Sstevel@tonic-gate 		 */
39217c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
39227c478bd9Sstevel@tonic-gate 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
39237c478bd9Sstevel@tonic-gate 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
39247c478bd9Sstevel@tonic-gate 		shm_locality->loc_count = 1;
39257c478bd9Sstevel@tonic-gate 		shm_locality->loc_tree = NULL;
39267c478bd9Sstevel@tonic-gate 
39277c478bd9Sstevel@tonic-gate 		/*
39287c478bd9Sstevel@tonic-gate 		 * Point vnode locality field at shared vnode policy info
39297c478bd9Sstevel@tonic-gate 		 * and set locality aware flag in vnode
39307c478bd9Sstevel@tonic-gate 		 */
39317c478bd9Sstevel@tonic-gate 		mutex_enter(&vp->v_lock);
39327c478bd9Sstevel@tonic-gate 		if ((vp->v_flag & V_LOCALITY) == 0) {
39337c478bd9Sstevel@tonic-gate 			vp->v_locality = shm_locality;
39347c478bd9Sstevel@tonic-gate 			vp->v_flag |= V_LOCALITY;
39357c478bd9Sstevel@tonic-gate 		} else {
39367c478bd9Sstevel@tonic-gate 			/*
39377c478bd9Sstevel@tonic-gate 			 * Lost race so free locality info and increment count.
39387c478bd9Sstevel@tonic-gate 			 */
39397c478bd9Sstevel@tonic-gate 			rw_destroy(&shm_locality->loc_lock);
39407c478bd9Sstevel@tonic-gate 			kmem_free(shm_locality, sizeof (*shm_locality));
39417c478bd9Sstevel@tonic-gate 			shm_locality = vp->v_locality;
39427c478bd9Sstevel@tonic-gate 			shm_locality->loc_count++;
39437c478bd9Sstevel@tonic-gate 		}
39447c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
39457c478bd9Sstevel@tonic-gate 
39467c478bd9Sstevel@tonic-gate 		return;
39477c478bd9Sstevel@tonic-gate 	}
39487c478bd9Sstevel@tonic-gate 
39497c478bd9Sstevel@tonic-gate 	/*
39507c478bd9Sstevel@tonic-gate 	 * Increment reference count of number of segments mapping this vnode
39517c478bd9Sstevel@tonic-gate 	 * shared
39527c478bd9Sstevel@tonic-gate 	 */
39537c478bd9Sstevel@tonic-gate 	shm_locality = vp->v_locality;
39547c478bd9Sstevel@tonic-gate 	shm_locality->loc_count++;
39557c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
39567c478bd9Sstevel@tonic-gate }
39577c478bd9Sstevel@tonic-gate 
39587c478bd9Sstevel@tonic-gate /*
39597c478bd9Sstevel@tonic-gate  * Destroy the given shared memory policy segment tree
39607c478bd9Sstevel@tonic-gate  */
39617c478bd9Sstevel@tonic-gate void
39627c478bd9Sstevel@tonic-gate lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
39637c478bd9Sstevel@tonic-gate {
39647c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*cur;
39657c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*next;
39667c478bd9Sstevel@tonic-gate 
39677c478bd9Sstevel@tonic-gate 	if (tree == NULL)
39687c478bd9Sstevel@tonic-gate 		return;
39697c478bd9Sstevel@tonic-gate 
39707c478bd9Sstevel@tonic-gate 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
39717c478bd9Sstevel@tonic-gate 	while (cur != NULL) {
39727c478bd9Sstevel@tonic-gate 		next = AVL_NEXT(tree, cur);
39737c478bd9Sstevel@tonic-gate 		avl_remove(tree, cur);
39747c478bd9Sstevel@tonic-gate 		kmem_free(cur, sizeof (*cur));
39757c478bd9Sstevel@tonic-gate 		cur = next;
39767c478bd9Sstevel@tonic-gate 	}
39777c478bd9Sstevel@tonic-gate 	kmem_free(tree, sizeof (avl_tree_t));
39787c478bd9Sstevel@tonic-gate }
39797c478bd9Sstevel@tonic-gate 
39807c478bd9Sstevel@tonic-gate /*
39817c478bd9Sstevel@tonic-gate  * Uninitialize lgroup shared memory allocation policy support
39827c478bd9Sstevel@tonic-gate  */
39837c478bd9Sstevel@tonic-gate void
39847c478bd9Sstevel@tonic-gate lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
39857c478bd9Sstevel@tonic-gate {
39867c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
39877c478bd9Sstevel@tonic-gate 
39887c478bd9Sstevel@tonic-gate 	/*
39897c478bd9Sstevel@tonic-gate 	 * For anon_map, deallocate shared memory policy tree and
39907c478bd9Sstevel@tonic-gate 	 * zero locality field
39917c478bd9Sstevel@tonic-gate 	 * Don't need any locks because anon_map is being freed
39927c478bd9Sstevel@tonic-gate 	 */
39937c478bd9Sstevel@tonic-gate 	if (amp) {
39947c478bd9Sstevel@tonic-gate 		if (amp->locality == NULL)
39957c478bd9Sstevel@tonic-gate 			return;
39967c478bd9Sstevel@tonic-gate 		shm_locality = amp->locality;
39977c478bd9Sstevel@tonic-gate 		shm_locality->loc_count = 0;	/* not really used for amp */
39987c478bd9Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
39997c478bd9Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
40007c478bd9Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
40017c478bd9Sstevel@tonic-gate 		amp->locality = 0;
40027c478bd9Sstevel@tonic-gate 		return;
40037c478bd9Sstevel@tonic-gate 	}
40047c478bd9Sstevel@tonic-gate 
40057c478bd9Sstevel@tonic-gate 	/*
40067c478bd9Sstevel@tonic-gate 	 * For vnode, decrement reference count of segments mapping this vnode
40077c478bd9Sstevel@tonic-gate 	 * shared and delete locality info if reference count drops to 0
40087c478bd9Sstevel@tonic-gate 	 */
40097c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
40107c478bd9Sstevel@tonic-gate 	shm_locality = vp->v_locality;
40117c478bd9Sstevel@tonic-gate 	shm_locality->loc_count--;
40127c478bd9Sstevel@tonic-gate 
40137c478bd9Sstevel@tonic-gate 	if (shm_locality->loc_count == 0) {
40147c478bd9Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
40157c478bd9Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
40167c478bd9Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
40177c478bd9Sstevel@tonic-gate 		vp->v_locality = 0;
40187c478bd9Sstevel@tonic-gate 		vp->v_flag &= ~V_LOCALITY;
40197c478bd9Sstevel@tonic-gate 	}
40207c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
40217c478bd9Sstevel@tonic-gate }
40227c478bd9Sstevel@tonic-gate 
40237c478bd9Sstevel@tonic-gate /*
40247c478bd9Sstevel@tonic-gate  * Compare two shared memory policy segments
40257c478bd9Sstevel@tonic-gate  * Used by AVL tree code for searching
40267c478bd9Sstevel@tonic-gate  */
40277c478bd9Sstevel@tonic-gate int
40287c478bd9Sstevel@tonic-gate lgrp_shm_policy_compar(const void *x, const void *y)
40297c478bd9Sstevel@tonic-gate {
40307c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
40317c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
40327c478bd9Sstevel@tonic-gate 
40337c478bd9Sstevel@tonic-gate 	if (a->shm_off < b->shm_off)
40347c478bd9Sstevel@tonic-gate 		return (-1);
40357c478bd9Sstevel@tonic-gate 	if (a->shm_off >= b->shm_off + b->shm_size)
40367c478bd9Sstevel@tonic-gate 		return (1);
40377c478bd9Sstevel@tonic-gate 	return (0);
40387c478bd9Sstevel@tonic-gate }
40397c478bd9Sstevel@tonic-gate 
40407c478bd9Sstevel@tonic-gate /*
40417c478bd9Sstevel@tonic-gate  * Concatenate seg1 with seg2 and remove seg2
40427c478bd9Sstevel@tonic-gate  */
40437c478bd9Sstevel@tonic-gate static int
40447c478bd9Sstevel@tonic-gate lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
40457c478bd9Sstevel@tonic-gate     lgrp_shm_policy_seg_t *seg2)
40467c478bd9Sstevel@tonic-gate {
40477c478bd9Sstevel@tonic-gate 	if (!seg1 || !seg2 ||
40487c478bd9Sstevel@tonic-gate 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
40497c478bd9Sstevel@tonic-gate 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
40507c478bd9Sstevel@tonic-gate 		return (-1);
40517c478bd9Sstevel@tonic-gate 
40527c478bd9Sstevel@tonic-gate 	seg1->shm_size += seg2->shm_size;
40537c478bd9Sstevel@tonic-gate 	avl_remove(tree, seg2);
40547c478bd9Sstevel@tonic-gate 	kmem_free(seg2, sizeof (*seg2));
40557c478bd9Sstevel@tonic-gate 	return (0);
40567c478bd9Sstevel@tonic-gate }
40577c478bd9Sstevel@tonic-gate 
/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 *
 * If "off" coincides with either end of "seg", no split is needed and
 * the original segment is returned.  Otherwise "seg" is truncated to
 * [shm_off, off) and a newly allocated segment covering [off, old end)
 * with the same policy is inserted and returned.
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	/*
	 * Re-check the preconditions for non-DEBUG kernels, where the
	 * ASSERTs above compile away.
	 */
	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	/*
	 * Splitting at either boundary is a no-op; the whole segment is
	 * the rightmost piece.
	 */
	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 *
	 * NOTE(review): &off is passed as the search key, which relies on
	 * the comparator reading only shm_off at the start of the key --
	 * confirm shm_off is the first member of lgrp_shm_policy_seg_t.
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
40967c478bd9Sstevel@tonic-gate 
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * The policy is recorded in per-object (anon_map or vnode) locality info
 * as a tree of non-overlapping policy segments, splitting, creating, and
 * re-merging segments as needed so that adjacent segments with the same
 * policy remain coalesced.
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 *
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 * Drop the lock across the KM_SLEEP allocation, then recheck in
	 * case another thread created the tree in the meantime.
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 *
	 * Each loop iteration consumes a prefix of [off, off + len),
	 * either by creating a new segment in a gap or by updating
	 * (and possibly splitting) the existing segment that covers
	 * "off", until the whole range has been processed.
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now.  Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev =  AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		/* End of requested range and end of existing segment */
		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev =  AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
43507c478bd9Sstevel@tonic-gate 
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * Repeated calls with the same cookie walk through the candidate
 * memnodes without repeating any, tracking already-returned nodes in
 * the cookie's lmc_tried set.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 * With LGRP_SRCH_LOCAL scope we never look past the current lgroup.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		/* Exclude memnodes already handed out via this cookie */
		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		/* Scan the set, counting down "offset" set bits */
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
4417