xref: /titanic_51/usr/src/uts/common/os/lgrp.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*7c478bd9Sstevel@tonic-gate 
29*7c478bd9Sstevel@tonic-gate /*
30*7c478bd9Sstevel@tonic-gate  * Basic NUMA support in terms of locality groups
31*7c478bd9Sstevel@tonic-gate  *
32*7c478bd9Sstevel@tonic-gate  * Solaris needs to know which CPUs, memory, etc. are near each other to
33*7c478bd9Sstevel@tonic-gate  * provide good performance on NUMA machines by optimizing for locality.
34*7c478bd9Sstevel@tonic-gate  * In order to do this, a new abstraction called a "locality group (lgroup)"
35*7c478bd9Sstevel@tonic-gate  * has been introduced to keep track of which CPU-like and memory-like hardware
36*7c478bd9Sstevel@tonic-gate  * resources are close to each other.  Currently, latency is the only measure
37*7c478bd9Sstevel@tonic-gate  * used to determine how to group hardware resources into lgroups, but this
38*7c478bd9Sstevel@tonic-gate  * does not limit the groupings to be based solely on latency.  Other factors
39*7c478bd9Sstevel@tonic-gate  * may be used to determine the groupings in the future.
40*7c478bd9Sstevel@tonic-gate  *
41*7c478bd9Sstevel@tonic-gate  * Lgroups are organized into a hierarchy or topology that represents the
42*7c478bd9Sstevel@tonic-gate  * latency topology of the machine.  There is always at least a root lgroup in
43*7c478bd9Sstevel@tonic-gate  * the system.  It represents all the hardware resources in the machine at a
44*7c478bd9Sstevel@tonic-gate  * latency big enough that any hardware resource can at least access any other
45*7c478bd9Sstevel@tonic-gate  * hardware resource within that latency.  A Uniform Memory Access (UMA)
46*7c478bd9Sstevel@tonic-gate  * machine is represented with one lgroup (the root).  In contrast, a NUMA
47*7c478bd9Sstevel@tonic-gate  * machine is represented at least by the root lgroup and some number of leaf
48*7c478bd9Sstevel@tonic-gate  * lgroups where the leaf lgroups contain the hardware resources within the
49*7c478bd9Sstevel@tonic-gate  * least latency of each other and the root lgroup still contains all the
50*7c478bd9Sstevel@tonic-gate  * resources in the machine.  Some number of intermediate lgroups may exist
51*7c478bd9Sstevel@tonic-gate  * which represent more levels of locality than just the local latency of the
52*7c478bd9Sstevel@tonic-gate  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
53*7c478bd9Sstevel@tonic-gate  * (eg. root and intermediate lgroups) contain the next nearest resources to
54*7c478bd9Sstevel@tonic-gate  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
55*7c478bd9Sstevel@tonic-gate  * to the root lgroup shows the hardware resources from closest to farthest
56*7c478bd9Sstevel@tonic-gate  * from the leaf lgroup such that each successive ancestor lgroup contains
57*7c478bd9Sstevel@tonic-gate  * the next nearest resources at the next level of locality from the previous.
58*7c478bd9Sstevel@tonic-gate  *
59*7c478bd9Sstevel@tonic-gate  * The kernel uses the lgroup abstraction to know how to allocate resources
60*7c478bd9Sstevel@tonic-gate  * near a given process/thread.  At fork() and lwp/thread_create() time, a
61*7c478bd9Sstevel@tonic-gate  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
62*7c478bd9Sstevel@tonic-gate  * with the lowest load average.  Binding to a processor or processor set will
63*7c478bd9Sstevel@tonic-gate  * change the home lgroup for a thread.  The scheduler has been modified to try
64*7c478bd9Sstevel@tonic-gate  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
65*7c478bd9Sstevel@tonic-gate  * allocation is lgroup aware too, so memory will be allocated from the current
66*7c478bd9Sstevel@tonic-gate  * thread's home lgroup if possible.  If the desired resources are not
67*7c478bd9Sstevel@tonic-gate  * available, the kernel traverses the lgroup hierarchy going to the parent
68*7c478bd9Sstevel@tonic-gate  * lgroup to find resources at the next level of locality until it reaches the
69*7c478bd9Sstevel@tonic-gate  * root lgroup.
70*7c478bd9Sstevel@tonic-gate  */
71*7c478bd9Sstevel@tonic-gate 
72*7c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
73*7c478bd9Sstevel@tonic-gate #include <sys/lgrp_user.h>
74*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
75*7c478bd9Sstevel@tonic-gate #include <sys/mman.h>
76*7c478bd9Sstevel@tonic-gate #include <sys/param.h>
77*7c478bd9Sstevel@tonic-gate #include <sys/var.h>
78*7c478bd9Sstevel@tonic-gate #include <sys/thread.h>
79*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
80*7c478bd9Sstevel@tonic-gate #include <sys/cpupart.h>
81*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
82*7c478bd9Sstevel@tonic-gate #include <vm/seg.h>
83*7c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
84*7c478bd9Sstevel@tonic-gate #include <vm/seg_spt.h>
85*7c478bd9Sstevel@tonic-gate #include <vm/seg_vn.h>
86*7c478bd9Sstevel@tonic-gate #include <vm/as.h>
87*7c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
88*7c478bd9Sstevel@tonic-gate #include <sys/systm.h>
89*7c478bd9Sstevel@tonic-gate #include <sys/errno.h>
90*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
91*7c478bd9Sstevel@tonic-gate #include <sys/kstat.h>
92*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
93*7c478bd9Sstevel@tonic-gate #include <sys/chip.h>
94*7c478bd9Sstevel@tonic-gate #include <sys/promif.h>
95*7c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
96*7c478bd9Sstevel@tonic-gate 
/*
 * Global lgroup bookkeeping: hierarchy generation count, the table of all
 * lgroups, and allocation state.
 */
lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

/* forward declarations for the kstat support routines defined below */
static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

static lgrp_t	lroot;		/* static storage for the root lgroup */
160*7c478bd9Sstevel@tonic-gate 
/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

static void	lgrp_latency_change(u_longlong_t, u_longlong_t);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 * (0 means the topology is correct; negative values identify the
 * specific inconsistency detected)
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		-1
#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
#define	LPL_TOPO_LGRP_MISMATCH			-3
#define	LPL_TOPO_MISSING_PARENT			-4
#define	LPL_TOPO_PARENT_MISMATCH		-5
#define	LPL_TOPO_BAD_CPUCNT			-6
#define	LPL_TOPO_RSET_MISMATCH			-7
#define	LPL_TOPO_LPL_ORPHANED			-8
#define	LPL_TOPO_LPL_BAD_NCPU			-9
#define	LPL_TOPO_RSET_MSSNG_LF			-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
#define	LPL_TOPO_BOGUS_HINT			-12
#define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
#define	LPL_TOPO_LGRP_NOT_LEAF			-14
#define	LPL_TOPO_BAD_RSETCNT			-15
251*7c478bd9Sstevel@tonic-gate 
252*7c478bd9Sstevel@tonic-gate /*
253*7c478bd9Sstevel@tonic-gate  * Return whether lgroup optimizations should be enabled on this system
254*7c478bd9Sstevel@tonic-gate  */
255*7c478bd9Sstevel@tonic-gate int
256*7c478bd9Sstevel@tonic-gate lgrp_optimizations(void)
257*7c478bd9Sstevel@tonic-gate {
258*7c478bd9Sstevel@tonic-gate 	/*
259*7c478bd9Sstevel@tonic-gate 	 * System must have more than 2 lgroups to enable lgroup optimizations
260*7c478bd9Sstevel@tonic-gate 	 *
261*7c478bd9Sstevel@tonic-gate 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
262*7c478bd9Sstevel@tonic-gate 	 * with one child lgroup containing all the resources. A 2 lgroup
263*7c478bd9Sstevel@tonic-gate 	 * system with a root lgroup directly containing CPUs or memory might
264*7c478bd9Sstevel@tonic-gate 	 * need lgroup optimizations with its child lgroup, but there
265*7c478bd9Sstevel@tonic-gate 	 * isn't such a machine for now....
266*7c478bd9Sstevel@tonic-gate 	 */
267*7c478bd9Sstevel@tonic-gate 	if (nlgrps > 2)
268*7c478bd9Sstevel@tonic-gate 		return (1);
269*7c478bd9Sstevel@tonic-gate 
270*7c478bd9Sstevel@tonic-gate 	return (0);
271*7c478bd9Sstevel@tonic-gate }
272*7c478bd9Sstevel@tonic-gate 
/*
 * Create and initialize the root lgroup, which anchors the lgroup
 * hierarchy, and set up the bootstrap lpl list used until cp_default
 * is fully constructed.
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	/* The root starts out with no CPUs and no memory nodes. */
	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_chips = NULL;
	lgrp_root->lgrp_chipcnt = 0;
	/* root-to-root latency from the platform is the system-wide latency */
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized, at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	/*
	 * Second bootstrap lpl slot is reserved for the first leaf lgroup
	 * (id 1) that configuring CPU0 may create.
	 */
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}
326*7c478bd9Sstevel@tonic-gate 
/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform first; it supplies the per-platform
	 * values queried below.
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}
346*7c478bd9Sstevel@tonic-gate 
/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup by replaying the add and online
	 * configuration events for it.
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}
364*7c478bd9Sstevel@tonic-gate 
/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized. The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * True when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;
384*7c478bd9Sstevel@tonic-gate 
/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	/*
	 * Enforce a valid lgrp_mem_default_policy (it is tunable via
	 * /etc/system, so it may hold an out-of-range value here).
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		/* Remove cpu0 from its current lgroup and partition ... */
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		/* ... and re-add it, which now homes it in the root. */
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/* Tear down every lgroup except the root. */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}
		/* All memory now belongs to the root lgroup. */
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}
447*7c478bd9Sstevel@tonic-gate 
448*7c478bd9Sstevel@tonic-gate /*
449*7c478bd9Sstevel@tonic-gate  * Finish lgrp initialization after all CPUS are brought on-line.
450*7c478bd9Sstevel@tonic-gate  * This routine is called after start_other_cpus().
451*7c478bd9Sstevel@tonic-gate  */
452*7c478bd9Sstevel@tonic-gate void
453*7c478bd9Sstevel@tonic-gate lgrp_main_mp_init(void)
454*7c478bd9Sstevel@tonic-gate {
455*7c478bd9Sstevel@tonic-gate 	klgrpset_t changed;
456*7c478bd9Sstevel@tonic-gate 
457*7c478bd9Sstevel@tonic-gate 	/*
458*7c478bd9Sstevel@tonic-gate 	 * Update lgroup topology (if necessary)
459*7c478bd9Sstevel@tonic-gate 	 */
460*7c478bd9Sstevel@tonic-gate 	klgrpset_clear(changed);
461*7c478bd9Sstevel@tonic-gate 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
462*7c478bd9Sstevel@tonic-gate 	lgrp_topo_initialized = 1;
463*7c478bd9Sstevel@tonic-gate }
464*7c478bd9Sstevel@tonic-gate 
465*7c478bd9Sstevel@tonic-gate /*
466*7c478bd9Sstevel@tonic-gate  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
467*7c478bd9Sstevel@tonic-gate  */
468*7c478bd9Sstevel@tonic-gate void
469*7c478bd9Sstevel@tonic-gate lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
470*7c478bd9Sstevel@tonic-gate {
471*7c478bd9Sstevel@tonic-gate 	klgrpset_t	changed;
472*7c478bd9Sstevel@tonic-gate 	cpu_t		*cp;
473*7c478bd9Sstevel@tonic-gate 	lgrp_id_t	id;
474*7c478bd9Sstevel@tonic-gate 	int		rc;
475*7c478bd9Sstevel@tonic-gate 
476*7c478bd9Sstevel@tonic-gate 	switch (event) {
477*7c478bd9Sstevel@tonic-gate 	/*
478*7c478bd9Sstevel@tonic-gate 	 * The following (re)configuration events are common code
479*7c478bd9Sstevel@tonic-gate 	 * initiated. lgrp_plat_config() is called here to inform the
480*7c478bd9Sstevel@tonic-gate 	 * platform of the reconfiguration event.
481*7c478bd9Sstevel@tonic-gate 	 */
482*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPU_ADD:
483*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
484*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
485*7c478bd9Sstevel@tonic-gate 
486*7c478bd9Sstevel@tonic-gate 		break;
487*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPU_DEL:
488*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
489*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
490*7c478bd9Sstevel@tonic-gate 
491*7c478bd9Sstevel@tonic-gate 		break;
492*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPU_ONLINE:
493*7c478bd9Sstevel@tonic-gate 		cp = (cpu_t *)resource;
494*7c478bd9Sstevel@tonic-gate 		lgrp_cpu_init(cp);
495*7c478bd9Sstevel@tonic-gate 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
496*7c478bd9Sstevel@tonic-gate 		rc = lpl_topo_verify(cp->cpu_part);
497*7c478bd9Sstevel@tonic-gate 		if (rc != LPL_TOPO_CORRECT) {
498*7c478bd9Sstevel@tonic-gate 			panic("lpl_topo_verify failed: %d", rc);
499*7c478bd9Sstevel@tonic-gate 		}
500*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
501*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
502*7c478bd9Sstevel@tonic-gate 
503*7c478bd9Sstevel@tonic-gate 		break;
504*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPU_OFFLINE:
505*7c478bd9Sstevel@tonic-gate 		cp = (cpu_t *)resource;
506*7c478bd9Sstevel@tonic-gate 		id = cp->cpu_lpl->lpl_lgrpid;
507*7c478bd9Sstevel@tonic-gate 		lgrp_part_del_cpu(cp);
508*7c478bd9Sstevel@tonic-gate 		lgrp_cpu_fini(cp, id);
509*7c478bd9Sstevel@tonic-gate 		rc = lpl_topo_verify(cp->cpu_part);
510*7c478bd9Sstevel@tonic-gate 		if (rc != LPL_TOPO_CORRECT) {
511*7c478bd9Sstevel@tonic-gate 			panic("lpl_topo_verify failed: %d", rc);
512*7c478bd9Sstevel@tonic-gate 		}
513*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
514*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
515*7c478bd9Sstevel@tonic-gate 
516*7c478bd9Sstevel@tonic-gate 		break;
517*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPUPART_ADD:
518*7c478bd9Sstevel@tonic-gate 		cp = (cpu_t *)resource;
519*7c478bd9Sstevel@tonic-gate 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
520*7c478bd9Sstevel@tonic-gate 		rc = lpl_topo_verify(cp->cpu_part);
521*7c478bd9Sstevel@tonic-gate 		if (rc != LPL_TOPO_CORRECT) {
522*7c478bd9Sstevel@tonic-gate 			panic("lpl_topo_verify failed: %d", rc);
523*7c478bd9Sstevel@tonic-gate 		}
524*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
525*7c478bd9Sstevel@tonic-gate 
526*7c478bd9Sstevel@tonic-gate 		break;
527*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_CPUPART_DEL:
528*7c478bd9Sstevel@tonic-gate 		cp = (cpu_t *)resource;
529*7c478bd9Sstevel@tonic-gate 		lgrp_part_del_cpu((cpu_t *)resource);
530*7c478bd9Sstevel@tonic-gate 		rc = lpl_topo_verify(cp->cpu_part);
531*7c478bd9Sstevel@tonic-gate 		if (rc != LPL_TOPO_CORRECT) {
532*7c478bd9Sstevel@tonic-gate 			panic("lpl_topo_verify failed: %d", rc);
533*7c478bd9Sstevel@tonic-gate 		}
534*7c478bd9Sstevel@tonic-gate 		lgrp_plat_config(event, resource);
535*7c478bd9Sstevel@tonic-gate 
536*7c478bd9Sstevel@tonic-gate 		break;
537*7c478bd9Sstevel@tonic-gate 	/*
538*7c478bd9Sstevel@tonic-gate 	 * The following events are initiated by the memnode
539*7c478bd9Sstevel@tonic-gate 	 * subsystem.
540*7c478bd9Sstevel@tonic-gate 	 */
541*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_MEM_ADD:
542*7c478bd9Sstevel@tonic-gate 		lgrp_mem_init((int)resource, where, B_FALSE);
543*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
544*7c478bd9Sstevel@tonic-gate 
545*7c478bd9Sstevel@tonic-gate 		break;
546*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_MEM_DEL:
547*7c478bd9Sstevel@tonic-gate 		lgrp_mem_fini((int)resource, where, B_FALSE);
548*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
549*7c478bd9Sstevel@tonic-gate 
550*7c478bd9Sstevel@tonic-gate 		break;
551*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_MEM_RENAME: {
552*7c478bd9Sstevel@tonic-gate 		lgrp_config_mem_rename_t *ren_arg =
553*7c478bd9Sstevel@tonic-gate 		    (lgrp_config_mem_rename_t *)where;
554*7c478bd9Sstevel@tonic-gate 
555*7c478bd9Sstevel@tonic-gate 		lgrp_mem_rename((int)resource,
556*7c478bd9Sstevel@tonic-gate 		    ren_arg->lmem_rename_from,
557*7c478bd9Sstevel@tonic-gate 		    ren_arg->lmem_rename_to);
558*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
559*7c478bd9Sstevel@tonic-gate 
560*7c478bd9Sstevel@tonic-gate 		break;
561*7c478bd9Sstevel@tonic-gate 	}
562*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_GEN_UPDATE:
563*7c478bd9Sstevel@tonic-gate 		atomic_add_32(&lgrp_gen, 1);
564*7c478bd9Sstevel@tonic-gate 
565*7c478bd9Sstevel@tonic-gate 		break;
566*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_FLATTEN:
567*7c478bd9Sstevel@tonic-gate 		if (where == 0)
568*7c478bd9Sstevel@tonic-gate 			lgrp_topo_levels = (int)resource;
569*7c478bd9Sstevel@tonic-gate 		else
570*7c478bd9Sstevel@tonic-gate 			(void) lgrp_topo_flatten(resource,
571*7c478bd9Sstevel@tonic-gate 			    lgrp_table, lgrp_alloc_max, &changed);
572*7c478bd9Sstevel@tonic-gate 
573*7c478bd9Sstevel@tonic-gate 		break;
574*7c478bd9Sstevel@tonic-gate 	/*
575*7c478bd9Sstevel@tonic-gate 	 * Initiated by platform latency probing code
576*7c478bd9Sstevel@tonic-gate 	 */
577*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_LATENCY_CHANGE:
578*7c478bd9Sstevel@tonic-gate 		lgrp_latency_change((u_longlong_t)resource,
579*7c478bd9Sstevel@tonic-gate 		    (u_longlong_t)where);
580*7c478bd9Sstevel@tonic-gate 
581*7c478bd9Sstevel@tonic-gate 		break;
582*7c478bd9Sstevel@tonic-gate 	case LGRP_CONFIG_NOP:
583*7c478bd9Sstevel@tonic-gate 
584*7c478bd9Sstevel@tonic-gate 		break;
585*7c478bd9Sstevel@tonic-gate 	default:
586*7c478bd9Sstevel@tonic-gate 		break;
587*7c478bd9Sstevel@tonic-gate 	}
588*7c478bd9Sstevel@tonic-gate 
589*7c478bd9Sstevel@tonic-gate }
590*7c478bd9Sstevel@tonic-gate 
/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 *
 * Finds (or creates) the leaf lgroup for cp's platform locality handle,
 * makes sure the CPU resource sets of that lgroup and its ancestors
 * include it, points cp->cpu_lpl at the matching load entry in its
 * partition, and links the CPU (and, if needed, its chip) onto the
 * lgroup's circular lists.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
	klgrpset_t	changed;
	int		count;
	lgrp_handle_t	hand;
	int		first_cpu;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	struct cpu	*cptr;
	struct chip	*chp;

	/*
	 * This is the first time through if the resource set
	 * for the root lgroup is empty. After cpu0 has been
	 * initially added to an lgroup, the root's CPU resource
	 * set can never be empty, since the system's last CPU
	 * cannot be offlined.
	 */
	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
		/*
		 * First time through.
		 */
		first_cpu = 1;
	} else {
		/*
		 * If cpu0 needs to move lgroups, we may come
		 * through here again, at which time cpu_lock won't
		 * be held, and lgrp_initialized will be false.
		 */
		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
		ASSERT(cp->cpu_part != NULL);
		first_cpu = 0;
	}

	/*
	 * Map the CPU to its platform locality handle and look for an
	 * existing leaf lgroup with that handle.
	 */
	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	my_lgrp = lgrp_hand_to_lgrp(hand);

	if (my_lgrp == NULL) {
		/*
		 * Create new lgrp and add it to lgroup topology
		 */
		my_lgrp = lgrp_create();
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

		count = 0;
		klgrpset_clear(changed);
		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then.  So, set latency for it and fill in rest of lgroup
		 * topology  now that we know how far it is from other leaf
		 * lgroups.
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_clear(changed);
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);

		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    my_lgrp->lgrp_id)) {
		int	i;

		/*
		 * Update existing lgroup and lgroups containing it with CPU
		 * resource
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
	}

	lgrpid = my_lgrp->lgrp_id;
	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

	/*
	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
	 * not since none of lgroup IDs in the lpl's have been set yet.
	 */
	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
		cp->cpu_lpl->lpl_lgrpid = lgrpid;

	/*
	 * link the CPU into the lgrp's CPU list
	 * (circular doubly-linked list through cpu_next_lgrp/cpu_prev_lgrp)
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		my_lgrp->lgrp_cpu = cp;
		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
	} else {
		/* insert cp just before the current list head */
		cptr = my_lgrp->lgrp_cpu;
		cp->cpu_next_lgrp = cptr;
		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
		cptr->cpu_prev_lgrp = cp;
	}
	my_lgrp->lgrp_cpucnt++;

	/*
	 * Add this cpu's chip to the per lgroup list
	 * if necessary (i.e. this is the first of the chip's CPUs
	 * to be associated with an lgroup)
	 */
	if (cp->cpu_chip->chip_lgrp == NULL) {
		struct chip *lcpr;

		chp = cp->cpu_chip;

		if (my_lgrp->lgrp_chipcnt == 0) {
			my_lgrp->lgrp_chips = chp;
			chp->chip_next_lgrp =
			    chp->chip_prev_lgrp = chp;
		} else {
			/* insert chp just before the chip list head */
			lcpr = my_lgrp->lgrp_chips;
			chp->chip_next_lgrp = lcpr;
			chp->chip_prev_lgrp =
			    lcpr->chip_prev_lgrp;
			lcpr->chip_prev_lgrp->chip_next_lgrp =
			    chp;
			lcpr->chip_prev_lgrp = chp;
		}
		chp->chip_lgrp = my_lgrp;
		chp->chip_balance = chp->chip_next_lgrp;
		my_lgrp->lgrp_chipcnt++;
	}
}
749*7c478bd9Sstevel@tonic-gate 
/*
 * Allocate a new lgroup, reusing an open slot in the lgroup table (and
 * recycling any unused lgrp_t left in that slot) when one is available.
 * Returns the freshly initialized lgroup; panics if the platform cannot
 * supply another one.
 */
lgrp_t *
lgrp_create(void)
{
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	int		i;

	/*
	 * Caller must hold cpu_lock unless this is the boot CPU's early
	 * pass, before lgrp_initialized is set.
	 */
	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/*
	 * Find an open slot in the lgroup table and recycle unused lgroup
	 * left there if any
	 */
	my_lgrp = NULL;
	if (lgrp_alloc_hint == -1)
		/*
		 * Allocate from end when hint not set yet because no lgroups
		 * have been deleted yet
		 */
		lgrpid = nlgrps++;
	else {
		/*
		 * Start looking for next open slot from hint and leave hint
		 * at slot allocated
		 *
		 * NOTE(review): if every slot from the hint to nlgrpsmax - 1
		 * is occupied, this loop leaves lgrpid unset; it appears to
		 * rely on lgrp_destroy() always pulling the hint down to a
		 * freed slot so a free slot exists at or after it — confirm
		 * that invariant holds.
		 */
		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
			my_lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(my_lgrp)) {
				lgrpid = i;
				nlgrps++;
				break;
			}
		}
		lgrp_alloc_hint = lgrpid;
	}

	/*
	 * Keep track of max lgroup ID allocated so far to cut down on searches
	 */
	if (lgrpid > lgrp_alloc_max)
		lgrp_alloc_max = lgrpid;

	/*
	 * Need to allocate new lgroup if next open slot didn't have one
	 * for recycling
	 */
	if (my_lgrp == NULL)
		my_lgrp = lgrp_plat_alloc(lgrpid);

	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
		panic("Too many lgrps for platform (%d)", nlgrps);

	/*
	 * Reset the (possibly recycled) lgroup to a pristine state: no
	 * parent/children, no memnodes, no CPUs or chips, empty rsets.
	 */
	my_lgrp->lgrp_id = lgrpid;
	my_lgrp->lgrp_latency = 0;
	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	my_lgrp->lgrp_parent = NULL;
	my_lgrp->lgrp_childcnt = 0;
	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
	my_lgrp->lgrp_nmnodes = 0;
	klgrpset_clear(my_lgrp->lgrp_children);
	klgrpset_clear(my_lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(my_lgrp->lgrp_set[i]);

	my_lgrp->lgrp_cpu = NULL;
	my_lgrp->lgrp_cpucnt = 0;
	my_lgrp->lgrp_chips = NULL;
	my_lgrp->lgrp_chipcnt = 0;

	/* a recycled lgroup may still have kstats; zero them out */
	if (my_lgrp->lgrp_kstat != NULL)
		lgrp_kstat_reset(lgrpid);

	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

	return (my_lgrp);
}
826*7c478bd9Sstevel@tonic-gate 
827*7c478bd9Sstevel@tonic-gate void
828*7c478bd9Sstevel@tonic-gate lgrp_destroy(lgrp_t *lgrp)
829*7c478bd9Sstevel@tonic-gate {
830*7c478bd9Sstevel@tonic-gate 	int		i;
831*7c478bd9Sstevel@tonic-gate 
832*7c478bd9Sstevel@tonic-gate 	/*
833*7c478bd9Sstevel@tonic-gate 	 * Unless this lgroup is being destroyed on behalf of
834*7c478bd9Sstevel@tonic-gate 	 * the boot CPU, cpu_lock must be held
835*7c478bd9Sstevel@tonic-gate 	 */
836*7c478bd9Sstevel@tonic-gate 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
837*7c478bd9Sstevel@tonic-gate 
838*7c478bd9Sstevel@tonic-gate 	if (nlgrps == 1)
839*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
840*7c478bd9Sstevel@tonic-gate 
841*7c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
842*7c478bd9Sstevel@tonic-gate 		return;
843*7c478bd9Sstevel@tonic-gate 
844*7c478bd9Sstevel@tonic-gate 	/*
845*7c478bd9Sstevel@tonic-gate 	 * Set hint to lgroup being deleted and try to keep lower numbered
846*7c478bd9Sstevel@tonic-gate 	 * hints to facilitate finding empty slots
847*7c478bd9Sstevel@tonic-gate 	 */
848*7c478bd9Sstevel@tonic-gate 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
849*7c478bd9Sstevel@tonic-gate 		lgrp_alloc_hint = lgrp->lgrp_id;
850*7c478bd9Sstevel@tonic-gate 
851*7c478bd9Sstevel@tonic-gate 	/*
852*7c478bd9Sstevel@tonic-gate 	 * Mark this lgroup to be recycled by setting its lgroup ID to
853*7c478bd9Sstevel@tonic-gate 	 * LGRP_NONE and clear relevant fields
854*7c478bd9Sstevel@tonic-gate 	 */
855*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_id = LGRP_NONE;
856*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_latency = 0;
857*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
858*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_parent = NULL;
859*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_childcnt = 0;
860*7c478bd9Sstevel@tonic-gate 
861*7c478bd9Sstevel@tonic-gate 	klgrpset_clear(lgrp->lgrp_children);
862*7c478bd9Sstevel@tonic-gate 	klgrpset_clear(lgrp->lgrp_leaves);
863*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
864*7c478bd9Sstevel@tonic-gate 		klgrpset_clear(lgrp->lgrp_set[i]);
865*7c478bd9Sstevel@tonic-gate 
866*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_mnodes = (mnodeset_t)0;
867*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_nmnodes = 0;
868*7c478bd9Sstevel@tonic-gate 
869*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_cpu = NULL;
870*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_cpucnt = 0;
871*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_chipcnt = 0;
872*7c478bd9Sstevel@tonic-gate 	lgrp->lgrp_chips = NULL;
873*7c478bd9Sstevel@tonic-gate 
874*7c478bd9Sstevel@tonic-gate 	nlgrps--;
875*7c478bd9Sstevel@tonic-gate }
876*7c478bd9Sstevel@tonic-gate 
877*7c478bd9Sstevel@tonic-gate /*
878*7c478bd9Sstevel@tonic-gate  * Initialize kstat data. Called from lgrp intialization code.
879*7c478bd9Sstevel@tonic-gate  */
880*7c478bd9Sstevel@tonic-gate static void
881*7c478bd9Sstevel@tonic-gate lgrp_kstat_init(void)
882*7c478bd9Sstevel@tonic-gate {
883*7c478bd9Sstevel@tonic-gate 	lgrp_stat_t	stat;
884*7c478bd9Sstevel@tonic-gate 
885*7c478bd9Sstevel@tonic-gate 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
886*7c478bd9Sstevel@tonic-gate 
887*7c478bd9Sstevel@tonic-gate 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
888*7c478bd9Sstevel@tonic-gate 		kstat_named_init(&lgrp_kstat_data[stat],
889*7c478bd9Sstevel@tonic-gate 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
890*7c478bd9Sstevel@tonic-gate }
891*7c478bd9Sstevel@tonic-gate 
892*7c478bd9Sstevel@tonic-gate /*
893*7c478bd9Sstevel@tonic-gate  * initialize an lgrp's kstats if needed
894*7c478bd9Sstevel@tonic-gate  * called with cpu_lock held but not with cpus paused.
895*7c478bd9Sstevel@tonic-gate  * we don't tear these down now because we don't know about
896*7c478bd9Sstevel@tonic-gate  * memory leaving the lgrp yet...
897*7c478bd9Sstevel@tonic-gate  */
898*7c478bd9Sstevel@tonic-gate 
899*7c478bd9Sstevel@tonic-gate void
900*7c478bd9Sstevel@tonic-gate lgrp_kstat_create(cpu_t *cp)
901*7c478bd9Sstevel@tonic-gate {
902*7c478bd9Sstevel@tonic-gate 	kstat_t		*lgrp_kstat;
903*7c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
904*7c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
905*7c478bd9Sstevel@tonic-gate 
906*7c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
907*7c478bd9Sstevel@tonic-gate 
908*7c478bd9Sstevel@tonic-gate 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
909*7c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_table[lgrpid];
910*7c478bd9Sstevel@tonic-gate 
911*7c478bd9Sstevel@tonic-gate 	if (my_lgrp->lgrp_kstat != NULL)
912*7c478bd9Sstevel@tonic-gate 		return; /* already initialized */
913*7c478bd9Sstevel@tonic-gate 
914*7c478bd9Sstevel@tonic-gate 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
915*7c478bd9Sstevel@tonic-gate 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
916*7c478bd9Sstevel@tonic-gate 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
917*7c478bd9Sstevel@tonic-gate 
918*7c478bd9Sstevel@tonic-gate 	if (lgrp_kstat != NULL) {
919*7c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
920*7c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_private = my_lgrp;
921*7c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_data = &lgrp_kstat_data;
922*7c478bd9Sstevel@tonic-gate 		lgrp_kstat->ks_update = lgrp_kstat_extract;
923*7c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_kstat = lgrp_kstat;
924*7c478bd9Sstevel@tonic-gate 		kstat_install(lgrp_kstat);
925*7c478bd9Sstevel@tonic-gate 	}
926*7c478bd9Sstevel@tonic-gate }
927*7c478bd9Sstevel@tonic-gate 
/*
 * Tear down an lgroup's kstats.  Currently a no-op beyond the locking
 * assertion: unused lgroups are never removed yet, so this is a
 * placeholder that will do something when lgroup removal is implemented.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}
938*7c478bd9Sstevel@tonic-gate 
/*
 * Called when a CPU is off-lined.
 *
 * Unlinks cp from the circular CPU list of lgroup "lgrpid", removes the
 * CPU's chip from the lgroup's chip list when appropriate, and — when
 * this was the lgroup's last CPU — removes the lgroup from the CPU
 * resource sets of the topology, deleting the leaf entirely if it has
 * no resources left.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;
	chip_t  *chp;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	/* splice cp out of the lgroup's circular CPU list */
	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 * (clear cp's stale list links so a use-after-removal is obvious)
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * If the last CPU on its chip is being offlined
	 * then remove this chip from the per lgroup list.
	 *
	 * This is also done for the boot CPU when it needs
	 * to move between lgroups as a consequence of
	 * null proc lpa.
	 */
	chp = cp->cpu_chip;
	if (chp->chip_ncpu == 0 || !lgrp_initialized) {

		chip_t	*chpp;

		/* drop the chip count; repoint the list head if needed */
		if (--my_lgrp->lgrp_chipcnt == 0)
			my_lgrp->lgrp_chips = NULL;
		else if (my_lgrp->lgrp_chips == chp)
			my_lgrp->lgrp_chips = chp->chip_next_lgrp;

		/*
		 * Walk this lgroup's chip list looking for chips that
		 * may try to balance against the one that's leaving
		 */
		for (chpp = chp->chip_next_lgrp; chpp != chp;
		    chpp = chpp->chip_next_lgrp) {
			if (chpp->chip_balance == chp)
				chpp->chip_balance = chp->chip_next_lgrp;
		}

		/* splice the chip out of the circular list and detach it */
		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;

		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
		chp->chip_lgrp = NULL;
		chp->chip_balance = NULL;
	}

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	/* if cp was the list head, advance the head to the next CPU */
	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}
1049*7c478bd9Sstevel@tonic-gate 
/*
 * Update memory nodes in target lgroups and return ones that get changed
 *
 * For every lgroup in "target", recompute its lgrp_mnodes/lgrp_nmnodes
 * as the union of the memnodes of every lgroup in its memory resource
 * set.  When "changed" is non-NULL it is filled in with the set of
 * lgroups touched.  Returns a count of updates performed (0 when the
 * target set is empty).
 *
 * NOTE(review): the return count and *changed are bumped once per
 * contributing resource lgroup, even when no new memnodes were actually
 * added — callers should only rely on nonzero/emptiness, not exact
 * values; confirm.
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup, checking one memnode bit at a time
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}
1130*7c478bd9Sstevel@tonic-gate 
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node. Both calls pass B_TRUE (is_copy_rename) so the callees apply
	 * the special last-mnode handling described above. The fini must
	 * happen before the init; the order is the whole point of the rename.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
1176*7c478bd9Sstevel@tonic-gate 
1177*7c478bd9Sstevel@tonic-gate /*
1178*7c478bd9Sstevel@tonic-gate  * Called to indicate that the lgrp with platform handle "hand" now
1179*7c478bd9Sstevel@tonic-gate  * contains the memory identified by "mnode".
1180*7c478bd9Sstevel@tonic-gate  *
1181*7c478bd9Sstevel@tonic-gate  * LOCKING for this routine is a bit tricky. Usually it is called without
1182*7c478bd9Sstevel@tonic-gate  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1183*7c478bd9Sstevel@tonic-gate  * callers. During DR of the board containing the caged memory it may be called
1184*7c478bd9Sstevel@tonic-gate  * with cpu_lock already held and CPUs paused.
1185*7c478bd9Sstevel@tonic-gate  *
1186*7c478bd9Sstevel@tonic-gate  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1187*7c478bd9Sstevel@tonic-gate  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1188*7c478bd9Sstevel@tonic-gate  * dealing with the special case of DR copy-rename described in
1189*7c478bd9Sstevel@tonic-gate  * lgrp_mem_rename().
1190*7c478bd9Sstevel@tonic-gate  */
1191*7c478bd9Sstevel@tonic-gate void
1192*7c478bd9Sstevel@tonic-gate lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1193*7c478bd9Sstevel@tonic-gate {
1194*7c478bd9Sstevel@tonic-gate 	klgrpset_t	changed;
1195*7c478bd9Sstevel@tonic-gate 	int		count;
1196*7c478bd9Sstevel@tonic-gate 	int		i;
1197*7c478bd9Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
1198*7c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
1199*7c478bd9Sstevel@tonic-gate 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1200*7c478bd9Sstevel@tonic-gate 	boolean_t	drop_lock = B_FALSE;
1201*7c478bd9Sstevel@tonic-gate 	boolean_t	need_synch = B_FALSE;
1202*7c478bd9Sstevel@tonic-gate 
1203*7c478bd9Sstevel@tonic-gate 	/*
1204*7c478bd9Sstevel@tonic-gate 	 * Grab CPU lock (if we haven't already)
1205*7c478bd9Sstevel@tonic-gate 	 */
1206*7c478bd9Sstevel@tonic-gate 	if (!MUTEX_HELD(&cpu_lock)) {
1207*7c478bd9Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
1208*7c478bd9Sstevel@tonic-gate 		drop_lock = B_TRUE;
1209*7c478bd9Sstevel@tonic-gate 	}
1210*7c478bd9Sstevel@tonic-gate 
1211*7c478bd9Sstevel@tonic-gate 	/*
1212*7c478bd9Sstevel@tonic-gate 	 * This routine may be called from a context where we already
1213*7c478bd9Sstevel@tonic-gate 	 * hold cpu_lock, and have already paused cpus.
1214*7c478bd9Sstevel@tonic-gate 	 */
1215*7c478bd9Sstevel@tonic-gate 	if (!cpus_paused())
1216*7c478bd9Sstevel@tonic-gate 		need_synch = B_TRUE;
1217*7c478bd9Sstevel@tonic-gate 
1218*7c478bd9Sstevel@tonic-gate 	/*
1219*7c478bd9Sstevel@tonic-gate 	 * Check if this mnode is already configured and return immediately if
1220*7c478bd9Sstevel@tonic-gate 	 * it is.
1221*7c478bd9Sstevel@tonic-gate 	 *
1222*7c478bd9Sstevel@tonic-gate 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1223*7c478bd9Sstevel@tonic-gate 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1224*7c478bd9Sstevel@tonic-gate 	 * recognize this case and continue as usual, but skip the update to
1225*7c478bd9Sstevel@tonic-gate 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1226*7c478bd9Sstevel@tonic-gate 	 * in topology, temporarily introduced by lgrp_mem_fini().
1227*7c478bd9Sstevel@tonic-gate 	 */
1228*7c478bd9Sstevel@tonic-gate 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1229*7c478bd9Sstevel@tonic-gate 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1230*7c478bd9Sstevel@tonic-gate 		if (drop_lock)
1231*7c478bd9Sstevel@tonic-gate 			mutex_exit(&cpu_lock);
1232*7c478bd9Sstevel@tonic-gate 		return;
1233*7c478bd9Sstevel@tonic-gate 	}
1234*7c478bd9Sstevel@tonic-gate 
1235*7c478bd9Sstevel@tonic-gate 	/*
1236*7c478bd9Sstevel@tonic-gate 	 * Update lgroup topology with new memory resources, keeping track of
1237*7c478bd9Sstevel@tonic-gate 	 * which lgroups change
1238*7c478bd9Sstevel@tonic-gate 	 */
1239*7c478bd9Sstevel@tonic-gate 	count = 0;
1240*7c478bd9Sstevel@tonic-gate 	klgrpset_clear(changed);
1241*7c478bd9Sstevel@tonic-gate 	my_lgrp = lgrp_hand_to_lgrp(hand);
1242*7c478bd9Sstevel@tonic-gate 	if (my_lgrp == NULL) {
1243*7c478bd9Sstevel@tonic-gate 		/* new lgrp */
1244*7c478bd9Sstevel@tonic-gate 		my_lgrp = lgrp_create();
1245*7c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
1246*7c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_plathand = hand;
1247*7c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1248*7c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1249*7c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1250*7c478bd9Sstevel@tonic-gate 
1251*7c478bd9Sstevel@tonic-gate 		if (need_synch)
1252*7c478bd9Sstevel@tonic-gate 			pause_cpus(NULL);
1253*7c478bd9Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1254*7c478bd9Sstevel@tonic-gate 		    &changed);
1255*7c478bd9Sstevel@tonic-gate 		if (need_synch)
1256*7c478bd9Sstevel@tonic-gate 			start_cpus();
1257*7c478bd9Sstevel@tonic-gate 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1258*7c478bd9Sstevel@tonic-gate 	    > 0) {
1259*7c478bd9Sstevel@tonic-gate 		/*
1260*7c478bd9Sstevel@tonic-gate 		 * Leaf lgroup was created, but latency wasn't available
1261*7c478bd9Sstevel@tonic-gate 		 * then.  So, set latency for it and fill in rest of lgroup
1262*7c478bd9Sstevel@tonic-gate 		 * topology  now that we know how far it is from other leaf
1263*7c478bd9Sstevel@tonic-gate 		 * lgroups.
1264*7c478bd9Sstevel@tonic-gate 		 */
1265*7c478bd9Sstevel@tonic-gate 		klgrpset_clear(changed);
1266*7c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
1267*7c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1268*7c478bd9Sstevel@tonic-gate 		    lgrpid))
1269*7c478bd9Sstevel@tonic-gate 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1270*7c478bd9Sstevel@tonic-gate 		if (need_synch)
1271*7c478bd9Sstevel@tonic-gate 			pause_cpus(NULL);
1272*7c478bd9Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1273*7c478bd9Sstevel@tonic-gate 		    &changed);
1274*7c478bd9Sstevel@tonic-gate 		if (need_synch)
1275*7c478bd9Sstevel@tonic-gate 			start_cpus();
1276*7c478bd9Sstevel@tonic-gate 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1277*7c478bd9Sstevel@tonic-gate 	    my_lgrp->lgrp_id)) {
1278*7c478bd9Sstevel@tonic-gate 		klgrpset_add(changed, lgrpid);
1279*7c478bd9Sstevel@tonic-gate 		count = 1;
1280*7c478bd9Sstevel@tonic-gate 
1281*7c478bd9Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
1282*7c478bd9Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1283*7c478bd9Sstevel@tonic-gate 		klgrpset_add(changed, lgrpid);
1284*7c478bd9Sstevel@tonic-gate 		count++;
1285*7c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
1286*7c478bd9Sstevel@tonic-gate 			lgrp_t		*lgrp;
1287*7c478bd9Sstevel@tonic-gate 
1288*7c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
1289*7c478bd9Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
1290*7c478bd9Sstevel@tonic-gate 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1291*7c478bd9Sstevel@tonic-gate 				continue;
1292*7c478bd9Sstevel@tonic-gate 
1293*7c478bd9Sstevel@tonic-gate 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1294*7c478bd9Sstevel@tonic-gate 			klgrpset_add(changed, lgrp->lgrp_id);
1295*7c478bd9Sstevel@tonic-gate 			count++;
1296*7c478bd9Sstevel@tonic-gate 		}
1297*7c478bd9Sstevel@tonic-gate 	}
1298*7c478bd9Sstevel@tonic-gate 
1299*7c478bd9Sstevel@tonic-gate 	/*
1300*7c478bd9Sstevel@tonic-gate 	 * Add memory node to lgroup and remove lgroup from ones that need
1301*7c478bd9Sstevel@tonic-gate 	 * to be updated
1302*7c478bd9Sstevel@tonic-gate 	 */
1303*7c478bd9Sstevel@tonic-gate 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1304*7c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1305*7c478bd9Sstevel@tonic-gate 		my_lgrp->lgrp_nmnodes++;
1306*7c478bd9Sstevel@tonic-gate 	}
1307*7c478bd9Sstevel@tonic-gate 	klgrpset_del(changed, lgrpid);
1308*7c478bd9Sstevel@tonic-gate 
1309*7c478bd9Sstevel@tonic-gate 	/*
1310*7c478bd9Sstevel@tonic-gate 	 * Update memory node information for all lgroups that changed and
1311*7c478bd9Sstevel@tonic-gate 	 * contain new memory node as a resource
1312*7c478bd9Sstevel@tonic-gate 	 */
1313*7c478bd9Sstevel@tonic-gate 	if (count)
1314*7c478bd9Sstevel@tonic-gate 		(void) lgrp_mnode_update(changed, NULL);
1315*7c478bd9Sstevel@tonic-gate 
1316*7c478bd9Sstevel@tonic-gate 	if (drop_lock)
1317*7c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
1318*7c478bd9Sstevel@tonic-gate }
1319*7c478bd9Sstevel@tonic-gate 
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;	/* lgroups affected by leaf deletion */
	int		count;		/* # of lgroups changed by delete */
	int		i;
	lgrp_t		*my_lgrp;	/* leaf lgroup for "hand" */
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;	/* did we take cpu_lock here? */
	boolean_t	need_synch = B_FALSE;	/* must we pause CPUs ourselves? */

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	/* The root must always keep at least one mnode (see above) */
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename &&
		(my_lgrp == lgrp_root) &&
		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1459*7c478bd9Sstevel@tonic-gate 
1460*7c478bd9Sstevel@tonic-gate /*
1461*7c478bd9Sstevel@tonic-gate  * Return lgroup with given platform handle
1462*7c478bd9Sstevel@tonic-gate  */
1463*7c478bd9Sstevel@tonic-gate lgrp_t *
1464*7c478bd9Sstevel@tonic-gate lgrp_hand_to_lgrp(lgrp_handle_t hand)
1465*7c478bd9Sstevel@tonic-gate {
1466*7c478bd9Sstevel@tonic-gate 	int	i;
1467*7c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp;
1468*7c478bd9Sstevel@tonic-gate 
1469*7c478bd9Sstevel@tonic-gate 	if (hand == LGRP_NULL_HANDLE)
1470*7c478bd9Sstevel@tonic-gate 		return (NULL);
1471*7c478bd9Sstevel@tonic-gate 
1472*7c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
1473*7c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
1474*7c478bd9Sstevel@tonic-gate 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1475*7c478bd9Sstevel@tonic-gate 			return (lgrp);
1476*7c478bd9Sstevel@tonic-gate 	}
1477*7c478bd9Sstevel@tonic-gate 	return (NULL);
1478*7c478bd9Sstevel@tonic-gate }
1479*7c478bd9Sstevel@tonic-gate 
1480*7c478bd9Sstevel@tonic-gate /*
1481*7c478bd9Sstevel@tonic-gate  * Return the home lgroup of the current thread.
1482*7c478bd9Sstevel@tonic-gate  * We must do this with kernel preemption disabled, since we don't want our
1483*7c478bd9Sstevel@tonic-gate  * thread to be re-homed while we're poking around with its lpl, and the lpl
1484*7c478bd9Sstevel@tonic-gate  * should never be NULL.
1485*7c478bd9Sstevel@tonic-gate  *
1486*7c478bd9Sstevel@tonic-gate  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1487*7c478bd9Sstevel@tonic-gate  * is enabled because of DR.  Callers can use disable kernel preemption
1488*7c478bd9Sstevel@tonic-gate  * around this call to guarantee that the lgroup will be valid beyond this
1489*7c478bd9Sstevel@tonic-gate  * routine, since kernel preemption can be recursive.
1490*7c478bd9Sstevel@tonic-gate  */
1491*7c478bd9Sstevel@tonic-gate lgrp_t *
1492*7c478bd9Sstevel@tonic-gate lgrp_home_lgrp(void)
1493*7c478bd9Sstevel@tonic-gate {
1494*7c478bd9Sstevel@tonic-gate 	lgrp_t	*lgrp;
1495*7c478bd9Sstevel@tonic-gate 	lpl_t	*lpl;
1496*7c478bd9Sstevel@tonic-gate 
1497*7c478bd9Sstevel@tonic-gate 	kpreempt_disable();
1498*7c478bd9Sstevel@tonic-gate 
1499*7c478bd9Sstevel@tonic-gate 	lpl = curthread->t_lpl;
1500*7c478bd9Sstevel@tonic-gate 	ASSERT(lpl != NULL);
1501*7c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1502*7c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1503*7c478bd9Sstevel@tonic-gate 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1504*7c478bd9Sstevel@tonic-gate 
1505*7c478bd9Sstevel@tonic-gate 	kpreempt_enable();
1506*7c478bd9Sstevel@tonic-gate 
1507*7c478bd9Sstevel@tonic-gate 	return (lgrp);
1508*7c478bd9Sstevel@tonic-gate }
1509*7c478bd9Sstevel@tonic-gate 
1510*7c478bd9Sstevel@tonic-gate /*
1511*7c478bd9Sstevel@tonic-gate  * Return ID of home lgroup for given thread
1512*7c478bd9Sstevel@tonic-gate  * (See comments for lgrp_home_lgrp() for special care and handling
1513*7c478bd9Sstevel@tonic-gate  * instructions)
1514*7c478bd9Sstevel@tonic-gate  */
1515*7c478bd9Sstevel@tonic-gate lgrp_id_t
1516*7c478bd9Sstevel@tonic-gate lgrp_home_id(kthread_t *t)
1517*7c478bd9Sstevel@tonic-gate {
1518*7c478bd9Sstevel@tonic-gate 	lgrp_id_t	lgrp;
1519*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
1520*7c478bd9Sstevel@tonic-gate 
1521*7c478bd9Sstevel@tonic-gate 	ASSERT(t != NULL);
1522*7c478bd9Sstevel@tonic-gate 	/*
1523*7c478bd9Sstevel@tonic-gate 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1524*7c478bd9Sstevel@tonic-gate 	 * cannot since the HAT layer can call into this routine to
1525*7c478bd9Sstevel@tonic-gate 	 * determine the locality for its data structures in the context
1526*7c478bd9Sstevel@tonic-gate 	 * of a page fault.
1527*7c478bd9Sstevel@tonic-gate 	 */
1528*7c478bd9Sstevel@tonic-gate 
1529*7c478bd9Sstevel@tonic-gate 	kpreempt_disable();
1530*7c478bd9Sstevel@tonic-gate 
1531*7c478bd9Sstevel@tonic-gate 	lpl = t->t_lpl;
1532*7c478bd9Sstevel@tonic-gate 	ASSERT(lpl != NULL);
1533*7c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1534*7c478bd9Sstevel@tonic-gate 	lgrp = lpl->lpl_lgrpid;
1535*7c478bd9Sstevel@tonic-gate 
1536*7c478bd9Sstevel@tonic-gate 	kpreempt_enable();
1537*7c478bd9Sstevel@tonic-gate 
1538*7c478bd9Sstevel@tonic-gate 	return (lgrp);
1539*7c478bd9Sstevel@tonic-gate }
1540*7c478bd9Sstevel@tonic-gate 
1541*7c478bd9Sstevel@tonic-gate /*
1542*7c478bd9Sstevel@tonic-gate  * Return lgroup containing the physical memory for the given page frame number
1543*7c478bd9Sstevel@tonic-gate  */
1544*7c478bd9Sstevel@tonic-gate lgrp_t *
1545*7c478bd9Sstevel@tonic-gate lgrp_pfn_to_lgrp(pfn_t pfn)
1546*7c478bd9Sstevel@tonic-gate {
1547*7c478bd9Sstevel@tonic-gate 	lgrp_handle_t	hand;
1548*7c478bd9Sstevel@tonic-gate 	int		i;
1549*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
1550*7c478bd9Sstevel@tonic-gate 
1551*7c478bd9Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
1552*7c478bd9Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
1553*7c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
1554*7c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
1555*7c478bd9Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1556*7c478bd9Sstevel@tonic-gate 				return (lgrp);
1557*7c478bd9Sstevel@tonic-gate 		}
1558*7c478bd9Sstevel@tonic-gate 	return (NULL);
1559*7c478bd9Sstevel@tonic-gate }
1560*7c478bd9Sstevel@tonic-gate 
1561*7c478bd9Sstevel@tonic-gate /*
1562*7c478bd9Sstevel@tonic-gate  * Return lgroup containing the physical memory for the given page frame number
1563*7c478bd9Sstevel@tonic-gate  */
1564*7c478bd9Sstevel@tonic-gate lgrp_t *
1565*7c478bd9Sstevel@tonic-gate lgrp_phys_to_lgrp(u_longlong_t physaddr)
1566*7c478bd9Sstevel@tonic-gate {
1567*7c478bd9Sstevel@tonic-gate 	lgrp_handle_t	hand;
1568*7c478bd9Sstevel@tonic-gate 	int		i;
1569*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
1570*7c478bd9Sstevel@tonic-gate 	pfn_t		pfn;
1571*7c478bd9Sstevel@tonic-gate 
1572*7c478bd9Sstevel@tonic-gate 	pfn = btop(physaddr);
1573*7c478bd9Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
1574*7c478bd9Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
1575*7c478bd9Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
1576*7c478bd9Sstevel@tonic-gate 			lgrp = lgrp_table[i];
1577*7c478bd9Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1578*7c478bd9Sstevel@tonic-gate 				return (lgrp);
1579*7c478bd9Sstevel@tonic-gate 		}
1580*7c478bd9Sstevel@tonic-gate 	return (NULL);
1581*7c478bd9Sstevel@tonic-gate }
1582*7c478bd9Sstevel@tonic-gate 
/*
 * Return the leaf lgroup containing the given CPU
 *
 * Reads the lgroup pointer directly off the CPU's chip structure; no table
 * search or locking is done here, so the caller is responsible for ensuring
 * the CPU cannot go away underneath it.
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_chip->chip_lgrp);
}
1591*7c478bd9Sstevel@tonic-gate 
1592*7c478bd9Sstevel@tonic-gate /*
1593*7c478bd9Sstevel@tonic-gate  * Return the sum of the partition loads in an lgrp divided by
1594*7c478bd9Sstevel@tonic-gate  * the number of CPUs in the lgrp.  This is our best approximation
1595*7c478bd9Sstevel@tonic-gate  * of an 'lgroup load average' for a useful per-lgroup kstat.
1596*7c478bd9Sstevel@tonic-gate  */
1597*7c478bd9Sstevel@tonic-gate static uint64_t
1598*7c478bd9Sstevel@tonic-gate lgrp_sum_loadavgs(lgrp_t *lgrp)
1599*7c478bd9Sstevel@tonic-gate {
1600*7c478bd9Sstevel@tonic-gate 	cpu_t *cpu;
1601*7c478bd9Sstevel@tonic-gate 	int ncpu;
1602*7c478bd9Sstevel@tonic-gate 	uint64_t loads = 0;
1603*7c478bd9Sstevel@tonic-gate 
1604*7c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
1605*7c478bd9Sstevel@tonic-gate 
1606*7c478bd9Sstevel@tonic-gate 	cpu = lgrp->lgrp_cpu;
1607*7c478bd9Sstevel@tonic-gate 	ncpu = lgrp->lgrp_cpucnt;
1608*7c478bd9Sstevel@tonic-gate 
1609*7c478bd9Sstevel@tonic-gate 	if (cpu == NULL || ncpu == 0) {
1610*7c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
1611*7c478bd9Sstevel@tonic-gate 		return (0ull);
1612*7c478bd9Sstevel@tonic-gate 	}
1613*7c478bd9Sstevel@tonic-gate 
1614*7c478bd9Sstevel@tonic-gate 	do {
1615*7c478bd9Sstevel@tonic-gate 		loads += cpu->cpu_lpl->lpl_loadavg;
1616*7c478bd9Sstevel@tonic-gate 		cpu = cpu->cpu_next_lgrp;
1617*7c478bd9Sstevel@tonic-gate 	} while (cpu != lgrp->lgrp_cpu);
1618*7c478bd9Sstevel@tonic-gate 
1619*7c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
1620*7c478bd9Sstevel@tonic-gate 
1621*7c478bd9Sstevel@tonic-gate 	return (loads / ncpu);
1622*7c478bd9Sstevel@tonic-gate }
1623*7c478bd9Sstevel@tonic-gate 
1624*7c478bd9Sstevel@tonic-gate void
1625*7c478bd9Sstevel@tonic-gate lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1626*7c478bd9Sstevel@tonic-gate {
1627*7c478bd9Sstevel@tonic-gate 	struct lgrp_stats *pstats;
1628*7c478bd9Sstevel@tonic-gate 
1629*7c478bd9Sstevel@tonic-gate 	/*
1630*7c478bd9Sstevel@tonic-gate 	 * Verify that the caller isn't trying to add to
1631*7c478bd9Sstevel@tonic-gate 	 * a statistic for an lgroup that has gone away
1632*7c478bd9Sstevel@tonic-gate 	 */
1633*7c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1634*7c478bd9Sstevel@tonic-gate 		return;
1635*7c478bd9Sstevel@tonic-gate 
1636*7c478bd9Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
1637*7c478bd9Sstevel@tonic-gate 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1638*7c478bd9Sstevel@tonic-gate }
1639*7c478bd9Sstevel@tonic-gate 
1640*7c478bd9Sstevel@tonic-gate int64_t
1641*7c478bd9Sstevel@tonic-gate lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1642*7c478bd9Sstevel@tonic-gate {
1643*7c478bd9Sstevel@tonic-gate 	uint64_t val;
1644*7c478bd9Sstevel@tonic-gate 	struct lgrp_stats *pstats;
1645*7c478bd9Sstevel@tonic-gate 
1646*7c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1647*7c478bd9Sstevel@tonic-gate 		return ((int64_t)0);
1648*7c478bd9Sstevel@tonic-gate 
1649*7c478bd9Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
1650*7c478bd9Sstevel@tonic-gate 	LGRP_STAT_READ(pstats, stat, val);
1651*7c478bd9Sstevel@tonic-gate 	return (val);
1652*7c478bd9Sstevel@tonic-gate }
1653*7c478bd9Sstevel@tonic-gate 
1654*7c478bd9Sstevel@tonic-gate /*
1655*7c478bd9Sstevel@tonic-gate  * Reset all kstats for lgrp specified by its lgrpid.
1656*7c478bd9Sstevel@tonic-gate  */
1657*7c478bd9Sstevel@tonic-gate static void
1658*7c478bd9Sstevel@tonic-gate lgrp_kstat_reset(lgrp_id_t lgrpid)
1659*7c478bd9Sstevel@tonic-gate {
1660*7c478bd9Sstevel@tonic-gate 	lgrp_stat_t stat;
1661*7c478bd9Sstevel@tonic-gate 
1662*7c478bd9Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1663*7c478bd9Sstevel@tonic-gate 		return;
1664*7c478bd9Sstevel@tonic-gate 
1665*7c478bd9Sstevel@tonic-gate 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1666*7c478bd9Sstevel@tonic-gate 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1667*7c478bd9Sstevel@tonic-gate 	}
1668*7c478bd9Sstevel@tonic-gate }
1669*7c478bd9Sstevel@tonic-gate 
/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 *
 * Returns 0 in all cases (kstat update routines report success here).
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t		stat;
	struct kstat_named	*ksd;
	lgrp_t			*lgrp;
	lgrp_id_t		lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	/* all lgrp kstats share the single statically allocated data array */
	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		/*
		 * The loop above leaves stat == LGRP_NUM_COUNTER_STATS, so
		 * "stat + <snapshot index>" addresses the snapshot entries
		 * that follow the counters in the ks_data array.
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats.  As above, stat is now
		 * LGRP_NUM_COUNTER_STATS and serves as the base offset for
		 * the snapshot portion of the array.
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
	} else {
		/* any write by a privileged caller resets the counters */
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}
1729*7c478bd9Sstevel@tonic-gate 
1730*7c478bd9Sstevel@tonic-gate int
1731*7c478bd9Sstevel@tonic-gate lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1732*7c478bd9Sstevel@tonic-gate {
1733*7c478bd9Sstevel@tonic-gate 	cpu_t	*cp;
1734*7c478bd9Sstevel@tonic-gate 
1735*7c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
1736*7c478bd9Sstevel@tonic-gate 
1737*7c478bd9Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
1738*7c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
1739*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1740*7c478bd9Sstevel@tonic-gate 	}
1741*7c478bd9Sstevel@tonic-gate 
1742*7c478bd9Sstevel@tonic-gate 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1743*7c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
1744*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1745*7c478bd9Sstevel@tonic-gate 	}
1746*7c478bd9Sstevel@tonic-gate 
1747*7c478bd9Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
1748*7c478bd9Sstevel@tonic-gate 
1749*7c478bd9Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_lgrpid;
1750*7c478bd9Sstevel@tonic-gate 
1751*7c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
1752*7c478bd9Sstevel@tonic-gate 
1753*7c478bd9Sstevel@tonic-gate 	return (0);
1754*7c478bd9Sstevel@tonic-gate }
1755*7c478bd9Sstevel@tonic-gate 
1756*7c478bd9Sstevel@tonic-gate int
1757*7c478bd9Sstevel@tonic-gate lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1758*7c478bd9Sstevel@tonic-gate {
1759*7c478bd9Sstevel@tonic-gate 	cpu_t *cp;
1760*7c478bd9Sstevel@tonic-gate 
1761*7c478bd9Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
1762*7c478bd9Sstevel@tonic-gate 
1763*7c478bd9Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
1764*7c478bd9Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
1765*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1766*7c478bd9Sstevel@tonic-gate 	}
1767*7c478bd9Sstevel@tonic-gate 
1768*7c478bd9Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
1769*7c478bd9Sstevel@tonic-gate 
1770*7c478bd9Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_loadavg;
1771*7c478bd9Sstevel@tonic-gate 
1772*7c478bd9Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
1773*7c478bd9Sstevel@tonic-gate 
1774*7c478bd9Sstevel@tonic-gate 	return (0);
1775*7c478bd9Sstevel@tonic-gate }
1776*7c478bd9Sstevel@tonic-gate 
1777*7c478bd9Sstevel@tonic-gate void
1778*7c478bd9Sstevel@tonic-gate lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1779*7c478bd9Sstevel@tonic-gate {
1780*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
1781*7c478bd9Sstevel@tonic-gate 	int		i;
1782*7c478bd9Sstevel@tonic-gate 
1783*7c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
1784*7c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
1785*7c478bd9Sstevel@tonic-gate 
1786*7c478bd9Sstevel@tonic-gate 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1787*7c478bd9Sstevel@tonic-gate 			lgrp->lgrp_latency = (int)newtime;
1788*7c478bd9Sstevel@tonic-gate 	}
1789*7c478bd9Sstevel@tonic-gate }
1790*7c478bd9Sstevel@tonic-gate 
/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL.  (This list is required to be NULL
 * terminated, too).  This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order.  We hope this
 * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		/*
		 * rset is kept sorted by lgrpid; the first entry whose id
		 * exceeds the leaf's marks the slot where the new leaf
		 * belongs.
		 */
		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/*
	 * insert leaf, update counts.  i is saved as the insertion point
	 * and then repointed at the old end of the array for the shift
	 * loop below.
	 */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;
	/*
	 * The rset array holds LPL_RSET_MAX entries including its NULL
	 * terminator; if the bump above consumed the last slot there is
	 * no room left and we cannot continue.
	 */
	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
		panic("More leaf lgrps in system than are supported!\n");
	}

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 */

	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}
1844*7c478bd9Sstevel@tonic-gate 
/*
 * Update each of lpl_parent's children with a proper hint and
 * a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 *
 * Each child's hint will reference an element in lpl_parent's
 * rset that designates where the child should start searching
 * for CPU resources. The hint selected is the highest order leaf present
 * in the child's lineage.
 *
 * This should be called after any potential change in lpl_parent's
 * rset.
 */
static void
lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
{
	klgrpset_t	children, leaves;
	lpl_t		*lpl;
	int		hint;
	int		i, j;

	/* child set comes from the lgroup topology, not the lpl topology */
	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
	if (klgrpset_isempty(children))
		return; /* nothing to do */

	/* visit each lgroup id that is a child of the parent */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (klgrpset_ismember(children, i)) {

			/*
			 * Given the set of leaves in this child's lineage,
			 * find the highest order leaf present in the parent's
			 * rset. Select this as the hint for the child.
			 */
			leaves = lgrp_table[i]->lgrp_leaves;
			hint = 0;
			/*
			 * Scan the whole rset; the last matching index wins,
			 * which is the highest-ordered matching leaf since
			 * the rset is kept sorted.
			 */
			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
				lpl = lpl_parent->lpl_rset[j];
				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
					hint = j;
			}
			cp->cp_lgrploads[i].lpl_hint = hint;

			/*
			 * (Re)set the parent. It may be incorrect if
			 * lpl_parent is new in the topology.
			 */
			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
		}
	}
}
1896*7c478bd9Sstevel@tonic-gate 
/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 *
 * NOTE(review): ncpu is decremented by exactly one here, while
 * lpl_rset_add adds the leaf's full lpl_ncpu -- presumably a leaf is
 * only deleted once it is down to a single CPU; confirm with callers.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/*
	 * return if leaf not found.  If the loop ran to completion,
	 * i == lpl_nrset and this reads the rset's NULL terminator,
	 * which can never equal lpl_leaf.
	 */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/*
	 * prune leaf, compress array.  The post-decrement stores NULL at
	 * index lpl_nrset (one past the last valid entry) before shrinking
	 * the count; the copy loop below then shifts everything -- that
	 * terminator included -- down one slot, filling the hole left at i.
	 */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}
1927*7c478bd9Sstevel@tonic-gate 
1928*7c478bd9Sstevel@tonic-gate /*
1929*7c478bd9Sstevel@tonic-gate  * Check to see if the resource set of the target lpl contains the
1930*7c478bd9Sstevel@tonic-gate  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1931*7c478bd9Sstevel@tonic-gate  */
1932*7c478bd9Sstevel@tonic-gate 
1933*7c478bd9Sstevel@tonic-gate int
1934*7c478bd9Sstevel@tonic-gate lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1935*7c478bd9Sstevel@tonic-gate {
1936*7c478bd9Sstevel@tonic-gate 	int i;
1937*7c478bd9Sstevel@tonic-gate 
1938*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1939*7c478bd9Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1940*7c478bd9Sstevel@tonic-gate 			return (1);
1941*7c478bd9Sstevel@tonic-gate 	}
1942*7c478bd9Sstevel@tonic-gate 
1943*7c478bd9Sstevel@tonic-gate 	return (0);
1944*7c478bd9Sstevel@tonic-gate }
1945*7c478bd9Sstevel@tonic-gate 
1946*7c478bd9Sstevel@tonic-gate /*
1947*7c478bd9Sstevel@tonic-gate  * Called when we change cpu lpl membership.  This increments or decrements the
1948*7c478bd9Sstevel@tonic-gate  * per-cpu counter in every lpl in which our leaf appears.
1949*7c478bd9Sstevel@tonic-gate  */
1950*7c478bd9Sstevel@tonic-gate void
1951*7c478bd9Sstevel@tonic-gate lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1952*7c478bd9Sstevel@tonic-gate {
1953*7c478bd9Sstevel@tonic-gate 	cpupart_t	*cpupart;
1954*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
1955*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
1956*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
1957*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
1958*7c478bd9Sstevel@tonic-gate 	int		i;
1959*7c478bd9Sstevel@tonic-gate 
1960*7c478bd9Sstevel@tonic-gate 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1961*7c478bd9Sstevel@tonic-gate 
1962*7c478bd9Sstevel@tonic-gate 	cpupart = cp->cpu_part;
1963*7c478bd9Sstevel@tonic-gate 	lpl_leaf = cp->cpu_lpl;
1964*7c478bd9Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1965*7c478bd9Sstevel@tonic-gate 
1966*7c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
1967*7c478bd9Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
1968*7c478bd9Sstevel@tonic-gate 
1969*7c478bd9Sstevel@tonic-gate 		/*
1970*7c478bd9Sstevel@tonic-gate 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1971*7c478bd9Sstevel@tonic-gate 		 * for the cpu in question, or if the current lgrp and leaf
1972*7c478bd9Sstevel@tonic-gate 		 * don't share the same resources.
1973*7c478bd9Sstevel@tonic-gate 		 */
1974*7c478bd9Sstevel@tonic-gate 
1975*7c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1976*7c478bd9Sstevel@tonic-gate 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1977*7c478bd9Sstevel@tonic-gate 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1978*7c478bd9Sstevel@tonic-gate 			continue;
1979*7c478bd9Sstevel@tonic-gate 
1980*7c478bd9Sstevel@tonic-gate 
1981*7c478bd9Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1982*7c478bd9Sstevel@tonic-gate 
1983*7c478bd9Sstevel@tonic-gate 		if (lpl_cur->lpl_nrset > 0) {
1984*7c478bd9Sstevel@tonic-gate 			if (act == LPL_INCREMENT) {
1985*7c478bd9Sstevel@tonic-gate 				lpl_cur->lpl_ncpu++;
1986*7c478bd9Sstevel@tonic-gate 			} else if (act == LPL_DECREMENT) {
1987*7c478bd9Sstevel@tonic-gate 				lpl_cur->lpl_ncpu--;
1988*7c478bd9Sstevel@tonic-gate 			}
1989*7c478bd9Sstevel@tonic-gate 		}
1990*7c478bd9Sstevel@tonic-gate 	}
1991*7c478bd9Sstevel@tonic-gate }
1992*7c478bd9Sstevel@tonic-gate 
1993*7c478bd9Sstevel@tonic-gate /*
1994*7c478bd9Sstevel@tonic-gate  * Initialize lpl with given resources and specified lgrp
1995*7c478bd9Sstevel@tonic-gate  */
1996*7c478bd9Sstevel@tonic-gate 
1997*7c478bd9Sstevel@tonic-gate void
1998*7c478bd9Sstevel@tonic-gate lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
1999*7c478bd9Sstevel@tonic-gate {
2000*7c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2001*7c478bd9Sstevel@tonic-gate 	lpl->lpl_loadavg = 0;
2002*7c478bd9Sstevel@tonic-gate 	if (lpl == lpl_leaf)
2003*7c478bd9Sstevel@tonic-gate 		lpl->lpl_ncpu = 1;
2004*7c478bd9Sstevel@tonic-gate 	else
2005*7c478bd9Sstevel@tonic-gate 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2006*7c478bd9Sstevel@tonic-gate 	lpl->lpl_nrset = 1;
2007*7c478bd9Sstevel@tonic-gate 	lpl->lpl_rset[0] = lpl_leaf;
2008*7c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrp = lgrp;
2009*7c478bd9Sstevel@tonic-gate 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2010*7c478bd9Sstevel@tonic-gate 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2011*7c478bd9Sstevel@tonic-gate }
2012*7c478bd9Sstevel@tonic-gate 
2013*7c478bd9Sstevel@tonic-gate /*
2014*7c478bd9Sstevel@tonic-gate  * Clear an unused lpl
2015*7c478bd9Sstevel@tonic-gate  */
2016*7c478bd9Sstevel@tonic-gate 
2017*7c478bd9Sstevel@tonic-gate void
2018*7c478bd9Sstevel@tonic-gate lpl_clear(lpl_t *lpl)
2019*7c478bd9Sstevel@tonic-gate {
2020*7c478bd9Sstevel@tonic-gate 	lgrpid_t	lid;
2021*7c478bd9Sstevel@tonic-gate 
2022*7c478bd9Sstevel@tonic-gate 	/* save lid for debugging purposes */
2023*7c478bd9Sstevel@tonic-gate 	lid = lpl->lpl_lgrpid;
2024*7c478bd9Sstevel@tonic-gate 	bzero(lpl, sizeof (lpl_t));
2025*7c478bd9Sstevel@tonic-gate 	lpl->lpl_lgrpid = lid;
2026*7c478bd9Sstevel@tonic-gate }
2027*7c478bd9Sstevel@tonic-gate 
/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may
 * not make full use of all of the lgroup topology, but this checks to make
 * sure that for the parts that it does use, it has correctly understood
 * the relationships that exist.  This function returns 0 if the topology
 * is correct, and a non-zero error code if it is incorrect (on non-DEBUG
 * kernels).  Asserts are spread throughout the code to aid in debugging
 * on a DEBUG kernel.
 */
2038*7c478bd9Sstevel@tonic-gate int
2039*7c478bd9Sstevel@tonic-gate lpl_topo_verify(cpupart_t *cpupart)
2040*7c478bd9Sstevel@tonic-gate {
2041*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp;
2042*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
2043*7c478bd9Sstevel@tonic-gate 	klgrpset_t	rset;
2044*7c478bd9Sstevel@tonic-gate 	klgrpset_t	cset;
2045*7c478bd9Sstevel@tonic-gate 	cpu_t		*cpu;
2046*7c478bd9Sstevel@tonic-gate 	cpu_t		*cp_start;
2047*7c478bd9Sstevel@tonic-gate 	int		i;
2048*7c478bd9Sstevel@tonic-gate 	int		j;
2049*7c478bd9Sstevel@tonic-gate 	int		sum;
2050*7c478bd9Sstevel@tonic-gate 
2051*7c478bd9Sstevel@tonic-gate 	/* topology can't be incorrect if it doesn't exist */
2052*7c478bd9Sstevel@tonic-gate 	if (!lgrp_topo_initialized || !lgrp_initialized)
2053*7c478bd9Sstevel@tonic-gate 		return (LPL_TOPO_CORRECT);
2054*7c478bd9Sstevel@tonic-gate 
2055*7c478bd9Sstevel@tonic-gate 	ASSERT(cpupart != NULL);
2056*7c478bd9Sstevel@tonic-gate 
2057*7c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
2058*7c478bd9Sstevel@tonic-gate 		lgrp = lgrp_table[i];
2059*7c478bd9Sstevel@tonic-gate 		lpl = NULL;
2060*7c478bd9Sstevel@tonic-gate 		/* make sure lpls are allocated */
2061*7c478bd9Sstevel@tonic-gate 		ASSERT(cpupart->cp_lgrploads);
2062*7c478bd9Sstevel@tonic-gate 		if (!cpupart->cp_lgrploads)
2063*7c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_PART_HAS_NO_LPL);
2064*7c478bd9Sstevel@tonic-gate 
2065*7c478bd9Sstevel@tonic-gate 		lpl = &cpupart->cp_lgrploads[i];
2066*7c478bd9Sstevel@tonic-gate 		/* make sure our index is good */
2067*7c478bd9Sstevel@tonic-gate 		ASSERT(i < cpupart->cp_nlgrploads);
2068*7c478bd9Sstevel@tonic-gate 
2069*7c478bd9Sstevel@tonic-gate 		/* if lgroup doesn't exist, make sure lpl is empty */
2070*7c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp)) {
2071*7c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
2072*7c478bd9Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
2073*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2074*7c478bd9Sstevel@tonic-gate 			} else {
2075*7c478bd9Sstevel@tonic-gate 				continue;
2076*7c478bd9Sstevel@tonic-gate 			}
2077*7c478bd9Sstevel@tonic-gate 		}
2078*7c478bd9Sstevel@tonic-gate 
2079*7c478bd9Sstevel@tonic-gate 		/* verify that lgroup and lpl are identically numbered */
2080*7c478bd9Sstevel@tonic-gate 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2081*7c478bd9Sstevel@tonic-gate 
2082*7c478bd9Sstevel@tonic-gate 		/* if lgroup isn't in our partition, make sure lpl is empty */
2083*7c478bd9Sstevel@tonic-gate 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2084*7c478bd9Sstevel@tonic-gate 		    cpupart->cp_lgrpset)) {
2085*7c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
2086*7c478bd9Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
2087*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2088*7c478bd9Sstevel@tonic-gate 			}
2089*7c478bd9Sstevel@tonic-gate 			/*
2090*7c478bd9Sstevel@tonic-gate 			 * lpl is empty, and lgroup isn't in partition.  verify
2091*7c478bd9Sstevel@tonic-gate 			 * that lpl doesn't show up in anyone else's rsets (in
2092*7c478bd9Sstevel@tonic-gate 			 * this partition, anyway)
2093*7c478bd9Sstevel@tonic-gate 			 */
2094*7c478bd9Sstevel@tonic-gate 
2095*7c478bd9Sstevel@tonic-gate 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2096*7c478bd9Sstevel@tonic-gate 				lpl_t *i_lpl; /* lpl we're iterating over */
2097*7c478bd9Sstevel@tonic-gate 
2098*7c478bd9Sstevel@tonic-gate 				i_lpl = &cpupart->cp_lgrploads[j];
2099*7c478bd9Sstevel@tonic-gate 
2100*7c478bd9Sstevel@tonic-gate 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2101*7c478bd9Sstevel@tonic-gate 				if (lpl_rset_contains(i_lpl, lpl)) {
2102*7c478bd9Sstevel@tonic-gate 					return (LPL_TOPO_LPL_ORPHANED);
2103*7c478bd9Sstevel@tonic-gate 				}
2104*7c478bd9Sstevel@tonic-gate 			}
2105*7c478bd9Sstevel@tonic-gate 			/* lgroup is empty, and everything is ok. continue */
2106*7c478bd9Sstevel@tonic-gate 			continue;
2107*7c478bd9Sstevel@tonic-gate 		}
2108*7c478bd9Sstevel@tonic-gate 
2109*7c478bd9Sstevel@tonic-gate 
2110*7c478bd9Sstevel@tonic-gate 		/* lgroup is in this partition, now check it against lpl */
2111*7c478bd9Sstevel@tonic-gate 
2112*7c478bd9Sstevel@tonic-gate 		/* do both have matching lgrps? */
2113*7c478bd9Sstevel@tonic-gate 		ASSERT(lgrp == lpl->lpl_lgrp);
2114*7c478bd9Sstevel@tonic-gate 		if (lgrp != lpl->lpl_lgrp) {
2115*7c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_LGRP_MISMATCH);
2116*7c478bd9Sstevel@tonic-gate 		}
2117*7c478bd9Sstevel@tonic-gate 
2118*7c478bd9Sstevel@tonic-gate 		/* do the parent lgroups exist and do they match? */
2119*7c478bd9Sstevel@tonic-gate 		if (lgrp->lgrp_parent) {
2120*7c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_parent);
2121*7c478bd9Sstevel@tonic-gate 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2122*7c478bd9Sstevel@tonic-gate 				    lpl->lpl_parent->lpl_lgrpid);
2123*7c478bd9Sstevel@tonic-gate 
2124*7c478bd9Sstevel@tonic-gate 			if (!lpl->lpl_parent) {
2125*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_MISSING_PARENT);
2126*7c478bd9Sstevel@tonic-gate 			} else if (lgrp->lgrp_parent->lgrp_id !=
2127*7c478bd9Sstevel@tonic-gate 			    lpl->lpl_parent->lpl_lgrpid) {
2128*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_PARENT_MISMATCH);
2129*7c478bd9Sstevel@tonic-gate 			}
2130*7c478bd9Sstevel@tonic-gate 		}
2131*7c478bd9Sstevel@tonic-gate 
2132*7c478bd9Sstevel@tonic-gate 		/* only leaf lgroups keep a cpucnt, only check leaves */
2133*7c478bd9Sstevel@tonic-gate 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2134*7c478bd9Sstevel@tonic-gate 
2135*7c478bd9Sstevel@tonic-gate 			/* verify that lgrp is also a leaf */
2136*7c478bd9Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2137*7c478bd9Sstevel@tonic-gate 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2138*7c478bd9Sstevel@tonic-gate 			    lpl->lpl_lgrpid)));
2139*7c478bd9Sstevel@tonic-gate 
2140*7c478bd9Sstevel@tonic-gate 			if ((lgrp->lgrp_childcnt > 0) ||
2141*7c478bd9Sstevel@tonic-gate 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2142*7c478bd9Sstevel@tonic-gate 			    lpl->lpl_lgrpid))) {
2143*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LGRP_NOT_LEAF);
2144*7c478bd9Sstevel@tonic-gate 			}
2145*7c478bd9Sstevel@tonic-gate 
2146*7c478bd9Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2147*7c478bd9Sstevel@tonic-gate 			    (lpl->lpl_ncpu > 0));
2148*7c478bd9Sstevel@tonic-gate 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2149*7c478bd9Sstevel@tonic-gate 				(lpl->lpl_ncpu <= 0)) {
2150*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BAD_CPUCNT);
2151*7c478bd9Sstevel@tonic-gate 			}
2152*7c478bd9Sstevel@tonic-gate 
2153*7c478bd9Sstevel@tonic-gate 			/*
2154*7c478bd9Sstevel@tonic-gate 			 * Check that lpl_ncpu also matches the number of
2155*7c478bd9Sstevel@tonic-gate 			 * cpus in the lpl's linked list.  This only exists in
2156*7c478bd9Sstevel@tonic-gate 			 * leaves, but they should always match.
2157*7c478bd9Sstevel@tonic-gate 			 */
2158*7c478bd9Sstevel@tonic-gate 			j = 0;
2159*7c478bd9Sstevel@tonic-gate 			cpu = cp_start = lpl->lpl_cpus;
2160*7c478bd9Sstevel@tonic-gate 			while (cpu != NULL) {
2161*7c478bd9Sstevel@tonic-gate 				j++;
2162*7c478bd9Sstevel@tonic-gate 
2163*7c478bd9Sstevel@tonic-gate 				/* check to make sure cpu's lpl is leaf lpl */
2164*7c478bd9Sstevel@tonic-gate 				ASSERT(cpu->cpu_lpl == lpl);
2165*7c478bd9Sstevel@tonic-gate 				if (cpu->cpu_lpl != lpl) {
2166*7c478bd9Sstevel@tonic-gate 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2167*7c478bd9Sstevel@tonic-gate 				}
2168*7c478bd9Sstevel@tonic-gate 
2169*7c478bd9Sstevel@tonic-gate 				/* check next cpu */
2170*7c478bd9Sstevel@tonic-gate 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2171*7c478bd9Sstevel@tonic-gate 					continue;
2172*7c478bd9Sstevel@tonic-gate 				} else {
2173*7c478bd9Sstevel@tonic-gate 					cpu = NULL;
2174*7c478bd9Sstevel@tonic-gate 				}
2175*7c478bd9Sstevel@tonic-gate 			}
2176*7c478bd9Sstevel@tonic-gate 
2177*7c478bd9Sstevel@tonic-gate 			ASSERT(j == lpl->lpl_ncpu);
2178*7c478bd9Sstevel@tonic-gate 			if (j != lpl->lpl_ncpu) {
2179*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
2180*7c478bd9Sstevel@tonic-gate 			}
2181*7c478bd9Sstevel@tonic-gate 
2182*7c478bd9Sstevel@tonic-gate 			/*
2183*7c478bd9Sstevel@tonic-gate 			 * Also, check that leaf lpl is contained in all
2184*7c478bd9Sstevel@tonic-gate 			 * intermediate lpls that name the leaf as a descendant
2185*7c478bd9Sstevel@tonic-gate 			 */
2186*7c478bd9Sstevel@tonic-gate 
2187*7c478bd9Sstevel@tonic-gate 			for (j = 0; j <= lgrp_alloc_max; j++) {
2188*7c478bd9Sstevel@tonic-gate 				klgrpset_t intersect;
2189*7c478bd9Sstevel@tonic-gate 				lgrp_t *lgrp_cand;
2190*7c478bd9Sstevel@tonic-gate 				lpl_t *lpl_cand;
2191*7c478bd9Sstevel@tonic-gate 
2192*7c478bd9Sstevel@tonic-gate 				lgrp_cand = lgrp_table[j];
2193*7c478bd9Sstevel@tonic-gate 				intersect = klgrpset_intersects(
2194*7c478bd9Sstevel@tonic-gate 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2195*7c478bd9Sstevel@tonic-gate 				    cpupart->cp_lgrpset);
2196*7c478bd9Sstevel@tonic-gate 
2197*7c478bd9Sstevel@tonic-gate 				if (!LGRP_EXISTS(lgrp_cand) ||
2198*7c478bd9Sstevel@tonic-gate 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2199*7c478bd9Sstevel@tonic-gate 				    cpupart->cp_lgrpset) ||
2200*7c478bd9Sstevel@tonic-gate 				    (intersect == 0))
2201*7c478bd9Sstevel@tonic-gate 					continue;
2202*7c478bd9Sstevel@tonic-gate 
2203*7c478bd9Sstevel@tonic-gate 				lpl_cand =
2204*7c478bd9Sstevel@tonic-gate 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2205*7c478bd9Sstevel@tonic-gate 
2206*7c478bd9Sstevel@tonic-gate 				if (klgrpset_ismember(intersect,
2207*7c478bd9Sstevel@tonic-gate 				    lgrp->lgrp_id)) {
2208*7c478bd9Sstevel@tonic-gate 					ASSERT(lpl_rset_contains(lpl_cand,
2209*7c478bd9Sstevel@tonic-gate 					    lpl));
2210*7c478bd9Sstevel@tonic-gate 
2211*7c478bd9Sstevel@tonic-gate 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2212*7c478bd9Sstevel@tonic-gate 						return (LPL_TOPO_RSET_MSSNG_LF);
2213*7c478bd9Sstevel@tonic-gate 					}
2214*7c478bd9Sstevel@tonic-gate 				}
2215*7c478bd9Sstevel@tonic-gate 			}
2216*7c478bd9Sstevel@tonic-gate 
2217*7c478bd9Sstevel@tonic-gate 		} else { /* non-leaf specific checks */
2218*7c478bd9Sstevel@tonic-gate 
2219*7c478bd9Sstevel@tonic-gate 			/*
2220*7c478bd9Sstevel@tonic-gate 			 * Non-leaf lpls should have lpl_cpus == NULL
2221*7c478bd9Sstevel@tonic-gate 			 * verify that this is so
2222*7c478bd9Sstevel@tonic-gate 			 */
2223*7c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_cpus == NULL);
2224*7c478bd9Sstevel@tonic-gate 			if (lpl->lpl_cpus != NULL) {
2225*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2226*7c478bd9Sstevel@tonic-gate 			}
2227*7c478bd9Sstevel@tonic-gate 
2228*7c478bd9Sstevel@tonic-gate 			/*
2229*7c478bd9Sstevel@tonic-gate 			 * verify that the sum of the cpus in the leaf resources
2230*7c478bd9Sstevel@tonic-gate 			 * is equal to the total ncpu in the intermediate
2231*7c478bd9Sstevel@tonic-gate 			 */
2232*7c478bd9Sstevel@tonic-gate 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2233*7c478bd9Sstevel@tonic-gate 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2234*7c478bd9Sstevel@tonic-gate 			}
2235*7c478bd9Sstevel@tonic-gate 
2236*7c478bd9Sstevel@tonic-gate 			ASSERT(sum == lpl->lpl_ncpu);
2237*7c478bd9Sstevel@tonic-gate 			if (sum != lpl->lpl_ncpu) {
2238*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
2239*7c478bd9Sstevel@tonic-gate 			}
2240*7c478bd9Sstevel@tonic-gate 		}
2241*7c478bd9Sstevel@tonic-gate 
2242*7c478bd9Sstevel@tonic-gate 		/*
2243*7c478bd9Sstevel@tonic-gate 		 * check on lpl_hint. Don't check root, since it has no parent.
2244*7c478bd9Sstevel@tonic-gate 		 */
2245*7c478bd9Sstevel@tonic-gate 		if (lpl->lpl_parent != NULL) {
2246*7c478bd9Sstevel@tonic-gate 			int hint;
2247*7c478bd9Sstevel@tonic-gate 			lpl_t *hint_lpl;
2248*7c478bd9Sstevel@tonic-gate 
2249*7c478bd9Sstevel@tonic-gate 			/* make sure hint is within limits of nrset */
2250*7c478bd9Sstevel@tonic-gate 			hint = lpl->lpl_hint;
2251*7c478bd9Sstevel@tonic-gate 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2252*7c478bd9Sstevel@tonic-gate 			if (lpl->lpl_parent->lpl_nrset < hint) {
2253*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BOGUS_HINT);
2254*7c478bd9Sstevel@tonic-gate 			}
2255*7c478bd9Sstevel@tonic-gate 
2256*7c478bd9Sstevel@tonic-gate 			/* make sure hint points to valid lpl */
2257*7c478bd9Sstevel@tonic-gate 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2258*7c478bd9Sstevel@tonic-gate 			ASSERT(hint_lpl->lpl_ncpu > 0);
2259*7c478bd9Sstevel@tonic-gate 			if (hint_lpl->lpl_ncpu <= 0) {
2260*7c478bd9Sstevel@tonic-gate 				return (LPL_TOPO_BOGUS_HINT);
2261*7c478bd9Sstevel@tonic-gate 			}
2262*7c478bd9Sstevel@tonic-gate 		}
2263*7c478bd9Sstevel@tonic-gate 
2264*7c478bd9Sstevel@tonic-gate 		/*
2265*7c478bd9Sstevel@tonic-gate 		 * Check the rset of the lpl in question.  Make sure that each
2266*7c478bd9Sstevel@tonic-gate 		 * rset contains a subset of the resources in
2267*7c478bd9Sstevel@tonic-gate 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2268*7c478bd9Sstevel@tonic-gate 		 * sure that each rset doesn't include resources that are
2269*7c478bd9Sstevel@tonic-gate 		 * outside of that set.  (Which would be resources somehow not
2270*7c478bd9Sstevel@tonic-gate 		 * accounted for).
2271*7c478bd9Sstevel@tonic-gate 		 */
2272*7c478bd9Sstevel@tonic-gate 
2273*7c478bd9Sstevel@tonic-gate 		klgrpset_clear(rset);
2274*7c478bd9Sstevel@tonic-gate 		for (j = 0; j < lpl->lpl_nrset; j++) {
2275*7c478bd9Sstevel@tonic-gate 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2276*7c478bd9Sstevel@tonic-gate 		}
2277*7c478bd9Sstevel@tonic-gate 		klgrpset_copy(cset, rset);
2278*7c478bd9Sstevel@tonic-gate 		/* make sure lpl rset matches lgrp rset */
2279*7c478bd9Sstevel@tonic-gate 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2280*7c478bd9Sstevel@tonic-gate 		/* make sure rset is contained with in partition, too */
2281*7c478bd9Sstevel@tonic-gate 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2282*7c478bd9Sstevel@tonic-gate 
2283*7c478bd9Sstevel@tonic-gate 		ASSERT(klgrpset_isempty(rset) &&
2284*7c478bd9Sstevel@tonic-gate 			    klgrpset_isempty(cset));
2285*7c478bd9Sstevel@tonic-gate 		if (!klgrpset_isempty(rset) ||
2286*7c478bd9Sstevel@tonic-gate 		    !klgrpset_isempty(cset)) {
2287*7c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_RSET_MISMATCH);
2288*7c478bd9Sstevel@tonic-gate 		}
2289*7c478bd9Sstevel@tonic-gate 
2290*7c478bd9Sstevel@tonic-gate 		/*
2291*7c478bd9Sstevel@tonic-gate 		 * check to make sure lpl_nrset matches the number of rsets
2292*7c478bd9Sstevel@tonic-gate 		 * contained in the lpl
2293*7c478bd9Sstevel@tonic-gate 		 */
2294*7c478bd9Sstevel@tonic-gate 
2295*7c478bd9Sstevel@tonic-gate 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2296*7c478bd9Sstevel@tonic-gate 		    j++);
2297*7c478bd9Sstevel@tonic-gate 
2298*7c478bd9Sstevel@tonic-gate 		ASSERT(j == lpl->lpl_nrset);
2299*7c478bd9Sstevel@tonic-gate 		if (j != lpl->lpl_nrset) {
2300*7c478bd9Sstevel@tonic-gate 			return (LPL_TOPO_BAD_RSETCNT);
2301*7c478bd9Sstevel@tonic-gate 		}
2302*7c478bd9Sstevel@tonic-gate 
2303*7c478bd9Sstevel@tonic-gate 	}
2304*7c478bd9Sstevel@tonic-gate 	return (LPL_TOPO_CORRECT);
2305*7c478bd9Sstevel@tonic-gate }
2306*7c478bd9Sstevel@tonic-gate 
2307*7c478bd9Sstevel@tonic-gate /*
2308*7c478bd9Sstevel@tonic-gate  * Flatten lpl topology to given number of levels.  This is presently only
2309*7c478bd9Sstevel@tonic-gate  * implemented for a flatten to 2 levels, which will prune out the intermediates
2310*7c478bd9Sstevel@tonic-gate  * and home the leaf lpls to the root lpl.
2311*7c478bd9Sstevel@tonic-gate  */
int
lpl_topo_flatten(int levels)
{
	int		i;
	uint_t		sum;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_root;
	cpupart_t	*cp;

	/* only a flatten to 2 levels (root + leaves) is implemented */
	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	/* rewrite the topology separately for each cpu partition */
	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			/* skip the root itself and fully-empty slots */
			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted.  (And
				 * whose parent will soon be deleted).  Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}

		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	/* returns the number of levels flattened to (2), 0 if unsupported */
	return (levels);
}
2381*7c478bd9Sstevel@tonic-gate 
2382*7c478bd9Sstevel@tonic-gate /*
2383*7c478bd9Sstevel@tonic-gate  * Insert a lpl into the resource hierarchy and create any additional lpls that
2384*7c478bd9Sstevel@tonic-gate  * are necessary to represent the varying states of locality for the cpu
2385*7c478bd9Sstevel@tonic-gate  * resources newly added to the partition.
2386*7c478bd9Sstevel@tonic-gate  *
2387*7c478bd9Sstevel@tonic-gate  * This routine is clever enough that it can correctly add resources from the
2388*7c478bd9Sstevel@tonic-gate  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2389*7c478bd9Sstevel@tonic-gate  * those for which the lpl is a leaf as opposed to simply a named equally local
2390*7c478bd9Sstevel@tonic-gate  * resource).  The one special case that needs additional processing is when a
2391*7c478bd9Sstevel@tonic-gate  * new intermediate lpl is introduced.  Since the main loop only traverses
2392*7c478bd9Sstevel@tonic-gate  * looking to add the leaf resource where it does not yet exist, additional work
2393*7c478bd9Sstevel@tonic-gate  * is necessary to add other leaf resources that may need to exist in the newly
2394*7c478bd9Sstevel@tonic-gate  * created intermediate.  This is performed by the second inner loop, and is
2395*7c478bd9Sstevel@tonic-gate  * only done when the check for more than one overlapping resource succeeds.
2396*7c478bd9Sstevel@tonic-gate  */
2397*7c478bd9Sstevel@tonic-gate 
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		hint;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	lgrpid_t	parent_id;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	/* visit every lgrp that could name the new leaf as a cpu resource */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't insert if the lgrp isn't there, if the leaf isn't
		 * contained within the current lgrp, or if the current lgrp has
		 * no leaves in this partition
		 */

		if (!LGRP_EXISTS(lgrp_cur) ||
		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
		if (lgrp_cur->lgrp_parent != NULL) {
			/* if lgrp has a parent, assign it properly */
			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
			lpl_parent = &cpupart->cp_lgrploads[parent_id];
		} else {
			/* if not, make sure parent ptr gets set to null */
			lpl_parent = NULL;
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * Almost all leaf state was initialized elsewhere.  The
			 * only thing left to do is to set the parent.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		/*
		 * Initialize intermediate lpl
		 * Save this lpl's hint though. Since we're changing this
		 * lpl's resources, we need to update the hint in this lpl's
		 * children, but the hint in this lpl is unaffected and
		 * should be preserved.
		 */
		hint = lpl_cur->lpl_hint;

		/* reset the intermediate and seed its rset with the new leaf */
		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_hint = hint;
		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
			cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * If so, figure out what lpls have resources that
			 * intersect this one, and add them.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_ismember(rset_intersect,
					lgrp_cand->lgrp_id))
					continue;
				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
				lpl_rset_add(lpl_cur, lpl_cand);
			}
		}
		/*
		 * This lpl's rset has changed. Update the hint in its
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}
2493*7c478bd9Sstevel@tonic-gate 
2494*7c478bd9Sstevel@tonic-gate /*
2495*7c478bd9Sstevel@tonic-gate  * remove a lpl from the hierarchy of resources, clearing its state when
2496*7c478bd9Sstevel@tonic-gate  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2497*7c478bd9Sstevel@tonic-gate  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2498*7c478bd9Sstevel@tonic-gate  * delete them as well.
2499*7c478bd9Sstevel@tonic-gate  */
2500*7c478bd9Sstevel@tonic-gate 
2501*7c478bd9Sstevel@tonic-gate void
2502*7c478bd9Sstevel@tonic-gate lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2503*7c478bd9Sstevel@tonic-gate {
2504*7c478bd9Sstevel@tonic-gate 	int		i;
2505*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
2506*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_cur;
2507*7c478bd9Sstevel@tonic-gate 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2508*7c478bd9Sstevel@tonic-gate 
2509*7c478bd9Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
2510*7c478bd9Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
2511*7c478bd9Sstevel@tonic-gate 
2512*7c478bd9Sstevel@tonic-gate 		/*
2513*7c478bd9Sstevel@tonic-gate 		 * Don't attempt to remove from lgrps that aren't there, that
2514*7c478bd9Sstevel@tonic-gate 		 * don't contain our leaf, or from the leaf itself. (We do that
2515*7c478bd9Sstevel@tonic-gate 		 * later)
2516*7c478bd9Sstevel@tonic-gate 		 */
2517*7c478bd9Sstevel@tonic-gate 
2518*7c478bd9Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur))
2519*7c478bd9Sstevel@tonic-gate 			continue;
2520*7c478bd9Sstevel@tonic-gate 
2521*7c478bd9Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2522*7c478bd9Sstevel@tonic-gate 
2523*7c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2524*7c478bd9Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) ||
2525*7c478bd9Sstevel@tonic-gate 		    (lpl_cur == lpl_leaf)) {
2526*7c478bd9Sstevel@tonic-gate 			continue;
2527*7c478bd9Sstevel@tonic-gate 		}
2528*7c478bd9Sstevel@tonic-gate 
2529*7c478bd9Sstevel@tonic-gate 		/*
2530*7c478bd9Sstevel@tonic-gate 		 * This is a slightly sleazy simplification in that we have
2531*7c478bd9Sstevel@tonic-gate 		 * already marked the cp_lgrpset as no longer containing the
2532*7c478bd9Sstevel@tonic-gate 		 * leaf we've deleted.  Any lpls that pass the above checks
2533*7c478bd9Sstevel@tonic-gate 		 * based upon lgrp membership but not necessarily cpu-part
2534*7c478bd9Sstevel@tonic-gate 		 * membership also get cleared by the checks below.  Currently
2535*7c478bd9Sstevel@tonic-gate 		 * this is harmless, as the lpls should be empty anyway.
2536*7c478bd9Sstevel@tonic-gate 		 *
2537*7c478bd9Sstevel@tonic-gate 		 * In particular, we want to preserve lpls that have additional
2538*7c478bd9Sstevel@tonic-gate 		 * leaf resources, even though we don't yet have a processor
2539*7c478bd9Sstevel@tonic-gate 		 * architecture that represents resources this way.
2540*7c478bd9Sstevel@tonic-gate 		 */
2541*7c478bd9Sstevel@tonic-gate 
2542*7c478bd9Sstevel@tonic-gate 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2543*7c478bd9Sstevel@tonic-gate 		    cpupart->cp_lgrpset);
2544*7c478bd9Sstevel@tonic-gate 
2545*7c478bd9Sstevel@tonic-gate 		lpl_rset_del(lpl_cur, lpl_leaf);
2546*7c478bd9Sstevel@tonic-gate 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2547*7c478bd9Sstevel@tonic-gate 			lpl_clear(lpl_cur);
2548*7c478bd9Sstevel@tonic-gate 		} else {
2549*7c478bd9Sstevel@tonic-gate 			/*
2550*7c478bd9Sstevel@tonic-gate 			 * Update this lpl's children
2551*7c478bd9Sstevel@tonic-gate 			 */
2552*7c478bd9Sstevel@tonic-gate 			lpl_child_update(lpl_cur, cpupart);
2553*7c478bd9Sstevel@tonic-gate 		}
2554*7c478bd9Sstevel@tonic-gate 	}
2555*7c478bd9Sstevel@tonic-gate 	lpl_clear(lpl_leaf);
2556*7c478bd9Sstevel@tonic-gate }
2557*7c478bd9Sstevel@tonic-gate 
2558*7c478bd9Sstevel@tonic-gate /*
2559*7c478bd9Sstevel@tonic-gate  * add a cpu to a partition in terms of lgrp load avg bookkeeping
2560*7c478bd9Sstevel@tonic-gate  *
2561*7c478bd9Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
2562*7c478bd9Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
2563*7c478bd9Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
2564*7c478bd9Sstevel@tonic-gate  * There are two general cases for cpu addition:
2565*7c478bd9Sstevel@tonic-gate  *
2566*7c478bd9Sstevel@tonic-gate  * 1. A lpl structure that contains resources already in the hierarchy tree.
2567*7c478bd9Sstevel@tonic-gate  * In this case, all of the associated lpl relationships have been defined, and
2568*7c478bd9Sstevel@tonic-gate  * all that is necessary is that we link the new cpu into the per-lpl list of
2569*7c478bd9Sstevel@tonic-gate  * cpus, and increment the ncpu count of all places where this cpu resource will
2570*7c478bd9Sstevel@tonic-gate  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2571*7c478bd9Sstevel@tonic-gate  * pushing is accomplished by this routine.
2572*7c478bd9Sstevel@tonic-gate  *
2573*7c478bd9Sstevel@tonic-gate  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2574*7c478bd9Sstevel@tonic-gate  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2575*7c478bd9Sstevel@tonic-gate  * construct the hierarchy of state necessary to name its more distant
2576*7c478bd9Sstevel@tonic-gate  * resources, if they should exist.  The leaf structure is initialized by this
2577*7c478bd9Sstevel@tonic-gate  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2578*7c478bd9Sstevel@tonic-gate  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2579*7c478bd9Sstevel@tonic-gate  * and builds all of the "ancestral" state necessary to identify resources at
2580*7c478bd9Sstevel@tonic-gate  * differing levels of locality.
2581*7c478bd9Sstevel@tonic-gate  */
2582*7c478bd9Sstevel@tonic-gate void
2583*7c478bd9Sstevel@tonic-gate lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2584*7c478bd9Sstevel@tonic-gate {
2585*7c478bd9Sstevel@tonic-gate 	cpupart_t	*cpupart;
2586*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
2587*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
2588*7c478bd9Sstevel@tonic-gate 
2589*7c478bd9Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
2590*7c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2591*7c478bd9Sstevel@tonic-gate 
2592*7c478bd9Sstevel@tonic-gate 	cpupart = cp->cpu_part;
2593*7c478bd9Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lgrpid];
2594*7c478bd9Sstevel@tonic-gate 
2595*7c478bd9Sstevel@tonic-gate 	/* don't add non-existent lgrp */
2596*7c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2597*7c478bd9Sstevel@tonic-gate 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2598*7c478bd9Sstevel@tonic-gate 	cp->cpu_lpl = lpl_leaf;
2599*7c478bd9Sstevel@tonic-gate 
2600*7c478bd9Sstevel@tonic-gate 	/* only leaf lpls contain cpus */
2601*7c478bd9Sstevel@tonic-gate 
2602*7c478bd9Sstevel@tonic-gate 	if (lpl_leaf->lpl_ncpu++ == 0) {
2603*7c478bd9Sstevel@tonic-gate 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2604*7c478bd9Sstevel@tonic-gate 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2605*7c478bd9Sstevel@tonic-gate 		lpl_leaf_insert(lpl_leaf, cpupart);
2606*7c478bd9Sstevel@tonic-gate 	} else {
2607*7c478bd9Sstevel@tonic-gate 		/*
2608*7c478bd9Sstevel@tonic-gate 		 * the lpl should already exist in the parent, so just update
2609*7c478bd9Sstevel@tonic-gate 		 * the count of available CPUs
2610*7c478bd9Sstevel@tonic-gate 		 */
2611*7c478bd9Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2612*7c478bd9Sstevel@tonic-gate 	}
2613*7c478bd9Sstevel@tonic-gate 
2614*7c478bd9Sstevel@tonic-gate 	/* link cpu into list of cpus in lpl */
2615*7c478bd9Sstevel@tonic-gate 
2616*7c478bd9Sstevel@tonic-gate 	if (lpl_leaf->lpl_cpus) {
2617*7c478bd9Sstevel@tonic-gate 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2618*7c478bd9Sstevel@tonic-gate 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2619*7c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2620*7c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2621*7c478bd9Sstevel@tonic-gate 	} else {
2622*7c478bd9Sstevel@tonic-gate 		/*
2623*7c478bd9Sstevel@tonic-gate 		 * We increment ncpu immediately after we create a new leaf
2624*7c478bd9Sstevel@tonic-gate 		 * lpl, so assert that ncpu == 1 for the case where we don't
2625*7c478bd9Sstevel@tonic-gate 		 * have any cpu pointers yet.
2626*7c478bd9Sstevel@tonic-gate 		 */
2627*7c478bd9Sstevel@tonic-gate 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2628*7c478bd9Sstevel@tonic-gate 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2629*7c478bd9Sstevel@tonic-gate 	}
2630*7c478bd9Sstevel@tonic-gate 
2631*7c478bd9Sstevel@tonic-gate }
2632*7c478bd9Sstevel@tonic-gate 
2633*7c478bd9Sstevel@tonic-gate 
2634*7c478bd9Sstevel@tonic-gate /*
2635*7c478bd9Sstevel@tonic-gate  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2636*7c478bd9Sstevel@tonic-gate  *
2637*7c478bd9Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
2638*7c478bd9Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
2639*7c478bd9Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
2640*7c478bd9Sstevel@tonic-gate  * There are two removal cases in question:
2641*7c478bd9Sstevel@tonic-gate  *
2642*7c478bd9Sstevel@tonic-gate  * 1. Removal of the resource in the leaf leaves other resources remaining in
2643*7c478bd9Sstevel@tonic-gate  * that leaf.  (Another cpu still exists at this level of locality).  In this
2644*7c478bd9Sstevel@tonic-gate  * case, the count of available cpus is decremented in all associated lpls by
2645*7c478bd9Sstevel@tonic-gate  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2646*7c478bd9Sstevel@tonic-gate  * from the per-cpu lpl list.
2647*7c478bd9Sstevel@tonic-gate  *
2648*7c478bd9Sstevel@tonic-gate  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2649*7c478bd9Sstevel@tonic-gate  * empty)  In this case, all of what has occurred for the first step must take
2650*7c478bd9Sstevel@tonic-gate  * place; however, additionally we must remove the lpl structure itself, prune
2651*7c478bd9Sstevel@tonic-gate  * out any stranded lpls that do not directly name a leaf resource, and mark the
2652*7c478bd9Sstevel@tonic-gate  * cpu partition in question as no longer containing resources from the lgrp of
2653*7c478bd9Sstevel@tonic-gate  * the lpl that has been deleted.  Cpu-partition changes are handled by this
2654*7c478bd9Sstevel@tonic-gate  * method, but the lpl_leaf_remove function deals with the details of pruning
2655*7c478bd9Sstevel@tonic-gate  * out the empty lpl and any of its orphaned direct ancestors.
2656*7c478bd9Sstevel@tonic-gate  */
2657*7c478bd9Sstevel@tonic-gate void
2658*7c478bd9Sstevel@tonic-gate lgrp_part_del_cpu(cpu_t *cp)
2659*7c478bd9Sstevel@tonic-gate {
2660*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl;
2661*7c478bd9Sstevel@tonic-gate 	lpl_t		*leaf_lpl;
2662*7c478bd9Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
2663*7c478bd9Sstevel@tonic-gate 
2664*7c478bd9Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
2665*7c478bd9Sstevel@tonic-gate 
2666*7c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2667*7c478bd9Sstevel@tonic-gate 
2668*7c478bd9Sstevel@tonic-gate 	lpl = leaf_lpl = cp->cpu_lpl;
2669*7c478bd9Sstevel@tonic-gate 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2670*7c478bd9Sstevel@tonic-gate 
2671*7c478bd9Sstevel@tonic-gate 	/* don't delete a leaf that isn't there */
2672*7c478bd9Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2673*7c478bd9Sstevel@tonic-gate 
2674*7c478bd9Sstevel@tonic-gate 	/* no double-deletes */
2675*7c478bd9Sstevel@tonic-gate 	ASSERT(lpl->lpl_ncpu);
2676*7c478bd9Sstevel@tonic-gate 	if (--lpl->lpl_ncpu == 0) {
2677*7c478bd9Sstevel@tonic-gate 		/*
2678*7c478bd9Sstevel@tonic-gate 		 * This was the last cpu in this lgroup for this partition,
2679*7c478bd9Sstevel@tonic-gate 		 * clear its bit in the partition's lgroup bitmask
2680*7c478bd9Sstevel@tonic-gate 		 */
2681*7c478bd9Sstevel@tonic-gate 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2682*7c478bd9Sstevel@tonic-gate 
2683*7c478bd9Sstevel@tonic-gate 		/* eliminate remaning lpl link pointers in cpu, lpl */
2684*7c478bd9Sstevel@tonic-gate 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2685*7c478bd9Sstevel@tonic-gate 
2686*7c478bd9Sstevel@tonic-gate 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2687*7c478bd9Sstevel@tonic-gate 	} else {
2688*7c478bd9Sstevel@tonic-gate 
2689*7c478bd9Sstevel@tonic-gate 		/* unlink cpu from lists of cpus in lpl */
2690*7c478bd9Sstevel@tonic-gate 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2691*7c478bd9Sstevel@tonic-gate 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2692*7c478bd9Sstevel@tonic-gate 		if (lpl->lpl_cpus == cp) {
2693*7c478bd9Sstevel@tonic-gate 			lpl->lpl_cpus = cp->cpu_next_lpl;
2694*7c478bd9Sstevel@tonic-gate 		}
2695*7c478bd9Sstevel@tonic-gate 
2696*7c478bd9Sstevel@tonic-gate 		/*
2697*7c478bd9Sstevel@tonic-gate 		 * Update the cpu count in the lpls associated with parent
2698*7c478bd9Sstevel@tonic-gate 		 * lgroups.
2699*7c478bd9Sstevel@tonic-gate 		 */
2700*7c478bd9Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2701*7c478bd9Sstevel@tonic-gate 
2702*7c478bd9Sstevel@tonic-gate 	}
2703*7c478bd9Sstevel@tonic-gate 	/* clear cpu's lpl ptr when we're all done */
2704*7c478bd9Sstevel@tonic-gate 	cp->cpu_lpl = NULL;
2705*7c478bd9Sstevel@tonic-gate }
2706*7c478bd9Sstevel@tonic-gate 
/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 *	lpl	leaf lpl whose load is updated; the update is then repeated
 *		for every ancestor reached through lpl->lpl_parent
 *	nrcpus	caller-scaled load sample folded in when aging
 *		(TODO(review): confirm units/scaling against callers)
 *	ageflag	nonzero: decay the existing load while folding in nrcpus;
 *		zero: only add the per-interval quantum f (used to account
 *		for a remotely executing thread, see below)
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request).  Since the clock thread can not be preempted (since it
 * runs at highest priority), we know that cpu partitions can not change
 * (since doing so would require either the repartition requester or the
 * cpu_pause thread to run on this cpu), so we can update the cpu's load
 * without grabbing cpu_lock.
 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t		ncpu;
	int64_t		old, new, f;

	/*
	 * Fixed-point decay-factor table indexed by cpu count:
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	for (;;) {

		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu.  We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			do {
				old = new = lpl->lpl_loadavg;
				/*
				 * Split the old load into (scaled) integer
				 * and fraction parts so the multiply by f
				 * below cannot overflow 64 bits.
				 */
				q = (old  >> 16) << 7;
				r = (old  & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}
2809*7c478bd9Sstevel@tonic-gate 
2810*7c478bd9Sstevel@tonic-gate /*
2811*7c478bd9Sstevel@tonic-gate  * Initialize lpl topology in the target based on topology currently present in
2812*7c478bd9Sstevel@tonic-gate  * lpl_bootstrap.
2813*7c478bd9Sstevel@tonic-gate  *
2814*7c478bd9Sstevel@tonic-gate  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2815*7c478bd9Sstevel@tonic-gate  * initialize cp_default list of lpls. Up to this point all topology operations
2816*7c478bd9Sstevel@tonic-gate  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2817*7c478bd9Sstevel@tonic-gate  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2818*7c478bd9Sstevel@tonic-gate  * `target' points to the list of lpls in cp_default and `size' is the size of
2819*7c478bd9Sstevel@tonic-gate  * this list.
2820*7c478bd9Sstevel@tonic-gate  *
2821*7c478bd9Sstevel@tonic-gate  * This function walks the lpl topology in lpl_bootstrap and does for things:
2822*7c478bd9Sstevel@tonic-gate  *
2823*7c478bd9Sstevel@tonic-gate  * 1) Copies all fields from lpl_bootstrap to the target.
2824*7c478bd9Sstevel@tonic-gate  *
2825*7c478bd9Sstevel@tonic-gate  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2826*7c478bd9Sstevel@tonic-gate  *
2827*7c478bd9Sstevel@tonic-gate  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2828*7c478bd9Sstevel@tonic-gate  *    instead of lpl_bootstrap.
2829*7c478bd9Sstevel@tonic-gate  *
2830*7c478bd9Sstevel@tonic-gate  * 4) Updates pointers in the resource list of the target to point to the lpls
2831*7c478bd9Sstevel@tonic-gate  *    in the target list instead of lpl_bootstrap.
2832*7c478bd9Sstevel@tonic-gate  *
2833*7c478bd9Sstevel@tonic-gate  * After lpl_topo_bootstrap() completes, target contains the same information
2834*7c478bd9Sstevel@tonic-gate  * that would be present there if it were used during boot instead of
2835*7c478bd9Sstevel@tonic-gate  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2836*7c478bd9Sstevel@tonic-gate  * and it is bzeroed.
2837*7c478bd9Sstevel@tonic-gate  */
2838*7c478bd9Sstevel@tonic-gate void
2839*7c478bd9Sstevel@tonic-gate lpl_topo_bootstrap(lpl_t *target, int size)
2840*7c478bd9Sstevel@tonic-gate {
2841*7c478bd9Sstevel@tonic-gate 	lpl_t	*lpl = lpl_bootstrap;
2842*7c478bd9Sstevel@tonic-gate 	lpl_t	*target_lpl = target;
2843*7c478bd9Sstevel@tonic-gate 	int	howmany;
2844*7c478bd9Sstevel@tonic-gate 	int	id;
2845*7c478bd9Sstevel@tonic-gate 	int	i;
2846*7c478bd9Sstevel@tonic-gate 
2847*7c478bd9Sstevel@tonic-gate 	/*
2848*7c478bd9Sstevel@tonic-gate 	 * The only target that should be passed here is cp_default lpl list.
2849*7c478bd9Sstevel@tonic-gate 	 */
2850*7c478bd9Sstevel@tonic-gate 	ASSERT(target == cp_default.cp_lgrploads);
2851*7c478bd9Sstevel@tonic-gate 	ASSERT(size == cp_default.cp_nlgrploads);
2852*7c478bd9Sstevel@tonic-gate 	ASSERT(!lgrp_topo_initialized);
2853*7c478bd9Sstevel@tonic-gate 	ASSERT(ncpus == 1);
2854*7c478bd9Sstevel@tonic-gate 
2855*7c478bd9Sstevel@tonic-gate 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2856*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2857*7c478bd9Sstevel@tonic-gate 		/*
2858*7c478bd9Sstevel@tonic-gate 		 * Copy all fields from lpl.
2859*7c478bd9Sstevel@tonic-gate 		 */
2860*7c478bd9Sstevel@tonic-gate 
2861*7c478bd9Sstevel@tonic-gate 		*target_lpl = *lpl;
2862*7c478bd9Sstevel@tonic-gate 
2863*7c478bd9Sstevel@tonic-gate 		/*
2864*7c478bd9Sstevel@tonic-gate 		 * Substitute CPU0 lpl pointer with one relative to target.
2865*7c478bd9Sstevel@tonic-gate 		 */
2866*7c478bd9Sstevel@tonic-gate 		if (lpl->lpl_cpus == CPU) {
2867*7c478bd9Sstevel@tonic-gate 			ASSERT(CPU->cpu_lpl == lpl);
2868*7c478bd9Sstevel@tonic-gate 			CPU->cpu_lpl = target_lpl;
2869*7c478bd9Sstevel@tonic-gate 		}
2870*7c478bd9Sstevel@tonic-gate 
2871*7c478bd9Sstevel@tonic-gate 		/*
2872*7c478bd9Sstevel@tonic-gate 		 * Substitute parent information with parent relative to target.
2873*7c478bd9Sstevel@tonic-gate 		 */
2874*7c478bd9Sstevel@tonic-gate 		if (lpl->lpl_parent != NULL)
2875*7c478bd9Sstevel@tonic-gate 			target_lpl->lpl_parent = (lpl_t *)
2876*7c478bd9Sstevel@tonic-gate 			    (((uintptr_t)lpl->lpl_parent -
2877*7c478bd9Sstevel@tonic-gate 				(uintptr_t)lpl_bootstrap) +
2878*7c478bd9Sstevel@tonic-gate 				(uintptr_t)target);
2879*7c478bd9Sstevel@tonic-gate 
2880*7c478bd9Sstevel@tonic-gate 		/*
2881*7c478bd9Sstevel@tonic-gate 		 * Walk over resource set substituting pointers relative to
2882*7c478bd9Sstevel@tonic-gate 		 * lpl_bootstrap to pointers relative to target.
2883*7c478bd9Sstevel@tonic-gate 		 */
2884*7c478bd9Sstevel@tonic-gate 		ASSERT(lpl->lpl_nrset <= 1);
2885*7c478bd9Sstevel@tonic-gate 
2886*7c478bd9Sstevel@tonic-gate 		for (id = 0; id < lpl->lpl_nrset; id++) {
2887*7c478bd9Sstevel@tonic-gate 			if (lpl->lpl_rset[id] != NULL) {
2888*7c478bd9Sstevel@tonic-gate 				target_lpl->lpl_rset[id] =
2889*7c478bd9Sstevel@tonic-gate 				    (lpl_t *)
2890*7c478bd9Sstevel@tonic-gate 				    (((uintptr_t)lpl->lpl_rset[id] -
2891*7c478bd9Sstevel@tonic-gate 					(uintptr_t)lpl_bootstrap) +
2892*7c478bd9Sstevel@tonic-gate 					(uintptr_t)target);
2893*7c478bd9Sstevel@tonic-gate 			}
2894*7c478bd9Sstevel@tonic-gate 		}
2895*7c478bd9Sstevel@tonic-gate 	}
2896*7c478bd9Sstevel@tonic-gate 
2897*7c478bd9Sstevel@tonic-gate 	/*
2898*7c478bd9Sstevel@tonic-gate 	 * Topology information in lpl_bootstrap is no longer needed.
2899*7c478bd9Sstevel@tonic-gate 	 */
2900*7c478bd9Sstevel@tonic-gate 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2901*7c478bd9Sstevel@tonic-gate }
2902*7c478bd9Sstevel@tonic-gate 
/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

/* per-lgroup threshold: the tunable scaled by the lgroup's cpu count */
#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

/* per-lgroup difference: the tunable scaled by the lgroup's cpu count */
#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / ncpu)

/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

/* current placement policy; see lpl_pick() for how ties are broken */
int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2969*7c478bd9Sstevel@tonic-gate 
/*
 * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
 * be bound to a CPU or processor set.
 *
 * Arguments:
 *	t		The thread
 *	cpupart		The partition the thread belongs to.
 *
 * Returns the chosen lpl (never NULL); also updates the partition's
 * round-robin hint and the chosen lpl's lpl_homed_time.
 *
 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
 *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
 *	 partitions changing out from under us and assumes that given thread is
 *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
 *	 disabled, so don't grab any locks because we should never block under
 *	 those conditions.
 */
lpl_t *
lgrp_choose(kthread_t *t, cpupart_t *cpupart)
{
	lgrp_load_t	bestload, bestrload;
	int		lgrpid_offset, lgrp_count;
	lgrp_id_t	lgrpid, lgrpid_start;
	lpl_t		*lpl, *bestlpl, *bestrlpl;
	klgrpset_t	lgrpset;
	proc_t		*p;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));
	ASSERT(cpupart != NULL);

	p = t->t_procp;

	/* A process should always be in an active partition */
	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));

	bestlpl = bestrlpl = NULL;
	bestload = bestrload = LGRP_LOADAVG_MAX;
	lgrpset = cpupart->cp_lgrpset;

	/*
	 * Pick the lgroup at which the search below starts.  Rotating the
	 * start point spreads placements across the partition's lgroups.
	 */
	switch (lgrp_choose_policy) {
	case LGRP_CHOOSE_RR:
		/* next partition member after the last hint, wrapping */
		lgrpid = cpupart->cp_lgrp_hint;
		do {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;
		} while (!klgrpset_ismember(lgrpset, lgrpid));

		break;
	default:
	case LGRP_CHOOSE_TIME:
	case LGRP_CHOOSE_RANDOM:
		/* pseudo-random partition member, seeded from hrtime */
		klgrpset_nlgrps(lgrpset, lgrp_count);
		lgrpid_offset =
		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
		for (lgrpid = 0; ; lgrpid++) {
			if (klgrpset_ismember(lgrpset, lgrpid)) {
				if (--lgrpid_offset == 0)
					break;
			}
		}
		break;
	}

	lgrpid_start = lgrpid;

	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
	    lgrp_id_t, cpupart->cp_lgrp_hint);

	/*
	 * Use lgroup affinities (if any) to choose best lgroup
	 *
	 * NOTE: Assumes that thread is protected from going away and its
	 *	 lgroup affinities won't change (ie. p_lock, or
	 *	 thread_lock() being held and/or CPUs paused)
	 */
	if (t->t_lgrp_affinity) {
		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
		if (lpl != NULL)
			return (lpl);
	}

	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
	/*
	 * NOTE(review): bestlpl is preseeded with the starting lgroup, but
	 * bestload stays at LGRP_LOADAVG_MAX until lpl_pick() accepts a
	 * candidate (lpl_pick(lpl, lpl) returns 0 for the start lgroup
	 * itself), and the start lgroup has not been screened by the
	 * leaf/free-memory checks below.  Confirm the threshold and
	 * expansion tests near the end behave as intended in that case.
	 */
	bestlpl = &cpupart->cp_lgrploads[lgrpid_start];

	do {
		pgcnt_t	npgs;

		/*
		 * Skip any lgroups outside of thread's pset
		 */
		if (!klgrpset_ismember(lgrpset, lgrpid)) {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;	/* wrap the search */
			continue;
		}

		/*
		 * Skip any non-leaf lgroups
		 */
		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
			continue;

		/*
		 * Skip any lgroups without enough free memory
		 * (when threshold set to nonzero positive value)
		 */
		if (lgrp_mem_free_thresh > 0) {
			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
			if (npgs < lgrp_mem_free_thresh) {
				if (++lgrpid > lgrp_alloc_max)
					lgrpid = 0;	/* wrap the search */
				continue;
			}
		}

		lpl = &cpupart->cp_lgrploads[lgrpid];
		if (klgrpset_isempty(p->p_lgrpset) ||
		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
			/*
			 * Either this is a new process or the process already
			 * has threads on this lgrp, so this is a preferred
			 * lgroup for the thread.
			 */
			if (lpl_pick(lpl, bestlpl)) {
				bestload = lpl->lpl_loadavg;
				bestlpl = lpl;
			}
		} else {
			/*
			 * The process doesn't have any threads on this lgrp,
			 * but we're willing to consider this lgrp if the load
			 * difference is big enough to justify splitting up
			 * the process' threads.
			 */
			if (lpl_pick(lpl, bestrlpl)) {
				bestrload = lpl->lpl_loadavg;
				bestrlpl = lpl;
			}
		}
		if (++lgrpid > lgrp_alloc_max)
			lgrpid = 0;	/* wrap the search */
	} while (lgrpid != lgrpid_start);

	/*
	 * Return root lgroup if threshold isn't set to maximum value and
	 * lowest lgroup load average more than a certain threshold
	 */
	if (lgrp_load_thresh != UINT32_MAX &&
	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);

	/*
	 * If all the lgroups over which the thread's process is spread are
	 * heavily loaded, we'll consider placing the thread on one of the
	 * other leaf lgroups in the thread's partition.
	 */
	if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
	    (bestrload < bestload) &&	/* paranoid about wraparound */
	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
	    bestload)) {
		bestlpl = bestrlpl;
	}

	/* remember where we homed this thread for RR/TIME tie-breaking */
	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
	bestlpl->lpl_homed_time = gethrtime_unscaled();

	ASSERT(bestlpl->lpl_ncpu > 0);
	return (bestlpl);
}
3139*7c478bd9Sstevel@tonic-gate 
3140*7c478bd9Sstevel@tonic-gate /*
3141*7c478bd9Sstevel@tonic-gate  * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing.
3142*7c478bd9Sstevel@tonic-gate  */
3143*7c478bd9Sstevel@tonic-gate static int
3144*7c478bd9Sstevel@tonic-gate lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3145*7c478bd9Sstevel@tonic-gate {
3146*7c478bd9Sstevel@tonic-gate 	lgrp_load_t	l1, l2;
3147*7c478bd9Sstevel@tonic-gate 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3148*7c478bd9Sstevel@tonic-gate 
3149*7c478bd9Sstevel@tonic-gate 
3150*7c478bd9Sstevel@tonic-gate 	if (lpl2 == NULL)
3151*7c478bd9Sstevel@tonic-gate 		return (1);
3152*7c478bd9Sstevel@tonic-gate 
3153*7c478bd9Sstevel@tonic-gate 	l1 = lpl1->lpl_loadavg;
3154*7c478bd9Sstevel@tonic-gate 	l2 = lpl2->lpl_loadavg;
3155*7c478bd9Sstevel@tonic-gate 
3156*7c478bd9Sstevel@tonic-gate 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3157*7c478bd9Sstevel@tonic-gate 		/* lpl1 is significantly less loaded than lpl2 */
3158*7c478bd9Sstevel@tonic-gate 		return (1);
3159*7c478bd9Sstevel@tonic-gate 	}
3160*7c478bd9Sstevel@tonic-gate 
3161*7c478bd9Sstevel@tonic-gate 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3162*7c478bd9Sstevel@tonic-gate 	    l1 + tolerance >= l2 && l1 < l2 &&
3163*7c478bd9Sstevel@tonic-gate 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3164*7c478bd9Sstevel@tonic-gate 		/*
3165*7c478bd9Sstevel@tonic-gate 		 * lpl1's load is within the tolerance of lpl2. We're
3166*7c478bd9Sstevel@tonic-gate 		 * willing to consider it be to better however if
3167*7c478bd9Sstevel@tonic-gate 		 * it has been longer since we last homed a thread there
3168*7c478bd9Sstevel@tonic-gate 		 */
3169*7c478bd9Sstevel@tonic-gate 		return (1);
3170*7c478bd9Sstevel@tonic-gate 	}
3171*7c478bd9Sstevel@tonic-gate 
3172*7c478bd9Sstevel@tonic-gate 	return (0);
3173*7c478bd9Sstevel@tonic-gate }
3174*7c478bd9Sstevel@tonic-gate 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;		/* tunable copy of the default */
3183*7c478bd9Sstevel@tonic-gate 
3184*7c478bd9Sstevel@tonic-gate /*
3185*7c478bd9Sstevel@tonic-gate  * Routine to change a thread's lgroup affiliation.  This routine updates
3186*7c478bd9Sstevel@tonic-gate  * the thread's kthread_t struct and its process' proc_t struct to note the
3187*7c478bd9Sstevel@tonic-gate  * thread's new lgroup affiliation, and its lgroup affinities.
3188*7c478bd9Sstevel@tonic-gate  *
3189*7c478bd9Sstevel@tonic-gate  * Note that this is the only routine that modifies a thread's t_lpl field,
3190*7c478bd9Sstevel@tonic-gate  * and that adds in or removes anticipatory load.
3191*7c478bd9Sstevel@tonic-gate  *
3192*7c478bd9Sstevel@tonic-gate  * If the thread is exiting, newlpl is NULL.
3193*7c478bd9Sstevel@tonic-gate  *
3194*7c478bd9Sstevel@tonic-gate  * Locking:
3195*7c478bd9Sstevel@tonic-gate  * The following lock must be held on entry:
3196*7c478bd9Sstevel@tonic-gate  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3197*7c478bd9Sstevel@tonic-gate  *		doesn't get removed from t's partition
3198*7c478bd9Sstevel@tonic-gate  *
3199*7c478bd9Sstevel@tonic-gate  * This routine is not allowed to grab any locks, since it may be called
3200*7c478bd9Sstevel@tonic-gate  * with cpus paused (such as from cpu_offline).
3201*7c478bd9Sstevel@tonic-gate  */
3202*7c478bd9Sstevel@tonic-gate void
3203*7c478bd9Sstevel@tonic-gate lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3204*7c478bd9Sstevel@tonic-gate {
3205*7c478bd9Sstevel@tonic-gate 	proc_t		*p;
3206*7c478bd9Sstevel@tonic-gate 	lpl_t		*lpl, *oldlpl;
3207*7c478bd9Sstevel@tonic-gate 	lgrp_id_t	oldid;
3208*7c478bd9Sstevel@tonic-gate 	kthread_t	*tp;
3209*7c478bd9Sstevel@tonic-gate 	uint_t		ncpu;
3210*7c478bd9Sstevel@tonic-gate 	lgrp_load_t	old, new;
3211*7c478bd9Sstevel@tonic-gate 
3212*7c478bd9Sstevel@tonic-gate 	ASSERT(t);
3213*7c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3214*7c478bd9Sstevel@tonic-gate 	    THREAD_LOCK_HELD(t));
3215*7c478bd9Sstevel@tonic-gate 
3216*7c478bd9Sstevel@tonic-gate 	/*
3217*7c478bd9Sstevel@tonic-gate 	 * If not changing lpls, just return
3218*7c478bd9Sstevel@tonic-gate 	 */
3219*7c478bd9Sstevel@tonic-gate 	if ((oldlpl = t->t_lpl) == newlpl)
3220*7c478bd9Sstevel@tonic-gate 		return;
3221*7c478bd9Sstevel@tonic-gate 
3222*7c478bd9Sstevel@tonic-gate 	/*
3223*7c478bd9Sstevel@tonic-gate 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3224*7c478bd9Sstevel@tonic-gate 	 * associated with process 0 rather than with its original process).
3225*7c478bd9Sstevel@tonic-gate 	 */
3226*7c478bd9Sstevel@tonic-gate 	if (t->t_proc_flag & TP_LWPEXIT) {
3227*7c478bd9Sstevel@tonic-gate 		if (newlpl != NULL) {
3228*7c478bd9Sstevel@tonic-gate 			t->t_lpl = newlpl;
3229*7c478bd9Sstevel@tonic-gate 		}
3230*7c478bd9Sstevel@tonic-gate 		return;
3231*7c478bd9Sstevel@tonic-gate 	}
3232*7c478bd9Sstevel@tonic-gate 
3233*7c478bd9Sstevel@tonic-gate 	p = ttoproc(t);
3234*7c478bd9Sstevel@tonic-gate 
3235*7c478bd9Sstevel@tonic-gate 	/*
3236*7c478bd9Sstevel@tonic-gate 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3237*7c478bd9Sstevel@tonic-gate 	 * to account for it being moved from its old lgroup.
3238*7c478bd9Sstevel@tonic-gate 	 */
3239*7c478bd9Sstevel@tonic-gate 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3240*7c478bd9Sstevel@tonic-gate 	    (p->p_tlist != NULL)) {
3241*7c478bd9Sstevel@tonic-gate 		oldid = oldlpl->lpl_lgrpid;
3242*7c478bd9Sstevel@tonic-gate 
3243*7c478bd9Sstevel@tonic-gate 		if (newlpl != NULL)
3244*7c478bd9Sstevel@tonic-gate 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3245*7c478bd9Sstevel@tonic-gate 
3246*7c478bd9Sstevel@tonic-gate 		if ((do_lgrpset_delete) &&
3247*7c478bd9Sstevel@tonic-gate 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3248*7c478bd9Sstevel@tonic-gate 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3249*7c478bd9Sstevel@tonic-gate 				/*
3250*7c478bd9Sstevel@tonic-gate 				 * Check if a thread other than the thread
3251*7c478bd9Sstevel@tonic-gate 				 * that's moving is assigned to the same
3252*7c478bd9Sstevel@tonic-gate 				 * lgroup as the thread that's moving.  Note
3253*7c478bd9Sstevel@tonic-gate 				 * that we have to compare lgroup IDs, rather
3254*7c478bd9Sstevel@tonic-gate 				 * than simply comparing t_lpl's, since the
3255*7c478bd9Sstevel@tonic-gate 				 * threads may belong to different partitions
3256*7c478bd9Sstevel@tonic-gate 				 * but be assigned to the same lgroup.
3257*7c478bd9Sstevel@tonic-gate 				 */
3258*7c478bd9Sstevel@tonic-gate 				ASSERT(tp->t_lpl != NULL);
3259*7c478bd9Sstevel@tonic-gate 
3260*7c478bd9Sstevel@tonic-gate 				if ((tp != t) &&
3261*7c478bd9Sstevel@tonic-gate 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3262*7c478bd9Sstevel@tonic-gate 					/*
3263*7c478bd9Sstevel@tonic-gate 					 * Another thread is assigned to the
3264*7c478bd9Sstevel@tonic-gate 					 * same lgroup as the thread that's
3265*7c478bd9Sstevel@tonic-gate 					 * moving, p_lgrpset doesn't change.
3266*7c478bd9Sstevel@tonic-gate 					 */
3267*7c478bd9Sstevel@tonic-gate 					break;
3268*7c478bd9Sstevel@tonic-gate 				} else if (tp == p->p_tlist) {
3269*7c478bd9Sstevel@tonic-gate 					/*
3270*7c478bd9Sstevel@tonic-gate 					 * No other thread is assigned to the
3271*7c478bd9Sstevel@tonic-gate 					 * same lgroup as the exiting thread,
3272*7c478bd9Sstevel@tonic-gate 					 * clear the lgroup's bit in p_lgrpset.
3273*7c478bd9Sstevel@tonic-gate 					 */
3274*7c478bd9Sstevel@tonic-gate 					klgrpset_del(p->p_lgrpset, oldid);
3275*7c478bd9Sstevel@tonic-gate 					break;
3276*7c478bd9Sstevel@tonic-gate 				}
3277*7c478bd9Sstevel@tonic-gate 			}
3278*7c478bd9Sstevel@tonic-gate 		}
3279*7c478bd9Sstevel@tonic-gate 
3280*7c478bd9Sstevel@tonic-gate 		/*
3281*7c478bd9Sstevel@tonic-gate 		 * If this thread was assigned to its old lgroup for such a
3282*7c478bd9Sstevel@tonic-gate 		 * short amount of time that the anticipatory load that was
3283*7c478bd9Sstevel@tonic-gate 		 * added on its behalf has aged very little, remove that
3284*7c478bd9Sstevel@tonic-gate 		 * anticipatory load.
3285*7c478bd9Sstevel@tonic-gate 		 */
3286*7c478bd9Sstevel@tonic-gate 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3287*7c478bd9Sstevel@tonic-gate 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3288*7c478bd9Sstevel@tonic-gate 			lpl = oldlpl;
3289*7c478bd9Sstevel@tonic-gate 			for (;;) {
3290*7c478bd9Sstevel@tonic-gate 				do {
3291*7c478bd9Sstevel@tonic-gate 					old = new = lpl->lpl_loadavg;
3292*7c478bd9Sstevel@tonic-gate 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3293*7c478bd9Sstevel@tonic-gate 					if (new > old) {
3294*7c478bd9Sstevel@tonic-gate 						/*
3295*7c478bd9Sstevel@tonic-gate 						 * this can happen if the load
3296*7c478bd9Sstevel@tonic-gate 						 * average was aged since we
3297*7c478bd9Sstevel@tonic-gate 						 * added in the anticipatory
3298*7c478bd9Sstevel@tonic-gate 						 * load
3299*7c478bd9Sstevel@tonic-gate 						 */
3300*7c478bd9Sstevel@tonic-gate 						new = 0;
3301*7c478bd9Sstevel@tonic-gate 					}
3302*7c478bd9Sstevel@tonic-gate 				} while (cas32(
3303*7c478bd9Sstevel@tonic-gate 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3304*7c478bd9Sstevel@tonic-gate 					    new) != old);
3305*7c478bd9Sstevel@tonic-gate 
3306*7c478bd9Sstevel@tonic-gate 				lpl = lpl->lpl_parent;
3307*7c478bd9Sstevel@tonic-gate 				if (lpl == NULL)
3308*7c478bd9Sstevel@tonic-gate 					break;
3309*7c478bd9Sstevel@tonic-gate 
3310*7c478bd9Sstevel@tonic-gate 				ncpu = lpl->lpl_ncpu;
3311*7c478bd9Sstevel@tonic-gate 				ASSERT(ncpu > 0);
3312*7c478bd9Sstevel@tonic-gate 			}
3313*7c478bd9Sstevel@tonic-gate 		}
3314*7c478bd9Sstevel@tonic-gate 	}
3315*7c478bd9Sstevel@tonic-gate 	/*
3316*7c478bd9Sstevel@tonic-gate 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3317*7c478bd9Sstevel@tonic-gate 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3318*7c478bd9Sstevel@tonic-gate 	 * to its new lgroup to account for its move to its new lgroup.
3319*7c478bd9Sstevel@tonic-gate 	 */
3320*7c478bd9Sstevel@tonic-gate 	if (newlpl != NULL) {
3321*7c478bd9Sstevel@tonic-gate 		/*
3322*7c478bd9Sstevel@tonic-gate 		 * This thread is moving to a new lgroup
3323*7c478bd9Sstevel@tonic-gate 		 */
3324*7c478bd9Sstevel@tonic-gate 		t->t_lpl = newlpl;
3325*7c478bd9Sstevel@tonic-gate 
3326*7c478bd9Sstevel@tonic-gate 		/*
3327*7c478bd9Sstevel@tonic-gate 		 * Reflect move in load average of new lgroup
3328*7c478bd9Sstevel@tonic-gate 		 * unless it is root lgroup
3329*7c478bd9Sstevel@tonic-gate 		 */
3330*7c478bd9Sstevel@tonic-gate 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3331*7c478bd9Sstevel@tonic-gate 			return;
3332*7c478bd9Sstevel@tonic-gate 
3333*7c478bd9Sstevel@tonic-gate 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3334*7c478bd9Sstevel@tonic-gate 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3335*7c478bd9Sstevel@tonic-gate 		}
3336*7c478bd9Sstevel@tonic-gate 
3337*7c478bd9Sstevel@tonic-gate 		/*
3338*7c478bd9Sstevel@tonic-gate 		 * It'll take some time for the load on the new lgroup
3339*7c478bd9Sstevel@tonic-gate 		 * to reflect this thread's placement on it.  We'd
3340*7c478bd9Sstevel@tonic-gate 		 * like not, however, to have all threads between now
3341*7c478bd9Sstevel@tonic-gate 		 * and then also piling on to this lgroup.  To avoid
3342*7c478bd9Sstevel@tonic-gate 		 * this pileup, we anticipate the load this thread
3343*7c478bd9Sstevel@tonic-gate 		 * will generate on its new lgroup.  The goal is to
3344*7c478bd9Sstevel@tonic-gate 		 * make the lgroup's load appear as though the thread
3345*7c478bd9Sstevel@tonic-gate 		 * had been there all along.  We're very conservative
3346*7c478bd9Sstevel@tonic-gate 		 * in calculating this anticipatory load, we assume
3347*7c478bd9Sstevel@tonic-gate 		 * the worst case case (100% CPU-bound thread).  This
3348*7c478bd9Sstevel@tonic-gate 		 * may be modified in the future to be more accurate.
3349*7c478bd9Sstevel@tonic-gate 		 */
3350*7c478bd9Sstevel@tonic-gate 		lpl = newlpl;
3351*7c478bd9Sstevel@tonic-gate 		for (;;) {
3352*7c478bd9Sstevel@tonic-gate 			ncpu = lpl->lpl_ncpu;
3353*7c478bd9Sstevel@tonic-gate 			ASSERT(ncpu > 0);
3354*7c478bd9Sstevel@tonic-gate 			do {
3355*7c478bd9Sstevel@tonic-gate 				old = new = lpl->lpl_loadavg;
3356*7c478bd9Sstevel@tonic-gate 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3357*7c478bd9Sstevel@tonic-gate 				/*
3358*7c478bd9Sstevel@tonic-gate 				 * Check for overflow
3359*7c478bd9Sstevel@tonic-gate 				 * Underflow not possible here
3360*7c478bd9Sstevel@tonic-gate 				 */
3361*7c478bd9Sstevel@tonic-gate 				if (new < old)
3362*7c478bd9Sstevel@tonic-gate 					new = UINT32_MAX;
3363*7c478bd9Sstevel@tonic-gate 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3364*7c478bd9Sstevel@tonic-gate 			    new) != old);
3365*7c478bd9Sstevel@tonic-gate 
3366*7c478bd9Sstevel@tonic-gate 			lpl = lpl->lpl_parent;
3367*7c478bd9Sstevel@tonic-gate 			if (lpl == NULL)
3368*7c478bd9Sstevel@tonic-gate 				break;
3369*7c478bd9Sstevel@tonic-gate 		}
3370*7c478bd9Sstevel@tonic-gate 		t->t_anttime = gethrtime();
3371*7c478bd9Sstevel@tonic-gate 	}
3372*7c478bd9Sstevel@tonic-gate }
3373*7c478bd9Sstevel@tonic-gate 
3374*7c478bd9Sstevel@tonic-gate /*
3375*7c478bd9Sstevel@tonic-gate  * Return lgroup memory allocation policy given advice from madvise(3C)
3376*7c478bd9Sstevel@tonic-gate  */
3377*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_t
3378*7c478bd9Sstevel@tonic-gate lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3379*7c478bd9Sstevel@tonic-gate {
3380*7c478bd9Sstevel@tonic-gate 	switch (advice) {
3381*7c478bd9Sstevel@tonic-gate 	case MADV_ACCESS_LWP:
3382*7c478bd9Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_NEXT);
3383*7c478bd9Sstevel@tonic-gate 	case MADV_ACCESS_MANY:
3384*7c478bd9Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_RANDOM);
3385*7c478bd9Sstevel@tonic-gate 	default:
3386*7c478bd9Sstevel@tonic-gate 		return (lgrp_mem_policy_default(size, type));
3387*7c478bd9Sstevel@tonic-gate 	}
3388*7c478bd9Sstevel@tonic-gate }
3389*7c478bd9Sstevel@tonic-gate 
3390*7c478bd9Sstevel@tonic-gate /*
3391*7c478bd9Sstevel@tonic-gate  * Figure out default policy
3392*7c478bd9Sstevel@tonic-gate  */
3393*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_t
3394*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_default(size_t size, int type)
3395*7c478bd9Sstevel@tonic-gate {
3396*7c478bd9Sstevel@tonic-gate 	cpupart_t		*cp;
3397*7c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_t	policy;
3398*7c478bd9Sstevel@tonic-gate 	size_t			pset_mem_size;
3399*7c478bd9Sstevel@tonic-gate 
3400*7c478bd9Sstevel@tonic-gate 	/*
3401*7c478bd9Sstevel@tonic-gate 	 * Randomly allocate memory across lgroups for shared memory
3402*7c478bd9Sstevel@tonic-gate 	 * beyond a certain threshold
3403*7c478bd9Sstevel@tonic-gate 	 */
3404*7c478bd9Sstevel@tonic-gate 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3405*7c478bd9Sstevel@tonic-gate 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3406*7c478bd9Sstevel@tonic-gate 		/*
3407*7c478bd9Sstevel@tonic-gate 		 * Get total memory size of current thread's pset
3408*7c478bd9Sstevel@tonic-gate 		 */
3409*7c478bd9Sstevel@tonic-gate 		kpreempt_disable();
3410*7c478bd9Sstevel@tonic-gate 		cp = curthread->t_cpupart;
3411*7c478bd9Sstevel@tonic-gate 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3412*7c478bd9Sstevel@tonic-gate 		kpreempt_enable();
3413*7c478bd9Sstevel@tonic-gate 
3414*7c478bd9Sstevel@tonic-gate 		/*
3415*7c478bd9Sstevel@tonic-gate 		 * Choose policy to randomly allocate memory across
3416*7c478bd9Sstevel@tonic-gate 		 * lgroups in pset if it will fit and is not default
3417*7c478bd9Sstevel@tonic-gate 		 * partition.  Otherwise, allocate memory randomly
3418*7c478bd9Sstevel@tonic-gate 		 * across machine.
3419*7c478bd9Sstevel@tonic-gate 		 */
3420*7c478bd9Sstevel@tonic-gate 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3421*7c478bd9Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3422*7c478bd9Sstevel@tonic-gate 		else
3423*7c478bd9Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM;
3424*7c478bd9Sstevel@tonic-gate 	} else
3425*7c478bd9Sstevel@tonic-gate 		/*
3426*7c478bd9Sstevel@tonic-gate 		 * Apply default policy for private memory and
3427*7c478bd9Sstevel@tonic-gate 		 * shared memory under the respective random
3428*7c478bd9Sstevel@tonic-gate 		 * threshold.
3429*7c478bd9Sstevel@tonic-gate 		 */
3430*7c478bd9Sstevel@tonic-gate 		policy = lgrp_mem_default_policy;
3431*7c478bd9Sstevel@tonic-gate 
3432*7c478bd9Sstevel@tonic-gate 	return (policy);
3433*7c478bd9Sstevel@tonic-gate }
3434*7c478bd9Sstevel@tonic-gate 
3435*7c478bd9Sstevel@tonic-gate /*
3436*7c478bd9Sstevel@tonic-gate  * Get memory allocation policy for this segment
3437*7c478bd9Sstevel@tonic-gate  */
3438*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_info_t *
3439*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3440*7c478bd9Sstevel@tonic-gate {
3441*7c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
3442*7c478bd9Sstevel@tonic-gate 	extern struct seg_ops	segspt_ops;
3443*7c478bd9Sstevel@tonic-gate 	extern struct seg_ops	segspt_shmops;
3444*7c478bd9Sstevel@tonic-gate 
3445*7c478bd9Sstevel@tonic-gate 	/*
3446*7c478bd9Sstevel@tonic-gate 	 * This is for binary compatibility to protect against third party
3447*7c478bd9Sstevel@tonic-gate 	 * segment drivers which haven't recompiled to allow for
3448*7c478bd9Sstevel@tonic-gate 	 * SEGOP_GETPOLICY()
3449*7c478bd9Sstevel@tonic-gate 	 */
3450*7c478bd9Sstevel@tonic-gate 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3451*7c478bd9Sstevel@tonic-gate 	    seg->s_ops != &segspt_shmops)
3452*7c478bd9Sstevel@tonic-gate 		return (NULL);
3453*7c478bd9Sstevel@tonic-gate 
3454*7c478bd9Sstevel@tonic-gate 	policy_info = NULL;
3455*7c478bd9Sstevel@tonic-gate 	if (seg->s_ops->getpolicy != NULL)
3456*7c478bd9Sstevel@tonic-gate 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3457*7c478bd9Sstevel@tonic-gate 
3458*7c478bd9Sstevel@tonic-gate 	return (policy_info);
3459*7c478bd9Sstevel@tonic-gate }
3460*7c478bd9Sstevel@tonic-gate 
3461*7c478bd9Sstevel@tonic-gate /*
3462*7c478bd9Sstevel@tonic-gate  * Set policy for allocating private memory given desired policy, policy info,
3463*7c478bd9Sstevel@tonic-gate  * size in bytes of memory that policy is being applied.
3464*7c478bd9Sstevel@tonic-gate  * Return 0 if policy wasn't set already and 1 if policy was set already
3465*7c478bd9Sstevel@tonic-gate  */
3466*7c478bd9Sstevel@tonic-gate int
3467*7c478bd9Sstevel@tonic-gate lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3468*7c478bd9Sstevel@tonic-gate     lgrp_mem_policy_info_t *policy_info, size_t size)
3469*7c478bd9Sstevel@tonic-gate {
3470*7c478bd9Sstevel@tonic-gate 
3471*7c478bd9Sstevel@tonic-gate 	ASSERT(policy_info != NULL);
3472*7c478bd9Sstevel@tonic-gate 
3473*7c478bd9Sstevel@tonic-gate 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3474*7c478bd9Sstevel@tonic-gate 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3475*7c478bd9Sstevel@tonic-gate 
3476*7c478bd9Sstevel@tonic-gate 	/*
3477*7c478bd9Sstevel@tonic-gate 	 * Policy set already?
3478*7c478bd9Sstevel@tonic-gate 	 */
3479*7c478bd9Sstevel@tonic-gate 	if (policy == policy_info->mem_policy)
3480*7c478bd9Sstevel@tonic-gate 		return (1);
3481*7c478bd9Sstevel@tonic-gate 
3482*7c478bd9Sstevel@tonic-gate 	/*
3483*7c478bd9Sstevel@tonic-gate 	 * Set policy
3484*7c478bd9Sstevel@tonic-gate 	 */
3485*7c478bd9Sstevel@tonic-gate 	policy_info->mem_policy = policy;
3486*7c478bd9Sstevel@tonic-gate 	policy_info->mem_reserved = 0;
3487*7c478bd9Sstevel@tonic-gate 
3488*7c478bd9Sstevel@tonic-gate 	return (0);
3489*7c478bd9Sstevel@tonic-gate }
3490*7c478bd9Sstevel@tonic-gate 
3491*7c478bd9Sstevel@tonic-gate 
3492*7c478bd9Sstevel@tonic-gate /*
3493*7c478bd9Sstevel@tonic-gate  * Get shared memory allocation policy with given tree and offset
3494*7c478bd9Sstevel@tonic-gate  */
3495*7c478bd9Sstevel@tonic-gate lgrp_mem_policy_info_t *
3496*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3497*7c478bd9Sstevel@tonic-gate     u_offset_t vn_off)
3498*7c478bd9Sstevel@tonic-gate {
3499*7c478bd9Sstevel@tonic-gate 	u_offset_t		off;
3500*7c478bd9Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
3501*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*policy_seg;
3502*7c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
3503*7c478bd9Sstevel@tonic-gate 	avl_tree_t		*tree;
3504*7c478bd9Sstevel@tonic-gate 	avl_index_t		where;
3505*7c478bd9Sstevel@tonic-gate 
3506*7c478bd9Sstevel@tonic-gate 	/*
3507*7c478bd9Sstevel@tonic-gate 	 * Get policy segment tree from anon_map or vnode and use specified
3508*7c478bd9Sstevel@tonic-gate 	 * anon index or vnode offset as offset
3509*7c478bd9Sstevel@tonic-gate 	 *
3510*7c478bd9Sstevel@tonic-gate 	 * Assume that no lock needs to be held on anon_map or vnode, since
3511*7c478bd9Sstevel@tonic-gate 	 * they should be protected by their reference count which must be
3512*7c478bd9Sstevel@tonic-gate 	 * nonzero for an existing segment
3513*7c478bd9Sstevel@tonic-gate 	 */
3514*7c478bd9Sstevel@tonic-gate 	if (amp) {
3515*7c478bd9Sstevel@tonic-gate 		ASSERT(amp->refcnt != 0);
3516*7c478bd9Sstevel@tonic-gate 		shm_locality = amp->locality;
3517*7c478bd9Sstevel@tonic-gate 		if (shm_locality == NULL)
3518*7c478bd9Sstevel@tonic-gate 			return (NULL);
3519*7c478bd9Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
3520*7c478bd9Sstevel@tonic-gate 		off = ptob(anon_index);
3521*7c478bd9Sstevel@tonic-gate 	} else if (vp) {
3522*7c478bd9Sstevel@tonic-gate 		shm_locality = vp->v_locality;
3523*7c478bd9Sstevel@tonic-gate 		if (shm_locality == NULL)
3524*7c478bd9Sstevel@tonic-gate 			return (NULL);
3525*7c478bd9Sstevel@tonic-gate 		ASSERT(shm_locality->loc_count != 0);
3526*7c478bd9Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
3527*7c478bd9Sstevel@tonic-gate 		off = vn_off;
3528*7c478bd9Sstevel@tonic-gate 	}
3529*7c478bd9Sstevel@tonic-gate 
3530*7c478bd9Sstevel@tonic-gate 	if (tree == NULL)
3531*7c478bd9Sstevel@tonic-gate 		return (NULL);
3532*7c478bd9Sstevel@tonic-gate 
3533*7c478bd9Sstevel@tonic-gate 	/*
3534*7c478bd9Sstevel@tonic-gate 	 * Lookup policy segment for offset into shared object and return
3535*7c478bd9Sstevel@tonic-gate 	 * policy info
3536*7c478bd9Sstevel@tonic-gate 	 */
3537*7c478bd9Sstevel@tonic-gate 	rw_enter(&shm_locality->loc_lock, RW_READER);
3538*7c478bd9Sstevel@tonic-gate 	policy_info = NULL;
3539*7c478bd9Sstevel@tonic-gate 	policy_seg = avl_find(tree, &off, &where);
3540*7c478bd9Sstevel@tonic-gate 	if (policy_seg)
3541*7c478bd9Sstevel@tonic-gate 		policy_info = &policy_seg->shm_policy;
3542*7c478bd9Sstevel@tonic-gate 	rw_exit(&shm_locality->loc_lock);
3543*7c478bd9Sstevel@tonic-gate 
3544*7c478bd9Sstevel@tonic-gate 	return (policy_info);
3545*7c478bd9Sstevel@tonic-gate }
3546*7c478bd9Sstevel@tonic-gate 
3547*7c478bd9Sstevel@tonic-gate /*
3548*7c478bd9Sstevel@tonic-gate  * Return lgroup to use for allocating memory
3549*7c478bd9Sstevel@tonic-gate  * given the segment and address
3550*7c478bd9Sstevel@tonic-gate  *
3551*7c478bd9Sstevel@tonic-gate  * There isn't any mutual exclusion that exists between calls
3552*7c478bd9Sstevel@tonic-gate  * to this routine and DR, so this routine and whomever calls it
3553*7c478bd9Sstevel@tonic-gate  * should be mindful of the possibility that the lgrp returned
3554*7c478bd9Sstevel@tonic-gate  * may be deleted. If this happens, dereferences of the lgrp
3555*7c478bd9Sstevel@tonic-gate  * pointer will still be safe, but the resources in the lgrp will
3556*7c478bd9Sstevel@tonic-gate  * be gone, and LGRP_EXISTS() will no longer be true.
3557*7c478bd9Sstevel@tonic-gate  */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int			i;
	lgrp_t			*lgrp;
	klgrpset_t		lgrpset;
	int			lgrps_spanned;
	unsigned long		off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t		random;
	int			stat = 0;	/* also marks RANDOM_PROC fallthrough into RANDOM_PSET */

	/*
	 * Just return null if the lgrp framework hasn't finished
	 * initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			/*
			 * Kernel address space: process/pset-scoped random
			 * policies make no sense, so widen them to
			 * machine-wide random.
			 */
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL)
				policy = policy_info->mem_policy;
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 */
		lgrp = lgrp_cpu_to_lgrp(CPU);
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;	/* low 16 bits of hrtime, minus the 4 least-significant */
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		/*
		 * Walk the lgroup table counting down "off" over members
		 * of lgrpset; the member landed on is the chosen lgroup.
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		/* stat is still 0 unless we fell through from RANDOM_PROC */
		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;	/* low 16 bits of hrtime, minus the 4 least-significant */
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		/*
		 * NOTE(review): this loop spins forever if lgrpset has no
		 * members — presumably the root lgroup's memory resource
		 * set is never empty when this policy is reached; confirm.
		 */
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}
3773*7c478bd9Sstevel@tonic-gate }
3774*7c478bd9Sstevel@tonic-gate 
3775*7c478bd9Sstevel@tonic-gate /*
3776*7c478bd9Sstevel@tonic-gate  * Return the number of pages in an lgroup
3777*7c478bd9Sstevel@tonic-gate  *
3778*7c478bd9Sstevel@tonic-gate  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3779*7c478bd9Sstevel@tonic-gate  *	 could cause tests that rely on the numat driver to fail....
3780*7c478bd9Sstevel@tonic-gate  */
3781*7c478bd9Sstevel@tonic-gate pgcnt_t
3782*7c478bd9Sstevel@tonic-gate lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3783*7c478bd9Sstevel@tonic-gate {
3784*7c478bd9Sstevel@tonic-gate 	lgrp_t *lgrp;
3785*7c478bd9Sstevel@tonic-gate 
3786*7c478bd9Sstevel@tonic-gate 	lgrp = lgrp_table[lgrpid];
3787*7c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp) ||
3788*7c478bd9Sstevel@tonic-gate 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3789*7c478bd9Sstevel@tonic-gate 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3790*7c478bd9Sstevel@tonic-gate 		return (0);
3791*7c478bd9Sstevel@tonic-gate 
3792*7c478bd9Sstevel@tonic-gate 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3793*7c478bd9Sstevel@tonic-gate }
3794*7c478bd9Sstevel@tonic-gate 
3795*7c478bd9Sstevel@tonic-gate /*
3796*7c478bd9Sstevel@tonic-gate  * Initialize lgroup shared memory allocation policy support
3797*7c478bd9Sstevel@tonic-gate  */
3798*7c478bd9Sstevel@tonic-gate void
3799*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3800*7c478bd9Sstevel@tonic-gate {
3801*7c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
3802*7c478bd9Sstevel@tonic-gate 
3803*7c478bd9Sstevel@tonic-gate 	/*
3804*7c478bd9Sstevel@tonic-gate 	 * Initialize locality field in anon_map
3805*7c478bd9Sstevel@tonic-gate 	 * Don't need any locks because this is called when anon_map is
3806*7c478bd9Sstevel@tonic-gate 	 * allocated, but not used anywhere yet.
3807*7c478bd9Sstevel@tonic-gate 	 */
3808*7c478bd9Sstevel@tonic-gate 	if (amp) {
3809*7c478bd9Sstevel@tonic-gate 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3810*7c478bd9Sstevel@tonic-gate 		if (amp->locality == NULL) {
3811*7c478bd9Sstevel@tonic-gate 			/*
3812*7c478bd9Sstevel@tonic-gate 			 * Allocate and initialize shared memory locality info
3813*7c478bd9Sstevel@tonic-gate 			 * and set anon_map locality pointer to it
3814*7c478bd9Sstevel@tonic-gate 			 * Drop lock across kmem_alloc(KM_SLEEP)
3815*7c478bd9Sstevel@tonic-gate 			 */
3816*7c478bd9Sstevel@tonic-gate 			ANON_LOCK_EXIT(&amp->a_rwlock);
3817*7c478bd9Sstevel@tonic-gate 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3818*7c478bd9Sstevel@tonic-gate 			    KM_SLEEP);
3819*7c478bd9Sstevel@tonic-gate 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3820*7c478bd9Sstevel@tonic-gate 			    NULL);
3821*7c478bd9Sstevel@tonic-gate 			shm_locality->loc_count = 1;	/* not used for amp */
3822*7c478bd9Sstevel@tonic-gate 			shm_locality->loc_tree = NULL;
3823*7c478bd9Sstevel@tonic-gate 
3824*7c478bd9Sstevel@tonic-gate 			/*
3825*7c478bd9Sstevel@tonic-gate 			 * Reacquire lock and check to see whether anyone beat
3826*7c478bd9Sstevel@tonic-gate 			 * us to initializing the locality info
3827*7c478bd9Sstevel@tonic-gate 			 */
3828*7c478bd9Sstevel@tonic-gate 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3829*7c478bd9Sstevel@tonic-gate 			if (amp->locality != NULL) {
3830*7c478bd9Sstevel@tonic-gate 				rw_destroy(&shm_locality->loc_lock);
3831*7c478bd9Sstevel@tonic-gate 				kmem_free(shm_locality,
3832*7c478bd9Sstevel@tonic-gate 				    sizeof (*shm_locality));
3833*7c478bd9Sstevel@tonic-gate 			} else
3834*7c478bd9Sstevel@tonic-gate 				amp->locality = shm_locality;
3835*7c478bd9Sstevel@tonic-gate 		}
3836*7c478bd9Sstevel@tonic-gate 		ANON_LOCK_EXIT(&amp->a_rwlock);
3837*7c478bd9Sstevel@tonic-gate 		return;
3838*7c478bd9Sstevel@tonic-gate 	}
3839*7c478bd9Sstevel@tonic-gate 
3840*7c478bd9Sstevel@tonic-gate 	/*
3841*7c478bd9Sstevel@tonic-gate 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3842*7c478bd9Sstevel@tonic-gate 	 */
3843*7c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
3844*7c478bd9Sstevel@tonic-gate 	if ((vp->v_flag & V_LOCALITY) == 0) {
3845*7c478bd9Sstevel@tonic-gate 		/*
3846*7c478bd9Sstevel@tonic-gate 		 * Allocate and initialize shared memory locality info
3847*7c478bd9Sstevel@tonic-gate 		 */
3848*7c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
3849*7c478bd9Sstevel@tonic-gate 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3850*7c478bd9Sstevel@tonic-gate 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3851*7c478bd9Sstevel@tonic-gate 		shm_locality->loc_count = 1;
3852*7c478bd9Sstevel@tonic-gate 		shm_locality->loc_tree = NULL;
3853*7c478bd9Sstevel@tonic-gate 
3854*7c478bd9Sstevel@tonic-gate 		/*
3855*7c478bd9Sstevel@tonic-gate 		 * Point vnode locality field at shared vnode policy info
3856*7c478bd9Sstevel@tonic-gate 		 * and set locality aware flag in vnode
3857*7c478bd9Sstevel@tonic-gate 		 */
3858*7c478bd9Sstevel@tonic-gate 		mutex_enter(&vp->v_lock);
3859*7c478bd9Sstevel@tonic-gate 		if ((vp->v_flag & V_LOCALITY) == 0) {
3860*7c478bd9Sstevel@tonic-gate 			vp->v_locality = shm_locality;
3861*7c478bd9Sstevel@tonic-gate 			vp->v_flag |= V_LOCALITY;
3862*7c478bd9Sstevel@tonic-gate 		} else {
3863*7c478bd9Sstevel@tonic-gate 			/*
3864*7c478bd9Sstevel@tonic-gate 			 * Lost race so free locality info and increment count.
3865*7c478bd9Sstevel@tonic-gate 			 */
3866*7c478bd9Sstevel@tonic-gate 			rw_destroy(&shm_locality->loc_lock);
3867*7c478bd9Sstevel@tonic-gate 			kmem_free(shm_locality, sizeof (*shm_locality));
3868*7c478bd9Sstevel@tonic-gate 			shm_locality = vp->v_locality;
3869*7c478bd9Sstevel@tonic-gate 			shm_locality->loc_count++;
3870*7c478bd9Sstevel@tonic-gate 		}
3871*7c478bd9Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
3872*7c478bd9Sstevel@tonic-gate 
3873*7c478bd9Sstevel@tonic-gate 		return;
3874*7c478bd9Sstevel@tonic-gate 	}
3875*7c478bd9Sstevel@tonic-gate 
3876*7c478bd9Sstevel@tonic-gate 	/*
3877*7c478bd9Sstevel@tonic-gate 	 * Increment reference count of number of segments mapping this vnode
3878*7c478bd9Sstevel@tonic-gate 	 * shared
3879*7c478bd9Sstevel@tonic-gate 	 */
3880*7c478bd9Sstevel@tonic-gate 	shm_locality = vp->v_locality;
3881*7c478bd9Sstevel@tonic-gate 	shm_locality->loc_count++;
3882*7c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
3883*7c478bd9Sstevel@tonic-gate }
3884*7c478bd9Sstevel@tonic-gate 
3885*7c478bd9Sstevel@tonic-gate /*
3886*7c478bd9Sstevel@tonic-gate  * Destroy the given shared memory policy segment tree
3887*7c478bd9Sstevel@tonic-gate  */
3888*7c478bd9Sstevel@tonic-gate void
3889*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3890*7c478bd9Sstevel@tonic-gate {
3891*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*cur;
3892*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*next;
3893*7c478bd9Sstevel@tonic-gate 
3894*7c478bd9Sstevel@tonic-gate 	if (tree == NULL)
3895*7c478bd9Sstevel@tonic-gate 		return;
3896*7c478bd9Sstevel@tonic-gate 
3897*7c478bd9Sstevel@tonic-gate 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3898*7c478bd9Sstevel@tonic-gate 	while (cur != NULL) {
3899*7c478bd9Sstevel@tonic-gate 		next = AVL_NEXT(tree, cur);
3900*7c478bd9Sstevel@tonic-gate 		avl_remove(tree, cur);
3901*7c478bd9Sstevel@tonic-gate 		kmem_free(cur, sizeof (*cur));
3902*7c478bd9Sstevel@tonic-gate 		cur = next;
3903*7c478bd9Sstevel@tonic-gate 	}
3904*7c478bd9Sstevel@tonic-gate 	kmem_free(tree, sizeof (avl_tree_t));
3905*7c478bd9Sstevel@tonic-gate }
3906*7c478bd9Sstevel@tonic-gate 
3907*7c478bd9Sstevel@tonic-gate /*
3908*7c478bd9Sstevel@tonic-gate  * Uninitialize lgroup shared memory allocation policy support
3909*7c478bd9Sstevel@tonic-gate  */
3910*7c478bd9Sstevel@tonic-gate void
3911*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3912*7c478bd9Sstevel@tonic-gate {
3913*7c478bd9Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
3914*7c478bd9Sstevel@tonic-gate 
3915*7c478bd9Sstevel@tonic-gate 	/*
3916*7c478bd9Sstevel@tonic-gate 	 * For anon_map, deallocate shared memory policy tree and
3917*7c478bd9Sstevel@tonic-gate 	 * zero locality field
3918*7c478bd9Sstevel@tonic-gate 	 * Don't need any locks because anon_map is being freed
3919*7c478bd9Sstevel@tonic-gate 	 */
3920*7c478bd9Sstevel@tonic-gate 	if (amp) {
3921*7c478bd9Sstevel@tonic-gate 		if (amp->locality == NULL)
3922*7c478bd9Sstevel@tonic-gate 			return;
3923*7c478bd9Sstevel@tonic-gate 		shm_locality = amp->locality;
3924*7c478bd9Sstevel@tonic-gate 		shm_locality->loc_count = 0;	/* not really used for amp */
3925*7c478bd9Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
3926*7c478bd9Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3927*7c478bd9Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
3928*7c478bd9Sstevel@tonic-gate 		amp->locality = 0;
3929*7c478bd9Sstevel@tonic-gate 		return;
3930*7c478bd9Sstevel@tonic-gate 	}
3931*7c478bd9Sstevel@tonic-gate 
3932*7c478bd9Sstevel@tonic-gate 	/*
3933*7c478bd9Sstevel@tonic-gate 	 * For vnode, decrement reference count of segments mapping this vnode
3934*7c478bd9Sstevel@tonic-gate 	 * shared and delete locality info if reference count drops to 0
3935*7c478bd9Sstevel@tonic-gate 	 */
3936*7c478bd9Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
3937*7c478bd9Sstevel@tonic-gate 	shm_locality = vp->v_locality;
3938*7c478bd9Sstevel@tonic-gate 	shm_locality->loc_count--;
3939*7c478bd9Sstevel@tonic-gate 
3940*7c478bd9Sstevel@tonic-gate 	if (shm_locality->loc_count == 0) {
3941*7c478bd9Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
3942*7c478bd9Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3943*7c478bd9Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
3944*7c478bd9Sstevel@tonic-gate 		vp->v_locality = 0;
3945*7c478bd9Sstevel@tonic-gate 		vp->v_flag &= ~V_LOCALITY;
3946*7c478bd9Sstevel@tonic-gate 	}
3947*7c478bd9Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
3948*7c478bd9Sstevel@tonic-gate }
3949*7c478bd9Sstevel@tonic-gate 
3950*7c478bd9Sstevel@tonic-gate /*
3951*7c478bd9Sstevel@tonic-gate  * Compare two shared memory policy segments
3952*7c478bd9Sstevel@tonic-gate  * Used by AVL tree code for searching
3953*7c478bd9Sstevel@tonic-gate  */
3954*7c478bd9Sstevel@tonic-gate int
3955*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_compar(const void *x, const void *y)
3956*7c478bd9Sstevel@tonic-gate {
3957*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3958*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3959*7c478bd9Sstevel@tonic-gate 
3960*7c478bd9Sstevel@tonic-gate 	if (a->shm_off < b->shm_off)
3961*7c478bd9Sstevel@tonic-gate 		return (-1);
3962*7c478bd9Sstevel@tonic-gate 	if (a->shm_off >= b->shm_off + b->shm_size)
3963*7c478bd9Sstevel@tonic-gate 		return (1);
3964*7c478bd9Sstevel@tonic-gate 	return (0);
3965*7c478bd9Sstevel@tonic-gate }
3966*7c478bd9Sstevel@tonic-gate 
3967*7c478bd9Sstevel@tonic-gate /*
3968*7c478bd9Sstevel@tonic-gate  * Concatenate seg1 with seg2 and remove seg2
3969*7c478bd9Sstevel@tonic-gate  */
3970*7c478bd9Sstevel@tonic-gate static int
3971*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
3972*7c478bd9Sstevel@tonic-gate     lgrp_shm_policy_seg_t *seg2)
3973*7c478bd9Sstevel@tonic-gate {
3974*7c478bd9Sstevel@tonic-gate 	if (!seg1 || !seg2 ||
3975*7c478bd9Sstevel@tonic-gate 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
3976*7c478bd9Sstevel@tonic-gate 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
3977*7c478bd9Sstevel@tonic-gate 		return (-1);
3978*7c478bd9Sstevel@tonic-gate 
3979*7c478bd9Sstevel@tonic-gate 	seg1->shm_size += seg2->shm_size;
3980*7c478bd9Sstevel@tonic-gate 	avl_remove(tree, seg2);
3981*7c478bd9Sstevel@tonic-gate 	kmem_free(seg2, sizeof (*seg2));
3982*7c478bd9Sstevel@tonic-gate 	return (0);
3983*7c478bd9Sstevel@tonic-gate }
3984*7c478bd9Sstevel@tonic-gate 
3985*7c478bd9Sstevel@tonic-gate /*
3986*7c478bd9Sstevel@tonic-gate  * Split segment at given offset and return rightmost (uppermost) segment
3987*7c478bd9Sstevel@tonic-gate  * Assumes that there are no overlapping segments
3988*7c478bd9Sstevel@tonic-gate  */
3989*7c478bd9Sstevel@tonic-gate static lgrp_shm_policy_seg_t *
3990*7c478bd9Sstevel@tonic-gate lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
3991*7c478bd9Sstevel@tonic-gate     u_offset_t off)
3992*7c478bd9Sstevel@tonic-gate {
3993*7c478bd9Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*newseg;
3994*7c478bd9Sstevel@tonic-gate 	avl_index_t		where;
3995*7c478bd9Sstevel@tonic-gate 
3996*7c478bd9Sstevel@tonic-gate 	ASSERT(seg != NULL);
3997*7c478bd9Sstevel@tonic-gate 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
3998*7c478bd9Sstevel@tonic-gate 
3999*7c478bd9Sstevel@tonic-gate 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4000*7c478bd9Sstevel@tonic-gate 	    seg->shm_size)
4001*7c478bd9Sstevel@tonic-gate 		return (NULL);
4002*7c478bd9Sstevel@tonic-gate 
4003*7c478bd9Sstevel@tonic-gate 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4004*7c478bd9Sstevel@tonic-gate 		return (seg);
4005*7c478bd9Sstevel@tonic-gate 
4006*7c478bd9Sstevel@tonic-gate 	/*
4007*7c478bd9Sstevel@tonic-gate 	 * Adjust size of left segment and allocate new (right) segment
4008*7c478bd9Sstevel@tonic-gate 	 */
4009*7c478bd9Sstevel@tonic-gate 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4010*7c478bd9Sstevel@tonic-gate 	newseg->shm_policy = seg->shm_policy;
4011*7c478bd9Sstevel@tonic-gate 	newseg->shm_off = off;
4012*7c478bd9Sstevel@tonic-gate 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4013*7c478bd9Sstevel@tonic-gate 	seg->shm_size = off - seg->shm_off;
4014*7c478bd9Sstevel@tonic-gate 
4015*7c478bd9Sstevel@tonic-gate 	/*
4016*7c478bd9Sstevel@tonic-gate 	 * Find where to insert new segment in AVL tree and insert it
4017*7c478bd9Sstevel@tonic-gate 	 */
4018*7c478bd9Sstevel@tonic-gate 	(void) avl_find(tree, &off, &where);
4019*7c478bd9Sstevel@tonic-gate 	avl_insert(tree, newseg, where);
4020*7c478bd9Sstevel@tonic-gate 
4021*7c478bd9Sstevel@tonic-gate 	return (newseg);
4022*7c478bd9Sstevel@tonic-gate }
4023*7c478bd9Sstevel@tonic-gate 
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * The object is identified either by an anon_map (amp != NULL, starting
 * at page index anon_index) or by a vnode (vp, starting at byte offset
 * vn_off).  len is in bytes and must be page aligned.  Policy segments
 * for each shared object live in a per-object AVL tree ordered by
 * offset; this routine splits and merges segments as needed so the tree
 * always holds non-overlapping segments with maximal extents.
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;		/* end of requested range */
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;		/* current offset into object */
	u_offset_t		oldeoff;	/* end of existing segment */
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 *
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		/*
		 * Drop the lock across the KM_SLEEP allocation, then
		 * recheck in case another thread installed a tree meanwhile
		 */
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now.  Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev =  AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				/* advance past this segment and keep going */
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev =  AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
4277*7c478bd9Sstevel@tonic-gate 
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;	/* lgroup to search */
	mnodeset_t	nodes = c->lmc_nodes;	/* candidate memnode set */
	int		cnt = c->lmc_cnt;	/* count of candidates */
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 * Memnodes already handed out (lmc_tried) are excluded so the
	 * same node isn't returned twice.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		/* scan the set, counting down "offset" set bits */
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);	/* wrap if offset too large */

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
4344