1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
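
/*
 * Illustrative sketch (not part of the original source): one way a caller
 * could walk the lgroup hierarchy from a thread's home lgroup toward the
 * root when looking for resources at the next level of locality, as
 * described above.  Assumes kernel preemption is disabled so that the
 * home lgroup stays valid (see lgrp_home_lgrp() below).
 *
 *	lgrp_t	*lgrp = lgrp_table[curthread->t_lpl->lpl_lgrpid];
 *
 *	while (lgrp != NULL) {
 *		(try to satisfy the request from lgrp's resources)
 *		lgrp = lgrp->lgrp_parent;	(NULL above the root)
 *	}
 */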
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/pg.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to the kstat framework. It is protected from parallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel, but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer that needs a default lpl should use lpl_bootstrap, which points to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
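
/*
 * Hedged usage sketch (not from the original source): lgrp_mem_choose()
 * is said above to use this macro; a caller could fall back to the root
 * lgroup when the CPU has not been assigned to an lgroup yet, e.g.
 *
 *	if (LGRP_CPU_HAS_NO_LGRP(CPU))
 *		lgrp = lgrp_root;		(safe early-boot default)
 *	else
 *		lgrp = lgrp_cpu_to_lgrp(CPU);
 */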
168 
169 static lgrp_t	lroot;
170 
171 /*
172  * Size, in bytes, beyond which random memory allocation policy is applied
173  * to non-shared memory.  Default is the maximum size, so random memory
174  * allocation won't be used for non-shared memory by default.
175  */
176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
177 
178 /* the maximum effect that a single thread can have on its lgroup's load */
179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 	((lgrp_loadavg_max_effect) / (ncpu))
181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
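
/*
 * Worked example (illustrative only): with the default value of
 * lgrp_loadavg_max_effect (LGRP_LOADAVG_THREAD_MAX), a single thread in
 * an lgroup with 4 CPUs contributes at most
 *
 *	LGRP_LOADAVG_MAX_EFFECT(4) == lgrp_loadavg_max_effect / 4
 *
 * to that lgroup's load, i.e. the per-thread effect shrinks as the number
 * of CPUs in the lgroup grows.
 */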
182 
183 
184 /*
185  * Size, in bytes, beyond which random memory allocation policy is applied to
186  * shared memory.  Default is 8MB (2 ISM pages).
187  */
188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
189 
190 /*
191  * Whether to do processor set aware memory allocation by default
192  */
193 int	lgrp_mem_pset_aware = 0;
194 
195 /*
196  * Set the default memory allocation policy for root lgroup
197  */
198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 
200 /*
201  * Set the default memory allocation policy.  For most platforms,
202  * next touch is sufficient, but some platforms may wish to override
203  * this.
204  */
205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
208 /*
209  * lgroup CPU event handlers
210  */
211 static void	lgrp_cpu_init(struct cpu *);
212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214 
215 /*
216  * lgroup memory event handlers
217  */
218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 
222 /*
223  * lgroup CPU partition event handlers
224  */
225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 static void	lgrp_part_del_cpu(struct cpu *);
227 
228 static void	lgrp_root_init(void);
229 
230 /*
231  * lpl topology
232  */
233 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234 static void	lpl_clear(lpl_t *);
235 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237 static void	lpl_rset_add(lpl_t *, lpl_t *);
238 static void	lpl_rset_del(lpl_t *, lpl_t *);
239 static int	lpl_rset_contains(lpl_t *, lpl_t *);
240 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241 static void	lpl_child_update(lpl_t *, struct cpupart *);
242 static int	lpl_pick(lpl_t *, lpl_t *);
243 static void	lpl_verify_wrapper(struct cpupart *);
244 
245 /*
246  * defines for lpl topology verifier return codes
247  */
248 
249 #define	LPL_TOPO_CORRECT			0
250 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
251 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
252 #define	LPL_TOPO_LGRP_MISMATCH			-3
253 #define	LPL_TOPO_MISSING_PARENT			-4
254 #define	LPL_TOPO_PARENT_MISMATCH		-5
255 #define	LPL_TOPO_BAD_CPUCNT			-6
256 #define	LPL_TOPO_RSET_MISMATCH			-7
257 #define	LPL_TOPO_LPL_ORPHANED			-8
258 #define	LPL_TOPO_LPL_BAD_NCPU			-9
259 #define	LPL_TOPO_RSET_MSSNG_LF			-10
260 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
261 #define	LPL_TOPO_BOGUS_HINT			-12
262 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
263 #define	LPL_TOPO_LGRP_NOT_LEAF			-14
264 #define	LPL_TOPO_BAD_RSETCNT			-15
265 
266 /*
267  * Return whether lgroup optimizations should be enabled on this system
268  */
269 int
270 lgrp_optimizations(void)
271 {
272 	/*
273 	 * System must have more than 2 lgroups to enable lgroup optimizations
274 	 *
275 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
276 	 * with one child lgroup containing all the resources. A 2 lgroup
277 	 * system with a root lgroup directly containing CPUs or memory might
278 	 * need lgroup optimizations with its child lgroup, but there
279 	 * isn't such a machine for now....
280 	 */
281 	if (nlgrps > 2)
282 		return (1);
283 
284 	return (0);
285 }
286 
287 /*
288  * Create the root lgroup and set up the initial bootstrap lpl list
289  */
290 static void
291 lgrp_root_init(void)
292 {
293 	lgrp_handle_t	hand;
294 	int		i;
295 	lgrp_id_t	id;
296 
297 	/*
298 	 * Create the "root" lgroup
299 	 */
300 	ASSERT(nlgrps == 0);
301 	id = nlgrps++;
302 
303 	lgrp_root = &lroot;
304 
305 	lgrp_root->lgrp_cpu = NULL;
306 	lgrp_root->lgrp_mnodes = 0;
307 	lgrp_root->lgrp_nmnodes = 0;
308 	hand = lgrp_plat_root_hand();
309 	lgrp_root->lgrp_plathand = hand;
310 
311 	lgrp_root->lgrp_id = id;
312 	lgrp_root->lgrp_cpucnt = 0;
313 	lgrp_root->lgrp_childcnt = 0;
314 	klgrpset_clear(lgrp_root->lgrp_children);
315 	klgrpset_clear(lgrp_root->lgrp_leaves);
316 	lgrp_root->lgrp_parent = NULL;
317 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318 
319 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320 		klgrpset_clear(lgrp_root->lgrp_set[i]);
321 
322 	lgrp_root->lgrp_kstat = NULL;
323 
324 	lgrp_table[id] = lgrp_root;
325 
326 	/*
327 	 * Setup initial lpl list for CPU0 and initial t0 home.
328 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329 	 * all topology operations until cp_default is initialized at which
330 	 * point t0.t_lpl will be updated.
331 	 */
332 	lpl_bootstrap = lpl_bootstrap_list;
333 	t0.t_lpl = lpl_bootstrap;
334 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336 	cp_default.cp_lgrploads = lpl_bootstrap;
337 }
338 
339 /*
340  * Initialize the lgroup framework and allow the platform to do the same
341  */
342 void
343 lgrp_init(void)
344 {
345 	/*
346 	 * Initialize the platform
347 	 */
348 	lgrp_plat_init();
349 
350 	/*
351 	 * Set max number of lgroups supported on this platform, which must not
352 	 * exceed the max number of lgroups supported by the common lgroup
353 	 * framework (eg. NLGRPS_MAX is the max number of elements in lgrp_table[])
354 	 */
355 	nlgrpsmax = lgrp_plat_max_lgrps();
356 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
357 }
358 
359 /*
360  * Create the root and cpu0's lgroup, and set t0's home.
361  */
362 void
363 lgrp_setup(void)
364 {
365 	/*
366 	 * Setup the root lgroup
367 	 */
368 	lgrp_root_init();
369 
370 	/*
371 	 * Add cpu0 to an lgroup
372 	 */
373 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
374 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
375 }
376 
377 /*
378  * Lgroup initialization is split in two parts. The first part
379  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
380  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
381  * when all CPUs are brought online and all distance information is available.
382  *
383  * When lgrp_main_init() is complete, it sets lgrp_initialized.  Similarly,
384  * lgrp_main_mp_init() sets lgrp_topo_initialized.
385  */
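
/*
 * Rough boot-time ordering implied by the comments above and below (a
 * sketch, not verbatim from main()):
 *
 *	lgrp_init();		(platform init, sets nlgrpsmax)
 *	lgrp_setup();		(root lgroup, cpu0, t0 home)
 *	lgrp_main_init();	(sets lgrp_initialized)
 *	start_other_cpus();
 *	lgrp_main_mp_init();	(sets lgrp_topo_initialized)
 */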
386 
387 /*
388  * true when lgrp initialization has been completed.
389  */
390 int	lgrp_initialized = 0;
391 
392 /*
393  * True when lgrp topology is constructed.
394  */
395 int	lgrp_topo_initialized = 0;
396 
397 /*
398  * Init routine called after startup(), /etc/system has been processed,
399  * and cpu0 has been added to an lgroup.
400  */
401 void
402 lgrp_main_init(void)
403 {
404 	cpu_t		*cp = CPU;
405 	lgrp_id_t	lgrpid;
406 	int		i;
407 	/*
408 	 * Enforce a valid lgrp_mem_default_policy
409 	 */
410 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
411 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
412 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
413 
414 	/*
415 	 * See if mpo should be disabled.
416 	 * This may happen in the case of null proc LPA on Starcat.
417 	 * The platform won't be able to detect null proc LPA until after
418 	 * cpu0 and memory have already been added to lgroups.
419 	 * When and if it is detected, the Starcat platform will return
420 	 * a different platform handle for cpu0 which is what we check for
421  * here. If mpo should be disabled, move cpu0 to its rightful place
422  * (the root), and destroy the remaining lgroups. This effectively
423  * provides a UMA lgroup topology.
424 	 */
425 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
426 	if (lgrp_table[lgrpid]->lgrp_plathand !=
427 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
428 		lgrp_part_del_cpu(cp);
429 		lgrp_cpu_fini(cp, lgrpid);
430 
431 		lgrp_cpu_init(cp);
432 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
433 
434 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
435 
436 		/*
437 		 * Destroy all lgroups except for root
438 		 */
439 		for (i = 0; i <= lgrp_alloc_max; i++) {
440 			if (LGRP_EXISTS(lgrp_table[i]) &&
441 			    lgrp_table[i] != lgrp_root)
442 				lgrp_destroy(lgrp_table[i]);
443 		}
444 
445 		/*
446 		 * Fix up root to point at itself for leaves and resources
447 		 * and not have any children
448 		 */
449 		lgrp_root->lgrp_childcnt = 0;
450 		klgrpset_clear(lgrp_root->lgrp_children);
451 		klgrpset_clear(lgrp_root->lgrp_leaves);
452 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
453 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
454 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
455 	}
456 
457 	/*
458 	 * Initialize kstats framework.
459 	 */
460 	lgrp_kstat_init();
461 	/*
462 	 * cpu0 is finally where it should be, so create its lgroup's kstats
463 	 */
464 	mutex_enter(&cpu_lock);
465 	lgrp_kstat_create(cp);
466 	mutex_exit(&cpu_lock);
467 
468 	lgrp_plat_main_init();
469 	lgrp_initialized = 1;
470 }
471 
472 /*
473  * Finish lgrp initialization after all CPUs are brought on-line.
474  * This routine is called after start_other_cpus().
475  */
476 void
477 lgrp_main_mp_init(void)
478 {
479 	klgrpset_t changed;
480 
481 	/*
482 	 * Update lgroup topology (if necessary)
483 	 */
484 	klgrpset_clear(changed);
485 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
486 	lgrp_topo_initialized = 1;
487 }
488 
489 /*
490  * Change latency of lgroup with specified lgroup platform handle (if one is
491  * given) or change all lgroups with old latency to new latency
492  */
493 void
494 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
495     u_longlong_t newtime)
496 {
497 	lgrp_t		*lgrp;
498 	int		i;
499 
500 	for (i = 0; i <= lgrp_alloc_max; i++) {
501 		lgrp = lgrp_table[i];
502 
503 		if (!LGRP_EXISTS(lgrp))
504 			continue;
505 
506 		if ((hand == LGRP_NULL_HANDLE &&
507 		    lgrp->lgrp_latency == oldtime) ||
508 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
509 			lgrp->lgrp_latency = (int)newtime;
510 	}
511 }
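
/*
 * Usage sketch (illustrative): platform code can reach this routine
 * through lgrp_config(), e.g.
 *
 *	lgrp_config(LGRP_CONFIG_LAT_CHANGE_ALL, oldtime, newtime);
 *	lgrp_config(LGRP_CONFIG_LAT_CHANGE, (uintptr_t)hand, newtime);
 *
 * which call lgrp_latency_change() with LGRP_NULL_HANDLE or the given
 * platform handle respectively (see the corresponding cases below).
 */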
512 
513 /*
514  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
515  */
516 void
517 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
518 {
519 	klgrpset_t	changed;
520 	cpu_t		*cp;
521 	lgrp_id_t	id;
522 	int		rc;
523 
524 	switch (event) {
525 	/*
526 	 * The following (re)configuration events are common code
527 	 * initiated. lgrp_plat_config() is called here to inform the
528 	 * platform of the reconfiguration event.
529 	 */
530 	case LGRP_CONFIG_CPU_ADD:
531 		cp = (cpu_t *)resource;
532 
533 		/*
534 		 * Initialize the new CPU's lgrp related next/prev
535 		 * links, and give it a bootstrap lpl so that it can
536 		 * survive should it need to enter the dispatcher.
537 		 */
538 		cp->cpu_next_lpl = cp;
539 		cp->cpu_prev_lpl = cp;
540 		cp->cpu_next_lgrp = cp;
541 		cp->cpu_prev_lgrp = cp;
542 		cp->cpu_lpl = lpl_bootstrap;
543 
544 		lgrp_plat_config(event, resource);
545 		atomic_add_32(&lgrp_gen, 1);
546 
547 		break;
548 	case LGRP_CONFIG_CPU_DEL:
549 		lgrp_plat_config(event, resource);
550 		atomic_add_32(&lgrp_gen, 1);
551 
552 		break;
553 	case LGRP_CONFIG_CPU_ONLINE:
554 		cp = (cpu_t *)resource;
555 		lgrp_cpu_init(cp);
556 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
557 		rc = lpl_topo_verify(cp->cpu_part);
558 		if (rc != LPL_TOPO_CORRECT) {
559 			panic("lpl_topo_verify failed: %d", rc);
560 		}
561 		lgrp_plat_config(event, resource);
562 		atomic_add_32(&lgrp_gen, 1);
563 
564 		break;
565 	case LGRP_CONFIG_CPU_OFFLINE:
566 		cp = (cpu_t *)resource;
567 		id = cp->cpu_lpl->lpl_lgrpid;
568 		lgrp_part_del_cpu(cp);
569 		lgrp_cpu_fini(cp, id);
570 		rc = lpl_topo_verify(cp->cpu_part);
571 		if (rc != LPL_TOPO_CORRECT) {
572 			panic("lpl_topo_verify failed: %d", rc);
573 		}
574 		lgrp_plat_config(event, resource);
575 		atomic_add_32(&lgrp_gen, 1);
576 
577 		break;
578 	case LGRP_CONFIG_CPUPART_ADD:
579 		cp = (cpu_t *)resource;
580 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
581 		rc = lpl_topo_verify(cp->cpu_part);
582 		if (rc != LPL_TOPO_CORRECT) {
583 			panic("lpl_topo_verify failed: %d", rc);
584 		}
585 		lgrp_plat_config(event, resource);
586 
587 		break;
588 	case LGRP_CONFIG_CPUPART_DEL:
589 		cp = (cpu_t *)resource;
590 		lgrp_part_del_cpu((cpu_t *)resource);
591 		rc = lpl_topo_verify(cp->cpu_part);
592 		if (rc != LPL_TOPO_CORRECT) {
593 			panic("lpl_topo_verify failed: %d", rc);
594 		}
595 		lgrp_plat_config(event, resource);
596 
597 		break;
598 	/*
599 	 * The following events are initiated by the memnode
600 	 * subsystem.
601 	 */
602 	case LGRP_CONFIG_MEM_ADD:
603 		lgrp_mem_init((int)resource, where, B_FALSE);
604 		atomic_add_32(&lgrp_gen, 1);
605 
606 		break;
607 	case LGRP_CONFIG_MEM_DEL:
608 		lgrp_mem_fini((int)resource, where, B_FALSE);
609 		atomic_add_32(&lgrp_gen, 1);
610 
611 		break;
612 	case LGRP_CONFIG_MEM_RENAME: {
613 		lgrp_config_mem_rename_t *ren_arg =
614 		    (lgrp_config_mem_rename_t *)where;
615 
616 		lgrp_mem_rename((int)resource,
617 		    ren_arg->lmem_rename_from,
618 		    ren_arg->lmem_rename_to);
619 		atomic_add_32(&lgrp_gen, 1);
620 
621 		break;
622 	}
623 	case LGRP_CONFIG_GEN_UPDATE:
624 		atomic_add_32(&lgrp_gen, 1);
625 
626 		break;
627 	case LGRP_CONFIG_FLATTEN:
628 		if (where == 0)
629 			lgrp_topo_levels = (int)resource;
630 		else
631 			(void) lgrp_topo_flatten(resource,
632 			    lgrp_table, lgrp_alloc_max, &changed);
633 
634 		break;
635 	/*
636 	 * Update any lgroups with old latency to new latency
637 	 */
638 	case LGRP_CONFIG_LAT_CHANGE_ALL:
639 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
640 		    (u_longlong_t)where);
641 
642 		break;
643 	/*
644 	 * Update lgroup with specified lgroup platform handle to have
645 	 * new latency
646 	 */
647 	case LGRP_CONFIG_LAT_CHANGE:
648 		lgrp_latency_change((lgrp_handle_t)resource, 0,
649 		    (u_longlong_t)where);
650 
651 		break;
652 	case LGRP_CONFIG_NOP:
653 
654 		break;
655 	default:
656 		break;
657 	}
658 
659 }
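
/*
 * Example invocations (sketch based on the cases handled above): the
 * memnode subsystem reports memory configuration changes as
 *
 *	lgrp_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)mnode, (uintptr_t)hand);
 *	lgrp_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)mnode, (uintptr_t)hand);
 *
 * where "mnode" is the memory node id and "hand" is the platform handle
 * of the lgroup gaining or losing that memory.
 */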
660 
661 /*
662  * Called to add lgrp info into cpu structure from cpu_add_unit;
663  * do not assume cpu is in cpu[] yet!
664  *
665  * CPUs are brought online with all other CPUs paused so we can't
666  * allocate memory or we could deadlock the system, so we rely on
667  * the platform to statically allocate as much space as we need
668  * for the lgrp structs and stats.
669  */
670 static void
671 lgrp_cpu_init(struct cpu *cp)
672 {
673 	klgrpset_t	changed;
674 	int		count;
675 	lgrp_handle_t	hand;
676 	int		first_cpu;
677 	lgrp_t		*my_lgrp;
678 	lgrp_id_t	lgrpid;
679 	struct cpu	*cptr;
680 
681 	/*
682 	 * This is the first time through if the resource set
683 	 * for the root lgroup is empty. After cpu0 has been
684 	 * initially added to an lgroup, the root's CPU resource
685 	 * set can never be empty, since the system's last CPU
686 	 * cannot be offlined.
687 	 */
688 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
689 		/*
690 		 * First time through.
691 		 */
692 		first_cpu = 1;
693 	} else {
694 		/*
695 		 * If cpu0 needs to move lgroups, we may come
696 		 * through here again, at which time cpu_lock won't
697 		 * be held, and lgrp_initialized will be false.
698 		 */
699 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
700 		ASSERT(cp->cpu_part != NULL);
701 		first_cpu = 0;
702 	}
703 
704 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
705 	my_lgrp = lgrp_hand_to_lgrp(hand);
706 
707 	if (my_lgrp == NULL) {
708 		/*
709 		 * Create new lgrp and add it to lgroup topology
710 		 */
711 		my_lgrp = lgrp_create();
712 		my_lgrp->lgrp_plathand = hand;
713 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
714 		lgrpid = my_lgrp->lgrp_id;
715 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
716 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
717 
718 		count = 0;
719 		klgrpset_clear(changed);
720 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
721 		    &changed);
722 		/*
723 		 * May have added new intermediate lgroups, so need to add
724 		 * resources other than CPUs which are added below
725 		 */
726 		(void) lgrp_mnode_update(changed, NULL);
727 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
728 	    > 0) {
729 		/*
730 		 * Leaf lgroup was created, but latency wasn't available
731 		 * then.  So, set latency for it and fill in rest of lgroup
732 		 * topology  now that we know how far it is from other leaf
733 		 * lgroups.
734 		 */
735 		lgrpid = my_lgrp->lgrp_id;
736 		klgrpset_clear(changed);
737 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
738 		    lgrpid))
739 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
740 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
741 		    &changed);
742 
743 		/*
744 		 * May have added new intermediate lgroups, so need to add
745 		 * resources other than CPUs which are added below
746 		 */
747 		(void) lgrp_mnode_update(changed, NULL);
748 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
749 	    my_lgrp->lgrp_id)) {
750 		int	i;
751 
752 		/*
753 		 * Update existing lgroup and lgroups containing it with CPU
754 		 * resource
755 		 */
756 		lgrpid = my_lgrp->lgrp_id;
757 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
758 		for (i = 0; i <= lgrp_alloc_max; i++) {
759 			lgrp_t		*lgrp;
760 
761 			lgrp = lgrp_table[i];
762 			if (!LGRP_EXISTS(lgrp) ||
763 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
764 				continue;
765 
766 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
767 		}
768 	}
769 
770 	lgrpid = my_lgrp->lgrp_id;
771 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
772 
773 	/*
774 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
775 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
776 	 * not, since none of the lgroup IDs in the lpls have been set yet.
777 	 */
778 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
779 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
780 
781 	/*
782 	 * link the CPU into the lgrp's CPU list
783 	 */
784 	if (my_lgrp->lgrp_cpucnt == 0) {
785 		my_lgrp->lgrp_cpu = cp;
786 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
787 	} else {
788 		cptr = my_lgrp->lgrp_cpu;
789 		cp->cpu_next_lgrp = cptr;
790 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
791 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
792 		cptr->cpu_prev_lgrp = cp;
793 	}
794 	my_lgrp->lgrp_cpucnt++;
795 }
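
/*
 * Sketch (illustrative): the circular, doubly-linked per-lgroup CPU list
 * built above can be walked the way lgrp_sum_loadavgs() does below, e.g.
 * to visit every CPU in an lgroup while holding cpu_lock:
 *
 *	cpu_t	*cpu = lgrp->lgrp_cpu;
 *
 *	do {
 *		(operate on cpu)
 *		cpu = cpu->cpu_next_lgrp;
 *	} while (cpu != lgrp->lgrp_cpu);
 */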
796 
797 lgrp_t *
798 lgrp_create(void)
799 {
800 	lgrp_t		*my_lgrp;
801 	lgrp_id_t	lgrpid;
802 	int		i;
803 
804 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
805 
806 	/*
807 	 * Find an open slot in the lgroup table and recycle unused lgroup
808 	 * left there if any
809 	 */
810 	my_lgrp = NULL;
811 	if (lgrp_alloc_hint == -1)
812 		/*
813 		 * Allocate from end when hint not set yet because no lgroups
814 		 * have been deleted yet
815 		 */
816 		lgrpid = nlgrps++;
817 	else {
818 		/*
819 		 * Start looking for next open slot from hint and leave hint
820 		 * at slot allocated
821 		 */
822 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
823 			my_lgrp = lgrp_table[i];
824 			if (!LGRP_EXISTS(my_lgrp)) {
825 				lgrpid = i;
826 				nlgrps++;
827 				break;
828 			}
829 		}
830 		lgrp_alloc_hint = lgrpid;
831 	}
832 
833 	/*
834 	 * Keep track of max lgroup ID allocated so far to cut down on searches
835 	 */
836 	if (lgrpid > lgrp_alloc_max)
837 		lgrp_alloc_max = lgrpid;
838 
839 	/*
840 	 * Need to allocate new lgroup if next open slot didn't have one
841 	 * for recycling
842 	 */
843 	if (my_lgrp == NULL)
844 		my_lgrp = lgrp_plat_alloc(lgrpid);
845 
846 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
847 		panic("Too many lgrps for platform (%d)", nlgrps);
848 
849 	my_lgrp->lgrp_id = lgrpid;
850 	my_lgrp->lgrp_latency = 0;
851 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
852 	my_lgrp->lgrp_parent = NULL;
853 	my_lgrp->lgrp_childcnt = 0;
854 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
855 	my_lgrp->lgrp_nmnodes = 0;
856 	klgrpset_clear(my_lgrp->lgrp_children);
857 	klgrpset_clear(my_lgrp->lgrp_leaves);
858 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
859 		klgrpset_clear(my_lgrp->lgrp_set[i]);
860 
861 	my_lgrp->lgrp_cpu = NULL;
862 	my_lgrp->lgrp_cpucnt = 0;
863 
864 	if (my_lgrp->lgrp_kstat != NULL)
865 		lgrp_kstat_reset(lgrpid);
866 
867 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
868 
869 	return (my_lgrp);
870 }
871 
872 void
873 lgrp_destroy(lgrp_t *lgrp)
874 {
875 	int		i;
876 
877 	/*
878 	 * Unless this lgroup is being destroyed on behalf of
879 	 * the boot CPU, cpu_lock must be held
880 	 */
881 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
882 
883 	if (nlgrps == 1)
884 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
885 
886 	if (!LGRP_EXISTS(lgrp))
887 		return;
888 
889 	/*
890 	 * Set hint to lgroup being deleted and try to keep lower numbered
891 	 * hints to facilitate finding empty slots
892 	 */
893 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
894 		lgrp_alloc_hint = lgrp->lgrp_id;
895 
896 	/*
897 	 * Mark this lgroup to be recycled by setting its lgroup ID to
898 	 * LGRP_NONE and clear relevant fields
899 	 */
900 	lgrp->lgrp_id = LGRP_NONE;
901 	lgrp->lgrp_latency = 0;
902 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
903 	lgrp->lgrp_parent = NULL;
904 	lgrp->lgrp_childcnt = 0;
905 
906 	klgrpset_clear(lgrp->lgrp_children);
907 	klgrpset_clear(lgrp->lgrp_leaves);
908 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
909 		klgrpset_clear(lgrp->lgrp_set[i]);
910 
911 	lgrp->lgrp_mnodes = (mnodeset_t)0;
912 	lgrp->lgrp_nmnodes = 0;
913 
914 	lgrp->lgrp_cpu = NULL;
915 	lgrp->lgrp_cpucnt = 0;
916 
917 	nlgrps--;
918 }
919 
920 /*
921  * Initialize kstat data. Called from lgrp initialization code.
922  */
923 static void
924 lgrp_kstat_init(void)
925 {
926 	lgrp_stat_t	stat;
927 
928 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
929 
930 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
931 		kstat_named_init(&lgrp_kstat_data[stat],
932 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
933 }
934 
935 /*
936  * initialize an lgrp's kstats if needed
937  * called with cpu_lock held but not with cpus paused.
938  * we don't tear these down now because we don't know about
939  * memory leaving the lgrp yet...
940  */
941 
942 void
943 lgrp_kstat_create(cpu_t *cp)
944 {
945 	kstat_t		*lgrp_kstat;
946 	lgrp_id_t	lgrpid;
947 	lgrp_t		*my_lgrp;
948 
949 	ASSERT(MUTEX_HELD(&cpu_lock));
950 
951 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
952 	my_lgrp = lgrp_table[lgrpid];
953 
954 	if (my_lgrp->lgrp_kstat != NULL)
955 		return; /* already initialized */
956 
957 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
958 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
959 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
960 
961 	if (lgrp_kstat != NULL) {
962 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
963 		lgrp_kstat->ks_private = my_lgrp;
964 		lgrp_kstat->ks_data = &lgrp_kstat_data;
965 		lgrp_kstat->ks_update = lgrp_kstat_extract;
966 		my_lgrp->lgrp_kstat = lgrp_kstat;
967 		kstat_install(lgrp_kstat);
968 	}
969 }
970 
971 /*
972  * this will do something when we manage to remove now unused lgrps
973  */
974 
975 /* ARGSUSED */
976 void
977 lgrp_kstat_destroy(cpu_t *cp)
978 {
979 	ASSERT(MUTEX_HELD(&cpu_lock));
980 }
981 
982 /*
983  * Called when a CPU is off-lined.
984  */
985 static void
986 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
987 {
988 	lgrp_t *my_lgrp;
989 	struct cpu *prev;
990 	struct cpu *next;
991 
992 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
993 
994 	prev = cp->cpu_prev_lgrp;
995 	next = cp->cpu_next_lgrp;
996 
997 	prev->cpu_next_lgrp = next;
998 	next->cpu_prev_lgrp = prev;
999 
1000 	/*
1001 	 * just because I'm paranoid doesn't mean...
1002 	 */
1003 
1004 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1005 
1006 	my_lgrp = lgrp_table[lgrpid];
1007 	my_lgrp->lgrp_cpucnt--;
1008 
1009 	/*
1010 	 * Removing last CPU in lgroup, so update lgroup topology
1011 	 */
1012 	if (my_lgrp->lgrp_cpucnt == 0) {
1013 		klgrpset_t	changed;
1014 		int		count;
1015 		int		i;
1016 
1017 		my_lgrp->lgrp_cpu = NULL;
1018 
1019 		/*
1020 		 * Remove this lgroup from its lgroup CPU resources and remove
1021 		 * lgroup from lgroup topology if it doesn't have any more
1022 		 * resources in it now
1023 		 */
1024 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1025 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1026 			count = 0;
1027 			klgrpset_clear(changed);
1028 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1029 			    lgrp_alloc_max + 1, &changed);
1030 			return;
1031 		}
1032 
1033 		/*
1034 		 * This lgroup isn't empty, so just remove it from CPU
1035 		 * resources of any lgroups that contain it as such
1036 		 */
1037 		for (i = 0; i <= lgrp_alloc_max; i++) {
1038 			lgrp_t		*lgrp;
1039 
1040 			lgrp = lgrp_table[i];
1041 			if (!LGRP_EXISTS(lgrp) ||
1042 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1043 			    lgrpid))
1044 				continue;
1045 
1046 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1047 		}
1048 		return;
1049 	}
1050 
1051 	if (my_lgrp->lgrp_cpu == cp)
1052 		my_lgrp->lgrp_cpu = next;
1053 
1054 }
1055 
1056 /*
1057  * Update memory nodes in target lgroups and return ones that get changed
1058  */
1059 int
1060 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1061 {
1062 	int	count;
1063 	int	i;
1064 	int	j;
1065 	lgrp_t	*lgrp;
1066 	lgrp_t	*lgrp_rsrc;
1067 
1068 	count = 0;
1069 	if (changed)
1070 		klgrpset_clear(*changed);
1071 
1072 	if (klgrpset_isempty(target))
1073 		return (0);
1074 
1075 	/*
1076 	 * Find each lgroup in target lgroups
1077 	 */
1078 	for (i = 0; i <= lgrp_alloc_max; i++) {
1079 		/*
1080 		 * Skip any lgroups that don't exist or aren't in target group
1081 		 */
1082 		lgrp = lgrp_table[i];
1083 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1084 			continue;
1085 		}
1086 
1087 		/*
1088 		 * Initialize memnodes for intermediate lgroups to 0
1089 		 * and update them from scratch since they may have completely
1090 		 * changed
1091 		 */
1092 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1093 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1094 			lgrp->lgrp_nmnodes = 0;
1095 		}
1096 
1097 		/*
1098 		 * Update memory nodes of target lgroup with memory nodes
1099 		 * from each lgroup in its lgroup memory resource set
1100 		 */
1101 		for (j = 0; j <= lgrp_alloc_max; j++) {
1102 			int	k;
1103 
1104 			/*
1105 			 * Skip any lgroups that don't exist or aren't in
1106 			 * memory resources of target lgroup
1107 			 */
1108 			lgrp_rsrc = lgrp_table[j];
1109 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1110 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1111 			    j))
1112 				continue;
1113 
1114 			/*
1115 			 * Update target lgroup's memnodes to include memnodes
1116 			 * of this lgroup
1117 			 */
1118 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1119 				mnodeset_t	mnode_mask;
1120 
1121 				mnode_mask = (mnodeset_t)1 << k;
1122 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1123 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1124 					lgrp->lgrp_mnodes |= mnode_mask;
1125 					lgrp->lgrp_nmnodes++;
1126 				}
1127 			}
1128 			count++;
1129 			if (changed)
1130 				klgrpset_add(*changed, lgrp->lgrp_id);
1131 		}
1132 	}
1133 
1134 	return (count);
1135 }
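
/*
 * Sketch (illustrative): lgrp_mnodes is a plain bitmask of memory nodes,
 * so membership tests and counting follow the pattern used above:
 *
 *	mnodeset_t	mask = (mnodeset_t)1 << mnode;
 *	int		nmnodes = 0;
 *	int		k;
 *
 *	if (lgrp->lgrp_mnodes & mask)
 *		(lgrp contains memory node "mnode")
 *
 *	for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++)
 *		if (lgrp->lgrp_mnodes & ((mnodeset_t)1 << k))
 *			nmnodes++;	(should equal lgrp_nmnodes)
 */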
1136 
1137 /*
1138  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1139  * is moved from one board to another. The "from" and "to" arguments specify the
1140  * source and the destination of the move.
1141  *
1142  * See plat_lgrp_config() for a detailed description of the copy-rename
1143  * semantics.
1144  *
1145  * lgrp_mem_rename() is called by the platform copy-rename code to update
1146  * the lgroup topology which is changing as memory moves from one lgroup to
1147  * another. It removes the mnode from the source lgroup and re-inserts it in the
1148  * target lgroup.
1149  *
1150  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1151  * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
1152  * copy-rename operation.
1153  *
1154  * There is one case which requires special handling. If the system contains
1155  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1156  * lgroup hierarchy. This mnode is soon re-inserted in the hierarchy by
1157  * lgrp_mem_init(), but there is a window when the system has no memory in the
1158  * lgroup hierarchy. If another thread tries to allocate memory during this
1159  * window, the allocation will fail, although the system has physical memory.
1160  * This may cause a system panic or a deadlock (some sleeping memory allocations
1161  * happen with cpu_lock held, which prevents lgrp_mem_init() from re-inserting
1162  * the mnode).
1163  *
1164  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1165  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1166  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1167  * but it updates the rest of the lgroup topology as if the mnode was actually
1168  * removed. The lgrp_mem_init() function recognizes that the mnode being
1169  * inserted represents such a special case and updates the topology
1170  * appropriately.
1171  */
1172 void
1173 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1174 {
1175 	/*
1176 	 * Remove the memory from the source node and add it to the destination
1177 	 * node.
1178 	 */
1179 	lgrp_mem_fini(mnode, from, B_TRUE);
1180 	lgrp_mem_init(mnode, to, B_TRUE);
1181 }
1182 
1183 /*
1184  * Called to indicate that the lgrp with platform handle "hand" now
1185  * contains the memory identified by "mnode".
1186  *
1187  * LOCKING for this routine is a bit tricky. Usually it is called without
1188  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1189  * callers. During DR of the board containing the caged memory it may be called
1190  * with cpu_lock already held and CPUs paused.
1191  *
1192  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1193  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1194  * dealing with the special case of DR copy-rename described in
1195  * lgrp_mem_rename().
1196  */
1197 void
1198 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1199 {
1200 	klgrpset_t	changed;
1201 	int		count;
1202 	int		i;
1203 	lgrp_t		*my_lgrp;
1204 	lgrp_id_t	lgrpid;
1205 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1206 	boolean_t	drop_lock = B_FALSE;
1207 	boolean_t	need_synch = B_FALSE;
1208 
1209 	/*
1210 	 * Grab CPU lock (if we haven't already)
1211 	 */
1212 	if (!MUTEX_HELD(&cpu_lock)) {
1213 		mutex_enter(&cpu_lock);
1214 		drop_lock = B_TRUE;
1215 	}
1216 
1217 	/*
1218 	 * This routine may be called from a context where we already
1219 	 * hold cpu_lock, and have already paused cpus.
1220 	 */
1221 	if (!cpus_paused())
1222 		need_synch = B_TRUE;
1223 
1224 	/*
1225 	 * Check if this mnode is already configured and return immediately if
1226 	 * it is.
1227 	 *
1228 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1229 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1230 	 * recognize this case and continue as usual, but skip the update to
1231 	 * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
1232 	 * in the topology temporarily introduced by lgrp_mem_fini().
1233 	 */
1234 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1235 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1236 		if (drop_lock)
1237 			mutex_exit(&cpu_lock);
1238 		return;
1239 	}
1240 
1241 	/*
1242 	 * Update lgroup topology with new memory resources, keeping track of
1243 	 * which lgroups change
1244 	 */
1245 	count = 0;
1246 	klgrpset_clear(changed);
1247 	my_lgrp = lgrp_hand_to_lgrp(hand);
1248 	if (my_lgrp == NULL) {
1249 		/* new lgrp */
1250 		my_lgrp = lgrp_create();
1251 		lgrpid = my_lgrp->lgrp_id;
1252 		my_lgrp->lgrp_plathand = hand;
1253 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1254 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1255 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1256 
1257 		if (need_synch)
1258 			pause_cpus(NULL);
1259 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1260 		    &changed);
1261 		if (need_synch)
1262 			start_cpus();
1263 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1264 	    > 0) {
1265 		/*
1266 		 * Leaf lgroup was created, but latency wasn't available
1267 		 * then.  So, set latency for it and fill in rest of lgroup
1268 		 * topology  now that we know how far it is from other leaf
1269 		 * lgroups.
1270 		 */
1271 		klgrpset_clear(changed);
1272 		lgrpid = my_lgrp->lgrp_id;
1273 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1274 		    lgrpid))
1275 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1276 		if (need_synch)
1277 			pause_cpus(NULL);
1278 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1279 		    &changed);
1280 		if (need_synch)
1281 			start_cpus();
1282 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1283 	    my_lgrp->lgrp_id)) {
1284 		/*
1285 		 * Add new lgroup memory resource to existing lgroup
1286 		 */
1287 		lgrpid = my_lgrp->lgrp_id;
1288 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1289 		klgrpset_add(changed, lgrpid);
1290 		count++;
1291 		for (i = 0; i <= lgrp_alloc_max; i++) {
1292 			lgrp_t		*lgrp;
1293 
1294 			lgrp = lgrp_table[i];
1295 			if (!LGRP_EXISTS(lgrp) ||
1296 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1297 				continue;
1298 
1299 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1300 			klgrpset_add(changed, lgrp->lgrp_id);
1301 			count++;
1302 		}
1303 	}
1304 
1305 	/*
1306 	 * Add memory node to lgroup and remove lgroup from ones that need
1307 	 * to be updated
1308 	 */
1309 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1310 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1311 		my_lgrp->lgrp_nmnodes++;
1312 	}
1313 	klgrpset_del(changed, lgrpid);
1314 
1315 	/*
1316 	 * Update memory node information for all lgroups that changed and
1317 	 * contain new memory node as a resource
1318 	 */
1319 	if (count)
1320 		(void) lgrp_mnode_update(changed, NULL);
1321 
1322 	if (drop_lock)
1323 		mutex_exit(&cpu_lock);
1324 }
1325 
1326 /*
1327  * Called to indicate that the lgroup associated with the platform
1328  * handle "hand" no longer contains given memory node
1329  *
1330  * LOCKING for this routine is a bit tricky. Usually it is called without
1331  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1332  * callers. During DR of the board containing the caged memory it may be called
1333  * with cpu_lock already held and CPUs paused.
1334  *
1335  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1336  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1337  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1338  * the same mnode back into the topology. See lgrp_mem_rename() and
1339  * lgrp_mem_init() for additional details.
1340  */
1341 void
1342 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1343 {
1344 	klgrpset_t	changed;
1345 	int		count;
1346 	int		i;
1347 	lgrp_t		*my_lgrp;
1348 	lgrp_id_t	lgrpid;
1349 	mnodeset_t	mnodes_mask;
1350 	boolean_t	drop_lock = B_FALSE;
1351 	boolean_t	need_synch = B_FALSE;
1352 
1353 	/*
1354 	 * Grab CPU lock (if we haven't already)
1355 	 */
1356 	if (!MUTEX_HELD(&cpu_lock)) {
1357 		mutex_enter(&cpu_lock);
1358 		drop_lock = B_TRUE;
1359 	}
1360 
1361 	/*
1362 	 * This routine may be called from a context where we already
1363 	 * hold cpu_lock and have already paused cpus.
1364 	 */
1365 	if (!cpus_paused())
1366 		need_synch = B_TRUE;
1367 
1368 	my_lgrp = lgrp_hand_to_lgrp(hand);
1369 
1370 	/*
1371 	 * The lgrp *must* be pre-existing
1372 	 */
1373 	ASSERT(my_lgrp != NULL);
1374 
1375 	/*
1376 	 * Delete memory node from lgroups which contain it
1377 	 */
1378 	mnodes_mask = ((mnodeset_t)1 << mnode);
1379 	for (i = 0; i <= lgrp_alloc_max; i++) {
1380 		lgrp_t *lgrp = lgrp_table[i];
1381 		/*
1382 		 * Skip any non-existent lgroups and any lgroups that don't
1383 		 * contain the memory node being removed
1384 		 */
1385 		if (!LGRP_EXISTS(lgrp) ||
1386 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1387 			continue;
1388 
1389 		/*
1390 		 * Avoid removing the last mnode from the root in the DR
1391 		 * copy-rename case. See lgrp_mem_rename() for details.
1392 		 */
1393 		if (is_copy_rename &&
1394 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1395 			continue;
1396 
1397 		/*
1398 		 * Remove memory node from lgroup.
1399 		 */
1400 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1401 		lgrp->lgrp_nmnodes--;
1402 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1403 	}
1404 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1405 
1406 	/*
1407 	 * Don't need to update lgroup topology if this lgroup still has memory.
1408 	 *
1409 	 * In the special case of DR copy-rename with the only mnode being
1410 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1411 	 * still need to update the lgroup topology.
1412 	 */
1413 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1414 	    !(is_copy_rename &&
1415 		(my_lgrp == lgrp_root) &&
1416 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1417 		if (drop_lock)
1418 			mutex_exit(&cpu_lock);
1419 		return;
1420 	}
1421 
1422 	/*
1423 	 * This lgroup does not contain any memory now
1424 	 */
1425 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1426 
1427 	/*
1428 	 * Remove this lgroup from lgroup topology if it does not contain any
1429 	 * resources now
1430 	 */
1431 	lgrpid = my_lgrp->lgrp_id;
1432 	count = 0;
1433 	klgrpset_clear(changed);
1434 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1435 		/*
1436 		 * Delete lgroup when no more resources
1437 		 */
1438 		if (need_synch)
1439 			pause_cpus(NULL);
1440 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1441 		    lgrp_alloc_max + 1, &changed);
1442 		ASSERT(count > 0);
1443 		if (need_synch)
1444 			start_cpus();
1445 	} else {
1446 		/*
1447 		 * Remove lgroup from memory resources of any lgroups that
1448 		 * contain it as such
1449 		 */
1450 		for (i = 0; i <= lgrp_alloc_max; i++) {
1451 			lgrp_t		*lgrp;
1452 
1453 			lgrp = lgrp_table[i];
1454 			if (!LGRP_EXISTS(lgrp) ||
1455 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1456 			    lgrpid))
1457 				continue;
1458 
1459 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1460 		}
1461 	}
1462 	if (drop_lock)
1463 		mutex_exit(&cpu_lock);
1464 }
1465 
1466 /*
1467  * Return lgroup with given platform handle
1468  */
1469 lgrp_t *
1470 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1471 {
1472 	int	i;
1473 	lgrp_t	*lgrp;
1474 
1475 	if (hand == LGRP_NULL_HANDLE)
1476 		return (NULL);
1477 
1478 	for (i = 0; i <= lgrp_alloc_max; i++) {
1479 		lgrp = lgrp_table[i];
1480 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1481 			return (lgrp);
1482 	}
1483 	return (NULL);
1484 }
1485 
1486 /*
1487  * Return the home lgroup of the current thread.
1488  * We must do this with kernel preemption disabled, since we don't want our
1489  * thread to be re-homed while we're poking around with its lpl, and the lpl
1490  * should never be NULL.
1491  *
1492  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1493  * is enabled because of DR.  Callers can disable kernel preemption
1494  * around this call to guarantee that the lgroup will be valid beyond this
1495  * routine, since kernel preemption can be recursive.
1496  */
1497 lgrp_t *
1498 lgrp_home_lgrp(void)
1499 {
1500 	lgrp_t	*lgrp;
1501 	lpl_t	*lpl;
1502 
1503 	kpreempt_disable();
1504 
1505 	lpl = curthread->t_lpl;
1506 	ASSERT(lpl != NULL);
1507 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1508 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1509 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1510 
1511 	kpreempt_enable();
1512 
1513 	return (lgrp);
1514 }
1515 
1516 /*
1517  * Return ID of home lgroup for given thread
1518  * (See comments for lgrp_home_lgrp() for special care and handling
1519  * instructions)
1520  */
1521 lgrp_id_t
1522 lgrp_home_id(kthread_t *t)
1523 {
1524 	lgrp_id_t	lgrp;
1525 	lpl_t		*lpl;
1526 
1527 	ASSERT(t != NULL);
1528 	/*
1529 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1530 	 * cannot since the HAT layer can call into this routine to
1531 	 * determine the locality for its data structures in the context
1532 	 * of a page fault.
1533 	 */
1534 
1535 	kpreempt_disable();
1536 
1537 	lpl = t->t_lpl;
1538 	ASSERT(lpl != NULL);
1539 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1540 	lgrp = lpl->lpl_lgrpid;
1541 
1542 	kpreempt_enable();
1543 
1544 	return (lgrp);
1545 }
1546 
1547 /*
1548  * Return lgroup containing the physical memory for the given page frame number
1549  */
1550 lgrp_t *
1551 lgrp_pfn_to_lgrp(pfn_t pfn)
1552 {
1553 	lgrp_handle_t	hand;
1554 	int		i;
1555 	lgrp_t		*lgrp;
1556 
1557 	hand = lgrp_plat_pfn_to_hand(pfn);
1558 	if (hand != LGRP_NULL_HANDLE)
1559 		for (i = 0; i <= lgrp_alloc_max; i++) {
1560 			lgrp = lgrp_table[i];
1561 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1562 				return (lgrp);
1563 		}
1564 	return (NULL);
1565 }
1566 
1567 /*
1568  * Return lgroup containing the physical memory at the given physical address
1569  */
1570 lgrp_t *
1571 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1572 {
1573 	lgrp_handle_t	hand;
1574 	int		i;
1575 	lgrp_t		*lgrp;
1576 	pfn_t		pfn;
1577 
1578 	pfn = btop(physaddr);
1579 	hand = lgrp_plat_pfn_to_hand(pfn);
1580 	if (hand != LGRP_NULL_HANDLE)
1581 		for (i = 0; i <= lgrp_alloc_max; i++) {
1582 			lgrp = lgrp_table[i];
1583 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1584 				return (lgrp);
1585 		}
1586 	return (NULL);
1587 }
1588 
1589 /*
1590  * Return the leaf lgroup containing the given CPU
1591  *
1592  * The caller needs to take precautions necessary to prevent
1593  * "cpu" and its lpl from going away across a call to this function.
1594  * hint: kpreempt_disable()/kpreempt_enable()
1595  */
1596 static lgrp_t *
1597 lgrp_cpu_to_lgrp(cpu_t *cpu)
1598 {
1599 	return (cpu->cpu_lpl->lpl_lgrp);
1600 }
1601 
1602 /*
1603  * Return the sum of the partition loads in an lgrp divided by
1604  * the number of CPUs in the lgrp.  This is our best approximation
1605  * of an 'lgroup load average' for a useful per-lgroup kstat.
1606  */
1607 static uint64_t
1608 lgrp_sum_loadavgs(lgrp_t *lgrp)
1609 {
1610 	cpu_t *cpu;
1611 	int ncpu;
1612 	uint64_t loads = 0;
1613 
1614 	mutex_enter(&cpu_lock);
1615 
1616 	cpu = lgrp->lgrp_cpu;
1617 	ncpu = lgrp->lgrp_cpucnt;
1618 
1619 	if (cpu == NULL || ncpu == 0) {
1620 		mutex_exit(&cpu_lock);
1621 		return (0ull);
1622 	}
1623 
1624 	do {
1625 		loads += cpu->cpu_lpl->lpl_loadavg;
1626 		cpu = cpu->cpu_next_lgrp;
1627 	} while (cpu != lgrp->lgrp_cpu);
1628 
1629 	mutex_exit(&cpu_lock);
1630 
1631 	return (loads / ncpu);
1632 }
1633 
1634 void
1635 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1636 {
1637 	struct lgrp_stats *pstats;
1638 
1639 	/*
1640 	 * Verify that the caller isn't trying to add to
1641 	 * a statistic for an lgroup that has gone away
1642 	 */
1643 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1644 		return;
1645 
1646 	pstats = &lgrp_stats[lgrpid];
1647 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1648 }
1649 
1650 int64_t
1651 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1652 {
1653 	uint64_t val;
1654 	struct lgrp_stats *pstats;
1655 
1656 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1657 		return ((int64_t)0);
1658 
1659 	pstats = &lgrp_stats[lgrpid];
1660 	LGRP_STAT_READ(pstats, stat, val);
1661 	return (val);
1662 }
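
/*
 * Usage sketch (illustrative, with "stat" standing for some lgrp_stat_t
 * value): a caller can bump a counter for the current thread's home
 * lgroup and read it back by lgroup id, e.g.
 *
 *	lgrp_stat_add(lgrp_home_id(curthread), stat, 1);
 *	val = lgrp_stat_read(lgrp_home_id(curthread), stat);
 *
 * Both routines silently ignore ids outside [0, lgrp_alloc_max], as
 * checked above.
 */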
1663 
1664 /*
1665  * Reset all kstats for lgrp specified by its lgrpid.
1666  */
1667 static void
1668 lgrp_kstat_reset(lgrp_id_t lgrpid)
1669 {
1670 	lgrp_stat_t stat;
1671 
1672 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1673 		return;
1674 
1675 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1676 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1677 	}
1678 }
1679 
1680 /*
1681  * Collect all per-lgrp statistics for the lgrp associated with this
1682  * kstat, and store them in the ks_data array.
1683  *
1684  * The superuser can reset all the running counter statistics for an
1685  * lgrp by writing to any of the lgrp's stats.
1686  */
1687 static int
1688 lgrp_kstat_extract(kstat_t *ksp, int rw)
1689 {
1690 	lgrp_stat_t		stat;
1691 	struct kstat_named	*ksd;
1692 	lgrp_t			*lgrp;
1693 	lgrp_id_t		lgrpid;
1694 
1695 	lgrp = (lgrp_t *)ksp->ks_private;
1696 
1697 	ksd = (struct kstat_named *)ksp->ks_data;
1698 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1699 
1700 	lgrpid = lgrp->lgrp_id;
1701 
1702 	if (lgrpid == LGRP_NONE) {
1703 		/*
1704 		 * Return all zeroes as stats for freed lgrp.
1705 		 */
1706 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1707 			ksd[stat].value.i64 = 0;
1708 		}
1709 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1710 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1711 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1712 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1713 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1714 	} else if (rw != KSTAT_WRITE) {
1715 		/*
1716 		 * Handle counter stats
1717 		 */
1718 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1719 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1720 		}
1721 
1722 		/*
1723 		 * Handle kernel data snapshot stats
1724 		 */
1725 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1726 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1727 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1728 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1729 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1730 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1731 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1732 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1733 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1734 		    lgrp_loadavg_max_effect;
1735 	} else {
1736 		lgrp_kstat_reset(lgrpid);
1737 	}
1738 
1739 	return (0);
1740 }
1741 
1742 int
1743 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1744 {
1745 	cpu_t	*cp;
1746 
1747 	mutex_enter(&cpu_lock);
1748 
1749 	if ((cp = cpu_get(id)) == NULL) {
1750 		mutex_exit(&cpu_lock);
1751 		return (EINVAL);
1752 	}
1753 
1754 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1755 		mutex_exit(&cpu_lock);
1756 		return (EINVAL);
1757 	}
1758 
1759 	ASSERT(cp->cpu_lpl != NULL);
1760 
1761 	*lp = cp->cpu_lpl->lpl_lgrpid;
1762 
1763 	mutex_exit(&cpu_lock);
1764 
1765 	return (0);
1766 }
1767 
1768 int
1769 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1770 {
1771 	cpu_t *cp;
1772 
1773 	mutex_enter(&cpu_lock);
1774 
1775 	if ((cp = cpu_get(id)) == NULL) {
1776 		mutex_exit(&cpu_lock);
1777 		return (EINVAL);
1778 	}
1779 
1780 	ASSERT(cp->cpu_lpl != NULL);
1781 
1782 	*lp = cp->cpu_lpl->lpl_loadavg;
1783 
1784 	mutex_exit(&cpu_lock);
1785 
1786 	return (0);
1787 }
1788 
1789 /*
1790  * Add a resource named by lpl_leaf to rset of lpl_target
1791  *
1792  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1793  * resource. It is adjusted here, as this is presently the only place that we
1794  * can be certain a resource addition has succeeded.
1795  *
1796  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1797  * list in order until it reaches a NULL.  (This list is required to be NULL
1798  * terminated, too).  This is done so that we can mark start pos + 1, so that
1799  * each lpl is traversed sequentially, but in a different order.  We hope this
1800  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1801  */
1802 
1803 void
1804 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1805 {
1806 	int		i;
1807 	int		entry_slot = 0;
1808 
1809 	/* return if leaf is already present */
1810 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1811 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1812 			return;
1813 		}
1814 
1815 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1816 		    lpl_leaf->lpl_lgrpid) {
1817 			break;
1818 		}
1819 	}
1820 
1821 	/* insert leaf, update counts */
1822 	entry_slot = i;
1823 	i = lpl_target->lpl_nrset++;
1824 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1825 		panic("More leaf lgrps in system than are supported!\n");
1826 	}
1827 
1828 	/*
1829 	 * Start at the end of the rset array and work backwards towards the
1830 	 * slot into which the new lpl will be inserted. This effectively
1831 	 * preserves the current ordering by scooting everybody over one entry,
1832 	 * and placing the new entry into the space created.
1833 	 */
1834 
1835 	while (i-- > entry_slot) {
1836 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1837 	}
1838 
1839 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1840 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1841 }
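
/*
 * Editorial example (an illustration added by the editor, not original
 * documentation): if lpl_target's rset currently names the lpls of lgroups
 * { 1, 3, 7 } (NULL terminated) and lpl_leaf belongs to lgroup 5, the scan
 * above stops at the slot holding lgroup 7, the entries from that slot on are
 * scooted one position to the right, and the result is { 1, 3, 5, 7 } with
 * lpl_nrset incremented and lpl_ncpu raised by lpl_leaf's CPU count.
 */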
1842 
1843 /*
1844  * Update each of lpl_parent's children with a proper hint and
1845  * a reference to their parent.
1846  * The lgrp topology is used as the reference since it is fully
1847  * consistent and correct at this point.
1848  *
1849  * Each child's hint will reference an element in lpl_parent's
1850  * rset that designates where the child should start searching
1851  * for CPU resources. The hint selected is the highest order leaf present
1852  * in the child's lineage.
1853  *
1854  * This should be called after any potential change in lpl_parent's
1855  * rset.
1856  */
1857 static void
1858 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1859 {
1860 	klgrpset_t	children, leaves;
1861 	lpl_t		*lpl;
1862 	int		hint;
1863 	int		i, j;
1864 
1865 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1866 	if (klgrpset_isempty(children))
1867 		return; /* nothing to do */
1868 
1869 	for (i = 0; i <= lgrp_alloc_max; i++) {
1870 		if (klgrpset_ismember(children, i)) {
1871 
1872 			/*
1873 			 * Given the set of leaves in this child's lineage,
1874 			 * find the highest order leaf present in the parent's
1875 			 * rset. Select this as the hint for the child.
1876 			 */
1877 			leaves = lgrp_table[i]->lgrp_leaves;
1878 			hint = 0;
1879 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1880 				lpl = lpl_parent->lpl_rset[j];
1881 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1882 					hint = j;
1883 			}
1884 			cp->cp_lgrploads[i].lpl_hint = hint;
1885 
1886 			/*
1887 			 * (Re)set the parent. It may be incorrect if
1888 			 * lpl_parent is new in the topology.
1889 			 */
1890 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1891 		}
1892 	}
1893 }
1894 
1895 /*
1896  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1897  *
1898  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1899  * resource. The values are adjusted here, as this is the only place that we can
1900  * be certain a resource was successfully deleted.
1901  */
1902 void
1903 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1904 {
1905 	int i;
1906 
1907 	/* find leaf in intermediate node */
1908 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1909 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1910 			break;
1911 	}
1912 
1913 	/* return if leaf not found */
1914 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1915 		return;
1916 
1917 	/* prune leaf, compress array */
1918 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1919 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1920 	lpl_target->lpl_ncpu--;
1921 	do {
1922 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1923 	} while (i++ < lpl_target->lpl_nrset);
1924 }
1925 
1926 /*
1927  * Check to see if the resource set of the target lpl contains the
1928  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1929  */
1930 
1931 int
1932 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1933 {
1934 	int i;
1935 
1936 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1937 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1938 			return (1);
1939 	}
1940 
1941 	return (0);
1942 }
1943 
1944 /*
1945  * Called when we change cpu lpl membership.  This increments or decrements the
1946  * per-cpu counter in every lpl in which our leaf appears.
1947  */
1948 void
1949 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1950 {
1951 	cpupart_t	*cpupart;
1952 	lgrp_t		*lgrp_leaf;
1953 	lgrp_t		*lgrp_cur;
1954 	lpl_t		*lpl_leaf;
1955 	lpl_t		*lpl_cur;
1956 	int		i;
1957 
1958 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1959 
1960 	cpupart = cp->cpu_part;
1961 	lpl_leaf = cp->cpu_lpl;
1962 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1963 
1964 	for (i = 0; i <= lgrp_alloc_max; i++) {
1965 		lgrp_cur = lgrp_table[i];
1966 
1967 		/*
1968 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1969 		 * for the cpu in question, or if the current lgrp and leaf
1970 		 * don't share the same resources.
1971 		 */
1972 
1973 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1974 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1975 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1976 			continue;
1977 
1978 
1979 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1980 
1981 		if (lpl_cur->lpl_nrset > 0) {
1982 			if (act == LPL_INCREMENT) {
1983 				lpl_cur->lpl_ncpu++;
1984 			} else if (act == LPL_DECREMENT) {
1985 				lpl_cur->lpl_ncpu--;
1986 			}
1987 		}
1988 	}
1989 }
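
/*
 * Editorial worked example (not original documentation): if a CPU whose leaf
 * lpl belongs to lgroup 2 is added, and lgroups 0 (the root) and 4 (an
 * intermediate) both have LGRP_RSRC_CPU sets intersecting lgroup 2's, then
 * LPL_INCREMENT bumps lpl_ncpu in this partition's lpls for lgroups 0 and 4.
 * The leaf lpl itself is skipped here; its count is maintained directly by
 * lgrp_part_add_cpu() and lgrp_part_del_cpu().
 */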
1990 
1991 /*
1992  * Initialize lpl with given resources and specified lgrp
1993  */
1994 
1995 void
1996 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
1997 {
1998 	lpl->lpl_lgrpid = lgrp->lgrp_id;
1999 	lpl->lpl_loadavg = 0;
2000 	if (lpl == lpl_leaf)
2001 		lpl->lpl_ncpu = 1;
2002 	else
2003 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2004 	lpl->lpl_nrset = 1;
2005 	lpl->lpl_rset[0] = lpl_leaf;
2006 	lpl->lpl_lgrp = lgrp;
2007 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2008 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2009 }
2010 
2011 /*
2012  * Clear an unused lpl
2013  */
2014 
2015 void
2016 lpl_clear(lpl_t *lpl)
2017 {
2018 	lgrp_id_t	lid;
2019 
2020 	/* save lid for debugging purposes */
2021 	lid = lpl->lpl_lgrpid;
2022 	bzero(lpl, sizeof (lpl_t));
2023 	lpl->lpl_lgrpid = lid;
2024 }
2025 
2026 /*
2027  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2028  * is in sync with the lgroup topology in the system.  The lpl topology may not
2029  * make full use of all of the lgroup topology, but this checks to make sure
2030  * that for the parts that it does use, it has correctly understood the
2031  * relationships that exist.  This function returns 0 if the topology is
2032  * correct, or a non-zero error code on non-DEBUG kernels if it is incorrect.
2033  * Asserts are spread throughout the code to aid in debugging on a DEBUG
2034  * kernel.
2035  */
2036 int
2037 lpl_topo_verify(cpupart_t *cpupart)
2038 {
2039 	lgrp_t		*lgrp;
2040 	lpl_t		*lpl;
2041 	klgrpset_t	rset;
2042 	klgrpset_t	cset;
2043 	cpu_t		*cpu;
2044 	cpu_t		*cp_start;
2045 	int		i;
2046 	int		j;
2047 	int		sum;
2048 
2049 	/* topology can't be incorrect if it doesn't exist */
2050 	if (!lgrp_topo_initialized || !lgrp_initialized)
2051 		return (LPL_TOPO_CORRECT);
2052 
2053 	ASSERT(cpupart != NULL);
2054 
2055 	for (i = 0; i <= lgrp_alloc_max; i++) {
2056 		lgrp = lgrp_table[i];
2057 		lpl = NULL;
2058 		/* make sure lpls are allocated */
2059 		ASSERT(cpupart->cp_lgrploads);
2060 		if (!cpupart->cp_lgrploads)
2061 			return (LPL_TOPO_PART_HAS_NO_LPL);
2062 
2063 		lpl = &cpupart->cp_lgrploads[i];
2064 		/* make sure our index is good */
2065 		ASSERT(i < cpupart->cp_nlgrploads);
2066 
2067 		/* if lgroup doesn't exist, make sure lpl is empty */
2068 		if (!LGRP_EXISTS(lgrp)) {
2069 			ASSERT(lpl->lpl_ncpu == 0);
2070 			if (lpl->lpl_ncpu > 0) {
2071 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2072 			} else {
2073 				continue;
2074 			}
2075 		}
2076 
2077 		/* verify that lgroup and lpl are identically numbered */
2078 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2079 
2080 		/* if lgroup isn't in our partition, make sure lpl is empty */
2081 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2082 		    cpupart->cp_lgrpset)) {
2083 			ASSERT(lpl->lpl_ncpu == 0);
2084 			if (lpl->lpl_ncpu > 0) {
2085 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2086 			}
2087 			/*
2088 			 * lpl is empty, and lgroup isn't in partition.  verify
2089 			 * that lpl doesn't show up in anyone else's rsets (in
2090 			 * this partition, anyway)
2091 			 */
2092 
2093 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2094 				lpl_t *i_lpl; /* lpl we're iterating over */
2095 
2096 				i_lpl = &cpupart->cp_lgrploads[j];
2097 
2098 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2099 				if (lpl_rset_contains(i_lpl, lpl)) {
2100 					return (LPL_TOPO_LPL_ORPHANED);
2101 				}
2102 			}
2103 			/* lpl is empty and lgroup isn't in partition; continue */
2104 			continue;
2105 		}
2106 
2107 
2108 		/* lgroup is in this partition, now check it against lpl */
2109 
2110 		/* do both have matching lgrps? */
2111 		ASSERT(lgrp == lpl->lpl_lgrp);
2112 		if (lgrp != lpl->lpl_lgrp) {
2113 			return (LPL_TOPO_LGRP_MISMATCH);
2114 		}
2115 
2116 		/* do the parent lgroups exist and do they match? */
2117 		if (lgrp->lgrp_parent) {
2118 			ASSERT(lpl->lpl_parent);
2119 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2120 				    lpl->lpl_parent->lpl_lgrpid);
2121 
2122 			if (!lpl->lpl_parent) {
2123 				return (LPL_TOPO_MISSING_PARENT);
2124 			} else if (lgrp->lgrp_parent->lgrp_id !=
2125 			    lpl->lpl_parent->lpl_lgrpid) {
2126 				return (LPL_TOPO_PARENT_MISMATCH);
2127 			}
2128 		}
2129 
2130 		/* only leaf lgroups keep a cpucnt, only check leaves */
2131 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2132 
2133 			/* verify that lgrp is also a leaf */
2134 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2135 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2136 			    lpl->lpl_lgrpid)));
2137 
2138 			if ((lgrp->lgrp_childcnt > 0) ||
2139 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2140 			    lpl->lpl_lgrpid))) {
2141 				return (LPL_TOPO_LGRP_NOT_LEAF);
2142 			}
2143 
2144 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2145 			    (lpl->lpl_ncpu > 0));
2146 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2147 				(lpl->lpl_ncpu <= 0)) {
2148 				return (LPL_TOPO_BAD_CPUCNT);
2149 			}
2150 
2151 			/*
2152 			 * Check that lpl_ncpu also matches the number of
2153 			 * cpus in the lpl's linked list.  This only exists in
2154 			 * leaves, but they should always match.
2155 			 */
2156 			j = 0;
2157 			cpu = cp_start = lpl->lpl_cpus;
2158 			while (cpu != NULL) {
2159 				j++;
2160 
2161 				/* check to make sure cpu's lpl is leaf lpl */
2162 				ASSERT(cpu->cpu_lpl == lpl);
2163 				if (cpu->cpu_lpl != lpl) {
2164 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2165 				}
2166 
2167 				/* check next cpu */
2168 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2169 					continue;
2170 				} else {
2171 					cpu = NULL;
2172 				}
2173 			}
2174 
2175 			ASSERT(j == lpl->lpl_ncpu);
2176 			if (j != lpl->lpl_ncpu) {
2177 				return (LPL_TOPO_LPL_BAD_NCPU);
2178 			}
2179 
2180 			/*
2181 			 * Also, check that leaf lpl is contained in all
2182 			 * intermediate lpls that name the leaf as a descendant
2183 			 */
2184 
2185 			for (j = 0; j <= lgrp_alloc_max; j++) {
2186 				klgrpset_t intersect;
2187 				lgrp_t *lgrp_cand;
2188 				lpl_t *lpl_cand;
2189 
2190 				lgrp_cand = lgrp_table[j];
2191 				intersect = klgrpset_intersects(
2192 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2193 				    cpupart->cp_lgrpset);
2194 
2195 				if (!LGRP_EXISTS(lgrp_cand) ||
2196 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2197 				    cpupart->cp_lgrpset) ||
2198 				    (intersect == 0))
2199 					continue;
2200 
2201 				lpl_cand =
2202 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2203 
2204 				if (klgrpset_ismember(intersect,
2205 				    lgrp->lgrp_id)) {
2206 					ASSERT(lpl_rset_contains(lpl_cand,
2207 					    lpl));
2208 
2209 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2210 						return (LPL_TOPO_RSET_MSSNG_LF);
2211 					}
2212 				}
2213 			}
2214 
2215 		} else { /* non-leaf specific checks */
2216 
2217 			/*
2218 			 * Non-leaf lpls should have lpl_cpus == NULL;
2219 			 * verify that this is so.
2220 			 */
2221 			ASSERT(lpl->lpl_cpus == NULL);
2222 			if (lpl->lpl_cpus != NULL) {
2223 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2224 			}
2225 
2226 			/*
2227 			 * verify that the sum of the cpus in the leaf resources
2228 			 * is equal to the total ncpu in the intermediate
2229 			 */
2230 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2231 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2232 			}
2233 
2234 			ASSERT(sum == lpl->lpl_ncpu);
2235 			if (sum != lpl->lpl_ncpu) {
2236 				return (LPL_TOPO_LPL_BAD_NCPU);
2237 			}
2238 		}
2239 
2240 		/*
2241 		 * check on lpl_hint. Don't check root, since it has no parent.
2242 		 */
2243 		if (lpl->lpl_parent != NULL) {
2244 			int hint;
2245 			lpl_t *hint_lpl;
2246 
2247 			/* make sure hint is within limits of nrset */
2248 			hint = lpl->lpl_hint;
2249 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2250 			if (lpl->lpl_parent->lpl_nrset < hint) {
2251 				return (LPL_TOPO_BOGUS_HINT);
2252 			}
2253 
2254 			/* make sure hint points to valid lpl */
2255 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2256 			ASSERT(hint_lpl->lpl_ncpu > 0);
2257 			if (hint_lpl->lpl_ncpu <= 0) {
2258 				return (LPL_TOPO_BOGUS_HINT);
2259 			}
2260 		}
2261 
2262 		/*
2263 		 * Check the rset of the lpl in question.  Make sure that each
2264 		 * rset contains a subset of the resources in
2265 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2266 		 * sure that each rset doesn't include resources that are
2267 		 * outside of that set.  (Which would be resources somehow not
2268 		 * accounted for).
2269 		 */
2270 
2271 		klgrpset_clear(rset);
2272 		for (j = 0; j < lpl->lpl_nrset; j++) {
2273 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2274 		}
2275 		klgrpset_copy(cset, rset);
2276 		/* make sure lpl rset matches lgrp rset */
2277 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2278 		/* make sure rset is contained within the partition, too */
2279 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2280 
2281 		ASSERT(klgrpset_isempty(rset) &&
2282 			    klgrpset_isempty(cset));
2283 		if (!klgrpset_isempty(rset) ||
2284 		    !klgrpset_isempty(cset)) {
2285 			return (LPL_TOPO_RSET_MISMATCH);
2286 		}
2287 
2288 		/*
2289 		 * check to make sure lpl_nrset matches the number of rsets
2290 		 * contained in the lpl
2291 		 */
2292 
2293 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2294 		    j++);
2295 
2296 		ASSERT(j == lpl->lpl_nrset);
2297 		if (j != lpl->lpl_nrset) {
2298 			return (LPL_TOPO_BAD_RSETCNT);
2299 		}
2300 
2301 	}
2302 	return (LPL_TOPO_CORRECT);
2303 }
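
/*
 * Editorial usage sketch (hypothetical, not taken from this file): a caller
 * that has just rebuilt a partition's lpls might check the result and report
 * the specific error code when verification fails, e.g.:
 *
 *	int err;
 *
 *	if ((err = lpl_topo_verify(cpupart)) != LPL_TOPO_CORRECT)
 *		cmn_err(CE_WARN, "lpl topology error %d", err);
 */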
2304 
2305 /*
2306  * Flatten lpl topology to given number of levels.  This is presently only
2307  * implemented for flattening to 2 levels, which prunes out the intermediates
2308  * and homes the leaf lpls to the root lpl.
2309  */
2310 int
2311 lpl_topo_flatten(int levels)
2312 {
2313 	int		i;
2314 	uint_t		sum;
2315 	lgrp_t		*lgrp_cur;
2316 	lpl_t		*lpl_cur;
2317 	lpl_t		*lpl_root;
2318 	cpupart_t	*cp;
2319 
2320 	if (levels != 2)
2321 		return (0);
2322 
2323 	/* called w/ cpus paused - grab no locks! */
2324 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2325 	    !lgrp_initialized);
2326 
2327 	cp = cp_list_head;
2328 	do {
2329 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2330 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2331 
2332 		for (i = 0; i <= lgrp_alloc_max; i++) {
2333 			lgrp_cur = lgrp_table[i];
2334 			lpl_cur = &cp->cp_lgrploads[i];
2335 
2336 			if ((lgrp_cur == lgrp_root) ||
2337 			    (!LGRP_EXISTS(lgrp_cur) &&
2338 			    (lpl_cur->lpl_ncpu == 0)))
2339 				continue;
2340 
2341 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2342 				/*
2343 				 * this should be a deleted intermediate, so
2344 				 * clear it
2345 				 */
2346 				lpl_clear(lpl_cur);
2347 			} else if ((lpl_cur->lpl_nrset == 1) &&
2348 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2349 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2350 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2351 				/*
2352 				 * this is a leaf whose parent was deleted, or
2353 				 * whose parent had their lgrp deleted.  (And
2354 				 * whose parent will soon be deleted).  Point
2355 				 * this guy back to the root lpl.
2356 				 */
2357 				lpl_cur->lpl_parent = lpl_root;
2358 				lpl_rset_add(lpl_root, lpl_cur);
2359 			}
2360 
2361 		}
2362 
2363 		/*
2364 		 * Now that we're done, make sure the count on the root lpl is
2365 		 * correct, and update the hints of the children for the sake of
2366 		 * thoroughness
2367 		 */
2368 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2369 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2370 		}
2371 		lpl_root->lpl_ncpu = sum;
2372 		lpl_child_update(lpl_root, cp);
2373 
2374 		cp = cp->cp_next;
2375 	} while (cp != cp_list_head);
2376 
2377 	return (levels);
2378 }
2379 
2380 /*
2381  * Insert a lpl into the resource hierarchy and create any additional lpls that
2382  * are necessary to represent the varying states of locality for the cpu
2383  * resources newly added to the partition.
2384  *
2385  * This routine is clever enough that it can correctly add resources from the
2386  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2387  * those for which the lpl is a leaf as opposed to simply a named equally local
2388  * resource).  The one special case that needs additional processing is when a
2389  * new intermediate lpl is introduced.  Since the main loop only traverses
2390  * looking to add the leaf resource where it does not yet exist, additional work
2391  * is necessary to add other leaf resources that may need to exist in the newly
2392  * created intermediate.  This is performed by the second inner loop, and is
2393  * only done when the check for more than one overlapping resource succeeds.
2394  */
2395 
2396 void
2397 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2398 {
2399 	int		i;
2400 	int		j;
2401 	int		hint;
2402 	int		rset_num_intersect;
2403 	lgrp_t		*lgrp_cur;
2404 	lpl_t		*lpl_cur;
2405 	lpl_t		*lpl_parent;
2406 	lgrp_id_t	parent_id;
2407 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2408 
2409 	for (i = 0; i <= lgrp_alloc_max; i++) {
2410 		lgrp_cur = lgrp_table[i];
2411 
2412 		/*
2413 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2414 		 * contained within the current lgrp, or if the current lgrp has
2415 		 * no leaves in this partition
2416 		 */
2417 
2418 		if (!LGRP_EXISTS(lgrp_cur) ||
2419 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2420 		    lpl_leaf->lpl_lgrpid) ||
2421 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2422 		    cpupart->cp_lgrpset))
2423 			continue;
2424 
2425 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2426 		if (lgrp_cur->lgrp_parent != NULL) {
2427 			/* if lgrp has a parent, assign it properly */
2428 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2429 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2430 		} else {
2431 			/* if not, make sure parent ptr gets set to null */
2432 			lpl_parent = NULL;
2433 		}
2434 
2435 		if (lpl_cur == lpl_leaf) {
2436 			/*
2437 			 * Almost all leaf state was initialized elsewhere.  The
2438 			 * only thing left to do is to set the parent.
2439 			 */
2440 			lpl_cur->lpl_parent = lpl_parent;
2441 			continue;
2442 		}
2443 
2444 		/*
2445 		 * Initialize intermediate lpl
2446 		 * Save this lpl's hint though. Since we're changing this
2447 		 * lpl's resources, we need to update the hint in this lpl's
2448 		 * children, but the hint in this lpl is unaffected and
2449 		 * should be preserved.
2450 		 */
2451 		hint = lpl_cur->lpl_hint;
2452 
2453 		lpl_clear(lpl_cur);
2454 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2455 
2456 		lpl_cur->lpl_hint = hint;
2457 		lpl_cur->lpl_parent = lpl_parent;
2458 
2459 		/* does new lpl need to be populated with other resources? */
2460 		rset_intersect =
2461 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2462 			cpupart->cp_lgrpset);
2463 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2464 
2465 		if (rset_num_intersect > 1) {
2466 			/*
2467 			 * If so, figure out what lpls have resources that
2468 			 * intersect this one, and add them.
2469 			 */
2470 			for (j = 0; j <= lgrp_alloc_max; j++) {
2471 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2472 				lpl_t	*lpl_cand;	/* candidate lpl */
2473 
2474 				lgrp_cand = lgrp_table[j];
2475 				if (!LGRP_EXISTS(lgrp_cand) ||
2476 				    !klgrpset_ismember(rset_intersect,
2477 					lgrp_cand->lgrp_id))
2478 					continue;
2479 				lpl_cand =
2480 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2481 				lpl_rset_add(lpl_cur, lpl_cand);
2482 			}
2483 		}
2484 		/*
2485 		 * This lpl's rset has changed.  Update the hint in its
2486 		 * children.
2487 		 */
2488 		lpl_child_update(lpl_cur, cpupart);
2489 	}
2490 }
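
/*
 * Editorial example (illustrative, not original documentation): when the leaf
 * lpl for lgroup 5 is inserted and lgroup 3 is an intermediate whose
 * LGRP_RSRC_CPU set spans lgroups { 4, 5 }, the loop above (re)initializes the
 * partition's lpl for lgroup 3 around the new leaf, restores its saved hint,
 * and points its parent at the partition's lpl for lgroup 3's parent (the
 * root, in this example).  Assuming the partition also spans lgroup 4, more
 * than one resource intersects, so the lpl for lgroup 4 is pulled into
 * lgroup 3's rset as well.
 */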
2491 
2492 /*
2493  * remove a lpl from the hierarchy of resources, clearing its state when
2494  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2495  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2496  * delete them as well.
2497  */
2498 
2499 void
2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2501 {
2502 	int		i;
2503 	lgrp_t		*lgrp_cur;
2504 	lpl_t		*lpl_cur;
2505 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2506 
2507 	for (i = 0; i <= lgrp_alloc_max; i++) {
2508 		lgrp_cur = lgrp_table[i];
2509 
2510 		/*
2511 		 * Don't attempt to remove from lgrps that aren't there, that
2512 		 * don't contain our leaf, or from the leaf itself. (We do that
2513 		 * later)
2514 		 */
2515 
2516 		if (!LGRP_EXISTS(lgrp_cur))
2517 			continue;
2518 
2519 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2520 
2521 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2522 		    lpl_leaf->lpl_lgrpid) ||
2523 		    (lpl_cur == lpl_leaf)) {
2524 			continue;
2525 		}
2526 
2527 		/*
2528 		 * This is a slightly sleazy simplification in that we have
2529 		 * already marked the cp_lgrpset as no longer containing the
2530 		 * leaf we've deleted.  Any lpls that pass the above checks
2531 		 * based upon lgrp membership but not necessarily cpu-part
2532 		 * membership also get cleared by the checks below.  Currently
2533 		 * this is harmless, as the lpls should be empty anyway.
2534 		 *
2535 		 * In particular, we want to preserve lpls that have additional
2536 		 * leaf resources, even though we don't yet have a processor
2537 		 * architecture that represents resources this way.
2538 		 */
2539 
2540 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2541 		    cpupart->cp_lgrpset);
2542 
2543 		lpl_rset_del(lpl_cur, lpl_leaf);
2544 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2545 			lpl_clear(lpl_cur);
2546 		} else {
2547 			/*
2548 			 * Update this lpl's children
2549 			 */
2550 			lpl_child_update(lpl_cur, cpupart);
2551 		}
2552 	}
2553 	lpl_clear(lpl_leaf);
2554 }
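
/*
 * Editorial example (illustrative): continuing the insertion example above,
 * removing the leaf for lgroup 5 deletes it from the rset of the lpl for
 * intermediate lgroup 3; if that rset is then empty, or lgroup 3 no longer
 * names any leaf remaining in this partition, the intermediate lpl is
 * cleared, otherwise only its children's hints are refreshed.  The leaf
 * itself is cleared last.
 */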
2555 
2556 /*
2557  * add a cpu to a partition in terms of lgrp load avg bookkeeping
2558  *
2559  * The lpl (cpu partition load average information) is now arranged in a
2560  * hierarchical fashion whereby resources that are closest, ie. most local, to
2561  * the cpu in question are considered to be leaves in a tree of resources.
2562  * There are two general cases for cpu addition:
2563  *
2564  * 1. A lpl structure that contains resources already in the hierarchy tree.
2565  * In this case, all of the associated lpl relationships have been defined, and
2566  * all that is necessary is that we link the new cpu into the per-lpl list of
2567  * cpus, and increment the ncpu count of all places where this cpu resource will
2568  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2569  * pushing is accomplished by this routine.
2570  *
2571  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2572  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2573  * construct the hierarchy of state necessary to name its more distant
2574  * resources, if they should exist.  The leaf structure is initialized by this
2575  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2576  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2577  * and builds all of the "ancestral" state necessary to identify resources at
2578  * differing levels of locality.
2579  */
2580 void
2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2582 {
2583 	cpupart_t	*cpupart;
2584 	lgrp_t		*lgrp_leaf;
2585 	lpl_t		*lpl_leaf;
2586 
2587 	/* called sometimes w/ cpus paused - grab no locks */
2588 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2589 
2590 	cpupart = cp->cpu_part;
2591 	lgrp_leaf = lgrp_table[lgrpid];
2592 
2593 	/* don't add non-existent lgrp */
2594 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2595 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2596 	cp->cpu_lpl = lpl_leaf;
2597 
2598 	/* only leaf lpls contain cpus */
2599 
2600 	if (lpl_leaf->lpl_ncpu++ == 0) {
2601 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2602 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2603 		lpl_leaf_insert(lpl_leaf, cpupart);
2604 	} else {
2605 		/*
2606 		 * the lpl should already exist in the parent, so just update
2607 		 * the count of available CPUs
2608 		 */
2609 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2610 	}
2611 
2612 	/* link cpu into list of cpus in lpl */
2613 
2614 	if (lpl_leaf->lpl_cpus) {
2615 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2616 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2617 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2618 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2619 	} else {
2620 		/*
2621 		 * We increment ncpu immediately after we create a new leaf
2622 		 * lpl, so assert that ncpu == 1 for the case where we don't
2623 		 * have any cpu pointers yet.
2624 		 */
2625 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2626 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2627 	}
2628 
2629 }
2630 
2631 
2632 /*
2633  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2634  *
2635  * The lpl (cpu partition load average information) is now arranged in a
2636  * hierarchical fashion whereby resources that are closest, ie. most local, to
2637  * the cpu in question are considered to be leaves in a tree of resources.
2638  * There are two removal cases in question:
2639  *
2640  * 1. Removal of the resource in the leaf leaves other resources remaining in
2641  * that leaf.  (Another cpu still exists at this level of locality).  In this
2642  * case, the count of available cpus is decremented in all associated lpls by
2643  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2644  * from the per-cpu lpl list.
2645  *
2646  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2647  * empty)  In this case, all of what has occurred for the first step must take
2648  * place; however, additionally we must remove the lpl structure itself, prune
2649  * out any stranded lpls that do not directly name a leaf resource, and mark the
2650  * cpu partition in question as no longer containing resources from the lgrp of
2651  * the lpl that has been deleted.  Cpu-partition changes are handled by this
2652  * method, but the lpl_leaf_remove function deals with the details of pruning
2653  * out the empty lpl and any of its orphaned direct ancestors.
2654  */
2655 void
2656 lgrp_part_del_cpu(cpu_t *cp)
2657 {
2658 	lpl_t		*lpl;
2659 	lpl_t		*leaf_lpl;
2660 	lgrp_t		*lgrp_leaf;
2661 
2662 	/* called sometimes w/ cpus paused - grab no locks */
2663 
2664 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2665 
2666 	lpl = leaf_lpl = cp->cpu_lpl;
2667 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2668 
2669 	/* don't delete a leaf that isn't there */
2670 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2671 
2672 	/* no double-deletes */
2673 	ASSERT(lpl->lpl_ncpu);
2674 	if (--lpl->lpl_ncpu == 0) {
2675 		/*
2676 		 * This was the last cpu in this lgroup for this partition,
2677 		 * clear its bit in the partition's lgroup bitmask
2678 		 */
2679 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2680 
2681 		/* eliminate remaining lpl link pointers in cpu, lpl */
2682 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2683 
2684 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2685 	} else {
2686 
2687 		/* unlink cpu from lists of cpus in lpl */
2688 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2689 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2690 		if (lpl->lpl_cpus == cp) {
2691 			lpl->lpl_cpus = cp->cpu_next_lpl;
2692 		}
2693 
2694 		/*
2695 		 * Update the cpu count in the lpls associated with parent
2696 		 * lgroups.
2697 		 */
2698 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2699 
2700 	}
2701 	/* clear cpu's lpl ptr when we're all done */
2702 	cp->cpu_lpl = NULL;
2703 }
2704 
2705 /*
2706  * Recompute load average for the specified partition/lgrp fragment.
2707  *
2708  * We rely on the fact that this routine is called from the clock thread
2709  * at a point before the clock thread can block (i.e. before its first
2710  * lock request).  Since the clock thread can not be preempted (since it
2711  * runs at highest priority), we know that cpu partitions can not change
2712  * (since doing so would require either the repartition requester or the
2713  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2714  * without grabbing cpu_lock.
2715  */
2716 void
2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2718 {
2719 	uint_t		ncpu;
2720 	int64_t		old, new, f;
2721 
2722 	/*
2723 	 * (1 - exp(-1/(20 * ncpu))) * 2^16; ~3196 for 1 cpu...
2724 	 */
2725 	static short expval[] = {
2726 	    0, 3196, 1618, 1083,
2727 	    814, 652, 543, 466,
2728 	    408, 363, 326, 297,
2729 	    272, 251, 233, 218,
2730 	    204, 192, 181, 172,
2731 	    163, 155, 148, 142,
2732 	    136, 130, 125, 121,
2733 	    116, 112, 109, 105
2734 	};
2735 
2736 	/* ASSERT (called from clock level) */
2737 
2738 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2739 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2740 		return;
2741 	}
2742 
2743 	for (;;) {
2744 
2745 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2746 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2747 		else
2748 			f = expval[ncpu];
2749 
2750 		/*
2751 		 * Modify the load average atomically to avoid losing
2752 		 * anticipatory load updates (see lgrp_move_thread()).
2753 		 */
2754 		if (ageflag) {
2755 			/*
2756 			 * We're supposed to both update and age the load.
2757 			 * This happens 10 times/sec. per cpu.  We do a
2758 			 * little hoop-jumping to avoid integer overflow.
2759 			 */
2760 			int64_t		q, r;
2761 
2762 			do {
2763 				old = new = lpl->lpl_loadavg;
2764 				q = (old  >> 16) << 7;
2765 				r = (old  & 0xffff) << 7;
2766 				new += ((long long)(nrcpus - q) * f -
2767 				    ((r * f) >> 16)) >> 7;
2768 
2769 				/*
2770 				 * Check for overflow
2771 				 */
2772 				if (new > LGRP_LOADAVG_MAX)
2773 					new = LGRP_LOADAVG_MAX;
2774 				else if (new < 0)
2775 					new = 0;
2776 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2777 			    new) != old);
2778 		} else {
2779 			/*
2780 			 * We're supposed to update the load, but not age it.
2781 			 * This option is used to update the load (which either
2782 			 * has already been aged in this 1/10 sec. interval or
2783 			 * soon will be) to account for a remotely executing
2784 			 * thread.
2785 			 */
2786 			do {
2787 				old = new = lpl->lpl_loadavg;
2788 				new += f;
2789 				/*
2790 				 * Check for overflow
2791 				 * Underflow not possible here
2792 				 */
2793 				if (new < old)
2794 					new = LGRP_LOADAVG_MAX;
2795 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2796 			    new) != old);
2797 		}
2798 
2799 		/*
2800 		 * Do the same for this lpl's parent
2801 		 */
2802 		if ((lpl = lpl->lpl_parent) == NULL)
2803 			break;
2804 		ncpu = lpl->lpl_ncpu;
2805 	}
2806 }
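
/*
 * Editorial worked example (derived by the editor from the table and the
 * fixed-point arithmetic above; treat as an illustration, not authoritative
 * documentation): for a 1-CPU lpl, f = expval[1] = 3196.  An aged update then
 * computes approximately
 *
 *	new = old * (1 - 3196/65536) + (nrcpus * 3196) / 128
 *
 * i.e. the existing load decays by roughly 1 - exp(-1/20) (about 4.9%) per
 * update while the caller-supplied nrcpus term is mixed in.  The unaged
 * variant simply adds f to account for a remotely executing thread.
 */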
2807 
2808 /*
2809  * Initialize lpl topology in the target based on topology currently present in
2810  * lpl_bootstrap.
2811  *
2812  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2813  * initialize cp_default list of lpls. Up to this point all topology operations
2814  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2815  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2816  * `target' points to the list of lpls in cp_default and `size' is the size of
2817  * this list.
2818  *
2819  * This function walks the lpl topology in lpl_bootstrap and does four things:
2820  *
2821  * 1) Copies all fields from lpl_bootstrap to the target.
2822  *
2823  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2824  *
2825  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2826  *    instead of lpl_bootstrap.
2827  *
2828  * 4) Updates pointers in the resource list of the target to point to the lpls
2829  *    in the target list instead of lpl_bootstrap.
2830  *
2831  * After lpl_topo_bootstrap() completes, target contains the same information
2832  * that would be present there if it were used during boot instead of
2833  * lpl_bootstrap.  The information in lpl_bootstrap is no longer needed after
2834  * this point, so it is bzeroed.
2835  */
2836 void
2837 lpl_topo_bootstrap(lpl_t *target, int size)
2838 {
2839 	lpl_t	*lpl = lpl_bootstrap;
2840 	lpl_t	*target_lpl = target;
2841 	int	howmany;
2842 	int	id;
2843 	int	i;
2844 
2845 	/*
2846 	 * The only target that should be passed here is cp_default lpl list.
2847 	 */
2848 	ASSERT(target == cp_default.cp_lgrploads);
2849 	ASSERT(size == cp_default.cp_nlgrploads);
2850 	ASSERT(!lgrp_topo_initialized);
2851 	ASSERT(ncpus == 1);
2852 
2853 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2854 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2855 		/*
2856 		 * Copy all fields from lpl.
2857 		 */
2858 
2859 		*target_lpl = *lpl;
2860 
2861 		/*
2862 		 * Substitute CPU0 lpl pointer with one relative to target.
2863 		 */
2864 		if (lpl->lpl_cpus == CPU) {
2865 			ASSERT(CPU->cpu_lpl == lpl);
2866 			CPU->cpu_lpl = target_lpl;
2867 		}
2868 
2869 		/*
2870 		 * Substitute parent information with parent relative to target.
2871 		 */
2872 		if (lpl->lpl_parent != NULL)
2873 			target_lpl->lpl_parent = (lpl_t *)
2874 			    (((uintptr_t)lpl->lpl_parent -
2875 				(uintptr_t)lpl_bootstrap) +
2876 				(uintptr_t)target);
2877 
2878 		/*
2879 		 * Walk over resource set substituting pointers relative to
2880 		 * lpl_bootstrap to pointers relative to target.
2881 		 */
2882 		ASSERT(lpl->lpl_nrset <= 1);
2883 
2884 		for (id = 0; id < lpl->lpl_nrset; id++) {
2885 			if (lpl->lpl_rset[id] != NULL) {
2886 				target_lpl->lpl_rset[id] =
2887 				    (lpl_t *)
2888 				    (((uintptr_t)lpl->lpl_rset[id] -
2889 					(uintptr_t)lpl_bootstrap) +
2890 					(uintptr_t)target);
2891 			}
2892 		}
2893 	}
2894 
2895 	/*
2896 	 * Topology information in lpl_bootstrap is no longer needed.
2897 	 */
2898 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2899 }
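
/*
 * Editorial sketch (a hypothetical helper suggested by the editor, not part
 * of the original code): the pointer substitution above is a rebase of an
 * address from the lpl_bootstrap array into the target array, which could be
 * expressed as:
 *
 *	#define	LPL_REBASE(p, from, to)					\
 *		((lpl_t *)(((uintptr_t)(p) - (uintptr_t)(from)) +	\
 *		    (uintptr_t)(to)))
 *
 * so that, for example, target_lpl->lpl_parent would be
 * LPL_REBASE(lpl->lpl_parent, lpl_bootstrap, target), matching the open-coded
 * arithmetic used in the loop.
 */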
2900 
2901 /*
2902  * If the lowest load among the lgroups a process' threads are currently
2903  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2904  * expanding the process to a new lgroup.
2905  */
2906 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2907 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2908 
2909 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2910 	((lgrp_expand_proc_thresh) / (ncpu))
2911 
2912 /*
2913  * A process will be expanded to a new lgroup only if the difference between
2914  * the lowest load on the lgroups the process' thread's are currently spread
2915  * across and the lowest load on the other lgroups in the process' partition
2916  * is greater than lgrp_expand_proc_diff.
2917  */
2918 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2919 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2920 
2921 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2922 	((lgrp_expand_proc_diff) / (ncpu))
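
/*
 * Editorial worked example (numbers computed by the editor from the defaults
 * above; illustrative only): with a 4-CPU "home" candidate and a 4-CPU remote
 * candidate, LGRP_EXPAND_PROC_THRESH(4) = 62250 / 4 = 15562 and
 * LGRP_EXPAND_PROC_DIFF(4) = 60000 / 4 = 15000.  lgrp_choose() below will
 * therefore only expand the process onto the remote lgroup when the home
 * candidate's load exceeds 15562 and the remote candidate's load is more
 * than 15000 lower than it.
 */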
2923 
2924 /*
2925  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2926  * be present due to impreciseness of the load average decay algorithm.
2927  *
2928  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2929  * tolerance is scaled by the number of cpus in the lgroup just like
2930  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2931  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2932  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2933  */
2934 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2935 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2936 	((lgrp_loadavg_tolerance) / ncpu)
2937 
2938 /*
2939  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2940  * average is above this threshold
2941  */
2942 uint32_t	lgrp_load_thresh = UINT32_MAX;
2943 
2944 /*
2945  * lgrp_choose() will try to skip any lgroups with less memory
2946  * than this free when choosing a home lgroup
2947  */
2948 pgcnt_t	lgrp_mem_free_thresh = 0;
2949 
2950 /*
2951  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2952  * one based on one of the following policies:
2953  * - Random selection
2954  * - Pseudo round robin placement
2955  * - Longest time since a thread was last placed
2956  */
2957 #define	LGRP_CHOOSE_RANDOM	1
2958 #define	LGRP_CHOOSE_RR		2
2959 #define	LGRP_CHOOSE_TIME	3
2960 
2961 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2962 
2963 /*
2964  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2965  * be bound to a CPU or processor set.
2966  *
2967  * Arguments:
2968  *	t		The thread
2969  *	cpupart		The partition the thread belongs to.
2970  *
2971  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2972  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2973  *	 partitions changing out from under us and assumes that the given thread is
2974  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2975  *	 disabled, so don't grab any locks because we should never block under
2976  *	 those conditions.
2977  */
2978 lpl_t *
2979 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2980 {
2981 	lgrp_load_t	bestload, bestrload;
2982 	int		lgrpid_offset, lgrp_count;
2983 	lgrp_id_t	lgrpid, lgrpid_start;
2984 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2985 	klgrpset_t	lgrpset;
2986 	proc_t		*p;
2987 
2988 	ASSERT(t != NULL);
2989 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2990 	    THREAD_LOCK_HELD(t));
2991 	ASSERT(cpupart != NULL);
2992 
2993 	p = t->t_procp;
2994 
2995 	/* A process should always be in an active partition */
2996 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
2997 
2998 	bestlpl = bestrlpl = NULL;
2999 	bestload = bestrload = LGRP_LOADAVG_MAX;
3000 	lgrpset = cpupart->cp_lgrpset;
3001 
3002 	switch (lgrp_choose_policy) {
3003 	case LGRP_CHOOSE_RR:
3004 		lgrpid = cpupart->cp_lgrp_hint;
3005 		do {
3006 			if (++lgrpid > lgrp_alloc_max)
3007 				lgrpid = 0;
3008 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3009 
3010 		break;
3011 	default:
3012 	case LGRP_CHOOSE_TIME:
3013 	case LGRP_CHOOSE_RANDOM:
3014 		klgrpset_nlgrps(lgrpset, lgrp_count);
3015 		lgrpid_offset =
3016 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3017 		for (lgrpid = 0; ; lgrpid++) {
3018 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3019 				if (--lgrpid_offset == 0)
3020 					break;
3021 			}
3022 		}
3023 		break;
3024 	}
3025 
3026 	lgrpid_start = lgrpid;
3027 
3028 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3029 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3030 
3031 	/*
3032 	 * Use lgroup affinities (if any) to choose best lgroup
3033 	 *
3034 	 * NOTE: Assumes that thread is protected from going away and its
3035 	 *	 lgroup affinities won't change (ie. p_lock, or
3036 	 *	 thread_lock() being held and/or CPUs paused)
3037 	 */
3038 	if (t->t_lgrp_affinity) {
3039 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3040 		if (lpl != NULL)
3041 			return (lpl);
3042 	}
3043 
3044 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3045 
3046 	do {
3047 		pgcnt_t	npgs;
3048 
3049 		/*
3050 		 * Skip any lgroups outside of thread's pset
3051 		 */
3052 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3053 			if (++lgrpid > lgrp_alloc_max)
3054 				lgrpid = 0;	/* wrap the search */
3055 			continue;
3056 		}
3057 
3058 		/*
3059 		 * Skip any non-leaf lgroups
3060 		 */
3061 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3062 			continue;
3063 
3064 		/*
3065 		 * Skip any lgroups without enough free memory
3066 		 * (when threshold set to nonzero positive value)
3067 		 */
3068 		if (lgrp_mem_free_thresh > 0) {
3069 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3070 			if (npgs < lgrp_mem_free_thresh) {
3071 				if (++lgrpid > lgrp_alloc_max)
3072 					lgrpid = 0;	/* wrap the search */
3073 				continue;
3074 			}
3075 		}
3076 
3077 		lpl = &cpupart->cp_lgrploads[lgrpid];
3078 		if (klgrpset_isempty(p->p_lgrpset) ||
3079 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3080 			/*
3081 			 * Either this is a new process or the process already
3082 			 * has threads on this lgrp, so this is a preferred
3083 			 * lgroup for the thread.
3084 			 */
3085 			if (bestlpl == NULL ||
3086 			    lpl_pick(lpl, bestlpl)) {
3087 				bestload = lpl->lpl_loadavg;
3088 				bestlpl = lpl;
3089 			}
3090 		} else {
3091 			/*
3092 			 * The process doesn't have any threads on this lgrp,
3093 			 * but we're willing to consider this lgrp if the load
3094 			 * difference is big enough to justify splitting up
3095 			 * the process' threads.
3096 			 */
3097 			if (bestrlpl == NULL ||
3098 			    lpl_pick(lpl, bestrlpl)) {
3099 				bestrload = lpl->lpl_loadavg;
3100 				bestrlpl = lpl;
3101 			}
3102 		}
3103 		if (++lgrpid > lgrp_alloc_max)
3104 			lgrpid = 0;	/* wrap the search */
3105 	} while (lgrpid != lgrpid_start);
3106 
3107 	/*
3108 	 * Return the root lgroup if the load threshold isn't set to its maximum
3109 	 * value and the lowest lgroup load averages are at or above that threshold
3110 	 */
3111 	if (lgrp_load_thresh != UINT32_MAX &&
3112 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3113 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3114 
3115 	/*
3116 	 * If all the lgroups over which the thread's process is spread are
3117 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3118 	 * the thread on one of the other leaf lgroups in the thread's
3119 	 * partition.
3120 	 */
3121 	if ((bestlpl == NULL) ||
3122 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3123 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3124 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3125 	    bestload))) {
3126 		bestlpl = bestrlpl;
3127 	}
3128 
3129 	if (bestlpl == NULL) {
3130 		/*
3131 		 * No lgroup looked particularly good, but we still
3132 		 * have to pick something. Go with the randomly selected
3133 		 * legal lgroup we started with above.
3134 		 */
3135 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3136 	}
3137 
3138 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3139 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3140 
3141 	ASSERT(bestlpl->lpl_ncpu > 0);
3142 	return (bestlpl);
3143 }
3144 
3145 /*
3146  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3147  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3148  */
3149 static int
3150 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3151 {
3152 	lgrp_load_t	l1, l2;
3153 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3154 
3155 	l1 = lpl1->lpl_loadavg;
3156 	l2 = lpl2->lpl_loadavg;
3157 
3158 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3159 		/* lpl1 is significantly less loaded than lpl2 */
3160 		return (1);
3161 	}
3162 
3163 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3164 	    l1 + tolerance >= l2 && l1 < l2 &&
3165 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3166 		/*
3167 		 * lpl1's load is within the tolerance of lpl2. We're
3168 		 * willing to consider it to be better, however, if
3169 		 * it has been longer since we last homed a thread there.
3170 		 */
3171 		return (1);
3172 	}
3173 
3174 	return (0);
3175 }
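
/*
 * Editorial worked example (illustrative numbers chosen by the editor): with
 * lgrp_loadavg_tolerance = 0x10000 and a 4-CPU lpl1, the tolerance is 0x4000.
 * If l1 = 0x8000 and l2 = 0xb000, then l1 + tolerance = 0xc000 >= l2, so the
 * loads are considered comparable; under LGRP_CHOOSE_TIME lpl1 still wins,
 * but only if a thread was last homed to it earlier than to lpl2.
 */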
3176 
3177 /*
3178  * An LWP is expected to be assigned to an lgroup for at least this long
3179  * for its anticipatory load to be justified.  NOTE that this value should
3180  * not be set extremely huge (say, larger than 100 years), to avoid problems
3181  * with overflow in the calculation that uses it.
3182  */
3183 #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3184 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3185 
3186 /*
3187  * Routine to change a thread's lgroup affiliation.  This routine updates
3188  * the thread's kthread_t struct and its process' proc_t struct to note the
3189  * thread's new lgroup affiliation, and its lgroup affinities.
3190  *
3191  * Note that this is the only routine that modifies a thread's t_lpl field,
3192  * and that adds in or removes anticipatory load.
3193  *
3194  * If the thread is exiting, newlpl is NULL.
3195  *
3196  * Locking:
3197  * The following lock must be held on entry:
3198  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3199  *		doesn't get removed from t's partition
3200  *
3201  * This routine is not allowed to grab any locks, since it may be called
3202  * with cpus paused (such as from cpu_offline).
3203  */
3204 void
3205 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3206 {
3207 	proc_t		*p;
3208 	lpl_t		*lpl, *oldlpl;
3209 	lgrp_id_t	oldid;
3210 	kthread_t	*tp;
3211 	uint_t		ncpu;
3212 	lgrp_load_t	old, new;
3213 
3214 	ASSERT(t);
3215 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3216 	    THREAD_LOCK_HELD(t));
3217 
3218 	/*
3219 	 * If not changing lpls, just return
3220 	 */
3221 	if ((oldlpl = t->t_lpl) == newlpl)
3222 		return;
3223 
3224 	/*
3225 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3226 	 * associated with process 0 rather than with its original process).
3227 	 */
3228 	if (t->t_proc_flag & TP_LWPEXIT) {
3229 		if (newlpl != NULL) {
3230 			t->t_lpl = newlpl;
3231 		}
3232 		return;
3233 	}
3234 
3235 	p = ttoproc(t);
3236 
3237 	/*
3238 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3239 	 * to account for it being moved from its old lgroup.
3240 	 */
3241 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3242 	    (p->p_tlist != NULL)) {
3243 		oldid = oldlpl->lpl_lgrpid;
3244 
3245 		if (newlpl != NULL)
3246 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3247 
3248 		if ((do_lgrpset_delete) &&
3249 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3250 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3251 				/*
3252 				 * Check if a thread other than the thread
3253 				 * that's moving is assigned to the same
3254 				 * lgroup as the thread that's moving.  Note
3255 				 * that we have to compare lgroup IDs, rather
3256 				 * than simply comparing t_lpl's, since the
3257 				 * threads may belong to different partitions
3258 				 * but be assigned to the same lgroup.
3259 				 */
3260 				ASSERT(tp->t_lpl != NULL);
3261 
3262 				if ((tp != t) &&
3263 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3264 					/*
3265 					 * Another thread is assigned to the
3266 					 * same lgroup as the thread that's
3267 					 * moving, p_lgrpset doesn't change.
3268 					 */
3269 					break;
3270 				} else if (tp == p->p_tlist) {
3271 					/*
3272 					 * No other thread is assigned to the
3273 					 * same lgroup as the exiting thread,
3274 					 * clear the lgroup's bit in p_lgrpset.
3275 					 */
3276 					klgrpset_del(p->p_lgrpset, oldid);
3277 					break;
3278 				}
3279 			}
3280 		}
3281 
3282 		/*
3283 		 * If this thread was assigned to its old lgroup for such a
3284 		 * short amount of time that the anticipatory load that was
3285 		 * added on its behalf has aged very little, remove that
3286 		 * anticipatory load.
3287 		 */
3288 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3289 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3290 			lpl = oldlpl;
3291 			for (;;) {
3292 				do {
3293 					old = new = lpl->lpl_loadavg;
3294 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3295 					if (new > old) {
3296 						/*
3297 						 * this can happen if the load
3298 						 * average was aged since we
3299 						 * added in the anticipatory
3300 						 * load
3301 						 */
3302 						new = 0;
3303 					}
3304 				} while (cas32(
3305 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3306 					    new) != old);
3307 
3308 				lpl = lpl->lpl_parent;
3309 				if (lpl == NULL)
3310 					break;
3311 
3312 				ncpu = lpl->lpl_ncpu;
3313 				ASSERT(ncpu > 0);
3314 			}
3315 		}
3316 	}
3317 	/*
3318 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3319 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3320 	 * to its new lgroup to account for its move to its new lgroup.
3321 	 */
3322 	if (newlpl != NULL) {
3323 		/*
3324 		 * This thread is moving to a new lgroup
3325 		 */
3326 		t->t_lpl = newlpl;
3327 
3328 		/*
3329 		 * Reflect move in load average of new lgroup
3330 		 * unless it is root lgroup
3331 		 */
3332 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3333 			return;
3334 
3335 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3336 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3337 		}
3338 
3339 		/*
3340 		 * It'll take some time for the load on the new lgroup
3341 		 * to reflect this thread's placement on it.  We'd
3342 		 * like not, however, to have all threads between now
3343 		 * and then also piling on to this lgroup.  To avoid
3344 		 * this pileup, we anticipate the load this thread
3345 		 * will generate on its new lgroup.  The goal is to
3346 		 * make the lgroup's load appear as though the thread
3347 		 * had been there all along.  We're very conservative
3348 		 * in calculating this anticipatory load; we assume
3349 		 * the worst case (a 100% CPU-bound thread).  This
3350 		 * may be modified in the future to be more accurate.
3351 		 */
3352 		lpl = newlpl;
3353 		for (;;) {
3354 			ncpu = lpl->lpl_ncpu;
3355 			ASSERT(ncpu > 0);
3356 			do {
3357 				old = new = lpl->lpl_loadavg;
3358 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3359 				/*
3360 				 * Check for overflow
3361 				 * Underflow not possible here
3362 				 */
3363 				if (new < old)
3364 					new = UINT32_MAX;
3365 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3366 			    new) != old);
3367 
3368 			lpl = lpl->lpl_parent;
3369 			if (lpl == NULL)
3370 				break;
3371 		}
3372 		t->t_anttime = gethrtime();
3373 	}
3374 }
3375 
3376 /*
3377  * Return lgroup memory allocation policy given advice from madvise(3C)
3378  */
3379 lgrp_mem_policy_t
3380 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3381 {
3382 	switch (advice) {
3383 	case MADV_ACCESS_LWP:
3384 		return (LGRP_MEM_POLICY_NEXT);
3385 	case MADV_ACCESS_MANY:
3386 		return (LGRP_MEM_POLICY_RANDOM);
3387 	default:
3388 		return (lgrp_mem_policy_default(size, type));
3389 	}
3390 }
3391 
3392 /*
3393  * Figure out default policy
3394  */
3395 lgrp_mem_policy_t
3396 lgrp_mem_policy_default(size_t size, int type)
3397 {
3398 	cpupart_t		*cp;
3399 	lgrp_mem_policy_t	policy;
3400 	size_t			pset_mem_size;
3401 
3402 	/*
3403 	 * Randomly allocate memory across lgroups for shared memory
3404 	 * beyond a certain threshold
3405 	 */
3406 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3407 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3408 		/*
3409 		 * Get total memory size of current thread's pset
3410 		 */
3411 		kpreempt_disable();
3412 		cp = curthread->t_cpupart;
3413 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3414 		kpreempt_enable();
3415 
3416 		/*
3417 		 * Choose policy to randomly allocate memory across
3418 		 * lgroups in pset if it will fit and is not default
3419 		 * partition.  Otherwise, allocate memory randomly
3420 		 * across machine.
3421 		 */
3422 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3423 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3424 		else
3425 			policy = LGRP_MEM_POLICY_RANDOM;
3426 	} else
3427 		/*
3428 		 * Apply default policy for private memory and
3429 		 * shared memory under the respective random
3430 		 * threshold.
3431 		 */
3432 		policy = lgrp_mem_default_policy;
3433 
3434 	return (policy);
3435 }
3436 
3437 /*
3438  * Get memory allocation policy for this segment
3439  */
3440 lgrp_mem_policy_info_t *
3441 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3442 {
3443 	lgrp_mem_policy_info_t	*policy_info;
3444 	extern struct seg_ops	segspt_ops;
3445 	extern struct seg_ops	segspt_shmops;
3446 
3447 	/*
3448 	 * This is for binary compatibility to protect against third party
3449 	 * segment drivers which haven't been recompiled to allow for
3450 	 * SEGOP_GETPOLICY()
3451 	 */
3452 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3453 	    seg->s_ops != &segspt_shmops)
3454 		return (NULL);
3455 
3456 	policy_info = NULL;
3457 	if (seg->s_ops->getpolicy != NULL)
3458 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3459 
3460 	return (policy_info);
3461 }
3462 
3463 /*
3464  * Set policy for allocating private memory given desired policy, policy info,
3465  * and size in bytes of the memory to which the policy is being applied.
3466  * Return 0 if policy wasn't set already and 1 if policy was set already
3467  */
3468 int
3469 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3470     lgrp_mem_policy_info_t *policy_info, size_t size)
3471 {
3472 
3473 	ASSERT(policy_info != NULL);
3474 
3475 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3476 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3477 
3478 	/*
3479 	 * Policy set already?
3480 	 */
3481 	if (policy == policy_info->mem_policy)
3482 		return (1);
3483 
3484 	/*
3485 	 * Set policy
3486 	 */
3487 	policy_info->mem_policy = policy;
3488 	policy_info->mem_reserved = 0;
3489 
3490 	return (0);
3491 }
3492 
3493 
3494 /*
3495  * Get shared memory allocation policy with given tree and offset
3496  */
3497 lgrp_mem_policy_info_t *
3498 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3499     u_offset_t vn_off)
3500 {
3501 	u_offset_t		off;
3502 	lgrp_mem_policy_info_t	*policy_info;
3503 	lgrp_shm_policy_seg_t	*policy_seg;
3504 	lgrp_shm_locality_t	*shm_locality;
3505 	avl_tree_t		*tree;
3506 	avl_index_t		where;
3507 
3508 	/*
3509 	 * Get policy segment tree from anon_map or vnode and use specified
3510 	 * anon index or vnode offset as offset
3511 	 *
3512 	 * Assume that no lock needs to be held on anon_map or vnode, since
3513 	 * they should be protected by their reference count which must be
3514 	 * nonzero for an existing segment
3515 	 */
3516 	if (amp) {
3517 		ASSERT(amp->refcnt != 0);
3518 		shm_locality = amp->locality;
3519 		if (shm_locality == NULL)
3520 			return (NULL);
3521 		tree = shm_locality->loc_tree;
3522 		off = ptob(anon_index);
3523 	} else if (vp) {
3524 		shm_locality = vp->v_locality;
3525 		if (shm_locality == NULL)
3526 			return (NULL);
3527 		ASSERT(shm_locality->loc_count != 0);
3528 		tree = shm_locality->loc_tree;
3529 		off = vn_off;
3530 	}
3531 
3532 	if (tree == NULL)
3533 		return (NULL);
3534 
3535 	/*
3536 	 * Lookup policy segment for offset into shared object and return
3537 	 * policy info
3538 	 */
3539 	rw_enter(&shm_locality->loc_lock, RW_READER);
3540 	policy_info = NULL;
3541 	policy_seg = avl_find(tree, &off, &where);
3542 	if (policy_seg)
3543 		policy_info = &policy_seg->shm_policy;
3544 	rw_exit(&shm_locality->loc_lock);
3545 
3546 	return (policy_info);
3547 }
3548 
3549 /*
3550  * Default memory allocation policy for kernel segmap pages
3551  */
3552 lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3553 
3554 /*
3555  * Return lgroup to use for allocating memory
3556  * given the segment and address
3557  *
3558  * There isn't any mutual exclusion between calls
3559  * to this routine and DR, so this routine and whoever calls it
3560  * should be mindful of the possibility that the lgrp returned
3561  * may be deleted. If this happens, dereferences of the lgrp
3562  * pointer will still be safe, but the resources in the lgrp will
3563  * be gone, and LGRP_EXISTS() will no longer be true.
3564  */
3565 lgrp_t *
3566 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3567 {
3568 	int			i;
3569 	lgrp_t			*lgrp;
3570 	klgrpset_t		lgrpset;
3571 	int			lgrps_spanned;
3572 	unsigned long		off;
3573 	lgrp_mem_policy_t	policy;
3574 	lgrp_mem_policy_info_t	*policy_info;
3575 	ushort_t		random;
3576 	int			stat = 0;
3577 	extern struct seg	*segkmap;
3578 
3579 	/*
3580 	 * Just return the root lgroup if the lgrp framework hasn't
3581 	 * finished initializing or if this is a UMA machine.
3582 	 */
3583 	if (nlgrps == 1 || !lgrp_initialized)
3584 		return (lgrp_root);
3585 
3586 	/*
3587 	 * Get memory allocation policy for this segment
3588 	 */
3589 	policy = lgrp_mem_default_policy;
3590 	if (seg != NULL) {
3591 		if (seg->s_as == &kas) {
3592 			if (seg == segkmap)
3593 				policy = lgrp_segmap_default_policy;
3594 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3595 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3596 				policy = LGRP_MEM_POLICY_RANDOM;
3597 		} else {
3598 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3599 			if (policy_info != NULL)
3600 				policy = policy_info->mem_policy;
3601 		}
3602 	}
3603 	lgrpset = 0;
3604 
3605 	/*
3606 	 * Initialize lgroup to home by default
3607 	 */
3608 	lgrp = lgrp_home_lgrp();
3609 
3610 	/*
3611 	 * When homing threads on root lgrp, override default memory
3612 	 * allocation policies with root lgroup memory allocation policy
3613 	 */
3614 	if (lgrp == lgrp_root)
3615 		policy = lgrp_mem_policy_root;
3616 
3617 	/*
3618 	 * Implement policy
3619 	 */
3620 	switch (policy) {
3621 	case LGRP_MEM_POLICY_NEXT_CPU:
3622 
3623 		/*
3624 		 * Return the lgroup of the current CPU, which faulted on the
3625 		 * memory.  If the CPU isn't currently in an lgrp, then opt
3626 		 * to allocate from the root.
3627 		 *
3628 		 * Kernel preemption needs to be disabled here to prevent the
3629 		 * current CPU from going away before its lgrp is found.
3630 		 */
3631 		kpreempt_disable();
3632 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3633 			lgrp = lgrp_root;
3634 		} else {
3635 			lgrp = lgrp_cpu_to_lgrp(CPU);
3636 		}
3637 		kpreempt_enable();
3638 		break;
3639 
3640 	case LGRP_MEM_POLICY_NEXT:
3641 	case LGRP_MEM_POLICY_DEFAULT:
3642 	default:
3643 
3644 		/*
3645 		 * Just return the current thread's home lgroup for the
3646 		 * default policy (next touch).
3647 		 * If the thread is homed to the root, then the default
3648 		 * policy is random across lgroups, so fall through to
3649 		 * the random case.
3650 		 */
3651 		if (lgrp != lgrp_root) {
3652 			if (policy == LGRP_MEM_POLICY_NEXT)
3653 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3654 			else
3655 				lgrp_stat_add(lgrp->lgrp_id,
3656 				    LGRP_NUM_DEFAULT, 1);
3657 			break;
3658 		}
3659 		/* LINTED fallthrough on case statement */
3660 	case LGRP_MEM_POLICY_RANDOM:
3661 
3662 		/*
3663 		 * Return a random leaf lgroup with memory
3664 		 */
3665 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3666 		/*
3667 		 * Count how many lgroups are spanned
3668 		 */
3669 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3670 
3671 		/*
3672 		 * There may be no memnodes in the root lgroup during DR copy
3673 		 * rename on a system with only two boards (memnodes)
3674 		 * configured. In this case just return the root lgrp.
3675 		 */
3676 		if (lgrps_spanned == 0) {
3677 			lgrp = lgrp_root;
3678 			break;
3679 		}
3680 
3681 		/*
3682 		 * Pick a random offset within lgroups spanned
3683 		 * and return lgroup at that offset
3684 		 */
3685 		random = (ushort_t)gethrtime() >> 4;
3686 		off = random % lgrps_spanned;
3687 		ASSERT(off <= lgrp_alloc_max);
3688 
3689 		for (i = 0; i <= lgrp_alloc_max; i++) {
3690 			if (!klgrpset_ismember(lgrpset, i))
3691 				continue;
3692 			if (off)
3693 				off--;
3694 			else {
3695 				lgrp = lgrp_table[i];
3696 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3697 				    1);
3698 				break;
3699 			}
3700 		}
3701 		break;
3702 
3703 	case LGRP_MEM_POLICY_RANDOM_PROC:
3704 
3705 		/*
3706 		 * Grab copy of bitmask of lgroups spanned by
3707 		 * this process
3708 		 */
3709 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3710 		stat = LGRP_NUM_RANDOM_PROC;
3711 
3712 		/* LINTED fallthrough on case statement */
3713 	case LGRP_MEM_POLICY_RANDOM_PSET:
3714 
3715 		if (!stat)
3716 			stat = LGRP_NUM_RANDOM_PSET;
3717 
3718 		if (klgrpset_isempty(lgrpset)) {
3719 			/*
3720 			 * Grab copy of bitmask of lgroups spanned by
3721 			 * this processor set
3722 			 */
3723 			kpreempt_disable();
3724 			klgrpset_copy(lgrpset,
3725 			    curthread->t_cpupart->cp_lgrpset);
3726 			kpreempt_enable();
3727 		}
3728 
3729 		/*
3730 		 * Count how many lgroups are spanned
3731 		 */
3732 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3733 		ASSERT(lgrps_spanned <= nlgrps);
3734 
3735 		/*
3736 		 * lgrps_spanned should probably always be non-zero, but to be
3737 		 * on the safe side we return lgrp_root if the set is empty.
3738 		 */
3739 		if (lgrps_spanned == 0) {
3740 			lgrp = lgrp_root;
3741 			break;
3742 		}
3743 
3744 		/*
3745 		 * Pick a random offset within lgroups spanned
3746 		 * and return lgroup at that offset
3747 		 */
3748 		random = (ushort_t)gethrtime() >> 4;
3749 		off = random % lgrps_spanned;
3750 		ASSERT(off <= lgrp_alloc_max);
3751 
3752 		for (i = 0; i <= lgrp_alloc_max; i++) {
3753 			if (!klgrpset_ismember(lgrpset, i))
3754 				continue;
3755 			if (off)
3756 				off--;
3757 			else {
3758 				lgrp = lgrp_table[i];
3759 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3760 				    1);
3761 				break;
3762 			}
3763 		}
3764 		break;
3765 
3766 	case LGRP_MEM_POLICY_ROUNDROBIN:
3767 
3768 		/*
3769 		 * Use the offset within the segment to determine the
3770 		 * offset from the home lgroup at which to choose the
3771 		 * next lgroup to allocate memory from
3772 		 */
3773 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3774 		    (lgrp_alloc_max + 1);
3775 
3776 		kpreempt_disable();
3777 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3778 		i = lgrp->lgrp_id;
3779 		kpreempt_enable();
3780 
3781 		while (off > 0) {
3782 			i = (i + 1) % (lgrp_alloc_max + 1);
3783 			lgrp = lgrp_table[i];
3784 			if (klgrpset_ismember(lgrpset, i))
3785 				off--;
3786 		}
3787 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3788 
3789 		break;
3790 	}
3791 
3792 	ASSERT(lgrp != NULL);
3793 	return (lgrp);
3794 }
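
/*
 * Example (illustrative sketch, not compiled): a page allocation path might
 * ask lgrp_mem_choose() where to start looking for a free page of the given
 * size.  "seg", "addr" and "pgsz" stand in for the caller's fault context;
 * the LGRP_EXISTS() recheck reflects the DR caveat described above.
 *
 *	lgrp_t	*lgrp;
 *
 *	lgrp = lgrp_mem_choose(seg, addr, pgsz);
 *	if (!LGRP_EXISTS(lgrp))
 *		lgrp = lgrp_root;	(lgroup deleted by DR; fall back)
 *
 *	... allocate from the memnodes spanned by lgrp ...
 */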
3795 
3796 /*
3797  * Return the number of pages in an lgroup
3798  *
3799  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3800  *	 could cause tests that rely on the numat driver to fail....
3801  */
3802 pgcnt_t
3803 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3804 {
3805 	lgrp_t *lgrp;
3806 
3807 	lgrp = lgrp_table[lgrpid];
3808 	if (!LGRP_EXISTS(lgrp) ||
3809 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3810 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3811 		return (0);
3812 
3813 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3814 }
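
/*
 * Example (illustrative sketch, not compiled): summing free memory across
 * the lgroups that directly contain memory (lgrp_mem_size() returns 0 for
 * the others).  LGRP_MEM_SIZE_FREE is assumed to be one of the
 * lgrp_mem_query_t values from <sys/lgrp.h>.
 *
 *	pgcnt_t		free_pages = 0;
 *	lgrp_id_t	lgrpid;
 *
 *	for (lgrpid = 0; lgrpid <= lgrp_alloc_max; lgrpid++)
 *		free_pages += lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 */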
3815 
3816 /*
3817  * Initialize lgroup shared memory allocation policy support
3818  */
3819 void
3820 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3821 {
3822 	lgrp_shm_locality_t	*shm_locality;
3823 
3824 	/*
3825 	 * Initialize the locality field in the anon_map.
3826 	 * No locks are needed because this is called when the anon_map is
3827 	 * allocated, but not yet used anywhere.
3828 	 */
3829 	if (amp) {
3830 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3831 		if (amp->locality == NULL) {
3832 			/*
3833 			 * Allocate and initialize shared memory locality info
3834 			 * and set anon_map locality pointer to it
3835 			 * Drop lock across kmem_alloc(KM_SLEEP)
3836 			 */
3837 			ANON_LOCK_EXIT(&amp->a_rwlock);
3838 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3839 			    KM_SLEEP);
3840 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3841 			    NULL);
3842 			shm_locality->loc_count = 1;	/* not used for amp */
3843 			shm_locality->loc_tree = NULL;
3844 
3845 			/*
3846 			 * Reacquire lock and check to see whether anyone beat
3847 			 * us to initializing the locality info
3848 			 */
3849 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3850 			if (amp->locality != NULL) {
3851 				rw_destroy(&shm_locality->loc_lock);
3852 				kmem_free(shm_locality,
3853 				    sizeof (*shm_locality));
3854 			} else
3855 				amp->locality = shm_locality;
3856 		}
3857 		ANON_LOCK_EXIT(&amp->a_rwlock);
3858 		return;
3859 	}
3860 
3861 	/*
3862 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3863 	 */
3864 	mutex_enter(&vp->v_lock);
3865 	if ((vp->v_flag & V_LOCALITY) == 0) {
3866 		/*
3867 		 * Allocate and initialize shared memory locality info
3868 		 */
3869 		mutex_exit(&vp->v_lock);
3870 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3871 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3872 		shm_locality->loc_count = 1;
3873 		shm_locality->loc_tree = NULL;
3874 
3875 		/*
3876 		 * Point vnode locality field at shared vnode policy info
3877 		 * and set locality aware flag in vnode
3878 		 */
3879 		mutex_enter(&vp->v_lock);
3880 		if ((vp->v_flag & V_LOCALITY) == 0) {
3881 			vp->v_locality = shm_locality;
3882 			vp->v_flag |= V_LOCALITY;
3883 		} else {
3884 			/*
3885 			 * Lost race so free locality info and increment count.
3886 			 */
3887 			rw_destroy(&shm_locality->loc_lock);
3888 			kmem_free(shm_locality, sizeof (*shm_locality));
3889 			shm_locality = vp->v_locality;
3890 			shm_locality->loc_count++;
3891 		}
3892 		mutex_exit(&vp->v_lock);
3893 
3894 		return;
3895 	}
3896 
3897 	/*
3898 	 * Increment the count of segments mapping this vnode
3899 	 * shared.
3900 	 */
3901 	shm_locality = vp->v_locality;
3902 	shm_locality->loc_count++;
3903 	mutex_exit(&vp->v_lock);
3904 }
3905 
3906 /*
3907  * Destroy the given shared memory policy segment tree
3908  */
3909 void
3910 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3911 {
3912 	lgrp_shm_policy_seg_t	*cur;
3913 	lgrp_shm_policy_seg_t	*next;
3914 
3915 	if (tree == NULL)
3916 		return;
3917 
3918 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3919 	while (cur != NULL) {
3920 		next = AVL_NEXT(tree, cur);
3921 		avl_remove(tree, cur);
3922 		kmem_free(cur, sizeof (*cur));
3923 		cur = next;
3924 	}
3925 	kmem_free(tree, sizeof (avl_tree_t));
3926 }
3927 
3928 /*
3929  * Uninitialize lgroup shared memory allocation policy support
3930  */
3931 void
3932 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3933 {
3934 	lgrp_shm_locality_t	*shm_locality;
3935 
3936 	/*
3937 	 * For an anon_map, deallocate the shared memory policy tree and
3938 	 * zero the locality field.
3939 	 * No locks are needed because the anon_map is being freed.
3940 	 */
3941 	if (amp) {
3942 		if (amp->locality == NULL)
3943 			return;
3944 		shm_locality = amp->locality;
3945 		shm_locality->loc_count = 0;	/* not really used for amp */
3946 		rw_destroy(&shm_locality->loc_lock);
3947 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3948 		kmem_free(shm_locality, sizeof (*shm_locality));
3949 		amp->locality = 0;
3950 		return;
3951 	}
3952 
3953 	/*
3954 	 * For vnode, decrement reference count of segments mapping this vnode
3955 	 * shared and delete locality info if reference count drops to 0
3956 	 */
3957 	mutex_enter(&vp->v_lock);
3958 	shm_locality = vp->v_locality;
3959 	shm_locality->loc_count--;
3960 
3961 	if (shm_locality->loc_count == 0) {
3962 		rw_destroy(&shm_locality->loc_lock);
3963 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3964 		kmem_free(shm_locality, sizeof (*shm_locality));
3965 		vp->v_locality = 0;
3966 		vp->v_flag &= ~V_LOCALITY;
3967 	}
3968 	mutex_exit(&vp->v_lock);
3969 }
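
/*
 * Example (illustrative, call sites hypothetical): lgrp_shm_policy_init()
 * and lgrp_shm_policy_fini() are expected to be called in matched pairs by
 * the code that attaches and detaches shared mappings.
 *
 *	At attach time, for an anon_map-backed object:
 *		lgrp_shm_policy_init(amp, NULL);
 *	or, for a vnode-backed object:
 *		lgrp_shm_policy_init(NULL, vp);
 *
 *	At detach/free time, the matching call drops the reference and, for
 *	a vnode, frees the locality info once the count reaches zero:
 *		lgrp_shm_policy_fini(amp, NULL);
 *	or:
 *		lgrp_shm_policy_fini(NULL, vp);
 */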
3970 
3971 /*
3972  * Compare two shared memory policy segments
3973  * Used by AVL tree code for searching
3974  */
3975 int
3976 lgrp_shm_policy_compar(const void *x, const void *y)
3977 {
3978 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3979 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3980 
3981 	if (a->shm_off < b->shm_off)
3982 		return (-1);
3983 	if (a->shm_off >= b->shm_off + b->shm_size)
3984 		return (1);
3985 	return (0);
3986 }
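
/*
 * Because the comparator treats the tree node's range as
 * [shm_off, shm_off + shm_size), an avl_find() keyed by a bare offset
 * returns the segment containing that offset.  Illustrative sketch (not
 * compiled); like the avl_find(tree, &off, &where) calls elsewhere in this
 * file, it assumes shm_off is the first member of lgrp_shm_policy_seg_t.
 *
 *	u_offset_t		off = byte_offset_into_object;
 *	avl_index_t		where;
 *	lgrp_shm_policy_seg_t	*seg;
 *
 *	seg = avl_find(tree, &off, &where);
 *	(seg != NULL means off falls within seg's [shm_off, shm_off+shm_size))
 */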
3987 
3988 /*
3989  * Concatenate seg1 with seg2 and remove seg2
3990  */
3991 static int
3992 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
3993     lgrp_shm_policy_seg_t *seg2)
3994 {
3995 	if (!seg1 || !seg2 ||
3996 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
3997 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
3998 		return (-1);
3999 
4000 	seg1->shm_size += seg2->shm_size;
4001 	avl_remove(tree, seg2);
4002 	kmem_free(seg2, sizeof (*seg2));
4003 	return (0);
4004 }
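
/*
 * Worked example (illustrative only): given two adjacent segments with the
 * same policy,
 *
 *	seg1: [0, 8K)   LGRP_MEM_POLICY_NEXT
 *	seg2: [8K, 16K) LGRP_MEM_POLICY_NEXT
 *
 * lgrp_shm_policy_concat(tree, seg1, seg2) grows seg1 to [0, 16K), removes
 * seg2 from the tree, frees it, and returns 0.  If the segments aren't
 * adjacent or their policies differ, nothing changes and -1 is returned.
 */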
4005 
4006 /*
4007  * Split segment at given offset and return rightmost (uppermost) segment
4008  * Assumes that there are no overlapping segments
4009  */
4010 static lgrp_shm_policy_seg_t *
4011 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4012     u_offset_t off)
4013 {
4014 	lgrp_shm_policy_seg_t	*newseg;
4015 	avl_index_t		where;
4016 
4017 	ASSERT(seg != NULL);
4018 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4019 
4020 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4021 	    seg->shm_size)
4022 		return (NULL);
4023 
4024 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4025 		return (seg);
4026 
4027 	/*
4028 	 * Adjust size of left segment and allocate new (right) segment
4029 	 */
4030 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4031 	newseg->shm_policy = seg->shm_policy;
4032 	newseg->shm_off = off;
4033 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4034 	seg->shm_size = off - seg->shm_off;
4035 
4036 	/*
4037 	 * Find where to insert new segment in AVL tree and insert it
4038 	 */
4039 	(void) avl_find(tree, &off, &where);
4040 	avl_insert(tree, newseg, where);
4041 
4042 	return (newseg);
4043 }
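
/*
 * Worked example (illustrative only): splitting the segment [0, 16K) at
 * offset 8K shrinks the original (left) segment to [0, 8K), inserts a new
 * (right) segment [8K, 16K) carrying the same policy, and returns the new
 * segment.  Splitting at an existing boundary (offset 0 or 16K here) is a
 * no-op that returns the original segment.
 */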
4044 
4045 /*
4046  * Set shared memory allocation policy on specified shared object at given
4047  * offset and length
4048  *
4049  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4050  * -1 if the policy can't be set.
4051  */
4052 int
4053 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4054     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4055 {
4056 	u_offset_t		eoff;
4057 	lgrp_shm_policy_seg_t	*next;
4058 	lgrp_shm_policy_seg_t	*newseg;
4059 	u_offset_t		off;
4060 	u_offset_t		oldeoff;
4061 	lgrp_shm_policy_seg_t	*prev;
4062 	int			retval;
4063 	lgrp_shm_policy_seg_t	*seg;
4064 	lgrp_shm_locality_t	*shm_locality;
4065 	avl_tree_t		*tree;
4066 	avl_index_t		where;
4067 
4068 	ASSERT(amp || vp);
4069 	ASSERT((len & PAGEOFFSET) == 0);
4070 
4071 	if (len == 0)
4072 		return (-1);
4073 
4074 	retval = 0;
4075 
4076 	/*
4077 	 * Get locality info and starting offset into shared object
4078 	 * Try anon map first and then vnode
4079 	 * Assume that no locks need to be held on anon_map or vnode, since
4080 	 * Assume that no locks need to be held on the anon_map or vnode,
4081 	 * since they should be protected by their reference count, which
4082 	 * must be nonzero for an existing segment.
4083 	if (amp) {
4084 		/*
4085 		 * Get policy info from anon_map
4087 		 */
4088 		ASSERT(amp->refcnt != 0);
4089 		if (amp->locality == NULL)
4090 			lgrp_shm_policy_init(amp, NULL);
4091 		shm_locality = amp->locality;
4092 		off = ptob(anon_index);
4093 	} else if (vp) {
4094 		/*
4095 		 * Get policy info from vnode
4096 		 */
4097 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4098 			lgrp_shm_policy_init(NULL, vp);
4099 		shm_locality = vp->v_locality;
4100 		ASSERT(shm_locality->loc_count != 0);
4101 		off = vn_off;
4102 	} else
4103 		return (-1);
4104 
4105 	ASSERT((off & PAGEOFFSET) == 0);
4106 
4107 	/*
4108 	 * Figure out default policy
4109 	 */
4110 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4111 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4112 
4113 	/*
4114 	 * Create AVL tree if there isn't one yet
4115 	 * and set locality field to point at it
4116 	 */
4117 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4118 	tree = shm_locality->loc_tree;
4119 	if (!tree) {
4120 		rw_exit(&shm_locality->loc_lock);
4121 
4122 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4123 
4124 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4125 		if (shm_locality->loc_tree == NULL) {
4126 			avl_create(tree, lgrp_shm_policy_compar,
4127 			    sizeof (lgrp_shm_policy_seg_t),
4128 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4129 			shm_locality->loc_tree = tree;
4130 		} else {
4131 			/*
4132 			 * Another thread managed to set up the tree
4133 			 * before we could. Free the tree we allocated
4134 			 * and use the one that's already there.
4135 			 */
4136 			kmem_free(tree, sizeof (*tree));
4137 			tree = shm_locality->loc_tree;
4138 		}
4139 	}
4140 
4141 	/*
4142 	 * Set policy
4143 	 *
4144 	 * Need to maintain hold on writer's lock to keep tree from
4145 	 * changing out from under us
4146 	 */
4147 	while (len != 0) {
4148 		/*
4149 		 * Find policy segment for specified offset into shared object
4150 		 */
4151 		seg = avl_find(tree, &off, &where);
4152 
4153 		/*
4154 		 * Didn't find any existing segment that contains specified
4155 		 * offset, so allocate new segment, insert it, and concatenate
4156 		 * with adjacent segments if possible
4157 		 */
4158 		if (seg == NULL) {
4159 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4160 			    KM_SLEEP);
4161 			newseg->shm_policy.mem_policy = policy;
4162 			newseg->shm_policy.mem_reserved = 0;
4163 			newseg->shm_off = off;
4164 			avl_insert(tree, newseg, where);
4165 
4166 			/*
4167 			 * Check to see whether new segment overlaps with next
4168 			 * one, set length of new segment accordingly, and
4169 			 * calculate remaining length and next offset
4170 			 */
4171 			seg = AVL_NEXT(tree, newseg);
4172 			if (seg == NULL || off + len <= seg->shm_off) {
4173 				newseg->shm_size = len;
4174 				len = 0;
4175 			} else {
4176 				newseg->shm_size = seg->shm_off - off;
4177 				off = seg->shm_off;
4178 				len -= newseg->shm_size;
4179 			}
4180 
4181 			/*
4182 			 * Try to concatenate new segment with next and
4183 			 * previous ones, since they might have the same policy
4184 			 * now.  Grab previous and next segments first because
4185 			 * they will change on concatenation.
4186 			 */
4187 			prev =  AVL_PREV(tree, newseg);
4188 			next = AVL_NEXT(tree, newseg);
4189 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4190 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4191 
4192 			continue;
4193 		}
4194 
4195 		eoff = off + len;
4196 		oldeoff = seg->shm_off + seg->shm_size;
4197 
4198 		/*
4199 		 * Policy set already?
4200 		 */
4201 		if (policy == seg->shm_policy.mem_policy) {
4202 			/*
4203 			 * Nothing left to do if offset and length
4204 			 * fall within this segment
4205 			 */
4206 			if (eoff <= oldeoff) {
4207 				retval = 1;
4208 				break;
4209 			} else {
4210 				len = eoff - oldeoff;
4211 				off = oldeoff;
4212 				continue;
4213 			}
4214 		}
4215 
4216 		/*
4217 		 * Specified offset and length match existing segment exactly
4218 		 */
4219 		if (off == seg->shm_off && len == seg->shm_size) {
4220 			/*
4221 			 * Set policy and update current length
4222 			 */
4223 			seg->shm_policy.mem_policy = policy;
4224 			seg->shm_policy.mem_reserved = 0;
4225 			len = 0;
4226 
4227 			/*
4228 			 * Try concatenating new segment with previous and next
4229 			 * segments, since they might have the same policy now.
4230 			 * Grab previous and next segments first because they
4231 			 * will change on concatenation.
4232 			 */
4233 			prev =  AVL_PREV(tree, seg);
4234 			next = AVL_NEXT(tree, seg);
4235 			(void) lgrp_shm_policy_concat(tree, seg, next);
4236 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4237 		} else {
4238 			/*
4239 			 * Specified offset and length only apply to part of
4240 			 * existing segment
4241 			 */
4242 
4243 			/*
4244 			 * New segment starts in middle of old one, so split
4245 			 * new one off near beginning of old one
4246 			 */
4247 			newseg = NULL;
4248 			if (off > seg->shm_off) {
4249 				newseg = lgrp_shm_policy_split(tree, seg, off);
4250 
4251 				/*
4252 				 * New segment ends where old one did, so try
4253 				 * to concatenate with next segment
4254 				 */
4255 				if (eoff == oldeoff) {
4256 					newseg->shm_policy.mem_policy = policy;
4257 					newseg->shm_policy.mem_reserved = 0;
4258 					(void) lgrp_shm_policy_concat(tree,
4259 					    newseg, AVL_NEXT(tree, newseg));
4260 					break;
4261 				}
4262 			}
4263 
4264 			/*
4265 			 * New segment ends before old one, so split off end of
4266 			 * old one
4267 			 */
4268 			if (eoff < oldeoff) {
4269 				if (newseg) {
4270 					(void) lgrp_shm_policy_split(tree,
4271 					    newseg, eoff);
4272 					newseg->shm_policy.mem_policy = policy;
4273 					newseg->shm_policy.mem_reserved = 0;
4274 				} else {
4275 					(void) lgrp_shm_policy_split(tree, seg,
4276 					    eoff);
4277 					seg->shm_policy.mem_policy = policy;
4278 					seg->shm_policy.mem_reserved = 0;
4279 				}
4280 
4281 				if (off == seg->shm_off)
4282 					(void) lgrp_shm_policy_concat(tree,
4283 					    AVL_PREV(tree, seg), seg);
4284 				break;
4285 			}
4286 
4287 			/*
4288 			 * Calculate remaining length and next offset
4289 			 */
4290 			len = eoff - oldeoff;
4291 			off = oldeoff;
4292 		}
4293 	}
4294 
4295 	rw_exit(&shm_locality->loc_lock);
4296 	return (retval);
4297 }
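
/*
 * Example (illustrative sketch, not compiled): applying a policy to a
 * page-aligned range of a shared object, e.g. from an madvise()-style path.
 * "amp", "an_idx", "vp", "vn_off" and "len" are hypothetical caller
 * variables; len must be a multiple of PAGESIZE.
 *
 *	int	already;
 *
 *	For an anon_map-backed (e.g. MAP_ANON|MAP_SHARED) object:
 *		already = lgrp_shm_policy_set(LGRP_MEM_POLICY_ROUNDROBIN,
 *		    amp, an_idx, NULL, 0, len);
 *
 *	For a vnode-backed object:
 *		already = lgrp_shm_policy_set(LGRP_MEM_POLICY_ROUNDROBIN,
 *		    NULL, 0, vp, vn_off, len);
 *
 *	already is 0 if the policy was newly set, 1 if the whole range
 *	already had that policy, and -1 if the policy could not be set.
 */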
4298 
4299 /*
4300  * Return the best memnode from which to allocate memory given
4301  * an lgroup.
4302  *
4303  * "c" is for cookie, which is good enough for me.
4304  * It references a cookie struct that should be zeroed to initialize.
4305  * The cookie should live on the caller's stack.
4306  *
4307  * The routine returns -1 when:
4308  *	- the scope is LGRP_SRCH_LOCAL and all the memnodes in "lgrp"
4309  *	  have been returned.
4310  *	- the scope spans the hierarchy and all memnodes have been returned.
4311  */
4312 int
4313 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4314 {
4315 	lgrp_t		*lp = c->lmc_lgrp;
4316 	mnodeset_t	nodes = c->lmc_nodes;
4317 	int		cnt = c->lmc_cnt;
4318 	int		offset, mnode;
4319 
4320 	extern int	max_mem_nodes;
4321 
4322 	/*
4323 	 * If the set is empty, and the caller is willing, traverse
4324 	 * up the hierarchy until we find a non-empty set.
4325 	 */
4326 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4327 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4328 		    ((lp = lp->lgrp_parent) == NULL))
4329 			return (-1);
4330 
4331 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4332 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4333 	}
4334 
4335 	/*
4336 	 * Select a memnode by picking one at a "random" offset.
4337 	 * Because of DR, memnodes can come and go at any time.
4338 	 * This code must be able to cope with the possibility
4339 	 * that the nodes count "cnt" is inconsistent with respect
4340 	 * to the number of elements actually in "nodes", and
4341 	 * therefore that the offset chosen could be greater than
4342 	 * the number of elements in the set (some memnodes may
4343 	 * have disappeared just before cnt was read).
4344 	 * If this happens, the search simply wraps back to the
4345 	 * beginning of the set.
4346 	 */
4347 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4348 	offset = c->lmc_rand % cnt;
4349 	do {
4350 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4351 			if (nodes & ((mnodeset_t)1 << mnode))
4352 				if (!offset--)
4353 					break;
4354 	} while (mnode >= max_mem_nodes);
4355 
4356 	/* Found a node. Store state before returning. */
4357 	c->lmc_lgrp = lp;
4358 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4359 	c->lmc_cnt = cnt - 1;
4360 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4361 	c->lmc_ntried++;
4362 
4363 	return (mnode);
4364 }
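
/*
 * Example (illustrative sketch, not compiled): walking the memnodes usable
 * from an lgroup.  This assumes the LGRP_MNODE_COOKIE_INIT() macro and the
 * LGRP_SRCH_HIER scope from <sys/lgrp.h>; if those names differ, the intent
 * is simply to zero the cookie, seed it from the lgroup, and pick a search
 * scope before the first call.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) >= 0) {
 *		(try to allocate from mnode; on failure the next call
 *		returns another memnode, walking up the hierarchy)
 *	}
 */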
4365