xref: /titanic_50/usr/src/uts/common/os/lgrp.c (revision e127a3e717f822eb855235fa3bd08235b2cf533d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/pg.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to kstat framework. It is protected from parallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168 
169 static lgrp_t	lroot;
170 
171 /*
172  * Size, in bytes, beyond which random memory allocation policy is applied
173  * to non-shared memory.  Default is the maximum size, so random memory
174  * allocation won't be used for non-shared memory by default.
175  */
176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
177 
178 /* the maximum effect that a single thread can have on its lgroup's load */
179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 	((lgrp_loadavg_max_effect) / (ncpu))
181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 
183 
184 /*
185  * Size, in bytes, beyond which random memory allocation policy is applied to
186  * shared memory.  Default is 8MB (2 ISM pages).
187  */
188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
189 
190 /*
191  * Whether to do processor set aware memory allocation by default
192  */
193 int	lgrp_mem_pset_aware = 0;
194 
195 /*
196  * Set the default memory allocation policy for root lgroup
197  */
198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 
200 /*
201  * Set the default memory allocation policy.  For most platforms,
202  * next touch is sufficient, but some platforms may wish to override
203  * this.
204  */
205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
208 /*
209  * lgroup CPU event handlers
210  */
211 static void	lgrp_cpu_init(struct cpu *);
212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214 
215 /*
216  * lgroup memory event handlers
217  */
218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 
222 /*
223  * lgroup CPU partition event handlers
224  */
225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 static void	lgrp_part_del_cpu(struct cpu *);
227 
228 static void	lgrp_root_init(void);
229 
230 /*
231  * lpl topology
232  */
233 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234 static void	lpl_clear(lpl_t *);
235 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237 static void	lpl_rset_add(lpl_t *, lpl_t *);
238 static void	lpl_rset_del(lpl_t *, lpl_t *);
239 static int	lpl_rset_contains(lpl_t *, lpl_t *);
240 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241 static void	lpl_child_update(lpl_t *, struct cpupart *);
242 static int	lpl_pick(lpl_t *, lpl_t *);
243 static void	lpl_verify_wrapper(struct cpupart *);
244 
245 /*
246  * defines for lpl topology verifier return codes
247  */
248 
249 #define	LPL_TOPO_CORRECT			0
250 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
251 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
252 #define	LPL_TOPO_LGRP_MISMATCH			-3
253 #define	LPL_TOPO_MISSING_PARENT			-4
254 #define	LPL_TOPO_PARENT_MISMATCH		-5
255 #define	LPL_TOPO_BAD_CPUCNT			-6
256 #define	LPL_TOPO_RSET_MISMATCH			-7
257 #define	LPL_TOPO_LPL_ORPHANED			-8
258 #define	LPL_TOPO_LPL_BAD_NCPU			-9
259 #define	LPL_TOPO_RSET_MSSNG_LF			-10
260 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
261 #define	LPL_TOPO_BOGUS_HINT			-12
262 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
263 #define	LPL_TOPO_LGRP_NOT_LEAF			-14
264 #define	LPL_TOPO_BAD_RSETCNT			-15
265 
266 /*
267  * Return whether lgroup optimizations should be enabled on this system
268  */
269 int
270 lgrp_optimizations(void)
271 {
272 	/*
273 	 * System must have more than 2 lgroups to enable lgroup optimizations
274 	 *
275 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
276 	 * with one child lgroup containing all the resources. A 2 lgroup
277 	 * system with a root lgroup directly containing CPUs or memory might
278 	 * need lgroup optimizations with its child lgroup, but there
279 	 * isn't such a machine for now....
280 	 */
281 	if (nlgrps > 2)
282 		return (1);
283 
284 	return (0);
285 }
286 
287 /*
288  * Build full lgroup topology
289  */
290 static void
291 lgrp_root_init(void)
292 {
293 	lgrp_handle_t	hand;
294 	int		i;
295 	lgrp_id_t	id;
296 
297 	/*
298 	 * Create the "root" lgroup
299 	 */
300 	ASSERT(nlgrps == 0);
301 	id = nlgrps++;
302 
303 	lgrp_root = &lroot;
304 
305 	lgrp_root->lgrp_cpu = NULL;
306 	lgrp_root->lgrp_mnodes = 0;
307 	lgrp_root->lgrp_nmnodes = 0;
308 	hand = lgrp_plat_root_hand();
309 	lgrp_root->lgrp_plathand = hand;
310 
311 	lgrp_root->lgrp_id = id;
312 	lgrp_root->lgrp_cpucnt = 0;
313 	lgrp_root->lgrp_childcnt = 0;
314 	klgrpset_clear(lgrp_root->lgrp_children);
315 	klgrpset_clear(lgrp_root->lgrp_leaves);
316 	lgrp_root->lgrp_parent = NULL;
317 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318 
319 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320 		klgrpset_clear(lgrp_root->lgrp_set[i]);
321 
322 	lgrp_root->lgrp_kstat = NULL;
323 
324 	lgrp_table[id] = lgrp_root;
325 
326 	/*
327 	 * Setup initial lpl list for CPU0 and initial t0 home.
328 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329 	 * all topology operations until cp_default is initialized at which
330 	 * point t0.t_lpl will be updated.
331 	 */
332 	lpl_bootstrap = lpl_bootstrap_list;
333 	t0.t_lpl = lpl_bootstrap;
334 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336 	cp_default.cp_lgrploads = lpl_bootstrap;
337 }
338 
339 /*
340  * Initialize the lgroup framework and allow the platform to do the same
341  */
342 void
343 lgrp_init(void)
344 {
345 	/*
346 	 * Initialize the platform
347 	 */
348 	lgrp_plat_init();
349 
350 	/*
351 	 * Set max number of lgroups supported on this platform which must be
352 	 * less than the max number of lgroups supported by the common lgroup
353 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
354 	 */
355 	nlgrpsmax = lgrp_plat_max_lgrps();
356 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
357 }
358 
359 /*
360  * Create the root and cpu0's lgroup, and set t0's home.
361  */
362 void
363 lgrp_setup(void)
364 {
365 	/*
366 	 * Setup the root lgroup
367 	 */
368 	lgrp_root_init();
369 
370 	/*
371 	 * Add cpu0 to an lgroup
372 	 */
373 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
374 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
375 }
376 
377 /*
378  * Lgroup initialization is split in two parts. The first part
379  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
380  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
381  * when all CPUs are brought online and all distance information is available.
382  *
383  * When lgrp_main_init() is complete it sets lgrp_initialized. The
384  * lgrp_main_mp_init() sets lgrp_topo_initialized.
385  */
386 
387 /*
388  * true when lgrp initialization has been completed.
389  */
390 int	lgrp_initialized = 0;
391 
392 /*
393  * True when lgrp topology is constructed.
394  */
395 int	lgrp_topo_initialized = 0;
396 
397 /*
398  * Init routine called after startup(), /etc/system has been processed,
399  * and cpu0 has been added to an lgroup.
400  */
401 void
402 lgrp_main_init(void)
403 {
404 	cpu_t		*cp = CPU;
405 	lgrp_id_t	lgrpid;
406 	int		i;
407 	extern void	pg_cpu0_reinit();
408 
409 	/*
410 	 * Enforce a valid lgrp_mem_default_policy
411 	 */
412 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
413 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
414 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
415 
416 	/*
417 	 * See if mpo should be disabled.
418 	 * This may happen in the case of null proc LPA on Starcat.
419 	 * The platform won't be able to detect null proc LPA until after
420 	 * cpu0 and memory have already been added to lgroups.
421 	 * When and if it is detected, the Starcat platform will return
422 	 * a different platform handle for cpu0 which is what we check for
423 	 * here. If mpo should be disabled move cpu0 to it's rightful place
424 	 * (the root), and destroy the remaining lgroups. This effectively
425 	 * provides an UMA lgroup topology.
426 	 */
427 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
428 	if (lgrp_table[lgrpid]->lgrp_plathand !=
429 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
430 		lgrp_part_del_cpu(cp);
431 		lgrp_cpu_fini(cp, lgrpid);
432 
433 		lgrp_cpu_init(cp);
434 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
435 
436 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
437 
438 		/*
439 		 * Notify the PG subsystem that the CPU's lgrp
440 		 * association has changed
441 		 */
442 		pg_cpu0_reinit();
443 
444 		/*
445 		 * Destroy all lgroups except for root
446 		 */
447 		for (i = 0; i <= lgrp_alloc_max; i++) {
448 			if (LGRP_EXISTS(lgrp_table[i]) &&
449 			    lgrp_table[i] != lgrp_root)
450 				lgrp_destroy(lgrp_table[i]);
451 		}
452 
453 		/*
454 		 * Fix up root to point at itself for leaves and resources
455 		 * and not have any children
456 		 */
457 		lgrp_root->lgrp_childcnt = 0;
458 		klgrpset_clear(lgrp_root->lgrp_children);
459 		klgrpset_clear(lgrp_root->lgrp_leaves);
460 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
461 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
462 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
463 	}
464 
465 	/*
466 	 * Initialize kstats framework.
467 	 */
468 	lgrp_kstat_init();
469 	/*
470 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
471 	 */
472 	mutex_enter(&cpu_lock);
473 	lgrp_kstat_create(cp);
474 	mutex_exit(&cpu_lock);
475 
476 	lgrp_plat_main_init();
477 	lgrp_initialized = 1;
478 }
479 
480 /*
481  * Finish lgrp initialization after all CPUS are brought on-line.
482  * This routine is called after start_other_cpus().
483  */
484 void
485 lgrp_main_mp_init(void)
486 {
487 	klgrpset_t changed;
488 
489 	/*
490 	 * Update lgroup topology (if necessary)
491 	 */
492 	klgrpset_clear(changed);
493 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
494 	lgrp_topo_initialized = 1;
495 }
496 
497 /*
498  * Change latency of lgroup with specified lgroup platform handle (if one is
499  * given) or change all lgroups with old latency to new latency
500  */
501 void
502 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
503     u_longlong_t newtime)
504 {
505 	lgrp_t		*lgrp;
506 	int		i;
507 
508 	for (i = 0; i <= lgrp_alloc_max; i++) {
509 		lgrp = lgrp_table[i];
510 
511 		if (!LGRP_EXISTS(lgrp))
512 			continue;
513 
514 		if ((hand == LGRP_NULL_HANDLE &&
515 		    lgrp->lgrp_latency == oldtime) ||
516 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
517 			lgrp->lgrp_latency = (int)newtime;
518 	}
519 }
520 
521 /*
522  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
523  */
524 void
525 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
526 {
527 	klgrpset_t	changed;
528 	cpu_t		*cp;
529 	lgrp_id_t	id;
530 	int		rc;
531 
532 	switch (event) {
533 	/*
534 	 * The following (re)configuration events are common code
535 	 * initiated. lgrp_plat_config() is called here to inform the
536 	 * platform of the reconfiguration event.
537 	 */
538 	case LGRP_CONFIG_CPU_ADD:
539 		cp = (cpu_t *)resource;
540 
541 		/*
542 		 * Initialize the new CPU's lgrp related next/prev
543 		 * links, and give it a bootstrap lpl so that it can
544 		 * survive should it need to enter the dispatcher.
545 		 */
546 		cp->cpu_next_lpl = cp;
547 		cp->cpu_prev_lpl = cp;
548 		cp->cpu_next_lgrp = cp;
549 		cp->cpu_prev_lgrp = cp;
550 		cp->cpu_lpl = lpl_bootstrap;
551 
552 		lgrp_plat_config(event, resource);
553 		atomic_add_32(&lgrp_gen, 1);
554 
555 		break;
556 	case LGRP_CONFIG_CPU_DEL:
557 		lgrp_plat_config(event, resource);
558 		atomic_add_32(&lgrp_gen, 1);
559 
560 		break;
561 	case LGRP_CONFIG_CPU_ONLINE:
562 		cp = (cpu_t *)resource;
563 		lgrp_cpu_init(cp);
564 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
565 		rc = lpl_topo_verify(cp->cpu_part);
566 		if (rc != LPL_TOPO_CORRECT) {
567 			panic("lpl_topo_verify failed: %d", rc);
568 		}
569 		lgrp_plat_config(event, resource);
570 		atomic_add_32(&lgrp_gen, 1);
571 
572 		break;
573 	case LGRP_CONFIG_CPU_OFFLINE:
574 		cp = (cpu_t *)resource;
575 		id = cp->cpu_lpl->lpl_lgrpid;
576 		lgrp_part_del_cpu(cp);
577 		lgrp_cpu_fini(cp, id);
578 		rc = lpl_topo_verify(cp->cpu_part);
579 		if (rc != LPL_TOPO_CORRECT) {
580 			panic("lpl_topo_verify failed: %d", rc);
581 		}
582 		lgrp_plat_config(event, resource);
583 		atomic_add_32(&lgrp_gen, 1);
584 
585 		break;
586 	case LGRP_CONFIG_CPUPART_ADD:
587 		cp = (cpu_t *)resource;
588 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
589 		rc = lpl_topo_verify(cp->cpu_part);
590 		if (rc != LPL_TOPO_CORRECT) {
591 			panic("lpl_topo_verify failed: %d", rc);
592 		}
593 		lgrp_plat_config(event, resource);
594 
595 		break;
596 	case LGRP_CONFIG_CPUPART_DEL:
597 		cp = (cpu_t *)resource;
598 		lgrp_part_del_cpu((cpu_t *)resource);
599 		rc = lpl_topo_verify(cp->cpu_part);
600 		if (rc != LPL_TOPO_CORRECT) {
601 			panic("lpl_topo_verify failed: %d", rc);
602 		}
603 		lgrp_plat_config(event, resource);
604 
605 		break;
606 	/*
607 	 * The following events are initiated by the memnode
608 	 * subsystem.
609 	 */
610 	case LGRP_CONFIG_MEM_ADD:
611 		lgrp_mem_init((int)resource, where, B_FALSE);
612 		atomic_add_32(&lgrp_gen, 1);
613 
614 		break;
615 	case LGRP_CONFIG_MEM_DEL:
616 		lgrp_mem_fini((int)resource, where, B_FALSE);
617 		atomic_add_32(&lgrp_gen, 1);
618 
619 		break;
620 	case LGRP_CONFIG_MEM_RENAME: {
621 		lgrp_config_mem_rename_t *ren_arg =
622 		    (lgrp_config_mem_rename_t *)where;
623 
624 		lgrp_mem_rename((int)resource,
625 		    ren_arg->lmem_rename_from,
626 		    ren_arg->lmem_rename_to);
627 		atomic_add_32(&lgrp_gen, 1);
628 
629 		break;
630 	}
631 	case LGRP_CONFIG_GEN_UPDATE:
632 		atomic_add_32(&lgrp_gen, 1);
633 
634 		break;
635 	case LGRP_CONFIG_FLATTEN:
636 		if (where == 0)
637 			lgrp_topo_levels = (int)resource;
638 		else
639 			(void) lgrp_topo_flatten(resource,
640 			    lgrp_table, lgrp_alloc_max, &changed);
641 
642 		break;
643 	/*
644 	 * Update any lgroups with old latency to new latency
645 	 */
646 	case LGRP_CONFIG_LAT_CHANGE_ALL:
647 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
648 		    (u_longlong_t)where);
649 
650 		break;
651 	/*
652 	 * Update lgroup with specified lgroup platform handle to have
653 	 * new latency
654 	 */
655 	case LGRP_CONFIG_LAT_CHANGE:
656 		lgrp_latency_change((lgrp_handle_t)resource, 0,
657 		    (u_longlong_t)where);
658 
659 		break;
660 	case LGRP_CONFIG_NOP:
661 
662 		break;
663 	default:
664 		break;
665 	}
666 
667 }
668 
669 /*
670  * Called to add lgrp info into cpu structure from cpu_add_unit;
671  * do not assume cpu is in cpu[] yet!
672  *
673  * CPUs are brought online with all other CPUs paused so we can't
674  * allocate memory or we could deadlock the system, so we rely on
675  * the platform to statically allocate as much space as we need
676  * for the lgrp structs and stats.
677  */
678 static void
679 lgrp_cpu_init(struct cpu *cp)
680 {
681 	klgrpset_t	changed;
682 	int		count;
683 	lgrp_handle_t	hand;
684 	int		first_cpu;
685 	lgrp_t		*my_lgrp;
686 	lgrp_id_t	lgrpid;
687 	struct cpu	*cptr;
688 
689 	/*
690 	 * This is the first time through if the resource set
691 	 * for the root lgroup is empty. After cpu0 has been
692 	 * initially added to an lgroup, the root's CPU resource
693 	 * set can never be empty, since the system's last CPU
694 	 * cannot be offlined.
695 	 */
696 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
697 		/*
698 		 * First time through.
699 		 */
700 		first_cpu = 1;
701 	} else {
702 		/*
703 		 * If cpu0 needs to move lgroups, we may come
704 		 * through here again, at which time cpu_lock won't
705 		 * be held, and lgrp_initialized will be false.
706 		 */
707 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
708 		ASSERT(cp->cpu_part != NULL);
709 		first_cpu = 0;
710 	}
711 
712 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
713 	my_lgrp = lgrp_hand_to_lgrp(hand);
714 
715 	if (my_lgrp == NULL) {
716 		/*
717 		 * Create new lgrp and add it to lgroup topology
718 		 */
719 		my_lgrp = lgrp_create();
720 		my_lgrp->lgrp_plathand = hand;
721 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
722 		lgrpid = my_lgrp->lgrp_id;
723 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
724 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
725 
726 		count = 0;
727 		klgrpset_clear(changed);
728 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
729 		    &changed);
730 		/*
731 		 * May have added new intermediate lgroups, so need to add
732 		 * resources other than CPUs which are added below
733 		 */
734 		(void) lgrp_mnode_update(changed, NULL);
735 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
736 	    > 0) {
737 		/*
738 		 * Leaf lgroup was created, but latency wasn't available
739 		 * then.  So, set latency for it and fill in rest of lgroup
740 		 * topology  now that we know how far it is from other leaf
741 		 * lgroups.
742 		 */
743 		lgrpid = my_lgrp->lgrp_id;
744 		klgrpset_clear(changed);
745 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
746 		    lgrpid))
747 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
748 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
749 		    &changed);
750 
751 		/*
752 		 * May have added new intermediate lgroups, so need to add
753 		 * resources other than CPUs which are added below
754 		 */
755 		(void) lgrp_mnode_update(changed, NULL);
756 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
757 	    my_lgrp->lgrp_id)) {
758 		int	i;
759 
760 		/*
761 		 * Update existing lgroup and lgroups containing it with CPU
762 		 * resource
763 		 */
764 		lgrpid = my_lgrp->lgrp_id;
765 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
766 		for (i = 0; i <= lgrp_alloc_max; i++) {
767 			lgrp_t		*lgrp;
768 
769 			lgrp = lgrp_table[i];
770 			if (!LGRP_EXISTS(lgrp) ||
771 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
772 				continue;
773 
774 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
775 		}
776 	}
777 
778 	lgrpid = my_lgrp->lgrp_id;
779 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
780 
781 	/*
782 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
783 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
784 	 * not since none of lgroup IDs in the lpl's have been set yet.
785 	 */
786 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
787 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
788 
789 	/*
790 	 * link the CPU into the lgrp's CPU list
791 	 */
792 	if (my_lgrp->lgrp_cpucnt == 0) {
793 		my_lgrp->lgrp_cpu = cp;
794 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
795 	} else {
796 		cptr = my_lgrp->lgrp_cpu;
797 		cp->cpu_next_lgrp = cptr;
798 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
799 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
800 		cptr->cpu_prev_lgrp = cp;
801 	}
802 	my_lgrp->lgrp_cpucnt++;
803 }
804 
805 lgrp_t *
806 lgrp_create(void)
807 {
808 	lgrp_t		*my_lgrp;
809 	lgrp_id_t	lgrpid;
810 	int		i;
811 
812 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
813 
814 	/*
815 	 * Find an open slot in the lgroup table and recycle unused lgroup
816 	 * left there if any
817 	 */
818 	my_lgrp = NULL;
819 	if (lgrp_alloc_hint == -1)
820 		/*
821 		 * Allocate from end when hint not set yet because no lgroups
822 		 * have been deleted yet
823 		 */
824 		lgrpid = nlgrps++;
825 	else {
826 		/*
827 		 * Start looking for next open slot from hint and leave hint
828 		 * at slot allocated
829 		 */
830 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
831 			my_lgrp = lgrp_table[i];
832 			if (!LGRP_EXISTS(my_lgrp)) {
833 				lgrpid = i;
834 				nlgrps++;
835 				break;
836 			}
837 		}
838 		lgrp_alloc_hint = lgrpid;
839 	}
840 
841 	/*
842 	 * Keep track of max lgroup ID allocated so far to cut down on searches
843 	 */
844 	if (lgrpid > lgrp_alloc_max)
845 		lgrp_alloc_max = lgrpid;
846 
847 	/*
848 	 * Need to allocate new lgroup if next open slot didn't have one
849 	 * for recycling
850 	 */
851 	if (my_lgrp == NULL)
852 		my_lgrp = lgrp_plat_alloc(lgrpid);
853 
854 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
855 		panic("Too many lgrps for platform (%d)", nlgrps);
856 
857 	my_lgrp->lgrp_id = lgrpid;
858 	my_lgrp->lgrp_latency = 0;
859 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
860 	my_lgrp->lgrp_parent = NULL;
861 	my_lgrp->lgrp_childcnt = 0;
862 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
863 	my_lgrp->lgrp_nmnodes = 0;
864 	klgrpset_clear(my_lgrp->lgrp_children);
865 	klgrpset_clear(my_lgrp->lgrp_leaves);
866 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
867 		klgrpset_clear(my_lgrp->lgrp_set[i]);
868 
869 	my_lgrp->lgrp_cpu = NULL;
870 	my_lgrp->lgrp_cpucnt = 0;
871 
872 	if (my_lgrp->lgrp_kstat != NULL)
873 		lgrp_kstat_reset(lgrpid);
874 
875 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
876 
877 	return (my_lgrp);
878 }
879 
880 void
881 lgrp_destroy(lgrp_t *lgrp)
882 {
883 	int		i;
884 
885 	/*
886 	 * Unless this lgroup is being destroyed on behalf of
887 	 * the boot CPU, cpu_lock must be held
888 	 */
889 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
890 
891 	if (nlgrps == 1)
892 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
893 
894 	if (!LGRP_EXISTS(lgrp))
895 		return;
896 
897 	/*
898 	 * Set hint to lgroup being deleted and try to keep lower numbered
899 	 * hints to facilitate finding empty slots
900 	 */
901 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
902 		lgrp_alloc_hint = lgrp->lgrp_id;
903 
904 	/*
905 	 * Mark this lgroup to be recycled by setting its lgroup ID to
906 	 * LGRP_NONE and clear relevant fields
907 	 */
908 	lgrp->lgrp_id = LGRP_NONE;
909 	lgrp->lgrp_latency = 0;
910 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
911 	lgrp->lgrp_parent = NULL;
912 	lgrp->lgrp_childcnt = 0;
913 
914 	klgrpset_clear(lgrp->lgrp_children);
915 	klgrpset_clear(lgrp->lgrp_leaves);
916 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
917 		klgrpset_clear(lgrp->lgrp_set[i]);
918 
919 	lgrp->lgrp_mnodes = (mnodeset_t)0;
920 	lgrp->lgrp_nmnodes = 0;
921 
922 	lgrp->lgrp_cpu = NULL;
923 	lgrp->lgrp_cpucnt = 0;
924 
925 	nlgrps--;
926 }
927 
928 /*
929  * Initialize kstat data. Called from lgrp intialization code.
930  */
931 static void
932 lgrp_kstat_init(void)
933 {
934 	lgrp_stat_t	stat;
935 
936 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
937 
938 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
939 		kstat_named_init(&lgrp_kstat_data[stat],
940 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
941 }
942 
943 /*
944  * initialize an lgrp's kstats if needed
945  * called with cpu_lock held but not with cpus paused.
946  * we don't tear these down now because we don't know about
947  * memory leaving the lgrp yet...
948  */
949 
950 void
951 lgrp_kstat_create(cpu_t *cp)
952 {
953 	kstat_t		*lgrp_kstat;
954 	lgrp_id_t	lgrpid;
955 	lgrp_t		*my_lgrp;
956 
957 	ASSERT(MUTEX_HELD(&cpu_lock));
958 
959 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
960 	my_lgrp = lgrp_table[lgrpid];
961 
962 	if (my_lgrp->lgrp_kstat != NULL)
963 		return; /* already initialized */
964 
965 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
966 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
967 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
968 
969 	if (lgrp_kstat != NULL) {
970 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
971 		lgrp_kstat->ks_private = my_lgrp;
972 		lgrp_kstat->ks_data = &lgrp_kstat_data;
973 		lgrp_kstat->ks_update = lgrp_kstat_extract;
974 		my_lgrp->lgrp_kstat = lgrp_kstat;
975 		kstat_install(lgrp_kstat);
976 	}
977 }
978 
979 /*
980  * this will do something when we manage to remove now unused lgrps
981  */
982 
983 /* ARGSUSED */
984 void
985 lgrp_kstat_destroy(cpu_t *cp)
986 {
987 	ASSERT(MUTEX_HELD(&cpu_lock));
988 }
989 
990 /*
991  * Called when a CPU is off-lined.
992  */
993 static void
994 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
995 {
996 	lgrp_t *my_lgrp;
997 	struct cpu *prev;
998 	struct cpu *next;
999 
1000 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1001 
1002 	prev = cp->cpu_prev_lgrp;
1003 	next = cp->cpu_next_lgrp;
1004 
1005 	prev->cpu_next_lgrp = next;
1006 	next->cpu_prev_lgrp = prev;
1007 
1008 	/*
1009 	 * just because I'm paranoid doesn't mean...
1010 	 */
1011 
1012 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1013 
1014 	my_lgrp = lgrp_table[lgrpid];
1015 	my_lgrp->lgrp_cpucnt--;
1016 
1017 	/*
1018 	 * Removing last CPU in lgroup, so update lgroup topology
1019 	 */
1020 	if (my_lgrp->lgrp_cpucnt == 0) {
1021 		klgrpset_t	changed;
1022 		int		count;
1023 		int		i;
1024 
1025 		my_lgrp->lgrp_cpu = NULL;
1026 
1027 		/*
1028 		 * Remove this lgroup from its lgroup CPU resources and remove
1029 		 * lgroup from lgroup topology if it doesn't have any more
1030 		 * resources in it now
1031 		 */
1032 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1033 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1034 			count = 0;
1035 			klgrpset_clear(changed);
1036 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1037 			    lgrp_alloc_max + 1, &changed);
1038 			return;
1039 		}
1040 
1041 		/*
1042 		 * This lgroup isn't empty, so just remove it from CPU
1043 		 * resources of any lgroups that contain it as such
1044 		 */
1045 		for (i = 0; i <= lgrp_alloc_max; i++) {
1046 			lgrp_t		*lgrp;
1047 
1048 			lgrp = lgrp_table[i];
1049 			if (!LGRP_EXISTS(lgrp) ||
1050 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1051 			    lgrpid))
1052 				continue;
1053 
1054 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1055 		}
1056 		return;
1057 	}
1058 
1059 	if (my_lgrp->lgrp_cpu == cp)
1060 		my_lgrp->lgrp_cpu = next;
1061 
1062 }
1063 
1064 /*
1065  * Update memory nodes in target lgroups and return ones that get changed
1066  */
1067 int
1068 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1069 {
1070 	int	count;
1071 	int	i;
1072 	int	j;
1073 	lgrp_t	*lgrp;
1074 	lgrp_t	*lgrp_rsrc;
1075 
1076 	count = 0;
1077 	if (changed)
1078 		klgrpset_clear(*changed);
1079 
1080 	if (klgrpset_isempty(target))
1081 		return (0);
1082 
1083 	/*
1084 	 * Find each lgroup in target lgroups
1085 	 */
1086 	for (i = 0; i <= lgrp_alloc_max; i++) {
1087 		/*
1088 		 * Skip any lgroups that don't exist or aren't in target group
1089 		 */
1090 		lgrp = lgrp_table[i];
1091 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1092 			continue;
1093 		}
1094 
1095 		/*
1096 		 * Initialize memnodes for intermediate lgroups to 0
1097 		 * and update them from scratch since they may have completely
1098 		 * changed
1099 		 */
1100 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1101 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1102 			lgrp->lgrp_nmnodes = 0;
1103 		}
1104 
1105 		/*
1106 		 * Update memory nodes of of target lgroup with memory nodes
1107 		 * from each lgroup in its lgroup memory resource set
1108 		 */
1109 		for (j = 0; j <= lgrp_alloc_max; j++) {
1110 			int	k;
1111 
1112 			/*
1113 			 * Skip any lgroups that don't exist or aren't in
1114 			 * memory resources of target lgroup
1115 			 */
1116 			lgrp_rsrc = lgrp_table[j];
1117 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1118 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1119 			    j))
1120 				continue;
1121 
1122 			/*
1123 			 * Update target lgroup's memnodes to include memnodes
1124 			 * of this lgroup
1125 			 */
1126 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1127 				mnodeset_t	mnode_mask;
1128 
1129 				mnode_mask = (mnodeset_t)1 << k;
1130 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1131 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1132 					lgrp->lgrp_mnodes |= mnode_mask;
1133 					lgrp->lgrp_nmnodes++;
1134 				}
1135 			}
1136 			count++;
1137 			if (changed)
1138 				klgrpset_add(*changed, lgrp->lgrp_id);
1139 		}
1140 	}
1141 
1142 	return (count);
1143 }
1144 
1145 /*
1146  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1147  * is moved from one board to another. The "from" and "to" arguments specify the
1148  * source and the destination of the move.
1149  *
1150  * See plat_lgrp_config() for a detailed description of the copy-rename
1151  * semantics.
1152  *
1153  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1154  * the lgroup topology which is changing as memory moves from one lgroup to
1155  * another. It removes the mnode from the source lgroup and re-inserts it in the
1156  * target lgroup.
1157  *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling them that the insertion and deletion are part of a
 * DR copy-rename operation.
1161  *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held, which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
1172  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1173  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1174  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1175  * but it updates the rest of the lgroup topology as if the mnode was actually
1176  * removed. The lgrp_mem_init() function recognizes that the mnode being
1177  * inserted represents such a special case and updates the topology
1178  * appropriately.
1179  */
1180 void
1181 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1182 {
1183 	/*
1184 	 * Remove the memory from the source node and add it to the destination
1185 	 * node.
1186 	 */
1187 	lgrp_mem_fini(mnode, from, B_TRUE);
1188 	lgrp_mem_init(mnode, to, B_TRUE);
1189 }
1190 
1191 /*
1192  * Called to indicate that the lgrp with platform handle "hand" now
1193  * contains the memory identified by "mnode".
1194  *
1195  * LOCKING for this routine is a bit tricky. Usually it is called without
1196  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1197  * callers. During DR of the board containing the caged memory it may be called
1198  * with cpu_lock already held and CPUs paused.
1199  *
1200  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1201  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1202  * dealing with the special case of DR copy-rename described in
1203  * lgrp_mem_rename().
1204  */
1205 void
1206 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1207 {
1208 	klgrpset_t	changed;
1209 	int		count;
1210 	int		i;
1211 	lgrp_t		*my_lgrp;
1212 	lgrp_id_t	lgrpid;
1213 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1214 	boolean_t	drop_lock = B_FALSE;
1215 	boolean_t	need_synch = B_FALSE;
1216 
1217 	/*
1218 	 * Grab CPU lock (if we haven't already)
1219 	 */
1220 	if (!MUTEX_HELD(&cpu_lock)) {
1221 		mutex_enter(&cpu_lock);
1222 		drop_lock = B_TRUE;
1223 	}
1224 
1225 	/*
1226 	 * This routine may be called from a context where we already
1227 	 * hold cpu_lock, and have already paused cpus.
1228 	 */
1229 	if (!cpus_paused())
1230 		need_synch = B_TRUE;
1231 
1232 	/*
1233 	 * Check if this mnode is already configured and return immediately if
1234 	 * it is.
1235 	 *
1236 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1237 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1238 	 * recognize this case and continue as usual, but skip the update to
1239 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1240 	 * in topology, temporarily introduced by lgrp_mem_fini().
1241 	 */
1242 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1243 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1244 		if (drop_lock)
1245 			mutex_exit(&cpu_lock);
1246 		return;
1247 	}
1248 
1249 	/*
1250 	 * Update lgroup topology with new memory resources, keeping track of
1251 	 * which lgroups change
1252 	 */
1253 	count = 0;
1254 	klgrpset_clear(changed);
1255 	my_lgrp = lgrp_hand_to_lgrp(hand);
1256 	if (my_lgrp == NULL) {
1257 		/* new lgrp */
1258 		my_lgrp = lgrp_create();
1259 		lgrpid = my_lgrp->lgrp_id;
1260 		my_lgrp->lgrp_plathand = hand;
1261 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1262 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1263 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1264 
1265 		if (need_synch)
1266 			pause_cpus(NULL);
1267 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1268 		    &changed);
1269 		if (need_synch)
1270 			start_cpus();
1271 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1272 	    > 0) {
1273 		/*
1274 		 * Leaf lgroup was created, but latency wasn't available
1275 		 * then.  So, set latency for it and fill in rest of lgroup
1276 		 * topology  now that we know how far it is from other leaf
1277 		 * lgroups.
1278 		 */
1279 		klgrpset_clear(changed);
1280 		lgrpid = my_lgrp->lgrp_id;
1281 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1282 		    lgrpid))
1283 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1284 		if (need_synch)
1285 			pause_cpus(NULL);
1286 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1287 		    &changed);
1288 		if (need_synch)
1289 			start_cpus();
1290 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1291 	    my_lgrp->lgrp_id)) {
1292 		/*
1293 		 * Add new lgroup memory resource to existing lgroup
1294 		 */
1295 		lgrpid = my_lgrp->lgrp_id;
1296 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1297 		klgrpset_add(changed, lgrpid);
1298 		count++;
1299 		for (i = 0; i <= lgrp_alloc_max; i++) {
1300 			lgrp_t		*lgrp;
1301 
1302 			lgrp = lgrp_table[i];
1303 			if (!LGRP_EXISTS(lgrp) ||
1304 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1305 				continue;
1306 
1307 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1308 			klgrpset_add(changed, lgrp->lgrp_id);
1309 			count++;
1310 		}
1311 	}
1312 
1313 	/*
1314 	 * Add memory node to lgroup and remove lgroup from ones that need
1315 	 * to be updated
1316 	 */
1317 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1318 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1319 		my_lgrp->lgrp_nmnodes++;
1320 	}
1321 	klgrpset_del(changed, lgrpid);
1322 
1323 	/*
1324 	 * Update memory node information for all lgroups that changed and
1325 	 * contain new memory node as a resource
1326 	 */
1327 	if (count)
1328 		(void) lgrp_mnode_update(changed, NULL);
1329 
1330 	if (drop_lock)
1331 		mutex_exit(&cpu_lock);
1332 }
1333 
1334 /*
1335  * Called to indicate that the lgroup associated with the platform
1336  * handle "hand" no longer contains given memory node
1337  *
1338  * LOCKING for this routine is a bit tricky. Usually it is called without
1339  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1340  * callers. During DR of the board containing the caged memory it may be called
1341  * with cpu_lock already held and CPUs paused.
1342  *
1343  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1344  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1345  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1346  * the same mnode back into the topology. See lgrp_mem_rename() and
1347  * lgrp_mem_init() for additional details.
1348  */
1349 void
1350 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1351 {
1352 	klgrpset_t	changed;
1353 	int		count;
1354 	int		i;
1355 	lgrp_t		*my_lgrp;
1356 	lgrp_id_t	lgrpid;
1357 	mnodeset_t	mnodes_mask;
1358 	boolean_t	drop_lock = B_FALSE;
1359 	boolean_t	need_synch = B_FALSE;
1360 
1361 	/*
1362 	 * Grab CPU lock (if we haven't already)
1363 	 */
1364 	if (!MUTEX_HELD(&cpu_lock)) {
1365 		mutex_enter(&cpu_lock);
1366 		drop_lock = B_TRUE;
1367 	}
1368 
1369 	/*
1370 	 * This routine may be called from a context where we already
1371 	 * hold cpu_lock and have already paused cpus.
1372 	 */
1373 	if (!cpus_paused())
1374 		need_synch = B_TRUE;
1375 
1376 	my_lgrp = lgrp_hand_to_lgrp(hand);
1377 
1378 	/*
1379 	 * The lgrp *must* be pre-existing
1380 	 */
1381 	ASSERT(my_lgrp != NULL);
1382 
1383 	/*
1384 	 * Delete memory node from lgroups which contain it
1385 	 */
1386 	mnodes_mask = ((mnodeset_t)1 << mnode);
1387 	for (i = 0; i <= lgrp_alloc_max; i++) {
1388 		lgrp_t *lgrp = lgrp_table[i];
1389 		/*
1390 		 * Skip any non-existent lgroups and any lgroups that don't
1391 		 * contain leaf lgroup of memory as a memory resource
1392 		 */
1393 		if (!LGRP_EXISTS(lgrp) ||
1394 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1395 			continue;
1396 
1397 		/*
1398 		 * Avoid removing the last mnode from the root in the DR
1399 		 * copy-rename case. See lgrp_mem_rename() for details.
1400 		 */
1401 		if (is_copy_rename &&
1402 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1403 			continue;
1404 
1405 		/*
1406 		 * Remove memory node from lgroup.
1407 		 */
1408 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1409 		lgrp->lgrp_nmnodes--;
1410 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1411 	}
1412 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1413 
1414 	/*
1415 	 * Don't need to update lgroup topology if this lgroup still has memory.
1416 	 *
1417 	 * In the special case of DR copy-rename with the only mnode being
1418 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1419 	 * still need to update the lgroup topology.
1420 	 */
1421 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1422 	    !(is_copy_rename &&
1423 		(my_lgrp == lgrp_root) &&
1424 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1425 		if (drop_lock)
1426 			mutex_exit(&cpu_lock);
1427 		return;
1428 	}
1429 
1430 	/*
1431 	 * This lgroup does not contain any memory now
1432 	 */
1433 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1434 
1435 	/*
1436 	 * Remove this lgroup from lgroup topology if it does not contain any
1437 	 * resources now
1438 	 */
1439 	lgrpid = my_lgrp->lgrp_id;
1440 	count = 0;
1441 	klgrpset_clear(changed);
1442 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1443 		/*
1444 		 * Delete lgroup when no more resources
1445 		 */
1446 		if (need_synch)
1447 			pause_cpus(NULL);
1448 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1449 		    lgrp_alloc_max + 1, &changed);
1450 		ASSERT(count > 0);
1451 		if (need_synch)
1452 			start_cpus();
1453 	} else {
1454 		/*
1455 		 * Remove lgroup from memory resources of any lgroups that
1456 		 * contain it as such
1457 		 */
1458 		for (i = 0; i <= lgrp_alloc_max; i++) {
1459 			lgrp_t		*lgrp;
1460 
1461 			lgrp = lgrp_table[i];
1462 			if (!LGRP_EXISTS(lgrp) ||
1463 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1464 			    lgrpid))
1465 				continue;
1466 
1467 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1468 		}
1469 	}
1470 	if (drop_lock)
1471 		mutex_exit(&cpu_lock);
1472 }
1473 
1474 /*
1475  * Return lgroup with given platform handle
1476  */
1477 lgrp_t *
1478 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1479 {
1480 	int	i;
1481 	lgrp_t	*lgrp;
1482 
1483 	if (hand == LGRP_NULL_HANDLE)
1484 		return (NULL);
1485 
1486 	for (i = 0; i <= lgrp_alloc_max; i++) {
1487 		lgrp = lgrp_table[i];
1488 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1489 			return (lgrp);
1490 	}
1491 	return (NULL);
1492 }
1493 
1494 /*
1495  * Return the home lgroup of the current thread.
1496  * We must do this with kernel preemption disabled, since we don't want our
1497  * thread to be re-homed while we're poking around with its lpl, and the lpl
1498  * should never be NULL.
1499  *
1500  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1501  * is enabled because of DR.  Callers can use disable kernel preemption
1502  * around this call to guarantee that the lgroup will be valid beyond this
1503  * routine, since kernel preemption can be recursive.
1504  */
1505 lgrp_t *
1506 lgrp_home_lgrp(void)
1507 {
1508 	lgrp_t	*lgrp;
1509 	lpl_t	*lpl;
1510 
1511 	kpreempt_disable();
1512 
1513 	lpl = curthread->t_lpl;
1514 	ASSERT(lpl != NULL);
1515 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1516 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1517 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1518 
1519 	kpreempt_enable();
1520 
1521 	return (lgrp);
1522 }
1523 
1524 /*
1525  * Return ID of home lgroup for given thread
1526  * (See comments for lgrp_home_lgrp() for special care and handling
1527  * instructions)
1528  */
1529 lgrp_id_t
1530 lgrp_home_id(kthread_t *t)
1531 {
1532 	lgrp_id_t	lgrp;
1533 	lpl_t		*lpl;
1534 
1535 	ASSERT(t != NULL);
1536 	/*
1537 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1538 	 * cannot since the HAT layer can call into this routine to
1539 	 * determine the locality for its data structures in the context
1540 	 * of a page fault.
1541 	 */
1542 
1543 	kpreempt_disable();
1544 
1545 	lpl = t->t_lpl;
1546 	ASSERT(lpl != NULL);
1547 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1548 	lgrp = lpl->lpl_lgrpid;
1549 
1550 	kpreempt_enable();
1551 
1552 	return (lgrp);
1553 }
1554 
1555 /*
1556  * Return lgroup containing the physical memory for the given page frame number
1557  */
1558 lgrp_t *
1559 lgrp_pfn_to_lgrp(pfn_t pfn)
1560 {
1561 	lgrp_handle_t	hand;
1562 	int		i;
1563 	lgrp_t		*lgrp;
1564 
1565 	hand = lgrp_plat_pfn_to_hand(pfn);
1566 	if (hand != LGRP_NULL_HANDLE)
1567 		for (i = 0; i <= lgrp_alloc_max; i++) {
1568 			lgrp = lgrp_table[i];
1569 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1570 				return (lgrp);
1571 		}
1572 	return (NULL);
1573 }
1574 
1575 /*
1576  * Return lgroup containing the physical memory for the given page frame number
1577  */
1578 lgrp_t *
1579 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1580 {
1581 	lgrp_handle_t	hand;
1582 	int		i;
1583 	lgrp_t		*lgrp;
1584 	pfn_t		pfn;
1585 
1586 	pfn = btop(physaddr);
1587 	hand = lgrp_plat_pfn_to_hand(pfn);
1588 	if (hand != LGRP_NULL_HANDLE)
1589 		for (i = 0; i <= lgrp_alloc_max; i++) {
1590 			lgrp = lgrp_table[i];
1591 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1592 				return (lgrp);
1593 		}
1594 	return (NULL);
1595 }
1596 
1597 /*
1598  * Return the leaf lgroup containing the given CPU
1599  *
1600  * The caller needs to take precautions necessary to prevent
1601  * "cpu", and it's lpl from going away across a call to this function.
1602  * hint: kpreempt_disable()/kpreempt_enable()
1603  */
1604 static lgrp_t *
1605 lgrp_cpu_to_lgrp(cpu_t *cpu)
1606 {
1607 	return (cpu->cpu_lpl->lpl_lgrp);
1608 }
1609 
1610 /*
1611  * Return the sum of the partition loads in an lgrp divided by
1612  * the number of CPUs in the lgrp.  This is our best approximation
1613  * of an 'lgroup load average' for a useful per-lgroup kstat.
1614  */
1615 static uint64_t
1616 lgrp_sum_loadavgs(lgrp_t *lgrp)
1617 {
1618 	cpu_t *cpu;
1619 	int ncpu;
1620 	uint64_t loads = 0;
1621 
1622 	mutex_enter(&cpu_lock);
1623 
1624 	cpu = lgrp->lgrp_cpu;
1625 	ncpu = lgrp->lgrp_cpucnt;
1626 
1627 	if (cpu == NULL || ncpu == 0) {
1628 		mutex_exit(&cpu_lock);
1629 		return (0ull);
1630 	}
1631 
1632 	do {
1633 		loads += cpu->cpu_lpl->lpl_loadavg;
1634 		cpu = cpu->cpu_next_lgrp;
1635 	} while (cpu != lgrp->lgrp_cpu);
1636 
1637 	mutex_exit(&cpu_lock);
1638 
1639 	return (loads / ncpu);
1640 }
1641 
1642 void
1643 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1644 {
1645 	struct lgrp_stats *pstats;
1646 
1647 	/*
1648 	 * Verify that the caller isn't trying to add to
1649 	 * a statistic for an lgroup that has gone away
1650 	 */
1651 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1652 		return;
1653 
1654 	pstats = &lgrp_stats[lgrpid];
1655 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1656 }
1657 
1658 int64_t
1659 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1660 {
1661 	uint64_t val;
1662 	struct lgrp_stats *pstats;
1663 
1664 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1665 		return ((int64_t)0);
1666 
1667 	pstats = &lgrp_stats[lgrpid];
1668 	LGRP_STAT_READ(pstats, stat, val);
1669 	return (val);
1670 }
1671 
1672 /*
1673  * Reset all kstats for lgrp specified by its lgrpid.
1674  */
1675 static void
1676 lgrp_kstat_reset(lgrp_id_t lgrpid)
1677 {
1678 	lgrp_stat_t stat;
1679 
1680 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1681 		return;
1682 
1683 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1684 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1685 	}
1686 }
1687 
1688 /*
1689  * Collect all per-lgrp statistics for the lgrp associated with this
1690  * kstat, and store them in the ks_data array.
1691  *
1692  * The superuser can reset all the running counter statistics for an
1693  * lgrp by writing to any of the lgrp's stats.
1694  */
1695 static int
1696 lgrp_kstat_extract(kstat_t *ksp, int rw)
1697 {
1698 	lgrp_stat_t		stat;
1699 	struct kstat_named	*ksd;
1700 	lgrp_t			*lgrp;
1701 	lgrp_id_t		lgrpid;
1702 
1703 	lgrp = (lgrp_t *)ksp->ks_private;
1704 
1705 	ksd = (struct kstat_named *)ksp->ks_data;
1706 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1707 
1708 	lgrpid = lgrp->lgrp_id;
1709 
1710 	if (lgrpid == LGRP_NONE) {
1711 		/*
1712 		 * Return all zeroes as stats for freed lgrp.
1713 		 */
1714 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1715 			ksd[stat].value.i64 = 0;
1716 		}
1717 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1718 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1719 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1720 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1721 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1722 	} else if (rw != KSTAT_WRITE) {
1723 		/*
1724 		 * Handle counter stats
1725 		 */
1726 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1727 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1728 		}
1729 
1730 		/*
1731 		 * Handle kernel data snapshot stats
1732 		 */
1733 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1734 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1735 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1736 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1737 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1738 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1739 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1740 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1741 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1742 		    lgrp_loadavg_max_effect;
1743 	} else {
1744 		lgrp_kstat_reset(lgrpid);
1745 	}
1746 
1747 	return (0);
1748 }
1749 
1750 int
1751 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1752 {
1753 	cpu_t	*cp;
1754 
1755 	mutex_enter(&cpu_lock);
1756 
1757 	if ((cp = cpu_get(id)) == NULL) {
1758 		mutex_exit(&cpu_lock);
1759 		return (EINVAL);
1760 	}
1761 
1762 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1763 		mutex_exit(&cpu_lock);
1764 		return (EINVAL);
1765 	}
1766 
1767 	ASSERT(cp->cpu_lpl != NULL);
1768 
1769 	*lp = cp->cpu_lpl->lpl_lgrpid;
1770 
1771 	mutex_exit(&cpu_lock);
1772 
1773 	return (0);
1774 }
1775 
1776 int
1777 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1778 {
1779 	cpu_t *cp;
1780 
1781 	mutex_enter(&cpu_lock);
1782 
1783 	if ((cp = cpu_get(id)) == NULL) {
1784 		mutex_exit(&cpu_lock);
1785 		return (EINVAL);
1786 	}
1787 
1788 	ASSERT(cp->cpu_lpl != NULL);
1789 
1790 	*lp = cp->cpu_lpl->lpl_loadavg;
1791 
1792 	mutex_exit(&cpu_lock);
1793 
1794 	return (0);
1795 }
1796 
1797 /*
1798  * Add a resource named by lpl_leaf to rset of lpl_target
1799  *
1800  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1801  * resource. It is adjusted here, as this is presently the only place that we
1802  * can be certain a resource addition has succeeded.
1803  *
1804  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1805  * list in order until it reaches a NULL.  (This list is required to be NULL
1806  * terminated, too).  This is done so that we can mark start pos + 1, so that
1807  * each lpl is traversed sequentially, but in a different order.  We hope this
1808  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1809  */
1810 
1811 void
1812 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1813 {
1814 	int		i;
1815 	int		entry_slot = 0;
1816 
1817 	/* return if leaf is already present */
1818 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1819 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1820 			return;
1821 		}
1822 
1823 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1824 		    lpl_leaf->lpl_lgrpid) {
1825 			break;
1826 		}
1827 	}
1828 
1829 	/* insert leaf, update counts */
1830 	entry_slot = i;
1831 	i = lpl_target->lpl_nrset++;
1832 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1833 		panic("More leaf lgrps in system than are supported!\n");
1834 	}
1835 
1836 	/*
1837 	 * Start at the end of the rset array and work backwards towards the
1838 	 * slot into which the new lpl will be inserted. This effectively
1839 	 * preserves the current ordering by scooting everybody over one entry,
1840 	 * and placing the new entry into the space created.
1841 	 */
1842 
1843 	while (i-- > entry_slot) {
1844 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1845 	}
1846 
1847 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1848 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1849 }
1850 
1851 /*
1852  * Update each of lpl_parent's children with a proper hint and
1853  * a reference to their parent.
1854  * The lgrp topology is used as the reference since it is fully
1855  * consistent and correct at this point.
1856  *
1857  * Each child's hint will reference an element in lpl_parent's
1858  * rset that designates where the child should start searching
1859  * for CPU resources. The hint selected is the highest order leaf present
1860  * in the child's lineage.
1861  *
1862  * This should be called after any potential change in lpl_parent's
1863  * rset.
1864  */
1865 static void
1866 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1867 {
1868 	klgrpset_t	children, leaves;
1869 	lpl_t		*lpl;
1870 	int		hint;
1871 	int		i, j;
1872 
1873 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1874 	if (klgrpset_isempty(children))
1875 		return; /* nothing to do */
1876 
1877 	for (i = 0; i <= lgrp_alloc_max; i++) {
1878 		if (klgrpset_ismember(children, i)) {
1879 
1880 			/*
1881 			 * Given the set of leaves in this child's lineage,
1882 			 * find the highest order leaf present in the parent's
1883 			 * rset. Select this as the hint for the child.
1884 			 */
1885 			leaves = lgrp_table[i]->lgrp_leaves;
1886 			hint = 0;
1887 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1888 				lpl = lpl_parent->lpl_rset[j];
1889 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1890 					hint = j;
1891 			}
1892 			cp->cp_lgrploads[i].lpl_hint = hint;
1893 
1894 			/*
1895 			 * (Re)set the parent. It may be incorrect if
1896 			 * lpl_parent is new in the topology.
1897 			 */
1898 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1899 		}
1900 	}
1901 }
1902 
1903 /*
1904  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1905  *
1906  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1907  * resource. The values are adjusted here, as this is the only place that we can
1908  * be certain a resource was successfully deleted.
1909  */
1910 void
1911 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1912 {
1913 	int i;
1914 
1915 	/* find leaf in intermediate node */
1916 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1917 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1918 			break;
1919 	}
1920 
1921 	/* return if leaf not found */
1922 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1923 		return;
1924 
1925 	/* prune leaf, compress array */
1926 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1927 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1928 	lpl_target->lpl_ncpu--;
1929 	do {
1930 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1931 	} while (i++ < lpl_target->lpl_nrset);
1932 }
1933 
1934 /*
1935  * Check to see if the resource set of the target lpl contains the
1936  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1937  */
1938 
1939 int
1940 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1941 {
1942 	int i;
1943 
1944 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1945 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1946 			return (1);
1947 	}
1948 
1949 	return (0);
1950 }
1951 
1952 /*
1953  * Called when we change cpu lpl membership.  This increments or decrements the
1954  * per-cpu counter in every lpl in which our leaf appears.
1955  */
1956 void
1957 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1958 {
1959 	cpupart_t	*cpupart;
1960 	lgrp_t		*lgrp_leaf;
1961 	lgrp_t		*lgrp_cur;
1962 	lpl_t		*lpl_leaf;
1963 	lpl_t		*lpl_cur;
1964 	int		i;
1965 
1966 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1967 
1968 	cpupart = cp->cpu_part;
1969 	lpl_leaf = cp->cpu_lpl;
1970 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1971 
1972 	for (i = 0; i <= lgrp_alloc_max; i++) {
1973 		lgrp_cur = lgrp_table[i];
1974 
1975 		/*
1976 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1977 		 * for the cpu in question, or if the current lgrp and leaf
1978 		 * don't share the same resources.
1979 		 */
1980 
1981 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1982 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1983 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1984 			continue;
1985 
1986 
1987 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1988 
1989 		if (lpl_cur->lpl_nrset > 0) {
1990 			if (act == LPL_INCREMENT) {
1991 				lpl_cur->lpl_ncpu++;
1992 			} else if (act == LPL_DECREMENT) {
1993 				lpl_cur->lpl_ncpu--;
1994 			}
1995 		}
1996 	}
1997 }
1998 
1999 /*
2000  * Initialize lpl with given resources and specified lgrp
2001  */
2002 
2003 void
2004 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2005 {
2006 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2007 	lpl->lpl_loadavg = 0;
2008 	if (lpl == lpl_leaf)
2009 		lpl->lpl_ncpu = 1;
2010 	else
2011 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2012 	lpl->lpl_nrset = 1;
2013 	lpl->lpl_rset[0] = lpl_leaf;
2014 	lpl->lpl_lgrp = lgrp;
2015 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2016 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2017 }
2018 
2019 /*
2020  * Clear an unused lpl
2021  */
2022 
2023 void
2024 lpl_clear(lpl_t *lpl)
2025 {
2026 	lgrp_id_t	lid;
2027 
2028 	/* save lid for debugging purposes */
2029 	lid = lpl->lpl_lgrpid;
2030 	bzero(lpl, sizeof (lpl_t));
2031 	lpl->lpl_lgrpid = lid;
2032 }
2033 
2034 /*
2035  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2036  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2037  * make full use of all of the lgroup topology, but this checks to make sure
2038  * that for the parts that it does use, it has correctly understood the
2039  * relationships that exist. This function returns
2040  * 0 if the topology is correct, and a non-zero error code, for non-debug
2041  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2042  * debugging on a DEBUG kernel.
2043  */
2044 int
2045 lpl_topo_verify(cpupart_t *cpupart)
2046 {
2047 	lgrp_t		*lgrp;
2048 	lpl_t		*lpl;
2049 	klgrpset_t	rset;
2050 	klgrpset_t	cset;
2051 	cpu_t		*cpu;
2052 	cpu_t		*cp_start;
2053 	int		i;
2054 	int		j;
2055 	int		sum;
2056 
2057 	/* topology can't be incorrect if it doesn't exist */
2058 	if (!lgrp_topo_initialized || !lgrp_initialized)
2059 		return (LPL_TOPO_CORRECT);
2060 
2061 	ASSERT(cpupart != NULL);
2062 
2063 	for (i = 0; i <= lgrp_alloc_max; i++) {
2064 		lgrp = lgrp_table[i];
2065 		lpl = NULL;
2066 		/* make sure lpls are allocated */
2067 		ASSERT(cpupart->cp_lgrploads);
2068 		if (!cpupart->cp_lgrploads)
2069 			return (LPL_TOPO_PART_HAS_NO_LPL);
2070 
2071 		lpl = &cpupart->cp_lgrploads[i];
2072 		/* make sure our index is good */
2073 		ASSERT(i < cpupart->cp_nlgrploads);
2074 
2075 		/* if lgroup doesn't exist, make sure lpl is empty */
2076 		if (!LGRP_EXISTS(lgrp)) {
2077 			ASSERT(lpl->lpl_ncpu == 0);
2078 			if (lpl->lpl_ncpu > 0) {
2079 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2080 			} else {
2081 				continue;
2082 			}
2083 		}
2084 
2085 		/* verify that lgroup and lpl are identically numbered */
2086 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2087 
2088 		/* if lgroup isn't in our partition, make sure lpl is empty */
2089 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2090 		    cpupart->cp_lgrpset)) {
2091 			ASSERT(lpl->lpl_ncpu == 0);
2092 			if (lpl->lpl_ncpu > 0) {
2093 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2094 			}
2095 			/*
2096 			 * lpl is empty, and lgroup isn't in partition.  verify
2097 			 * that lpl doesn't show up in anyone else's rsets (in
2098 			 * this partition, anyway)
2099 			 */
2100 
2101 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2102 				lpl_t *i_lpl; /* lpl we're iterating over */
2103 
2104 				i_lpl = &cpupart->cp_lgrploads[j];
2105 
2106 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2107 				if (lpl_rset_contains(i_lpl, lpl)) {
2108 					return (LPL_TOPO_LPL_ORPHANED);
2109 				}
2110 			}
2111 			/* lgroup is empty, and everything is ok. continue */
2112 			continue;
2113 		}
2114 
2115 
2116 		/* lgroup is in this partition, now check it against lpl */
2117 
2118 		/* do both have matching lgrps? */
2119 		ASSERT(lgrp == lpl->lpl_lgrp);
2120 		if (lgrp != lpl->lpl_lgrp) {
2121 			return (LPL_TOPO_LGRP_MISMATCH);
2122 		}
2123 
2124 		/* do the parent lgroups exist and do they match? */
2125 		if (lgrp->lgrp_parent) {
2126 			ASSERT(lpl->lpl_parent);
2127 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2128 				    lpl->lpl_parent->lpl_lgrpid);
2129 
2130 			if (!lpl->lpl_parent) {
2131 				return (LPL_TOPO_MISSING_PARENT);
2132 			} else if (lgrp->lgrp_parent->lgrp_id !=
2133 			    lpl->lpl_parent->lpl_lgrpid) {
2134 				return (LPL_TOPO_PARENT_MISMATCH);
2135 			}
2136 		}
2137 
2138 		/* only leaf lgroups keep a cpucnt, only check leaves */
2139 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2140 
2141 			/* verify that lgrp is also a leaf */
2142 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2143 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2144 			    lpl->lpl_lgrpid)));
2145 
2146 			if ((lgrp->lgrp_childcnt > 0) ||
2147 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2148 			    lpl->lpl_lgrpid))) {
2149 				return (LPL_TOPO_LGRP_NOT_LEAF);
2150 			}
2151 
2152 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2153 			    (lpl->lpl_ncpu > 0));
2154 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2155 				(lpl->lpl_ncpu <= 0)) {
2156 				return (LPL_TOPO_BAD_CPUCNT);
2157 			}
2158 
2159 			/*
2160 			 * Check that lpl_ncpu also matches the number of
2161 			 * cpus in the lpl's linked list.  This only exists in
2162 			 * leaves, but they should always match.
2163 			 */
2164 			j = 0;
2165 			cpu = cp_start = lpl->lpl_cpus;
2166 			while (cpu != NULL) {
2167 				j++;
2168 
2169 				/* check to make sure cpu's lpl is leaf lpl */
2170 				ASSERT(cpu->cpu_lpl == lpl);
2171 				if (cpu->cpu_lpl != lpl) {
2172 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2173 				}
2174 
2175 				/* check next cpu */
2176 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2177 					continue;
2178 				} else {
2179 					cpu = NULL;
2180 				}
2181 			}
2182 
2183 			ASSERT(j == lpl->lpl_ncpu);
2184 			if (j != lpl->lpl_ncpu) {
2185 				return (LPL_TOPO_LPL_BAD_NCPU);
2186 			}
2187 
2188 			/*
2189 			 * Also, check that leaf lpl is contained in all
2190 			 * intermediate lpls that name the leaf as a descendant
2191 			 */
2192 
2193 			for (j = 0; j <= lgrp_alloc_max; j++) {
2194 				klgrpset_t intersect;
2195 				lgrp_t *lgrp_cand;
2196 				lpl_t *lpl_cand;
2197 
2198 				lgrp_cand = lgrp_table[j];
2199 				intersect = klgrpset_intersects(
2200 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2201 				    cpupart->cp_lgrpset);
2202 
2203 				if (!LGRP_EXISTS(lgrp_cand) ||
2204 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2205 				    cpupart->cp_lgrpset) ||
2206 				    (intersect == 0))
2207 					continue;
2208 
2209 				lpl_cand =
2210 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2211 
2212 				if (klgrpset_ismember(intersect,
2213 				    lgrp->lgrp_id)) {
2214 					ASSERT(lpl_rset_contains(lpl_cand,
2215 					    lpl));
2216 
2217 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2218 						return (LPL_TOPO_RSET_MSSNG_LF);
2219 					}
2220 				}
2221 			}
2222 
2223 		} else { /* non-leaf specific checks */
2224 
2225 			/*
2226 			 * Non-leaf lpls should have lpl_cpus == NULL
2227 			 * verify that this is so
2228 			 */
2229 			ASSERT(lpl->lpl_cpus == NULL);
2230 			if (lpl->lpl_cpus != NULL) {
2231 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2232 			}
2233 
2234 			/*
2235 			 * verify that the sum of the cpus in the leaf resources
2236 			 * is equal to the total ncpu in the intermediate
2237 			 */
2238 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2239 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2240 			}
2241 
2242 			ASSERT(sum == lpl->lpl_ncpu);
2243 			if (sum != lpl->lpl_ncpu) {
2244 				return (LPL_TOPO_LPL_BAD_NCPU);
2245 			}
2246 		}
2247 
2248 		/*
2249 		 * check on lpl_hint. Don't check root, since it has no parent.
2250 		 */
2251 		if (lpl->lpl_parent != NULL) {
2252 			int hint;
2253 			lpl_t *hint_lpl;
2254 
2255 			/* make sure hint is within limits of nrset */
2256 			hint = lpl->lpl_hint;
2257 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2258 			if (lpl->lpl_parent->lpl_nrset < hint) {
2259 				return (LPL_TOPO_BOGUS_HINT);
2260 			}
2261 
2262 			/* make sure hint points to valid lpl */
2263 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2264 			ASSERT(hint_lpl->lpl_ncpu > 0);
2265 			if (hint_lpl->lpl_ncpu <= 0) {
2266 				return (LPL_TOPO_BOGUS_HINT);
2267 			}
2268 		}
2269 
2270 		/*
2271 		 * Check the rset of the lpl in question.  Make sure that each
2272 		 * rset contains a subset of the resources in
2273 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2274 		 * sure that each rset doesn't include resources that are
2275 		 * outside of that set.  (Which would be resources somehow not
2276 		 * accounted for).
2277 		 */
2278 
2279 		klgrpset_clear(rset);
2280 		for (j = 0; j < lpl->lpl_nrset; j++) {
2281 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2282 		}
2283 		klgrpset_copy(cset, rset);
2284 		/* make sure lpl rset matches lgrp rset */
2285 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2286 		/* make sure rset is contained with in partition, too */
2287 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2288 
2289 		ASSERT(klgrpset_isempty(rset) &&
2290 			    klgrpset_isempty(cset));
2291 		if (!klgrpset_isempty(rset) ||
2292 		    !klgrpset_isempty(cset)) {
2293 			return (LPL_TOPO_RSET_MISMATCH);
2294 		}
2295 
2296 		/*
2297 		 * check to make sure lpl_nrset matches the number of rsets
2298 		 * contained in the lpl
2299 		 */
2300 
2301 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2302 		    j++);
2303 
2304 		ASSERT(j == lpl->lpl_nrset);
2305 		if (j != lpl->lpl_nrset) {
2306 			return (LPL_TOPO_BAD_RSETCNT);
2307 		}
2308 
2309 	}
2310 	return (LPL_TOPO_CORRECT);
2311 }
2312 
2313 /*
2314  * Flatten lpl topology to given number of levels.  This is presently only
2315  * implemented for a flatten to 2 levels, which will prune out the intermediates
2316  * and home the leaf lpls to the root lpl.
2317  */
2318 int
2319 lpl_topo_flatten(int levels)
2320 {
2321 	int		i;
2322 	uint_t		sum;
2323 	lgrp_t		*lgrp_cur;
2324 	lpl_t		*lpl_cur;
2325 	lpl_t		*lpl_root;
2326 	cpupart_t	*cp;
2327 
2328 	if (levels != 2)
2329 		return (0);
2330 
2331 	/* called w/ cpus paused - grab no locks! */
2332 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2333 	    !lgrp_initialized);
2334 
2335 	cp = cp_list_head;
2336 	do {
2337 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2338 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2339 
2340 		for (i = 0; i <= lgrp_alloc_max; i++) {
2341 			lgrp_cur = lgrp_table[i];
2342 			lpl_cur = &cp->cp_lgrploads[i];
2343 
2344 			if ((lgrp_cur == lgrp_root) ||
2345 			    (!LGRP_EXISTS(lgrp_cur) &&
2346 			    (lpl_cur->lpl_ncpu == 0)))
2347 				continue;
2348 
2349 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2350 				/*
2351 				 * this should be a deleted intermediate, so
2352 				 * clear it
2353 				 */
2354 				lpl_clear(lpl_cur);
2355 			} else if ((lpl_cur->lpl_nrset == 1) &&
2356 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2357 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2358 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2359 				/*
2360 				 * this is a leaf whose parent was deleted, or
2361 				 * whose parent had their lgrp deleted.  (And
2362 				 * whose parent will soon be deleted).  Point
2363 				 * this guy back to the root lpl.
2364 				 */
2365 				lpl_cur->lpl_parent = lpl_root;
2366 				lpl_rset_add(lpl_root, lpl_cur);
2367 			}
2368 
2369 		}
2370 
2371 		/*
2372 		 * Now that we're done, make sure the count on the root lpl is
2373 		 * correct, and update the hints of the children for the sake of
2374 		 * thoroughness
2375 		 */
2376 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2377 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2378 		}
2379 		lpl_root->lpl_ncpu = sum;
2380 		lpl_child_update(lpl_root, cp);
2381 
2382 		cp = cp->cp_next;
2383 	} while (cp != cp_list_head);
2384 
2385 	return (levels);
2386 }
2387 
2388 /*
2389  * Insert a lpl into the resource hierarchy and create any additional lpls that
2390  * are necessary to represent the varying states of locality for the cpu
2391  * resoruces newly added to the partition.
2392  *
2393  * This routine is clever enough that it can correctly add resources from the
2394  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2395  * those for which the lpl is a leaf as opposed to simply a named equally local
2396  * resource).  The one special case that needs additional processing is when a
2397  * new intermediate lpl is introduced.  Since the main loop only traverses
2398  * looking to add the leaf resource where it does not yet exist, additional work
2399  * is necessary to add other leaf resources that may need to exist in the newly
2400  * created intermediate.  This is performed by the second inner loop, and is
2401  * only done when the check for more than one overlapping resource succeeds.
2402  */
2403 
2404 void
2405 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2406 {
2407 	int		i;
2408 	int		j;
2409 	int		hint;
2410 	int		rset_num_intersect;
2411 	lgrp_t		*lgrp_cur;
2412 	lpl_t		*lpl_cur;
2413 	lpl_t		*lpl_parent;
2414 	lgrp_id_t	parent_id;
2415 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2416 
2417 	for (i = 0; i <= lgrp_alloc_max; i++) {
2418 		lgrp_cur = lgrp_table[i];
2419 
2420 		/*
2421 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2422 		 * contained within the current lgrp, or if the current lgrp has
2423 		 * no leaves in this partition
2424 		 */
2425 
2426 		if (!LGRP_EXISTS(lgrp_cur) ||
2427 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2428 		    lpl_leaf->lpl_lgrpid) ||
2429 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2430 		    cpupart->cp_lgrpset))
2431 			continue;
2432 
2433 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2434 		if (lgrp_cur->lgrp_parent != NULL) {
2435 			/* if lgrp has a parent, assign it properly */
2436 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2437 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2438 		} else {
2439 			/* if not, make sure parent ptr gets set to null */
2440 			lpl_parent = NULL;
2441 		}
2442 
2443 		if (lpl_cur == lpl_leaf) {
2444 			/*
2445 			 * Almost all leaf state was initialized elsewhere.  The
2446 			 * only thing left to do is to set the parent.
2447 			 */
2448 			lpl_cur->lpl_parent = lpl_parent;
2449 			continue;
2450 		}
2451 
2452 		/*
2453 		 * Initialize intermediate lpl
2454 		 * Save this lpl's hint though. Since we're changing this
2455 		 * lpl's resources, we need to update the hint in this lpl's
2456 		 * children, but the hint in this lpl is unaffected and
2457 		 * should be preserved.
2458 		 */
2459 		hint = lpl_cur->lpl_hint;
2460 
2461 		lpl_clear(lpl_cur);
2462 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2463 
2464 		lpl_cur->lpl_hint = hint;
2465 		lpl_cur->lpl_parent = lpl_parent;
2466 
2467 		/* does new lpl need to be populated with other resources? */
2468 		rset_intersect =
2469 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2470 			cpupart->cp_lgrpset);
2471 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2472 
2473 		if (rset_num_intersect > 1) {
2474 			/*
2475 			 * If so, figure out what lpls have resources that
2476 			 * intersect this one, and add them.
2477 			 */
2478 			for (j = 0; j <= lgrp_alloc_max; j++) {
2479 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2480 				lpl_t	*lpl_cand;	/* candidate lpl */
2481 
2482 				lgrp_cand = lgrp_table[j];
2483 				if (!LGRP_EXISTS(lgrp_cand) ||
2484 				    !klgrpset_ismember(rset_intersect,
2485 					lgrp_cand->lgrp_id))
2486 					continue;
2487 				lpl_cand =
2488 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2489 				lpl_rset_add(lpl_cur, lpl_cand);
2490 			}
2491 		}
2492 		/*
2493 		 * This lpl's rset has changed. Update the hint in it's
2494 		 * children.
2495 		 */
2496 		lpl_child_update(lpl_cur, cpupart);
2497 	}
2498 }
2499 
2500 /*
2501  * remove a lpl from the hierarchy of resources, clearing its state when
2502  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2503  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2504  * delete them as well.
2505  */
2506 
2507 void
2508 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2509 {
2510 	int		i;
2511 	lgrp_t		*lgrp_cur;
2512 	lpl_t		*lpl_cur;
2513 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2514 
2515 	for (i = 0; i <= lgrp_alloc_max; i++) {
2516 		lgrp_cur = lgrp_table[i];
2517 
2518 		/*
2519 		 * Don't attempt to remove from lgrps that aren't there, that
2520 		 * don't contain our leaf, or from the leaf itself. (We do that
2521 		 * later)
2522 		 */
2523 
2524 		if (!LGRP_EXISTS(lgrp_cur))
2525 			continue;
2526 
2527 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2528 
2529 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2530 		    lpl_leaf->lpl_lgrpid) ||
2531 		    (lpl_cur == lpl_leaf)) {
2532 			continue;
2533 		}
2534 
2535 		/*
2536 		 * This is a slightly sleazy simplification in that we have
2537 		 * already marked the cp_lgrpset as no longer containing the
2538 		 * leaf we've deleted.  Any lpls that pass the above checks
2539 		 * based upon lgrp membership but not necessarily cpu-part
2540 		 * membership also get cleared by the checks below.  Currently
2541 		 * this is harmless, as the lpls should be empty anyway.
2542 		 *
2543 		 * In particular, we want to preserve lpls that have additional
2544 		 * leaf resources, even though we don't yet have a processor
2545 		 * architecture that represents resources this way.
2546 		 */
2547 
2548 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2549 		    cpupart->cp_lgrpset);
2550 
2551 		lpl_rset_del(lpl_cur, lpl_leaf);
2552 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2553 			lpl_clear(lpl_cur);
2554 		} else {
2555 			/*
2556 			 * Update this lpl's children
2557 			 */
2558 			lpl_child_update(lpl_cur, cpupart);
2559 		}
2560 	}
2561 	lpl_clear(lpl_leaf);
2562 }
2563 
2564 /*
2565  * add a cpu to a partition in terms of lgrp load avg bookeeping
2566  *
2567  * The lpl (cpu partition load average information) is now arranged in a
2568  * hierarchical fashion whereby resources that are closest, ie. most local, to
2569  * the cpu in question are considered to be leaves in a tree of resources.
2570  * There are two general cases for cpu additon:
2571  *
2572  * 1. A lpl structure that contains resources already in the hierarchy tree.
2573  * In this case, all of the associated lpl relationships have been defined, and
2574  * all that is necessary is that we link the new cpu into the per-lpl list of
2575  * cpus, and increment the ncpu count of all places where this cpu resource will
2576  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2577  * pushing is accomplished by this routine.
2578  *
2579  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2580  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2581  * construct the hierarchy of state necessary to name it's more distant
2582  * resources, if they should exist.  The leaf structure is initialized by this
2583  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2584  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2585  * and builds all of the "ancestoral" state necessary to identify resources at
2586  * differing levels of locality.
2587  */
2588 void
2589 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2590 {
2591 	cpupart_t	*cpupart;
2592 	lgrp_t		*lgrp_leaf;
2593 	lpl_t		*lpl_leaf;
2594 
2595 	/* called sometimes w/ cpus paused - grab no locks */
2596 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2597 
2598 	cpupart = cp->cpu_part;
2599 	lgrp_leaf = lgrp_table[lgrpid];
2600 
2601 	/* don't add non-existent lgrp */
2602 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2603 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2604 	cp->cpu_lpl = lpl_leaf;
2605 
2606 	/* only leaf lpls contain cpus */
2607 
2608 	if (lpl_leaf->lpl_ncpu++ == 0) {
2609 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2610 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2611 		lpl_leaf_insert(lpl_leaf, cpupart);
2612 	} else {
2613 		/*
2614 		 * the lpl should already exist in the parent, so just update
2615 		 * the count of available CPUs
2616 		 */
2617 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2618 	}
2619 
2620 	/* link cpu into list of cpus in lpl */
2621 
2622 	if (lpl_leaf->lpl_cpus) {
2623 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2624 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2625 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2626 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2627 	} else {
2628 		/*
2629 		 * We increment ncpu immediately after we create a new leaf
2630 		 * lpl, so assert that ncpu == 1 for the case where we don't
2631 		 * have any cpu pointers yet.
2632 		 */
2633 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2634 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2635 	}
2636 
2637 }
2638 
2639 
2640 /*
2641  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2642  *
2643  * The lpl (cpu partition load average information) is now arranged in a
2644  * hierarchical fashion whereby resources that are closest, ie. most local, to
2645  * the cpu in question are considered to be leaves in a tree of resources.
2646  * There are two removal cases in question:
2647  *
2648  * 1. Removal of the resource in the leaf leaves other resources remaining in
2649  * that leaf.  (Another cpu still exists at this level of locality).  In this
2650  * case, the count of available cpus is decremented in all assocated lpls by
2651  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2652  * from the per-cpu lpl list.
2653  *
2654  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2655  * empty)  In this case, all of what has occurred for the first step must take
2656  * place; however, additionally we must remove the lpl structure itself, prune
2657  * out any stranded lpls that do not directly name a leaf resource, and mark the
2658  * cpu partition in question as no longer containing resources from the lgrp of
2659  * the lpl that has been delted.  Cpu-partition changes are handled by this
2660  * method, but the lpl_leaf_remove function deals with the details of pruning
2661  * out the empty lpl and any of its orphaned direct ancestors.
2662  */
2663 void
2664 lgrp_part_del_cpu(cpu_t *cp)
2665 {
2666 	lpl_t		*lpl;
2667 	lpl_t		*leaf_lpl;
2668 	lgrp_t		*lgrp_leaf;
2669 
2670 	/* called sometimes w/ cpus paused - grab no locks */
2671 
2672 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2673 
2674 	lpl = leaf_lpl = cp->cpu_lpl;
2675 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2676 
2677 	/* don't delete a leaf that isn't there */
2678 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2679 
2680 	/* no double-deletes */
2681 	ASSERT(lpl->lpl_ncpu);
2682 	if (--lpl->lpl_ncpu == 0) {
2683 		/*
2684 		 * This was the last cpu in this lgroup for this partition,
2685 		 * clear its bit in the partition's lgroup bitmask
2686 		 */
2687 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2688 
2689 		/* eliminate remaning lpl link pointers in cpu, lpl */
2690 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2691 
2692 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2693 	} else {
2694 
2695 		/* unlink cpu from lists of cpus in lpl */
2696 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2697 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2698 		if (lpl->lpl_cpus == cp) {
2699 			lpl->lpl_cpus = cp->cpu_next_lpl;
2700 		}
2701 
2702 		/*
2703 		 * Update the cpu count in the lpls associated with parent
2704 		 * lgroups.
2705 		 */
2706 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2707 
2708 	}
2709 	/* clear cpu's lpl ptr when we're all done */
2710 	cp->cpu_lpl = NULL;
2711 }
2712 
2713 /*
2714  * Recompute load average for the specified partition/lgrp fragment.
2715  *
2716  * We rely on the fact that this routine is called from the clock thread
2717  * at a point before the clock thread can block (i.e. before its first
2718  * lock request).  Since the clock thread can not be preempted (since it
2719  * runs at highest priority), we know that cpu partitions can not change
2720  * (since doing so would require either the repartition requester or the
2721  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2722  * without grabbing cpu_lock.
2723  */
2724 void
2725 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2726 {
2727 	uint_t		ncpu;
2728 	int64_t		old, new, f;
2729 
2730 	/*
2731 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2732 	 */
2733 	static short expval[] = {
2734 	    0, 3196, 1618, 1083,
2735 	    814, 652, 543, 466,
2736 	    408, 363, 326, 297,
2737 	    272, 251, 233, 218,
2738 	    204, 192, 181, 172,
2739 	    163, 155, 148, 142,
2740 	    136, 130, 125, 121,
2741 	    116, 112, 109, 105
2742 	};
2743 
2744 	/* ASSERT (called from clock level) */
2745 
2746 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2747 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2748 		return;
2749 	}
2750 
2751 	for (;;) {
2752 
2753 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2754 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2755 		else
2756 			f = expval[ncpu];
2757 
2758 		/*
2759 		 * Modify the load average atomically to avoid losing
2760 		 * anticipatory load updates (see lgrp_move_thread()).
2761 		 */
2762 		if (ageflag) {
2763 			/*
2764 			 * We're supposed to both update and age the load.
2765 			 * This happens 10 times/sec. per cpu.  We do a
2766 			 * little hoop-jumping to avoid integer overflow.
2767 			 */
2768 			int64_t		q, r;
2769 
2770 			do {
2771 				old = new = lpl->lpl_loadavg;
2772 				q = (old  >> 16) << 7;
2773 				r = (old  & 0xffff) << 7;
2774 				new += ((long long)(nrcpus - q) * f -
2775 				    ((r * f) >> 16)) >> 7;
2776 
2777 				/*
2778 				 * Check for overflow
2779 				 */
2780 				if (new > LGRP_LOADAVG_MAX)
2781 					new = LGRP_LOADAVG_MAX;
2782 				else if (new < 0)
2783 					new = 0;
2784 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2785 			    new) != old);
2786 		} else {
2787 			/*
2788 			 * We're supposed to update the load, but not age it.
2789 			 * This option is used to update the load (which either
2790 			 * has already been aged in this 1/10 sec. interval or
2791 			 * soon will be) to account for a remotely executing
2792 			 * thread.
2793 			 */
2794 			do {
2795 				old = new = lpl->lpl_loadavg;
2796 				new += f;
2797 				/*
2798 				 * Check for overflow
2799 				 * Underflow not possible here
2800 				 */
2801 				if (new < old)
2802 					new = LGRP_LOADAVG_MAX;
2803 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2804 			    new) != old);
2805 		}
2806 
2807 		/*
2808 		 * Do the same for this lpl's parent
2809 		 */
2810 		if ((lpl = lpl->lpl_parent) == NULL)
2811 			break;
2812 		ncpu = lpl->lpl_ncpu;
2813 	}
2814 }
2815 
2816 /*
2817  * Initialize lpl topology in the target based on topology currently present in
2818  * lpl_bootstrap.
2819  *
2820  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2821  * initialize cp_default list of lpls. Up to this point all topology operations
2822  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2823  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2824  * `target' points to the list of lpls in cp_default and `size' is the size of
2825  * this list.
2826  *
2827  * This function walks the lpl topology in lpl_bootstrap and does for things:
2828  *
2829  * 1) Copies all fields from lpl_bootstrap to the target.
2830  *
2831  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2832  *
2833  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2834  *    instead of lpl_bootstrap.
2835  *
2836  * 4) Updates pointers in the resource list of the target to point to the lpls
2837  *    in the target list instead of lpl_bootstrap.
2838  *
2839  * After lpl_topo_bootstrap() completes, target contains the same information
2840  * that would be present there if it were used during boot instead of
2841  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2842  * and it is bzeroed.
2843  */
2844 void
2845 lpl_topo_bootstrap(lpl_t *target, int size)
2846 {
2847 	lpl_t	*lpl = lpl_bootstrap;
2848 	lpl_t	*target_lpl = target;
2849 	int	howmany;
2850 	int	id;
2851 	int	i;
2852 
2853 	/*
2854 	 * The only target that should be passed here is cp_default lpl list.
2855 	 */
2856 	ASSERT(target == cp_default.cp_lgrploads);
2857 	ASSERT(size == cp_default.cp_nlgrploads);
2858 	ASSERT(!lgrp_topo_initialized);
2859 	ASSERT(ncpus == 1);
2860 
2861 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2862 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2863 		/*
2864 		 * Copy all fields from lpl.
2865 		 */
2866 
2867 		*target_lpl = *lpl;
2868 
2869 		/*
2870 		 * Substitute CPU0 lpl pointer with one relative to target.
2871 		 */
2872 		if (lpl->lpl_cpus == CPU) {
2873 			ASSERT(CPU->cpu_lpl == lpl);
2874 			CPU->cpu_lpl = target_lpl;
2875 		}
2876 
2877 		/*
2878 		 * Substitute parent information with parent relative to target.
2879 		 */
2880 		if (lpl->lpl_parent != NULL)
2881 			target_lpl->lpl_parent = (lpl_t *)
2882 			    (((uintptr_t)lpl->lpl_parent -
2883 				(uintptr_t)lpl_bootstrap) +
2884 				(uintptr_t)target);
2885 
2886 		/*
2887 		 * Walk over resource set substituting pointers relative to
2888 		 * lpl_bootstrap to pointers relative to target.
2889 		 */
2890 		ASSERT(lpl->lpl_nrset <= 1);
2891 
2892 		for (id = 0; id < lpl->lpl_nrset; id++) {
2893 			if (lpl->lpl_rset[id] != NULL) {
2894 				target_lpl->lpl_rset[id] =
2895 				    (lpl_t *)
2896 				    (((uintptr_t)lpl->lpl_rset[id] -
2897 					(uintptr_t)lpl_bootstrap) +
2898 					(uintptr_t)target);
2899 			}
2900 		}
2901 	}
2902 
2903 	/*
2904 	 * Topology information in lpl_bootstrap is no longer needed.
2905 	 */
2906 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2907 }
2908 
2909 /*
2910  * If the lowest load among the lgroups a process' threads are currently
2911  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2912  * expanding the process to a new lgroup.
2913  */
2914 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2915 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2916 
2917 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2918 	((lgrp_expand_proc_thresh) / (ncpu))
2919 
2920 /*
2921  * A process will be expanded to a new lgroup only if the difference between
2922  * the lowest load on the lgroups the process' thread's are currently spread
2923  * across and the lowest load on the other lgroups in the process' partition
2924  * is greater than lgrp_expand_proc_diff.
2925  */
2926 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2927 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2928 
2929 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2930 	((lgrp_expand_proc_diff) / (ncpu))
2931 
2932 /*
2933  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2934  * be present due to impreciseness of the load average decay algorithm.
2935  *
2936  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2937  * tolerance is scaled by the number of cpus in the lgroup just like
2938  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2939  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2940  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2941  */
2942 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2943 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2944 	((lgrp_loadavg_tolerance) / ncpu)
2945 
2946 /*
2947  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2948  * average is above this threshold
2949  */
2950 uint32_t	lgrp_load_thresh = UINT32_MAX;
2951 
2952 /*
2953  * lgrp_choose() will try to skip any lgroups with less memory
2954  * than this free when choosing a home lgroup
2955  */
2956 pgcnt_t	lgrp_mem_free_thresh = 0;
2957 
2958 /*
2959  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2960  * one based on one of the following policies:
2961  * - Random selection
2962  * - Pseudo round robin placement
2963  * - Longest time since a thread was last placed
2964  */
2965 #define	LGRP_CHOOSE_RANDOM	1
2966 #define	LGRP_CHOOSE_RR		2
2967 #define	LGRP_CHOOSE_TIME	3
2968 
2969 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2970 
2971 /*
2972  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2973  * be bound to a CPU or processor set.
2974  *
2975  * Arguments:
2976  *	t		The thread
2977  *	cpupart		The partition the thread belongs to.
2978  *
2979  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2980  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2981  *	 partitions changing out from under us and assumes that given thread is
2982  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2983  *	 disabled, so don't grab any locks because we should never block under
2984  *	 those conditions.
2985  */
2986 lpl_t *
2987 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2988 {
2989 	lgrp_load_t	bestload, bestrload;
2990 	int		lgrpid_offset, lgrp_count;
2991 	lgrp_id_t	lgrpid, lgrpid_start;
2992 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2993 	klgrpset_t	lgrpset;
2994 	proc_t		*p;
2995 
2996 	ASSERT(t != NULL);
2997 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2998 	    THREAD_LOCK_HELD(t));
2999 	ASSERT(cpupart != NULL);
3000 
3001 	p = t->t_procp;
3002 
3003 	/* A process should always be in an active partition */
3004 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3005 
3006 	bestlpl = bestrlpl = NULL;
3007 	bestload = bestrload = LGRP_LOADAVG_MAX;
3008 	lgrpset = cpupart->cp_lgrpset;
3009 
3010 	switch (lgrp_choose_policy) {
3011 	case LGRP_CHOOSE_RR:
3012 		lgrpid = cpupart->cp_lgrp_hint;
3013 		do {
3014 			if (++lgrpid > lgrp_alloc_max)
3015 				lgrpid = 0;
3016 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3017 
3018 		break;
3019 	default:
3020 	case LGRP_CHOOSE_TIME:
3021 	case LGRP_CHOOSE_RANDOM:
3022 		klgrpset_nlgrps(lgrpset, lgrp_count);
3023 		lgrpid_offset =
3024 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3025 		for (lgrpid = 0; ; lgrpid++) {
3026 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3027 				if (--lgrpid_offset == 0)
3028 					break;
3029 			}
3030 		}
3031 		break;
3032 	}
3033 
3034 	lgrpid_start = lgrpid;
3035 
3036 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3037 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3038 
3039 	/*
3040 	 * Use lgroup affinities (if any) to choose best lgroup
3041 	 *
3042 	 * NOTE: Assumes that thread is protected from going away and its
3043 	 *	 lgroup affinities won't change (ie. p_lock, or
3044 	 *	 thread_lock() being held and/or CPUs paused)
3045 	 */
3046 	if (t->t_lgrp_affinity) {
3047 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3048 		if (lpl != NULL)
3049 			return (lpl);
3050 	}
3051 
3052 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3053 
3054 	do {
3055 		pgcnt_t	npgs;
3056 
3057 		/*
3058 		 * Skip any lgroups outside of thread's pset
3059 		 */
3060 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3061 			if (++lgrpid > lgrp_alloc_max)
3062 				lgrpid = 0;	/* wrap the search */
3063 			continue;
3064 		}
3065 
3066 		/*
3067 		 * Skip any non-leaf lgroups
3068 		 */
3069 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3070 			continue;
3071 
3072 		/*
3073 		 * Skip any lgroups without enough free memory
3074 		 * (when threshold set to nonzero positive value)
3075 		 */
3076 		if (lgrp_mem_free_thresh > 0) {
3077 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3078 			if (npgs < lgrp_mem_free_thresh) {
3079 				if (++lgrpid > lgrp_alloc_max)
3080 					lgrpid = 0;	/* wrap the search */
3081 				continue;
3082 			}
3083 		}
3084 
3085 		lpl = &cpupart->cp_lgrploads[lgrpid];
3086 		if (klgrpset_isempty(p->p_lgrpset) ||
3087 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3088 			/*
3089 			 * Either this is a new process or the process already
3090 			 * has threads on this lgrp, so this is a preferred
3091 			 * lgroup for the thread.
3092 			 */
3093 			if (bestlpl == NULL ||
3094 			    lpl_pick(lpl, bestlpl)) {
3095 				bestload = lpl->lpl_loadavg;
3096 				bestlpl = lpl;
3097 			}
3098 		} else {
3099 			/*
3100 			 * The process doesn't have any threads on this lgrp,
3101 			 * but we're willing to consider this lgrp if the load
3102 			 * difference is big enough to justify splitting up
3103 			 * the process' threads.
3104 			 */
3105 			if (bestrlpl == NULL ||
3106 			    lpl_pick(lpl, bestrlpl)) {
3107 				bestrload = lpl->lpl_loadavg;
3108 				bestrlpl = lpl;
3109 			}
3110 		}
3111 		if (++lgrpid > lgrp_alloc_max)
3112 			lgrpid = 0;	/* wrap the search */
3113 	} while (lgrpid != lgrpid_start);
3114 
3115 	/*
3116 	 * Return root lgroup if threshold isn't set to maximum value and
3117 	 * lowest lgroup load average more than a certain threshold
3118 	 */
3119 	if (lgrp_load_thresh != UINT32_MAX &&
3120 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3121 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3122 
3123 	/*
3124 	 * If all the lgroups over which the thread's process is spread are
3125 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3126 	 * the thread on one of the other leaf lgroups in the thread's
3127 	 * partition.
3128 	 */
3129 	if ((bestlpl == NULL) ||
3130 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3131 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3132 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3133 	    bestload))) {
3134 		bestlpl = bestrlpl;
3135 	}
3136 
3137 	if (bestlpl == NULL) {
3138 		/*
3139 		 * No lgroup looked particularly good, but we still
3140 		 * have to pick something. Go with the randomly selected
3141 		 * legal lgroup we started with above.
3142 		 */
3143 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3144 	}
3145 
3146 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3147 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3148 
3149 	ASSERT(bestlpl->lpl_ncpu > 0);
3150 	return (bestlpl);
3151 }
3152 
3153 /*
3154  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3155  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3156  */
3157 static int
3158 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3159 {
3160 	lgrp_load_t	l1, l2;
3161 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3162 
3163 	l1 = lpl1->lpl_loadavg;
3164 	l2 = lpl2->lpl_loadavg;
3165 
3166 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3167 		/* lpl1 is significantly less loaded than lpl2 */
3168 		return (1);
3169 	}
3170 
3171 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3172 	    l1 + tolerance >= l2 && l1 < l2 &&
3173 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3174 		/*
3175 		 * lpl1's load is within the tolerance of lpl2. We're
3176 		 * willing to consider it be to better however if
3177 		 * it has been longer since we last homed a thread there
3178 		 */
3179 		return (1);
3180 	}
3181 
3182 	return (0);
3183 }
3184 
3185 /*
3186  * An LWP is expected to be assigned to an lgroup for at least this long
3187  * for its anticipatory load to be justified.  NOTE that this value should
3188  * not be set extremely huge (say, larger than 100 years), to avoid problems
3189  * with overflow in the calculation that uses it.
3190  */
3191 #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3192 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3193 
3194 /*
3195  * Routine to change a thread's lgroup affiliation.  This routine updates
3196  * the thread's kthread_t struct and its process' proc_t struct to note the
3197  * thread's new lgroup affiliation, and its lgroup affinities.
3198  *
3199  * Note that this is the only routine that modifies a thread's t_lpl field,
3200  * and that adds in or removes anticipatory load.
3201  *
3202  * If the thread is exiting, newlpl is NULL.
3203  *
3204  * Locking:
3205  * The following lock must be held on entry:
3206  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3207  *		doesn't get removed from t's partition
3208  *
3209  * This routine is not allowed to grab any locks, since it may be called
3210  * with cpus paused (such as from cpu_offline).
3211  */
3212 void
3213 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3214 {
3215 	proc_t		*p;
3216 	lpl_t		*lpl, *oldlpl;
3217 	lgrp_id_t	oldid;
3218 	kthread_t	*tp;
3219 	uint_t		ncpu;
3220 	lgrp_load_t	old, new;
3221 
3222 	ASSERT(t);
3223 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3224 	    THREAD_LOCK_HELD(t));
3225 
3226 	/*
3227 	 * If not changing lpls, just return
3228 	 */
3229 	if ((oldlpl = t->t_lpl) == newlpl)
3230 		return;
3231 
3232 	/*
3233 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3234 	 * associated with process 0 rather than with its original process).
3235 	 */
3236 	if (t->t_proc_flag & TP_LWPEXIT) {
3237 		if (newlpl != NULL) {
3238 			t->t_lpl = newlpl;
3239 		}
3240 		return;
3241 	}
3242 
3243 	p = ttoproc(t);
3244 
3245 	/*
3246 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3247 	 * to account for it being moved from its old lgroup.
3248 	 */
3249 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3250 	    (p->p_tlist != NULL)) {
3251 		oldid = oldlpl->lpl_lgrpid;
3252 
3253 		if (newlpl != NULL)
3254 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3255 
3256 		if ((do_lgrpset_delete) &&
3257 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3258 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3259 				/*
3260 				 * Check if a thread other than the thread
3261 				 * that's moving is assigned to the same
3262 				 * lgroup as the thread that's moving.  Note
3263 				 * that we have to compare lgroup IDs, rather
3264 				 * than simply comparing t_lpl's, since the
3265 				 * threads may belong to different partitions
3266 				 * but be assigned to the same lgroup.
3267 				 */
3268 				ASSERT(tp->t_lpl != NULL);
3269 
3270 				if ((tp != t) &&
3271 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3272 					/*
3273 					 * Another thread is assigned to the
3274 					 * same lgroup as the thread that's
3275 					 * moving, p_lgrpset doesn't change.
3276 					 */
3277 					break;
3278 				} else if (tp == p->p_tlist) {
3279 					/*
3280 					 * No other thread is assigned to the
3281 					 * same lgroup as the exiting thread,
3282 					 * clear the lgroup's bit in p_lgrpset.
3283 					 */
3284 					klgrpset_del(p->p_lgrpset, oldid);
3285 					break;
3286 				}
3287 			}
3288 		}
3289 
3290 		/*
3291 		 * If this thread was assigned to its old lgroup for such a
3292 		 * short amount of time that the anticipatory load that was
3293 		 * added on its behalf has aged very little, remove that
3294 		 * anticipatory load.
3295 		 */
3296 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3297 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3298 			lpl = oldlpl;
3299 			for (;;) {
3300 				do {
3301 					old = new = lpl->lpl_loadavg;
3302 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3303 					if (new > old) {
3304 						/*
3305 						 * this can happen if the load
3306 						 * average was aged since we
3307 						 * added in the anticipatory
3308 						 * load
3309 						 */
3310 						new = 0;
3311 					}
3312 				} while (cas32(
3313 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3314 					    new) != old);
3315 
3316 				lpl = lpl->lpl_parent;
3317 				if (lpl == NULL)
3318 					break;
3319 
3320 				ncpu = lpl->lpl_ncpu;
3321 				ASSERT(ncpu > 0);
3322 			}
3323 		}
3324 	}
3325 	/*
3326 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3327 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3328 	 * to its new lgroup to account for its move to its new lgroup.
3329 	 */
3330 	if (newlpl != NULL) {
3331 		/*
3332 		 * This thread is moving to a new lgroup
3333 		 */
3334 		t->t_lpl = newlpl;
3335 
3336 		/*
3337 		 * Reflect move in load average of new lgroup
3338 		 * unless it is root lgroup
3339 		 */
3340 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3341 			return;
3342 
3343 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3344 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3345 		}
3346 
3347 		/*
3348 		 * It'll take some time for the load on the new lgroup
3349 		 * to reflect this thread's placement on it.  We'd
3350 		 * like not, however, to have all threads between now
3351 		 * and then also piling on to this lgroup.  To avoid
3352 		 * this pileup, we anticipate the load this thread
3353 		 * will generate on its new lgroup.  The goal is to
3354 		 * make the lgroup's load appear as though the thread
3355 		 * had been there all along.  We're very conservative
3356 		 * in calculating this anticipatory load, we assume
3357 		 * the worst case case (100% CPU-bound thread).  This
3358 		 * may be modified in the future to be more accurate.
3359 		 */
3360 		lpl = newlpl;
3361 		for (;;) {
3362 			ncpu = lpl->lpl_ncpu;
3363 			ASSERT(ncpu > 0);
3364 			do {
3365 				old = new = lpl->lpl_loadavg;
3366 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3367 				/*
3368 				 * Check for overflow
3369 				 * Underflow not possible here
3370 				 */
3371 				if (new < old)
3372 					new = UINT32_MAX;
3373 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3374 			    new) != old);
3375 
3376 			lpl = lpl->lpl_parent;
3377 			if (lpl == NULL)
3378 				break;
3379 		}
3380 		t->t_anttime = gethrtime();
3381 	}
3382 }
3383 
3384 /*
3385  * Return lgroup memory allocation policy given advice from madvise(3C)
3386  */
3387 lgrp_mem_policy_t
3388 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3389 {
3390 	switch (advice) {
3391 	case MADV_ACCESS_LWP:
3392 		return (LGRP_MEM_POLICY_NEXT);
3393 	case MADV_ACCESS_MANY:
3394 		return (LGRP_MEM_POLICY_RANDOM);
3395 	default:
3396 		return (lgrp_mem_policy_default(size, type));
3397 	}
3398 }
3399 
3400 /*
3401  * Figure out default policy
3402  */
3403 lgrp_mem_policy_t
3404 lgrp_mem_policy_default(size_t size, int type)
3405 {
3406 	cpupart_t		*cp;
3407 	lgrp_mem_policy_t	policy;
3408 	size_t			pset_mem_size;
3409 
3410 	/*
3411 	 * Randomly allocate memory across lgroups for shared memory
3412 	 * beyond a certain threshold
3413 	 */
3414 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3415 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3416 		/*
3417 		 * Get total memory size of current thread's pset
3418 		 */
3419 		kpreempt_disable();
3420 		cp = curthread->t_cpupart;
3421 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3422 		kpreempt_enable();
3423 
3424 		/*
3425 		 * Choose policy to randomly allocate memory across
3426 		 * lgroups in pset if it will fit and is not default
3427 		 * partition.  Otherwise, allocate memory randomly
3428 		 * across machine.
3429 		 */
3430 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3431 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3432 		else
3433 			policy = LGRP_MEM_POLICY_RANDOM;
3434 	} else
3435 		/*
3436 		 * Apply default policy for private memory and
3437 		 * shared memory under the respective random
3438 		 * threshold.
3439 		 */
3440 		policy = lgrp_mem_default_policy;
3441 
3442 	return (policy);
3443 }
3444 
3445 /*
3446  * Get memory allocation policy for this segment
3447  */
3448 lgrp_mem_policy_info_t *
3449 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3450 {
3451 	lgrp_mem_policy_info_t	*policy_info;
3452 	extern struct seg_ops	segspt_ops;
3453 	extern struct seg_ops	segspt_shmops;
3454 
3455 	/*
3456 	 * This is for binary compatibility to protect against third party
3457 	 * segment drivers which haven't recompiled to allow for
3458 	 * SEGOP_GETPOLICY()
3459 	 */
3460 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3461 	    seg->s_ops != &segspt_shmops)
3462 		return (NULL);
3463 
3464 	policy_info = NULL;
3465 	if (seg->s_ops->getpolicy != NULL)
3466 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3467 
3468 	return (policy_info);
3469 }
3470 
3471 /*
3472  * Set policy for allocating private memory given desired policy, policy info,
3473  * size in bytes of memory that policy is being applied.
3474  * Return 0 if policy wasn't set already and 1 if policy was set already
3475  */
3476 int
3477 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3478     lgrp_mem_policy_info_t *policy_info, size_t size)
3479 {
3480 
3481 	ASSERT(policy_info != NULL);
3482 
3483 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3484 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3485 
3486 	/*
3487 	 * Policy set already?
3488 	 */
3489 	if (policy == policy_info->mem_policy)
3490 		return (1);
3491 
3492 	/*
3493 	 * Set policy
3494 	 */
3495 	policy_info->mem_policy = policy;
3496 	policy_info->mem_reserved = 0;
3497 
3498 	return (0);
3499 }
3500 
3501 
3502 /*
3503  * Get shared memory allocation policy with given tree and offset
3504  */
3505 lgrp_mem_policy_info_t *
3506 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3507     u_offset_t vn_off)
3508 {
3509 	u_offset_t		off;
3510 	lgrp_mem_policy_info_t	*policy_info;
3511 	lgrp_shm_policy_seg_t	*policy_seg;
3512 	lgrp_shm_locality_t	*shm_locality;
3513 	avl_tree_t		*tree;
3514 	avl_index_t		where;
3515 
3516 	/*
3517 	 * Get policy segment tree from anon_map or vnode and use specified
3518 	 * anon index or vnode offset as offset
3519 	 *
3520 	 * Assume that no lock needs to be held on anon_map or vnode, since
3521 	 * they should be protected by their reference count which must be
3522 	 * nonzero for an existing segment
3523 	 */
3524 	if (amp) {
3525 		ASSERT(amp->refcnt != 0);
3526 		shm_locality = amp->locality;
3527 		if (shm_locality == NULL)
3528 			return (NULL);
3529 		tree = shm_locality->loc_tree;
3530 		off = ptob(anon_index);
3531 	} else if (vp) {
3532 		shm_locality = vp->v_locality;
3533 		if (shm_locality == NULL)
3534 			return (NULL);
3535 		ASSERT(shm_locality->loc_count != 0);
3536 		tree = shm_locality->loc_tree;
3537 		off = vn_off;
3538 	}
3539 
3540 	if (tree == NULL)
3541 		return (NULL);
3542 
3543 	/*
3544 	 * Lookup policy segment for offset into shared object and return
3545 	 * policy info
3546 	 */
3547 	rw_enter(&shm_locality->loc_lock, RW_READER);
3548 	policy_info = NULL;
3549 	policy_seg = avl_find(tree, &off, &where);
3550 	if (policy_seg)
3551 		policy_info = &policy_seg->shm_policy;
3552 	rw_exit(&shm_locality->loc_lock);
3553 
3554 	return (policy_info);
3555 }
3556 
3557 /*
3558  * Default memory allocation policy for kernel segmap pages
3559  */
3560 lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3561 
3562 /*
3563  * Return lgroup to use for allocating memory
3564  * given the segment and address
3565  *
3566  * There isn't any mutual exclusion that exists between calls
3567  * to this routine and DR, so this routine and whomever calls it
3568  * should be mindful of the possibility that the lgrp returned
3569  * may be deleted. If this happens, dereferences of the lgrp
3570  * pointer will still be safe, but the resources in the lgrp will
3571  * be gone, and LGRP_EXISTS() will no longer be true.
3572  */
3573 lgrp_t *
3574 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3575 {
3576 	int			i;
3577 	lgrp_t			*lgrp;
3578 	klgrpset_t		lgrpset;
3579 	int			lgrps_spanned;
3580 	unsigned long		off;
3581 	lgrp_mem_policy_t	policy;
3582 	lgrp_mem_policy_info_t	*policy_info;
3583 	ushort_t		random;
3584 	int			stat = 0;
3585 	extern struct seg	*segkmap;
3586 
3587 	/*
3588 	 * Just return null if the lgrp framework hasn't finished
3589 	 * initializing or if this is a UMA machine.
3590 	 */
3591 	if (nlgrps == 1 || !lgrp_initialized)
3592 		return (lgrp_root);
3593 
3594 	/*
3595 	 * Get memory allocation policy for this segment
3596 	 */
3597 	policy = lgrp_mem_default_policy;
3598 	if (seg != NULL) {
3599 		if (seg->s_as == &kas) {
3600 			if (seg == segkmap)
3601 				policy = lgrp_segmap_default_policy;
3602 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3603 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3604 				policy = LGRP_MEM_POLICY_RANDOM;
3605 		} else {
3606 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3607 			if (policy_info != NULL)
3608 				policy = policy_info->mem_policy;
3609 		}
3610 	}
3611 	lgrpset = 0;
3612 
3613 	/*
3614 	 * Initialize lgroup to home by default
3615 	 */
3616 	lgrp = lgrp_home_lgrp();
3617 
3618 	/*
3619 	 * When homing threads on root lgrp, override default memory
3620 	 * allocation policies with root lgroup memory allocation policy
3621 	 */
3622 	if (lgrp == lgrp_root)
3623 		policy = lgrp_mem_policy_root;
3624 
3625 	/*
3626 	 * Implement policy
3627 	 */
3628 	switch (policy) {
3629 	case LGRP_MEM_POLICY_NEXT_CPU:
3630 
3631 		/*
3632 		 * Return lgroup of current CPU which faulted on memory
3633 		 * If the CPU isn't currently in an lgrp, then opt to
3634 		 * allocate from the root.
3635 		 *
3636 		 * Kernel preemption needs to be disabled here to prevent
3637 		 * the current CPU from going away before lgrp is found.
3638 		 */
3639 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3640 			lgrp = lgrp_root;
3641 		} else {
3642 			kpreempt_disable();
3643 			lgrp = lgrp_cpu_to_lgrp(CPU);
3644 			kpreempt_enable();
3645 		}
3646 		break;
3647 
3648 	case LGRP_MEM_POLICY_NEXT:
3649 	case LGRP_MEM_POLICY_DEFAULT:
3650 	default:
3651 
3652 		/*
3653 		 * Just return current thread's home lgroup
3654 		 * for default policy (next touch)
3655 		 * If the thread is homed to the root,
3656 		 * then the default policy is random across lgroups.
3657 		 * Fallthrough to the random case.
3658 		 */
3659 		if (lgrp != lgrp_root) {
3660 			if (policy == LGRP_MEM_POLICY_NEXT)
3661 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3662 			else
3663 				lgrp_stat_add(lgrp->lgrp_id,
3664 				    LGRP_NUM_DEFAULT, 1);
3665 			break;
3666 		}
3667 		/* LINTED fallthrough on case statement */
3668 	case LGRP_MEM_POLICY_RANDOM:
3669 
3670 		/*
3671 		 * Return a random leaf lgroup with memory
3672 		 */
3673 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3674 		/*
3675 		 * Count how many lgroups are spanned
3676 		 */
3677 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3678 
3679 		/*
3680 		 * There may be no memnodes in the root lgroup during DR copy
3681 		 * rename on a system with only two boards (memnodes)
3682 		 * configured. In this case just return the root lgrp.
3683 		 */
3684 		if (lgrps_spanned == 0) {
3685 			lgrp = lgrp_root;
3686 			break;
3687 		}
3688 
3689 		/*
3690 		 * Pick a random offset within lgroups spanned
3691 		 * and return lgroup at that offset
3692 		 */
3693 		random = (ushort_t)gethrtime() >> 4;
3694 		off = random % lgrps_spanned;
3695 		ASSERT(off <= lgrp_alloc_max);
3696 
3697 		for (i = 0; i <= lgrp_alloc_max; i++) {
3698 			if (!klgrpset_ismember(lgrpset, i))
3699 				continue;
3700 			if (off)
3701 				off--;
3702 			else {
3703 				lgrp = lgrp_table[i];
3704 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3705 				    1);
3706 				break;
3707 			}
3708 		}
3709 		break;
3710 
3711 	case LGRP_MEM_POLICY_RANDOM_PROC:
3712 
3713 		/*
3714 		 * Grab copy of bitmask of lgroups spanned by
3715 		 * this process
3716 		 */
3717 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3718 		stat = LGRP_NUM_RANDOM_PROC;
3719 
3720 		/* LINTED fallthrough on case statement */
3721 	case LGRP_MEM_POLICY_RANDOM_PSET:
3722 
3723 		if (!stat)
3724 			stat = LGRP_NUM_RANDOM_PSET;
3725 
3726 		if (klgrpset_isempty(lgrpset)) {
3727 			/*
3728 			 * Grab copy of bitmask of lgroups spanned by
3729 			 * this processor set
3730 			 */
3731 			kpreempt_disable();
3732 			klgrpset_copy(lgrpset,
3733 			    curthread->t_cpupart->cp_lgrpset);
3734 			kpreempt_enable();
3735 		}
3736 
3737 		/*
3738 		 * Count how many lgroups are spanned
3739 		 */
3740 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3741 		ASSERT(lgrps_spanned <= nlgrps);
3742 
3743 		/*
3744 		 * Probably lgrps_spanned should be always non-zero, but to be
3745 		 * on the safe side we return lgrp_root if it is empty.
3746 		 */
3747 		if (lgrps_spanned == 0) {
3748 			lgrp = lgrp_root;
3749 			break;
3750 		}
3751 
3752 		/*
3753 		 * Pick a random offset within lgroups spanned
3754 		 * and return lgroup at that offset
3755 		 */
3756 		random = (ushort_t)gethrtime() >> 4;
3757 		off = random % lgrps_spanned;
3758 		ASSERT(off <= lgrp_alloc_max);
3759 
3760 		for (i = 0; i <= lgrp_alloc_max; i++) {
3761 			if (!klgrpset_ismember(lgrpset, i))
3762 				continue;
3763 			if (off)
3764 				off--;
3765 			else {
3766 				lgrp = lgrp_table[i];
3767 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3768 				    1);
3769 				break;
3770 			}
3771 		}
3772 		break;
3773 
3774 	case LGRP_MEM_POLICY_ROUNDROBIN:
3775 
3776 		/*
3777 		 * Use offset within segment to determine
3778 		 * offset from home lgroup to choose for
3779 		 * next lgroup to allocate memory from
3780 		 */
3781 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3782 		    (lgrp_alloc_max + 1);
3783 
3784 		kpreempt_disable();
3785 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3786 		i = lgrp->lgrp_id;
3787 		kpreempt_enable();
3788 
3789 		while (off > 0) {
3790 			i = (i + 1) % (lgrp_alloc_max + 1);
3791 			lgrp = lgrp_table[i];
3792 			if (klgrpset_ismember(lgrpset, i))
3793 				off--;
3794 		}
3795 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3796 
3797 		break;
3798 	}
3799 
3800 	ASSERT(lgrp != NULL);
3801 	return (lgrp);
3802 }
3803 
3804 /*
3805  * Return the number of pages in an lgroup
3806  *
3807  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3808  *	 could cause tests that rely on the numat driver to fail....
3809  */
3810 pgcnt_t
3811 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3812 {
3813 	lgrp_t *lgrp;
3814 
3815 	lgrp = lgrp_table[lgrpid];
3816 	if (!LGRP_EXISTS(lgrp) ||
3817 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3818 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3819 		return (0);
3820 
3821 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3822 }
3823 
3824 /*
3825  * Initialize lgroup shared memory allocation policy support
3826  */
3827 void
3828 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3829 {
3830 	lgrp_shm_locality_t	*shm_locality;
3831 
3832 	/*
3833 	 * Initialize locality field in anon_map
3834 	 * Don't need any locks because this is called when anon_map is
3835 	 * allocated, but not used anywhere yet.
3836 	 */
3837 	if (amp) {
3838 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3839 		if (amp->locality == NULL) {
3840 			/*
3841 			 * Allocate and initialize shared memory locality info
3842 			 * and set anon_map locality pointer to it
3843 			 * Drop lock across kmem_alloc(KM_SLEEP)
3844 			 */
3845 			ANON_LOCK_EXIT(&amp->a_rwlock);
3846 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3847 			    KM_SLEEP);
3848 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3849 			    NULL);
3850 			shm_locality->loc_count = 1;	/* not used for amp */
3851 			shm_locality->loc_tree = NULL;
3852 
3853 			/*
3854 			 * Reacquire lock and check to see whether anyone beat
3855 			 * us to initializing the locality info
3856 			 */
3857 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3858 			if (amp->locality != NULL) {
3859 				rw_destroy(&shm_locality->loc_lock);
3860 				kmem_free(shm_locality,
3861 				    sizeof (*shm_locality));
3862 			} else
3863 				amp->locality = shm_locality;
3864 		}
3865 		ANON_LOCK_EXIT(&amp->a_rwlock);
3866 		return;
3867 	}
3868 
3869 	/*
3870 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3871 	 */
3872 	mutex_enter(&vp->v_lock);
3873 	if ((vp->v_flag & V_LOCALITY) == 0) {
3874 		/*
3875 		 * Allocate and initialize shared memory locality info
3876 		 */
3877 		mutex_exit(&vp->v_lock);
3878 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3879 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3880 		shm_locality->loc_count = 1;
3881 		shm_locality->loc_tree = NULL;
3882 
3883 		/*
3884 		 * Point vnode locality field at shared vnode policy info
3885 		 * and set locality aware flag in vnode
3886 		 */
3887 		mutex_enter(&vp->v_lock);
3888 		if ((vp->v_flag & V_LOCALITY) == 0) {
3889 			vp->v_locality = shm_locality;
3890 			vp->v_flag |= V_LOCALITY;
3891 		} else {
3892 			/*
3893 			 * Lost race so free locality info and increment count.
3894 			 */
3895 			rw_destroy(&shm_locality->loc_lock);
3896 			kmem_free(shm_locality, sizeof (*shm_locality));
3897 			shm_locality = vp->v_locality;
3898 			shm_locality->loc_count++;
3899 		}
3900 		mutex_exit(&vp->v_lock);
3901 
3902 		return;
3903 	}
3904 
3905 	/*
3906 	 * Increment reference count of number of segments mapping this vnode
3907 	 * shared
3908 	 */
3909 	shm_locality = vp->v_locality;
3910 	shm_locality->loc_count++;
3911 	mutex_exit(&vp->v_lock);
3912 }
3913 
3914 /*
3915  * Destroy the given shared memory policy segment tree
3916  */
3917 void
3918 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3919 {
3920 	lgrp_shm_policy_seg_t	*cur;
3921 	lgrp_shm_policy_seg_t	*next;
3922 
3923 	if (tree == NULL)
3924 		return;
3925 
3926 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3927 	while (cur != NULL) {
3928 		next = AVL_NEXT(tree, cur);
3929 		avl_remove(tree, cur);
3930 		kmem_free(cur, sizeof (*cur));
3931 		cur = next;
3932 	}
3933 	kmem_free(tree, sizeof (avl_tree_t));
3934 }
3935 
3936 /*
3937  * Uninitialize lgroup shared memory allocation policy support
3938  */
3939 void
3940 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3941 {
3942 	lgrp_shm_locality_t	*shm_locality;
3943 
3944 	/*
3945 	 * For anon_map, deallocate shared memory policy tree and
3946 	 * zero locality field
3947 	 * Don't need any locks because anon_map is being freed
3948 	 */
3949 	if (amp) {
3950 		if (amp->locality == NULL)
3951 			return;
3952 		shm_locality = amp->locality;
3953 		shm_locality->loc_count = 0;	/* not really used for amp */
3954 		rw_destroy(&shm_locality->loc_lock);
3955 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3956 		kmem_free(shm_locality, sizeof (*shm_locality));
3957 		amp->locality = 0;
3958 		return;
3959 	}
3960 
3961 	/*
3962 	 * For vnode, decrement reference count of segments mapping this vnode
3963 	 * shared and delete locality info if reference count drops to 0
3964 	 */
3965 	mutex_enter(&vp->v_lock);
3966 	shm_locality = vp->v_locality;
3967 	shm_locality->loc_count--;
3968 
3969 	if (shm_locality->loc_count == 0) {
3970 		rw_destroy(&shm_locality->loc_lock);
3971 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3972 		kmem_free(shm_locality, sizeof (*shm_locality));
3973 		vp->v_locality = 0;
3974 		vp->v_flag &= ~V_LOCALITY;
3975 	}
3976 	mutex_exit(&vp->v_lock);
3977 }
3978 
3979 /*
3980  * Compare two shared memory policy segments
3981  * Used by AVL tree code for searching
3982  */
3983 int
3984 lgrp_shm_policy_compar(const void *x, const void *y)
3985 {
3986 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3987 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3988 
3989 	if (a->shm_off < b->shm_off)
3990 		return (-1);
3991 	if (a->shm_off >= b->shm_off + b->shm_size)
3992 		return (1);
3993 	return (0);
3994 }
3995 
3996 /*
3997  * Concatenate seg1 with seg2 and remove seg2
3998  */
3999 static int
4000 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4001     lgrp_shm_policy_seg_t *seg2)
4002 {
4003 	if (!seg1 || !seg2 ||
4004 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4005 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4006 		return (-1);
4007 
4008 	seg1->shm_size += seg2->shm_size;
4009 	avl_remove(tree, seg2);
4010 	kmem_free(seg2, sizeof (*seg2));
4011 	return (0);
4012 }
4013 
4014 /*
4015  * Split segment at given offset and return rightmost (uppermost) segment
4016  * Assumes that there are no overlapping segments
4017  */
4018 static lgrp_shm_policy_seg_t *
4019 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4020     u_offset_t off)
4021 {
4022 	lgrp_shm_policy_seg_t	*newseg;
4023 	avl_index_t		where;
4024 
4025 	ASSERT(seg != NULL);
4026 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4027 
4028 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4029 	    seg->shm_size)
4030 		return (NULL);
4031 
4032 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4033 		return (seg);
4034 
4035 	/*
4036 	 * Adjust size of left segment and allocate new (right) segment
4037 	 */
4038 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4039 	newseg->shm_policy = seg->shm_policy;
4040 	newseg->shm_off = off;
4041 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4042 	seg->shm_size = off - seg->shm_off;
4043 
4044 	/*
4045 	 * Find where to insert new segment in AVL tree and insert it
4046 	 */
4047 	(void) avl_find(tree, &off, &where);
4048 	avl_insert(tree, newseg, where);
4049 
4050 	return (newseg);
4051 }
4052 
4053 /*
4054  * Set shared memory allocation policy on specified shared object at given
4055  * offset and length
4056  *
4057  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4058  * -1 if can't set policy.
4059  */
4060 int
4061 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4062     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4063 {
4064 	u_offset_t		eoff;
4065 	lgrp_shm_policy_seg_t	*next;
4066 	lgrp_shm_policy_seg_t	*newseg;
4067 	u_offset_t		off;
4068 	u_offset_t		oldeoff;
4069 	lgrp_shm_policy_seg_t	*prev;
4070 	int			retval;
4071 	lgrp_shm_policy_seg_t	*seg;
4072 	lgrp_shm_locality_t	*shm_locality;
4073 	avl_tree_t		*tree;
4074 	avl_index_t		where;
4075 
4076 	ASSERT(amp || vp);
4077 	ASSERT((len & PAGEOFFSET) == 0);
4078 
4079 	if (len == 0)
4080 		return (-1);
4081 
4082 	retval = 0;
4083 
4084 	/*
4085 	 * Get locality info and starting offset into shared object
4086 	 * Try anon map first and then vnode
4087 	 * Assume that no locks need to be held on anon_map or vnode, since
4088 	 * it should be protected by its reference count which must be nonzero
4089 	 * for an existing segment.
4090 	 */
4091 	if (amp) {
4092 		/*
4093 		 * Get policy info from anon_map
4094 		 *
4095 		 */
4096 		ASSERT(amp->refcnt != 0);
4097 		if (amp->locality == NULL)
4098 			lgrp_shm_policy_init(amp, NULL);
4099 		shm_locality = amp->locality;
4100 		off = ptob(anon_index);
4101 	} else if (vp) {
4102 		/*
4103 		 * Get policy info from vnode
4104 		 */
4105 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4106 			lgrp_shm_policy_init(NULL, vp);
4107 		shm_locality = vp->v_locality;
4108 		ASSERT(shm_locality->loc_count != 0);
4109 		off = vn_off;
4110 	} else
4111 		return (-1);
4112 
4113 	ASSERT((off & PAGEOFFSET) == 0);
4114 
4115 	/*
4116 	 * Figure out default policy
4117 	 */
4118 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4119 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4120 
4121 	/*
4122 	 * Create AVL tree if there isn't one yet
4123 	 * and set locality field to point at it
4124 	 */
4125 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4126 	tree = shm_locality->loc_tree;
4127 	if (!tree) {
4128 		rw_exit(&shm_locality->loc_lock);
4129 
4130 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4131 
4132 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4133 		if (shm_locality->loc_tree == NULL) {
4134 			avl_create(tree, lgrp_shm_policy_compar,
4135 			    sizeof (lgrp_shm_policy_seg_t),
4136 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4137 			shm_locality->loc_tree = tree;
4138 		} else {
4139 			/*
4140 			 * Another thread managed to set up the tree
4141 			 * before we could. Free the tree we allocated
4142 			 * and use the one that's already there.
4143 			 */
4144 			kmem_free(tree, sizeof (*tree));
4145 			tree = shm_locality->loc_tree;
4146 		}
4147 	}
4148 
4149 	/*
4150 	 * Set policy
4151 	 *
4152 	 * Need to maintain hold on writer's lock to keep tree from
4153 	 * changing out from under us
4154 	 */
4155 	while (len != 0) {
4156 		/*
4157 		 * Find policy segment for specified offset into shared object
4158 		 */
4159 		seg = avl_find(tree, &off, &where);
4160 
4161 		/*
4162 		 * Didn't find any existing segment that contains specified
4163 		 * offset, so allocate new segment, insert it, and concatenate
4164 		 * with adjacent segments if possible
4165 		 */
4166 		if (seg == NULL) {
4167 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4168 			    KM_SLEEP);
4169 			newseg->shm_policy.mem_policy = policy;
4170 			newseg->shm_policy.mem_reserved = 0;
4171 			newseg->shm_off = off;
4172 			avl_insert(tree, newseg, where);
4173 
4174 			/*
4175 			 * Check to see whether new segment overlaps with next
4176 			 * one, set length of new segment accordingly, and
4177 			 * calculate remaining length and next offset
4178 			 */
4179 			seg = AVL_NEXT(tree, newseg);
4180 			if (seg == NULL || off + len <= seg->shm_off) {
4181 				newseg->shm_size = len;
4182 				len = 0;
4183 			} else {
4184 				newseg->shm_size = seg->shm_off - off;
4185 				off = seg->shm_off;
4186 				len -= newseg->shm_size;
4187 			}
4188 
4189 			/*
4190 			 * Try to concatenate new segment with next and
4191 			 * previous ones, since they might have the same policy
4192 			 * now.  Grab previous and next segments first because
4193 			 * they will change on concatenation.
4194 			 */
4195 			prev =  AVL_PREV(tree, newseg);
4196 			next = AVL_NEXT(tree, newseg);
4197 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4198 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4199 
4200 			continue;
4201 		}
4202 
4203 		eoff = off + len;
4204 		oldeoff = seg->shm_off + seg->shm_size;
4205 
4206 		/*
4207 		 * Policy set already?
4208 		 */
4209 		if (policy == seg->shm_policy.mem_policy) {
4210 			/*
4211 			 * Nothing left to do if offset and length
4212 			 * fall within this segment
4213 			 */
4214 			if (eoff <= oldeoff) {
4215 				retval = 1;
4216 				break;
4217 			} else {
4218 				len = eoff - oldeoff;
4219 				off = oldeoff;
4220 				continue;
4221 			}
4222 		}
4223 
4224 		/*
4225 		 * Specified offset and length match existing segment exactly
4226 		 */
4227 		if (off == seg->shm_off && len == seg->shm_size) {
4228 			/*
4229 			 * Set policy and update current length
4230 			 */
4231 			seg->shm_policy.mem_policy = policy;
4232 			seg->shm_policy.mem_reserved = 0;
4233 			len = 0;
4234 
4235 			/*
4236 			 * Try concatenating new segment with previous and next
4237 			 * segments, since they might have the same policy now.
4238 			 * Grab previous and next segments first because they
4239 			 * will change on concatenation.
4240 			 */
4241 			prev =  AVL_PREV(tree, seg);
4242 			next = AVL_NEXT(tree, seg);
4243 			(void) lgrp_shm_policy_concat(tree, seg, next);
4244 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4245 		} else {
4246 			/*
4247 			 * Specified offset and length only apply to part of
4248 			 * existing segment
4249 			 */
4250 
4251 			/*
4252 			 * New segment starts in middle of old one, so split
4253 			 * new one off near beginning of old one
4254 			 */
4255 			newseg = NULL;
4256 			if (off > seg->shm_off) {
4257 				newseg = lgrp_shm_policy_split(tree, seg, off);
4258 
4259 				/*
4260 				 * New segment ends where old one did, so try
4261 				 * to concatenate with next segment
4262 				 */
4263 				if (eoff == oldeoff) {
4264 					newseg->shm_policy.mem_policy = policy;
4265 					newseg->shm_policy.mem_reserved = 0;
4266 					(void) lgrp_shm_policy_concat(tree,
4267 					    newseg, AVL_NEXT(tree, newseg));
4268 					break;
4269 				}
4270 			}
4271 
4272 			/*
4273 			 * New segment ends before old one, so split off end of
4274 			 * old one
4275 			 */
4276 			if (eoff < oldeoff) {
4277 				if (newseg) {
4278 					(void) lgrp_shm_policy_split(tree,
4279 					    newseg, eoff);
4280 					newseg->shm_policy.mem_policy = policy;
4281 					newseg->shm_policy.mem_reserved = 0;
4282 				} else {
4283 					(void) lgrp_shm_policy_split(tree, seg,
4284 					    eoff);
4285 					seg->shm_policy.mem_policy = policy;
4286 					seg->shm_policy.mem_reserved = 0;
4287 				}
4288 
4289 				if (off == seg->shm_off)
4290 					(void) lgrp_shm_policy_concat(tree,
4291 					    AVL_PREV(tree, seg), seg);
4292 				break;
4293 			}
4294 
4295 			/*
4296 			 * Calculate remaining length and next offset
4297 			 */
4298 			len = eoff - oldeoff;
4299 			off = oldeoff;
4300 		}
4301 	}
4302 
4303 	rw_exit(&shm_locality->loc_lock);
4304 	return (retval);
4305 }
4306 
4307 /*
4308  * Return the best memnode from which to allocate memory given
4309  * an lgroup.
4310  *
4311  * "c" is for cookie, which is good enough for me.
4312  * It references a cookie struct that should be zero'ed to initialize.
4313  * The cookie should live on the caller's stack.
4314  *
4315  * The routine returns -1 when:
4316  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4317  *	- traverse is 1, and all the memnodes in the system have been
4318  *	  returned.
4319  */
4320 int
4321 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4322 {
4323 	lgrp_t		*lp = c->lmc_lgrp;
4324 	mnodeset_t	nodes = c->lmc_nodes;
4325 	int		cnt = c->lmc_cnt;
4326 	int		offset, mnode;
4327 
4328 	extern int	max_mem_nodes;
4329 
4330 	/*
4331 	 * If the set is empty, and the caller is willing, traverse
4332 	 * up the hierarchy until we find a non-empty set.
4333 	 */
4334 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4335 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4336 		    ((lp = lp->lgrp_parent) == NULL))
4337 			return (-1);
4338 
4339 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4340 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4341 	}
4342 
4343 	/*
4344 	 * Select a memnode by picking one at a "random" offset.
4345 	 * Because of DR, memnodes can come and go at any time.
4346 	 * This code must be able to cope with the possibility
4347 	 * that the nodes count "cnt" is inconsistent with respect
4348 	 * to the number of elements actually in "nodes", and
4349 	 * therefore that the offset chosen could be greater than
4350 	 * the number of elements in the set (some memnodes may
4351 	 * have dissapeared just before cnt was read).
4352 	 * If this happens, the search simply wraps back to the
4353 	 * beginning of the set.
4354 	 */
4355 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4356 	offset = c->lmc_rand % cnt;
4357 	do {
4358 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4359 			if (nodes & ((mnodeset_t)1 << mnode))
4360 				if (!offset--)
4361 					break;
4362 	} while (mnode >= max_mem_nodes);
4363 
4364 	/* Found a node. Store state before returning. */
4365 	c->lmc_lgrp = lp;
4366 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4367 	c->lmc_cnt = cnt - 1;
4368 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4369 	c->lmc_ntried++;
4370 
4371 	return (mnode);
4372 }
4373