xref: /illumos-gate/usr/src/uts/common/os/lgrp.c (revision b3001def2a41995242feff3e584ad9ead06d7b1b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/chip.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to kstat framework. It is protected from partallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168 
169 static lgrp_t	lroot;
170 
171 /*
172  * Size, in bytes, beyond which random memory allocation policy is applied
173  * to non-shared memory.  Default is the maximum size, so random memory
174  * allocation won't be used for non-shared memory by default.
175  */
176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
177 
178 /* the maximum effect that a single thread can have on it's lgroup's load */
179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 	((lgrp_loadavg_max_effect) / (ncpu))
181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 
183 
184 /*
185  * Size, in bytes, beyond which random memory allocation policy is applied to
186  * shared memory.  Default is 8MB (2 ISM pages).
187  */
188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
189 
190 /*
191  * Whether to do processor set aware memory allocation by default
192  */
193 int	lgrp_mem_pset_aware = 0;
194 
195 /*
196  * Set the default memory allocation policy for root lgroup
197  */
198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 
200 /*
201  * Set the default memory allocation policy.  For most platforms,
202  * next touch is sufficient, but some platforms may wish to override
203  * this.
204  */
205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
208 /*
209  * lgroup CPU event handlers
210  */
211 static void	lgrp_cpu_init(struct cpu *);
212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214 
215 /*
216  * lgroup memory event handlers
217  */
218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 
222 /*
223  * lgroup CPU partition event handlers
224  */
225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 static void	lgrp_part_del_cpu(struct cpu *);
227 
228 static void	lgrp_root_init(void);
229 
230 /*
231  * lpl topology
232  */
233 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234 static void	lpl_clear(lpl_t *);
235 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237 static void	lpl_rset_add(lpl_t *, lpl_t *);
238 static void	lpl_rset_del(lpl_t *, lpl_t *);
239 static int	lpl_rset_contains(lpl_t *, lpl_t *);
240 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241 static void	lpl_child_update(lpl_t *, struct cpupart *);
242 static int	lpl_pick(lpl_t *, lpl_t *);
243 static void	lpl_verify_wrapper(struct cpupart *);
244 
/*
 * Return codes of the lpl topology verifier.
 *
 * LPL_TOPO_CORRECT (0) means the topology checked out; every other code is a
 * distinct negative value identifying the inconsistency that was found.  The
 * negative replacement lists are parenthesized so that each macro always
 * expands to a single primary expression, whatever operators surround it.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		(-1)
#define	LPL_TOPO_CPUS_NOT_EMPTY			(-2)
#define	LPL_TOPO_LGRP_MISMATCH			(-3)
#define	LPL_TOPO_MISSING_PARENT			(-4)
#define	LPL_TOPO_PARENT_MISMATCH		(-5)
#define	LPL_TOPO_BAD_CPUCNT			(-6)
#define	LPL_TOPO_RSET_MISMATCH			(-7)
#define	LPL_TOPO_LPL_ORPHANED			(-8)
#define	LPL_TOPO_LPL_BAD_NCPU			(-9)
#define	LPL_TOPO_RSET_MSSNG_LF			(-10)
#define	LPL_TOPO_CPU_HAS_BAD_LPL		(-11)
#define	LPL_TOPO_BOGUS_HINT			(-12)
#define	LPL_TOPO_NONLEAF_HAS_CPUS		(-13)
#define	LPL_TOPO_LGRP_NOT_LEAF			(-14)
#define	LPL_TOPO_BAD_RSETCNT			(-15)
265 
266 /*
267  * Return whether lgroup optimizations should be enabled on this system
268  */
269 int
270 lgrp_optimizations(void)
271 {
272 	/*
273 	 * System must have more than 2 lgroups to enable lgroup optimizations
274 	 *
275 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
276 	 * with one child lgroup containing all the resources. A 2 lgroup
277 	 * system with a root lgroup directly containing CPUs or memory might
278 	 * need lgroup optimizations with its child lgroup, but there
279 	 * isn't such a machine for now....
280 	 */
281 	if (nlgrps > 2)
282 		return (1);
283 
284 	return (0);
285 }
286 
287 /*
288  * Build full lgroup topology
289  */
290 static void
291 lgrp_root_init(void)
292 {
293 	lgrp_handle_t	hand;
294 	int		i;
295 	lgrp_id_t	id;
296 
297 	/*
298 	 * Create the "root" lgroup
299 	 */
300 	ASSERT(nlgrps == 0);
301 	id = nlgrps++;
302 
303 	lgrp_root = &lroot;
304 
305 	lgrp_root->lgrp_cpu = NULL;
306 	lgrp_root->lgrp_mnodes = 0;
307 	lgrp_root->lgrp_nmnodes = 0;
308 	hand = lgrp_plat_root_hand();
309 	lgrp_root->lgrp_plathand = hand;
310 
311 	lgrp_root->lgrp_id = id;
312 	lgrp_root->lgrp_cpucnt = 0;
313 	lgrp_root->lgrp_childcnt = 0;
314 	klgrpset_clear(lgrp_root->lgrp_children);
315 	klgrpset_clear(lgrp_root->lgrp_leaves);
316 	lgrp_root->lgrp_parent = NULL;
317 	lgrp_root->lgrp_chips = NULL;
318 	lgrp_root->lgrp_chipcnt = 0;
319 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
320 
321 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
322 		klgrpset_clear(lgrp_root->lgrp_set[i]);
323 
324 	lgrp_root->lgrp_kstat = NULL;
325 
326 	lgrp_table[id] = lgrp_root;
327 
328 	/*
329 	 * Setup initial lpl list for CPU0 and initial t0 home.
330 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
331 	 * all topology operations until cp_default is initialized at which
332 	 * point t0.t_lpl will be updated.
333 	 */
334 	lpl_bootstrap = lpl_bootstrap_list;
335 	t0.t_lpl = lpl_bootstrap;
336 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
337 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
338 	cp_default.cp_lgrploads = lpl_bootstrap;
339 }
340 
341 /*
342  * Initialize the lgroup framework and allow the platform to do the same
343  */
344 void
345 lgrp_init(void)
346 {
347 	/*
348 	 * Initialize the platform
349 	 */
350 	lgrp_plat_init();
351 
352 	/*
353 	 * Set max number of lgroups supported on this platform which must be
354 	 * less than the max number of lgroups supported by the common lgroup
355 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
356 	 */
357 	nlgrpsmax = lgrp_plat_max_lgrps();
358 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
359 }
360 
361 /*
362  * Create the root and cpu0's lgroup, and set t0's home.
363  */
364 void
365 lgrp_setup(void)
366 {
367 	/*
368 	 * Setup the root lgroup
369 	 */
370 	lgrp_root_init();
371 
372 	/*
373 	 * Add cpu0 to an lgroup
374 	 */
375 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
376 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
377 }
378 
/*
 * Lgroup initialization happens in two steps.  Step one, lgrp_main_init(),
 * runs right before start_other_cpus() in main.  Step two,
 * lgrp_main_mp_init(), runs right after start_other_cpus(), once all CPUs
 * are online and all distance information is available.
 *
 * lgrp_main_init() sets lgrp_initialized when it completes, and
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * Non-zero once lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * Non-zero once the lgrp topology has been constructed.
 */
int	lgrp_topo_initialized = 0;
398 
399 /*
400  * Init routine called after startup(), /etc/system has been processed,
401  * and cpu0 has been added to an lgroup.
402  */
403 void
404 lgrp_main_init(void)
405 {
406 	cpu_t		*cp = CPU;
407 	lgrp_id_t	lgrpid;
408 	int		i;
409 	/*
410 	 * Enforce a valid lgrp_mem_default_policy
411 	 */
412 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
413 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
414 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
415 
416 	/*
417 	 * See if mpo should be disabled.
418 	 * This may happen in the case of null proc LPA on Starcat.
419 	 * The platform won't be able to detect null proc LPA until after
420 	 * cpu0 and memory have already been added to lgroups.
421 	 * When and if it is detected, the Starcat platform will return
422 	 * a different platform handle for cpu0 which is what we check for
423 	 * here. If mpo should be disabled move cpu0 to it's rightful place
424 	 * (the root), and destroy the remaining lgroups. This effectively
425 	 * provides an UMA lgroup topology.
426 	 */
427 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
428 	if (lgrp_table[lgrpid]->lgrp_plathand !=
429 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
430 		lgrp_part_del_cpu(cp);
431 		lgrp_cpu_fini(cp, lgrpid);
432 
433 		lgrp_cpu_init(cp);
434 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
435 
436 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
437 
438 		/*
439 		 * Destroy all lgroups except for root
440 		 */
441 		for (i = 0; i <= lgrp_alloc_max; i++) {
442 			if (LGRP_EXISTS(lgrp_table[i]) &&
443 			    lgrp_table[i] != lgrp_root)
444 				lgrp_destroy(lgrp_table[i]);
445 		}
446 
447 		/*
448 		 * Fix up root to point at itself for leaves and resources
449 		 * and not have any children
450 		 */
451 		lgrp_root->lgrp_childcnt = 0;
452 		klgrpset_clear(lgrp_root->lgrp_children);
453 		klgrpset_clear(lgrp_root->lgrp_leaves);
454 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
455 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
456 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
457 	}
458 
459 	/*
460 	 * Initialize kstats framework.
461 	 */
462 	lgrp_kstat_init();
463 	/*
464 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
465 	 */
466 	mutex_enter(&cpu_lock);
467 	lgrp_kstat_create(cp);
468 	mutex_exit(&cpu_lock);
469 
470 	lgrp_plat_main_init();
471 	lgrp_initialized = 1;
472 }
473 
474 /*
475  * Finish lgrp initialization after all CPUS are brought on-line.
476  * This routine is called after start_other_cpus().
477  */
478 void
479 lgrp_main_mp_init(void)
480 {
481 	klgrpset_t changed;
482 
483 	/*
484 	 * Update lgroup topology (if necessary)
485 	 */
486 	klgrpset_clear(changed);
487 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
488 	lgrp_topo_initialized = 1;
489 }
490 
491 /*
492  * Change latency of lgroup with specified lgroup platform handle (if one is
493  * given) or change all lgroups with old latency to new latency
494  */
495 void
496 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
497     u_longlong_t newtime)
498 {
499 	lgrp_t		*lgrp;
500 	int		i;
501 
502 	for (i = 0; i <= lgrp_alloc_max; i++) {
503 		lgrp = lgrp_table[i];
504 
505 		if (!LGRP_EXISTS(lgrp))
506 			continue;
507 
508 		if ((hand == LGRP_NULL_HANDLE &&
509 		    lgrp->lgrp_latency == oldtime) ||
510 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
511 			lgrp->lgrp_latency = (int)newtime;
512 	}
513 }
514 
515 /*
516  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
517  */
518 void
519 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
520 {
521 	klgrpset_t	changed;
522 	cpu_t		*cp;
523 	lgrp_id_t	id;
524 	int		rc;
525 
526 	switch (event) {
527 	/*
528 	 * The following (re)configuration events are common code
529 	 * initiated. lgrp_plat_config() is called here to inform the
530 	 * platform of the reconfiguration event.
531 	 */
532 	case LGRP_CONFIG_CPU_ADD:
533 		cp = (cpu_t *)resource;
534 
535 		/*
536 		 * Initialize the new CPU's lgrp related next/prev
537 		 * links, and give it a bootstrap lpl so that it can
538 		 * survive should it need to enter the dispatcher.
539 		 */
540 		cp->cpu_next_lpl = cp;
541 		cp->cpu_prev_lpl = cp;
542 		cp->cpu_next_lgrp = cp;
543 		cp->cpu_prev_lgrp = cp;
544 		cp->cpu_lpl = lpl_bootstrap;
545 
546 		lgrp_plat_config(event, resource);
547 		atomic_add_32(&lgrp_gen, 1);
548 
549 		break;
550 	case LGRP_CONFIG_CPU_DEL:
551 		lgrp_plat_config(event, resource);
552 		atomic_add_32(&lgrp_gen, 1);
553 
554 		break;
555 	case LGRP_CONFIG_CPU_ONLINE:
556 		cp = (cpu_t *)resource;
557 		lgrp_cpu_init(cp);
558 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
559 		rc = lpl_topo_verify(cp->cpu_part);
560 		if (rc != LPL_TOPO_CORRECT) {
561 			panic("lpl_topo_verify failed: %d", rc);
562 		}
563 		lgrp_plat_config(event, resource);
564 		atomic_add_32(&lgrp_gen, 1);
565 
566 		break;
567 	case LGRP_CONFIG_CPU_OFFLINE:
568 		cp = (cpu_t *)resource;
569 		id = cp->cpu_lpl->lpl_lgrpid;
570 		lgrp_part_del_cpu(cp);
571 		lgrp_cpu_fini(cp, id);
572 		rc = lpl_topo_verify(cp->cpu_part);
573 		if (rc != LPL_TOPO_CORRECT) {
574 			panic("lpl_topo_verify failed: %d", rc);
575 		}
576 		lgrp_plat_config(event, resource);
577 		atomic_add_32(&lgrp_gen, 1);
578 
579 		break;
580 	case LGRP_CONFIG_CPUPART_ADD:
581 		cp = (cpu_t *)resource;
582 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
583 		rc = lpl_topo_verify(cp->cpu_part);
584 		if (rc != LPL_TOPO_CORRECT) {
585 			panic("lpl_topo_verify failed: %d", rc);
586 		}
587 		lgrp_plat_config(event, resource);
588 
589 		break;
590 	case LGRP_CONFIG_CPUPART_DEL:
591 		cp = (cpu_t *)resource;
592 		lgrp_part_del_cpu((cpu_t *)resource);
593 		rc = lpl_topo_verify(cp->cpu_part);
594 		if (rc != LPL_TOPO_CORRECT) {
595 			panic("lpl_topo_verify failed: %d", rc);
596 		}
597 		lgrp_plat_config(event, resource);
598 
599 		break;
600 	/*
601 	 * The following events are initiated by the memnode
602 	 * subsystem.
603 	 */
604 	case LGRP_CONFIG_MEM_ADD:
605 		lgrp_mem_init((int)resource, where, B_FALSE);
606 		atomic_add_32(&lgrp_gen, 1);
607 
608 		break;
609 	case LGRP_CONFIG_MEM_DEL:
610 		lgrp_mem_fini((int)resource, where, B_FALSE);
611 		atomic_add_32(&lgrp_gen, 1);
612 
613 		break;
614 	case LGRP_CONFIG_MEM_RENAME: {
615 		lgrp_config_mem_rename_t *ren_arg =
616 		    (lgrp_config_mem_rename_t *)where;
617 
618 		lgrp_mem_rename((int)resource,
619 		    ren_arg->lmem_rename_from,
620 		    ren_arg->lmem_rename_to);
621 		atomic_add_32(&lgrp_gen, 1);
622 
623 		break;
624 	}
625 	case LGRP_CONFIG_GEN_UPDATE:
626 		atomic_add_32(&lgrp_gen, 1);
627 
628 		break;
629 	case LGRP_CONFIG_FLATTEN:
630 		if (where == 0)
631 			lgrp_topo_levels = (int)resource;
632 		else
633 			(void) lgrp_topo_flatten(resource,
634 			    lgrp_table, lgrp_alloc_max, &changed);
635 
636 		break;
637 	/*
638 	 * Update any lgroups with old latency to new latency
639 	 */
640 	case LGRP_CONFIG_LAT_CHANGE_ALL:
641 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
642 		    (u_longlong_t)where);
643 
644 		break;
645 	/*
646 	 * Update lgroup with specified lgroup platform handle to have
647 	 * new latency
648 	 */
649 	case LGRP_CONFIG_LAT_CHANGE:
650 		lgrp_latency_change((lgrp_handle_t)resource, 0,
651 		    (u_longlong_t)where);
652 
653 		break;
654 	case LGRP_CONFIG_NOP:
655 
656 		break;
657 	default:
658 		break;
659 	}
660 
661 }
662 
663 /*
664  * Called to add lgrp info into cpu structure from cpu_add_unit;
665  * do not assume cpu is in cpu[] yet!
666  *
667  * CPUs are brought online with all other CPUs paused so we can't
668  * allocate memory or we could deadlock the system, so we rely on
669  * the platform to statically allocate as much space as we need
670  * for the lgrp structs and stats.
671  */
672 static void
673 lgrp_cpu_init(struct cpu *cp)
674 {
675 	klgrpset_t	changed;
676 	int		count;
677 	lgrp_handle_t	hand;
678 	int		first_cpu;
679 	lgrp_t		*my_lgrp;
680 	lgrp_id_t	lgrpid;
681 	struct cpu	*cptr;
682 	struct chip	*chp;
683 
684 	/*
685 	 * This is the first time through if the resource set
686 	 * for the root lgroup is empty. After cpu0 has been
687 	 * initially added to an lgroup, the root's CPU resource
688 	 * set can never be empty, since the system's last CPU
689 	 * cannot be offlined.
690 	 */
691 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
692 		/*
693 		 * First time through.
694 		 */
695 		first_cpu = 1;
696 	} else {
697 		/*
698 		 * If cpu0 needs to move lgroups, we may come
699 		 * through here again, at which time cpu_lock won't
700 		 * be held, and lgrp_initialized will be false.
701 		 */
702 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
703 		ASSERT(cp->cpu_part != NULL);
704 		first_cpu = 0;
705 	}
706 
707 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
708 	my_lgrp = lgrp_hand_to_lgrp(hand);
709 
710 	if (my_lgrp == NULL) {
711 		/*
712 		 * Create new lgrp and add it to lgroup topology
713 		 */
714 		my_lgrp = lgrp_create();
715 		my_lgrp->lgrp_plathand = hand;
716 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
717 		lgrpid = my_lgrp->lgrp_id;
718 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
719 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
720 
721 		count = 0;
722 		klgrpset_clear(changed);
723 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
724 		    &changed);
725 		/*
726 		 * May have added new intermediate lgroups, so need to add
727 		 * resources other than CPUs which are added below
728 		 */
729 		(void) lgrp_mnode_update(changed, NULL);
730 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
731 	    > 0) {
732 		/*
733 		 * Leaf lgroup was created, but latency wasn't available
734 		 * then.  So, set latency for it and fill in rest of lgroup
735 		 * topology  now that we know how far it is from other leaf
736 		 * lgroups.
737 		 */
738 		lgrpid = my_lgrp->lgrp_id;
739 		klgrpset_clear(changed);
740 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
741 		    lgrpid))
742 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
743 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
744 		    &changed);
745 
746 		/*
747 		 * May have added new intermediate lgroups, so need to add
748 		 * resources other than CPUs which are added below
749 		 */
750 		(void) lgrp_mnode_update(changed, NULL);
751 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
752 	    my_lgrp->lgrp_id)) {
753 		int	i;
754 
755 		/*
756 		 * Update existing lgroup and lgroups containing it with CPU
757 		 * resource
758 		 */
759 		lgrpid = my_lgrp->lgrp_id;
760 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
761 		for (i = 0; i <= lgrp_alloc_max; i++) {
762 			lgrp_t		*lgrp;
763 
764 			lgrp = lgrp_table[i];
765 			if (!LGRP_EXISTS(lgrp) ||
766 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
767 				continue;
768 
769 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
770 		}
771 	}
772 
773 	lgrpid = my_lgrp->lgrp_id;
774 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
775 
776 	/*
777 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
778 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
779 	 * not since none of lgroup IDs in the lpl's have been set yet.
780 	 */
781 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
782 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
783 
784 	/*
785 	 * link the CPU into the lgrp's CPU list
786 	 */
787 	if (my_lgrp->lgrp_cpucnt == 0) {
788 		my_lgrp->lgrp_cpu = cp;
789 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
790 	} else {
791 		cptr = my_lgrp->lgrp_cpu;
792 		cp->cpu_next_lgrp = cptr;
793 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
794 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
795 		cptr->cpu_prev_lgrp = cp;
796 	}
797 	my_lgrp->lgrp_cpucnt++;
798 
799 	/*
800 	 * Add this cpu's chip to the per lgroup list
801 	 * if necessary
802 	 */
803 	if (cp->cpu_chip->chip_lgrp == NULL) {
804 		struct chip *lcpr;
805 
806 		chp = cp->cpu_chip;
807 
808 		if (my_lgrp->lgrp_chipcnt == 0) {
809 			my_lgrp->lgrp_chips = chp;
810 			chp->chip_next_lgrp =
811 			    chp->chip_prev_lgrp = chp;
812 		} else {
813 			lcpr = my_lgrp->lgrp_chips;
814 			chp->chip_next_lgrp = lcpr;
815 			chp->chip_prev_lgrp =
816 			    lcpr->chip_prev_lgrp;
817 			lcpr->chip_prev_lgrp->chip_next_lgrp =
818 			    chp;
819 			lcpr->chip_prev_lgrp = chp;
820 		}
821 		chp->chip_lgrp = my_lgrp;
822 		chp->chip_balance = chp->chip_next_lgrp;
823 		my_lgrp->lgrp_chipcnt++;
824 	}
825 }
826 
827 lgrp_t *
828 lgrp_create(void)
829 {
830 	lgrp_t		*my_lgrp;
831 	lgrp_id_t	lgrpid;
832 	int		i;
833 
834 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
835 
836 	/*
837 	 * Find an open slot in the lgroup table and recycle unused lgroup
838 	 * left there if any
839 	 */
840 	my_lgrp = NULL;
841 	if (lgrp_alloc_hint == -1)
842 		/*
843 		 * Allocate from end when hint not set yet because no lgroups
844 		 * have been deleted yet
845 		 */
846 		lgrpid = nlgrps++;
847 	else {
848 		/*
849 		 * Start looking for next open slot from hint and leave hint
850 		 * at slot allocated
851 		 */
852 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
853 			my_lgrp = lgrp_table[i];
854 			if (!LGRP_EXISTS(my_lgrp)) {
855 				lgrpid = i;
856 				nlgrps++;
857 				break;
858 			}
859 		}
860 		lgrp_alloc_hint = lgrpid;
861 	}
862 
863 	/*
864 	 * Keep track of max lgroup ID allocated so far to cut down on searches
865 	 */
866 	if (lgrpid > lgrp_alloc_max)
867 		lgrp_alloc_max = lgrpid;
868 
869 	/*
870 	 * Need to allocate new lgroup if next open slot didn't have one
871 	 * for recycling
872 	 */
873 	if (my_lgrp == NULL)
874 		my_lgrp = lgrp_plat_alloc(lgrpid);
875 
876 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
877 		panic("Too many lgrps for platform (%d)", nlgrps);
878 
879 	my_lgrp->lgrp_id = lgrpid;
880 	my_lgrp->lgrp_latency = 0;
881 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
882 	my_lgrp->lgrp_parent = NULL;
883 	my_lgrp->lgrp_childcnt = 0;
884 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
885 	my_lgrp->lgrp_nmnodes = 0;
886 	klgrpset_clear(my_lgrp->lgrp_children);
887 	klgrpset_clear(my_lgrp->lgrp_leaves);
888 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
889 		klgrpset_clear(my_lgrp->lgrp_set[i]);
890 
891 	my_lgrp->lgrp_cpu = NULL;
892 	my_lgrp->lgrp_cpucnt = 0;
893 	my_lgrp->lgrp_chips = NULL;
894 	my_lgrp->lgrp_chipcnt = 0;
895 
896 	if (my_lgrp->lgrp_kstat != NULL)
897 		lgrp_kstat_reset(lgrpid);
898 
899 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
900 
901 	return (my_lgrp);
902 }
903 
904 void
905 lgrp_destroy(lgrp_t *lgrp)
906 {
907 	int		i;
908 
909 	/*
910 	 * Unless this lgroup is being destroyed on behalf of
911 	 * the boot CPU, cpu_lock must be held
912 	 */
913 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
914 
915 	if (nlgrps == 1)
916 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
917 
918 	if (!LGRP_EXISTS(lgrp))
919 		return;
920 
921 	/*
922 	 * Set hint to lgroup being deleted and try to keep lower numbered
923 	 * hints to facilitate finding empty slots
924 	 */
925 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
926 		lgrp_alloc_hint = lgrp->lgrp_id;
927 
928 	/*
929 	 * Mark this lgroup to be recycled by setting its lgroup ID to
930 	 * LGRP_NONE and clear relevant fields
931 	 */
932 	lgrp->lgrp_id = LGRP_NONE;
933 	lgrp->lgrp_latency = 0;
934 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
935 	lgrp->lgrp_parent = NULL;
936 	lgrp->lgrp_childcnt = 0;
937 
938 	klgrpset_clear(lgrp->lgrp_children);
939 	klgrpset_clear(lgrp->lgrp_leaves);
940 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
941 		klgrpset_clear(lgrp->lgrp_set[i]);
942 
943 	lgrp->lgrp_mnodes = (mnodeset_t)0;
944 	lgrp->lgrp_nmnodes = 0;
945 
946 	lgrp->lgrp_cpu = NULL;
947 	lgrp->lgrp_cpucnt = 0;
948 	lgrp->lgrp_chipcnt = 0;
949 	lgrp->lgrp_chips = NULL;
950 
951 	nlgrps--;
952 }
953 
954 /*
955  * Initialize kstat data. Called from lgrp intialization code.
956  */
957 static void
958 lgrp_kstat_init(void)
959 {
960 	lgrp_stat_t	stat;
961 
962 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
963 
964 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
965 		kstat_named_init(&lgrp_kstat_data[stat],
966 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
967 }
968 
969 /*
970  * initialize an lgrp's kstats if needed
971  * called with cpu_lock held but not with cpus paused.
972  * we don't tear these down now because we don't know about
973  * memory leaving the lgrp yet...
974  */
975 
976 void
977 lgrp_kstat_create(cpu_t *cp)
978 {
979 	kstat_t		*lgrp_kstat;
980 	lgrp_id_t	lgrpid;
981 	lgrp_t		*my_lgrp;
982 
983 	ASSERT(MUTEX_HELD(&cpu_lock));
984 
985 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
986 	my_lgrp = lgrp_table[lgrpid];
987 
988 	if (my_lgrp->lgrp_kstat != NULL)
989 		return; /* already initialized */
990 
991 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
992 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
993 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
994 
995 	if (lgrp_kstat != NULL) {
996 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
997 		lgrp_kstat->ks_private = my_lgrp;
998 		lgrp_kstat->ks_data = &lgrp_kstat_data;
999 		lgrp_kstat->ks_update = lgrp_kstat_extract;
1000 		my_lgrp->lgrp_kstat = lgrp_kstat;
1001 		kstat_install(lgrp_kstat);
1002 	}
1003 }
1004 
1005 /*
1006  * this will do something when we manage to remove now unused lgrps
1007  */
1008 
1009 /* ARGSUSED */
1010 void
1011 lgrp_kstat_destroy(cpu_t *cp)
1012 {
1013 	ASSERT(MUTEX_HELD(&cpu_lock));
1014 }
1015 
1016 /*
1017  * Called when a CPU is off-lined.
1018  */
1019 static void
1020 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1021 {
1022 	lgrp_t *my_lgrp;
1023 	struct cpu *prev;
1024 	struct cpu *next;
1025 	chip_t  *chp;
1026 
1027 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1028 
1029 	prev = cp->cpu_prev_lgrp;
1030 	next = cp->cpu_next_lgrp;
1031 
1032 	prev->cpu_next_lgrp = next;
1033 	next->cpu_prev_lgrp = prev;
1034 
1035 	/*
1036 	 * just because I'm paranoid doesn't mean...
1037 	 */
1038 
1039 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1040 
1041 	my_lgrp = lgrp_table[lgrpid];
1042 	my_lgrp->lgrp_cpucnt--;
1043 
1044 	/*
1045 	 * If the last CPU on it's chip is being offlined
1046 	 * then remove this chip from the per lgroup list.
1047 	 *
1048 	 * This is also done for the boot CPU when it needs
1049 	 * to move between lgroups as a consequence of
1050 	 * null proc lpa.
1051 	 */
1052 	chp = cp->cpu_chip;
1053 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
1054 
1055 		chip_t	*chpp;
1056 
1057 		if (--my_lgrp->lgrp_chipcnt == 0)
1058 			my_lgrp->lgrp_chips = NULL;
1059 		else if (my_lgrp->lgrp_chips == chp)
1060 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
1061 
1062 		/*
1063 		 * Walk this lgroup's chip list looking for chips that
1064 		 * may try to balance against the one that's leaving
1065 		 */
1066 		for (chpp = chp->chip_next_lgrp; chpp != chp;
1067 		    chpp = chpp->chip_next_lgrp) {
1068 			if (chpp->chip_balance == chp)
1069 				chpp->chip_balance = chp->chip_next_lgrp;
1070 		}
1071 
1072 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1073 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1074 
1075 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1076 		chp->chip_lgrp = NULL;
1077 		chp->chip_balance = NULL;
1078 	}
1079 
1080 	/*
1081 	 * Removing last CPU in lgroup, so update lgroup topology
1082 	 */
1083 	if (my_lgrp->lgrp_cpucnt == 0) {
1084 		klgrpset_t	changed;
1085 		int		count;
1086 		int		i;
1087 
1088 		my_lgrp->lgrp_cpu = NULL;
1089 
1090 		/*
1091 		 * Remove this lgroup from its lgroup CPU resources and remove
1092 		 * lgroup from lgroup topology if it doesn't have any more
1093 		 * resources in it now
1094 		 */
1095 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1096 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1097 			count = 0;
1098 			klgrpset_clear(changed);
1099 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1100 			    lgrp_alloc_max + 1, &changed);
1101 			return;
1102 		}
1103 
1104 		/*
1105 		 * This lgroup isn't empty, so just remove it from CPU
1106 		 * resources of any lgroups that contain it as such
1107 		 */
1108 		for (i = 0; i <= lgrp_alloc_max; i++) {
1109 			lgrp_t		*lgrp;
1110 
1111 			lgrp = lgrp_table[i];
1112 			if (!LGRP_EXISTS(lgrp) ||
1113 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1114 			    lgrpid))
1115 				continue;
1116 
1117 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1118 		}
1119 		return;
1120 	}
1121 
1122 	if (my_lgrp->lgrp_cpu == cp)
1123 		my_lgrp->lgrp_cpu = next;
1124 
1125 }
1126 
1127 /*
1128  * Update memory nodes in target lgroups and return ones that get changed
1129  */
1130 int
1131 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1132 {
1133 	int	count;
1134 	int	i;
1135 	int	j;
1136 	lgrp_t	*lgrp;
1137 	lgrp_t	*lgrp_rsrc;
1138 
1139 	count = 0;
1140 	if (changed)
1141 		klgrpset_clear(*changed);
1142 
1143 	if (klgrpset_isempty(target))
1144 		return (0);
1145 
1146 	/*
1147 	 * Find each lgroup in target lgroups
1148 	 */
1149 	for (i = 0; i <= lgrp_alloc_max; i++) {
1150 		/*
1151 		 * Skip any lgroups that don't exist or aren't in target group
1152 		 */
1153 		lgrp = lgrp_table[i];
1154 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1155 			continue;
1156 		}
1157 
1158 		/*
1159 		 * Initialize memnodes for intermediate lgroups to 0
1160 		 * and update them from scratch since they may have completely
1161 		 * changed
1162 		 */
1163 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1164 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1165 			lgrp->lgrp_nmnodes = 0;
1166 		}
1167 
1168 		/*
1169 		 * Update memory nodes of of target lgroup with memory nodes
1170 		 * from each lgroup in its lgroup memory resource set
1171 		 */
1172 		for (j = 0; j <= lgrp_alloc_max; j++) {
1173 			int	k;
1174 
1175 			/*
1176 			 * Skip any lgroups that don't exist or aren't in
1177 			 * memory resources of target lgroup
1178 			 */
1179 			lgrp_rsrc = lgrp_table[j];
1180 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1181 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1182 			    j))
1183 				continue;
1184 
1185 			/*
1186 			 * Update target lgroup's memnodes to include memnodes
1187 			 * of this lgroup
1188 			 */
1189 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1190 				mnodeset_t	mnode_mask;
1191 
1192 				mnode_mask = (mnodeset_t)1 << k;
1193 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1194 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1195 					lgrp->lgrp_mnodes |= mnode_mask;
1196 					lgrp->lgrp_nmnodes++;
1197 				}
1198 			}
1199 			count++;
1200 			if (changed)
1201 				klgrpset_add(*changed, lgrp->lgrp_id);
1202 		}
1203 	}
1204 
1205 	return (count);
1206 }
1207 
1208 /*
1209  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1210  * is moved from one board to another. The "from" and "to" arguments specify the
1211  * source and the destination of the move.
1212  *
1213  * See plat_lgrp_config() for a detailed description of the copy-rename
1214  * semantics.
1215  *
1216  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1217  * the lgroup topology which is changing as memory moves from one lgroup to
1218  * another. It removes the mnode from the source lgroup and re-inserts it in the
1219  * target lgroup.
1220  *
1221  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1222  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1223  * copy-rename operation.
1224  *
1225  * There is one case which requires special handling. If the system contains
1226  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1227  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1228  * lgrp_mem_init), but there is a window when the system has no memory in the
1229  * lgroup hierarchy. If another thread tries to allocate memory during this
1230  * window, the allocation will fail, although the system has physical memory.
1231  * This may cause a system panic or a deadlock (some sleeping memory allocations
1232  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1233  * the mnode back).
1234  *
1235  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1236  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1237  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1238  * but it updates the rest of the lgroup topology as if the mnode was actually
1239  * removed. The lgrp_mem_init() function recognizes that the mnode being
1240  * inserted represents such a special case and updates the topology
1241  * appropriately.
1242  */
1243 void
1244 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1245 {
1246 	/*
1247 	 * Remove the memory from the source node and add it to the destination
1248 	 * node.
1249 	 */
1250 	lgrp_mem_fini(mnode, from, B_TRUE);
1251 	lgrp_mem_init(mnode, to, B_TRUE);
1252 }
1253 
1254 /*
1255  * Called to indicate that the lgrp with platform handle "hand" now
1256  * contains the memory identified by "mnode".
1257  *
1258  * LOCKING for this routine is a bit tricky. Usually it is called without
1259  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1260  * callers. During DR of the board containing the caged memory it may be called
1261  * with cpu_lock already held and CPUs paused.
1262  *
1263  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1264  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1265  * dealing with the special case of DR copy-rename described in
1266  * lgrp_mem_rename().
1267  */
1268 void
1269 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1270 {
1271 	klgrpset_t	changed;
1272 	int		count;
1273 	int		i;
1274 	lgrp_t		*my_lgrp;
1275 	lgrp_id_t	lgrpid;
1276 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1277 	boolean_t	drop_lock = B_FALSE;
1278 	boolean_t	need_synch = B_FALSE;
1279 
1280 	/*
1281 	 * Grab CPU lock (if we haven't already)
1282 	 */
1283 	if (!MUTEX_HELD(&cpu_lock)) {
1284 		mutex_enter(&cpu_lock);
1285 		drop_lock = B_TRUE;
1286 	}
1287 
1288 	/*
1289 	 * This routine may be called from a context where we already
1290 	 * hold cpu_lock, and have already paused cpus.
1291 	 */
1292 	if (!cpus_paused())
1293 		need_synch = B_TRUE;
1294 
1295 	/*
1296 	 * Check if this mnode is already configured and return immediately if
1297 	 * it is.
1298 	 *
1299 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1300 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1301 	 * recognize this case and continue as usual, but skip the update to
1302 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1303 	 * in topology, temporarily introduced by lgrp_mem_fini().
1304 	 */
1305 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1306 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1307 		if (drop_lock)
1308 			mutex_exit(&cpu_lock);
1309 		return;
1310 	}
1311 
1312 	/*
1313 	 * Update lgroup topology with new memory resources, keeping track of
1314 	 * which lgroups change
1315 	 */
1316 	count = 0;
1317 	klgrpset_clear(changed);
1318 	my_lgrp = lgrp_hand_to_lgrp(hand);
1319 	if (my_lgrp == NULL) {
1320 		/* new lgrp */
1321 		my_lgrp = lgrp_create();
1322 		lgrpid = my_lgrp->lgrp_id;
1323 		my_lgrp->lgrp_plathand = hand;
1324 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1325 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1326 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1327 
1328 		if (need_synch)
1329 			pause_cpus(NULL);
1330 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1331 		    &changed);
1332 		if (need_synch)
1333 			start_cpus();
1334 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1335 	    > 0) {
1336 		/*
1337 		 * Leaf lgroup was created, but latency wasn't available
1338 		 * then.  So, set latency for it and fill in rest of lgroup
1339 		 * topology  now that we know how far it is from other leaf
1340 		 * lgroups.
1341 		 */
1342 		klgrpset_clear(changed);
1343 		lgrpid = my_lgrp->lgrp_id;
1344 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1345 		    lgrpid))
1346 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1347 		if (need_synch)
1348 			pause_cpus(NULL);
1349 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1350 		    &changed);
1351 		if (need_synch)
1352 			start_cpus();
1353 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1354 	    my_lgrp->lgrp_id)) {
1355 		/*
1356 		 * Add new lgroup memory resource to existing lgroup
1357 		 */
1358 		lgrpid = my_lgrp->lgrp_id;
1359 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1360 		klgrpset_add(changed, lgrpid);
1361 		count++;
1362 		for (i = 0; i <= lgrp_alloc_max; i++) {
1363 			lgrp_t		*lgrp;
1364 
1365 			lgrp = lgrp_table[i];
1366 			if (!LGRP_EXISTS(lgrp) ||
1367 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1368 				continue;
1369 
1370 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1371 			klgrpset_add(changed, lgrp->lgrp_id);
1372 			count++;
1373 		}
1374 	}
1375 
1376 	/*
1377 	 * Add memory node to lgroup and remove lgroup from ones that need
1378 	 * to be updated
1379 	 */
1380 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1381 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1382 		my_lgrp->lgrp_nmnodes++;
1383 	}
1384 	klgrpset_del(changed, lgrpid);
1385 
1386 	/*
1387 	 * Update memory node information for all lgroups that changed and
1388 	 * contain new memory node as a resource
1389 	 */
1390 	if (count)
1391 		(void) lgrp_mnode_update(changed, NULL);
1392 
1393 	if (drop_lock)
1394 		mutex_exit(&cpu_lock);
1395 }
1396 
1397 /*
1398  * Called to indicate that the lgroup associated with the platform
1399  * handle "hand" no longer contains given memory node
1400  *
1401  * LOCKING for this routine is a bit tricky. Usually it is called without
1402  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1403  * callers. During DR of the board containing the caged memory it may be called
1404  * with cpu_lock already held and CPUs paused.
1405  *
1406  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1407  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1408  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1409  * the same mnode back into the topology. See lgrp_mem_rename() and
1410  * lgrp_mem_init() for additional details.
1411  */
1412 void
1413 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1414 {
1415 	klgrpset_t	changed;
1416 	int		count;
1417 	int		i;
1418 	lgrp_t		*my_lgrp;
1419 	lgrp_id_t	lgrpid;
1420 	mnodeset_t	mnodes_mask;
1421 	boolean_t	drop_lock = B_FALSE;
1422 	boolean_t	need_synch = B_FALSE;
1423 
1424 	/*
1425 	 * Grab CPU lock (if we haven't already)
1426 	 */
1427 	if (!MUTEX_HELD(&cpu_lock)) {
1428 		mutex_enter(&cpu_lock);
1429 		drop_lock = B_TRUE;
1430 	}
1431 
1432 	/*
1433 	 * This routine may be called from a context where we already
1434 	 * hold cpu_lock and have already paused cpus.
1435 	 */
1436 	if (!cpus_paused())
1437 		need_synch = B_TRUE;
1438 
1439 	my_lgrp = lgrp_hand_to_lgrp(hand);
1440 
1441 	/*
1442 	 * The lgrp *must* be pre-existing
1443 	 */
1444 	ASSERT(my_lgrp != NULL);
1445 
1446 	/*
1447 	 * Delete memory node from lgroups which contain it
1448 	 */
1449 	mnodes_mask = ((mnodeset_t)1 << mnode);
1450 	for (i = 0; i <= lgrp_alloc_max; i++) {
1451 		lgrp_t *lgrp = lgrp_table[i];
1452 		/*
1453 		 * Skip any non-existent lgroups and any lgroups that don't
1454 		 * contain leaf lgroup of memory as a memory resource
1455 		 */
1456 		if (!LGRP_EXISTS(lgrp) ||
1457 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1458 			continue;
1459 
1460 		/*
1461 		 * Avoid removing the last mnode from the root in the DR
1462 		 * copy-rename case. See lgrp_mem_rename() for details.
1463 		 */
1464 		if (is_copy_rename &&
1465 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1466 			continue;
1467 
1468 		/*
1469 		 * Remove memory node from lgroup.
1470 		 */
1471 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1472 		lgrp->lgrp_nmnodes--;
1473 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1474 	}
1475 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1476 
1477 	/*
1478 	 * Don't need to update lgroup topology if this lgroup still has memory.
1479 	 *
1480 	 * In the special case of DR copy-rename with the only mnode being
1481 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1482 	 * still need to update the lgroup topology.
1483 	 */
1484 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1485 	    !(is_copy_rename &&
1486 		(my_lgrp == lgrp_root) &&
1487 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1488 		if (drop_lock)
1489 			mutex_exit(&cpu_lock);
1490 		return;
1491 	}
1492 
1493 	/*
1494 	 * This lgroup does not contain any memory now
1495 	 */
1496 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1497 
1498 	/*
1499 	 * Remove this lgroup from lgroup topology if it does not contain any
1500 	 * resources now
1501 	 */
1502 	lgrpid = my_lgrp->lgrp_id;
1503 	count = 0;
1504 	klgrpset_clear(changed);
1505 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1506 		/*
1507 		 * Delete lgroup when no more resources
1508 		 */
1509 		if (need_synch)
1510 			pause_cpus(NULL);
1511 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1512 		    lgrp_alloc_max + 1, &changed);
1513 		ASSERT(count > 0);
1514 		if (need_synch)
1515 			start_cpus();
1516 	} else {
1517 		/*
1518 		 * Remove lgroup from memory resources of any lgroups that
1519 		 * contain it as such
1520 		 */
1521 		for (i = 0; i <= lgrp_alloc_max; i++) {
1522 			lgrp_t		*lgrp;
1523 
1524 			lgrp = lgrp_table[i];
1525 			if (!LGRP_EXISTS(lgrp) ||
1526 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1527 			    lgrpid))
1528 				continue;
1529 
1530 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1531 		}
1532 	}
1533 	if (drop_lock)
1534 		mutex_exit(&cpu_lock);
1535 }
1536 
1537 /*
1538  * Return lgroup with given platform handle
1539  */
1540 lgrp_t *
1541 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1542 {
1543 	int	i;
1544 	lgrp_t	*lgrp;
1545 
1546 	if (hand == LGRP_NULL_HANDLE)
1547 		return (NULL);
1548 
1549 	for (i = 0; i <= lgrp_alloc_max; i++) {
1550 		lgrp = lgrp_table[i];
1551 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1552 			return (lgrp);
1553 	}
1554 	return (NULL);
1555 }
1556 
1557 /*
1558  * Return the home lgroup of the current thread.
1559  * We must do this with kernel preemption disabled, since we don't want our
1560  * thread to be re-homed while we're poking around with its lpl, and the lpl
1561  * should never be NULL.
1562  *
1563  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1564  * is enabled because of DR.  Callers can use disable kernel preemption
1565  * around this call to guarantee that the lgroup will be valid beyond this
1566  * routine, since kernel preemption can be recursive.
1567  */
1568 lgrp_t *
1569 lgrp_home_lgrp(void)
1570 {
1571 	lgrp_t	*lgrp;
1572 	lpl_t	*lpl;
1573 
1574 	kpreempt_disable();
1575 
1576 	lpl = curthread->t_lpl;
1577 	ASSERT(lpl != NULL);
1578 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1579 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1580 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1581 
1582 	kpreempt_enable();
1583 
1584 	return (lgrp);
1585 }
1586 
1587 /*
1588  * Return ID of home lgroup for given thread
1589  * (See comments for lgrp_home_lgrp() for special care and handling
1590  * instructions)
1591  */
1592 lgrp_id_t
1593 lgrp_home_id(kthread_t *t)
1594 {
1595 	lgrp_id_t	lgrp;
1596 	lpl_t		*lpl;
1597 
1598 	ASSERT(t != NULL);
1599 	/*
1600 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1601 	 * cannot since the HAT layer can call into this routine to
1602 	 * determine the locality for its data structures in the context
1603 	 * of a page fault.
1604 	 */
1605 
1606 	kpreempt_disable();
1607 
1608 	lpl = t->t_lpl;
1609 	ASSERT(lpl != NULL);
1610 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1611 	lgrp = lpl->lpl_lgrpid;
1612 
1613 	kpreempt_enable();
1614 
1615 	return (lgrp);
1616 }
1617 
1618 /*
1619  * Return lgroup containing the physical memory for the given page frame number
1620  */
1621 lgrp_t *
1622 lgrp_pfn_to_lgrp(pfn_t pfn)
1623 {
1624 	lgrp_handle_t	hand;
1625 	int		i;
1626 	lgrp_t		*lgrp;
1627 
1628 	hand = lgrp_plat_pfn_to_hand(pfn);
1629 	if (hand != LGRP_NULL_HANDLE)
1630 		for (i = 0; i <= lgrp_alloc_max; i++) {
1631 			lgrp = lgrp_table[i];
1632 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1633 				return (lgrp);
1634 		}
1635 	return (NULL);
1636 }
1637 
1638 /*
1639  * Return lgroup containing the physical memory for the given page frame number
1640  */
1641 lgrp_t *
1642 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1643 {
1644 	lgrp_handle_t	hand;
1645 	int		i;
1646 	lgrp_t		*lgrp;
1647 	pfn_t		pfn;
1648 
1649 	pfn = btop(physaddr);
1650 	hand = lgrp_plat_pfn_to_hand(pfn);
1651 	if (hand != LGRP_NULL_HANDLE)
1652 		for (i = 0; i <= lgrp_alloc_max; i++) {
1653 			lgrp = lgrp_table[i];
1654 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1655 				return (lgrp);
1656 		}
1657 	return (NULL);
1658 }
1659 
1660 /*
1661  * Return the leaf lgroup containing the given CPU
1662  *
1663  * The caller needs to take precautions necessary to prevent
1664  * "cpu" from going away across a call to this function.
1665  * hint: kpreempt_disable()/kpreempt_enable()
1666  */
1667 static lgrp_t *
1668 lgrp_cpu_to_lgrp(cpu_t *cpu)
1669 {
1670 	return (cpu->cpu_lpl->lpl_lgrp);
1671 }
1672 
1673 /*
1674  * Return the sum of the partition loads in an lgrp divided by
1675  * the number of CPUs in the lgrp.  This is our best approximation
1676  * of an 'lgroup load average' for a useful per-lgroup kstat.
1677  */
1678 static uint64_t
1679 lgrp_sum_loadavgs(lgrp_t *lgrp)
1680 {
1681 	cpu_t *cpu;
1682 	int ncpu;
1683 	uint64_t loads = 0;
1684 
1685 	mutex_enter(&cpu_lock);
1686 
1687 	cpu = lgrp->lgrp_cpu;
1688 	ncpu = lgrp->lgrp_cpucnt;
1689 
1690 	if (cpu == NULL || ncpu == 0) {
1691 		mutex_exit(&cpu_lock);
1692 		return (0ull);
1693 	}
1694 
1695 	do {
1696 		loads += cpu->cpu_lpl->lpl_loadavg;
1697 		cpu = cpu->cpu_next_lgrp;
1698 	} while (cpu != lgrp->lgrp_cpu);
1699 
1700 	mutex_exit(&cpu_lock);
1701 
1702 	return (loads / ncpu);
1703 }
1704 
1705 void
1706 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1707 {
1708 	struct lgrp_stats *pstats;
1709 
1710 	/*
1711 	 * Verify that the caller isn't trying to add to
1712 	 * a statistic for an lgroup that has gone away
1713 	 */
1714 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1715 		return;
1716 
1717 	pstats = &lgrp_stats[lgrpid];
1718 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1719 }
1720 
1721 int64_t
1722 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1723 {
1724 	uint64_t val;
1725 	struct lgrp_stats *pstats;
1726 
1727 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1728 		return ((int64_t)0);
1729 
1730 	pstats = &lgrp_stats[lgrpid];
1731 	LGRP_STAT_READ(pstats, stat, val);
1732 	return (val);
1733 }
1734 
1735 /*
1736  * Reset all kstats for lgrp specified by its lgrpid.
1737  */
1738 static void
1739 lgrp_kstat_reset(lgrp_id_t lgrpid)
1740 {
1741 	lgrp_stat_t stat;
1742 
1743 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1744 		return;
1745 
1746 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1747 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1748 	}
1749 }
1750 
1751 /*
1752  * Collect all per-lgrp statistics for the lgrp associated with this
1753  * kstat, and store them in the ks_data array.
1754  *
1755  * The superuser can reset all the running counter statistics for an
1756  * lgrp by writing to any of the lgrp's stats.
1757  */
1758 static int
1759 lgrp_kstat_extract(kstat_t *ksp, int rw)
1760 {
1761 	lgrp_stat_t		stat;
1762 	struct kstat_named	*ksd;
1763 	lgrp_t			*lgrp;
1764 	lgrp_id_t		lgrpid;
1765 
1766 	lgrp = (lgrp_t *)ksp->ks_private;
1767 
1768 	ksd = (struct kstat_named *)ksp->ks_data;
1769 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1770 
1771 	lgrpid = lgrp->lgrp_id;
1772 
1773 	if (lgrpid == LGRP_NONE) {
1774 		/*
1775 		 * Return all zeroes as stats for freed lgrp.
1776 		 */
1777 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1778 			ksd[stat].value.i64 = 0;
1779 		}
1780 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1781 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1782 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1783 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1784 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1785 	} else if (rw != KSTAT_WRITE) {
1786 		/*
1787 		 * Handle counter stats
1788 		 */
1789 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1790 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1791 		}
1792 
1793 		/*
1794 		 * Handle kernel data snapshot stats
1795 		 */
1796 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1797 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1798 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1799 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1800 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1801 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1802 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1803 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1804 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1805 		    lgrp_loadavg_max_effect;
1806 	} else {
1807 		lgrp_kstat_reset(lgrpid);
1808 	}
1809 
1810 	return (0);
1811 }
1812 
1813 int
1814 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1815 {
1816 	cpu_t	*cp;
1817 
1818 	mutex_enter(&cpu_lock);
1819 
1820 	if ((cp = cpu_get(id)) == NULL) {
1821 		mutex_exit(&cpu_lock);
1822 		return (EINVAL);
1823 	}
1824 
1825 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1826 		mutex_exit(&cpu_lock);
1827 		return (EINVAL);
1828 	}
1829 
1830 	ASSERT(cp->cpu_lpl != NULL);
1831 
1832 	*lp = cp->cpu_lpl->lpl_lgrpid;
1833 
1834 	mutex_exit(&cpu_lock);
1835 
1836 	return (0);
1837 }
1838 
1839 int
1840 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1841 {
1842 	cpu_t *cp;
1843 
1844 	mutex_enter(&cpu_lock);
1845 
1846 	if ((cp = cpu_get(id)) == NULL) {
1847 		mutex_exit(&cpu_lock);
1848 		return (EINVAL);
1849 	}
1850 
1851 	ASSERT(cp->cpu_lpl != NULL);
1852 
1853 	*lp = cp->cpu_lpl->lpl_loadavg;
1854 
1855 	mutex_exit(&cpu_lock);
1856 
1857 	return (0);
1858 }
1859 
1860 /*
1861  * Add a resource named by lpl_leaf to rset of lpl_target
1862  *
1863  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1864  * resource. It is adjusted here, as this is presently the only place that we
1865  * can be certain a resource addition has succeeded.
1866  *
1867  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1868  * list in order until it reaches a NULL.  (This list is required to be NULL
1869  * terminated, too).  This is done so that we can mark start pos + 1, so that
1870  * each lpl is traversed sequentially, but in a different order.  We hope this
1871  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1872  */
1873 
1874 void
1875 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1876 {
1877 	int		i;
1878 	int		entry_slot = 0;
1879 
1880 	/* return if leaf is already present */
1881 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1882 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1883 			return;
1884 		}
1885 
1886 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1887 		    lpl_leaf->lpl_lgrpid) {
1888 			break;
1889 		}
1890 	}
1891 
1892 	/* insert leaf, update counts */
1893 	entry_slot = i;
1894 	i = lpl_target->lpl_nrset++;
1895 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1896 		panic("More leaf lgrps in system than are supported!\n");
1897 	}
1898 
1899 	/*
1900 	 * Start at the end of the rset array and work backwards towards the
1901 	 * slot into which the new lpl will be inserted. This effectively
1902 	 * preserves the current ordering by scooting everybody over one entry,
1903 	 * and placing the new entry into the space created.
1904 	 */
1905 
1906 	while (i-- > entry_slot) {
1907 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1908 	}
1909 
1910 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1911 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1912 }
1913 
1914 /*
1915  * Update each of lpl_parent's children with a proper hint and
1916  * a reference to their parent.
1917  * The lgrp topology is used as the reference since it is fully
1918  * consistent and correct at this point.
1919  *
1920  * Each child's hint will reference an element in lpl_parent's
1921  * rset that designates where the child should start searching
1922  * for CPU resources. The hint selected is the highest order leaf present
1923  * in the child's lineage.
1924  *
1925  * This should be called after any potential change in lpl_parent's
1926  * rset.
1927  */
1928 static void
1929 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1930 {
1931 	klgrpset_t	children, leaves;
1932 	lpl_t		*lpl;
1933 	int		hint;
1934 	int		i, j;
1935 
1936 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1937 	if (klgrpset_isempty(children))
1938 		return; /* nothing to do */
1939 
1940 	for (i = 0; i <= lgrp_alloc_max; i++) {
1941 		if (klgrpset_ismember(children, i)) {
1942 
1943 			/*
1944 			 * Given the set of leaves in this child's lineage,
1945 			 * find the highest order leaf present in the parent's
1946 			 * rset. Select this as the hint for the child.
1947 			 */
1948 			leaves = lgrp_table[i]->lgrp_leaves;
1949 			hint = 0;
1950 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1951 				lpl = lpl_parent->lpl_rset[j];
1952 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1953 					hint = j;
1954 			}
1955 			cp->cp_lgrploads[i].lpl_hint = hint;
1956 
1957 			/*
1958 			 * (Re)set the parent. It may be incorrect if
1959 			 * lpl_parent is new in the topology.
1960 			 */
1961 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1962 		}
1963 	}
1964 }
1965 
1966 /*
1967  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1968  *
1969  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1970  * resource. The values are adjusted here, as this is the only place that we can
1971  * be certain a resource was successfully deleted.
1972  */
1973 void
1974 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1975 {
1976 	int i;
1977 
1978 	/* find leaf in intermediate node */
1979 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1980 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1981 			break;
1982 	}
1983 
1984 	/* return if leaf not found */
1985 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1986 		return;
1987 
1988 	/* prune leaf, compress array */
1989 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1990 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1991 	lpl_target->lpl_ncpu--;
1992 	do {
1993 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1994 	} while (i++ < lpl_target->lpl_nrset);
1995 }
1996 
1997 /*
1998  * Check to see if the resource set of the target lpl contains the
1999  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
2000  */
2001 
2002 int
2003 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
2004 {
2005 	int i;
2006 
2007 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
2008 		if (lpl_target->lpl_rset[i] == lpl_leaf)
2009 			return (1);
2010 	}
2011 
2012 	return (0);
2013 }
2014 
2015 /*
2016  * Called when we change cpu lpl membership.  This increments or decrements the
2017  * per-cpu counter in every lpl in which our leaf appears.
2018  */
2019 void
2020 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
2021 {
2022 	cpupart_t	*cpupart;
2023 	lgrp_t		*lgrp_leaf;
2024 	lgrp_t		*lgrp_cur;
2025 	lpl_t		*lpl_leaf;
2026 	lpl_t		*lpl_cur;
2027 	int		i;
2028 
2029 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
2030 
2031 	cpupart = cp->cpu_part;
2032 	lpl_leaf = cp->cpu_lpl;
2033 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
2034 
2035 	for (i = 0; i <= lgrp_alloc_max; i++) {
2036 		lgrp_cur = lgrp_table[i];
2037 
2038 		/*
2039 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2040 		 * for the cpu in question, or if the current lgrp and leaf
2041 		 * don't share the same resources.
2042 		 */
2043 
2044 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2045 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2046 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2047 			continue;
2048 
2049 
2050 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2051 
2052 		if (lpl_cur->lpl_nrset > 0) {
2053 			if (act == LPL_INCREMENT) {
2054 				lpl_cur->lpl_ncpu++;
2055 			} else if (act == LPL_DECREMENT) {
2056 				lpl_cur->lpl_ncpu--;
2057 			}
2058 		}
2059 	}
2060 }
2061 
2062 /*
2063  * Initialize lpl with given resources and specified lgrp
2064  */
2065 
2066 void
2067 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2068 {
2069 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2070 	lpl->lpl_loadavg = 0;
2071 	if (lpl == lpl_leaf)
2072 		lpl->lpl_ncpu = 1;
2073 	else
2074 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2075 	lpl->lpl_nrset = 1;
2076 	lpl->lpl_rset[0] = lpl_leaf;
2077 	lpl->lpl_lgrp = lgrp;
2078 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2079 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2080 }
2081 
2082 /*
2083  * Clear an unused lpl
2084  */
2085 
2086 void
2087 lpl_clear(lpl_t *lpl)
2088 {
2089 	lgrp_id_t	lid;
2090 
2091 	/* save lid for debugging purposes */
2092 	lid = lpl->lpl_lgrpid;
2093 	bzero(lpl, sizeof (lpl_t));
2094 	lpl->lpl_lgrpid = lid;
2095 }
2096 
2097 /*
2098  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2099  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2100  * make full use of all of the lgroup topology, but this checks to make sure
2101  * that for the parts that it does use, it has correctly understood the
2102  * relationships that exist. This function returns
2103  * 0 if the topology is correct, and a non-zero error code, for non-debug
2104  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2105  * debugging on a DEBUG kernel.
2106  */
2107 int
2108 lpl_topo_verify(cpupart_t *cpupart)
2109 {
2110 	lgrp_t		*lgrp;
2111 	lpl_t		*lpl;
2112 	klgrpset_t	rset;
2113 	klgrpset_t	cset;
2114 	cpu_t		*cpu;
2115 	cpu_t		*cp_start;
2116 	int		i;
2117 	int		j;
2118 	int		sum;
2119 
2120 	/* topology can't be incorrect if it doesn't exist */
2121 	if (!lgrp_topo_initialized || !lgrp_initialized)
2122 		return (LPL_TOPO_CORRECT);
2123 
2124 	ASSERT(cpupart != NULL);
2125 
2126 	for (i = 0; i <= lgrp_alloc_max; i++) {
2127 		lgrp = lgrp_table[i];
2128 		lpl = NULL;
2129 		/* make sure lpls are allocated */
2130 		ASSERT(cpupart->cp_lgrploads);
2131 		if (!cpupart->cp_lgrploads)
2132 			return (LPL_TOPO_PART_HAS_NO_LPL);
2133 
2134 		lpl = &cpupart->cp_lgrploads[i];
2135 		/* make sure our index is good */
2136 		ASSERT(i < cpupart->cp_nlgrploads);
2137 
2138 		/* if lgroup doesn't exist, make sure lpl is empty */
2139 		if (!LGRP_EXISTS(lgrp)) {
2140 			ASSERT(lpl->lpl_ncpu == 0);
2141 			if (lpl->lpl_ncpu > 0) {
2142 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2143 			} else {
2144 				continue;
2145 			}
2146 		}
2147 
2148 		/* verify that lgroup and lpl are identically numbered */
2149 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2150 
2151 		/* if lgroup isn't in our partition, make sure lpl is empty */
2152 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2153 		    cpupart->cp_lgrpset)) {
2154 			ASSERT(lpl->lpl_ncpu == 0);
2155 			if (lpl->lpl_ncpu > 0) {
2156 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2157 			}
2158 			/*
2159 			 * lpl is empty, and lgroup isn't in partition.  verify
2160 			 * that lpl doesn't show up in anyone else's rsets (in
2161 			 * this partition, anyway)
2162 			 */
2163 
2164 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2165 				lpl_t *i_lpl; /* lpl we're iterating over */
2166 
2167 				i_lpl = &cpupart->cp_lgrploads[j];
2168 
2169 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2170 				if (lpl_rset_contains(i_lpl, lpl)) {
2171 					return (LPL_TOPO_LPL_ORPHANED);
2172 				}
2173 			}
2174 			/* lgroup is empty, and everything is ok. continue */
2175 			continue;
2176 		}
2177 
2178 
2179 		/* lgroup is in this partition, now check it against lpl */
2180 
2181 		/* do both have matching lgrps? */
2182 		ASSERT(lgrp == lpl->lpl_lgrp);
2183 		if (lgrp != lpl->lpl_lgrp) {
2184 			return (LPL_TOPO_LGRP_MISMATCH);
2185 		}
2186 
2187 		/* do the parent lgroups exist and do they match? */
2188 		if (lgrp->lgrp_parent) {
2189 			ASSERT(lpl->lpl_parent);
2190 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2191 				    lpl->lpl_parent->lpl_lgrpid);
2192 
2193 			if (!lpl->lpl_parent) {
2194 				return (LPL_TOPO_MISSING_PARENT);
2195 			} else if (lgrp->lgrp_parent->lgrp_id !=
2196 			    lpl->lpl_parent->lpl_lgrpid) {
2197 				return (LPL_TOPO_PARENT_MISMATCH);
2198 			}
2199 		}
2200 
2201 		/* only leaf lgroups keep a cpucnt, only check leaves */
2202 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2203 
2204 			/* verify that lgrp is also a leaf */
2205 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2206 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2207 			    lpl->lpl_lgrpid)));
2208 
2209 			if ((lgrp->lgrp_childcnt > 0) ||
2210 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2211 			    lpl->lpl_lgrpid))) {
2212 				return (LPL_TOPO_LGRP_NOT_LEAF);
2213 			}
2214 
2215 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2216 			    (lpl->lpl_ncpu > 0));
2217 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2218 				(lpl->lpl_ncpu <= 0)) {
2219 				return (LPL_TOPO_BAD_CPUCNT);
2220 			}
2221 
2222 			/*
2223 			 * Check that lpl_ncpu also matches the number of
2224 			 * cpus in the lpl's linked list.  This only exists in
2225 			 * leaves, but they should always match.
2226 			 */
2227 			j = 0;
2228 			cpu = cp_start = lpl->lpl_cpus;
2229 			while (cpu != NULL) {
2230 				j++;
2231 
2232 				/* check to make sure cpu's lpl is leaf lpl */
2233 				ASSERT(cpu->cpu_lpl == lpl);
2234 				if (cpu->cpu_lpl != lpl) {
2235 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2236 				}
2237 
2238 				/* check next cpu */
2239 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2240 					continue;
2241 				} else {
2242 					cpu = NULL;
2243 				}
2244 			}
2245 
2246 			ASSERT(j == lpl->lpl_ncpu);
2247 			if (j != lpl->lpl_ncpu) {
2248 				return (LPL_TOPO_LPL_BAD_NCPU);
2249 			}
2250 
2251 			/*
2252 			 * Also, check that leaf lpl is contained in all
2253 			 * intermediate lpls that name the leaf as a descendant
2254 			 */
2255 
2256 			for (j = 0; j <= lgrp_alloc_max; j++) {
2257 				klgrpset_t intersect;
2258 				lgrp_t *lgrp_cand;
2259 				lpl_t *lpl_cand;
2260 
2261 				lgrp_cand = lgrp_table[j];
2262 				intersect = klgrpset_intersects(
2263 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2264 				    cpupart->cp_lgrpset);
2265 
2266 				if (!LGRP_EXISTS(lgrp_cand) ||
2267 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2268 				    cpupart->cp_lgrpset) ||
2269 				    (intersect == 0))
2270 					continue;
2271 
2272 				lpl_cand =
2273 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2274 
2275 				if (klgrpset_ismember(intersect,
2276 				    lgrp->lgrp_id)) {
2277 					ASSERT(lpl_rset_contains(lpl_cand,
2278 					    lpl));
2279 
2280 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2281 						return (LPL_TOPO_RSET_MSSNG_LF);
2282 					}
2283 				}
2284 			}
2285 
2286 		} else { /* non-leaf specific checks */
2287 
2288 			/*
2289 			 * Non-leaf lpls should have lpl_cpus == NULL
2290 			 * verify that this is so
2291 			 */
2292 			ASSERT(lpl->lpl_cpus == NULL);
2293 			if (lpl->lpl_cpus != NULL) {
2294 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2295 			}
2296 
2297 			/*
2298 			 * verify that the sum of the cpus in the leaf resources
2299 			 * is equal to the total ncpu in the intermediate
2300 			 */
2301 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2302 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2303 			}
2304 
2305 			ASSERT(sum == lpl->lpl_ncpu);
2306 			if (sum != lpl->lpl_ncpu) {
2307 				return (LPL_TOPO_LPL_BAD_NCPU);
2308 			}
2309 		}
2310 
2311 		/*
2312 		 * check on lpl_hint. Don't check root, since it has no parent.
2313 		 */
2314 		if (lpl->lpl_parent != NULL) {
2315 			int hint;
2316 			lpl_t *hint_lpl;
2317 
2318 			/* make sure hint is within limits of nrset */
2319 			hint = lpl->lpl_hint;
2320 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2321 			if (lpl->lpl_parent->lpl_nrset < hint) {
2322 				return (LPL_TOPO_BOGUS_HINT);
2323 			}
2324 
2325 			/* make sure hint points to valid lpl */
2326 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2327 			ASSERT(hint_lpl->lpl_ncpu > 0);
2328 			if (hint_lpl->lpl_ncpu <= 0) {
2329 				return (LPL_TOPO_BOGUS_HINT);
2330 			}
2331 		}
2332 
2333 		/*
2334 		 * Check the rset of the lpl in question.  Make sure that each
2335 		 * rset contains a subset of the resources in
2336 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2337 		 * sure that each rset doesn't include resources that are
2338 		 * outside of that set.  (Which would be resources somehow not
2339 		 * accounted for).
2340 		 */
2341 
2342 		klgrpset_clear(rset);
2343 		for (j = 0; j < lpl->lpl_nrset; j++) {
2344 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2345 		}
2346 		klgrpset_copy(cset, rset);
2347 		/* make sure lpl rset matches lgrp rset */
2348 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2349 		/* make sure rset is contained with in partition, too */
2350 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2351 
2352 		ASSERT(klgrpset_isempty(rset) &&
2353 			    klgrpset_isempty(cset));
2354 		if (!klgrpset_isempty(rset) ||
2355 		    !klgrpset_isempty(cset)) {
2356 			return (LPL_TOPO_RSET_MISMATCH);
2357 		}
2358 
2359 		/*
2360 		 * check to make sure lpl_nrset matches the number of rsets
2361 		 * contained in the lpl
2362 		 */
2363 
2364 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2365 		    j++);
2366 
2367 		ASSERT(j == lpl->lpl_nrset);
2368 		if (j != lpl->lpl_nrset) {
2369 			return (LPL_TOPO_BAD_RSETCNT);
2370 		}
2371 
2372 	}
2373 	return (LPL_TOPO_CORRECT);
2374 }
2375 
2376 /*
2377  * Flatten lpl topology to given number of levels.  This is presently only
2378  * implemented for a flatten to 2 levels, which will prune out the intermediates
2379  * and home the leaf lpls to the root lpl.
2380  */
2381 int
2382 lpl_topo_flatten(int levels)
2383 {
2384 	int		i;
2385 	uint_t		sum;
2386 	lgrp_t		*lgrp_cur;
2387 	lpl_t		*lpl_cur;
2388 	lpl_t		*lpl_root;
2389 	cpupart_t	*cp;
2390 
2391 	if (levels != 2)
2392 		return (0);
2393 
2394 	/* called w/ cpus paused - grab no locks! */
2395 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2396 	    !lgrp_initialized);
2397 
2398 	cp = cp_list_head;
2399 	do {
2400 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2401 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2402 
2403 		for (i = 0; i <= lgrp_alloc_max; i++) {
2404 			lgrp_cur = lgrp_table[i];
2405 			lpl_cur = &cp->cp_lgrploads[i];
2406 
2407 			if ((lgrp_cur == lgrp_root) ||
2408 			    (!LGRP_EXISTS(lgrp_cur) &&
2409 			    (lpl_cur->lpl_ncpu == 0)))
2410 				continue;
2411 
2412 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2413 				/*
2414 				 * this should be a deleted intermediate, so
2415 				 * clear it
2416 				 */
2417 				lpl_clear(lpl_cur);
2418 			} else if ((lpl_cur->lpl_nrset == 1) &&
2419 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2420 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2421 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2422 				/*
2423 				 * this is a leaf whose parent was deleted, or
2424 				 * whose parent had their lgrp deleted.  (And
2425 				 * whose parent will soon be deleted).  Point
2426 				 * this guy back to the root lpl.
2427 				 */
2428 				lpl_cur->lpl_parent = lpl_root;
2429 				lpl_rset_add(lpl_root, lpl_cur);
2430 			}
2431 
2432 		}
2433 
2434 		/*
2435 		 * Now that we're done, make sure the count on the root lpl is
2436 		 * correct, and update the hints of the children for the sake of
2437 		 * thoroughness
2438 		 */
2439 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2440 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2441 		}
2442 		lpl_root->lpl_ncpu = sum;
2443 		lpl_child_update(lpl_root, cp);
2444 
2445 		cp = cp->cp_next;
2446 	} while (cp != cp_list_head);
2447 
2448 	return (levels);
2449 }
2450 
2451 /*
2452  * Insert a lpl into the resource hierarchy and create any additional lpls that
2453  * are necessary to represent the varying states of locality for the cpu
2454  * resoruces newly added to the partition.
2455  *
2456  * This routine is clever enough that it can correctly add resources from the
2457  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2458  * those for which the lpl is a leaf as opposed to simply a named equally local
2459  * resource).  The one special case that needs additional processing is when a
2460  * new intermediate lpl is introduced.  Since the main loop only traverses
2461  * looking to add the leaf resource where it does not yet exist, additional work
2462  * is necessary to add other leaf resources that may need to exist in the newly
2463  * created intermediate.  This is performed by the second inner loop, and is
2464  * only done when the check for more than one overlapping resource succeeds.
2465  */
2466 
2467 void
2468 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2469 {
2470 	int		i;
2471 	int		j;
2472 	int		hint;
2473 	int		rset_num_intersect;
2474 	lgrp_t		*lgrp_cur;
2475 	lpl_t		*lpl_cur;
2476 	lpl_t		*lpl_parent;
2477 	lgrp_id_t	parent_id;
2478 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2479 
2480 	for (i = 0; i <= lgrp_alloc_max; i++) {
2481 		lgrp_cur = lgrp_table[i];
2482 
2483 		/*
2484 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2485 		 * contained within the current lgrp, or if the current lgrp has
2486 		 * no leaves in this partition
2487 		 */
2488 
2489 		if (!LGRP_EXISTS(lgrp_cur) ||
2490 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2491 		    lpl_leaf->lpl_lgrpid) ||
2492 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2493 		    cpupart->cp_lgrpset))
2494 			continue;
2495 
2496 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2497 		if (lgrp_cur->lgrp_parent != NULL) {
2498 			/* if lgrp has a parent, assign it properly */
2499 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2500 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2501 		} else {
2502 			/* if not, make sure parent ptr gets set to null */
2503 			lpl_parent = NULL;
2504 		}
2505 
2506 		if (lpl_cur == lpl_leaf) {
2507 			/*
2508 			 * Almost all leaf state was initialized elsewhere.  The
2509 			 * only thing left to do is to set the parent.
2510 			 */
2511 			lpl_cur->lpl_parent = lpl_parent;
2512 			continue;
2513 		}
2514 
2515 		/*
2516 		 * Initialize intermediate lpl
2517 		 * Save this lpl's hint though. Since we're changing this
2518 		 * lpl's resources, we need to update the hint in this lpl's
2519 		 * children, but the hint in this lpl is unaffected and
2520 		 * should be preserved.
2521 		 */
2522 		hint = lpl_cur->lpl_hint;
2523 
2524 		lpl_clear(lpl_cur);
2525 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2526 
2527 		lpl_cur->lpl_hint = hint;
2528 		lpl_cur->lpl_parent = lpl_parent;
2529 
2530 		/* does new lpl need to be populated with other resources? */
2531 		rset_intersect =
2532 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2533 			cpupart->cp_lgrpset);
2534 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2535 
2536 		if (rset_num_intersect > 1) {
2537 			/*
2538 			 * If so, figure out what lpls have resources that
2539 			 * intersect this one, and add them.
2540 			 */
2541 			for (j = 0; j <= lgrp_alloc_max; j++) {
2542 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2543 				lpl_t	*lpl_cand;	/* candidate lpl */
2544 
2545 				lgrp_cand = lgrp_table[j];
2546 				if (!LGRP_EXISTS(lgrp_cand) ||
2547 				    !klgrpset_ismember(rset_intersect,
2548 					lgrp_cand->lgrp_id))
2549 					continue;
2550 				lpl_cand =
2551 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2552 				lpl_rset_add(lpl_cur, lpl_cand);
2553 			}
2554 		}
2555 		/*
2556 		 * This lpl's rset has changed. Update the hint in it's
2557 		 * children.
2558 		 */
2559 		lpl_child_update(lpl_cur, cpupart);
2560 	}
2561 }
2562 
2563 /*
2564  * remove a lpl from the hierarchy of resources, clearing its state when
2565  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2566  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2567  * delete them as well.
2568  */
2569 
2570 void
2571 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2572 {
2573 	int		i;
2574 	lgrp_t		*lgrp_cur;
2575 	lpl_t		*lpl_cur;
2576 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2577 
2578 	for (i = 0; i <= lgrp_alloc_max; i++) {
2579 		lgrp_cur = lgrp_table[i];
2580 
2581 		/*
2582 		 * Don't attempt to remove from lgrps that aren't there, that
2583 		 * don't contain our leaf, or from the leaf itself. (We do that
2584 		 * later)
2585 		 */
2586 
2587 		if (!LGRP_EXISTS(lgrp_cur))
2588 			continue;
2589 
2590 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2591 
2592 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2593 		    lpl_leaf->lpl_lgrpid) ||
2594 		    (lpl_cur == lpl_leaf)) {
2595 			continue;
2596 		}
2597 
2598 		/*
2599 		 * This is a slightly sleazy simplification in that we have
2600 		 * already marked the cp_lgrpset as no longer containing the
2601 		 * leaf we've deleted.  Any lpls that pass the above checks
2602 		 * based upon lgrp membership but not necessarily cpu-part
2603 		 * membership also get cleared by the checks below.  Currently
2604 		 * this is harmless, as the lpls should be empty anyway.
2605 		 *
2606 		 * In particular, we want to preserve lpls that have additional
2607 		 * leaf resources, even though we don't yet have a processor
2608 		 * architecture that represents resources this way.
2609 		 */
2610 
2611 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2612 		    cpupart->cp_lgrpset);
2613 
2614 		lpl_rset_del(lpl_cur, lpl_leaf);
2615 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2616 			lpl_clear(lpl_cur);
2617 		} else {
2618 			/*
2619 			 * Update this lpl's children
2620 			 */
2621 			lpl_child_update(lpl_cur, cpupart);
2622 		}
2623 	}
2624 	lpl_clear(lpl_leaf);
2625 }
2626 
2627 /*
2628  * add a cpu to a partition in terms of lgrp load avg bookeeping
2629  *
2630  * The lpl (cpu partition load average information) is now arranged in a
2631  * hierarchical fashion whereby resources that are closest, ie. most local, to
2632  * the cpu in question are considered to be leaves in a tree of resources.
2633  * There are two general cases for cpu additon:
2634  *
2635  * 1. A lpl structure that contains resources already in the hierarchy tree.
2636  * In this case, all of the associated lpl relationships have been defined, and
2637  * all that is necessary is that we link the new cpu into the per-lpl list of
2638  * cpus, and increment the ncpu count of all places where this cpu resource will
2639  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2640  * pushing is accomplished by this routine.
2641  *
2642  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2643  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2644  * construct the hierarchy of state necessary to name it's more distant
2645  * resources, if they should exist.  The leaf structure is initialized by this
2646  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2647  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2648  * and builds all of the "ancestoral" state necessary to identify resources at
2649  * differing levels of locality.
2650  */
2651 void
2652 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2653 {
2654 	cpupart_t	*cpupart;
2655 	lgrp_t		*lgrp_leaf;
2656 	lpl_t		*lpl_leaf;
2657 
2658 	/* called sometimes w/ cpus paused - grab no locks */
2659 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2660 
2661 	cpupart = cp->cpu_part;
2662 	lgrp_leaf = lgrp_table[lgrpid];
2663 
2664 	/* don't add non-existent lgrp */
2665 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2666 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2667 	cp->cpu_lpl = lpl_leaf;
2668 
2669 	/* only leaf lpls contain cpus */
2670 
2671 	if (lpl_leaf->lpl_ncpu++ == 0) {
2672 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2673 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2674 		lpl_leaf_insert(lpl_leaf, cpupart);
2675 	} else {
2676 		/*
2677 		 * the lpl should already exist in the parent, so just update
2678 		 * the count of available CPUs
2679 		 */
2680 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2681 	}
2682 
2683 	/* link cpu into list of cpus in lpl */
2684 
2685 	if (lpl_leaf->lpl_cpus) {
2686 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2687 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2688 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2689 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2690 	} else {
2691 		/*
2692 		 * We increment ncpu immediately after we create a new leaf
2693 		 * lpl, so assert that ncpu == 1 for the case where we don't
2694 		 * have any cpu pointers yet.
2695 		 */
2696 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2697 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2698 	}
2699 
2700 }
2701 
2702 
2703 /*
2704  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2705  *
2706  * The lpl (cpu partition load average information) is now arranged in a
2707  * hierarchical fashion whereby resources that are closest, ie. most local, to
2708  * the cpu in question are considered to be leaves in a tree of resources.
2709  * There are two removal cases in question:
2710  *
2711  * 1. Removal of the resource in the leaf leaves other resources remaining in
2712  * that leaf.  (Another cpu still exists at this level of locality).  In this
2713  * case, the count of available cpus is decremented in all assocated lpls by
2714  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2715  * from the per-cpu lpl list.
2716  *
2717  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2718  * empty)  In this case, all of what has occurred for the first step must take
2719  * place; however, additionally we must remove the lpl structure itself, prune
2720  * out any stranded lpls that do not directly name a leaf resource, and mark the
2721  * cpu partition in question as no longer containing resources from the lgrp of
2722  * the lpl that has been delted.  Cpu-partition changes are handled by this
2723  * method, but the lpl_leaf_remove function deals with the details of pruning
2724  * out the empty lpl and any of its orphaned direct ancestors.
2725  */
2726 void
2727 lgrp_part_del_cpu(cpu_t *cp)
2728 {
2729 	lpl_t		*lpl;
2730 	lpl_t		*leaf_lpl;
2731 	lgrp_t		*lgrp_leaf;
2732 
2733 	/* called sometimes w/ cpus paused - grab no locks */
2734 
2735 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2736 
2737 	lpl = leaf_lpl = cp->cpu_lpl;
2738 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2739 
2740 	/* don't delete a leaf that isn't there */
2741 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2742 
2743 	/* no double-deletes */
2744 	ASSERT(lpl->lpl_ncpu);
2745 	if (--lpl->lpl_ncpu == 0) {
2746 		/*
2747 		 * This was the last cpu in this lgroup for this partition,
2748 		 * clear its bit in the partition's lgroup bitmask
2749 		 */
2750 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2751 
2752 		/* eliminate remaning lpl link pointers in cpu, lpl */
2753 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2754 
2755 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2756 	} else {
2757 
2758 		/* unlink cpu from lists of cpus in lpl */
2759 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2760 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2761 		if (lpl->lpl_cpus == cp) {
2762 			lpl->lpl_cpus = cp->cpu_next_lpl;
2763 		}
2764 
2765 		/*
2766 		 * Update the cpu count in the lpls associated with parent
2767 		 * lgroups.
2768 		 */
2769 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2770 
2771 	}
2772 	/* clear cpu's lpl ptr when we're all done */
2773 	cp->cpu_lpl = NULL;
2774 }
2775 
2776 /*
2777  * Recompute load average for the specified partition/lgrp fragment.
2778  *
2779  * We rely on the fact that this routine is called from the clock thread
2780  * at a point before the clock thread can block (i.e. before its first
2781  * lock request).  Since the clock thread can not be preempted (since it
2782  * runs at highest priority), we know that cpu partitions can not change
2783  * (since doing so would require either the repartition requester or the
2784  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2785  * without grabbing cpu_lock.
2786  */
2787 void
2788 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2789 {
2790 	uint_t		ncpu;
2791 	int64_t		old, new, f;
2792 
2793 	/*
2794 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2795 	 */
2796 	static short expval[] = {
2797 	    0, 3196, 1618, 1083,
2798 	    814, 652, 543, 466,
2799 	    408, 363, 326, 297,
2800 	    272, 251, 233, 218,
2801 	    204, 192, 181, 172,
2802 	    163, 155, 148, 142,
2803 	    136, 130, 125, 121,
2804 	    116, 112, 109, 105
2805 	};
2806 
2807 	/* ASSERT (called from clock level) */
2808 
2809 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2810 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2811 		return;
2812 	}
2813 
2814 	for (;;) {
2815 
2816 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2817 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2818 		else
2819 			f = expval[ncpu];
2820 
2821 		/*
2822 		 * Modify the load average atomically to avoid losing
2823 		 * anticipatory load updates (see lgrp_move_thread()).
2824 		 */
2825 		if (ageflag) {
2826 			/*
2827 			 * We're supposed to both update and age the load.
2828 			 * This happens 10 times/sec. per cpu.  We do a
2829 			 * little hoop-jumping to avoid integer overflow.
2830 			 */
2831 			int64_t		q, r;
2832 
2833 			do {
2834 				old = new = lpl->lpl_loadavg;
2835 				q = (old  >> 16) << 7;
2836 				r = (old  & 0xffff) << 7;
2837 				new += ((long long)(nrcpus - q) * f -
2838 				    ((r * f) >> 16)) >> 7;
2839 
2840 				/*
2841 				 * Check for overflow
2842 				 */
2843 				if (new > LGRP_LOADAVG_MAX)
2844 					new = LGRP_LOADAVG_MAX;
2845 				else if (new < 0)
2846 					new = 0;
2847 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2848 			    new) != old);
2849 		} else {
2850 			/*
2851 			 * We're supposed to update the load, but not age it.
2852 			 * This option is used to update the load (which either
2853 			 * has already been aged in this 1/10 sec. interval or
2854 			 * soon will be) to account for a remotely executing
2855 			 * thread.
2856 			 */
2857 			do {
2858 				old = new = lpl->lpl_loadavg;
2859 				new += f;
2860 				/*
2861 				 * Check for overflow
2862 				 * Underflow not possible here
2863 				 */
2864 				if (new < old)
2865 					new = LGRP_LOADAVG_MAX;
2866 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2867 			    new) != old);
2868 		}
2869 
2870 		/*
2871 		 * Do the same for this lpl's parent
2872 		 */
2873 		if ((lpl = lpl->lpl_parent) == NULL)
2874 			break;
2875 		ncpu = lpl->lpl_ncpu;
2876 	}
2877 }
2878 
2879 /*
2880  * Initialize lpl topology in the target based on topology currently present in
2881  * lpl_bootstrap.
2882  *
2883  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2884  * initialize cp_default list of lpls. Up to this point all topology operations
2885  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2886  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2887  * `target' points to the list of lpls in cp_default and `size' is the size of
2888  * this list.
2889  *
2890  * This function walks the lpl topology in lpl_bootstrap and does for things:
2891  *
2892  * 1) Copies all fields from lpl_bootstrap to the target.
2893  *
2894  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2895  *
2896  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2897  *    instead of lpl_bootstrap.
2898  *
2899  * 4) Updates pointers in the resource list of the target to point to the lpls
2900  *    in the target list instead of lpl_bootstrap.
2901  *
2902  * After lpl_topo_bootstrap() completes, target contains the same information
2903  * that would be present there if it were used during boot instead of
2904  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2905  * and it is bzeroed.
2906  */
2907 void
2908 lpl_topo_bootstrap(lpl_t *target, int size)
2909 {
2910 	lpl_t	*lpl = lpl_bootstrap;
2911 	lpl_t	*target_lpl = target;
2912 	int	howmany;
2913 	int	id;
2914 	int	i;
2915 
2916 	/*
2917 	 * The only target that should be passed here is cp_default lpl list.
2918 	 */
2919 	ASSERT(target == cp_default.cp_lgrploads);
2920 	ASSERT(size == cp_default.cp_nlgrploads);
2921 	ASSERT(!lgrp_topo_initialized);
2922 	ASSERT(ncpus == 1);
2923 
2924 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2925 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2926 		/*
2927 		 * Copy all fields from lpl.
2928 		 */
2929 
2930 		*target_lpl = *lpl;
2931 
2932 		/*
2933 		 * Substitute CPU0 lpl pointer with one relative to target.
2934 		 */
2935 		if (lpl->lpl_cpus == CPU) {
2936 			ASSERT(CPU->cpu_lpl == lpl);
2937 			CPU->cpu_lpl = target_lpl;
2938 		}
2939 
2940 		/*
2941 		 * Substitute parent information with parent relative to target.
2942 		 */
2943 		if (lpl->lpl_parent != NULL)
2944 			target_lpl->lpl_parent = (lpl_t *)
2945 			    (((uintptr_t)lpl->lpl_parent -
2946 				(uintptr_t)lpl_bootstrap) +
2947 				(uintptr_t)target);
2948 
2949 		/*
2950 		 * Walk over resource set substituting pointers relative to
2951 		 * lpl_bootstrap to pointers relative to target.
2952 		 */
2953 		ASSERT(lpl->lpl_nrset <= 1);
2954 
2955 		for (id = 0; id < lpl->lpl_nrset; id++) {
2956 			if (lpl->lpl_rset[id] != NULL) {
2957 				target_lpl->lpl_rset[id] =
2958 				    (lpl_t *)
2959 				    (((uintptr_t)lpl->lpl_rset[id] -
2960 					(uintptr_t)lpl_bootstrap) +
2961 					(uintptr_t)target);
2962 			}
2963 		}
2964 	}
2965 
2966 	/*
2967 	 * Topology information in lpl_bootstrap is no longer needed.
2968 	 */
2969 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2970 }
2971 
2972 /*
2973  * If the lowest load among the lgroups a process' threads are currently
2974  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2975  * expanding the process to a new lgroup.
2976  */
2977 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2978 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2979 
2980 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2981 	((lgrp_expand_proc_thresh) / (ncpu))
2982 
2983 /*
2984  * A process will be expanded to a new lgroup only if the difference between
2985  * the lowest load on the lgroups the process' thread's are currently spread
2986  * across and the lowest load on the other lgroups in the process' partition
2987  * is greater than lgrp_expand_proc_diff.
2988  */
2989 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2990 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2991 
2992 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2993 	((lgrp_expand_proc_diff) / (ncpu))
2994 
2995 /*
2996  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2997  * be present due to impreciseness of the load average decay algorithm.
2998  *
2999  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
3000  * tolerance is scaled by the number of cpus in the lgroup just like
3001  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
3002  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
3003  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
3004  */
3005 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
3006 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
3007 	((lgrp_loadavg_tolerance) / ncpu)
3008 
3009 /*
3010  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
3011  * average is above this threshold
3012  */
3013 uint32_t	lgrp_load_thresh = UINT32_MAX;
3014 
3015 /*
3016  * lgrp_choose() will try to skip any lgroups with less memory
3017  * than this free when choosing a home lgroup
3018  */
3019 pgcnt_t	lgrp_mem_free_thresh = 0;
3020 
3021 /*
3022  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
3023  * one based on one of the following policies:
3024  * - Random selection
3025  * - Pseudo round robin placement
3026  * - Longest time since a thread was last placed
3027  */
3028 #define	LGRP_CHOOSE_RANDOM	1
3029 #define	LGRP_CHOOSE_RR		2
3030 #define	LGRP_CHOOSE_TIME	3
3031 
3032 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3033 
3034 /*
3035  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3036  * be bound to a CPU or processor set.
3037  *
3038  * Arguments:
3039  *	t		The thread
3040  *	cpupart		The partition the thread belongs to.
3041  *
3042  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3043  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3044  *	 partitions changing out from under us and assumes that given thread is
3045  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3046  *	 disabled, so don't grab any locks because we should never block under
3047  *	 those conditions.
3048  */
3049 lpl_t *
3050 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3051 {
3052 	lgrp_load_t	bestload, bestrload;
3053 	int		lgrpid_offset, lgrp_count;
3054 	lgrp_id_t	lgrpid, lgrpid_start;
3055 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3056 	klgrpset_t	lgrpset;
3057 	proc_t		*p;
3058 
3059 	ASSERT(t != NULL);
3060 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3061 	    THREAD_LOCK_HELD(t));
3062 	ASSERT(cpupart != NULL);
3063 
3064 	p = t->t_procp;
3065 
3066 	/* A process should always be in an active partition */
3067 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3068 
3069 	bestlpl = bestrlpl = NULL;
3070 	bestload = bestrload = LGRP_LOADAVG_MAX;
3071 	lgrpset = cpupart->cp_lgrpset;
3072 
3073 	switch (lgrp_choose_policy) {
3074 	case LGRP_CHOOSE_RR:
3075 		lgrpid = cpupart->cp_lgrp_hint;
3076 		do {
3077 			if (++lgrpid > lgrp_alloc_max)
3078 				lgrpid = 0;
3079 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3080 
3081 		break;
3082 	default:
3083 	case LGRP_CHOOSE_TIME:
3084 	case LGRP_CHOOSE_RANDOM:
3085 		klgrpset_nlgrps(lgrpset, lgrp_count);
3086 		lgrpid_offset =
3087 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3088 		for (lgrpid = 0; ; lgrpid++) {
3089 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3090 				if (--lgrpid_offset == 0)
3091 					break;
3092 			}
3093 		}
3094 		break;
3095 	}
3096 
3097 	lgrpid_start = lgrpid;
3098 
3099 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3100 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3101 
3102 	/*
3103 	 * Use lgroup affinities (if any) to choose best lgroup
3104 	 *
3105 	 * NOTE: Assumes that thread is protected from going away and its
3106 	 *	 lgroup affinities won't change (ie. p_lock, or
3107 	 *	 thread_lock() being held and/or CPUs paused)
3108 	 */
3109 	if (t->t_lgrp_affinity) {
3110 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3111 		if (lpl != NULL)
3112 			return (lpl);
3113 	}
3114 
3115 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3116 
3117 	do {
3118 		pgcnt_t	npgs;
3119 
3120 		/*
3121 		 * Skip any lgroups outside of thread's pset
3122 		 */
3123 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3124 			if (++lgrpid > lgrp_alloc_max)
3125 				lgrpid = 0;	/* wrap the search */
3126 			continue;
3127 		}
3128 
3129 		/*
3130 		 * Skip any non-leaf lgroups
3131 		 */
3132 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3133 			continue;
3134 
3135 		/*
3136 		 * Skip any lgroups without enough free memory
3137 		 * (when threshold set to nonzero positive value)
3138 		 */
3139 		if (lgrp_mem_free_thresh > 0) {
3140 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3141 			if (npgs < lgrp_mem_free_thresh) {
3142 				if (++lgrpid > lgrp_alloc_max)
3143 					lgrpid = 0;	/* wrap the search */
3144 				continue;
3145 			}
3146 		}
3147 
3148 		lpl = &cpupart->cp_lgrploads[lgrpid];
3149 		if (klgrpset_isempty(p->p_lgrpset) ||
3150 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3151 			/*
3152 			 * Either this is a new process or the process already
3153 			 * has threads on this lgrp, so this is a preferred
3154 			 * lgroup for the thread.
3155 			 */
3156 			if (bestlpl == NULL ||
3157 			    lpl_pick(lpl, bestlpl)) {
3158 				bestload = lpl->lpl_loadavg;
3159 				bestlpl = lpl;
3160 			}
3161 		} else {
3162 			/*
3163 			 * The process doesn't have any threads on this lgrp,
3164 			 * but we're willing to consider this lgrp if the load
3165 			 * difference is big enough to justify splitting up
3166 			 * the process' threads.
3167 			 */
3168 			if (bestrlpl == NULL ||
3169 			    lpl_pick(lpl, bestrlpl)) {
3170 				bestrload = lpl->lpl_loadavg;
3171 				bestrlpl = lpl;
3172 			}
3173 		}
3174 		if (++lgrpid > lgrp_alloc_max)
3175 			lgrpid = 0;	/* wrap the search */
3176 	} while (lgrpid != lgrpid_start);
3177 
3178 	/*
3179 	 * Return root lgroup if threshold isn't set to maximum value and
3180 	 * lowest lgroup load average more than a certain threshold
3181 	 */
3182 	if (lgrp_load_thresh != UINT32_MAX &&
3183 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3184 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3185 
3186 	/*
3187 	 * If all the lgroups over which the thread's process is spread are
3188 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3189 	 * the thread on one of the other leaf lgroups in the thread's
3190 	 * partition.
3191 	 */
3192 	if ((bestlpl == NULL) ||
3193 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3194 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3195 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3196 	    bestload))) {
3197 		bestlpl = bestrlpl;
3198 	}
3199 
3200 	if (bestlpl == NULL) {
3201 		/*
3202 		 * No lgroup looked particularly good, but we still
3203 		 * have to pick something. Go with the randomly selected
3204 		 * legal lgroup we started with above.
3205 		 */
3206 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3207 	}
3208 
3209 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3210 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3211 
3212 	ASSERT(bestlpl->lpl_ncpu > 0);
3213 	return (bestlpl);
3214 }
3215 
3216 /*
3217  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3218  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3219  */
3220 static int
3221 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3222 {
3223 	lgrp_load_t	l1, l2;
3224 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3225 
3226 	l1 = lpl1->lpl_loadavg;
3227 	l2 = lpl2->lpl_loadavg;
3228 
3229 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3230 		/* lpl1 is significantly less loaded than lpl2 */
3231 		return (1);
3232 	}
3233 
3234 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3235 	    l1 + tolerance >= l2 && l1 < l2 &&
3236 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3237 		/*
3238 		 * lpl1's load is within the tolerance of lpl2. We're
3239 		 * willing to consider it be to better however if
3240 		 * it has been longer since we last homed a thread there
3241 		 */
3242 		return (1);
3243 	}
3244 
3245 	return (0);
3246 }
3247 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set to an extremely large value (say, more than 100 years), to
 * avoid overflow in the calculation that uses it.
 */
3254 #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3255 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3256 
3257 /*
3258  * Routine to change a thread's lgroup affiliation.  This routine updates
3259  * the thread's kthread_t struct and its process' proc_t struct to note the
3260  * thread's new lgroup affiliation, and its lgroup affinities.
3261  *
3262  * Note that this is the only routine that modifies a thread's t_lpl field,
3263  * and that adds in or removes anticipatory load.
3264  *
3265  * If the thread is exiting, newlpl is NULL.
3266  *
3267  * Locking:
3268  * The following lock must be held on entry:
3269  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3270  *		doesn't get removed from t's partition
3271  *
3272  * This routine is not allowed to grab any locks, since it may be called
3273  * with cpus paused (such as from cpu_offline).
3274  */
3275 void
3276 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3277 {
3278 	proc_t		*p;
3279 	lpl_t		*lpl, *oldlpl;
3280 	lgrp_id_t	oldid;
3281 	kthread_t	*tp;
3282 	uint_t		ncpu;
3283 	lgrp_load_t	old, new;
3284 
3285 	ASSERT(t);
3286 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3287 	    THREAD_LOCK_HELD(t));
3288 
3289 	/*
3290 	 * If not changing lpls, just return
3291 	 */
3292 	if ((oldlpl = t->t_lpl) == newlpl)
3293 		return;
3294 
3295 	/*
3296 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3297 	 * associated with process 0 rather than with its original process).
3298 	 */
3299 	if (t->t_proc_flag & TP_LWPEXIT) {
3300 		if (newlpl != NULL) {
3301 			t->t_lpl = newlpl;
3302 		}
3303 		return;
3304 	}
3305 
3306 	p = ttoproc(t);
3307 
3308 	/*
3309 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3310 	 * to account for it being moved from its old lgroup.
3311 	 */
3312 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3313 	    (p->p_tlist != NULL)) {
3314 		oldid = oldlpl->lpl_lgrpid;
3315 
3316 		if (newlpl != NULL)
3317 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3318 
3319 		if ((do_lgrpset_delete) &&
3320 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3321 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3322 				/*
3323 				 * Check if a thread other than the thread
3324 				 * that's moving is assigned to the same
3325 				 * lgroup as the thread that's moving.  Note
3326 				 * that we have to compare lgroup IDs, rather
3327 				 * than simply comparing t_lpl's, since the
3328 				 * threads may belong to different partitions
3329 				 * but be assigned to the same lgroup.
3330 				 */
3331 				ASSERT(tp->t_lpl != NULL);
3332 
3333 				if ((tp != t) &&
3334 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3335 					/*
3336 					 * Another thread is assigned to the
3337 					 * same lgroup as the thread that's
3338 					 * moving, p_lgrpset doesn't change.
3339 					 */
3340 					break;
3341 				} else if (tp == p->p_tlist) {
3342 					/*
3343 					 * No other thread is assigned to the
3344 					 * same lgroup as the exiting thread,
3345 					 * clear the lgroup's bit in p_lgrpset.
3346 					 */
3347 					klgrpset_del(p->p_lgrpset, oldid);
3348 					break;
3349 				}
3350 			}
3351 		}
3352 
3353 		/*
3354 		 * If this thread was assigned to its old lgroup for such a
3355 		 * short amount of time that the anticipatory load that was
3356 		 * added on its behalf has aged very little, remove that
3357 		 * anticipatory load.
3358 		 */
3359 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3360 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3361 			lpl = oldlpl;
3362 			for (;;) {
3363 				do {
3364 					old = new = lpl->lpl_loadavg;
3365 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3366 					if (new > old) {
3367 						/*
3368 						 * this can happen if the load
3369 						 * average was aged since we
3370 						 * added in the anticipatory
3371 						 * load
3372 						 */
3373 						new = 0;
3374 					}
3375 				} while (cas32(
3376 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3377 					    new) != old);
3378 
3379 				lpl = lpl->lpl_parent;
3380 				if (lpl == NULL)
3381 					break;
3382 
3383 				ncpu = lpl->lpl_ncpu;
3384 				ASSERT(ncpu > 0);
3385 			}
3386 		}
3387 	}
3388 	/*
3389 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3390 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3391 	 * to its new lgroup to account for its move to its new lgroup.
3392 	 */
3393 	if (newlpl != NULL) {
3394 		/*
3395 		 * This thread is moving to a new lgroup
3396 		 */
3397 		t->t_lpl = newlpl;
3398 
3399 		/*
3400 		 * Reflect move in load average of new lgroup
3401 		 * unless it is root lgroup
3402 		 */
3403 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3404 			return;
3405 
3406 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3407 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3408 		}
3409 
3410 		/*
3411 		 * It'll take some time for the load on the new lgroup
3412 		 * to reflect this thread's placement on it.  We'd
3413 		 * like not, however, to have all threads between now
3414 		 * and then also piling on to this lgroup.  To avoid
3415 		 * this pileup, we anticipate the load this thread
3416 		 * will generate on its new lgroup.  The goal is to
3417 		 * make the lgroup's load appear as though the thread
3418 		 * had been there all along.  We're very conservative
3419 		 * in calculating this anticipatory load, we assume
3420 		 * the worst case case (100% CPU-bound thread).  This
3421 		 * may be modified in the future to be more accurate.
3422 		 */
3423 		lpl = newlpl;
3424 		for (;;) {
3425 			ncpu = lpl->lpl_ncpu;
3426 			ASSERT(ncpu > 0);
3427 			do {
3428 				old = new = lpl->lpl_loadavg;
3429 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3430 				/*
3431 				 * Check for overflow
3432 				 * Underflow not possible here
3433 				 */
3434 				if (new < old)
3435 					new = UINT32_MAX;
3436 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3437 			    new) != old);
3438 
3439 			lpl = lpl->lpl_parent;
3440 			if (lpl == NULL)
3441 				break;
3442 		}
3443 		t->t_anttime = gethrtime();
3444 	}
3445 }
3446 
3447 /*
3448  * Return lgroup memory allocation policy given advice from madvise(3C)
3449  */
3450 lgrp_mem_policy_t
3451 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3452 {
3453 	switch (advice) {
3454 	case MADV_ACCESS_LWP:
3455 		return (LGRP_MEM_POLICY_NEXT);
3456 	case MADV_ACCESS_MANY:
3457 		return (LGRP_MEM_POLICY_RANDOM);
3458 	default:
3459 		return (lgrp_mem_policy_default(size, type));
3460 	}
3461 }
3462 
3463 /*
3464  * Figure out default policy
3465  */
3466 lgrp_mem_policy_t
3467 lgrp_mem_policy_default(size_t size, int type)
3468 {
3469 	cpupart_t		*cp;
3470 	lgrp_mem_policy_t	policy;
3471 	size_t			pset_mem_size;
3472 
3473 	/*
3474 	 * Randomly allocate memory across lgroups for shared memory
3475 	 * beyond a certain threshold
3476 	 */
3477 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3478 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3479 		/*
3480 		 * Get total memory size of current thread's pset
3481 		 */
3482 		kpreempt_disable();
3483 		cp = curthread->t_cpupart;
3484 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3485 		kpreempt_enable();
3486 
3487 		/*
3488 		 * Choose policy to randomly allocate memory across
3489 		 * lgroups in pset if it will fit and is not default
3490 		 * partition.  Otherwise, allocate memory randomly
3491 		 * across machine.
3492 		 */
3493 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3494 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3495 		else
3496 			policy = LGRP_MEM_POLICY_RANDOM;
3497 	} else
3498 		/*
3499 		 * Apply default policy for private memory and
3500 		 * shared memory under the respective random
3501 		 * threshold.
3502 		 */
3503 		policy = lgrp_mem_default_policy;
3504 
3505 	return (policy);
3506 }
3507 
3508 /*
3509  * Get memory allocation policy for this segment
3510  */
3511 lgrp_mem_policy_info_t *
3512 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3513 {
3514 	lgrp_mem_policy_info_t	*policy_info;
3515 	extern struct seg_ops	segspt_ops;
3516 	extern struct seg_ops	segspt_shmops;
3517 
3518 	/*
3519 	 * This is for binary compatibility to protect against third party
3520 	 * segment drivers which haven't recompiled to allow for
3521 	 * SEGOP_GETPOLICY()
3522 	 */
3523 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3524 	    seg->s_ops != &segspt_shmops)
3525 		return (NULL);
3526 
3527 	policy_info = NULL;
3528 	if (seg->s_ops->getpolicy != NULL)
3529 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3530 
3531 	return (policy_info);
3532 }
3533 
3534 /*
3535  * Set policy for allocating private memory given desired policy, policy info,
3536  * size in bytes of memory that policy is being applied.
3537  * Return 0 if policy wasn't set already and 1 if policy was set already
3538  */
3539 int
3540 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3541     lgrp_mem_policy_info_t *policy_info, size_t size)
3542 {
3543 
3544 	ASSERT(policy_info != NULL);
3545 
3546 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3547 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3548 
3549 	/*
3550 	 * Policy set already?
3551 	 */
3552 	if (policy == policy_info->mem_policy)
3553 		return (1);
3554 
3555 	/*
3556 	 * Set policy
3557 	 */
3558 	policy_info->mem_policy = policy;
3559 	policy_info->mem_reserved = 0;
3560 
3561 	return (0);
3562 }
3563 
3564 
3565 /*
3566  * Get shared memory allocation policy with given tree and offset
3567  */
3568 lgrp_mem_policy_info_t *
3569 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3570     u_offset_t vn_off)
3571 {
3572 	u_offset_t		off;
3573 	lgrp_mem_policy_info_t	*policy_info;
3574 	lgrp_shm_policy_seg_t	*policy_seg;
3575 	lgrp_shm_locality_t	*shm_locality;
3576 	avl_tree_t		*tree;
3577 	avl_index_t		where;
3578 
3579 	/*
3580 	 * Get policy segment tree from anon_map or vnode and use specified
3581 	 * anon index or vnode offset as offset
3582 	 *
3583 	 * Assume that no lock needs to be held on anon_map or vnode, since
3584 	 * they should be protected by their reference count which must be
3585 	 * nonzero for an existing segment
3586 	 */
3587 	if (amp) {
3588 		ASSERT(amp->refcnt != 0);
3589 		shm_locality = amp->locality;
3590 		if (shm_locality == NULL)
3591 			return (NULL);
3592 		tree = shm_locality->loc_tree;
3593 		off = ptob(anon_index);
3594 	} else if (vp) {
3595 		shm_locality = vp->v_locality;
3596 		if (shm_locality == NULL)
3597 			return (NULL);
3598 		ASSERT(shm_locality->loc_count != 0);
3599 		tree = shm_locality->loc_tree;
3600 		off = vn_off;
3601 	}
3602 
3603 	if (tree == NULL)
3604 		return (NULL);
3605 
3606 	/*
3607 	 * Lookup policy segment for offset into shared object and return
3608 	 * policy info
3609 	 */
3610 	rw_enter(&shm_locality->loc_lock, RW_READER);
3611 	policy_info = NULL;
3612 	policy_seg = avl_find(tree, &off, &where);
3613 	if (policy_seg)
3614 		policy_info = &policy_seg->shm_policy;
3615 	rw_exit(&shm_locality->loc_lock);
3616 
3617 	return (policy_info);
3618 }
3619 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() applies it when the segment being allocated for is
 * segkmap.
 */
3623 lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3624 
3625 /*
3626  * Return lgroup to use for allocating memory
3627  * given the segment and address
3628  *
3629  * There isn't any mutual exclusion that exists between calls
3630  * to this routine and DR, so this routine and whomever calls it
3631  * should be mindful of the possibility that the lgrp returned
3632  * may be deleted. If this happens, dereferences of the lgrp
3633  * pointer will still be safe, but the resources in the lgrp will
3634  * be gone, and LGRP_EXISTS() will no longer be true.
3635  */
3636 lgrp_t *
3637 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3638 {
3639 	int			i;
3640 	lgrp_t			*lgrp;
3641 	klgrpset_t		lgrpset;
3642 	int			lgrps_spanned;
3643 	unsigned long		off;
3644 	lgrp_mem_policy_t	policy;
3645 	lgrp_mem_policy_info_t	*policy_info;
3646 	ushort_t		random;
3647 	int			stat = 0;
3648 	extern struct seg	*segkmap;
3649 
3650 	/*
3651 	 * Just return null if the lgrp framework hasn't finished
3652 	 * initializing or if this is a UMA machine.
3653 	 */
3654 	if (nlgrps == 1 || !lgrp_initialized)
3655 		return (lgrp_root);
3656 
3657 	/*
3658 	 * Get memory allocation policy for this segment
3659 	 */
3660 	policy = lgrp_mem_default_policy;
3661 	if (seg != NULL) {
3662 		if (seg->s_as == &kas) {
3663 			if (seg == segkmap)
3664 				policy = lgrp_segmap_default_policy;
3665 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3666 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3667 				policy = LGRP_MEM_POLICY_RANDOM;
3668 		} else {
3669 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3670 			if (policy_info != NULL)
3671 				policy = policy_info->mem_policy;
3672 		}
3673 	}
3674 	lgrpset = 0;
3675 
3676 	/*
3677 	 * Initialize lgroup to home by default
3678 	 */
3679 	lgrp = lgrp_home_lgrp();
3680 
3681 	/*
3682 	 * When homing threads on root lgrp, override default memory
3683 	 * allocation policies with root lgroup memory allocation policy
3684 	 */
3685 	if (lgrp == lgrp_root)
3686 		policy = lgrp_mem_policy_root;
3687 
3688 	/*
3689 	 * Implement policy
3690 	 */
3691 	switch (policy) {
3692 	case LGRP_MEM_POLICY_NEXT_CPU:
3693 
3694 		/*
3695 		 * Return lgroup of current CPU which faulted on memory
3696 		 * If the CPU isn't currently in an lgrp, then opt to
3697 		 * allocate from the root.
3698 		 *
3699 		 * Kernel preemption needs to be disabled here to prevent
3700 		 * the current CPU from going away before lgrp is found.
3701 		 */
3702 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3703 			lgrp = lgrp_root;
3704 		} else {
3705 			kpreempt_disable();
3706 			lgrp = lgrp_cpu_to_lgrp(CPU);
3707 			kpreempt_enable();
3708 		}
3709 		break;
3710 
3711 	case LGRP_MEM_POLICY_NEXT:
3712 	case LGRP_MEM_POLICY_DEFAULT:
3713 	default:
3714 
3715 		/*
3716 		 * Just return current thread's home lgroup
3717 		 * for default policy (next touch)
3718 		 * If the thread is homed to the root,
3719 		 * then the default policy is random across lgroups.
3720 		 * Fallthrough to the random case.
3721 		 */
3722 		if (lgrp != lgrp_root) {
3723 			if (policy == LGRP_MEM_POLICY_NEXT)
3724 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3725 			else
3726 				lgrp_stat_add(lgrp->lgrp_id,
3727 				    LGRP_NUM_DEFAULT, 1);
3728 			break;
3729 		}
3730 		/* LINTED fallthrough on case statement */
3731 	case LGRP_MEM_POLICY_RANDOM:
3732 
3733 		/*
3734 		 * Return a random leaf lgroup with memory
3735 		 */
3736 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3737 		/*
3738 		 * Count how many lgroups are spanned
3739 		 */
3740 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3741 
3742 		/*
3743 		 * There may be no memnodes in the root lgroup during DR copy
3744 		 * rename on a system with only two boards (memnodes)
3745 		 * configured. In this case just return the root lgrp.
3746 		 */
3747 		if (lgrps_spanned == 0) {
3748 			lgrp = lgrp_root;
3749 			break;
3750 		}
3751 
3752 		/*
3753 		 * Pick a random offset within lgroups spanned
3754 		 * and return lgroup at that offset
3755 		 */
3756 		random = (ushort_t)gethrtime() >> 4;
3757 		off = random % lgrps_spanned;
3758 		ASSERT(off <= lgrp_alloc_max);
3759 
3760 		for (i = 0; i <= lgrp_alloc_max; i++) {
3761 			if (!klgrpset_ismember(lgrpset, i))
3762 				continue;
3763 			if (off)
3764 				off--;
3765 			else {
3766 				lgrp = lgrp_table[i];
3767 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3768 				    1);
3769 				break;
3770 			}
3771 		}
3772 		break;
3773 
3774 	case LGRP_MEM_POLICY_RANDOM_PROC:
3775 
3776 		/*
3777 		 * Grab copy of bitmask of lgroups spanned by
3778 		 * this process
3779 		 */
3780 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3781 		stat = LGRP_NUM_RANDOM_PROC;
3782 
3783 		/* LINTED fallthrough on case statement */
3784 	case LGRP_MEM_POLICY_RANDOM_PSET:
3785 
3786 		if (!stat)
3787 			stat = LGRP_NUM_RANDOM_PSET;
3788 
3789 		if (klgrpset_isempty(lgrpset)) {
3790 			/*
3791 			 * Grab copy of bitmask of lgroups spanned by
3792 			 * this processor set
3793 			 */
3794 			kpreempt_disable();
3795 			klgrpset_copy(lgrpset,
3796 			    curthread->t_cpupart->cp_lgrpset);
3797 			kpreempt_enable();
3798 		}
3799 
3800 		/*
3801 		 * Count how many lgroups are spanned
3802 		 */
3803 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3804 		ASSERT(lgrps_spanned <= nlgrps);
3805 
3806 		/*
3807 		 * Probably lgrps_spanned should be always non-zero, but to be
3808 		 * on the safe side we return lgrp_root if it is empty.
3809 		 */
3810 		if (lgrps_spanned == 0) {
3811 			lgrp = lgrp_root;
3812 			break;
3813 		}
3814 
3815 		/*
3816 		 * Pick a random offset within lgroups spanned
3817 		 * and return lgroup at that offset
3818 		 */
3819 		random = (ushort_t)gethrtime() >> 4;
3820 		off = random % lgrps_spanned;
3821 		ASSERT(off <= lgrp_alloc_max);
3822 
3823 		for (i = 0; i <= lgrp_alloc_max; i++) {
3824 			if (!klgrpset_ismember(lgrpset, i))
3825 				continue;
3826 			if (off)
3827 				off--;
3828 			else {
3829 				lgrp = lgrp_table[i];
3830 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3831 				    1);
3832 				break;
3833 			}
3834 		}
3835 		break;
3836 
3837 	case LGRP_MEM_POLICY_ROUNDROBIN:
3838 
3839 		/*
3840 		 * Use offset within segment to determine
3841 		 * offset from home lgroup to choose for
3842 		 * next lgroup to allocate memory from
3843 		 */
3844 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3845 		    (lgrp_alloc_max + 1);
3846 
3847 		kpreempt_disable();
3848 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3849 		i = lgrp->lgrp_id;
3850 		kpreempt_enable();
3851 
3852 		while (off > 0) {
3853 			i = (i + 1) % (lgrp_alloc_max + 1);
3854 			lgrp = lgrp_table[i];
3855 			if (klgrpset_ismember(lgrpset, i))
3856 				off--;
3857 		}
3858 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3859 
3860 		break;
3861 	}
3862 
3863 	ASSERT(lgrp != NULL);
3864 	return (lgrp);
3865 }
3866 
3867 /*
3868  * Return the number of pages in an lgroup
3869  *
3870  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3871  *	 could cause tests that rely on the numat driver to fail....
3872  */
3873 pgcnt_t
3874 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3875 {
3876 	lgrp_t *lgrp;
3877 
3878 	lgrp = lgrp_table[lgrpid];
3879 	if (!LGRP_EXISTS(lgrp) ||
3880 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3881 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3882 		return (0);
3883 
3884 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3885 }
3886 
3887 /*
3888  * Initialize lgroup shared memory allocation policy support
3889  */
3890 void
3891 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3892 {
3893 	lgrp_shm_locality_t	*shm_locality;
3894 
3895 	/*
3896 	 * Initialize locality field in anon_map
3897 	 * Don't need any locks because this is called when anon_map is
3898 	 * allocated, but not used anywhere yet.
3899 	 */
3900 	if (amp) {
3901 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3902 		if (amp->locality == NULL) {
3903 			/*
3904 			 * Allocate and initialize shared memory locality info
3905 			 * and set anon_map locality pointer to it
3906 			 * Drop lock across kmem_alloc(KM_SLEEP)
3907 			 */
3908 			ANON_LOCK_EXIT(&amp->a_rwlock);
3909 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3910 			    KM_SLEEP);
3911 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3912 			    NULL);
3913 			shm_locality->loc_count = 1;	/* not used for amp */
3914 			shm_locality->loc_tree = NULL;
3915 
3916 			/*
3917 			 * Reacquire lock and check to see whether anyone beat
3918 			 * us to initializing the locality info
3919 			 */
3920 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3921 			if (amp->locality != NULL) {
3922 				rw_destroy(&shm_locality->loc_lock);
3923 				kmem_free(shm_locality,
3924 				    sizeof (*shm_locality));
3925 			} else
3926 				amp->locality = shm_locality;
3927 		}
3928 		ANON_LOCK_EXIT(&amp->a_rwlock);
3929 		return;
3930 	}
3931 
3932 	/*
3933 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3934 	 */
3935 	mutex_enter(&vp->v_lock);
3936 	if ((vp->v_flag & V_LOCALITY) == 0) {
3937 		/*
3938 		 * Allocate and initialize shared memory locality info
3939 		 */
3940 		mutex_exit(&vp->v_lock);
3941 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3942 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3943 		shm_locality->loc_count = 1;
3944 		shm_locality->loc_tree = NULL;
3945 
3946 		/*
3947 		 * Point vnode locality field at shared vnode policy info
3948 		 * and set locality aware flag in vnode
3949 		 */
3950 		mutex_enter(&vp->v_lock);
3951 		if ((vp->v_flag & V_LOCALITY) == 0) {
3952 			vp->v_locality = shm_locality;
3953 			vp->v_flag |= V_LOCALITY;
3954 		} else {
3955 			/*
3956 			 * Lost race so free locality info and increment count.
3957 			 */
3958 			rw_destroy(&shm_locality->loc_lock);
3959 			kmem_free(shm_locality, sizeof (*shm_locality));
3960 			shm_locality = vp->v_locality;
3961 			shm_locality->loc_count++;
3962 		}
3963 		mutex_exit(&vp->v_lock);
3964 
3965 		return;
3966 	}
3967 
3968 	/*
3969 	 * Increment reference count of number of segments mapping this vnode
3970 	 * shared
3971 	 */
3972 	shm_locality = vp->v_locality;
3973 	shm_locality->loc_count++;
3974 	mutex_exit(&vp->v_lock);
3975 }
3976 
3977 /*
3978  * Destroy the given shared memory policy segment tree
3979  */
3980 void
3981 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3982 {
3983 	lgrp_shm_policy_seg_t	*cur;
3984 	lgrp_shm_policy_seg_t	*next;
3985 
3986 	if (tree == NULL)
3987 		return;
3988 
3989 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3990 	while (cur != NULL) {
3991 		next = AVL_NEXT(tree, cur);
3992 		avl_remove(tree, cur);
3993 		kmem_free(cur, sizeof (*cur));
3994 		cur = next;
3995 	}
3996 	kmem_free(tree, sizeof (avl_tree_t));
3997 }
3998 
3999 /*
4000  * Uninitialize lgroup shared memory allocation policy support
4001  */
4002 void
4003 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4004 {
4005 	lgrp_shm_locality_t	*shm_locality;
4006 
4007 	/*
4008 	 * For anon_map, deallocate shared memory policy tree and
4009 	 * zero locality field
4010 	 * Don't need any locks because anon_map is being freed
4011 	 */
4012 	if (amp) {
4013 		if (amp->locality == NULL)
4014 			return;
4015 		shm_locality = amp->locality;
4016 		shm_locality->loc_count = 0;	/* not really used for amp */
4017 		rw_destroy(&shm_locality->loc_lock);
4018 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4019 		kmem_free(shm_locality, sizeof (*shm_locality));
4020 		amp->locality = 0;
4021 		return;
4022 	}
4023 
4024 	/*
4025 	 * For vnode, decrement reference count of segments mapping this vnode
4026 	 * shared and delete locality info if reference count drops to 0
4027 	 */
4028 	mutex_enter(&vp->v_lock);
4029 	shm_locality = vp->v_locality;
4030 	shm_locality->loc_count--;
4031 
4032 	if (shm_locality->loc_count == 0) {
4033 		rw_destroy(&shm_locality->loc_lock);
4034 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4035 		kmem_free(shm_locality, sizeof (*shm_locality));
4036 		vp->v_locality = 0;
4037 		vp->v_flag &= ~V_LOCALITY;
4038 	}
4039 	mutex_exit(&vp->v_lock);
4040 }
4041 
4042 /*
4043  * Compare two shared memory policy segments
4044  * Used by AVL tree code for searching
4045  */
4046 int
4047 lgrp_shm_policy_compar(const void *x, const void *y)
4048 {
4049 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4050 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4051 
4052 	if (a->shm_off < b->shm_off)
4053 		return (-1);
4054 	if (a->shm_off >= b->shm_off + b->shm_size)
4055 		return (1);
4056 	return (0);
4057 }
4058 
4059 /*
4060  * Concatenate seg1 with seg2 and remove seg2
4061  */
4062 static int
4063 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4064     lgrp_shm_policy_seg_t *seg2)
4065 {
4066 	if (!seg1 || !seg2 ||
4067 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4068 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4069 		return (-1);
4070 
4071 	seg1->shm_size += seg2->shm_size;
4072 	avl_remove(tree, seg2);
4073 	kmem_free(seg2, sizeof (*seg2));
4074 	return (0);
4075 }
4076 
4077 /*
4078  * Split segment at given offset and return rightmost (uppermost) segment
4079  * Assumes that there are no overlapping segments
4080  */
4081 static lgrp_shm_policy_seg_t *
4082 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4083     u_offset_t off)
4084 {
4085 	lgrp_shm_policy_seg_t	*newseg;
4086 	avl_index_t		where;
4087 
4088 	ASSERT(seg != NULL);
4089 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4090 
4091 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4092 	    seg->shm_size)
4093 		return (NULL);
4094 
4095 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4096 		return (seg);
4097 
4098 	/*
4099 	 * Adjust size of left segment and allocate new (right) segment
4100 	 */
4101 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4102 	newseg->shm_policy = seg->shm_policy;
4103 	newseg->shm_off = off;
4104 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4105 	seg->shm_size = off - seg->shm_off;
4106 
4107 	/*
4108 	 * Find where to insert new segment in AVL tree and insert it
4109 	 */
4110 	(void) avl_find(tree, &off, &where);
4111 	avl_insert(tree, newseg, where);
4112 
4113 	return (newseg);
4114 }
4115 
4116 /*
4117  * Set shared memory allocation policy on specified shared object at given
4118  * offset and length
4119  *
4120  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4121  * -1 if can't set policy.
4122  */
4123 int
4124 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4125     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4126 {
4127 	u_offset_t		eoff;
4128 	lgrp_shm_policy_seg_t	*next;
4129 	lgrp_shm_policy_seg_t	*newseg;
4130 	u_offset_t		off;
4131 	u_offset_t		oldeoff;
4132 	lgrp_shm_policy_seg_t	*prev;
4133 	int			retval;
4134 	lgrp_shm_policy_seg_t	*seg;
4135 	lgrp_shm_locality_t	*shm_locality;
4136 	avl_tree_t		*tree;
4137 	avl_index_t		where;
4138 
4139 	ASSERT(amp || vp);
4140 	ASSERT((len & PAGEOFFSET) == 0);
4141 
4142 	if (len == 0)
4143 		return (-1);
4144 
4145 	retval = 0;
4146 
4147 	/*
4148 	 * Get locality info and starting offset into shared object
4149 	 * Try anon map first and then vnode
4150 	 * Assume that no locks need to be held on anon_map or vnode, since
4151 	 * it should be protected by its reference count which must be nonzero
4152 	 * for an existing segment.
4153 	 */
4154 	if (amp) {
4155 		/*
4156 		 * Get policy info from anon_map
4157 		 *
4158 		 */
4159 		ASSERT(amp->refcnt != 0);
4160 		if (amp->locality == NULL)
4161 			lgrp_shm_policy_init(amp, NULL);
4162 		shm_locality = amp->locality;
4163 		off = ptob(anon_index);
4164 	} else if (vp) {
4165 		/*
4166 		 * Get policy info from vnode
4167 		 */
4168 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4169 			lgrp_shm_policy_init(NULL, vp);
4170 		shm_locality = vp->v_locality;
4171 		ASSERT(shm_locality->loc_count != 0);
4172 		off = vn_off;
4173 	} else
4174 		return (-1);
4175 
4176 	ASSERT((off & PAGEOFFSET) == 0);
4177 
4178 	/*
4179 	 * Figure out default policy
4180 	 */
4181 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4182 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4183 
4184 	/*
4185 	 * Create AVL tree if there isn't one yet
4186 	 * and set locality field to point at it
4187 	 */
4188 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4189 	tree = shm_locality->loc_tree;
4190 	if (!tree) {
4191 		rw_exit(&shm_locality->loc_lock);
4192 
4193 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4194 
4195 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4196 		if (shm_locality->loc_tree == NULL) {
4197 			avl_create(tree, lgrp_shm_policy_compar,
4198 			    sizeof (lgrp_shm_policy_seg_t),
4199 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4200 			shm_locality->loc_tree = tree;
4201 		} else {
4202 			/*
4203 			 * Another thread managed to set up the tree
4204 			 * before we could. Free the tree we allocated
4205 			 * and use the one that's already there.
4206 			 */
4207 			kmem_free(tree, sizeof (*tree));
4208 			tree = shm_locality->loc_tree;
4209 		}
4210 	}
4211 
4212 	/*
4213 	 * Set policy
4214 	 *
4215 	 * Need to maintain hold on writer's lock to keep tree from
4216 	 * changing out from under us
4217 	 */
4218 	while (len != 0) {
4219 		/*
4220 		 * Find policy segment for specified offset into shared object
4221 		 */
4222 		seg = avl_find(tree, &off, &where);
4223 
4224 		/*
4225 		 * Didn't find any existing segment that contains specified
4226 		 * offset, so allocate new segment, insert it, and concatenate
4227 		 * with adjacent segments if possible
4228 		 */
4229 		if (seg == NULL) {
4230 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4231 			    KM_SLEEP);
4232 			newseg->shm_policy.mem_policy = policy;
4233 			newseg->shm_policy.mem_reserved = 0;
4234 			newseg->shm_off = off;
4235 			avl_insert(tree, newseg, where);
4236 
4237 			/*
4238 			 * Check to see whether new segment overlaps with next
4239 			 * one, set length of new segment accordingly, and
4240 			 * calculate remaining length and next offset
4241 			 */
4242 			seg = AVL_NEXT(tree, newseg);
4243 			if (seg == NULL || off + len <= seg->shm_off) {
4244 				newseg->shm_size = len;
4245 				len = 0;
4246 			} else {
4247 				newseg->shm_size = seg->shm_off - off;
4248 				off = seg->shm_off;
4249 				len -= newseg->shm_size;
4250 			}
4251 
4252 			/*
4253 			 * Try to concatenate new segment with next and
4254 			 * previous ones, since they might have the same policy
4255 			 * now.  Grab previous and next segments first because
4256 			 * they will change on concatenation.
4257 			 */
4258 			prev =  AVL_PREV(tree, newseg);
4259 			next = AVL_NEXT(tree, newseg);
4260 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4261 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4262 
4263 			continue;
4264 		}
4265 
4266 		eoff = off + len;
4267 		oldeoff = seg->shm_off + seg->shm_size;
4268 
4269 		/*
4270 		 * Policy set already?
4271 		 */
4272 		if (policy == seg->shm_policy.mem_policy) {
4273 			/*
4274 			 * Nothing left to do if offset and length
4275 			 * fall within this segment
4276 			 */
4277 			if (eoff <= oldeoff) {
4278 				retval = 1;
4279 				break;
4280 			} else {
4281 				len = eoff - oldeoff;
4282 				off = oldeoff;
4283 				continue;
4284 			}
4285 		}
4286 
4287 		/*
4288 		 * Specified offset and length match existing segment exactly
4289 		 */
4290 		if (off == seg->shm_off && len == seg->shm_size) {
4291 			/*
4292 			 * Set policy and update current length
4293 			 */
4294 			seg->shm_policy.mem_policy = policy;
4295 			seg->shm_policy.mem_reserved = 0;
4296 			len = 0;
4297 
4298 			/*
4299 			 * Try concatenating new segment with previous and next
4300 			 * segments, since they might have the same policy now.
4301 			 * Grab previous and next segments first because they
4302 			 * will change on concatenation.
4303 			 */
4304 			prev =  AVL_PREV(tree, seg);
4305 			next = AVL_NEXT(tree, seg);
4306 			(void) lgrp_shm_policy_concat(tree, seg, next);
4307 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4308 		} else {
4309 			/*
4310 			 * Specified offset and length only apply to part of
4311 			 * existing segment
4312 			 */
4313 
4314 			/*
4315 			 * New segment starts in middle of old one, so split
4316 			 * new one off near beginning of old one
4317 			 */
4318 			newseg = NULL;
4319 			if (off > seg->shm_off) {
4320 				newseg = lgrp_shm_policy_split(tree, seg, off);
4321 
4322 				/*
4323 				 * New segment ends where old one did, so try
4324 				 * to concatenate with next segment
4325 				 */
4326 				if (eoff == oldeoff) {
4327 					newseg->shm_policy.mem_policy = policy;
4328 					newseg->shm_policy.mem_reserved = 0;
4329 					(void) lgrp_shm_policy_concat(tree,
4330 					    newseg, AVL_NEXT(tree, newseg));
4331 					break;
4332 				}
4333 			}
4334 
4335 			/*
4336 			 * New segment ends before old one, so split off end of
4337 			 * old one
4338 			 */
4339 			if (eoff < oldeoff) {
4340 				if (newseg) {
4341 					(void) lgrp_shm_policy_split(tree,
4342 					    newseg, eoff);
4343 					newseg->shm_policy.mem_policy = policy;
4344 					newseg->shm_policy.mem_reserved = 0;
4345 				} else {
4346 					(void) lgrp_shm_policy_split(tree, seg,
4347 					    eoff);
4348 					seg->shm_policy.mem_policy = policy;
4349 					seg->shm_policy.mem_reserved = 0;
4350 				}
4351 
4352 				if (off == seg->shm_off)
4353 					(void) lgrp_shm_policy_concat(tree,
4354 					    AVL_PREV(tree, seg), seg);
4355 				break;
4356 			}
4357 
4358 			/*
4359 			 * Calculate remaining length and next offset
4360 			 */
4361 			len = eoff - oldeoff;
4362 			off = oldeoff;
4363 		}
4364 	}
4365 
4366 	rw_exit(&shm_locality->loc_lock);
4367 	return (retval);
4368 }
4369 
4370 /*
4371  * Return the best memnode from which to allocate memory given
4372  * an lgroup.
4373  *
4374  * "c" is for cookie, which is good enough for me.
4375  * It references a cookie struct that should be zero'ed to initialize.
4376  * The cookie should live on the caller's stack.
4377  *
4378  * The routine returns -1 when:
4379  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4380  *	- traverse is 1, and all the memnodes in the system have been
4381  *	  returned.
4382  */
4383 int
4384 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4385 {
4386 	lgrp_t		*lp = c->lmc_lgrp;
4387 	mnodeset_t	nodes = c->lmc_nodes;
4388 	int		cnt = c->lmc_cnt;
4389 	int		offset, mnode;
4390 
4391 	extern int	max_mem_nodes;
4392 
4393 	/*
4394 	 * If the set is empty, and the caller is willing, traverse
4395 	 * up the hierarchy until we find a non-empty set.
4396 	 */
4397 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4398 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4399 		    ((lp = lp->lgrp_parent) == NULL))
4400 			return (-1);
4401 
4402 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4403 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4404 	}
4405 
4406 	/*
4407 	 * Select a memnode by picking one at a "random" offset.
4408 	 * Because of DR, memnodes can come and go at any time.
4409 	 * This code must be able to cope with the possibility
4410 	 * that the nodes count "cnt" is inconsistent with respect
4411 	 * to the number of elements actually in "nodes", and
4412 	 * therefore that the offset chosen could be greater than
4413 	 * the number of elements in the set (some memnodes may
4414 	 * have dissapeared just before cnt was read).
4415 	 * If this happens, the search simply wraps back to the
4416 	 * beginning of the set.
4417 	 */
4418 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4419 	offset = c->lmc_rand % cnt;
4420 	do {
4421 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4422 			if (nodes & ((mnodeset_t)1 << mnode))
4423 				if (!offset--)
4424 					break;
4425 	} while (mnode >= max_mem_nodes);
4426 
4427 	/* Found a node. Store state before returning. */
4428 	c->lmc_lgrp = lp;
4429 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4430 	c->lmc_cnt = cnt - 1;
4431 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4432 	c->lmc_ntried++;
4433 
4434 	return (mnode);
4435 }
4436