xref: /illumos-gate/usr/src/uts/common/os/lgrp.c (revision 9acbbeaf2a1ffe5c14b244867d427714fab43c5c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * their children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/chip.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
/*
 * Core bookkeeping for the lgroup table.
 *
 * lgrp_gen is incremented atomically by lgrp_config() for the events that
 * add or remove CPU or memory resources, and for LGRP_CONFIG_GEN_UPDATE.
 */
lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
/* nlgrps grows in lgrp_root_init()/lgrp_create(), shrinks in lgrp_destroy() */
int	nlgrps;			/* number of lgroups in machine */
/*
 * lgrp_alloc_hint stays -1 until lgrp_destroy() releases a slot; from then
 * on lgrp_create() begins its search for a reusable slot at the hint.
 */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
/* raised by lgrp_create(); used as the upper bound when scanning lgrp_table */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

/*
 * One shared buffer and one shared lock: lgrp_kstat_create() points every
 * lgroup kstat's ks_data at lgrp_kstat_data and its ks_lock at
 * lgrp_kstat_mutex.
 */
static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;
126 
127 
/*
 * max number of lgroups supported by the platform
 *
 * Set by lgrp_init() from lgrp_plat_max_lgrps(); lgrp_create() panics if
 * the number of lgroups would exceed it.
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 *
 * NULL until lgrp_root_init() points it at the statically allocated lroot.
 */
lgrp_t		*lgrp_root = NULL;
138 
/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
/* set to lpl_bootstrap_list by lgrp_root_init() */
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

/* static storage for the root lgroup; lgrp_root_init() aims lgrp_root here */
static lgrp_t	lroot;
170 
/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 *
 * lgrp_main_init() resets this to LGRP_MEM_POLICY_NEXT if it holds a value
 * outside the valid policy range.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
/*
 * lgroup CPU event handlers
 *
 * lgrp_cpu_init()/lgrp_cpu_fini() are driven from lgrp_config() for the
 * LGRP_CONFIG_CPU_ONLINE/LGRP_CONFIG_CPU_OFFLINE events and from
 * lgrp_main_init() when cpu0 has to change lgroups.
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/* handler for LGRP_CONFIG_LATENCY_CHANGE */
static void	lgrp_latency_change(u_longlong_t, u_longlong_t);

/*
 * lgroup memory event handlers
 *
 * Invoked from lgrp_config() for the LGRP_CONFIG_MEM_* events.
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

/* creates the root lgroup and the bootstrap lpl list; see lgrp_setup() */
static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);
246 
/*
 * defines for lpl topology verifier return codes
 *
 * LPL_TOPO_CORRECT (0) means the topology verified cleanly; lgrp_config()
 * panics on any other value returned by lpl_topo_verify().  The negative
 * codes are parenthesized so that each macro always expands to a single
 * primary expression regardless of the surrounding operators.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		(-1)
#define	LPL_TOPO_CPUS_NOT_EMPTY			(-2)
#define	LPL_TOPO_LGRP_MISMATCH			(-3)
#define	LPL_TOPO_MISSING_PARENT			(-4)
#define	LPL_TOPO_PARENT_MISMATCH		(-5)
#define	LPL_TOPO_BAD_CPUCNT			(-6)
#define	LPL_TOPO_RSET_MISMATCH			(-7)
#define	LPL_TOPO_LPL_ORPHANED			(-8)
#define	LPL_TOPO_LPL_BAD_NCPU			(-9)
#define	LPL_TOPO_RSET_MSSNG_LF			(-10)
#define	LPL_TOPO_CPU_HAS_BAD_LPL		(-11)
#define	LPL_TOPO_BOGUS_HINT			(-12)
#define	LPL_TOPO_NONLEAF_HAS_CPUS		(-13)
#define	LPL_TOPO_LGRP_NOT_LEAF			(-14)
#define	LPL_TOPO_BAD_RSETCNT			(-15)
267 
268 /*
269  * Return whether lgroup optimizations should be enabled on this system
270  */
271 int
272 lgrp_optimizations(void)
273 {
274 	/*
275 	 * System must have more than 2 lgroups to enable lgroup optimizations
276 	 *
277 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
278 	 * with one child lgroup containing all the resources. A 2 lgroup
279 	 * system with a root lgroup directly containing CPUs or memory might
280 	 * need lgroup optimizations with its child lgroup, but there
281 	 * isn't such a machine for now....
282 	 */
283 	if (nlgrps > 2)
284 		return (1);
285 
286 	return (0);
287 }
288 
289 /*
290  * Build full lgroup topology
291  */
292 static void
293 lgrp_root_init(void)
294 {
295 	lgrp_handle_t	hand;
296 	int		i;
297 	lgrp_id_t	id;
298 
299 	/*
300 	 * Create the "root" lgroup
301 	 */
302 	ASSERT(nlgrps == 0);
303 	id = nlgrps++;
304 
305 	lgrp_root = &lroot;
306 
307 	lgrp_root->lgrp_cpu = NULL;
308 	lgrp_root->lgrp_mnodes = 0;
309 	lgrp_root->lgrp_nmnodes = 0;
310 	hand = lgrp_plat_root_hand();
311 	lgrp_root->lgrp_plathand = hand;
312 
313 	lgrp_root->lgrp_id = id;
314 	lgrp_root->lgrp_cpucnt = 0;
315 	lgrp_root->lgrp_childcnt = 0;
316 	klgrpset_clear(lgrp_root->lgrp_children);
317 	klgrpset_clear(lgrp_root->lgrp_leaves);
318 	lgrp_root->lgrp_parent = NULL;
319 	lgrp_root->lgrp_chips = NULL;
320 	lgrp_root->lgrp_chipcnt = 0;
321 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
322 
323 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
324 		klgrpset_clear(lgrp_root->lgrp_set[i]);
325 
326 	lgrp_root->lgrp_kstat = NULL;
327 
328 	lgrp_table[id] = lgrp_root;
329 
330 	/*
331 	 * Setup initial lpl list for CPU0 and initial t0 home.
332 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
333 	 * all topology operations until cp_default is initialized at which
334 	 * point t0.t_lpl will be updated.
335 	 */
336 	lpl_bootstrap = lpl_bootstrap_list;
337 	t0.t_lpl = lpl_bootstrap;
338 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
339 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
340 	cp_default.cp_lgrploads = lpl_bootstrap;
341 }
342 
343 /*
344  * Initialize the lgroup framework and allow the platform to do the same
345  */
346 void
347 lgrp_init(void)
348 {
349 	/*
350 	 * Initialize the platform
351 	 */
352 	lgrp_plat_init();
353 
354 	/*
355 	 * Set max number of lgroups supported on this platform which must be
356 	 * less than the max number of lgroups supported by the common lgroup
357 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
358 	 */
359 	nlgrpsmax = lgrp_plat_max_lgrps();
360 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
361 }
362 
363 /*
364  * Create the root and cpu0's lgroup, and set t0's home.
365  */
366 void
367 lgrp_setup(void)
368 {
369 	/*
370 	 * Setup the root lgroup
371 	 */
372 	lgrp_root_init();
373 
374 	/*
375 	 * Add cpu0 to an lgroup
376 	 */
377 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
378 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
379 }
380 
/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized. The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * true when lgrp initialization has been completed.
 *
 * Several routines in this file (eg. lgrp_create(), lgrp_destroy()) only
 * insist on cpu_lock being held once this flag is set.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;
400 
401 /*
402  * Init routine called after startup(), /etc/system has been processed,
403  * and cpu0 has been added to an lgroup.
404  */
405 void
406 lgrp_main_init(void)
407 {
408 	cpu_t		*cp = CPU;
409 	lgrp_id_t	lgrpid;
410 	int		i;
411 	/*
412 	 * Enforce a valid lgrp_mem_default_policy
413 	 */
414 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
415 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
416 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
417 
418 	/*
419 	 * See if mpo should be disabled.
420 	 * This may happen in the case of null proc LPA on Starcat.
421 	 * The platform won't be able to detect null proc LPA until after
422 	 * cpu0 and memory have already been added to lgroups.
423 	 * When and if it is detected, the Starcat platform will return
424 	 * a different platform handle for cpu0 which is what we check for
425 	 * here. If mpo should be disabled move cpu0 to it's rightful place
426 	 * (the root), and destroy the remaining lgroups. This effectively
427 	 * provides an UMA lgroup topology.
428 	 */
429 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
430 	if (lgrp_table[lgrpid]->lgrp_plathand !=
431 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
432 		lgrp_part_del_cpu(cp);
433 		lgrp_cpu_fini(cp, lgrpid);
434 
435 		lgrp_cpu_init(cp);
436 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
437 
438 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
439 
440 		/*
441 		 * Destroy all lgroups except for root
442 		 */
443 		for (i = 0; i <= lgrp_alloc_max; i++) {
444 			if (LGRP_EXISTS(lgrp_table[i]) &&
445 			    lgrp_table[i] != lgrp_root)
446 				lgrp_destroy(lgrp_table[i]);
447 		}
448 
449 		/*
450 		 * Fix up root to point at itself for leaves and resources
451 		 * and not have any children
452 		 */
453 		lgrp_root->lgrp_childcnt = 0;
454 		klgrpset_clear(lgrp_root->lgrp_children);
455 		klgrpset_clear(lgrp_root->lgrp_leaves);
456 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
457 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
458 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
459 	}
460 
461 	/*
462 	 * Initialize kstats framework.
463 	 */
464 	lgrp_kstat_init();
465 	/*
466 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
467 	 */
468 	mutex_enter(&cpu_lock);
469 	lgrp_kstat_create(cp);
470 	mutex_exit(&cpu_lock);
471 
472 	lgrp_plat_main_init();
473 	lgrp_initialized = 1;
474 }
475 
476 /*
477  * Finish lgrp initialization after all CPUS are brought on-line.
478  * This routine is called after start_other_cpus().
479  */
480 void
481 lgrp_main_mp_init(void)
482 {
483 	klgrpset_t changed;
484 
485 	/*
486 	 * Update lgroup topology (if necessary)
487 	 */
488 	klgrpset_clear(changed);
489 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
490 	lgrp_topo_initialized = 1;
491 }
492 
493 /*
494  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
495  */
496 void
497 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
498 {
499 	klgrpset_t	changed;
500 	cpu_t		*cp;
501 	lgrp_id_t	id;
502 	int		rc;
503 
504 	switch (event) {
505 	/*
506 	 * The following (re)configuration events are common code
507 	 * initiated. lgrp_plat_config() is called here to inform the
508 	 * platform of the reconfiguration event.
509 	 */
510 	case LGRP_CONFIG_CPU_ADD:
511 		cp = (cpu_t *)resource;
512 
513 		/*
514 		 * Initialize the new CPU's lgrp related next/prev
515 		 * links, and give it a bootstrap lpl so that it can
516 		 * survive should it need to enter the dispatcher.
517 		 */
518 		cp->cpu_next_lpl = cp;
519 		cp->cpu_prev_lpl = cp;
520 		cp->cpu_next_lgrp = cp;
521 		cp->cpu_prev_lgrp = cp;
522 		cp->cpu_lpl = lpl_bootstrap;
523 
524 		lgrp_plat_config(event, resource);
525 		atomic_add_32(&lgrp_gen, 1);
526 
527 		break;
528 	case LGRP_CONFIG_CPU_DEL:
529 		lgrp_plat_config(event, resource);
530 		atomic_add_32(&lgrp_gen, 1);
531 
532 		break;
533 	case LGRP_CONFIG_CPU_ONLINE:
534 		cp = (cpu_t *)resource;
535 		lgrp_cpu_init(cp);
536 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
537 		rc = lpl_topo_verify(cp->cpu_part);
538 		if (rc != LPL_TOPO_CORRECT) {
539 			panic("lpl_topo_verify failed: %d", rc);
540 		}
541 		lgrp_plat_config(event, resource);
542 		atomic_add_32(&lgrp_gen, 1);
543 
544 		break;
545 	case LGRP_CONFIG_CPU_OFFLINE:
546 		cp = (cpu_t *)resource;
547 		id = cp->cpu_lpl->lpl_lgrpid;
548 		lgrp_part_del_cpu(cp);
549 		lgrp_cpu_fini(cp, id);
550 		rc = lpl_topo_verify(cp->cpu_part);
551 		if (rc != LPL_TOPO_CORRECT) {
552 			panic("lpl_topo_verify failed: %d", rc);
553 		}
554 		lgrp_plat_config(event, resource);
555 		atomic_add_32(&lgrp_gen, 1);
556 
557 		break;
558 	case LGRP_CONFIG_CPUPART_ADD:
559 		cp = (cpu_t *)resource;
560 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
561 		rc = lpl_topo_verify(cp->cpu_part);
562 		if (rc != LPL_TOPO_CORRECT) {
563 			panic("lpl_topo_verify failed: %d", rc);
564 		}
565 		lgrp_plat_config(event, resource);
566 
567 		break;
568 	case LGRP_CONFIG_CPUPART_DEL:
569 		cp = (cpu_t *)resource;
570 		lgrp_part_del_cpu((cpu_t *)resource);
571 		rc = lpl_topo_verify(cp->cpu_part);
572 		if (rc != LPL_TOPO_CORRECT) {
573 			panic("lpl_topo_verify failed: %d", rc);
574 		}
575 		lgrp_plat_config(event, resource);
576 
577 		break;
578 	/*
579 	 * The following events are initiated by the memnode
580 	 * subsystem.
581 	 */
582 	case LGRP_CONFIG_MEM_ADD:
583 		lgrp_mem_init((int)resource, where, B_FALSE);
584 		atomic_add_32(&lgrp_gen, 1);
585 
586 		break;
587 	case LGRP_CONFIG_MEM_DEL:
588 		lgrp_mem_fini((int)resource, where, B_FALSE);
589 		atomic_add_32(&lgrp_gen, 1);
590 
591 		break;
592 	case LGRP_CONFIG_MEM_RENAME: {
593 		lgrp_config_mem_rename_t *ren_arg =
594 		    (lgrp_config_mem_rename_t *)where;
595 
596 		lgrp_mem_rename((int)resource,
597 		    ren_arg->lmem_rename_from,
598 		    ren_arg->lmem_rename_to);
599 		atomic_add_32(&lgrp_gen, 1);
600 
601 		break;
602 	}
603 	case LGRP_CONFIG_GEN_UPDATE:
604 		atomic_add_32(&lgrp_gen, 1);
605 
606 		break;
607 	case LGRP_CONFIG_FLATTEN:
608 		if (where == 0)
609 			lgrp_topo_levels = (int)resource;
610 		else
611 			(void) lgrp_topo_flatten(resource,
612 			    lgrp_table, lgrp_alloc_max, &changed);
613 
614 		break;
615 	/*
616 	 * Initiated by platform latency probing code
617 	 */
618 	case LGRP_CONFIG_LATENCY_CHANGE:
619 		lgrp_latency_change((u_longlong_t)resource,
620 		    (u_longlong_t)where);
621 
622 		break;
623 	case LGRP_CONFIG_NOP:
624 
625 		break;
626 	default:
627 		break;
628 	}
629 
630 }
631 
632 /*
633  * Called to add lgrp info into cpu structure from cpu_add_unit;
634  * do not assume cpu is in cpu[] yet!
635  *
636  * CPUs are brought online with all other CPUs paused so we can't
637  * allocate memory or we could deadlock the system, so we rely on
638  * the platform to statically allocate as much space as we need
639  * for the lgrp structs and stats.
640  */
641 static void
642 lgrp_cpu_init(struct cpu *cp)
643 {
644 	klgrpset_t	changed;
645 	int		count;
646 	lgrp_handle_t	hand;
647 	int		first_cpu;
648 	lgrp_t		*my_lgrp;
649 	lgrp_id_t	lgrpid;
650 	struct cpu	*cptr;
651 	struct chip	*chp;
652 
653 	/*
654 	 * This is the first time through if the resource set
655 	 * for the root lgroup is empty. After cpu0 has been
656 	 * initially added to an lgroup, the root's CPU resource
657 	 * set can never be empty, since the system's last CPU
658 	 * cannot be offlined.
659 	 */
660 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
661 		/*
662 		 * First time through.
663 		 */
664 		first_cpu = 1;
665 	} else {
666 		/*
667 		 * If cpu0 needs to move lgroups, we may come
668 		 * through here again, at which time cpu_lock won't
669 		 * be held, and lgrp_initialized will be false.
670 		 */
671 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
672 		ASSERT(cp->cpu_part != NULL);
673 		first_cpu = 0;
674 	}
675 
676 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
677 	my_lgrp = lgrp_hand_to_lgrp(hand);
678 
679 	if (my_lgrp == NULL) {
680 		/*
681 		 * Create new lgrp and add it to lgroup topology
682 		 */
683 		my_lgrp = lgrp_create();
684 		my_lgrp->lgrp_plathand = hand;
685 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
686 		lgrpid = my_lgrp->lgrp_id;
687 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
688 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
689 
690 		count = 0;
691 		klgrpset_clear(changed);
692 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
693 		    &changed);
694 		/*
695 		 * May have added new intermediate lgroups, so need to add
696 		 * resources other than CPUs which are added below
697 		 */
698 		(void) lgrp_mnode_update(changed, NULL);
699 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
700 	    > 0) {
701 		/*
702 		 * Leaf lgroup was created, but latency wasn't available
703 		 * then.  So, set latency for it and fill in rest of lgroup
704 		 * topology  now that we know how far it is from other leaf
705 		 * lgroups.
706 		 */
707 		lgrpid = my_lgrp->lgrp_id;
708 		klgrpset_clear(changed);
709 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
710 		    lgrpid))
711 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
712 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
713 		    &changed);
714 
715 		/*
716 		 * May have added new intermediate lgroups, so need to add
717 		 * resources other than CPUs which are added below
718 		 */
719 		(void) lgrp_mnode_update(changed, NULL);
720 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
721 	    my_lgrp->lgrp_id)) {
722 		int	i;
723 
724 		/*
725 		 * Update existing lgroup and lgroups containing it with CPU
726 		 * resource
727 		 */
728 		lgrpid = my_lgrp->lgrp_id;
729 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
730 		for (i = 0; i <= lgrp_alloc_max; i++) {
731 			lgrp_t		*lgrp;
732 
733 			lgrp = lgrp_table[i];
734 			if (!LGRP_EXISTS(lgrp) ||
735 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
736 				continue;
737 
738 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
739 		}
740 	}
741 
742 	lgrpid = my_lgrp->lgrp_id;
743 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
744 
745 	/*
746 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
747 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
748 	 * not since none of lgroup IDs in the lpl's have been set yet.
749 	 */
750 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
751 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
752 
753 	/*
754 	 * link the CPU into the lgrp's CPU list
755 	 */
756 	if (my_lgrp->lgrp_cpucnt == 0) {
757 		my_lgrp->lgrp_cpu = cp;
758 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
759 	} else {
760 		cptr = my_lgrp->lgrp_cpu;
761 		cp->cpu_next_lgrp = cptr;
762 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
763 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
764 		cptr->cpu_prev_lgrp = cp;
765 	}
766 	my_lgrp->lgrp_cpucnt++;
767 
768 	/*
769 	 * Add this cpu's chip to the per lgroup list
770 	 * if necessary
771 	 */
772 	if (cp->cpu_chip->chip_lgrp == NULL) {
773 		struct chip *lcpr;
774 
775 		chp = cp->cpu_chip;
776 
777 		if (my_lgrp->lgrp_chipcnt == 0) {
778 			my_lgrp->lgrp_chips = chp;
779 			chp->chip_next_lgrp =
780 			    chp->chip_prev_lgrp = chp;
781 		} else {
782 			lcpr = my_lgrp->lgrp_chips;
783 			chp->chip_next_lgrp = lcpr;
784 			chp->chip_prev_lgrp =
785 			    lcpr->chip_prev_lgrp;
786 			lcpr->chip_prev_lgrp->chip_next_lgrp =
787 			    chp;
788 			lcpr->chip_prev_lgrp = chp;
789 		}
790 		chp->chip_lgrp = my_lgrp;
791 		chp->chip_balance = chp->chip_next_lgrp;
792 		my_lgrp->lgrp_chipcnt++;
793 	}
794 }
795 
796 lgrp_t *
797 lgrp_create(void)
798 {
799 	lgrp_t		*my_lgrp;
800 	lgrp_id_t	lgrpid;
801 	int		i;
802 
803 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
804 
805 	/*
806 	 * Find an open slot in the lgroup table and recycle unused lgroup
807 	 * left there if any
808 	 */
809 	my_lgrp = NULL;
810 	if (lgrp_alloc_hint == -1)
811 		/*
812 		 * Allocate from end when hint not set yet because no lgroups
813 		 * have been deleted yet
814 		 */
815 		lgrpid = nlgrps++;
816 	else {
817 		/*
818 		 * Start looking for next open slot from hint and leave hint
819 		 * at slot allocated
820 		 */
821 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
822 			my_lgrp = lgrp_table[i];
823 			if (!LGRP_EXISTS(my_lgrp)) {
824 				lgrpid = i;
825 				nlgrps++;
826 				break;
827 			}
828 		}
829 		lgrp_alloc_hint = lgrpid;
830 	}
831 
832 	/*
833 	 * Keep track of max lgroup ID allocated so far to cut down on searches
834 	 */
835 	if (lgrpid > lgrp_alloc_max)
836 		lgrp_alloc_max = lgrpid;
837 
838 	/*
839 	 * Need to allocate new lgroup if next open slot didn't have one
840 	 * for recycling
841 	 */
842 	if (my_lgrp == NULL)
843 		my_lgrp = lgrp_plat_alloc(lgrpid);
844 
845 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
846 		panic("Too many lgrps for platform (%d)", nlgrps);
847 
848 	my_lgrp->lgrp_id = lgrpid;
849 	my_lgrp->lgrp_latency = 0;
850 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
851 	my_lgrp->lgrp_parent = NULL;
852 	my_lgrp->lgrp_childcnt = 0;
853 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
854 	my_lgrp->lgrp_nmnodes = 0;
855 	klgrpset_clear(my_lgrp->lgrp_children);
856 	klgrpset_clear(my_lgrp->lgrp_leaves);
857 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
858 		klgrpset_clear(my_lgrp->lgrp_set[i]);
859 
860 	my_lgrp->lgrp_cpu = NULL;
861 	my_lgrp->lgrp_cpucnt = 0;
862 	my_lgrp->lgrp_chips = NULL;
863 	my_lgrp->lgrp_chipcnt = 0;
864 
865 	if (my_lgrp->lgrp_kstat != NULL)
866 		lgrp_kstat_reset(lgrpid);
867 
868 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
869 
870 	return (my_lgrp);
871 }
872 
873 void
874 lgrp_destroy(lgrp_t *lgrp)
875 {
876 	int		i;
877 
878 	/*
879 	 * Unless this lgroup is being destroyed on behalf of
880 	 * the boot CPU, cpu_lock must be held
881 	 */
882 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
883 
884 	if (nlgrps == 1)
885 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
886 
887 	if (!LGRP_EXISTS(lgrp))
888 		return;
889 
890 	/*
891 	 * Set hint to lgroup being deleted and try to keep lower numbered
892 	 * hints to facilitate finding empty slots
893 	 */
894 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
895 		lgrp_alloc_hint = lgrp->lgrp_id;
896 
897 	/*
898 	 * Mark this lgroup to be recycled by setting its lgroup ID to
899 	 * LGRP_NONE and clear relevant fields
900 	 */
901 	lgrp->lgrp_id = LGRP_NONE;
902 	lgrp->lgrp_latency = 0;
903 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
904 	lgrp->lgrp_parent = NULL;
905 	lgrp->lgrp_childcnt = 0;
906 
907 	klgrpset_clear(lgrp->lgrp_children);
908 	klgrpset_clear(lgrp->lgrp_leaves);
909 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
910 		klgrpset_clear(lgrp->lgrp_set[i]);
911 
912 	lgrp->lgrp_mnodes = (mnodeset_t)0;
913 	lgrp->lgrp_nmnodes = 0;
914 
915 	lgrp->lgrp_cpu = NULL;
916 	lgrp->lgrp_cpucnt = 0;
917 	lgrp->lgrp_chipcnt = 0;
918 	lgrp->lgrp_chips = NULL;
919 
920 	nlgrps--;
921 }
922 
923 /*
924  * Initialize kstat data. Called from lgrp intialization code.
925  */
926 static void
927 lgrp_kstat_init(void)
928 {
929 	lgrp_stat_t	stat;
930 
931 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
932 
933 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
934 		kstat_named_init(&lgrp_kstat_data[stat],
935 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
936 }
937 
938 /*
939  * initialize an lgrp's kstats if needed
940  * called with cpu_lock held but not with cpus paused.
941  * we don't tear these down now because we don't know about
942  * memory leaving the lgrp yet...
943  */
944 
945 void
946 lgrp_kstat_create(cpu_t *cp)
947 {
948 	kstat_t		*lgrp_kstat;
949 	lgrp_id_t	lgrpid;
950 	lgrp_t		*my_lgrp;
951 
952 	ASSERT(MUTEX_HELD(&cpu_lock));
953 
954 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
955 	my_lgrp = lgrp_table[lgrpid];
956 
957 	if (my_lgrp->lgrp_kstat != NULL)
958 		return; /* already initialized */
959 
960 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
961 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
962 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
963 
964 	if (lgrp_kstat != NULL) {
965 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
966 		lgrp_kstat->ks_private = my_lgrp;
967 		lgrp_kstat->ks_data = &lgrp_kstat_data;
968 		lgrp_kstat->ks_update = lgrp_kstat_extract;
969 		my_lgrp->lgrp_kstat = lgrp_kstat;
970 		kstat_install(lgrp_kstat);
971 	}
972 }
973 
974 /*
975  * this will do something when we manage to remove now unused lgrps
976  */
977 
978 /* ARGSUSED */
979 void
980 lgrp_kstat_destroy(cpu_t *cp)
981 {
982 	ASSERT(MUTEX_HELD(&cpu_lock));
983 }
984 
985 /*
986  * Called when a CPU is off-lined.
987  */
988 static void
989 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
990 {
991 	lgrp_t *my_lgrp;
992 	struct cpu *prev;
993 	struct cpu *next;
994 	chip_t  *chp;
995 
996 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
997 
998 	prev = cp->cpu_prev_lgrp;
999 	next = cp->cpu_next_lgrp;
1000 
1001 	prev->cpu_next_lgrp = next;
1002 	next->cpu_prev_lgrp = prev;
1003 
1004 	/*
1005 	 * just because I'm paranoid doesn't mean...
1006 	 */
1007 
1008 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1009 
1010 	my_lgrp = lgrp_table[lgrpid];
1011 	my_lgrp->lgrp_cpucnt--;
1012 
1013 	/*
1014 	 * If the last CPU on it's chip is being offlined
1015 	 * then remove this chip from the per lgroup list.
1016 	 *
1017 	 * This is also done for the boot CPU when it needs
1018 	 * to move between lgroups as a consequence of
1019 	 * null proc lpa.
1020 	 */
1021 	chp = cp->cpu_chip;
1022 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
1023 
1024 		chip_t	*chpp;
1025 
1026 		if (--my_lgrp->lgrp_chipcnt == 0)
1027 			my_lgrp->lgrp_chips = NULL;
1028 		else if (my_lgrp->lgrp_chips == chp)
1029 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
1030 
1031 		/*
1032 		 * Walk this lgroup's chip list looking for chips that
1033 		 * may try to balance against the one that's leaving
1034 		 */
1035 		for (chpp = chp->chip_next_lgrp; chpp != chp;
1036 		    chpp = chpp->chip_next_lgrp) {
1037 			if (chpp->chip_balance == chp)
1038 				chpp->chip_balance = chp->chip_next_lgrp;
1039 		}
1040 
1041 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1042 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1043 
1044 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1045 		chp->chip_lgrp = NULL;
1046 		chp->chip_balance = NULL;
1047 	}
1048 
1049 	/*
1050 	 * Removing last CPU in lgroup, so update lgroup topology
1051 	 */
1052 	if (my_lgrp->lgrp_cpucnt == 0) {
1053 		klgrpset_t	changed;
1054 		int		count;
1055 		int		i;
1056 
1057 		my_lgrp->lgrp_cpu = NULL;
1058 
1059 		/*
1060 		 * Remove this lgroup from its lgroup CPU resources and remove
1061 		 * lgroup from lgroup topology if it doesn't have any more
1062 		 * resources in it now
1063 		 */
1064 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1065 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1066 			count = 0;
1067 			klgrpset_clear(changed);
1068 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1069 			    lgrp_alloc_max + 1, &changed);
1070 			return;
1071 		}
1072 
1073 		/*
1074 		 * This lgroup isn't empty, so just remove it from CPU
1075 		 * resources of any lgroups that contain it as such
1076 		 */
1077 		for (i = 0; i <= lgrp_alloc_max; i++) {
1078 			lgrp_t		*lgrp;
1079 
1080 			lgrp = lgrp_table[i];
1081 			if (!LGRP_EXISTS(lgrp) ||
1082 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1083 			    lgrpid))
1084 				continue;
1085 
1086 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1087 		}
1088 		return;
1089 	}
1090 
1091 	if (my_lgrp->lgrp_cpu == cp)
1092 		my_lgrp->lgrp_cpu = next;
1093 
1094 }
1095 
1096 /*
1097  * Update memory nodes in target lgroups and return ones that get changed
1098  */
1099 int
1100 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1101 {
1102 	int	count;
1103 	int	i;
1104 	int	j;
1105 	lgrp_t	*lgrp;
1106 	lgrp_t	*lgrp_rsrc;
1107 
1108 	count = 0;
1109 	if (changed)
1110 		klgrpset_clear(*changed);
1111 
1112 	if (klgrpset_isempty(target))
1113 		return (0);
1114 
1115 	/*
1116 	 * Find each lgroup in target lgroups
1117 	 */
1118 	for (i = 0; i <= lgrp_alloc_max; i++) {
1119 		/*
1120 		 * Skip any lgroups that don't exist or aren't in target group
1121 		 */
1122 		lgrp = lgrp_table[i];
1123 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1124 			continue;
1125 		}
1126 
1127 		/*
1128 		 * Initialize memnodes for intermediate lgroups to 0
1129 		 * and update them from scratch since they may have completely
1130 		 * changed
1131 		 */
1132 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1133 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1134 			lgrp->lgrp_nmnodes = 0;
1135 		}
1136 
1137 		/*
1138 		 * Update memory nodes of of target lgroup with memory nodes
1139 		 * from each lgroup in its lgroup memory resource set
1140 		 */
1141 		for (j = 0; j <= lgrp_alloc_max; j++) {
1142 			int	k;
1143 
1144 			/*
1145 			 * Skip any lgroups that don't exist or aren't in
1146 			 * memory resources of target lgroup
1147 			 */
1148 			lgrp_rsrc = lgrp_table[j];
1149 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1150 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1151 			    j))
1152 				continue;
1153 
1154 			/*
1155 			 * Update target lgroup's memnodes to include memnodes
1156 			 * of this lgroup
1157 			 */
1158 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1159 				mnodeset_t	mnode_mask;
1160 
1161 				mnode_mask = (mnodeset_t)1 << k;
1162 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1163 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1164 					lgrp->lgrp_mnodes |= mnode_mask;
1165 					lgrp->lgrp_nmnodes++;
1166 				}
1167 			}
1168 			count++;
1169 			if (changed)
1170 				klgrpset_add(*changed, lgrp->lgrp_id);
1171 		}
1172 	}
1173 
1174 	return (count);
1175 }
1176 
1177 /*
1178  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1179  * is moved from one board to another. The "from" and "to" arguments specify the
1180  * source and the destination of the move.
1181  *
1182  * See plat_lgrp_config() for a detailed description of the copy-rename
1183  * semantics.
1184  *
1185  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1186  * the lgroup topology which is changing as memory moves from one lgroup to
1187  * another. It removes the mnode from the source lgroup and re-inserts it in the
1188  * target lgroup.
1189  *
1190  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1191  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1192  * copy-rename operation.
1193  *
1194  * There is one case which requires special handling. If the system contains
1195  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1196  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1197  * lgrp_mem_init), but there is a window when the system has no memory in the
1198  * lgroup hierarchy. If another thread tries to allocate memory during this
1199  * window, the allocation will fail, although the system has physical memory.
1200  * This may cause a system panic or a deadlock (some sleeping memory allocations
1201  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1202  * the mnode back).
1203  *
1204  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1205  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1206  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1207  * but it updates the rest of the lgroup topology as if the mnode was actually
1208  * removed. The lgrp_mem_init() function recognizes that the mnode being
1209  * inserted represents such a special case and updates the topology
1210  * appropriately.
1211  */
1212 void
1213 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1214 {
1215 	/*
1216 	 * Remove the memory from the source node and add it to the destination
1217 	 * node.
1218 	 */
1219 	lgrp_mem_fini(mnode, from, B_TRUE);
1220 	lgrp_mem_init(mnode, to, B_TRUE);
1221 }
1222 
1223 /*
1224  * Called to indicate that the lgrp with platform handle "hand" now
1225  * contains the memory identified by "mnode".
1226  *
1227  * LOCKING for this routine is a bit tricky. Usually it is called without
1228  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1229  * callers. During DR of the board containing the caged memory it may be called
1230  * with cpu_lock already held and CPUs paused.
1231  *
1232  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1233  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1234  * dealing with the special case of DR copy-rename described in
1235  * lgrp_mem_rename().
1236  */
1237 void
1238 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1239 {
1240 	klgrpset_t	changed;
1241 	int		count;
1242 	int		i;
1243 	lgrp_t		*my_lgrp;
1244 	lgrp_id_t	lgrpid;
1245 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1246 	boolean_t	drop_lock = B_FALSE;
1247 	boolean_t	need_synch = B_FALSE;
1248 
1249 	/*
1250 	 * Grab CPU lock (if we haven't already)
1251 	 */
1252 	if (!MUTEX_HELD(&cpu_lock)) {
1253 		mutex_enter(&cpu_lock);
1254 		drop_lock = B_TRUE;
1255 	}
1256 
1257 	/*
1258 	 * This routine may be called from a context where we already
1259 	 * hold cpu_lock, and have already paused cpus.
1260 	 */
1261 	if (!cpus_paused())
1262 		need_synch = B_TRUE;
1263 
1264 	/*
1265 	 * Check if this mnode is already configured and return immediately if
1266 	 * it is.
1267 	 *
1268 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1269 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1270 	 * recognize this case and continue as usual, but skip the update to
1271 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1272 	 * in topology, temporarily introduced by lgrp_mem_fini().
1273 	 */
1274 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1275 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1276 		if (drop_lock)
1277 			mutex_exit(&cpu_lock);
1278 		return;
1279 	}
1280 
1281 	/*
1282 	 * Update lgroup topology with new memory resources, keeping track of
1283 	 * which lgroups change
1284 	 */
1285 	count = 0;
1286 	klgrpset_clear(changed);
1287 	my_lgrp = lgrp_hand_to_lgrp(hand);
1288 	if (my_lgrp == NULL) {
1289 		/* new lgrp */
1290 		my_lgrp = lgrp_create();
1291 		lgrpid = my_lgrp->lgrp_id;
1292 		my_lgrp->lgrp_plathand = hand;
1293 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1294 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1295 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1296 
1297 		if (need_synch)
1298 			pause_cpus(NULL);
1299 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1300 		    &changed);
1301 		if (need_synch)
1302 			start_cpus();
1303 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1304 	    > 0) {
1305 		/*
1306 		 * Leaf lgroup was created, but latency wasn't available
1307 		 * then.  So, set latency for it and fill in rest of lgroup
1308 		 * topology  now that we know how far it is from other leaf
1309 		 * lgroups.
1310 		 */
1311 		klgrpset_clear(changed);
1312 		lgrpid = my_lgrp->lgrp_id;
1313 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1314 		    lgrpid))
1315 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1316 		if (need_synch)
1317 			pause_cpus(NULL);
1318 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1319 		    &changed);
1320 		if (need_synch)
1321 			start_cpus();
1322 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1323 	    my_lgrp->lgrp_id)) {
1324 		/*
1325 		 * Add new lgroup memory resource to existing lgroup
1326 		 */
1327 		lgrpid = my_lgrp->lgrp_id;
1328 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1329 		klgrpset_add(changed, lgrpid);
1330 		count++;
1331 		for (i = 0; i <= lgrp_alloc_max; i++) {
1332 			lgrp_t		*lgrp;
1333 
1334 			lgrp = lgrp_table[i];
1335 			if (!LGRP_EXISTS(lgrp) ||
1336 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1337 				continue;
1338 
1339 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1340 			klgrpset_add(changed, lgrp->lgrp_id);
1341 			count++;
1342 		}
1343 	}
1344 
1345 	/*
1346 	 * Add memory node to lgroup and remove lgroup from ones that need
1347 	 * to be updated
1348 	 */
1349 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1350 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1351 		my_lgrp->lgrp_nmnodes++;
1352 	}
1353 	klgrpset_del(changed, lgrpid);
1354 
1355 	/*
1356 	 * Update memory node information for all lgroups that changed and
1357 	 * contain new memory node as a resource
1358 	 */
1359 	if (count)
1360 		(void) lgrp_mnode_update(changed, NULL);
1361 
1362 	if (drop_lock)
1363 		mutex_exit(&cpu_lock);
1364 }
1365 
1366 /*
1367  * Called to indicate that the lgroup associated with the platform
1368  * handle "hand" no longer contains given memory node
1369  *
1370  * LOCKING for this routine is a bit tricky. Usually it is called without
1371  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1372  * callers. During DR of the board containing the caged memory it may be called
1373  * with cpu_lock already held and CPUs paused.
1374  *
1375  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1376  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1377  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1378  * the same mnode back into the topology. See lgrp_mem_rename() and
1379  * lgrp_mem_init() for additional details.
1380  */
1381 void
1382 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1383 {
1384 	klgrpset_t	changed;
1385 	int		count;
1386 	int		i;
1387 	lgrp_t		*my_lgrp;
1388 	lgrp_id_t	lgrpid;
1389 	mnodeset_t	mnodes_mask;
1390 	boolean_t	drop_lock = B_FALSE;
1391 	boolean_t	need_synch = B_FALSE;
1392 
1393 	/*
1394 	 * Grab CPU lock (if we haven't already)
1395 	 */
1396 	if (!MUTEX_HELD(&cpu_lock)) {
1397 		mutex_enter(&cpu_lock);
1398 		drop_lock = B_TRUE;
1399 	}
1400 
1401 	/*
1402 	 * This routine may be called from a context where we already
1403 	 * hold cpu_lock and have already paused cpus.
1404 	 */
1405 	if (!cpus_paused())
1406 		need_synch = B_TRUE;
1407 
1408 	my_lgrp = lgrp_hand_to_lgrp(hand);
1409 
1410 	/*
1411 	 * The lgrp *must* be pre-existing
1412 	 */
1413 	ASSERT(my_lgrp != NULL);
1414 
1415 	/*
1416 	 * Delete memory node from lgroups which contain it
1417 	 */
1418 	mnodes_mask = ((mnodeset_t)1 << mnode);
1419 	for (i = 0; i <= lgrp_alloc_max; i++) {
1420 		lgrp_t *lgrp = lgrp_table[i];
1421 		/*
1422 		 * Skip any non-existent lgroups and any lgroups that don't
1423 		 * contain leaf lgroup of memory as a memory resource
1424 		 */
1425 		if (!LGRP_EXISTS(lgrp) ||
1426 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1427 			continue;
1428 
1429 		/*
1430 		 * Avoid removing the last mnode from the root in the DR
1431 		 * copy-rename case. See lgrp_mem_rename() for details.
1432 		 */
1433 		if (is_copy_rename &&
1434 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1435 			continue;
1436 
1437 		/*
1438 		 * Remove memory node from lgroup.
1439 		 */
1440 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1441 		lgrp->lgrp_nmnodes--;
1442 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1443 	}
1444 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1445 
1446 	/*
1447 	 * Don't need to update lgroup topology if this lgroup still has memory.
1448 	 *
1449 	 * In the special case of DR copy-rename with the only mnode being
1450 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1451 	 * still need to update the lgroup topology.
1452 	 */
1453 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1454 	    !(is_copy_rename &&
1455 		(my_lgrp == lgrp_root) &&
1456 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1457 		if (drop_lock)
1458 			mutex_exit(&cpu_lock);
1459 		return;
1460 	}
1461 
1462 	/*
1463 	 * This lgroup does not contain any memory now
1464 	 */
1465 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1466 
1467 	/*
1468 	 * Remove this lgroup from lgroup topology if it does not contain any
1469 	 * resources now
1470 	 */
1471 	lgrpid = my_lgrp->lgrp_id;
1472 	count = 0;
1473 	klgrpset_clear(changed);
1474 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1475 		/*
1476 		 * Delete lgroup when no more resources
1477 		 */
1478 		if (need_synch)
1479 			pause_cpus(NULL);
1480 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1481 		    lgrp_alloc_max + 1, &changed);
1482 		ASSERT(count > 0);
1483 		if (need_synch)
1484 			start_cpus();
1485 	} else {
1486 		/*
1487 		 * Remove lgroup from memory resources of any lgroups that
1488 		 * contain it as such
1489 		 */
1490 		for (i = 0; i <= lgrp_alloc_max; i++) {
1491 			lgrp_t		*lgrp;
1492 
1493 			lgrp = lgrp_table[i];
1494 			if (!LGRP_EXISTS(lgrp) ||
1495 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1496 			    lgrpid))
1497 				continue;
1498 
1499 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1500 		}
1501 	}
1502 	if (drop_lock)
1503 		mutex_exit(&cpu_lock);
1504 }
1505 
1506 /*
1507  * Return lgroup with given platform handle
1508  */
1509 lgrp_t *
1510 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1511 {
1512 	int	i;
1513 	lgrp_t	*lgrp;
1514 
1515 	if (hand == LGRP_NULL_HANDLE)
1516 		return (NULL);
1517 
1518 	for (i = 0; i <= lgrp_alloc_max; i++) {
1519 		lgrp = lgrp_table[i];
1520 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1521 			return (lgrp);
1522 	}
1523 	return (NULL);
1524 }
1525 
1526 /*
1527  * Return the home lgroup of the current thread.
1528  * We must do this with kernel preemption disabled, since we don't want our
1529  * thread to be re-homed while we're poking around with its lpl, and the lpl
1530  * should never be NULL.
1531  *
1532  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1533  * is enabled because of DR.  Callers can use disable kernel preemption
1534  * around this call to guarantee that the lgroup will be valid beyond this
1535  * routine, since kernel preemption can be recursive.
1536  */
1537 lgrp_t *
1538 lgrp_home_lgrp(void)
1539 {
1540 	lgrp_t	*lgrp;
1541 	lpl_t	*lpl;
1542 
1543 	kpreempt_disable();
1544 
1545 	lpl = curthread->t_lpl;
1546 	ASSERT(lpl != NULL);
1547 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1548 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1549 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1550 
1551 	kpreempt_enable();
1552 
1553 	return (lgrp);
1554 }
1555 
1556 /*
1557  * Return ID of home lgroup for given thread
1558  * (See comments for lgrp_home_lgrp() for special care and handling
1559  * instructions)
1560  */
1561 lgrp_id_t
1562 lgrp_home_id(kthread_t *t)
1563 {
1564 	lgrp_id_t	lgrp;
1565 	lpl_t		*lpl;
1566 
1567 	ASSERT(t != NULL);
1568 	/*
1569 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1570 	 * cannot since the HAT layer can call into this routine to
1571 	 * determine the locality for its data structures in the context
1572 	 * of a page fault.
1573 	 */
1574 
1575 	kpreempt_disable();
1576 
1577 	lpl = t->t_lpl;
1578 	ASSERT(lpl != NULL);
1579 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1580 	lgrp = lpl->lpl_lgrpid;
1581 
1582 	kpreempt_enable();
1583 
1584 	return (lgrp);
1585 }
1586 
1587 /*
1588  * Return lgroup containing the physical memory for the given page frame number
1589  */
1590 lgrp_t *
1591 lgrp_pfn_to_lgrp(pfn_t pfn)
1592 {
1593 	lgrp_handle_t	hand;
1594 	int		i;
1595 	lgrp_t		*lgrp;
1596 
1597 	hand = lgrp_plat_pfn_to_hand(pfn);
1598 	if (hand != LGRP_NULL_HANDLE)
1599 		for (i = 0; i <= lgrp_alloc_max; i++) {
1600 			lgrp = lgrp_table[i];
1601 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1602 				return (lgrp);
1603 		}
1604 	return (NULL);
1605 }
1606 
1607 /*
1608  * Return lgroup containing the physical memory for the given page frame number
1609  */
1610 lgrp_t *
1611 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1612 {
1613 	lgrp_handle_t	hand;
1614 	int		i;
1615 	lgrp_t		*lgrp;
1616 	pfn_t		pfn;
1617 
1618 	pfn = btop(physaddr);
1619 	hand = lgrp_plat_pfn_to_hand(pfn);
1620 	if (hand != LGRP_NULL_HANDLE)
1621 		for (i = 0; i <= lgrp_alloc_max; i++) {
1622 			lgrp = lgrp_table[i];
1623 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1624 				return (lgrp);
1625 		}
1626 	return (NULL);
1627 }
1628 
1629 /*
1630  * Return the leaf lgroup containing the given CPU
1631  *
1632  * The caller needs to take precautions necessary to prevent
1633  * "cpu" from going away across a call to this function.
1634  * hint: kpreempt_disable()/kpreempt_enable()
1635  */
1636 static lgrp_t *
1637 lgrp_cpu_to_lgrp(cpu_t *cpu)
1638 {
1639 	return (cpu->cpu_lpl->lpl_lgrp);
1640 }
1641 
1642 /*
1643  * Return the sum of the partition loads in an lgrp divided by
1644  * the number of CPUs in the lgrp.  This is our best approximation
1645  * of an 'lgroup load average' for a useful per-lgroup kstat.
1646  */
1647 static uint64_t
1648 lgrp_sum_loadavgs(lgrp_t *lgrp)
1649 {
1650 	cpu_t *cpu;
1651 	int ncpu;
1652 	uint64_t loads = 0;
1653 
1654 	mutex_enter(&cpu_lock);
1655 
1656 	cpu = lgrp->lgrp_cpu;
1657 	ncpu = lgrp->lgrp_cpucnt;
1658 
1659 	if (cpu == NULL || ncpu == 0) {
1660 		mutex_exit(&cpu_lock);
1661 		return (0ull);
1662 	}
1663 
1664 	do {
1665 		loads += cpu->cpu_lpl->lpl_loadavg;
1666 		cpu = cpu->cpu_next_lgrp;
1667 	} while (cpu != lgrp->lgrp_cpu);
1668 
1669 	mutex_exit(&cpu_lock);
1670 
1671 	return (loads / ncpu);
1672 }
1673 
1674 void
1675 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1676 {
1677 	struct lgrp_stats *pstats;
1678 
1679 	/*
1680 	 * Verify that the caller isn't trying to add to
1681 	 * a statistic for an lgroup that has gone away
1682 	 */
1683 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1684 		return;
1685 
1686 	pstats = &lgrp_stats[lgrpid];
1687 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1688 }
1689 
1690 int64_t
1691 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1692 {
1693 	uint64_t val;
1694 	struct lgrp_stats *pstats;
1695 
1696 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1697 		return ((int64_t)0);
1698 
1699 	pstats = &lgrp_stats[lgrpid];
1700 	LGRP_STAT_READ(pstats, stat, val);
1701 	return (val);
1702 }
1703 
1704 /*
1705  * Reset all kstats for lgrp specified by its lgrpid.
1706  */
1707 static void
1708 lgrp_kstat_reset(lgrp_id_t lgrpid)
1709 {
1710 	lgrp_stat_t stat;
1711 
1712 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1713 		return;
1714 
1715 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1716 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1717 	}
1718 }
1719 
1720 /*
1721  * Collect all per-lgrp statistics for the lgrp associated with this
1722  * kstat, and store them in the ks_data array.
1723  *
1724  * The superuser can reset all the running counter statistics for an
1725  * lgrp by writing to any of the lgrp's stats.
1726  */
1727 static int
1728 lgrp_kstat_extract(kstat_t *ksp, int rw)
1729 {
1730 	lgrp_stat_t		stat;
1731 	struct kstat_named	*ksd;
1732 	lgrp_t			*lgrp;
1733 	lgrp_id_t		lgrpid;
1734 
1735 	lgrp = (lgrp_t *)ksp->ks_private;
1736 
1737 	ksd = (struct kstat_named *)ksp->ks_data;
1738 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1739 
1740 	lgrpid = lgrp->lgrp_id;
1741 
1742 	if (lgrpid == LGRP_NONE) {
1743 		/*
1744 		 * Return all zeroes as stats for freed lgrp.
1745 		 */
1746 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1747 			ksd[stat].value.i64 = 0;
1748 		}
1749 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1750 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1751 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1752 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1753 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1754 	} else if (rw != KSTAT_WRITE) {
1755 		/*
1756 		 * Handle counter stats
1757 		 */
1758 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1759 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1760 		}
1761 
1762 		/*
1763 		 * Handle kernel data snapshot stats
1764 		 */
1765 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1766 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1767 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1768 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1769 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1770 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1771 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1772 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1773 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1774 		    lgrp_loadavg_max_effect;
1775 	} else {
1776 		lgrp_kstat_reset(lgrpid);
1777 	}
1778 
1779 	return (0);
1780 }
1781 
1782 int
1783 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1784 {
1785 	cpu_t	*cp;
1786 
1787 	mutex_enter(&cpu_lock);
1788 
1789 	if ((cp = cpu_get(id)) == NULL) {
1790 		mutex_exit(&cpu_lock);
1791 		return (EINVAL);
1792 	}
1793 
1794 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1795 		mutex_exit(&cpu_lock);
1796 		return (EINVAL);
1797 	}
1798 
1799 	ASSERT(cp->cpu_lpl != NULL);
1800 
1801 	*lp = cp->cpu_lpl->lpl_lgrpid;
1802 
1803 	mutex_exit(&cpu_lock);
1804 
1805 	return (0);
1806 }
1807 
1808 int
1809 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1810 {
1811 	cpu_t *cp;
1812 
1813 	mutex_enter(&cpu_lock);
1814 
1815 	if ((cp = cpu_get(id)) == NULL) {
1816 		mutex_exit(&cpu_lock);
1817 		return (EINVAL);
1818 	}
1819 
1820 	ASSERT(cp->cpu_lpl != NULL);
1821 
1822 	*lp = cp->cpu_lpl->lpl_loadavg;
1823 
1824 	mutex_exit(&cpu_lock);
1825 
1826 	return (0);
1827 }
1828 
1829 void
1830 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1831 {
1832 	lgrp_t		*lgrp;
1833 	int		i;
1834 
1835 	for (i = 0; i <= lgrp_alloc_max; i++) {
1836 		lgrp = lgrp_table[i];
1837 
1838 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1839 			lgrp->lgrp_latency = (int)newtime;
1840 	}
1841 }
1842 
1843 /*
1844  * Add a resource named by lpl_leaf to rset of lpl_target
1845  *
1846  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1847  * resource. It is adjusted here, as this is presently the only place that we
1848  * can be certain a resource addition has succeeded.
1849  *
1850  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1851  * list in order until it reaches a NULL.  (This list is required to be NULL
1852  * terminated, too).  This is done so that we can mark start pos + 1, so that
1853  * each lpl is traversed sequentially, but in a different order.  We hope this
1854  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1855  */
1856 
1857 void
1858 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1859 {
1860 	int		i;
1861 	int		entry_slot = 0;
1862 
1863 	/* return if leaf is already present */
1864 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1865 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1866 			return;
1867 		}
1868 
1869 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1870 		    lpl_leaf->lpl_lgrpid) {
1871 			break;
1872 		}
1873 	}
1874 
1875 	/* insert leaf, update counts */
1876 	entry_slot = i;
1877 	i = lpl_target->lpl_nrset++;
1878 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1879 		panic("More leaf lgrps in system than are supported!\n");
1880 	}
1881 
1882 	/*
1883 	 * Start at the end of the rset array and work backwards towards the
1884 	 * slot into which the new lpl will be inserted. This effectively
1885 	 * preserves the current ordering by scooting everybody over one entry,
1886 	 * and placing the new entry into the space created.
1887 	 */
1888 
1889 	while (i-- > entry_slot) {
1890 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1891 	}
1892 
1893 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1894 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1895 }
1896 
1897 /*
1898  * Update each of lpl_parent's children with a proper hint and
1899  * a reference to their parent.
1900  * The lgrp topology is used as the reference since it is fully
1901  * consistent and correct at this point.
1902  *
1903  * Each child's hint will reference an element in lpl_parent's
1904  * rset that designates where the child should start searching
1905  * for CPU resources. The hint selected is the highest order leaf present
1906  * in the child's lineage.
1907  *
1908  * This should be called after any potential change in lpl_parent's
1909  * rset.
1910  */
1911 static void
1912 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1913 {
1914 	klgrpset_t	children, leaves;
1915 	lpl_t		*lpl;
1916 	int		hint;
1917 	int		i, j;
1918 
1919 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1920 	if (klgrpset_isempty(children))
1921 		return; /* nothing to do */
1922 
1923 	for (i = 0; i <= lgrp_alloc_max; i++) {
1924 		if (klgrpset_ismember(children, i)) {
1925 
1926 			/*
1927 			 * Given the set of leaves in this child's lineage,
1928 			 * find the highest order leaf present in the parent's
1929 			 * rset. Select this as the hint for the child.
1930 			 */
1931 			leaves = lgrp_table[i]->lgrp_leaves;
1932 			hint = 0;
1933 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1934 				lpl = lpl_parent->lpl_rset[j];
1935 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1936 					hint = j;
1937 			}
1938 			cp->cp_lgrploads[i].lpl_hint = hint;
1939 
1940 			/*
1941 			 * (Re)set the parent. It may be incorrect if
1942 			 * lpl_parent is new in the topology.
1943 			 */
1944 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1945 		}
1946 	}
1947 }
1948 
1949 /*
1950  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1951  *
1952  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1953  * resource. The values are adjusted here, as this is the only place that we can
1954  * be certain a resource was successfully deleted.
1955  */
1956 void
1957 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1958 {
1959 	int i;
1960 
1961 	/* find leaf in intermediate node */
1962 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1963 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1964 			break;
1965 	}
1966 
1967 	/* return if leaf not found */
1968 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1969 		return;
1970 
1971 	/* prune leaf, compress array */
1972 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1973 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1974 	lpl_target->lpl_ncpu--;
1975 	do {
1976 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1977 	} while (i++ < lpl_target->lpl_nrset);
1978 }
1979 
1980 /*
1981  * Check to see if the resource set of the target lpl contains the
1982  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1983  */
1984 
1985 int
1986 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1987 {
1988 	int i;
1989 
1990 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1991 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1992 			return (1);
1993 	}
1994 
1995 	return (0);
1996 }
1997 
1998 /*
1999  * Called when we change cpu lpl membership.  This increments or decrements the
2000  * per-cpu counter in every lpl in which our leaf appears.
2001  */
2002 void
2003 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
2004 {
2005 	cpupart_t	*cpupart;
2006 	lgrp_t		*lgrp_leaf;
2007 	lgrp_t		*lgrp_cur;
2008 	lpl_t		*lpl_leaf;
2009 	lpl_t		*lpl_cur;
2010 	int		i;
2011 
2012 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
2013 
2014 	cpupart = cp->cpu_part;
2015 	lpl_leaf = cp->cpu_lpl;
2016 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
2017 
2018 	for (i = 0; i <= lgrp_alloc_max; i++) {
2019 		lgrp_cur = lgrp_table[i];
2020 
2021 		/*
2022 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2023 		 * for the cpu in question, or if the current lgrp and leaf
2024 		 * don't share the same resources.
2025 		 */
2026 
2027 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2028 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2029 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2030 			continue;
2031 
2032 
2033 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2034 
2035 		if (lpl_cur->lpl_nrset > 0) {
2036 			if (act == LPL_INCREMENT) {
2037 				lpl_cur->lpl_ncpu++;
2038 			} else if (act == LPL_DECREMENT) {
2039 				lpl_cur->lpl_ncpu--;
2040 			}
2041 		}
2042 	}
2043 }
2044 
2045 /*
2046  * Initialize lpl with given resources and specified lgrp
2047  */
2048 
2049 void
2050 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2051 {
2052 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2053 	lpl->lpl_loadavg = 0;
2054 	if (lpl == lpl_leaf)
2055 		lpl->lpl_ncpu = 1;
2056 	else
2057 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2058 	lpl->lpl_nrset = 1;
2059 	lpl->lpl_rset[0] = lpl_leaf;
2060 	lpl->lpl_lgrp = lgrp;
2061 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2062 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2063 }
2064 
2065 /*
2066  * Clear an unused lpl
2067  */
2068 
2069 void
2070 lpl_clear(lpl_t *lpl)
2071 {
2072 	lgrp_id_t	lid;
2073 
2074 	/* save lid for debugging purposes */
2075 	lid = lpl->lpl_lgrpid;
2076 	bzero(lpl, sizeof (lpl_t));
2077 	lpl->lpl_lgrpid = lid;
2078 }
2079 
2080 /*
2081  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2082  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2083  * make full use of all of the lgroup topology, but this checks to make sure
2084  * that for the parts that it does use, it has correctly understood the
2085  * relationships that exist. This function returns
2086  * 0 if the topology is correct, and a non-zero error code, for non-debug
2087  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2088  * debugging on a DEBUG kernel.
2089  */
2090 int
2091 lpl_topo_verify(cpupart_t *cpupart)
2092 {
2093 	lgrp_t		*lgrp;
2094 	lpl_t		*lpl;
2095 	klgrpset_t	rset;
2096 	klgrpset_t	cset;
2097 	cpu_t		*cpu;
2098 	cpu_t		*cp_start;
2099 	int		i;
2100 	int		j;
2101 	int		sum;
2102 
2103 	/* topology can't be incorrect if it doesn't exist */
2104 	if (!lgrp_topo_initialized || !lgrp_initialized)
2105 		return (LPL_TOPO_CORRECT);
2106 
2107 	ASSERT(cpupart != NULL);
2108 
2109 	for (i = 0; i <= lgrp_alloc_max; i++) {
2110 		lgrp = lgrp_table[i];
2111 		lpl = NULL;
2112 		/* make sure lpls are allocated */
2113 		ASSERT(cpupart->cp_lgrploads);
2114 		if (!cpupart->cp_lgrploads)
2115 			return (LPL_TOPO_PART_HAS_NO_LPL);
2116 
2117 		lpl = &cpupart->cp_lgrploads[i];
2118 		/* make sure our index is good */
2119 		ASSERT(i < cpupart->cp_nlgrploads);
2120 
2121 		/* if lgroup doesn't exist, make sure lpl is empty */
2122 		if (!LGRP_EXISTS(lgrp)) {
2123 			ASSERT(lpl->lpl_ncpu == 0);
2124 			if (lpl->lpl_ncpu > 0) {
2125 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2126 			} else {
2127 				continue;
2128 			}
2129 		}
2130 
2131 		/* verify that lgroup and lpl are identically numbered */
2132 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2133 
2134 		/* if lgroup isn't in our partition, make sure lpl is empty */
2135 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2136 		    cpupart->cp_lgrpset)) {
2137 			ASSERT(lpl->lpl_ncpu == 0);
2138 			if (lpl->lpl_ncpu > 0) {
2139 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2140 			}
2141 			/*
2142 			 * lpl is empty, and lgroup isn't in partition.  verify
2143 			 * that lpl doesn't show up in anyone else's rsets (in
2144 			 * this partition, anyway)
2145 			 */
2146 
2147 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2148 				lpl_t *i_lpl; /* lpl we're iterating over */
2149 
2150 				i_lpl = &cpupart->cp_lgrploads[j];
2151 
2152 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2153 				if (lpl_rset_contains(i_lpl, lpl)) {
2154 					return (LPL_TOPO_LPL_ORPHANED);
2155 				}
2156 			}
2157 			/* lgroup is empty, and everything is ok. continue */
2158 			continue;
2159 		}
2160 
2161 
2162 		/* lgroup is in this partition, now check it against lpl */
2163 
2164 		/* do both have matching lgrps? */
2165 		ASSERT(lgrp == lpl->lpl_lgrp);
2166 		if (lgrp != lpl->lpl_lgrp) {
2167 			return (LPL_TOPO_LGRP_MISMATCH);
2168 		}
2169 
2170 		/* do the parent lgroups exist and do they match? */
2171 		if (lgrp->lgrp_parent) {
2172 			ASSERT(lpl->lpl_parent);
2173 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2174 				    lpl->lpl_parent->lpl_lgrpid);
2175 
2176 			if (!lpl->lpl_parent) {
2177 				return (LPL_TOPO_MISSING_PARENT);
2178 			} else if (lgrp->lgrp_parent->lgrp_id !=
2179 			    lpl->lpl_parent->lpl_lgrpid) {
2180 				return (LPL_TOPO_PARENT_MISMATCH);
2181 			}
2182 		}
2183 
2184 		/* only leaf lgroups keep a cpucnt, only check leaves */
2185 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2186 
2187 			/* verify that lgrp is also a leaf */
2188 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2189 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2190 			    lpl->lpl_lgrpid)));
2191 
2192 			if ((lgrp->lgrp_childcnt > 0) ||
2193 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2194 			    lpl->lpl_lgrpid))) {
2195 				return (LPL_TOPO_LGRP_NOT_LEAF);
2196 			}
2197 
2198 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2199 			    (lpl->lpl_ncpu > 0));
2200 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2201 				(lpl->lpl_ncpu <= 0)) {
2202 				return (LPL_TOPO_BAD_CPUCNT);
2203 			}
2204 
2205 			/*
2206 			 * Check that lpl_ncpu also matches the number of
2207 			 * cpus in the lpl's linked list.  This only exists in
2208 			 * leaves, but they should always match.
2209 			 */
2210 			j = 0;
2211 			cpu = cp_start = lpl->lpl_cpus;
2212 			while (cpu != NULL) {
2213 				j++;
2214 
2215 				/* check to make sure cpu's lpl is leaf lpl */
2216 				ASSERT(cpu->cpu_lpl == lpl);
2217 				if (cpu->cpu_lpl != lpl) {
2218 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2219 				}
2220 
2221 				/* check next cpu */
2222 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2223 					continue;
2224 				} else {
2225 					cpu = NULL;
2226 				}
2227 			}
2228 
2229 			ASSERT(j == lpl->lpl_ncpu);
2230 			if (j != lpl->lpl_ncpu) {
2231 				return (LPL_TOPO_LPL_BAD_NCPU);
2232 			}
2233 
2234 			/*
2235 			 * Also, check that leaf lpl is contained in all
2236 			 * intermediate lpls that name the leaf as a descendant
2237 			 */
2238 
2239 			for (j = 0; j <= lgrp_alloc_max; j++) {
2240 				klgrpset_t intersect;
2241 				lgrp_t *lgrp_cand;
2242 				lpl_t *lpl_cand;
2243 
2244 				lgrp_cand = lgrp_table[j];
2245 				intersect = klgrpset_intersects(
2246 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2247 				    cpupart->cp_lgrpset);
2248 
2249 				if (!LGRP_EXISTS(lgrp_cand) ||
2250 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2251 				    cpupart->cp_lgrpset) ||
2252 				    (intersect == 0))
2253 					continue;
2254 
2255 				lpl_cand =
2256 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2257 
2258 				if (klgrpset_ismember(intersect,
2259 				    lgrp->lgrp_id)) {
2260 					ASSERT(lpl_rset_contains(lpl_cand,
2261 					    lpl));
2262 
2263 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2264 						return (LPL_TOPO_RSET_MSSNG_LF);
2265 					}
2266 				}
2267 			}
2268 
2269 		} else { /* non-leaf specific checks */
2270 
2271 			/*
2272 			 * Non-leaf lpls should have lpl_cpus == NULL
2273 			 * verify that this is so
2274 			 */
2275 			ASSERT(lpl->lpl_cpus == NULL);
2276 			if (lpl->lpl_cpus != NULL) {
2277 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2278 			}
2279 
2280 			/*
2281 			 * verify that the sum of the cpus in the leaf resources
2282 			 * is equal to the total ncpu in the intermediate
2283 			 */
2284 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2285 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2286 			}
2287 
2288 			ASSERT(sum == lpl->lpl_ncpu);
2289 			if (sum != lpl->lpl_ncpu) {
2290 				return (LPL_TOPO_LPL_BAD_NCPU);
2291 			}
2292 		}
2293 
2294 		/*
2295 		 * check on lpl_hint. Don't check root, since it has no parent.
2296 		 */
2297 		if (lpl->lpl_parent != NULL) {
2298 			int hint;
2299 			lpl_t *hint_lpl;
2300 
2301 			/* make sure hint is within limits of nrset */
2302 			hint = lpl->lpl_hint;
2303 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2304 			if (lpl->lpl_parent->lpl_nrset < hint) {
2305 				return (LPL_TOPO_BOGUS_HINT);
2306 			}
2307 
2308 			/* make sure hint points to valid lpl */
2309 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2310 			ASSERT(hint_lpl->lpl_ncpu > 0);
2311 			if (hint_lpl->lpl_ncpu <= 0) {
2312 				return (LPL_TOPO_BOGUS_HINT);
2313 			}
2314 		}
2315 
2316 		/*
2317 		 * Check the rset of the lpl in question.  Make sure that each
2318 		 * rset contains a subset of the resources in
2319 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2320 		 * sure that each rset doesn't include resources that are
2321 		 * outside of that set.  (Which would be resources somehow not
2322 		 * accounted for).
2323 		 */
2324 
2325 		klgrpset_clear(rset);
2326 		for (j = 0; j < lpl->lpl_nrset; j++) {
2327 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2328 		}
2329 		klgrpset_copy(cset, rset);
2330 		/* make sure lpl rset matches lgrp rset */
2331 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2332 		/* make sure rset is contained with in partition, too */
2333 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2334 
2335 		ASSERT(klgrpset_isempty(rset) &&
2336 			    klgrpset_isempty(cset));
2337 		if (!klgrpset_isempty(rset) ||
2338 		    !klgrpset_isempty(cset)) {
2339 			return (LPL_TOPO_RSET_MISMATCH);
2340 		}
2341 
2342 		/*
2343 		 * check to make sure lpl_nrset matches the number of rsets
2344 		 * contained in the lpl
2345 		 */
2346 
2347 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2348 		    j++);
2349 
2350 		ASSERT(j == lpl->lpl_nrset);
2351 		if (j != lpl->lpl_nrset) {
2352 			return (LPL_TOPO_BAD_RSETCNT);
2353 		}
2354 
2355 	}
2356 	return (LPL_TOPO_CORRECT);
2357 }
2358 
2359 /*
2360  * Flatten lpl topology to given number of levels.  This is presently only
2361  * implemented for a flatten to 2 levels, which will prune out the intermediates
2362  * and home the leaf lpls to the root lpl.
2363  */
2364 int
2365 lpl_topo_flatten(int levels)
2366 {
2367 	int		i;
2368 	uint_t		sum;
2369 	lgrp_t		*lgrp_cur;
2370 	lpl_t		*lpl_cur;
2371 	lpl_t		*lpl_root;
2372 	cpupart_t	*cp;
2373 
2374 	if (levels != 2)
2375 		return (0);
2376 
2377 	/* called w/ cpus paused - grab no locks! */
2378 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2379 	    !lgrp_initialized);
2380 
2381 	cp = cp_list_head;
2382 	do {
2383 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2384 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2385 
2386 		for (i = 0; i <= lgrp_alloc_max; i++) {
2387 			lgrp_cur = lgrp_table[i];
2388 			lpl_cur = &cp->cp_lgrploads[i];
2389 
2390 			if ((lgrp_cur == lgrp_root) ||
2391 			    (!LGRP_EXISTS(lgrp_cur) &&
2392 			    (lpl_cur->lpl_ncpu == 0)))
2393 				continue;
2394 
2395 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2396 				/*
2397 				 * this should be a deleted intermediate, so
2398 				 * clear it
2399 				 */
2400 				lpl_clear(lpl_cur);
2401 			} else if ((lpl_cur->lpl_nrset == 1) &&
2402 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2403 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2404 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2405 				/*
2406 				 * this is a leaf whose parent was deleted, or
2407 				 * whose parent had their lgrp deleted.  (And
2408 				 * whose parent will soon be deleted).  Point
2409 				 * this guy back to the root lpl.
2410 				 */
2411 				lpl_cur->lpl_parent = lpl_root;
2412 				lpl_rset_add(lpl_root, lpl_cur);
2413 			}
2414 
2415 		}
2416 
2417 		/*
2418 		 * Now that we're done, make sure the count on the root lpl is
2419 		 * correct, and update the hints of the children for the sake of
2420 		 * thoroughness
2421 		 */
2422 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2423 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2424 		}
2425 		lpl_root->lpl_ncpu = sum;
2426 		lpl_child_update(lpl_root, cp);
2427 
2428 		cp = cp->cp_next;
2429 	} while (cp != cp_list_head);
2430 
2431 	return (levels);
2432 }
2433 
2434 /*
2435  * Insert a lpl into the resource hierarchy and create any additional lpls that
2436  * are necessary to represent the varying states of locality for the cpu
2437  * resoruces newly added to the partition.
2438  *
2439  * This routine is clever enough that it can correctly add resources from the
2440  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2441  * those for which the lpl is a leaf as opposed to simply a named equally local
2442  * resource).  The one special case that needs additional processing is when a
2443  * new intermediate lpl is introduced.  Since the main loop only traverses
2444  * looking to add the leaf resource where it does not yet exist, additional work
2445  * is necessary to add other leaf resources that may need to exist in the newly
2446  * created intermediate.  This is performed by the second inner loop, and is
2447  * only done when the check for more than one overlapping resource succeeds.
2448  */
2449 
2450 void
2451 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2452 {
2453 	int		i;
2454 	int		j;
2455 	int		hint;
2456 	int		rset_num_intersect;
2457 	lgrp_t		*lgrp_cur;
2458 	lpl_t		*lpl_cur;
2459 	lpl_t		*lpl_parent;
2460 	lgrp_id_t	parent_id;
2461 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2462 
2463 	for (i = 0; i <= lgrp_alloc_max; i++) {
2464 		lgrp_cur = lgrp_table[i];
2465 
2466 		/*
2467 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2468 		 * contained within the current lgrp, or if the current lgrp has
2469 		 * no leaves in this partition
2470 		 */
2471 
2472 		if (!LGRP_EXISTS(lgrp_cur) ||
2473 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2474 		    lpl_leaf->lpl_lgrpid) ||
2475 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2476 		    cpupart->cp_lgrpset))
2477 			continue;
2478 
2479 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2480 		if (lgrp_cur->lgrp_parent != NULL) {
2481 			/* if lgrp has a parent, assign it properly */
2482 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2483 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2484 		} else {
2485 			/* if not, make sure parent ptr gets set to null */
2486 			lpl_parent = NULL;
2487 		}
2488 
2489 		if (lpl_cur == lpl_leaf) {
2490 			/*
2491 			 * Almost all leaf state was initialized elsewhere.  The
2492 			 * only thing left to do is to set the parent.
2493 			 */
2494 			lpl_cur->lpl_parent = lpl_parent;
2495 			continue;
2496 		}
2497 
2498 		/*
2499 		 * Initialize intermediate lpl
2500 		 * Save this lpl's hint though. Since we're changing this
2501 		 * lpl's resources, we need to update the hint in this lpl's
2502 		 * children, but the hint in this lpl is unaffected and
2503 		 * should be preserved.
2504 		 */
2505 		hint = lpl_cur->lpl_hint;
2506 
2507 		lpl_clear(lpl_cur);
2508 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2509 
2510 		lpl_cur->lpl_hint = hint;
2511 		lpl_cur->lpl_parent = lpl_parent;
2512 
2513 		/* does new lpl need to be populated with other resources? */
2514 		rset_intersect =
2515 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2516 			cpupart->cp_lgrpset);
2517 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2518 
2519 		if (rset_num_intersect > 1) {
2520 			/*
2521 			 * If so, figure out what lpls have resources that
2522 			 * intersect this one, and add them.
2523 			 */
2524 			for (j = 0; j <= lgrp_alloc_max; j++) {
2525 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2526 				lpl_t	*lpl_cand;	/* candidate lpl */
2527 
2528 				lgrp_cand = lgrp_table[j];
2529 				if (!LGRP_EXISTS(lgrp_cand) ||
2530 				    !klgrpset_ismember(rset_intersect,
2531 					lgrp_cand->lgrp_id))
2532 					continue;
2533 				lpl_cand =
2534 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2535 				lpl_rset_add(lpl_cur, lpl_cand);
2536 			}
2537 		}
2538 		/*
2539 		 * This lpl's rset has changed. Update the hint in it's
2540 		 * children.
2541 		 */
2542 		lpl_child_update(lpl_cur, cpupart);
2543 	}
2544 }
2545 
2546 /*
2547  * remove a lpl from the hierarchy of resources, clearing its state when
2548  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2549  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2550  * delete them as well.
2551  */
2552 
2553 void
2554 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2555 {
2556 	int		i;
2557 	lgrp_t		*lgrp_cur;
2558 	lpl_t		*lpl_cur;
2559 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2560 
2561 	for (i = 0; i <= lgrp_alloc_max; i++) {
2562 		lgrp_cur = lgrp_table[i];
2563 
2564 		/*
2565 		 * Don't attempt to remove from lgrps that aren't there, that
2566 		 * don't contain our leaf, or from the leaf itself. (We do that
2567 		 * later)
2568 		 */
2569 
2570 		if (!LGRP_EXISTS(lgrp_cur))
2571 			continue;
2572 
2573 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2574 
2575 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2576 		    lpl_leaf->lpl_lgrpid) ||
2577 		    (lpl_cur == lpl_leaf)) {
2578 			continue;
2579 		}
2580 
2581 		/*
2582 		 * This is a slightly sleazy simplification in that we have
2583 		 * already marked the cp_lgrpset as no longer containing the
2584 		 * leaf we've deleted.  Any lpls that pass the above checks
2585 		 * based upon lgrp membership but not necessarily cpu-part
2586 		 * membership also get cleared by the checks below.  Currently
2587 		 * this is harmless, as the lpls should be empty anyway.
2588 		 *
2589 		 * In particular, we want to preserve lpls that have additional
2590 		 * leaf resources, even though we don't yet have a processor
2591 		 * architecture that represents resources this way.
2592 		 */
2593 
2594 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2595 		    cpupart->cp_lgrpset);
2596 
2597 		lpl_rset_del(lpl_cur, lpl_leaf);
2598 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2599 			lpl_clear(lpl_cur);
2600 		} else {
2601 			/*
2602 			 * Update this lpl's children
2603 			 */
2604 			lpl_child_update(lpl_cur, cpupart);
2605 		}
2606 	}
2607 	lpl_clear(lpl_leaf);
2608 }
2609 
2610 /*
2611  * add a cpu to a partition in terms of lgrp load avg bookeeping
2612  *
2613  * The lpl (cpu partition load average information) is now arranged in a
2614  * hierarchical fashion whereby resources that are closest, ie. most local, to
2615  * the cpu in question are considered to be leaves in a tree of resources.
2616  * There are two general cases for cpu additon:
2617  *
2618  * 1. A lpl structure that contains resources already in the hierarchy tree.
2619  * In this case, all of the associated lpl relationships have been defined, and
2620  * all that is necessary is that we link the new cpu into the per-lpl list of
2621  * cpus, and increment the ncpu count of all places where this cpu resource will
2622  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2623  * pushing is accomplished by this routine.
2624  *
2625  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2626  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2627  * construct the hierarchy of state necessary to name it's more distant
2628  * resources, if they should exist.  The leaf structure is initialized by this
2629  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2630  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2631  * and builds all of the "ancestoral" state necessary to identify resources at
2632  * differing levels of locality.
2633  */
2634 void
2635 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2636 {
2637 	cpupart_t	*cpupart;
2638 	lgrp_t		*lgrp_leaf;
2639 	lpl_t		*lpl_leaf;
2640 
2641 	/* called sometimes w/ cpus paused - grab no locks */
2642 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2643 
2644 	cpupart = cp->cpu_part;
2645 	lgrp_leaf = lgrp_table[lgrpid];
2646 
2647 	/* don't add non-existent lgrp */
2648 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2649 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2650 	cp->cpu_lpl = lpl_leaf;
2651 
2652 	/* only leaf lpls contain cpus */
2653 
2654 	if (lpl_leaf->lpl_ncpu++ == 0) {
2655 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2656 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2657 		lpl_leaf_insert(lpl_leaf, cpupart);
2658 	} else {
2659 		/*
2660 		 * the lpl should already exist in the parent, so just update
2661 		 * the count of available CPUs
2662 		 */
2663 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2664 	}
2665 
2666 	/* link cpu into list of cpus in lpl */
2667 
2668 	if (lpl_leaf->lpl_cpus) {
2669 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2670 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2671 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2672 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2673 	} else {
2674 		/*
2675 		 * We increment ncpu immediately after we create a new leaf
2676 		 * lpl, so assert that ncpu == 1 for the case where we don't
2677 		 * have any cpu pointers yet.
2678 		 */
2679 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2680 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2681 	}
2682 
2683 }
2684 
2685 
2686 /*
2687  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2688  *
2689  * The lpl (cpu partition load average information) is now arranged in a
2690  * hierarchical fashion whereby resources that are closest, ie. most local, to
2691  * the cpu in question are considered to be leaves in a tree of resources.
2692  * There are two removal cases in question:
2693  *
2694  * 1. Removal of the resource in the leaf leaves other resources remaining in
2695  * that leaf.  (Another cpu still exists at this level of locality).  In this
2696  * case, the count of available cpus is decremented in all assocated lpls by
2697  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2698  * from the per-cpu lpl list.
2699  *
2700  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2701  * empty)  In this case, all of what has occurred for the first step must take
2702  * place; however, additionally we must remove the lpl structure itself, prune
2703  * out any stranded lpls that do not directly name a leaf resource, and mark the
2704  * cpu partition in question as no longer containing resources from the lgrp of
2705  * the lpl that has been delted.  Cpu-partition changes are handled by this
2706  * method, but the lpl_leaf_remove function deals with the details of pruning
2707  * out the empty lpl and any of its orphaned direct ancestors.
2708  */
2709 void
2710 lgrp_part_del_cpu(cpu_t *cp)
2711 {
2712 	lpl_t		*lpl;
2713 	lpl_t		*leaf_lpl;
2714 	lgrp_t		*lgrp_leaf;
2715 
2716 	/* called sometimes w/ cpus paused - grab no locks */
2717 
2718 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2719 
2720 	lpl = leaf_lpl = cp->cpu_lpl;
2721 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2722 
2723 	/* don't delete a leaf that isn't there */
2724 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2725 
2726 	/* no double-deletes */
2727 	ASSERT(lpl->lpl_ncpu);
2728 	if (--lpl->lpl_ncpu == 0) {
2729 		/*
2730 		 * This was the last cpu in this lgroup for this partition,
2731 		 * clear its bit in the partition's lgroup bitmask
2732 		 */
2733 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2734 
2735 		/* eliminate remaning lpl link pointers in cpu, lpl */
2736 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2737 
2738 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2739 	} else {
2740 
2741 		/* unlink cpu from lists of cpus in lpl */
2742 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2743 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2744 		if (lpl->lpl_cpus == cp) {
2745 			lpl->lpl_cpus = cp->cpu_next_lpl;
2746 		}
2747 
2748 		/*
2749 		 * Update the cpu count in the lpls associated with parent
2750 		 * lgroups.
2751 		 */
2752 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2753 
2754 	}
2755 	/* clear cpu's lpl ptr when we're all done */
2756 	cp->cpu_lpl = NULL;
2757 }
2758 
2759 /*
2760  * Recompute load average for the specified partition/lgrp fragment.
2761  *
2762  * We rely on the fact that this routine is called from the clock thread
2763  * at a point before the clock thread can block (i.e. before its first
2764  * lock request).  Since the clock thread can not be preempted (since it
2765  * runs at highest priority), we know that cpu partitions can not change
2766  * (since doing so would require either the repartition requester or the
2767  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2768  * without grabbing cpu_lock.
2769  */
2770 void
2771 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2772 {
2773 	uint_t		ncpu;
2774 	int64_t		old, new, f;
2775 
2776 	/*
2777 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2778 	 */
2779 	static short expval[] = {
2780 	    0, 3196, 1618, 1083,
2781 	    814, 652, 543, 466,
2782 	    408, 363, 326, 297,
2783 	    272, 251, 233, 218,
2784 	    204, 192, 181, 172,
2785 	    163, 155, 148, 142,
2786 	    136, 130, 125, 121,
2787 	    116, 112, 109, 105
2788 	};
2789 
2790 	/* ASSERT (called from clock level) */
2791 
2792 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2793 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2794 		return;
2795 	}
2796 
2797 	for (;;) {
2798 
2799 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2800 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2801 		else
2802 			f = expval[ncpu];
2803 
2804 		/*
2805 		 * Modify the load average atomically to avoid losing
2806 		 * anticipatory load updates (see lgrp_move_thread()).
2807 		 */
2808 		if (ageflag) {
2809 			/*
2810 			 * We're supposed to both update and age the load.
2811 			 * This happens 10 times/sec. per cpu.  We do a
2812 			 * little hoop-jumping to avoid integer overflow.
2813 			 */
2814 			int64_t		q, r;
2815 
2816 			do {
2817 				old = new = lpl->lpl_loadavg;
2818 				q = (old  >> 16) << 7;
2819 				r = (old  & 0xffff) << 7;
2820 				new += ((long long)(nrcpus - q) * f -
2821 				    ((r * f) >> 16)) >> 7;
2822 
2823 				/*
2824 				 * Check for overflow
2825 				 */
2826 				if (new > LGRP_LOADAVG_MAX)
2827 					new = LGRP_LOADAVG_MAX;
2828 				else if (new < 0)
2829 					new = 0;
2830 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2831 			    new) != old);
2832 		} else {
2833 			/*
2834 			 * We're supposed to update the load, but not age it.
2835 			 * This option is used to update the load (which either
2836 			 * has already been aged in this 1/10 sec. interval or
2837 			 * soon will be) to account for a remotely executing
2838 			 * thread.
2839 			 */
2840 			do {
2841 				old = new = lpl->lpl_loadavg;
2842 				new += f;
2843 				/*
2844 				 * Check for overflow
2845 				 * Underflow not possible here
2846 				 */
2847 				if (new < old)
2848 					new = LGRP_LOADAVG_MAX;
2849 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2850 			    new) != old);
2851 		}
2852 
2853 		/*
2854 		 * Do the same for this lpl's parent
2855 		 */
2856 		if ((lpl = lpl->lpl_parent) == NULL)
2857 			break;
2858 		ncpu = lpl->lpl_ncpu;
2859 	}
2860 }
2861 
2862 /*
2863  * Initialize lpl topology in the target based on topology currently present in
2864  * lpl_bootstrap.
2865  *
2866  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2867  * initialize cp_default list of lpls. Up to this point all topology operations
2868  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2869  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2870  * `target' points to the list of lpls in cp_default and `size' is the size of
2871  * this list.
2872  *
2873  * This function walks the lpl topology in lpl_bootstrap and does for things:
2874  *
2875  * 1) Copies all fields from lpl_bootstrap to the target.
2876  *
2877  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2878  *
2879  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2880  *    instead of lpl_bootstrap.
2881  *
2882  * 4) Updates pointers in the resource list of the target to point to the lpls
2883  *    in the target list instead of lpl_bootstrap.
2884  *
2885  * After lpl_topo_bootstrap() completes, target contains the same information
2886  * that would be present there if it were used during boot instead of
2887  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2888  * and it is bzeroed.
2889  */
2890 void
2891 lpl_topo_bootstrap(lpl_t *target, int size)
2892 {
2893 	lpl_t	*lpl = lpl_bootstrap;
2894 	lpl_t	*target_lpl = target;
2895 	int	howmany;
2896 	int	id;
2897 	int	i;
2898 
2899 	/*
2900 	 * The only target that should be passed here is cp_default lpl list.
2901 	 */
2902 	ASSERT(target == cp_default.cp_lgrploads);
2903 	ASSERT(size == cp_default.cp_nlgrploads);
2904 	ASSERT(!lgrp_topo_initialized);
2905 	ASSERT(ncpus == 1);
2906 
2907 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2908 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2909 		/*
2910 		 * Copy all fields from lpl.
2911 		 */
2912 
2913 		*target_lpl = *lpl;
2914 
2915 		/*
2916 		 * Substitute CPU0 lpl pointer with one relative to target.
2917 		 */
2918 		if (lpl->lpl_cpus == CPU) {
2919 			ASSERT(CPU->cpu_lpl == lpl);
2920 			CPU->cpu_lpl = target_lpl;
2921 		}
2922 
2923 		/*
2924 		 * Substitute parent information with parent relative to target.
2925 		 */
2926 		if (lpl->lpl_parent != NULL)
2927 			target_lpl->lpl_parent = (lpl_t *)
2928 			    (((uintptr_t)lpl->lpl_parent -
2929 				(uintptr_t)lpl_bootstrap) +
2930 				(uintptr_t)target);
2931 
2932 		/*
2933 		 * Walk over resource set substituting pointers relative to
2934 		 * lpl_bootstrap to pointers relative to target.
2935 		 */
2936 		ASSERT(lpl->lpl_nrset <= 1);
2937 
2938 		for (id = 0; id < lpl->lpl_nrset; id++) {
2939 			if (lpl->lpl_rset[id] != NULL) {
2940 				target_lpl->lpl_rset[id] =
2941 				    (lpl_t *)
2942 				    (((uintptr_t)lpl->lpl_rset[id] -
2943 					(uintptr_t)lpl_bootstrap) +
2944 					(uintptr_t)target);
2945 			}
2946 		}
2947 	}
2948 
2949 	/*
2950 	 * Topology information in lpl_bootstrap is no longer needed.
2951 	 */
2952 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2953 }
2954 
2955 /*
2956  * If the lowest load among the lgroups a process' threads are currently
2957  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2958  * expanding the process to a new lgroup.
2959  */
2960 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2961 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2962 
2963 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2964 	((lgrp_expand_proc_thresh) / (ncpu))
2965 
2966 /*
2967  * A process will be expanded to a new lgroup only if the difference between
2968  * the lowest load on the lgroups the process' thread's are currently spread
2969  * across and the lowest load on the other lgroups in the process' partition
2970  * is greater than lgrp_expand_proc_diff.
2971  */
2972 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2973 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2974 
2975 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2976 	((lgrp_expand_proc_diff) / (ncpu))
2977 
2978 /*
2979  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2980  * be present due to impreciseness of the load average decay algorithm.
2981  *
2982  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2983  * tolerance is scaled by the number of cpus in the lgroup just like
2984  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2985  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2986  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2987  */
2988 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2989 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2990 	((lgrp_loadavg_tolerance) / ncpu)
2991 
/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold.  Leaving it at UINT32_MAX disables the
 * check.
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup.  A value of 0 disables the
 * free memory check.
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

/* Policy currently in effect; one of the LGRP_CHOOSE_* values above */
int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3016 
3017 /*
3018  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3019  * be bound to a CPU or processor set.
3020  *
3021  * Arguments:
3022  *	t		The thread
3023  *	cpupart		The partition the thread belongs to.
3024  *
3025  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3026  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3027  *	 partitions changing out from under us and assumes that given thread is
3028  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3029  *	 disabled, so don't grab any locks because we should never block under
3030  *	 those conditions.
3031  */
3032 lpl_t *
3033 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3034 {
3035 	lgrp_load_t	bestload, bestrload;
3036 	int		lgrpid_offset, lgrp_count;
3037 	lgrp_id_t	lgrpid, lgrpid_start;
3038 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3039 	klgrpset_t	lgrpset;
3040 	proc_t		*p;
3041 
3042 	ASSERT(t != NULL);
3043 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3044 	    THREAD_LOCK_HELD(t));
3045 	ASSERT(cpupart != NULL);
3046 
3047 	p = t->t_procp;
3048 
3049 	/* A process should always be in an active partition */
3050 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3051 
3052 	bestlpl = bestrlpl = NULL;
3053 	bestload = bestrload = LGRP_LOADAVG_MAX;
3054 	lgrpset = cpupart->cp_lgrpset;
3055 
3056 	switch (lgrp_choose_policy) {
3057 	case LGRP_CHOOSE_RR:
3058 		lgrpid = cpupart->cp_lgrp_hint;
3059 		do {
3060 			if (++lgrpid > lgrp_alloc_max)
3061 				lgrpid = 0;
3062 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3063 
3064 		break;
3065 	default:
3066 	case LGRP_CHOOSE_TIME:
3067 	case LGRP_CHOOSE_RANDOM:
3068 		klgrpset_nlgrps(lgrpset, lgrp_count);
3069 		lgrpid_offset =
3070 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3071 		for (lgrpid = 0; ; lgrpid++) {
3072 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3073 				if (--lgrpid_offset == 0)
3074 					break;
3075 			}
3076 		}
3077 		break;
3078 	}
3079 
3080 	lgrpid_start = lgrpid;
3081 
3082 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3083 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3084 
3085 	/*
3086 	 * Use lgroup affinities (if any) to choose best lgroup
3087 	 *
3088 	 * NOTE: Assumes that thread is protected from going away and its
3089 	 *	 lgroup affinities won't change (ie. p_lock, or
3090 	 *	 thread_lock() being held and/or CPUs paused)
3091 	 */
3092 	if (t->t_lgrp_affinity) {
3093 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3094 		if (lpl != NULL)
3095 			return (lpl);
3096 	}
3097 
3098 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3099 
3100 	do {
3101 		pgcnt_t	npgs;
3102 
3103 		/*
3104 		 * Skip any lgroups outside of thread's pset
3105 		 */
3106 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3107 			if (++lgrpid > lgrp_alloc_max)
3108 				lgrpid = 0;	/* wrap the search */
3109 			continue;
3110 		}
3111 
3112 		/*
3113 		 * Skip any non-leaf lgroups
3114 		 */
3115 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3116 			continue;
3117 
3118 		/*
3119 		 * Skip any lgroups without enough free memory
3120 		 * (when threshold set to nonzero positive value)
3121 		 */
3122 		if (lgrp_mem_free_thresh > 0) {
3123 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3124 			if (npgs < lgrp_mem_free_thresh) {
3125 				if (++lgrpid > lgrp_alloc_max)
3126 					lgrpid = 0;	/* wrap the search */
3127 				continue;
3128 			}
3129 		}
3130 
3131 		lpl = &cpupart->cp_lgrploads[lgrpid];
3132 		if (klgrpset_isempty(p->p_lgrpset) ||
3133 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3134 			/*
3135 			 * Either this is a new process or the process already
3136 			 * has threads on this lgrp, so this is a preferred
3137 			 * lgroup for the thread.
3138 			 */
3139 			if (bestlpl == NULL ||
3140 			    lpl_pick(lpl, bestlpl)) {
3141 				bestload = lpl->lpl_loadavg;
3142 				bestlpl = lpl;
3143 			}
3144 		} else {
3145 			/*
3146 			 * The process doesn't have any threads on this lgrp,
3147 			 * but we're willing to consider this lgrp if the load
3148 			 * difference is big enough to justify splitting up
3149 			 * the process' threads.
3150 			 */
3151 			if (bestrlpl == NULL ||
3152 			    lpl_pick(lpl, bestrlpl)) {
3153 				bestrload = lpl->lpl_loadavg;
3154 				bestrlpl = lpl;
3155 			}
3156 		}
3157 		if (++lgrpid > lgrp_alloc_max)
3158 			lgrpid = 0;	/* wrap the search */
3159 	} while (lgrpid != lgrpid_start);
3160 
3161 	/*
3162 	 * Return root lgroup if threshold isn't set to maximum value and
3163 	 * lowest lgroup load average more than a certain threshold
3164 	 */
3165 	if (lgrp_load_thresh != UINT32_MAX &&
3166 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3167 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3168 
3169 	/*
3170 	 * If all the lgroups over which the thread's process is spread are
3171 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3172 	 * the thread on one of the other leaf lgroups in the thread's
3173 	 * partition.
3174 	 */
3175 	if ((bestlpl == NULL) ||
3176 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3177 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3178 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3179 	    bestload))) {
3180 		bestlpl = bestrlpl;
3181 	}
3182 
3183 	if (bestlpl == NULL) {
3184 		/*
3185 		 * No lgroup looked particularly good, but we still
3186 		 * have to pick something. Go with the randomly selected
3187 		 * legal lgroup we started with above.
3188 		 */
3189 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3190 	}
3191 
3192 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3193 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3194 
3195 	ASSERT(bestlpl->lpl_ncpu > 0);
3196 	return (bestlpl);
3197 }
3198 
3199 /*
3200  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3201  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3202  */
3203 static int
3204 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3205 {
3206 	lgrp_load_t	l1, l2;
3207 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3208 
3209 	l1 = lpl1->lpl_loadavg;
3210 	l2 = lpl2->lpl_loadavg;
3211 
3212 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3213 		/* lpl1 is significantly less loaded than lpl2 */
3214 		return (1);
3215 	}
3216 
3217 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3218 	    l1 + tolerance >= l2 && l1 < l2 &&
3219 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3220 		/*
3221 		 * lpl1's load is within the tolerance of lpl2. We're
3222 		 * willing to consider it be to better however if
3223 		 * it has been longer since we last homed a thread there
3224 		 */
3225 		return (1);
3226 	}
3227 
3228 	return (0);
3229 }
3230 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() compares a thread's t_anttime plus this value against
 * the current time to decide whether to take the anticipatory load back out.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3239 
/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Arguments:
 *	t			The thread being moved
 *	newlpl			The lpl the thread is moving to, or NULL
 *	do_lgrpset_delete	If non-zero, clear the old lgroup's bit in the
 *				process' p_lgrpset when no other thread of the
 *				process is assigned to that lgroup
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only t_lpl is updated; no p_lgrpset or load average
	 * accounting is done.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		/* Only a move to another lgroup counts as a migration */
		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/*
			 * Walk the process' circular thread list starting
			 * just after its head, so that reaching the head
			 * means every thread has been examined.
			 */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			/*
			 * Subtract the load from the old lpl and from each
			 * of its ancestors, retrying each compare-and-swap
			 * until it succeeds.
			 */
			lpl = oldlpl;
			for (;;) {
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (cas32(
					(lgrp_load_t *)&lpl->lpl_loadavg, old,
					    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* Record when the anticipatory load was added */
		t->t_anttime = gethrtime();
	}
}
3429 
3430 /*
3431  * Return lgroup memory allocation policy given advice from madvise(3C)
3432  */
3433 lgrp_mem_policy_t
3434 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3435 {
3436 	switch (advice) {
3437 	case MADV_ACCESS_LWP:
3438 		return (LGRP_MEM_POLICY_NEXT);
3439 	case MADV_ACCESS_MANY:
3440 		return (LGRP_MEM_POLICY_RANDOM);
3441 	default:
3442 		return (lgrp_mem_policy_default(size, type));
3443 	}
3444 }
3445 
3446 /*
3447  * Figure out default policy
3448  */
3449 lgrp_mem_policy_t
3450 lgrp_mem_policy_default(size_t size, int type)
3451 {
3452 	cpupart_t		*cp;
3453 	lgrp_mem_policy_t	policy;
3454 	size_t			pset_mem_size;
3455 
3456 	/*
3457 	 * Randomly allocate memory across lgroups for shared memory
3458 	 * beyond a certain threshold
3459 	 */
3460 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3461 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3462 		/*
3463 		 * Get total memory size of current thread's pset
3464 		 */
3465 		kpreempt_disable();
3466 		cp = curthread->t_cpupart;
3467 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3468 		kpreempt_enable();
3469 
3470 		/*
3471 		 * Choose policy to randomly allocate memory across
3472 		 * lgroups in pset if it will fit and is not default
3473 		 * partition.  Otherwise, allocate memory randomly
3474 		 * across machine.
3475 		 */
3476 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3477 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3478 		else
3479 			policy = LGRP_MEM_POLICY_RANDOM;
3480 	} else
3481 		/*
3482 		 * Apply default policy for private memory and
3483 		 * shared memory under the respective random
3484 		 * threshold.
3485 		 */
3486 		policy = lgrp_mem_default_policy;
3487 
3488 	return (policy);
3489 }
3490 
3491 /*
3492  * Get memory allocation policy for this segment
3493  */
3494 lgrp_mem_policy_info_t *
3495 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3496 {
3497 	lgrp_mem_policy_info_t	*policy_info;
3498 	extern struct seg_ops	segspt_ops;
3499 	extern struct seg_ops	segspt_shmops;
3500 
3501 	/*
3502 	 * This is for binary compatibility to protect against third party
3503 	 * segment drivers which haven't recompiled to allow for
3504 	 * SEGOP_GETPOLICY()
3505 	 */
3506 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3507 	    seg->s_ops != &segspt_shmops)
3508 		return (NULL);
3509 
3510 	policy_info = NULL;
3511 	if (seg->s_ops->getpolicy != NULL)
3512 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3513 
3514 	return (policy_info);
3515 }
3516 
3517 /*
3518  * Set policy for allocating private memory given desired policy, policy info,
3519  * size in bytes of memory that policy is being applied.
3520  * Return 0 if policy wasn't set already and 1 if policy was set already
3521  */
3522 int
3523 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3524     lgrp_mem_policy_info_t *policy_info, size_t size)
3525 {
3526 
3527 	ASSERT(policy_info != NULL);
3528 
3529 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3530 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3531 
3532 	/*
3533 	 * Policy set already?
3534 	 */
3535 	if (policy == policy_info->mem_policy)
3536 		return (1);
3537 
3538 	/*
3539 	 * Set policy
3540 	 */
3541 	policy_info->mem_policy = policy;
3542 	policy_info->mem_reserved = 0;
3543 
3544 	return (0);
3545 }
3546 
3547 
3548 /*
3549  * Get shared memory allocation policy with given tree and offset
3550  */
3551 lgrp_mem_policy_info_t *
3552 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3553     u_offset_t vn_off)
3554 {
3555 	u_offset_t		off;
3556 	lgrp_mem_policy_info_t	*policy_info;
3557 	lgrp_shm_policy_seg_t	*policy_seg;
3558 	lgrp_shm_locality_t	*shm_locality;
3559 	avl_tree_t		*tree;
3560 	avl_index_t		where;
3561 
3562 	/*
3563 	 * Get policy segment tree from anon_map or vnode and use specified
3564 	 * anon index or vnode offset as offset
3565 	 *
3566 	 * Assume that no lock needs to be held on anon_map or vnode, since
3567 	 * they should be protected by their reference count which must be
3568 	 * nonzero for an existing segment
3569 	 */
3570 	if (amp) {
3571 		ASSERT(amp->refcnt != 0);
3572 		shm_locality = amp->locality;
3573 		if (shm_locality == NULL)
3574 			return (NULL);
3575 		tree = shm_locality->loc_tree;
3576 		off = ptob(anon_index);
3577 	} else if (vp) {
3578 		shm_locality = vp->v_locality;
3579 		if (shm_locality == NULL)
3580 			return (NULL);
3581 		ASSERT(shm_locality->loc_count != 0);
3582 		tree = shm_locality->loc_tree;
3583 		off = vn_off;
3584 	}
3585 
3586 	if (tree == NULL)
3587 		return (NULL);
3588 
3589 	/*
3590 	 * Lookup policy segment for offset into shared object and return
3591 	 * policy info
3592 	 */
3593 	rw_enter(&shm_locality->loc_lock, RW_READER);
3594 	policy_info = NULL;
3595 	policy_seg = avl_find(tree, &off, &where);
3596 	if (policy_seg)
3597 		policy_info = &policy_seg->shm_policy;
3598 	rw_exit(&shm_locality->loc_lock);
3599 
3600 	return (policy_info);
3601 }
3602 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() uses it in place of lgrp_mem_default_policy when the
 * segment it is given is segkmap.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3607 
3608 /*
3609  * Return lgroup to use for allocating memory
3610  * given the segment and address
3611  *
3612  * There isn't any mutual exclusion that exists between calls
3613  * to this routine and DR, so this routine and whomever calls it
3614  * should be mindful of the possibility that the lgrp returned
3615  * may be deleted. If this happens, dereferences of the lgrp
3616  * pointer will still be safe, but the resources in the lgrp will
3617  * be gone, and LGRP_EXISTS() will no longer be true.
3618  */
3619 lgrp_t *
3620 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3621 {
3622 	int			i;
3623 	lgrp_t			*lgrp;
3624 	klgrpset_t		lgrpset;
3625 	int			lgrps_spanned;
3626 	unsigned long		off;
3627 	lgrp_mem_policy_t	policy;
3628 	lgrp_mem_policy_info_t	*policy_info;
3629 	ushort_t		random;
3630 	int			stat = 0;
3631 	extern struct seg	*segkmap;
3632 
3633 	/*
3634 	 * Just return null if the lgrp framework hasn't finished
3635 	 * initializing or if this is a UMA machine.
3636 	 */
3637 	if (nlgrps == 1 || !lgrp_initialized)
3638 		return (lgrp_root);
3639 
3640 	/*
3641 	 * Get memory allocation policy for this segment
3642 	 */
3643 	policy = lgrp_mem_default_policy;
3644 	if (seg != NULL) {
3645 		if (seg->s_as == &kas) {
3646 			if (seg == segkmap)
3647 				policy = lgrp_segmap_default_policy;
3648 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3649 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3650 				policy = LGRP_MEM_POLICY_RANDOM;
3651 		} else {
3652 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3653 			if (policy_info != NULL)
3654 				policy = policy_info->mem_policy;
3655 		}
3656 	}
3657 	lgrpset = 0;
3658 
3659 	/*
3660 	 * Initialize lgroup to home by default
3661 	 */
3662 	lgrp = lgrp_home_lgrp();
3663 
3664 	/*
3665 	 * When homing threads on root lgrp, override default memory
3666 	 * allocation policies with root lgroup memory allocation policy
3667 	 */
3668 	if (lgrp == lgrp_root)
3669 		policy = lgrp_mem_policy_root;
3670 
3671 	/*
3672 	 * Implement policy
3673 	 */
3674 	switch (policy) {
3675 	case LGRP_MEM_POLICY_NEXT_CPU:
3676 
3677 		/*
3678 		 * Return lgroup of current CPU which faulted on memory
3679 		 * If the CPU isn't currently in an lgrp, then opt to
3680 		 * allocate from the root.
3681 		 *
3682 		 * Kernel preemption needs to be disabled here to prevent
3683 		 * the current CPU from going away before lgrp is found.
3684 		 */
3685 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3686 			lgrp = lgrp_root;
3687 		} else {
3688 			kpreempt_disable();
3689 			lgrp = lgrp_cpu_to_lgrp(CPU);
3690 			kpreempt_enable();
3691 		}
3692 		break;
3693 
3694 	case LGRP_MEM_POLICY_NEXT:
3695 	case LGRP_MEM_POLICY_DEFAULT:
3696 	default:
3697 
3698 		/*
3699 		 * Just return current thread's home lgroup
3700 		 * for default policy (next touch)
3701 		 * If the thread is homed to the root,
3702 		 * then the default policy is random across lgroups.
3703 		 * Fallthrough to the random case.
3704 		 */
3705 		if (lgrp != lgrp_root) {
3706 			if (policy == LGRP_MEM_POLICY_NEXT)
3707 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3708 			else
3709 				lgrp_stat_add(lgrp->lgrp_id,
3710 				    LGRP_NUM_DEFAULT, 1);
3711 			break;
3712 		}
3713 		/* LINTED fallthrough on case statement */
3714 	case LGRP_MEM_POLICY_RANDOM:
3715 
3716 		/*
3717 		 * Return a random leaf lgroup with memory
3718 		 */
3719 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3720 		/*
3721 		 * Count how many lgroups are spanned
3722 		 */
3723 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3724 
3725 		/*
3726 		 * There may be no memnodes in the root lgroup during DR copy
3727 		 * rename on a system with only two boards (memnodes)
3728 		 * configured. In this case just return the root lgrp.
3729 		 */
3730 		if (lgrps_spanned == 0) {
3731 			lgrp = lgrp_root;
3732 			break;
3733 		}
3734 
3735 		/*
3736 		 * Pick a random offset within lgroups spanned
3737 		 * and return lgroup at that offset
3738 		 */
3739 		random = (ushort_t)gethrtime() >> 4;
3740 		off = random % lgrps_spanned;
3741 		ASSERT(off <= lgrp_alloc_max);
3742 
3743 		for (i = 0; i <= lgrp_alloc_max; i++) {
3744 			if (!klgrpset_ismember(lgrpset, i))
3745 				continue;
3746 			if (off)
3747 				off--;
3748 			else {
3749 				lgrp = lgrp_table[i];
3750 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3751 				    1);
3752 				break;
3753 			}
3754 		}
3755 		break;
3756 
3757 	case LGRP_MEM_POLICY_RANDOM_PROC:
3758 
3759 		/*
3760 		 * Grab copy of bitmask of lgroups spanned by
3761 		 * this process
3762 		 */
3763 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3764 		stat = LGRP_NUM_RANDOM_PROC;
3765 
3766 		/* LINTED fallthrough on case statement */
3767 	case LGRP_MEM_POLICY_RANDOM_PSET:
3768 
3769 		if (!stat)
3770 			stat = LGRP_NUM_RANDOM_PSET;
3771 
3772 		if (klgrpset_isempty(lgrpset)) {
3773 			/*
3774 			 * Grab copy of bitmask of lgroups spanned by
3775 			 * this processor set
3776 			 */
3777 			kpreempt_disable();
3778 			klgrpset_copy(lgrpset,
3779 			    curthread->t_cpupart->cp_lgrpset);
3780 			kpreempt_enable();
3781 		}
3782 
3783 		/*
3784 		 * Count how many lgroups are spanned
3785 		 */
3786 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3787 		ASSERT(lgrps_spanned <= nlgrps);
3788 
3789 		/*
3790 		 * Probably lgrps_spanned should be always non-zero, but to be
3791 		 * on the safe side we return lgrp_root if it is empty.
3792 		 */
3793 		if (lgrps_spanned == 0) {
3794 			lgrp = lgrp_root;
3795 			break;
3796 		}
3797 
3798 		/*
3799 		 * Pick a random offset within lgroups spanned
3800 		 * and return lgroup at that offset
3801 		 */
3802 		random = (ushort_t)gethrtime() >> 4;
3803 		off = random % lgrps_spanned;
3804 		ASSERT(off <= lgrp_alloc_max);
3805 
3806 		for (i = 0; i <= lgrp_alloc_max; i++) {
3807 			if (!klgrpset_ismember(lgrpset, i))
3808 				continue;
3809 			if (off)
3810 				off--;
3811 			else {
3812 				lgrp = lgrp_table[i];
3813 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3814 				    1);
3815 				break;
3816 			}
3817 		}
3818 		break;
3819 
3820 	case LGRP_MEM_POLICY_ROUNDROBIN:
3821 
3822 		/*
3823 		 * Use offset within segment to determine
3824 		 * offset from home lgroup to choose for
3825 		 * next lgroup to allocate memory from
3826 		 */
3827 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3828 		    (lgrp_alloc_max + 1);
3829 
3830 		kpreempt_disable();
3831 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3832 		i = lgrp->lgrp_id;
3833 		kpreempt_enable();
3834 
3835 		while (off > 0) {
3836 			i = (i + 1) % (lgrp_alloc_max + 1);
3837 			lgrp = lgrp_table[i];
3838 			if (klgrpset_ismember(lgrpset, i))
3839 				off--;
3840 		}
3841 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3842 
3843 		break;
3844 	}
3845 
3846 	ASSERT(lgrp != NULL);
3847 	return (lgrp);
3848 }
3849 
3850 /*
3851  * Return the number of pages in an lgroup
3852  *
3853  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3854  *	 could cause tests that rely on the numat driver to fail....
3855  */
3856 pgcnt_t
3857 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3858 {
3859 	lgrp_t *lgrp;
3860 
3861 	lgrp = lgrp_table[lgrpid];
3862 	if (!LGRP_EXISTS(lgrp) ||
3863 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3864 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3865 		return (0);
3866 
3867 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3868 }
3869 
3870 /*
3871  * Initialize lgroup shared memory allocation policy support
3872  */
3873 void
3874 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3875 {
3876 	lgrp_shm_locality_t	*shm_locality;
3877 
3878 	/*
3879 	 * Initialize locality field in anon_map
3880 	 * Don't need any locks because this is called when anon_map is
3881 	 * allocated, but not used anywhere yet.
3882 	 */
3883 	if (amp) {
3884 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3885 		if (amp->locality == NULL) {
3886 			/*
3887 			 * Allocate and initialize shared memory locality info
3888 			 * and set anon_map locality pointer to it
3889 			 * Drop lock across kmem_alloc(KM_SLEEP)
3890 			 */
3891 			ANON_LOCK_EXIT(&amp->a_rwlock);
3892 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3893 			    KM_SLEEP);
3894 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3895 			    NULL);
3896 			shm_locality->loc_count = 1;	/* not used for amp */
3897 			shm_locality->loc_tree = NULL;
3898 
3899 			/*
3900 			 * Reacquire lock and check to see whether anyone beat
3901 			 * us to initializing the locality info
3902 			 */
3903 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3904 			if (amp->locality != NULL) {
3905 				rw_destroy(&shm_locality->loc_lock);
3906 				kmem_free(shm_locality,
3907 				    sizeof (*shm_locality));
3908 			} else
3909 				amp->locality = shm_locality;
3910 		}
3911 		ANON_LOCK_EXIT(&amp->a_rwlock);
3912 		return;
3913 	}
3914 
3915 	/*
3916 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3917 	 */
3918 	mutex_enter(&vp->v_lock);
3919 	if ((vp->v_flag & V_LOCALITY) == 0) {
3920 		/*
3921 		 * Allocate and initialize shared memory locality info
3922 		 */
3923 		mutex_exit(&vp->v_lock);
3924 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3925 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3926 		shm_locality->loc_count = 1;
3927 		shm_locality->loc_tree = NULL;
3928 
3929 		/*
3930 		 * Point vnode locality field at shared vnode policy info
3931 		 * and set locality aware flag in vnode
3932 		 */
3933 		mutex_enter(&vp->v_lock);
3934 		if ((vp->v_flag & V_LOCALITY) == 0) {
3935 			vp->v_locality = shm_locality;
3936 			vp->v_flag |= V_LOCALITY;
3937 		} else {
3938 			/*
3939 			 * Lost race so free locality info and increment count.
3940 			 */
3941 			rw_destroy(&shm_locality->loc_lock);
3942 			kmem_free(shm_locality, sizeof (*shm_locality));
3943 			shm_locality = vp->v_locality;
3944 			shm_locality->loc_count++;
3945 		}
3946 		mutex_exit(&vp->v_lock);
3947 
3948 		return;
3949 	}
3950 
3951 	/*
3952 	 * Increment reference count of number of segments mapping this vnode
3953 	 * shared
3954 	 */
3955 	shm_locality = vp->v_locality;
3956 	shm_locality->loc_count++;
3957 	mutex_exit(&vp->v_lock);
3958 }
3959 
3960 /*
3961  * Destroy the given shared memory policy segment tree
3962  */
3963 void
3964 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3965 {
3966 	lgrp_shm_policy_seg_t	*cur;
3967 	lgrp_shm_policy_seg_t	*next;
3968 
3969 	if (tree == NULL)
3970 		return;
3971 
3972 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3973 	while (cur != NULL) {
3974 		next = AVL_NEXT(tree, cur);
3975 		avl_remove(tree, cur);
3976 		kmem_free(cur, sizeof (*cur));
3977 		cur = next;
3978 	}
3979 	kmem_free(tree, sizeof (avl_tree_t));
3980 }
3981 
3982 /*
3983  * Uninitialize lgroup shared memory allocation policy support
3984  */
3985 void
3986 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3987 {
3988 	lgrp_shm_locality_t	*shm_locality;
3989 
3990 	/*
3991 	 * For anon_map, deallocate shared memory policy tree and
3992 	 * zero locality field
3993 	 * Don't need any locks because anon_map is being freed
3994 	 */
3995 	if (amp) {
3996 		if (amp->locality == NULL)
3997 			return;
3998 		shm_locality = amp->locality;
3999 		shm_locality->loc_count = 0;	/* not really used for amp */
4000 		rw_destroy(&shm_locality->loc_lock);
4001 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4002 		kmem_free(shm_locality, sizeof (*shm_locality));
4003 		amp->locality = 0;
4004 		return;
4005 	}
4006 
4007 	/*
4008 	 * For vnode, decrement reference count of segments mapping this vnode
4009 	 * shared and delete locality info if reference count drops to 0
4010 	 */
4011 	mutex_enter(&vp->v_lock);
4012 	shm_locality = vp->v_locality;
4013 	shm_locality->loc_count--;
4014 
4015 	if (shm_locality->loc_count == 0) {
4016 		rw_destroy(&shm_locality->loc_lock);
4017 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4018 		kmem_free(shm_locality, sizeof (*shm_locality));
4019 		vp->v_locality = 0;
4020 		vp->v_flag &= ~V_LOCALITY;
4021 	}
4022 	mutex_exit(&vp->v_lock);
4023 }
4024 
4025 /*
4026  * Compare two shared memory policy segments
4027  * Used by AVL tree code for searching
4028  */
4029 int
4030 lgrp_shm_policy_compar(const void *x, const void *y)
4031 {
4032 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4033 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4034 
4035 	if (a->shm_off < b->shm_off)
4036 		return (-1);
4037 	if (a->shm_off >= b->shm_off + b->shm_size)
4038 		return (1);
4039 	return (0);
4040 }
4041 
4042 /*
4043  * Concatenate seg1 with seg2 and remove seg2
4044  */
4045 static int
4046 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4047     lgrp_shm_policy_seg_t *seg2)
4048 {
4049 	if (!seg1 || !seg2 ||
4050 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4051 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4052 		return (-1);
4053 
4054 	seg1->shm_size += seg2->shm_size;
4055 	avl_remove(tree, seg2);
4056 	kmem_free(seg2, sizeof (*seg2));
4057 	return (0);
4058 }
4059 
4060 /*
4061  * Split segment at given offset and return rightmost (uppermost) segment
4062  * Assumes that there are no overlapping segments
4063  */
4064 static lgrp_shm_policy_seg_t *
4065 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4066     u_offset_t off)
4067 {
4068 	lgrp_shm_policy_seg_t	*newseg;
4069 	avl_index_t		where;
4070 
4071 	ASSERT(seg != NULL);
4072 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4073 
4074 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4075 	    seg->shm_size)
4076 		return (NULL);
4077 
4078 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4079 		return (seg);
4080 
4081 	/*
4082 	 * Adjust size of left segment and allocate new (right) segment
4083 	 */
4084 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4085 	newseg->shm_policy = seg->shm_policy;
4086 	newseg->shm_off = off;
4087 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4088 	seg->shm_size = off - seg->shm_off;
4089 
4090 	/*
4091 	 * Find where to insert new segment in AVL tree and insert it
4092 	 */
4093 	(void) avl_find(tree, &off, &where);
4094 	avl_insert(tree, newseg, where);
4095 
4096 	return (newseg);
4097 }
4098 
4099 /*
4100  * Set shared memory allocation policy on specified shared object at given
4101  * offset and length
4102  *
4103  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4104  * -1 if can't set policy.
4105  */
4106 int
4107 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4108     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4109 {
4110 	u_offset_t		eoff;
4111 	lgrp_shm_policy_seg_t	*next;
4112 	lgrp_shm_policy_seg_t	*newseg;
4113 	u_offset_t		off;
4114 	u_offset_t		oldeoff;
4115 	lgrp_shm_policy_seg_t	*prev;
4116 	int			retval;
4117 	lgrp_shm_policy_seg_t	*seg;
4118 	lgrp_shm_locality_t	*shm_locality;
4119 	avl_tree_t		*tree;
4120 	avl_index_t		where;
4121 
4122 	ASSERT(amp || vp);
4123 	ASSERT((len & PAGEOFFSET) == 0);
4124 
4125 	if (len == 0)
4126 		return (-1);
4127 
4128 	retval = 0;
4129 
4130 	/*
4131 	 * Get locality info and starting offset into shared object
4132 	 * Try anon map first and then vnode
4133 	 * Assume that no locks need to be held on anon_map or vnode, since
4134 	 * it should be protected by its reference count which must be nonzero
4135 	 * for an existing segment.
4136 	 */
4137 	if (amp) {
4138 		/*
4139 		 * Get policy info from anon_map
4140 		 *
4141 		 */
4142 		ASSERT(amp->refcnt != 0);
4143 		if (amp->locality == NULL)
4144 			lgrp_shm_policy_init(amp, NULL);
4145 		shm_locality = amp->locality;
4146 		off = ptob(anon_index);
4147 	} else if (vp) {
4148 		/*
4149 		 * Get policy info from vnode
4150 		 */
4151 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4152 			lgrp_shm_policy_init(NULL, vp);
4153 		shm_locality = vp->v_locality;
4154 		ASSERT(shm_locality->loc_count != 0);
4155 		off = vn_off;
4156 	} else
4157 		return (-1);
4158 
4159 	ASSERT((off & PAGEOFFSET) == 0);
4160 
4161 	/*
4162 	 * Figure out default policy
4163 	 */
4164 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4165 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4166 
4167 	/*
4168 	 * Create AVL tree if there isn't one yet
4169 	 * and set locality field to point at it
4170 	 */
4171 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4172 	tree = shm_locality->loc_tree;
4173 	if (!tree) {
4174 		rw_exit(&shm_locality->loc_lock);
4175 
4176 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4177 
4178 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4179 		if (shm_locality->loc_tree == NULL) {
4180 			avl_create(tree, lgrp_shm_policy_compar,
4181 			    sizeof (lgrp_shm_policy_seg_t),
4182 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4183 			shm_locality->loc_tree = tree;
4184 		} else {
4185 			/*
4186 			 * Another thread managed to set up the tree
4187 			 * before we could. Free the tree we allocated
4188 			 * and use the one that's already there.
4189 			 */
4190 			kmem_free(tree, sizeof (*tree));
4191 			tree = shm_locality->loc_tree;
4192 		}
4193 	}
4194 
4195 	/*
4196 	 * Set policy
4197 	 *
4198 	 * Need to maintain hold on writer's lock to keep tree from
4199 	 * changing out from under us
4200 	 */
4201 	while (len != 0) {
4202 		/*
4203 		 * Find policy segment for specified offset into shared object
4204 		 */
4205 		seg = avl_find(tree, &off, &where);
4206 
4207 		/*
4208 		 * Didn't find any existing segment that contains specified
4209 		 * offset, so allocate new segment, insert it, and concatenate
4210 		 * with adjacent segments if possible
4211 		 */
4212 		if (seg == NULL) {
4213 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4214 			    KM_SLEEP);
4215 			newseg->shm_policy.mem_policy = policy;
4216 			newseg->shm_policy.mem_reserved = 0;
4217 			newseg->shm_off = off;
4218 			avl_insert(tree, newseg, where);
4219 
4220 			/*
4221 			 * Check to see whether new segment overlaps with next
4222 			 * one, set length of new segment accordingly, and
4223 			 * calculate remaining length and next offset
4224 			 */
4225 			seg = AVL_NEXT(tree, newseg);
4226 			if (seg == NULL || off + len <= seg->shm_off) {
4227 				newseg->shm_size = len;
4228 				len = 0;
4229 			} else {
4230 				newseg->shm_size = seg->shm_off - off;
4231 				off = seg->shm_off;
4232 				len -= newseg->shm_size;
4233 			}
4234 
4235 			/*
4236 			 * Try to concatenate new segment with next and
4237 			 * previous ones, since they might have the same policy
4238 			 * now.  Grab previous and next segments first because
4239 			 * they will change on concatenation.
4240 			 */
4241 			prev =  AVL_PREV(tree, newseg);
4242 			next = AVL_NEXT(tree, newseg);
4243 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4244 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4245 
4246 			continue;
4247 		}
4248 
4249 		eoff = off + len;
4250 		oldeoff = seg->shm_off + seg->shm_size;
4251 
4252 		/*
4253 		 * Policy set already?
4254 		 */
4255 		if (policy == seg->shm_policy.mem_policy) {
4256 			/*
4257 			 * Nothing left to do if offset and length
4258 			 * fall within this segment
4259 			 */
4260 			if (eoff <= oldeoff) {
4261 				retval = 1;
4262 				break;
4263 			} else {
4264 				len = eoff - oldeoff;
4265 				off = oldeoff;
4266 				continue;
4267 			}
4268 		}
4269 
4270 		/*
4271 		 * Specified offset and length match existing segment exactly
4272 		 */
4273 		if (off == seg->shm_off && len == seg->shm_size) {
4274 			/*
4275 			 * Set policy and update current length
4276 			 */
4277 			seg->shm_policy.mem_policy = policy;
4278 			seg->shm_policy.mem_reserved = 0;
4279 			len = 0;
4280 
4281 			/*
4282 			 * Try concatenating new segment with previous and next
4283 			 * segments, since they might have the same policy now.
4284 			 * Grab previous and next segments first because they
4285 			 * will change on concatenation.
4286 			 */
4287 			prev =  AVL_PREV(tree, seg);
4288 			next = AVL_NEXT(tree, seg);
4289 			(void) lgrp_shm_policy_concat(tree, seg, next);
4290 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4291 		} else {
4292 			/*
4293 			 * Specified offset and length only apply to part of
4294 			 * existing segment
4295 			 */
4296 
4297 			/*
4298 			 * New segment starts in middle of old one, so split
4299 			 * new one off near beginning of old one
4300 			 */
4301 			newseg = NULL;
4302 			if (off > seg->shm_off) {
4303 				newseg = lgrp_shm_policy_split(tree, seg, off);
4304 
4305 				/*
4306 				 * New segment ends where old one did, so try
4307 				 * to concatenate with next segment
4308 				 */
4309 				if (eoff == oldeoff) {
4310 					newseg->shm_policy.mem_policy = policy;
4311 					newseg->shm_policy.mem_reserved = 0;
4312 					(void) lgrp_shm_policy_concat(tree,
4313 					    newseg, AVL_NEXT(tree, newseg));
4314 					break;
4315 				}
4316 			}
4317 
4318 			/*
4319 			 * New segment ends before old one, so split off end of
4320 			 * old one
4321 			 */
4322 			if (eoff < oldeoff) {
4323 				if (newseg) {
4324 					(void) lgrp_shm_policy_split(tree,
4325 					    newseg, eoff);
4326 					newseg->shm_policy.mem_policy = policy;
4327 					newseg->shm_policy.mem_reserved = 0;
4328 				} else {
4329 					(void) lgrp_shm_policy_split(tree, seg,
4330 					    eoff);
4331 					seg->shm_policy.mem_policy = policy;
4332 					seg->shm_policy.mem_reserved = 0;
4333 				}
4334 
4335 				if (off == seg->shm_off)
4336 					(void) lgrp_shm_policy_concat(tree,
4337 					    AVL_PREV(tree, seg), seg);
4338 				break;
4339 			}
4340 
4341 			/*
4342 			 * Calculate remaining length and next offset
4343 			 */
4344 			len = eoff - oldeoff;
4345 			off = oldeoff;
4346 		}
4347 	}
4348 
4349 	rw_exit(&shm_locality->loc_lock);
4350 	return (retval);
4351 }
4352 
4353 /*
4354  * Return the best memnode from which to allocate memory given
4355  * an lgroup.
4356  *
4357  * "c" is for cookie, which is good enough for me.
4358  * It references a cookie struct that should be zero'ed to initialize.
4359  * The cookie should live on the caller's stack.
4360  *
4361  * The routine returns -1 when:
4362  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4363  *	- traverse is 1, and all the memnodes in the system have been
4364  *	  returned.
4365  */
4366 int
4367 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4368 {
4369 	lgrp_t		*lp = c->lmc_lgrp;
4370 	mnodeset_t	nodes = c->lmc_nodes;
4371 	int		cnt = c->lmc_cnt;
4372 	int		offset, mnode;
4373 
4374 	extern int	max_mem_nodes;
4375 
4376 	/*
4377 	 * If the set is empty, and the caller is willing, traverse
4378 	 * up the hierarchy until we find a non-empty set.
4379 	 */
4380 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4381 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4382 		    ((lp = lp->lgrp_parent) == NULL))
4383 			return (-1);
4384 
4385 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4386 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4387 	}
4388 
4389 	/*
4390 	 * Select a memnode by picking one at a "random" offset.
4391 	 * Because of DR, memnodes can come and go at any time.
4392 	 * This code must be able to cope with the possibility
4393 	 * that the nodes count "cnt" is inconsistent with respect
4394 	 * to the number of elements actually in "nodes", and
4395 	 * therefore that the offset chosen could be greater than
4396 	 * the number of elements in the set (some memnodes may
4397 	 * have dissapeared just before cnt was read).
4398 	 * If this happens, the search simply wraps back to the
4399 	 * beginning of the set.
4400 	 */
4401 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4402 	offset = c->lmc_rand % cnt;
4403 	do {
4404 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4405 			if (nodes & ((mnodeset_t)1 << mnode))
4406 				if (!offset--)
4407 					break;
4408 	} while (mnode >= max_mem_nodes);
4409 
4410 	/* Found a node. Store state before returning. */
4411 	c->lmc_lgrp = lp;
4412 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4413 	c->lmc_cnt = cnt - 1;
4414 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4415 	c->lmc_ntried++;
4416 
4417 	return (mnode);
4418 }
4419