xref: /titanic_50/usr/src/uts/common/os/lgrp.c (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Basic NUMA support in terms of locality groups
31  *
32  * Solaris needs to know which CPUs, memory, etc. are near each other to
33  * provide good performance on NUMA machines by optimizing for locality.
34  * In order to do this, a new abstraction called a "locality group (lgroup)"
35  * has been introduced to keep track of which CPU-like and memory-like hardware
36  * resources are close to each other.  Currently, latency is the only measure
37  * used to determine how to group hardware resources into lgroups, but this
38  * does not limit the groupings to be based solely on latency.  Other factors
39  * may be used to determine the groupings in the future.
40  *
41  * Lgroups are organized into a hierarchy or topology that represents the
42  * latency topology of the machine.  There is always at least a root lgroup in
43  * the system.  It represents all the hardware resources in the machine at a
44  * latency big enough that any hardware resource can at least access any other
45  * hardware resource within that latency.  A Uniform Memory Access (UMA)
46  * machine is represented with one lgroup (the root).  In contrast, a NUMA
47  * machine is represented at least by the root lgroup and some number of leaf
48  * lgroups where the leaf lgroups contain the hardware resources within the
49  * least latency of each other and the root lgroup still contains all the
50  * resources in the machine.  Some number of intermediate lgroups may exist
51  * which represent more levels of locality than just the local latency of the
52  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
53  * (eg. root and intermediate lgroups) contain the next nearest resources to
54  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
55  * to the root lgroup shows the hardware resources from closest to farthest
56  * from the leaf lgroup such that each successive ancestor lgroup contains
57  * the next nearest resources at the next level of locality from the previous.
58  *
59  * The kernel uses the lgroup abstraction to know how to allocate resources
60  * near a given process/thread.  At fork() and lwp/thread_create() time, a
61  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
62  * with the lowest load average.  Binding to a processor or processor set will
63  * change the home lgroup for a thread.  The scheduler has been modified to try
64  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
65  * allocation is lgroup aware too, so memory will be allocated from the current
66  * thread's home lgroup if possible.  If the desired resources are not
67  * available, the kernel traverses the lgroup hierarchy going to the parent
68  * lgroup to find resources at the next level of locality until it reaches the
69  * root lgroup.
70  */
71 
72 #include <sys/lgrp.h>
73 #include <sys/lgrp_user.h>
74 #include <sys/types.h>
75 #include <sys/mman.h>
76 #include <sys/param.h>
77 #include <sys/var.h>
78 #include <sys/thread.h>
79 #include <sys/cpuvar.h>
80 #include <sys/cpupart.h>
81 #include <sys/kmem.h>
82 #include <vm/seg.h>
83 #include <vm/seg_kmem.h>
84 #include <vm/seg_spt.h>
85 #include <vm/seg_vn.h>
86 #include <vm/as.h>
87 #include <sys/atomic.h>
88 #include <sys/systm.h>
89 #include <sys/errno.h>
90 #include <sys/cmn_err.h>
91 #include <sys/kstat.h>
92 #include <sys/sysmacros.h>
93 #include <sys/chip.h>
94 #include <sys/promif.h>
95 #include <sys/sdt.h>
96 
97 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
98 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
99 				/* indexed by lgrp_id */
100 int	nlgrps;			/* number of lgroups in machine */
101 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
102 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
103 
104 /*
105  * Kstat data for lgroups.
106  *
107  * Actual kstat data is collected in lgrp_stats array.
108  * The lgrp_kstat_data array of named kstats is used to extract data from
109  * lgrp_stats and present it to kstat framework. It is protected from parallel
110  * modifications by lgrp_kstat_mutex. This may cause some contention when
111  * several kstat commands run in parallel but this is not the
112  * performance-critical path.
113  */
114 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
115 
116 /*
117  * Declare kstat names statically for enums as defined in the header file.
118  */
119 LGRP_KSTAT_NAMES;
120 
121 static void	lgrp_kstat_init(void);
122 static int	lgrp_kstat_extract(kstat_t *, int);
123 static void	lgrp_kstat_reset(lgrp_id_t);
124 
125 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
126 static kmutex_t lgrp_kstat_mutex;
127 
128 
129 /*
130  * max number of lgroups supported by the platform
131  */
132 int	nlgrpsmax = 0;
133 
134 /*
135  * The root lgroup. Represents the set of resources at the system wide
136  * level of locality.
137  */
138 lgrp_t		*lgrp_root = NULL;
139 
140 /*
141  * During system bootstrap cp_default does not contain the list of lgrp load
142  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
143  * on-line when cp_default is initialized by cpupart_initialize_default().
144  * Configuring CPU0 may create a two-level topology with root and one leaf node
145  * containing CPU0. This topology is initially constructed in a special
146  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
147  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
148  * for all lpl operations until cp_default is fully constructed.
149  *
150  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
151  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
152  * the first element of lpl_bootstrap_list.
153  */
154 #define	LPL_BOOTSTRAP_SIZE 2
155 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
156 lpl_t		*lpl_bootstrap;
157 
158 static lgrp_t	lroot;
159 
160 
161 /*
162  * Size, in bytes, beyond which random memory allocation policy is applied
163  * to non-shared memory.  Default is the maximum size, so random memory
164  * allocation won't be used for non-shared memory by default.
165  */
166 size_t	lgrp_privm_random_thresh = (size_t)(-1);
167 
168 /*
169  * Size, in bytes, beyond which random memory allocation policy is applied to
170  * shared memory.  Default is 8MB (2 ISM pages).
171  */
172 size_t	lgrp_shm_random_thresh = 8*1024*1024;
173 
174 /*
175  * Whether to do processor set aware memory allocation by default
176  */
177 int	lgrp_mem_pset_aware = 0;
178 
179 /*
180  * Set the default memory allocation policy for root lgroup
181  */
182 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
183 
184 /*
185  * Set the default memory allocation policy.  For most platforms,
186  * next touch is sufficient, but some platforms may wish to override
187  * this.
188  */
189 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
190 
191 
192 /*
193  * lgroup CPU event handlers
194  */
195 static void	lgrp_cpu_init(struct cpu *);
196 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
197 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
198 
199 static void	lgrp_latency_change(u_longlong_t, u_longlong_t);
200 
201 /*
202  * lgroup memory event handlers
203  */
204 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
205 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
206 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
207 
208 /*
209  * lgroup CPU partition event handlers
210  */
211 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
212 static void	lgrp_part_del_cpu(struct cpu *);
213 
214 static void	lgrp_root_init(void);
215 
216 /*
217  * lpl topology
218  */
219 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
220 static void	lpl_clear(lpl_t *);
221 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
222 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
223 static void	lpl_rset_add(lpl_t *, lpl_t *);
224 static void	lpl_rset_del(lpl_t *, lpl_t *);
225 static int	lpl_rset_contains(lpl_t *, lpl_t *);
226 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
227 static void	lpl_child_update(lpl_t *, struct cpupart *);
228 static int	lpl_pick(lpl_t *, lpl_t *);
229 static void	lpl_verify_wrapper(struct cpupart *);
230 
231 /*
232  * defines for lpl topology verifier return codes
233  */
234 
235 #define	LPL_TOPO_CORRECT			0
236 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
237 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
238 #define	LPL_TOPO_LGRP_MISMATCH			-3
239 #define	LPL_TOPO_MISSING_PARENT			-4
240 #define	LPL_TOPO_PARENT_MISMATCH		-5
241 #define	LPL_TOPO_BAD_CPUCNT			-6
242 #define	LPL_TOPO_RSET_MISMATCH			-7
243 #define	LPL_TOPO_LPL_ORPHANED			-8
244 #define	LPL_TOPO_LPL_BAD_NCPU			-9
245 #define	LPL_TOPO_RSET_MSSNG_LF			-10
246 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
247 #define	LPL_TOPO_BOGUS_HINT			-12
248 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
249 #define	LPL_TOPO_LGRP_NOT_LEAF			-14
250 #define	LPL_TOPO_BAD_RSETCNT			-15
251 
252 /*
253  * Return whether lgroup optimizations should be enabled on this system
254  */
255 int
256 lgrp_optimizations(void)
257 {
258 	/*
259 	 * System must have more than 2 lgroups to enable lgroup optimizations
260 	 *
261 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
262 	 * with one child lgroup containing all the resources. A 2 lgroup
263 	 * system with a root lgroup directly containing CPUs or memory might
264 	 * need lgroup optimizations with its child lgroup, but there
265 	 * isn't such a machine for now....
266 	 */
267 	if (nlgrps > 2)
268 		return (1);
269 
270 	return (0);
271 }
272 
273 /*
274  * Build full lgroup topology
275  */
276 static void
277 lgrp_root_init(void)
278 {
279 	lgrp_handle_t	hand;
280 	int		i;
281 	lgrp_id_t	id;
282 
283 	/*
284 	 * Create the "root" lgroup
285 	 */
286 	ASSERT(nlgrps == 0);
287 	id = nlgrps++;
288 
289 	lgrp_root = &lroot;
290 
291 	lgrp_root->lgrp_cpu = NULL;
292 	lgrp_root->lgrp_mnodes = 0;
293 	lgrp_root->lgrp_nmnodes = 0;
294 	hand = lgrp_plat_root_hand();
295 	lgrp_root->lgrp_plathand = hand;
296 
297 	lgrp_root->lgrp_id = id;
298 	lgrp_root->lgrp_cpucnt = 0;
299 	lgrp_root->lgrp_childcnt = 0;
300 	klgrpset_clear(lgrp_root->lgrp_children);
301 	klgrpset_clear(lgrp_root->lgrp_leaves);
302 	lgrp_root->lgrp_parent = NULL;
303 	lgrp_root->lgrp_chips = NULL;
304 	lgrp_root->lgrp_chipcnt = 0;
305 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
306 
307 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
308 		klgrpset_clear(lgrp_root->lgrp_set[i]);
309 
310 	lgrp_root->lgrp_kstat = NULL;
311 
312 	lgrp_table[id] = lgrp_root;
313 
314 	/*
315 	 * Setup initial lpl list for CPU0 and initial t0 home.
316 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
317 	 * all topology operations untill cp_default until cp_default is
318 	 * initialized at which point t0.t_lpl will be updated.
319 	 */
320 	lpl_bootstrap = lpl_bootstrap_list;
321 	t0.t_lpl = lpl_bootstrap;
322 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
323 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
324 	cp_default.cp_lgrploads = lpl_bootstrap;
325 }
326 
327 /*
328  * Initialize the lgroup framework and allow the platform to do the same
329  */
330 void
331 lgrp_init(void)
332 {
333 	/*
334 	 * Initialize the platform
335 	 */
336 	lgrp_plat_init();
337 
338 	/*
339 	 * Set max number of lgroups supported on this platform which must be
340 	 * less than the max number of lgroups supported by the common lgroup
341 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
342 	 */
343 	nlgrpsmax = lgrp_plat_max_lgrps();
344 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
345 }
346 
347 /*
348  * Create the root and cpu0's lgroup, and set t0's home.
349  */
350 void
351 lgrp_setup(void)
352 {
353 	/*
354 	 * Setup the root lgroup
355 	 */
356 	lgrp_root_init();
357 
358 	/*
359 	 * Add cpu0 to an lgroup
360 	 */
361 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
362 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
363 }
364 
365 /*
366  * Lgroup initialization is split in two parts. The first part
367  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
368  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
369  * when all CPUs are brought online and all distance information is available.
370  *
371  * When lgrp_main_init() is complete it sets lgrp_initialized. The
372  * lgrp_main_mp_init() sets lgrp_topo_initialized.
373  */
374 
375 /*
376  * true when lgrp initialization has been completed.
377  */
378 int	lgrp_initialized = 0;
379 
380 /*
381  * True when lgrp topology is constructed.
382  */
383 int	lgrp_topo_initialized = 0;
384 
385 /*
386  * Init routine called after startup(), /etc/system has been processed,
387  * and cpu0 has been added to an lgroup.
388  */
389 void
390 lgrp_main_init(void)
391 {
392 	cpu_t		*cp = CPU;
393 	lgrp_id_t	lgrpid;
394 	int		i;
395 	/*
396 	 * Enforce a valid lgrp_mem_default_policy
397 	 */
398 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
399 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
400 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
401 
402 	/*
403 	 * See if mpo should be disabled.
404 	 * This may happen in the case of null proc LPA on Starcat.
405 	 * The platform won't be able to detect null proc LPA until after
406 	 * cpu0 and memory have already been added to lgroups.
407 	 * When and if it is detected, the Starcat platform will return
408 	 * a different platform handle for cpu0 which is what we check for
409 	 * here. If mpo should be disabled move cpu0 to it's rightful place
410 	 * (the root), and destroy the remaining lgroups. This effectively
411 	 * provides an UMA lgroup topology.
412 	 */
413 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
414 	if (lgrp_table[lgrpid]->lgrp_plathand !=
415 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
416 		lgrp_part_del_cpu(cp);
417 		lgrp_cpu_fini(cp, lgrpid);
418 
419 		lgrp_cpu_init(cp);
420 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
421 
422 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
423 
424 		for (i = 0; i <= lgrp_alloc_max; i++) {
425 			if (LGRP_EXISTS(lgrp_table[i]) &&
426 			    lgrp_table[i] != lgrp_root)
427 				lgrp_destroy(lgrp_table[i]);
428 		}
429 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
430 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
431 	}
432 
433 	/*
434 	 * Initialize kstats framework.
435 	 */
436 	lgrp_kstat_init();
437 	/*
438 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
439 	 */
440 	mutex_enter(&cpu_lock);
441 	lgrp_kstat_create(cp);
442 	mutex_exit(&cpu_lock);
443 
444 	lgrp_plat_main_init();
445 	lgrp_initialized = 1;
446 }
447 
448 /*
449  * Finish lgrp initialization after all CPUS are brought on-line.
450  * This routine is called after start_other_cpus().
451  */
452 void
453 lgrp_main_mp_init(void)
454 {
455 	klgrpset_t changed;
456 
457 	/*
458 	 * Update lgroup topology (if necessary)
459 	 */
460 	klgrpset_clear(changed);
461 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
462 	lgrp_topo_initialized = 1;
463 }
464 
465 /*
466  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
467  */
468 void
469 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
470 {
471 	klgrpset_t	changed;
472 	cpu_t		*cp;
473 	lgrp_id_t	id;
474 	int		rc;
475 
476 	switch (event) {
477 	/*
478 	 * The following (re)configuration events are common code
479 	 * initiated. lgrp_plat_config() is called here to inform the
480 	 * platform of the reconfiguration event.
481 	 */
482 	case LGRP_CONFIG_CPU_ADD:
483 		lgrp_plat_config(event, resource);
484 		atomic_add_32(&lgrp_gen, 1);
485 
486 		break;
487 	case LGRP_CONFIG_CPU_DEL:
488 		lgrp_plat_config(event, resource);
489 		atomic_add_32(&lgrp_gen, 1);
490 
491 		break;
492 	case LGRP_CONFIG_CPU_ONLINE:
493 		cp = (cpu_t *)resource;
494 		lgrp_cpu_init(cp);
495 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
496 		rc = lpl_topo_verify(cp->cpu_part);
497 		if (rc != LPL_TOPO_CORRECT) {
498 			panic("lpl_topo_verify failed: %d", rc);
499 		}
500 		lgrp_plat_config(event, resource);
501 		atomic_add_32(&lgrp_gen, 1);
502 
503 		break;
504 	case LGRP_CONFIG_CPU_OFFLINE:
505 		cp = (cpu_t *)resource;
506 		id = cp->cpu_lpl->lpl_lgrpid;
507 		lgrp_part_del_cpu(cp);
508 		lgrp_cpu_fini(cp, id);
509 		rc = lpl_topo_verify(cp->cpu_part);
510 		if (rc != LPL_TOPO_CORRECT) {
511 			panic("lpl_topo_verify failed: %d", rc);
512 		}
513 		lgrp_plat_config(event, resource);
514 		atomic_add_32(&lgrp_gen, 1);
515 
516 		break;
517 	case LGRP_CONFIG_CPUPART_ADD:
518 		cp = (cpu_t *)resource;
519 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
520 		rc = lpl_topo_verify(cp->cpu_part);
521 		if (rc != LPL_TOPO_CORRECT) {
522 			panic("lpl_topo_verify failed: %d", rc);
523 		}
524 		lgrp_plat_config(event, resource);
525 
526 		break;
527 	case LGRP_CONFIG_CPUPART_DEL:
528 		cp = (cpu_t *)resource;
529 		lgrp_part_del_cpu((cpu_t *)resource);
530 		rc = lpl_topo_verify(cp->cpu_part);
531 		if (rc != LPL_TOPO_CORRECT) {
532 			panic("lpl_topo_verify failed: %d", rc);
533 		}
534 		lgrp_plat_config(event, resource);
535 
536 		break;
537 	/*
538 	 * The following events are initiated by the memnode
539 	 * subsystem.
540 	 */
541 	case LGRP_CONFIG_MEM_ADD:
542 		lgrp_mem_init((int)resource, where, B_FALSE);
543 		atomic_add_32(&lgrp_gen, 1);
544 
545 		break;
546 	case LGRP_CONFIG_MEM_DEL:
547 		lgrp_mem_fini((int)resource, where, B_FALSE);
548 		atomic_add_32(&lgrp_gen, 1);
549 
550 		break;
551 	case LGRP_CONFIG_MEM_RENAME: {
552 		lgrp_config_mem_rename_t *ren_arg =
553 		    (lgrp_config_mem_rename_t *)where;
554 
555 		lgrp_mem_rename((int)resource,
556 		    ren_arg->lmem_rename_from,
557 		    ren_arg->lmem_rename_to);
558 		atomic_add_32(&lgrp_gen, 1);
559 
560 		break;
561 	}
562 	case LGRP_CONFIG_GEN_UPDATE:
563 		atomic_add_32(&lgrp_gen, 1);
564 
565 		break;
566 	case LGRP_CONFIG_FLATTEN:
567 		if (where == 0)
568 			lgrp_topo_levels = (int)resource;
569 		else
570 			(void) lgrp_topo_flatten(resource,
571 			    lgrp_table, lgrp_alloc_max, &changed);
572 
573 		break;
574 	/*
575 	 * Initiated by platform latency probing code
576 	 */
577 	case LGRP_CONFIG_LATENCY_CHANGE:
578 		lgrp_latency_change((u_longlong_t)resource,
579 		    (u_longlong_t)where);
580 
581 		break;
582 	case LGRP_CONFIG_NOP:
583 
584 		break;
585 	default:
586 		break;
587 	}
588 
589 }
590 
591 /*
592  * Called to add lgrp info into cpu structure from cpu_add_unit;
593  * do not assume cpu is in cpu[] yet!
594  *
595  * CPUs are brought online with all other CPUs paused so we can't
596  * allocate memory or we could deadlock the system, so we rely on
597  * the platform to statically allocate as much space as we need
598  * for the lgrp structs and stats.
599  */
600 static void
601 lgrp_cpu_init(struct cpu *cp)
602 {
603 	klgrpset_t	changed;
604 	int		count;
605 	lgrp_handle_t	hand;
606 	int		first_cpu;
607 	lgrp_t		*my_lgrp;
608 	lgrp_id_t	lgrpid;
609 	struct cpu	*cptr;
610 	struct chip	*chp;
611 
612 	/*
613 	 * This is the first time through if the resource set
614 	 * for the root lgroup is empty. After cpu0 has been
615 	 * initially added to an lgroup, the root's CPU resource
616 	 * set can never be empty, since the system's last CPU
617 	 * cannot be offlined.
618 	 */
619 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
620 		/*
621 		 * First time through.
622 		 */
623 		first_cpu = 1;
624 	} else {
625 		/*
626 		 * If cpu0 needs to move lgroups, we may come
627 		 * through here again, at which time cpu_lock won't
628 		 * be held, and lgrp_initialized will be false.
629 		 */
630 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
631 		ASSERT(cp->cpu_part != NULL);
632 		first_cpu = 0;
633 	}
634 
635 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
636 	my_lgrp = lgrp_hand_to_lgrp(hand);
637 
638 	if (my_lgrp == NULL) {
639 		/*
640 		 * Create new lgrp and add it to lgroup topology
641 		 */
642 		my_lgrp = lgrp_create();
643 		my_lgrp->lgrp_plathand = hand;
644 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
645 		lgrpid = my_lgrp->lgrp_id;
646 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
647 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
648 
649 		count = 0;
650 		klgrpset_clear(changed);
651 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
652 		    &changed);
653 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
654 	    > 0) {
655 		/*
656 		 * Leaf lgroup was created, but latency wasn't available
657 		 * then.  So, set latency for it and fill in rest of lgroup
658 		 * topology  now that we know how far it is from other leaf
659 		 * lgroups.
660 		 */
661 		lgrpid = my_lgrp->lgrp_id;
662 		klgrpset_clear(changed);
663 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
664 		    lgrpid))
665 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
666 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
667 		    &changed);
668 
669 		/*
670 		 * May have added new intermediate lgroups, so need to add
671 		 * resources other than CPUs which are added below
672 		 */
673 		(void) lgrp_mnode_update(changed, NULL);
674 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
675 	    my_lgrp->lgrp_id)) {
676 		int	i;
677 
678 		/*
679 		 * Update existing lgroup and lgroups containing it with CPU
680 		 * resource
681 		 */
682 		lgrpid = my_lgrp->lgrp_id;
683 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
684 		for (i = 0; i <= lgrp_alloc_max; i++) {
685 			lgrp_t		*lgrp;
686 
687 			lgrp = lgrp_table[i];
688 			if (!LGRP_EXISTS(lgrp) ||
689 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
690 				continue;
691 
692 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
693 		}
694 	}
695 
696 	lgrpid = my_lgrp->lgrp_id;
697 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
698 
699 	/*
700 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
701 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
702 	 * not since none of lgroup IDs in the lpl's have been set yet.
703 	 */
704 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
705 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
706 
707 	/*
708 	 * link the CPU into the lgrp's CPU list
709 	 */
710 	if (my_lgrp->lgrp_cpucnt == 0) {
711 		my_lgrp->lgrp_cpu = cp;
712 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
713 	} else {
714 		cptr = my_lgrp->lgrp_cpu;
715 		cp->cpu_next_lgrp = cptr;
716 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
717 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
718 		cptr->cpu_prev_lgrp = cp;
719 	}
720 	my_lgrp->lgrp_cpucnt++;
721 
722 	/*
723 	 * Add this cpu's chip to the per lgroup list
724 	 * if necessary
725 	 */
726 	if (cp->cpu_chip->chip_lgrp == NULL) {
727 		struct chip *lcpr;
728 
729 		chp = cp->cpu_chip;
730 
731 		if (my_lgrp->lgrp_chipcnt == 0) {
732 			my_lgrp->lgrp_chips = chp;
733 			chp->chip_next_lgrp =
734 			    chp->chip_prev_lgrp = chp;
735 		} else {
736 			lcpr = my_lgrp->lgrp_chips;
737 			chp->chip_next_lgrp = lcpr;
738 			chp->chip_prev_lgrp =
739 			    lcpr->chip_prev_lgrp;
740 			lcpr->chip_prev_lgrp->chip_next_lgrp =
741 			    chp;
742 			lcpr->chip_prev_lgrp = chp;
743 		}
744 		chp->chip_lgrp = my_lgrp;
745 		chp->chip_balance = chp->chip_next_lgrp;
746 		my_lgrp->lgrp_chipcnt++;
747 	}
748 }
749 
750 lgrp_t *
751 lgrp_create(void)
752 {
753 	lgrp_t		*my_lgrp;
754 	lgrp_id_t	lgrpid;
755 	int		i;
756 
757 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
758 
759 	/*
760 	 * Find an open slot in the lgroup table and recycle unused lgroup
761 	 * left there if any
762 	 */
763 	my_lgrp = NULL;
764 	if (lgrp_alloc_hint == -1)
765 		/*
766 		 * Allocate from end when hint not set yet because no lgroups
767 		 * have been deleted yet
768 		 */
769 		lgrpid = nlgrps++;
770 	else {
771 		/*
772 		 * Start looking for next open slot from hint and leave hint
773 		 * at slot allocated
774 		 */
775 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
776 			my_lgrp = lgrp_table[i];
777 			if (!LGRP_EXISTS(my_lgrp)) {
778 				lgrpid = i;
779 				nlgrps++;
780 				break;
781 			}
782 		}
783 		lgrp_alloc_hint = lgrpid;
784 	}
785 
786 	/*
787 	 * Keep track of max lgroup ID allocated so far to cut down on searches
788 	 */
789 	if (lgrpid > lgrp_alloc_max)
790 		lgrp_alloc_max = lgrpid;
791 
792 	/*
793 	 * Need to allocate new lgroup if next open slot didn't have one
794 	 * for recycling
795 	 */
796 	if (my_lgrp == NULL)
797 		my_lgrp = lgrp_plat_alloc(lgrpid);
798 
799 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
800 		panic("Too many lgrps for platform (%d)", nlgrps);
801 
802 	my_lgrp->lgrp_id = lgrpid;
803 	my_lgrp->lgrp_latency = 0;
804 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
805 	my_lgrp->lgrp_parent = NULL;
806 	my_lgrp->lgrp_childcnt = 0;
807 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
808 	my_lgrp->lgrp_nmnodes = 0;
809 	klgrpset_clear(my_lgrp->lgrp_children);
810 	klgrpset_clear(my_lgrp->lgrp_leaves);
811 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
812 		klgrpset_clear(my_lgrp->lgrp_set[i]);
813 
814 	my_lgrp->lgrp_cpu = NULL;
815 	my_lgrp->lgrp_cpucnt = 0;
816 	my_lgrp->lgrp_chips = NULL;
817 	my_lgrp->lgrp_chipcnt = 0;
818 
819 	if (my_lgrp->lgrp_kstat != NULL)
820 		lgrp_kstat_reset(lgrpid);
821 
822 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
823 
824 	return (my_lgrp);
825 }
826 
827 void
828 lgrp_destroy(lgrp_t *lgrp)
829 {
830 	int		i;
831 
832 	/*
833 	 * Unless this lgroup is being destroyed on behalf of
834 	 * the boot CPU, cpu_lock must be held
835 	 */
836 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
837 
838 	if (nlgrps == 1)
839 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
840 
841 	if (!LGRP_EXISTS(lgrp))
842 		return;
843 
844 	/*
845 	 * Set hint to lgroup being deleted and try to keep lower numbered
846 	 * hints to facilitate finding empty slots
847 	 */
848 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
849 		lgrp_alloc_hint = lgrp->lgrp_id;
850 
851 	/*
852 	 * Mark this lgroup to be recycled by setting its lgroup ID to
853 	 * LGRP_NONE and clear relevant fields
854 	 */
855 	lgrp->lgrp_id = LGRP_NONE;
856 	lgrp->lgrp_latency = 0;
857 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
858 	lgrp->lgrp_parent = NULL;
859 	lgrp->lgrp_childcnt = 0;
860 
861 	klgrpset_clear(lgrp->lgrp_children);
862 	klgrpset_clear(lgrp->lgrp_leaves);
863 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
864 		klgrpset_clear(lgrp->lgrp_set[i]);
865 
866 	lgrp->lgrp_mnodes = (mnodeset_t)0;
867 	lgrp->lgrp_nmnodes = 0;
868 
869 	lgrp->lgrp_cpu = NULL;
870 	lgrp->lgrp_cpucnt = 0;
871 	lgrp->lgrp_chipcnt = 0;
872 	lgrp->lgrp_chips = NULL;
873 
874 	nlgrps--;
875 }
876 
877 /*
878  * Initialize kstat data. Called from lgrp intialization code.
879  */
880 static void
881 lgrp_kstat_init(void)
882 {
883 	lgrp_stat_t	stat;
884 
885 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
886 
887 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
888 		kstat_named_init(&lgrp_kstat_data[stat],
889 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
890 }
891 
892 /*
893  * initialize an lgrp's kstats if needed
894  * called with cpu_lock held but not with cpus paused.
895  * we don't tear these down now because we don't know about
896  * memory leaving the lgrp yet...
897  */
898 
899 void
900 lgrp_kstat_create(cpu_t *cp)
901 {
902 	kstat_t		*lgrp_kstat;
903 	lgrp_id_t	lgrpid;
904 	lgrp_t		*my_lgrp;
905 
906 	ASSERT(MUTEX_HELD(&cpu_lock));
907 
908 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
909 	my_lgrp = lgrp_table[lgrpid];
910 
911 	if (my_lgrp->lgrp_kstat != NULL)
912 		return; /* already initialized */
913 
914 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
915 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
916 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
917 
918 	if (lgrp_kstat != NULL) {
919 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
920 		lgrp_kstat->ks_private = my_lgrp;
921 		lgrp_kstat->ks_data = &lgrp_kstat_data;
922 		lgrp_kstat->ks_update = lgrp_kstat_extract;
923 		my_lgrp->lgrp_kstat = lgrp_kstat;
924 		kstat_install(lgrp_kstat);
925 	}
926 }
927 
/*
 * Placeholder for tearing down an lgroup's kstats.
 *
 * This will do something when we manage to remove now unused lgrps.  For
 * now it only asserts that the caller holds cpu_lock; cp is unused.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}
938 
939 /*
940  * Called when a CPU is off-lined.
941  */
942 static void
943 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
944 {
945 	lgrp_t *my_lgrp;
946 	struct cpu *prev;
947 	struct cpu *next;
948 	chip_t  *chp;
949 
950 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
951 
952 	prev = cp->cpu_prev_lgrp;
953 	next = cp->cpu_next_lgrp;
954 
955 	prev->cpu_next_lgrp = next;
956 	next->cpu_prev_lgrp = prev;
957 
958 	/*
959 	 * just because I'm paranoid doesn't mean...
960 	 */
961 
962 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
963 
964 	my_lgrp = lgrp_table[lgrpid];
965 	my_lgrp->lgrp_cpucnt--;
966 
967 	/*
968 	 * If the last CPU on it's chip is being offlined
969 	 * then remove this chip from the per lgroup list.
970 	 *
971 	 * This is also done for the boot CPU when it needs
972 	 * to move between lgroups as a consequence of
973 	 * null proc lpa.
974 	 */
975 	chp = cp->cpu_chip;
976 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
977 
978 		chip_t	*chpp;
979 
980 		if (--my_lgrp->lgrp_chipcnt == 0)
981 			my_lgrp->lgrp_chips = NULL;
982 		else if (my_lgrp->lgrp_chips == chp)
983 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
984 
985 		/*
986 		 * Walk this lgroup's chip list looking for chips that
987 		 * may try to balance against the one that's leaving
988 		 */
989 		for (chpp = chp->chip_next_lgrp; chpp != chp;
990 		    chpp = chpp->chip_next_lgrp) {
991 			if (chpp->chip_balance == chp)
992 				chpp->chip_balance = chp->chip_next_lgrp;
993 		}
994 
995 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
996 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
997 
998 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
999 		chp->chip_lgrp = NULL;
1000 		chp->chip_balance = NULL;
1001 	}
1002 
1003 	/*
1004 	 * Removing last CPU in lgroup, so update lgroup topology
1005 	 */
1006 	if (my_lgrp->lgrp_cpucnt == 0) {
1007 		klgrpset_t	changed;
1008 		int		count;
1009 		int		i;
1010 
1011 		my_lgrp->lgrp_cpu = NULL;
1012 
1013 		/*
1014 		 * Remove this lgroup from its lgroup CPU resources and remove
1015 		 * lgroup from lgroup topology if it doesn't have any more
1016 		 * resources in it now
1017 		 */
1018 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1019 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1020 			count = 0;
1021 			klgrpset_clear(changed);
1022 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1023 			    lgrp_alloc_max + 1, &changed);
1024 			return;
1025 		}
1026 
1027 		/*
1028 		 * This lgroup isn't empty, so just remove it from CPU
1029 		 * resources of any lgroups that contain it as such
1030 		 */
1031 		for (i = 0; i <= lgrp_alloc_max; i++) {
1032 			lgrp_t		*lgrp;
1033 
1034 			lgrp = lgrp_table[i];
1035 			if (!LGRP_EXISTS(lgrp) ||
1036 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1037 			    lgrpid))
1038 				continue;
1039 
1040 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1041 		}
1042 		return;
1043 	}
1044 
1045 	if (my_lgrp->lgrp_cpu == cp)
1046 		my_lgrp->lgrp_cpu = next;
1047 
1048 }
1049 
1050 /*
1051  * Update memory nodes in target lgroups and return ones that get changed
1052  */
1053 int
1054 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1055 {
1056 	int	count;
1057 	int	i;
1058 	int	j;
1059 	lgrp_t	*lgrp;
1060 	lgrp_t	*lgrp_rsrc;
1061 
1062 	count = 0;
1063 	if (changed)
1064 		klgrpset_clear(*changed);
1065 
1066 	if (klgrpset_isempty(target))
1067 		return (0);
1068 
1069 	/*
1070 	 * Find each lgroup in target lgroups
1071 	 */
1072 	for (i = 0; i <= lgrp_alloc_max; i++) {
1073 		/*
1074 		 * Skip any lgroups that don't exist or aren't in target group
1075 		 */
1076 		lgrp = lgrp_table[i];
1077 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1078 			continue;
1079 		}
1080 
1081 		/*
1082 		 * Initialize memnodes for intermediate lgroups to 0
1083 		 * and update them from scratch since they may have completely
1084 		 * changed
1085 		 */
1086 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1087 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1088 			lgrp->lgrp_nmnodes = 0;
1089 		}
1090 
1091 		/*
1092 		 * Update memory nodes of of target lgroup with memory nodes
1093 		 * from each lgroup in its lgroup memory resource set
1094 		 */
1095 		for (j = 0; j <= lgrp_alloc_max; j++) {
1096 			int	k;
1097 
1098 			/*
1099 			 * Skip any lgroups that don't exist or aren't in
1100 			 * memory resources of target lgroup
1101 			 */
1102 			lgrp_rsrc = lgrp_table[j];
1103 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1104 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1105 			    j))
1106 				continue;
1107 
1108 			/*
1109 			 * Update target lgroup's memnodes to include memnodes
1110 			 * of this lgroup
1111 			 */
1112 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1113 				mnodeset_t	mnode_mask;
1114 
1115 				mnode_mask = (mnodeset_t)1 << k;
1116 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1117 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1118 					lgrp->lgrp_mnodes |= mnode_mask;
1119 					lgrp->lgrp_nmnodes++;
1120 				}
1121 			}
1122 			count++;
1123 			if (changed)
1124 				klgrpset_add(*changed, lgrp->lgrp_id);
1125 		}
1126 	}
1127 
1128 	return (count);
1129 }
1130 
1131 /*
1132  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1133  * is moved from one board to another. The "from" and "to" arguments specify the
1134  * source and the destination of the move.
1135  *
1136  * See plat_lgrp_config() for a detailed description of the copy-rename
1137  * semantics.
1138  *
1139  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1140  * the lgroup topology which is changing as memory moves from one lgroup to
1141  * another. It removes the mnode from the source lgroup and re-inserts it in the
1142  * target lgroup.
1143  *
1144  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1145  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1146  * copy-rename operation.
1147  *
1148  * There is one case which requires special handling. If the system contains
1149  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1150  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1151  * lgrp_mem_init), but there is a window when the system has no memory in the
1152  * lgroup hierarchy. If another thread tries to allocate memory during this
1153  * window, the allocation will fail, although the system has physical memory.
1154  * This may cause a system panic or a deadlock (some sleeping memory allocations
1155  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1156  * the mnode back).
1157  *
1158  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1159  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1160  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1161  * but it updates the rest of the lgroup topology as if the mnode was actually
1162  * removed. The lgrp_mem_init() function recognizes that the mnode being
1163  * inserted represents such a special case and updates the topology
1164  * appropriately.
1165  */
1166 void
1167 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1168 {
1169 	/*
1170 	 * Remove the memory from the source node and add it to the destination
1171 	 * node.
1172 	 */
1173 	lgrp_mem_fini(mnode, from, B_TRUE);
1174 	lgrp_mem_init(mnode, to, B_TRUE);
1175 }
1176 
1177 /*
1178  * Called to indicate that the lgrp with platform handle "hand" now
1179  * contains the memory identified by "mnode".
1180  *
1181  * LOCKING for this routine is a bit tricky. Usually it is called without
1182  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1183  * callers. During DR of the board containing the caged memory it may be called
1184  * with cpu_lock already held and CPUs paused.
1185  *
1186  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1187  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1188  * dealing with the special case of DR copy-rename described in
1189  * lgrp_mem_rename().
1190  */
1191 void
1192 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1193 {
1194 	klgrpset_t	changed;
1195 	int		count;
1196 	int		i;
1197 	lgrp_t		*my_lgrp;
1198 	lgrp_id_t	lgrpid;
1199 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1200 	boolean_t	drop_lock = B_FALSE;
1201 	boolean_t	need_synch = B_FALSE;
1202 
1203 	/*
1204 	 * Grab CPU lock (if we haven't already)
1205 	 */
1206 	if (!MUTEX_HELD(&cpu_lock)) {
1207 		mutex_enter(&cpu_lock);
1208 		drop_lock = B_TRUE;
1209 	}
1210 
1211 	/*
1212 	 * This routine may be called from a context where we already
1213 	 * hold cpu_lock, and have already paused cpus.
1214 	 */
1215 	if (!cpus_paused())
1216 		need_synch = B_TRUE;
1217 
1218 	/*
1219 	 * Check if this mnode is already configured and return immediately if
1220 	 * it is.
1221 	 *
1222 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1223 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1224 	 * recognize this case and continue as usual, but skip the update to
1225 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1226 	 * in topology, temporarily introduced by lgrp_mem_fini().
1227 	 */
1228 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1229 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1230 		if (drop_lock)
1231 			mutex_exit(&cpu_lock);
1232 		return;
1233 	}
1234 
1235 	/*
1236 	 * Update lgroup topology with new memory resources, keeping track of
1237 	 * which lgroups change
1238 	 */
1239 	count = 0;
1240 	klgrpset_clear(changed);
1241 	my_lgrp = lgrp_hand_to_lgrp(hand);
1242 	if (my_lgrp == NULL) {
1243 		/* new lgrp */
1244 		my_lgrp = lgrp_create();
1245 		lgrpid = my_lgrp->lgrp_id;
1246 		my_lgrp->lgrp_plathand = hand;
1247 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1248 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1249 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1250 
1251 		if (need_synch)
1252 			pause_cpus(NULL);
1253 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1254 		    &changed);
1255 		if (need_synch)
1256 			start_cpus();
1257 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1258 	    > 0) {
1259 		/*
1260 		 * Leaf lgroup was created, but latency wasn't available
1261 		 * then.  So, set latency for it and fill in rest of lgroup
1262 		 * topology  now that we know how far it is from other leaf
1263 		 * lgroups.
1264 		 */
1265 		klgrpset_clear(changed);
1266 		lgrpid = my_lgrp->lgrp_id;
1267 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1268 		    lgrpid))
1269 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1270 		if (need_synch)
1271 			pause_cpus(NULL);
1272 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1273 		    &changed);
1274 		if (need_synch)
1275 			start_cpus();
1276 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1277 	    my_lgrp->lgrp_id)) {
1278 		klgrpset_add(changed, lgrpid);
1279 		count = 1;
1280 
1281 		lgrpid = my_lgrp->lgrp_id;
1282 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1283 		klgrpset_add(changed, lgrpid);
1284 		count++;
1285 		for (i = 0; i <= lgrp_alloc_max; i++) {
1286 			lgrp_t		*lgrp;
1287 
1288 			lgrp = lgrp_table[i];
1289 			if (!LGRP_EXISTS(lgrp) ||
1290 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1291 				continue;
1292 
1293 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1294 			klgrpset_add(changed, lgrp->lgrp_id);
1295 			count++;
1296 		}
1297 	}
1298 
1299 	/*
1300 	 * Add memory node to lgroup and remove lgroup from ones that need
1301 	 * to be updated
1302 	 */
1303 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1304 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1305 		my_lgrp->lgrp_nmnodes++;
1306 	}
1307 	klgrpset_del(changed, lgrpid);
1308 
1309 	/*
1310 	 * Update memory node information for all lgroups that changed and
1311 	 * contain new memory node as a resource
1312 	 */
1313 	if (count)
1314 		(void) lgrp_mnode_update(changed, NULL);
1315 
1316 	if (drop_lock)
1317 		mutex_exit(&cpu_lock);
1318 }
1319 
1320 /*
1321  * Called to indicate that the lgroup associated with the platform
1322  * handle "hand" no longer contains given memory node
1323  *
1324  * LOCKING for this routine is a bit tricky. Usually it is called without
1325  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1326  * callers. During DR of the board containing the caged memory it may be called
1327  * with cpu_lock already held and CPUs paused.
1328  *
1329  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1330  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1331  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1332  * the same mnode back into the topology. See lgrp_mem_rename() and
1333  * lgrp_mem_init() for additional details.
1334  */
1335 void
1336 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1337 {
1338 	klgrpset_t	changed;
1339 	int		count;
1340 	int		i;
1341 	lgrp_t		*my_lgrp;
1342 	lgrp_id_t	lgrpid;
1343 	mnodeset_t	mnodes_mask;
1344 	boolean_t	drop_lock = B_FALSE;
1345 	boolean_t	need_synch = B_FALSE;
1346 
1347 	/*
1348 	 * Grab CPU lock (if we haven't already)
1349 	 */
1350 	if (!MUTEX_HELD(&cpu_lock)) {
1351 		mutex_enter(&cpu_lock);
1352 		drop_lock = B_TRUE;
1353 	}
1354 
1355 	/*
1356 	 * This routine may be called from a context where we already
1357 	 * hold cpu_lock and have already paused cpus.
1358 	 */
1359 	if (!cpus_paused())
1360 		need_synch = B_TRUE;
1361 
1362 	my_lgrp = lgrp_hand_to_lgrp(hand);
1363 
1364 	/*
1365 	 * The lgrp *must* be pre-existing
1366 	 */
1367 	ASSERT(my_lgrp != NULL);
1368 
1369 	/*
1370 	 * Delete memory node from lgroups which contain it
1371 	 */
1372 	mnodes_mask = ((mnodeset_t)1 << mnode);
1373 	for (i = 0; i <= lgrp_alloc_max; i++) {
1374 		lgrp_t *lgrp = lgrp_table[i];
1375 		/*
1376 		 * Skip any non-existent lgroups and any lgroups that don't
1377 		 * contain leaf lgroup of memory as a memory resource
1378 		 */
1379 		if (!LGRP_EXISTS(lgrp) ||
1380 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1381 			continue;
1382 
1383 		/*
1384 		 * Avoid removing the last mnode from the root in the DR
1385 		 * copy-rename case. See lgrp_mem_rename() for details.
1386 		 */
1387 		if (is_copy_rename &&
1388 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1389 			continue;
1390 
1391 		/*
1392 		 * Remove memory node from lgroup.
1393 		 */
1394 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1395 		lgrp->lgrp_nmnodes--;
1396 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1397 	}
1398 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1399 
1400 	/*
1401 	 * Don't need to update lgroup topology if this lgroup still has memory.
1402 	 *
1403 	 * In the special case of DR copy-rename with the only mnode being
1404 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1405 	 * still need to update the lgroup topology.
1406 	 */
1407 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1408 	    !(is_copy_rename &&
1409 		(my_lgrp == lgrp_root) &&
1410 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1411 		if (drop_lock)
1412 			mutex_exit(&cpu_lock);
1413 		return;
1414 	}
1415 
1416 	/*
1417 	 * This lgroup does not contain any memory now
1418 	 */
1419 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1420 
1421 	/*
1422 	 * Remove this lgroup from lgroup topology if it does not contain any
1423 	 * resources now
1424 	 */
1425 	lgrpid = my_lgrp->lgrp_id;
1426 	count = 0;
1427 	klgrpset_clear(changed);
1428 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1429 		/*
1430 		 * Delete lgroup when no more resources
1431 		 */
1432 		if (need_synch)
1433 			pause_cpus(NULL);
1434 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1435 		    lgrp_alloc_max + 1, &changed);
1436 		ASSERT(count > 0);
1437 		if (need_synch)
1438 			start_cpus();
1439 	} else {
1440 		/*
1441 		 * Remove lgroup from memory resources of any lgroups that
1442 		 * contain it as such
1443 		 */
1444 		for (i = 0; i <= lgrp_alloc_max; i++) {
1445 			lgrp_t		*lgrp;
1446 
1447 			lgrp = lgrp_table[i];
1448 			if (!LGRP_EXISTS(lgrp) ||
1449 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1450 			    lgrpid))
1451 				continue;
1452 
1453 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1454 		}
1455 	}
1456 	if (drop_lock)
1457 		mutex_exit(&cpu_lock);
1458 }
1459 
1460 /*
1461  * Return lgroup with given platform handle
1462  */
1463 lgrp_t *
1464 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1465 {
1466 	int	i;
1467 	lgrp_t	*lgrp;
1468 
1469 	if (hand == LGRP_NULL_HANDLE)
1470 		return (NULL);
1471 
1472 	for (i = 0; i <= lgrp_alloc_max; i++) {
1473 		lgrp = lgrp_table[i];
1474 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1475 			return (lgrp);
1476 	}
1477 	return (NULL);
1478 }
1479 
1480 /*
1481  * Return the home lgroup of the current thread.
1482  * We must do this with kernel preemption disabled, since we don't want our
1483  * thread to be re-homed while we're poking around with its lpl, and the lpl
1484  * should never be NULL.
1485  *
1486  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1487  * is enabled because of DR.  Callers can use disable kernel preemption
1488  * around this call to guarantee that the lgroup will be valid beyond this
1489  * routine, since kernel preemption can be recursive.
1490  */
1491 lgrp_t *
1492 lgrp_home_lgrp(void)
1493 {
1494 	lgrp_t	*lgrp;
1495 	lpl_t	*lpl;
1496 
1497 	kpreempt_disable();
1498 
1499 	lpl = curthread->t_lpl;
1500 	ASSERT(lpl != NULL);
1501 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1502 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1503 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1504 
1505 	kpreempt_enable();
1506 
1507 	return (lgrp);
1508 }
1509 
1510 /*
1511  * Return ID of home lgroup for given thread
1512  * (See comments for lgrp_home_lgrp() for special care and handling
1513  * instructions)
1514  */
1515 lgrp_id_t
1516 lgrp_home_id(kthread_t *t)
1517 {
1518 	lgrp_id_t	lgrp;
1519 	lpl_t		*lpl;
1520 
1521 	ASSERT(t != NULL);
1522 	/*
1523 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1524 	 * cannot since the HAT layer can call into this routine to
1525 	 * determine the locality for its data structures in the context
1526 	 * of a page fault.
1527 	 */
1528 
1529 	kpreempt_disable();
1530 
1531 	lpl = t->t_lpl;
1532 	ASSERT(lpl != NULL);
1533 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1534 	lgrp = lpl->lpl_lgrpid;
1535 
1536 	kpreempt_enable();
1537 
1538 	return (lgrp);
1539 }
1540 
1541 /*
1542  * Return lgroup containing the physical memory for the given page frame number
1543  */
1544 lgrp_t *
1545 lgrp_pfn_to_lgrp(pfn_t pfn)
1546 {
1547 	lgrp_handle_t	hand;
1548 	int		i;
1549 	lgrp_t		*lgrp;
1550 
1551 	hand = lgrp_plat_pfn_to_hand(pfn);
1552 	if (hand != LGRP_NULL_HANDLE)
1553 		for (i = 0; i <= lgrp_alloc_max; i++) {
1554 			lgrp = lgrp_table[i];
1555 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1556 				return (lgrp);
1557 		}
1558 	return (NULL);
1559 }
1560 
1561 /*
1562  * Return lgroup containing the physical memory for the given page frame number
1563  */
1564 lgrp_t *
1565 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1566 {
1567 	lgrp_handle_t	hand;
1568 	int		i;
1569 	lgrp_t		*lgrp;
1570 	pfn_t		pfn;
1571 
1572 	pfn = btop(physaddr);
1573 	hand = lgrp_plat_pfn_to_hand(pfn);
1574 	if (hand != LGRP_NULL_HANDLE)
1575 		for (i = 0; i <= lgrp_alloc_max; i++) {
1576 			lgrp = lgrp_table[i];
1577 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1578 				return (lgrp);
1579 		}
1580 	return (NULL);
1581 }
1582 
1583 /*
1584  * Return the leaf lgroup containing the given CPU
1585  */
1586 static lgrp_t *
1587 lgrp_cpu_to_lgrp(cpu_t *cpu)
1588 {
1589 	return (cpu->cpu_chip->chip_lgrp);
1590 }
1591 
1592 /*
1593  * Return the sum of the partition loads in an lgrp divided by
1594  * the number of CPUs in the lgrp.  This is our best approximation
1595  * of an 'lgroup load average' for a useful per-lgroup kstat.
1596  */
1597 static uint64_t
1598 lgrp_sum_loadavgs(lgrp_t *lgrp)
1599 {
1600 	cpu_t *cpu;
1601 	int ncpu;
1602 	uint64_t loads = 0;
1603 
1604 	mutex_enter(&cpu_lock);
1605 
1606 	cpu = lgrp->lgrp_cpu;
1607 	ncpu = lgrp->lgrp_cpucnt;
1608 
1609 	if (cpu == NULL || ncpu == 0) {
1610 		mutex_exit(&cpu_lock);
1611 		return (0ull);
1612 	}
1613 
1614 	do {
1615 		loads += cpu->cpu_lpl->lpl_loadavg;
1616 		cpu = cpu->cpu_next_lgrp;
1617 	} while (cpu != lgrp->lgrp_cpu);
1618 
1619 	mutex_exit(&cpu_lock);
1620 
1621 	return (loads / ncpu);
1622 }
1623 
1624 void
1625 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1626 {
1627 	struct lgrp_stats *pstats;
1628 
1629 	/*
1630 	 * Verify that the caller isn't trying to add to
1631 	 * a statistic for an lgroup that has gone away
1632 	 */
1633 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1634 		return;
1635 
1636 	pstats = &lgrp_stats[lgrpid];
1637 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1638 }
1639 
1640 int64_t
1641 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1642 {
1643 	uint64_t val;
1644 	struct lgrp_stats *pstats;
1645 
1646 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1647 		return ((int64_t)0);
1648 
1649 	pstats = &lgrp_stats[lgrpid];
1650 	LGRP_STAT_READ(pstats, stat, val);
1651 	return (val);
1652 }
1653 
1654 /*
1655  * Reset all kstats for lgrp specified by its lgrpid.
1656  */
1657 static void
1658 lgrp_kstat_reset(lgrp_id_t lgrpid)
1659 {
1660 	lgrp_stat_t stat;
1661 
1662 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1663 		return;
1664 
1665 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1666 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1667 	}
1668 }
1669 
1670 /*
1671  * Collect all per-lgrp statistics for the lgrp associated with this
1672  * kstat, and store them in the ks_data array.
1673  *
1674  * The superuser can reset all the running counter statistics for an
1675  * lgrp by writing to any of the lgrp's stats.
1676  */
1677 static int
1678 lgrp_kstat_extract(kstat_t *ksp, int rw)
1679 {
1680 	lgrp_stat_t		stat;
1681 	struct kstat_named	*ksd;
1682 	lgrp_t			*lgrp;
1683 	lgrp_id_t		lgrpid;
1684 
1685 	lgrp = (lgrp_t *)ksp->ks_private;
1686 
1687 	ksd = (struct kstat_named *)ksp->ks_data;
1688 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1689 
1690 	lgrpid = lgrp->lgrp_id;
1691 
1692 	if (lgrpid == LGRP_NONE) {
1693 		/*
1694 		 * Return all zeroes as stats for freed lgrp.
1695 		 */
1696 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1697 			ksd[stat].value.i64 = 0;
1698 		}
1699 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1700 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1701 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1702 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1703 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1704 	} else if (rw != KSTAT_WRITE) {
1705 		/*
1706 		 * Handle counter stats
1707 		 */
1708 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1709 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1710 		}
1711 
1712 		/*
1713 		 * Handle kernel data snapshot stats
1714 		 */
1715 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1716 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1717 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1718 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1719 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1720 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1721 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1722 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1723 	} else {
1724 		lgrp_kstat_reset(lgrpid);
1725 	}
1726 
1727 	return (0);
1728 }
1729 
1730 int
1731 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1732 {
1733 	cpu_t	*cp;
1734 
1735 	mutex_enter(&cpu_lock);
1736 
1737 	if ((cp = cpu_get(id)) == NULL) {
1738 		mutex_exit(&cpu_lock);
1739 		return (EINVAL);
1740 	}
1741 
1742 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1743 		mutex_exit(&cpu_lock);
1744 		return (EINVAL);
1745 	}
1746 
1747 	ASSERT(cp->cpu_lpl != NULL);
1748 
1749 	*lp = cp->cpu_lpl->lpl_lgrpid;
1750 
1751 	mutex_exit(&cpu_lock);
1752 
1753 	return (0);
1754 }
1755 
1756 int
1757 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1758 {
1759 	cpu_t *cp;
1760 
1761 	mutex_enter(&cpu_lock);
1762 
1763 	if ((cp = cpu_get(id)) == NULL) {
1764 		mutex_exit(&cpu_lock);
1765 		return (EINVAL);
1766 	}
1767 
1768 	ASSERT(cp->cpu_lpl != NULL);
1769 
1770 	*lp = cp->cpu_lpl->lpl_loadavg;
1771 
1772 	mutex_exit(&cpu_lock);
1773 
1774 	return (0);
1775 }
1776 
1777 void
1778 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1779 {
1780 	lgrp_t		*lgrp;
1781 	int		i;
1782 
1783 	for (i = 0; i <= lgrp_alloc_max; i++) {
1784 		lgrp = lgrp_table[i];
1785 
1786 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1787 			lgrp->lgrp_latency = (int)newtime;
1788 	}
1789 }
1790 
1791 /*
1792  * Add a resource named by lpl_leaf to rset of lpl_target
1793  *
1794  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1795  * resource. It is adjusted here, as this is presently the only place that we
1796  * can be certain a resource addition has succeeded.
1797  *
1798  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1799  * list in order until it reaches a NULL.  (This list is required to be NULL
1800  * terminated, too).  This is done so that we can mark start pos + 1, so that
1801  * each lpl is traversed sequentially, but in a different order.  We hope this
1802  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1803  */
1804 
1805 void
1806 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1807 {
1808 	int		i;
1809 	int		entry_slot = 0;
1810 
1811 	/* return if leaf is already present */
1812 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1813 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1814 			return;
1815 		}
1816 
1817 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1818 		    lpl_leaf->lpl_lgrpid) {
1819 			break;
1820 		}
1821 	}
1822 
1823 	/* insert leaf, update counts */
1824 	entry_slot = i;
1825 	i = lpl_target->lpl_nrset++;
1826 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1827 		panic("More leaf lgrps in system than are supported!\n");
1828 	}
1829 
1830 	/*
1831 	 * Start at the end of the rset array and work backwards towards the
1832 	 * slot into which the new lpl will be inserted. This effectively
1833 	 * preserves the current ordering by scooting everybody over one entry,
1834 	 * and placing the new entry into the space created.
1835 	 */
1836 
1837 	while (i-- > entry_slot) {
1838 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1839 	}
1840 
1841 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1842 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1843 }
1844 
1845 /*
1846  * Update each of lpl_parent's children with a proper hint and
1847  * a reference to their parent.
1848  * The lgrp topology is used as the reference since it is fully
1849  * consistent and correct at this point.
1850  *
1851  * Each child's hint will reference an element in lpl_parent's
1852  * rset that designates where the child should start searching
1853  * for CPU resources. The hint selected is the highest order leaf present
1854  * in the child's lineage.
1855  *
1856  * This should be called after any potential change in lpl_parent's
1857  * rset.
1858  */
1859 static void
1860 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1861 {
1862 	klgrpset_t	children, leaves;
1863 	lpl_t		*lpl;
1864 	int		hint;
1865 	int		i, j;
1866 
1867 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1868 	if (klgrpset_isempty(children))
1869 		return; /* nothing to do */
1870 
1871 	for (i = 0; i <= lgrp_alloc_max; i++) {
1872 		if (klgrpset_ismember(children, i)) {
1873 
1874 			/*
1875 			 * Given the set of leaves in this child's lineage,
1876 			 * find the highest order leaf present in the parent's
1877 			 * rset. Select this as the hint for the child.
1878 			 */
1879 			leaves = lgrp_table[i]->lgrp_leaves;
1880 			hint = 0;
1881 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1882 				lpl = lpl_parent->lpl_rset[j];
1883 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1884 					hint = j;
1885 			}
1886 			cp->cp_lgrploads[i].lpl_hint = hint;
1887 
1888 			/*
1889 			 * (Re)set the parent. It may be incorrect if
1890 			 * lpl_parent is new in the topology.
1891 			 */
1892 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1893 		}
1894 	}
1895 }
1896 
1897 /*
1898  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1899  *
1900  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1901  * resource. The values are adjusted here, as this is the only place that we can
1902  * be certain a resource was successfully deleted.
1903  */
1904 void
1905 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1906 {
1907 	int i;
1908 
1909 	/* find leaf in intermediate node */
1910 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1911 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1912 			break;
1913 	}
1914 
1915 	/* return if leaf not found */
1916 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1917 		return;
1918 
1919 	/* prune leaf, compress array */
1920 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1921 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1922 	lpl_target->lpl_ncpu--;
1923 	do {
1924 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1925 	} while (i++ < lpl_target->lpl_nrset);
1926 }
1927 
1928 /*
1929  * Check to see if the resource set of the target lpl contains the
1930  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1931  */
1932 
1933 int
1934 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1935 {
1936 	int i;
1937 
1938 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1939 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1940 			return (1);
1941 	}
1942 
1943 	return (0);
1944 }
1945 
1946 /*
1947  * Called when we change cpu lpl membership.  This increments or decrements the
1948  * per-cpu counter in every lpl in which our leaf appears.
1949  */
1950 void
1951 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1952 {
1953 	cpupart_t	*cpupart;
1954 	lgrp_t		*lgrp_leaf;
1955 	lgrp_t		*lgrp_cur;
1956 	lpl_t		*lpl_leaf;
1957 	lpl_t		*lpl_cur;
1958 	int		i;
1959 
1960 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1961 
1962 	cpupart = cp->cpu_part;
1963 	lpl_leaf = cp->cpu_lpl;
1964 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1965 
1966 	for (i = 0; i <= lgrp_alloc_max; i++) {
1967 		lgrp_cur = lgrp_table[i];
1968 
1969 		/*
1970 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1971 		 * for the cpu in question, or if the current lgrp and leaf
1972 		 * don't share the same resources.
1973 		 */
1974 
1975 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1976 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1977 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1978 			continue;
1979 
1980 
1981 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1982 
1983 		if (lpl_cur->lpl_nrset > 0) {
1984 			if (act == LPL_INCREMENT) {
1985 				lpl_cur->lpl_ncpu++;
1986 			} else if (act == LPL_DECREMENT) {
1987 				lpl_cur->lpl_ncpu--;
1988 			}
1989 		}
1990 	}
1991 }
1992 
1993 /*
1994  * Initialize lpl with given resources and specified lgrp
1995  */
1996 
1997 void
1998 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
1999 {
2000 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2001 	lpl->lpl_loadavg = 0;
2002 	if (lpl == lpl_leaf)
2003 		lpl->lpl_ncpu = 1;
2004 	else
2005 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2006 	lpl->lpl_nrset = 1;
2007 	lpl->lpl_rset[0] = lpl_leaf;
2008 	lpl->lpl_lgrp = lgrp;
2009 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2010 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2011 }
2012 
2013 /*
2014  * Clear an unused lpl
2015  */
2016 
2017 void
2018 lpl_clear(lpl_t *lpl)
2019 {
2020 	lgrpid_t	lid;
2021 
2022 	/* save lid for debugging purposes */
2023 	lid = lpl->lpl_lgrpid;
2024 	bzero(lpl, sizeof (lpl_t));
2025 	lpl->lpl_lgrpid = lid;
2026 }
2027 
/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist.  This function returns 0 if the topology is
 * correct, and a non-zero error code on non-DEBUG kernels if it is incorrect.
 * Asserts are spread throughout the code to aid in debugging on a DEBUG
 * kernel.
 */
2038 int
2039 lpl_topo_verify(cpupart_t *cpupart)
2040 {
2041 	lgrp_t		*lgrp;
2042 	lpl_t		*lpl;
2043 	klgrpset_t	rset;
2044 	klgrpset_t	cset;
2045 	cpu_t		*cpu;
2046 	cpu_t		*cp_start;
2047 	int		i;
2048 	int		j;
2049 	int		sum;
2050 
2051 	/* topology can't be incorrect if it doesn't exist */
2052 	if (!lgrp_topo_initialized || !lgrp_initialized)
2053 		return (LPL_TOPO_CORRECT);
2054 
2055 	ASSERT(cpupart != NULL);
2056 
2057 	for (i = 0; i <= lgrp_alloc_max; i++) {
2058 		lgrp = lgrp_table[i];
2059 		lpl = NULL;
2060 		/* make sure lpls are allocated */
2061 		ASSERT(cpupart->cp_lgrploads);
2062 		if (!cpupart->cp_lgrploads)
2063 			return (LPL_TOPO_PART_HAS_NO_LPL);
2064 
2065 		lpl = &cpupart->cp_lgrploads[i];
2066 		/* make sure our index is good */
2067 		ASSERT(i < cpupart->cp_nlgrploads);
2068 
2069 		/* if lgroup doesn't exist, make sure lpl is empty */
2070 		if (!LGRP_EXISTS(lgrp)) {
2071 			ASSERT(lpl->lpl_ncpu == 0);
2072 			if (lpl->lpl_ncpu > 0) {
2073 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2074 			} else {
2075 				continue;
2076 			}
2077 		}
2078 
2079 		/* verify that lgroup and lpl are identically numbered */
2080 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2081 
2082 		/* if lgroup isn't in our partition, make sure lpl is empty */
2083 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2084 		    cpupart->cp_lgrpset)) {
2085 			ASSERT(lpl->lpl_ncpu == 0);
2086 			if (lpl->lpl_ncpu > 0) {
2087 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2088 			}
2089 			/*
2090 			 * lpl is empty, and lgroup isn't in partition.  verify
2091 			 * that lpl doesn't show up in anyone else's rsets (in
2092 			 * this partition, anyway)
2093 			 */
2094 
2095 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2096 				lpl_t *i_lpl; /* lpl we're iterating over */
2097 
2098 				i_lpl = &cpupart->cp_lgrploads[j];
2099 
2100 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2101 				if (lpl_rset_contains(i_lpl, lpl)) {
2102 					return (LPL_TOPO_LPL_ORPHANED);
2103 				}
2104 			}
2105 			/* lgroup is empty, and everything is ok. continue */
2106 			continue;
2107 		}
2108 
2109 
2110 		/* lgroup is in this partition, now check it against lpl */
2111 
2112 		/* do both have matching lgrps? */
2113 		ASSERT(lgrp == lpl->lpl_lgrp);
2114 		if (lgrp != lpl->lpl_lgrp) {
2115 			return (LPL_TOPO_LGRP_MISMATCH);
2116 		}
2117 
2118 		/* do the parent lgroups exist and do they match? */
2119 		if (lgrp->lgrp_parent) {
2120 			ASSERT(lpl->lpl_parent);
2121 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2122 				    lpl->lpl_parent->lpl_lgrpid);
2123 
2124 			if (!lpl->lpl_parent) {
2125 				return (LPL_TOPO_MISSING_PARENT);
2126 			} else if (lgrp->lgrp_parent->lgrp_id !=
2127 			    lpl->lpl_parent->lpl_lgrpid) {
2128 				return (LPL_TOPO_PARENT_MISMATCH);
2129 			}
2130 		}
2131 
2132 		/* only leaf lgroups keep a cpucnt, only check leaves */
2133 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2134 
2135 			/* verify that lgrp is also a leaf */
2136 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2137 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2138 			    lpl->lpl_lgrpid)));
2139 
2140 			if ((lgrp->lgrp_childcnt > 0) ||
2141 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2142 			    lpl->lpl_lgrpid))) {
2143 				return (LPL_TOPO_LGRP_NOT_LEAF);
2144 			}
2145 
2146 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2147 			    (lpl->lpl_ncpu > 0));
2148 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2149 				(lpl->lpl_ncpu <= 0)) {
2150 				return (LPL_TOPO_BAD_CPUCNT);
2151 			}
2152 
2153 			/*
2154 			 * Check that lpl_ncpu also matches the number of
2155 			 * cpus in the lpl's linked list.  This only exists in
2156 			 * leaves, but they should always match.
2157 			 */
2158 			j = 0;
2159 			cpu = cp_start = lpl->lpl_cpus;
2160 			while (cpu != NULL) {
2161 				j++;
2162 
2163 				/* check to make sure cpu's lpl is leaf lpl */
2164 				ASSERT(cpu->cpu_lpl == lpl);
2165 				if (cpu->cpu_lpl != lpl) {
2166 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2167 				}
2168 
2169 				/* check next cpu */
2170 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2171 					continue;
2172 				} else {
2173 					cpu = NULL;
2174 				}
2175 			}
2176 
2177 			ASSERT(j == lpl->lpl_ncpu);
2178 			if (j != lpl->lpl_ncpu) {
2179 				return (LPL_TOPO_LPL_BAD_NCPU);
2180 			}
2181 
2182 			/*
2183 			 * Also, check that leaf lpl is contained in all
2184 			 * intermediate lpls that name the leaf as a descendant
2185 			 */
2186 
2187 			for (j = 0; j <= lgrp_alloc_max; j++) {
2188 				klgrpset_t intersect;
2189 				lgrp_t *lgrp_cand;
2190 				lpl_t *lpl_cand;
2191 
2192 				lgrp_cand = lgrp_table[j];
2193 				intersect = klgrpset_intersects(
2194 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2195 				    cpupart->cp_lgrpset);
2196 
2197 				if (!LGRP_EXISTS(lgrp_cand) ||
2198 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2199 				    cpupart->cp_lgrpset) ||
2200 				    (intersect == 0))
2201 					continue;
2202 
2203 				lpl_cand =
2204 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2205 
2206 				if (klgrpset_ismember(intersect,
2207 				    lgrp->lgrp_id)) {
2208 					ASSERT(lpl_rset_contains(lpl_cand,
2209 					    lpl));
2210 
2211 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2212 						return (LPL_TOPO_RSET_MSSNG_LF);
2213 					}
2214 				}
2215 			}
2216 
2217 		} else { /* non-leaf specific checks */
2218 
2219 			/*
2220 			 * Non-leaf lpls should have lpl_cpus == NULL
2221 			 * verify that this is so
2222 			 */
2223 			ASSERT(lpl->lpl_cpus == NULL);
2224 			if (lpl->lpl_cpus != NULL) {
2225 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2226 			}
2227 
2228 			/*
2229 			 * verify that the sum of the cpus in the leaf resources
2230 			 * is equal to the total ncpu in the intermediate
2231 			 */
2232 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2233 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2234 			}
2235 
2236 			ASSERT(sum == lpl->lpl_ncpu);
2237 			if (sum != lpl->lpl_ncpu) {
2238 				return (LPL_TOPO_LPL_BAD_NCPU);
2239 			}
2240 		}
2241 
2242 		/*
2243 		 * check on lpl_hint. Don't check root, since it has no parent.
2244 		 */
2245 		if (lpl->lpl_parent != NULL) {
2246 			int hint;
2247 			lpl_t *hint_lpl;
2248 
2249 			/* make sure hint is within limits of nrset */
2250 			hint = lpl->lpl_hint;
2251 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2252 			if (lpl->lpl_parent->lpl_nrset < hint) {
2253 				return (LPL_TOPO_BOGUS_HINT);
2254 			}
2255 
2256 			/* make sure hint points to valid lpl */
2257 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2258 			ASSERT(hint_lpl->lpl_ncpu > 0);
2259 			if (hint_lpl->lpl_ncpu <= 0) {
2260 				return (LPL_TOPO_BOGUS_HINT);
2261 			}
2262 		}
2263 
2264 		/*
2265 		 * Check the rset of the lpl in question.  Make sure that each
2266 		 * rset contains a subset of the resources in
2267 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2268 		 * sure that each rset doesn't include resources that are
2269 		 * outside of that set.  (Which would be resources somehow not
2270 		 * accounted for).
2271 		 */
2272 
2273 		klgrpset_clear(rset);
2274 		for (j = 0; j < lpl->lpl_nrset; j++) {
2275 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2276 		}
2277 		klgrpset_copy(cset, rset);
2278 		/* make sure lpl rset matches lgrp rset */
2279 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2280 		/* make sure rset is contained with in partition, too */
2281 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2282 
2283 		ASSERT(klgrpset_isempty(rset) &&
2284 			    klgrpset_isempty(cset));
2285 		if (!klgrpset_isempty(rset) ||
2286 		    !klgrpset_isempty(cset)) {
2287 			return (LPL_TOPO_RSET_MISMATCH);
2288 		}
2289 
2290 		/*
2291 		 * check to make sure lpl_nrset matches the number of rsets
2292 		 * contained in the lpl
2293 		 */
2294 
2295 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2296 		    j++);
2297 
2298 		ASSERT(j == lpl->lpl_nrset);
2299 		if (j != lpl->lpl_nrset) {
2300 			return (LPL_TOPO_BAD_RSETCNT);
2301 		}
2302 
2303 	}
2304 	return (LPL_TOPO_CORRECT);
2305 }
2306 
2307 /*
2308  * Flatten lpl topology to given number of levels.  This is presently only
2309  * implemented for a flatten to 2 levels, which will prune out the intermediates
2310  * and home the leaf lpls to the root lpl.
2311  */
2312 int
2313 lpl_topo_flatten(int levels)
2314 {
2315 	int		i;
2316 	uint_t		sum;
2317 	lgrp_t		*lgrp_cur;
2318 	lpl_t		*lpl_cur;
2319 	lpl_t		*lpl_root;
2320 	cpupart_t	*cp;
2321 
2322 	if (levels != 2)
2323 		return (0);
2324 
2325 	/* called w/ cpus paused - grab no locks! */
2326 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2327 	    !lgrp_initialized);
2328 
2329 	cp = cp_list_head;
2330 	do {
2331 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2332 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2333 
2334 		for (i = 0; i <= lgrp_alloc_max; i++) {
2335 			lgrp_cur = lgrp_table[i];
2336 			lpl_cur = &cp->cp_lgrploads[i];
2337 
2338 			if ((lgrp_cur == lgrp_root) ||
2339 			    (!LGRP_EXISTS(lgrp_cur) &&
2340 			    (lpl_cur->lpl_ncpu == 0)))
2341 				continue;
2342 
2343 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2344 				/*
2345 				 * this should be a deleted intermediate, so
2346 				 * clear it
2347 				 */
2348 				lpl_clear(lpl_cur);
2349 			} else if ((lpl_cur->lpl_nrset == 1) &&
2350 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2351 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2352 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2353 				/*
2354 				 * this is a leaf whose parent was deleted, or
2355 				 * whose parent had their lgrp deleted.  (And
2356 				 * whose parent will soon be deleted).  Point
2357 				 * this guy back to the root lpl.
2358 				 */
2359 				lpl_cur->lpl_parent = lpl_root;
2360 				lpl_rset_add(lpl_root, lpl_cur);
2361 			}
2362 
2363 		}
2364 
2365 		/*
2366 		 * Now that we're done, make sure the count on the root lpl is
2367 		 * correct, and update the hints of the children for the sake of
2368 		 * thoroughness
2369 		 */
2370 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2371 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2372 		}
2373 		lpl_root->lpl_ncpu = sum;
2374 		lpl_child_update(lpl_root, cp);
2375 
2376 		cp = cp->cp_next;
2377 	} while (cp != cp_list_head);
2378 
2379 	return (levels);
2380 }
2381 
2382 /*
2383  * Insert a lpl into the resource hierarchy and create any additional lpls that
2384  * are necessary to represent the varying states of locality for the cpu
2385  * resoruces newly added to the partition.
2386  *
2387  * This routine is clever enough that it can correctly add resources from the
2388  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2389  * those for which the lpl is a leaf as opposed to simply a named equally local
2390  * resource).  The one special case that needs additional processing is when a
2391  * new intermediate lpl is introduced.  Since the main loop only traverses
2392  * looking to add the leaf resource where it does not yet exist, additional work
2393  * is necessary to add other leaf resources that may need to exist in the newly
2394  * created intermediate.  This is performed by the second inner loop, and is
2395  * only done when the check for more than one overlapping resource succeeds.
2396  */
2397 
2398 void
2399 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2400 {
2401 	int		i;
2402 	int		j;
2403 	int		hint;
2404 	int		rset_num_intersect;
2405 	lgrp_t		*lgrp_cur;
2406 	lpl_t		*lpl_cur;
2407 	lpl_t		*lpl_parent;
2408 	lgrpid_t	parent_id;
2409 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2410 
2411 	for (i = 0; i <= lgrp_alloc_max; i++) {
2412 		lgrp_cur = lgrp_table[i];
2413 
2414 		/*
2415 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2416 		 * contained within the current lgrp, or if the current lgrp has
2417 		 * no leaves in this partition
2418 		 */
2419 
2420 		if (!LGRP_EXISTS(lgrp_cur) ||
2421 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2422 		    lpl_leaf->lpl_lgrpid) ||
2423 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2424 		    cpupart->cp_lgrpset))
2425 			continue;
2426 
2427 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2428 		if (lgrp_cur->lgrp_parent != NULL) {
2429 			/* if lgrp has a parent, assign it properly */
2430 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2431 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2432 		} else {
2433 			/* if not, make sure parent ptr gets set to null */
2434 			lpl_parent = NULL;
2435 		}
2436 
2437 		if (lpl_cur == lpl_leaf) {
2438 			/*
2439 			 * Almost all leaf state was initialized elsewhere.  The
2440 			 * only thing left to do is to set the parent.
2441 			 */
2442 			lpl_cur->lpl_parent = lpl_parent;
2443 			continue;
2444 		}
2445 
2446 		/*
2447 		 * Initialize intermediate lpl
2448 		 * Save this lpl's hint though. Since we're changing this
2449 		 * lpl's resources, we need to update the hint in this lpl's
2450 		 * children, but the hint in this lpl is unaffected and
2451 		 * should be preserved.
2452 		 */
2453 		hint = lpl_cur->lpl_hint;
2454 
2455 		lpl_clear(lpl_cur);
2456 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2457 
2458 		lpl_cur->lpl_hint = hint;
2459 		lpl_cur->lpl_parent = lpl_parent;
2460 
2461 		/* does new lpl need to be populated with other resources? */
2462 		rset_intersect =
2463 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2464 			cpupart->cp_lgrpset);
2465 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2466 
2467 		if (rset_num_intersect > 1) {
2468 			/*
2469 			 * If so, figure out what lpls have resources that
2470 			 * intersect this one, and add them.
2471 			 */
2472 			for (j = 0; j <= lgrp_alloc_max; j++) {
2473 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2474 				lpl_t	*lpl_cand;	/* candidate lpl */
2475 
2476 				lgrp_cand = lgrp_table[j];
2477 				if (!LGRP_EXISTS(lgrp_cand) ||
2478 				    !klgrpset_ismember(rset_intersect,
2479 					lgrp_cand->lgrp_id))
2480 					continue;
2481 				lpl_cand =
2482 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2483 				lpl_rset_add(lpl_cur, lpl_cand);
2484 			}
2485 		}
2486 		/*
2487 		 * This lpl's rset has changed. Update the hint in it's
2488 		 * children.
2489 		 */
2490 		lpl_child_update(lpl_cur, cpupart);
2491 	}
2492 }
2493 
2494 /*
2495  * remove a lpl from the hierarchy of resources, clearing its state when
2496  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2497  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2498  * delete them as well.
2499  */
2500 
2501 void
2502 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2503 {
2504 	int		i;
2505 	lgrp_t		*lgrp_cur;
2506 	lpl_t		*lpl_cur;
2507 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2508 
2509 	for (i = 0; i <= lgrp_alloc_max; i++) {
2510 		lgrp_cur = lgrp_table[i];
2511 
2512 		/*
2513 		 * Don't attempt to remove from lgrps that aren't there, that
2514 		 * don't contain our leaf, or from the leaf itself. (We do that
2515 		 * later)
2516 		 */
2517 
2518 		if (!LGRP_EXISTS(lgrp_cur))
2519 			continue;
2520 
2521 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2522 
2523 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2524 		    lpl_leaf->lpl_lgrpid) ||
2525 		    (lpl_cur == lpl_leaf)) {
2526 			continue;
2527 		}
2528 
2529 		/*
2530 		 * This is a slightly sleazy simplification in that we have
2531 		 * already marked the cp_lgrpset as no longer containing the
2532 		 * leaf we've deleted.  Any lpls that pass the above checks
2533 		 * based upon lgrp membership but not necessarily cpu-part
2534 		 * membership also get cleared by the checks below.  Currently
2535 		 * this is harmless, as the lpls should be empty anyway.
2536 		 *
2537 		 * In particular, we want to preserve lpls that have additional
2538 		 * leaf resources, even though we don't yet have a processor
2539 		 * architecture that represents resources this way.
2540 		 */
2541 
2542 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2543 		    cpupart->cp_lgrpset);
2544 
2545 		lpl_rset_del(lpl_cur, lpl_leaf);
2546 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2547 			lpl_clear(lpl_cur);
2548 		} else {
2549 			/*
2550 			 * Update this lpl's children
2551 			 */
2552 			lpl_child_update(lpl_cur, cpupart);
2553 		}
2554 	}
2555 	lpl_clear(lpl_leaf);
2556 }
2557 
2558 /*
2559  * add a cpu to a partition in terms of lgrp load avg bookeeping
2560  *
2561  * The lpl (cpu partition load average information) is now arranged in a
2562  * hierarchical fashion whereby resources that are closest, ie. most local, to
2563  * the cpu in question are considered to be leaves in a tree of resources.
2564  * There are two general cases for cpu additon:
2565  *
2566  * 1. A lpl structure that contains resources already in the hierarchy tree.
2567  * In this case, all of the associated lpl relationships have been defined, and
2568  * all that is necessary is that we link the new cpu into the per-lpl list of
2569  * cpus, and increment the ncpu count of all places where this cpu resource will
2570  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2571  * pushing is accomplished by this routine.
2572  *
2573  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2574  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2575  * construct the hierarchy of state necessary to name it's more distant
2576  * resources, if they should exist.  The leaf structure is initialized by this
2577  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2578  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2579  * and builds all of the "ancestoral" state necessary to identify resources at
2580  * differing levels of locality.
2581  */
2582 void
2583 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2584 {
2585 	cpupart_t	*cpupart;
2586 	lgrp_t		*lgrp_leaf;
2587 	lpl_t		*lpl_leaf;
2588 
2589 	/* called sometimes w/ cpus paused - grab no locks */
2590 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2591 
2592 	cpupart = cp->cpu_part;
2593 	lgrp_leaf = lgrp_table[lgrpid];
2594 
2595 	/* don't add non-existent lgrp */
2596 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2597 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2598 	cp->cpu_lpl = lpl_leaf;
2599 
2600 	/* only leaf lpls contain cpus */
2601 
2602 	if (lpl_leaf->lpl_ncpu++ == 0) {
2603 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2604 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2605 		lpl_leaf_insert(lpl_leaf, cpupart);
2606 	} else {
2607 		/*
2608 		 * the lpl should already exist in the parent, so just update
2609 		 * the count of available CPUs
2610 		 */
2611 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2612 	}
2613 
2614 	/* link cpu into list of cpus in lpl */
2615 
2616 	if (lpl_leaf->lpl_cpus) {
2617 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2618 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2619 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2620 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2621 	} else {
2622 		/*
2623 		 * We increment ncpu immediately after we create a new leaf
2624 		 * lpl, so assert that ncpu == 1 for the case where we don't
2625 		 * have any cpu pointers yet.
2626 		 */
2627 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2628 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2629 	}
2630 
2631 }
2632 
2633 
2634 /*
2635  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2636  *
2637  * The lpl (cpu partition load average information) is now arranged in a
2638  * hierarchical fashion whereby resources that are closest, ie. most local, to
2639  * the cpu in question are considered to be leaves in a tree of resources.
2640  * There are two removal cases in question:
2641  *
2642  * 1. Removal of the resource in the leaf leaves other resources remaining in
2643  * that leaf.  (Another cpu still exists at this level of locality).  In this
2644  * case, the count of available cpus is decremented in all assocated lpls by
2645  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2646  * from the per-cpu lpl list.
2647  *
2648  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2649  * empty)  In this case, all of what has occurred for the first step must take
2650  * place; however, additionally we must remove the lpl structure itself, prune
2651  * out any stranded lpls that do not directly name a leaf resource, and mark the
2652  * cpu partition in question as no longer containing resources from the lgrp of
2653  * the lpl that has been delted.  Cpu-partition changes are handled by this
2654  * method, but the lpl_leaf_remove function deals with the details of pruning
2655  * out the empty lpl and any of its orphaned direct ancestors.
2656  */
2657 void
2658 lgrp_part_del_cpu(cpu_t *cp)
2659 {
2660 	lpl_t		*lpl;
2661 	lpl_t		*leaf_lpl;
2662 	lgrp_t		*lgrp_leaf;
2663 
2664 	/* called sometimes w/ cpus paused - grab no locks */
2665 
2666 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2667 
2668 	lpl = leaf_lpl = cp->cpu_lpl;
2669 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2670 
2671 	/* don't delete a leaf that isn't there */
2672 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2673 
2674 	/* no double-deletes */
2675 	ASSERT(lpl->lpl_ncpu);
2676 	if (--lpl->lpl_ncpu == 0) {
2677 		/*
2678 		 * This was the last cpu in this lgroup for this partition,
2679 		 * clear its bit in the partition's lgroup bitmask
2680 		 */
2681 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2682 
2683 		/* eliminate remaning lpl link pointers in cpu, lpl */
2684 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2685 
2686 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2687 	} else {
2688 
2689 		/* unlink cpu from lists of cpus in lpl */
2690 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2691 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2692 		if (lpl->lpl_cpus == cp) {
2693 			lpl->lpl_cpus = cp->cpu_next_lpl;
2694 		}
2695 
2696 		/*
2697 		 * Update the cpu count in the lpls associated with parent
2698 		 * lgroups.
2699 		 */
2700 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2701 
2702 	}
2703 	/* clear cpu's lpl ptr when we're all done */
2704 	cp->cpu_lpl = NULL;
2705 }
2706 
2707 /*
2708  * Recompute load average for the specified partition/lgrp fragment.
2709  *
2710  * We rely on the fact that this routine is called from the clock thread
2711  * at a point before the clock thread can block (i.e. before its first
2712  * lock request).  Since the clock thread can not be preempted (since it
2713  * runs at highest priority), we know that cpu partitions can not change
2714  * (since doing so would require either the repartition requester or the
2715  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2716  * without grabbing cpu_lock.
2717  */
2718 void
2719 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2720 {
2721 	uint_t		ncpu;
2722 	int64_t		old, new, f;
2723 
2724 	/*
2725 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2726 	 */
2727 	static short expval[] = {
2728 	    0, 3196, 1618, 1083,
2729 	    814, 652, 543, 466,
2730 	    408, 363, 326, 297,
2731 	    272, 251, 233, 218,
2732 	    204, 192, 181, 172,
2733 	    163, 155, 148, 142,
2734 	    136, 130, 125, 121,
2735 	    116, 112, 109, 105
2736 	};
2737 
2738 	/* ASSERT (called from clock level) */
2739 
2740 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2741 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2742 		return;
2743 	}
2744 
2745 	for (;;) {
2746 
2747 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2748 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2749 		else
2750 			f = expval[ncpu];
2751 
2752 		/*
2753 		 * Modify the load average atomically to avoid losing
2754 		 * anticipatory load updates (see lgrp_move_thread()).
2755 		 */
2756 		if (ageflag) {
2757 			/*
2758 			 * We're supposed to both update and age the load.
2759 			 * This happens 10 times/sec. per cpu.  We do a
2760 			 * little hoop-jumping to avoid integer overflow.
2761 			 */
2762 			int64_t		q, r;
2763 
2764 			do {
2765 				old = new = lpl->lpl_loadavg;
2766 				q = (old  >> 16) << 7;
2767 				r = (old  & 0xffff) << 7;
2768 				new += ((long long)(nrcpus - q) * f -
2769 				    ((r * f) >> 16)) >> 7;
2770 
2771 				/*
2772 				 * Check for overflow
2773 				 */
2774 				if (new > LGRP_LOADAVG_MAX)
2775 					new = LGRP_LOADAVG_MAX;
2776 				else if (new < 0)
2777 					new = 0;
2778 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2779 			    new) != old);
2780 		} else {
2781 			/*
2782 			 * We're supposed to update the load, but not age it.
2783 			 * This option is used to update the load (which either
2784 			 * has already been aged in this 1/10 sec. interval or
2785 			 * soon will be) to account for a remotely executing
2786 			 * thread.
2787 			 */
2788 			do {
2789 				old = new = lpl->lpl_loadavg;
2790 				new += f;
2791 				/*
2792 				 * Check for overflow
2793 				 * Underflow not possible here
2794 				 */
2795 				if (new < old)
2796 					new = LGRP_LOADAVG_MAX;
2797 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2798 			    new) != old);
2799 		}
2800 
2801 		/*
2802 		 * Do the same for this lpl's parent
2803 		 */
2804 		if ((lpl = lpl->lpl_parent) == NULL)
2805 			break;
2806 		ncpu = lpl->lpl_ncpu;
2807 	}
2808 }
2809 
2810 /*
2811  * Initialize lpl topology in the target based on topology currently present in
2812  * lpl_bootstrap.
2813  *
2814  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2815  * initialize cp_default list of lpls. Up to this point all topology operations
2816  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2817  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2818  * `target' points to the list of lpls in cp_default and `size' is the size of
2819  * this list.
2820  *
2821  * This function walks the lpl topology in lpl_bootstrap and does for things:
2822  *
2823  * 1) Copies all fields from lpl_bootstrap to the target.
2824  *
2825  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2826  *
2827  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2828  *    instead of lpl_bootstrap.
2829  *
2830  * 4) Updates pointers in the resource list of the target to point to the lpls
2831  *    in the target list instead of lpl_bootstrap.
2832  *
2833  * After lpl_topo_bootstrap() completes, target contains the same information
2834  * that would be present there if it were used during boot instead of
2835  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2836  * and it is bzeroed.
2837  */
2838 void
2839 lpl_topo_bootstrap(lpl_t *target, int size)
2840 {
2841 	lpl_t	*lpl = lpl_bootstrap;
2842 	lpl_t	*target_lpl = target;
2843 	int	howmany;
2844 	int	id;
2845 	int	i;
2846 
2847 	/*
2848 	 * The only target that should be passed here is cp_default lpl list.
2849 	 */
2850 	ASSERT(target == cp_default.cp_lgrploads);
2851 	ASSERT(size == cp_default.cp_nlgrploads);
2852 	ASSERT(!lgrp_topo_initialized);
2853 	ASSERT(ncpus == 1);
2854 
2855 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2856 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2857 		/*
2858 		 * Copy all fields from lpl.
2859 		 */
2860 
2861 		*target_lpl = *lpl;
2862 
2863 		/*
2864 		 * Substitute CPU0 lpl pointer with one relative to target.
2865 		 */
2866 		if (lpl->lpl_cpus == CPU) {
2867 			ASSERT(CPU->cpu_lpl == lpl);
2868 			CPU->cpu_lpl = target_lpl;
2869 		}
2870 
2871 		/*
2872 		 * Substitute parent information with parent relative to target.
2873 		 */
2874 		if (lpl->lpl_parent != NULL)
2875 			target_lpl->lpl_parent = (lpl_t *)
2876 			    (((uintptr_t)lpl->lpl_parent -
2877 				(uintptr_t)lpl_bootstrap) +
2878 				(uintptr_t)target);
2879 
2880 		/*
2881 		 * Walk over resource set substituting pointers relative to
2882 		 * lpl_bootstrap to pointers relative to target.
2883 		 */
2884 		ASSERT(lpl->lpl_nrset <= 1);
2885 
2886 		for (id = 0; id < lpl->lpl_nrset; id++) {
2887 			if (lpl->lpl_rset[id] != NULL) {
2888 				target_lpl->lpl_rset[id] =
2889 				    (lpl_t *)
2890 				    (((uintptr_t)lpl->lpl_rset[id] -
2891 					(uintptr_t)lpl_bootstrap) +
2892 					(uintptr_t)target);
2893 			}
2894 		}
2895 	}
2896 
2897 	/*
2898 	 * Topology information in lpl_bootstrap is no longer needed.
2899 	 */
2900 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2901 }
2902 
/*
 * The maximum effect that a single thread can have on its lgroup's load.
 * The macro scales the tunable down by the number of cpus passed in.
 */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

/* the threshold above, scaled down by the number of cpus passed in */
#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

/* the difference above, scaled down by the number of cpus passed in */
#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to imprecision of the load average decay algorithm.
 *
 * The default tolerance is the same value lgrp_loadavg_max_effect defaults
 * to (LGRP_LOADAVG_THREAD_MAX). Note that the tunable tolerance is scaled by
 * the number of cpus in the lgroup just like lgrp_loadavg_max_effect. For
 * example, if lgrp_loadavg_tolerance = 0x10000, and ncpu = 4, then
 * lgrp_choose will consider differences in lgroup loads of:
 * 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
/*
 * lgrp_loadavg_tolerance scaled down by the given cpu count.  The argument is
 * parenthesized so that an expression such as (a + b) divides as a whole.
 */
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / (ncpu))
2944 
/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold.  The default is the largest 32-bit
 * value (UINT32_MAX).
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup.  The value is a page count;
 * the check is only made when it is greater than zero, so the default of 0
 * turns it off.
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 *
 * lgrp_choose_policy holds the policy in effect and defaults to
 * LGRP_CHOOSE_TIME.
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2969 
2970 /*
2971  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2972  * be bound to a CPU or processor set.
2973  *
2974  * Arguments:
2975  *	t		The thread
2976  *	cpupart		The partition the thread belongs to.
2977  *
2978  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2979  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2980  *	 partitions changing out from under us and assumes that given thread is
2981  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2982  *	 disabled, so don't grab any locks because we should never block under
2983  *	 those conditions.
2984  */
2985 lpl_t *
2986 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2987 {
2988 	lgrp_load_t	bestload, bestrload;
2989 	int		lgrpid_offset, lgrp_count;
2990 	lgrp_id_t	lgrpid, lgrpid_start;
2991 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2992 	klgrpset_t	lgrpset;
2993 	proc_t		*p;
2994 
2995 	ASSERT(t != NULL);
2996 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2997 	    THREAD_LOCK_HELD(t));
2998 	ASSERT(cpupart != NULL);
2999 
3000 	p = t->t_procp;
3001 
3002 	/* A process should always be in an active partition */
3003 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3004 
3005 	bestlpl = bestrlpl = NULL;
3006 	bestload = bestrload = LGRP_LOADAVG_MAX;
3007 	lgrpset = cpupart->cp_lgrpset;
3008 
3009 	switch (lgrp_choose_policy) {
3010 	case LGRP_CHOOSE_RR:
3011 		lgrpid = cpupart->cp_lgrp_hint;
3012 		do {
3013 			if (++lgrpid > lgrp_alloc_max)
3014 				lgrpid = 0;
3015 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3016 
3017 		break;
3018 	default:
3019 	case LGRP_CHOOSE_TIME:
3020 	case LGRP_CHOOSE_RANDOM:
3021 		klgrpset_nlgrps(lgrpset, lgrp_count);
3022 		lgrpid_offset =
3023 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3024 		for (lgrpid = 0; ; lgrpid++) {
3025 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3026 				if (--lgrpid_offset == 0)
3027 					break;
3028 			}
3029 		}
3030 		break;
3031 	}
3032 
3033 	lgrpid_start = lgrpid;
3034 
3035 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3036 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3037 
3038 	/*
3039 	 * Use lgroup affinities (if any) to choose best lgroup
3040 	 *
3041 	 * NOTE: Assumes that thread is protected from going away and its
3042 	 *	 lgroup affinities won't change (ie. p_lock, or
3043 	 *	 thread_lock() being held and/or CPUs paused)
3044 	 */
3045 	if (t->t_lgrp_affinity) {
3046 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3047 		if (lpl != NULL)
3048 			return (lpl);
3049 	}
3050 
3051 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3052 	bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3053 
3054 	do {
3055 		pgcnt_t	npgs;
3056 
3057 		/*
3058 		 * Skip any lgroups outside of thread's pset
3059 		 */
3060 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3061 			if (++lgrpid > lgrp_alloc_max)
3062 				lgrpid = 0;	/* wrap the search */
3063 			continue;
3064 		}
3065 
3066 		/*
3067 		 * Skip any non-leaf lgroups
3068 		 */
3069 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3070 			continue;
3071 
3072 		/*
3073 		 * Skip any lgroups without enough free memory
3074 		 * (when threshold set to nonzero positive value)
3075 		 */
3076 		if (lgrp_mem_free_thresh > 0) {
3077 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3078 			if (npgs < lgrp_mem_free_thresh) {
3079 				if (++lgrpid > lgrp_alloc_max)
3080 					lgrpid = 0;	/* wrap the search */
3081 				continue;
3082 			}
3083 		}
3084 
3085 		lpl = &cpupart->cp_lgrploads[lgrpid];
3086 		if (klgrpset_isempty(p->p_lgrpset) ||
3087 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3088 			/*
3089 			 * Either this is a new process or the process already
3090 			 * has threads on this lgrp, so this is a preferred
3091 			 * lgroup for the thread.
3092 			 */
3093 			if (lpl_pick(lpl, bestlpl)) {
3094 				bestload = lpl->lpl_loadavg;
3095 				bestlpl = lpl;
3096 			}
3097 		} else {
3098 			/*
3099 			 * The process doesn't have any threads on this lgrp,
3100 			 * but we're willing to consider this lgrp if the load
3101 			 * difference is big enough to justify splitting up
3102 			 * the process' threads.
3103 			 */
3104 			if (lpl_pick(lpl, bestrlpl)) {
3105 				bestrload = lpl->lpl_loadavg;
3106 				bestrlpl = lpl;
3107 			}
3108 		}
3109 		if (++lgrpid > lgrp_alloc_max)
3110 			lgrpid = 0;	/* wrap the search */
3111 	} while (lgrpid != lgrpid_start);
3112 
3113 	/*
3114 	 * Return root lgroup if threshold isn't set to maximum value and
3115 	 * lowest lgroup load average more than a certain threshold
3116 	 */
3117 	if (lgrp_load_thresh != UINT32_MAX &&
3118 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3119 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3120 
3121 	/*
3122 	 * If all the lgroups over which the thread's process is spread are
3123 	 * heavily loaded, we'll consider placing the thread on one of the
3124 	 * other leaf lgroups in the thread's partition.
3125 	 */
3126 	if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3127 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3128 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3129 	    bestload)) {
3130 		bestlpl = bestrlpl;
3131 	}
3132 
3133 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3134 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3135 
3136 	ASSERT(bestlpl->lpl_ncpu > 0);
3137 	return (bestlpl);
3138 }
3139 
3140 /*
3141  * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing.
3142  */
3143 static int
3144 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3145 {
3146 	lgrp_load_t	l1, l2;
3147 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3148 
3149 
3150 	if (lpl2 == NULL)
3151 		return (1);
3152 
3153 	l1 = lpl1->lpl_loadavg;
3154 	l2 = lpl2->lpl_loadavg;
3155 
3156 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3157 		/* lpl1 is significantly less loaded than lpl2 */
3158 		return (1);
3159 	}
3160 
3161 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3162 	    l1 + tolerance >= l2 && l1 < l2 &&
3163 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3164 		/*
3165 		 * lpl1's load is within the tolerance of lpl2. We're
3166 		 * willing to consider it be to better however if
3167 		 * it has been longer since we last homed a thread there
3168 		 */
3169 		return (1);
3170 	}
3171 
3172 	return (0);
3173 }
3174 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  lgrp_move_thread() adds this
 * value to the thread's t_anttime and compares the sum against gethrtime(),
 * so it should not be set to an extremely large value (say, larger than
 * 100 years), to avoid problems with overflow in that calculation.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3183 
3184 /*
3185  * Routine to change a thread's lgroup affiliation.  This routine updates
3186  * the thread's kthread_t struct and its process' proc_t struct to note the
3187  * thread's new lgroup affiliation, and its lgroup affinities.
3188  *
3189  * Note that this is the only routine that modifies a thread's t_lpl field,
3190  * and that adds in or removes anticipatory load.
3191  *
3192  * If the thread is exiting, newlpl is NULL.
3193  *
3194  * Locking:
3195  * The following lock must be held on entry:
3196  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3197  *		doesn't get removed from t's partition
3198  *
3199  * This routine is not allowed to grab any locks, since it may be called
3200  * with cpus paused (such as from cpu_offline).
3201  */
3202 void
3203 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3204 {
3205 	proc_t		*p;
3206 	lpl_t		*lpl, *oldlpl;
3207 	lgrp_id_t	oldid;
3208 	kthread_t	*tp;
3209 	uint_t		ncpu;
3210 	lgrp_load_t	old, new;
3211 
3212 	ASSERT(t);
3213 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3214 	    THREAD_LOCK_HELD(t));
3215 
3216 	/*
3217 	 * If not changing lpls, just return
3218 	 */
3219 	if ((oldlpl = t->t_lpl) == newlpl)
3220 		return;
3221 
3222 	/*
3223 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3224 	 * associated with process 0 rather than with its original process).
3225 	 */
3226 	if (t->t_proc_flag & TP_LWPEXIT) {
3227 		if (newlpl != NULL) {
3228 			t->t_lpl = newlpl;
3229 		}
3230 		return;
3231 	}
3232 
3233 	p = ttoproc(t);
3234 
3235 	/*
3236 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3237 	 * to account for it being moved from its old lgroup.
3238 	 */
3239 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3240 	    (p->p_tlist != NULL)) {
3241 		oldid = oldlpl->lpl_lgrpid;
3242 
3243 		if (newlpl != NULL)
3244 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3245 
3246 		if ((do_lgrpset_delete) &&
3247 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3248 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3249 				/*
3250 				 * Check if a thread other than the thread
3251 				 * that's moving is assigned to the same
3252 				 * lgroup as the thread that's moving.  Note
3253 				 * that we have to compare lgroup IDs, rather
3254 				 * than simply comparing t_lpl's, since the
3255 				 * threads may belong to different partitions
3256 				 * but be assigned to the same lgroup.
3257 				 */
3258 				ASSERT(tp->t_lpl != NULL);
3259 
3260 				if ((tp != t) &&
3261 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3262 					/*
3263 					 * Another thread is assigned to the
3264 					 * same lgroup as the thread that's
3265 					 * moving, p_lgrpset doesn't change.
3266 					 */
3267 					break;
3268 				} else if (tp == p->p_tlist) {
3269 					/*
3270 					 * No other thread is assigned to the
3271 					 * same lgroup as the exiting thread,
3272 					 * clear the lgroup's bit in p_lgrpset.
3273 					 */
3274 					klgrpset_del(p->p_lgrpset, oldid);
3275 					break;
3276 				}
3277 			}
3278 		}
3279 
3280 		/*
3281 		 * If this thread was assigned to its old lgroup for such a
3282 		 * short amount of time that the anticipatory load that was
3283 		 * added on its behalf has aged very little, remove that
3284 		 * anticipatory load.
3285 		 */
3286 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3287 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3288 			lpl = oldlpl;
3289 			for (;;) {
3290 				do {
3291 					old = new = lpl->lpl_loadavg;
3292 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3293 					if (new > old) {
3294 						/*
3295 						 * this can happen if the load
3296 						 * average was aged since we
3297 						 * added in the anticipatory
3298 						 * load
3299 						 */
3300 						new = 0;
3301 					}
3302 				} while (cas32(
3303 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3304 					    new) != old);
3305 
3306 				lpl = lpl->lpl_parent;
3307 				if (lpl == NULL)
3308 					break;
3309 
3310 				ncpu = lpl->lpl_ncpu;
3311 				ASSERT(ncpu > 0);
3312 			}
3313 		}
3314 	}
3315 	/*
3316 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3317 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3318 	 * to its new lgroup to account for its move to its new lgroup.
3319 	 */
3320 	if (newlpl != NULL) {
3321 		/*
3322 		 * This thread is moving to a new lgroup
3323 		 */
3324 		t->t_lpl = newlpl;
3325 
3326 		/*
3327 		 * Reflect move in load average of new lgroup
3328 		 * unless it is root lgroup
3329 		 */
3330 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3331 			return;
3332 
3333 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3334 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3335 		}
3336 
3337 		/*
3338 		 * It'll take some time for the load on the new lgroup
3339 		 * to reflect this thread's placement on it.  We'd
3340 		 * like not, however, to have all threads between now
3341 		 * and then also piling on to this lgroup.  To avoid
3342 		 * this pileup, we anticipate the load this thread
3343 		 * will generate on its new lgroup.  The goal is to
3344 		 * make the lgroup's load appear as though the thread
3345 		 * had been there all along.  We're very conservative
3346 		 * in calculating this anticipatory load, we assume
3347 		 * the worst case case (100% CPU-bound thread).  This
3348 		 * may be modified in the future to be more accurate.
3349 		 */
3350 		lpl = newlpl;
3351 		for (;;) {
3352 			ncpu = lpl->lpl_ncpu;
3353 			ASSERT(ncpu > 0);
3354 			do {
3355 				old = new = lpl->lpl_loadavg;
3356 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3357 				/*
3358 				 * Check for overflow
3359 				 * Underflow not possible here
3360 				 */
3361 				if (new < old)
3362 					new = UINT32_MAX;
3363 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3364 			    new) != old);
3365 
3366 			lpl = lpl->lpl_parent;
3367 			if (lpl == NULL)
3368 				break;
3369 		}
3370 		t->t_anttime = gethrtime();
3371 	}
3372 }
3373 
3374 /*
3375  * Return lgroup memory allocation policy given advice from madvise(3C)
3376  */
3377 lgrp_mem_policy_t
3378 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3379 {
3380 	switch (advice) {
3381 	case MADV_ACCESS_LWP:
3382 		return (LGRP_MEM_POLICY_NEXT);
3383 	case MADV_ACCESS_MANY:
3384 		return (LGRP_MEM_POLICY_RANDOM);
3385 	default:
3386 		return (lgrp_mem_policy_default(size, type));
3387 	}
3388 }
3389 
3390 /*
3391  * Figure out default policy
3392  */
3393 lgrp_mem_policy_t
3394 lgrp_mem_policy_default(size_t size, int type)
3395 {
3396 	cpupart_t		*cp;
3397 	lgrp_mem_policy_t	policy;
3398 	size_t			pset_mem_size;
3399 
3400 	/*
3401 	 * Randomly allocate memory across lgroups for shared memory
3402 	 * beyond a certain threshold
3403 	 */
3404 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3405 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3406 		/*
3407 		 * Get total memory size of current thread's pset
3408 		 */
3409 		kpreempt_disable();
3410 		cp = curthread->t_cpupart;
3411 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3412 		kpreempt_enable();
3413 
3414 		/*
3415 		 * Choose policy to randomly allocate memory across
3416 		 * lgroups in pset if it will fit and is not default
3417 		 * partition.  Otherwise, allocate memory randomly
3418 		 * across machine.
3419 		 */
3420 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3421 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3422 		else
3423 			policy = LGRP_MEM_POLICY_RANDOM;
3424 	} else
3425 		/*
3426 		 * Apply default policy for private memory and
3427 		 * shared memory under the respective random
3428 		 * threshold.
3429 		 */
3430 		policy = lgrp_mem_default_policy;
3431 
3432 	return (policy);
3433 }
3434 
3435 /*
3436  * Get memory allocation policy for this segment
3437  */
3438 lgrp_mem_policy_info_t *
3439 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3440 {
3441 	lgrp_mem_policy_info_t	*policy_info;
3442 	extern struct seg_ops	segspt_ops;
3443 	extern struct seg_ops	segspt_shmops;
3444 
3445 	/*
3446 	 * This is for binary compatibility to protect against third party
3447 	 * segment drivers which haven't recompiled to allow for
3448 	 * SEGOP_GETPOLICY()
3449 	 */
3450 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3451 	    seg->s_ops != &segspt_shmops)
3452 		return (NULL);
3453 
3454 	policy_info = NULL;
3455 	if (seg->s_ops->getpolicy != NULL)
3456 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3457 
3458 	return (policy_info);
3459 }
3460 
3461 /*
3462  * Set policy for allocating private memory given desired policy, policy info,
3463  * size in bytes of memory that policy is being applied.
3464  * Return 0 if policy wasn't set already and 1 if policy was set already
3465  */
3466 int
3467 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3468     lgrp_mem_policy_info_t *policy_info, size_t size)
3469 {
3470 
3471 	ASSERT(policy_info != NULL);
3472 
3473 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3474 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3475 
3476 	/*
3477 	 * Policy set already?
3478 	 */
3479 	if (policy == policy_info->mem_policy)
3480 		return (1);
3481 
3482 	/*
3483 	 * Set policy
3484 	 */
3485 	policy_info->mem_policy = policy;
3486 	policy_info->mem_reserved = 0;
3487 
3488 	return (0);
3489 }
3490 
3491 
3492 /*
3493  * Get shared memory allocation policy with given tree and offset
3494  */
3495 lgrp_mem_policy_info_t *
3496 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3497     u_offset_t vn_off)
3498 {
3499 	u_offset_t		off;
3500 	lgrp_mem_policy_info_t	*policy_info;
3501 	lgrp_shm_policy_seg_t	*policy_seg;
3502 	lgrp_shm_locality_t	*shm_locality;
3503 	avl_tree_t		*tree;
3504 	avl_index_t		where;
3505 
3506 	/*
3507 	 * Get policy segment tree from anon_map or vnode and use specified
3508 	 * anon index or vnode offset as offset
3509 	 *
3510 	 * Assume that no lock needs to be held on anon_map or vnode, since
3511 	 * they should be protected by their reference count which must be
3512 	 * nonzero for an existing segment
3513 	 */
3514 	if (amp) {
3515 		ASSERT(amp->refcnt != 0);
3516 		shm_locality = amp->locality;
3517 		if (shm_locality == NULL)
3518 			return (NULL);
3519 		tree = shm_locality->loc_tree;
3520 		off = ptob(anon_index);
3521 	} else if (vp) {
3522 		shm_locality = vp->v_locality;
3523 		if (shm_locality == NULL)
3524 			return (NULL);
3525 		ASSERT(shm_locality->loc_count != 0);
3526 		tree = shm_locality->loc_tree;
3527 		off = vn_off;
3528 	}
3529 
3530 	if (tree == NULL)
3531 		return (NULL);
3532 
3533 	/*
3534 	 * Lookup policy segment for offset into shared object and return
3535 	 * policy info
3536 	 */
3537 	rw_enter(&shm_locality->loc_lock, RW_READER);
3538 	policy_info = NULL;
3539 	policy_seg = avl_find(tree, &off, &where);
3540 	if (policy_seg)
3541 		policy_info = &policy_seg->shm_policy;
3542 	rw_exit(&shm_locality->loc_lock);
3543 
3544 	return (policy_info);
3545 }
3546 
3547 /*
3548  * Return lgroup to use for allocating memory
3549  * given the segment and address
3550  *
3551  * There isn't any mutual exclusion that exists between calls
3552  * to this routine and DR, so this routine and whomever calls it
3553  * should be mindful of the possibility that the lgrp returned
3554  * may be deleted. If this happens, dereferences of the lgrp
3555  * pointer will still be safe, but the resources in the lgrp will
3556  * be gone, and LGRP_EXISTS() will no longer be true.
3557  */
3558 lgrp_t *
3559 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3560 {
3561 	int			i;
3562 	lgrp_t			*lgrp;
3563 	klgrpset_t		lgrpset;
3564 	int			lgrps_spanned;
3565 	unsigned long		off;
3566 	lgrp_mem_policy_t	policy;
3567 	lgrp_mem_policy_info_t	*policy_info;
3568 	ushort_t		random;
3569 	int			stat = 0;
3570 
3571 	/*
3572 	 * Just return null if the lgrp framework hasn't finished
3573 	 * initializing or if this is a UMA machine.
3574 	 */
3575 	if (nlgrps == 1 || !lgrp_initialized)
3576 		return (lgrp_root);
3577 
3578 	/*
3579 	 * Get memory allocation policy for this segment
3580 	 */
3581 	policy = lgrp_mem_default_policy;
3582 	if (seg != NULL) {
3583 		if (seg->s_as == &kas) {
3584 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3585 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3586 				policy = LGRP_MEM_POLICY_RANDOM;
3587 		} else {
3588 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3589 			if (policy_info != NULL)
3590 				policy = policy_info->mem_policy;
3591 		}
3592 	}
3593 	lgrpset = 0;
3594 
3595 	/*
3596 	 * Initialize lgroup to home by default
3597 	 */
3598 	lgrp = lgrp_home_lgrp();
3599 
3600 	/*
3601 	 * When homing threads on root lgrp, override default memory
3602 	 * allocation policies with root lgroup memory allocation policy
3603 	 */
3604 	if (lgrp == lgrp_root)
3605 		policy = lgrp_mem_policy_root;
3606 
3607 	/*
3608 	 * Implement policy
3609 	 */
3610 	switch (policy) {
3611 	case LGRP_MEM_POLICY_NEXT_CPU:
3612 
3613 		/*
3614 		 * Return lgroup of current CPU which faulted on memory
3615 		 */
3616 		lgrp = lgrp_cpu_to_lgrp(CPU);
3617 		break;
3618 
3619 	case LGRP_MEM_POLICY_NEXT:
3620 	case LGRP_MEM_POLICY_DEFAULT:
3621 	default:
3622 
3623 		/*
3624 		 * Just return current thread's home lgroup
3625 		 * for default policy (next touch)
3626 		 * If the thread is homed to the root,
3627 		 * then the default policy is random across lgroups.
3628 		 * Fallthrough to the random case.
3629 		 */
3630 		if (lgrp != lgrp_root) {
3631 			if (policy == LGRP_MEM_POLICY_NEXT)
3632 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3633 			else
3634 				lgrp_stat_add(lgrp->lgrp_id,
3635 				    LGRP_NUM_DEFAULT, 1);
3636 			break;
3637 		}
3638 		/* LINTED fallthrough on case statement */
3639 	case LGRP_MEM_POLICY_RANDOM:
3640 
3641 		/*
3642 		 * Return a random leaf lgroup with memory
3643 		 */
3644 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3645 		/*
3646 		 * Count how many lgroups are spanned
3647 		 */
3648 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3649 
3650 		/*
3651 		 * There may be no memnodes in the root lgroup during DR copy
3652 		 * rename on a system with only two boards (memnodes)
3653 		 * configured. In this case just return the root lgrp.
3654 		 */
3655 		if (lgrps_spanned == 0) {
3656 			lgrp = lgrp_root;
3657 			break;
3658 		}
3659 
3660 		/*
3661 		 * Pick a random offset within lgroups spanned
3662 		 * and return lgroup at that offset
3663 		 */
3664 		random = (ushort_t)gethrtime() >> 4;
3665 		off = random % lgrps_spanned;
3666 		ASSERT(off <= lgrp_alloc_max);
3667 
3668 		for (i = 0; i <= lgrp_alloc_max; i++) {
3669 			if (!klgrpset_ismember(lgrpset, i))
3670 				continue;
3671 			if (off)
3672 				off--;
3673 			else {
3674 				lgrp = lgrp_table[i];
3675 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3676 				    1);
3677 				break;
3678 			}
3679 		}
3680 		break;
3681 
3682 	case LGRP_MEM_POLICY_RANDOM_PROC:
3683 
3684 		/*
3685 		 * Grab copy of bitmask of lgroups spanned by
3686 		 * this process
3687 		 */
3688 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3689 		stat = LGRP_NUM_RANDOM_PROC;
3690 
3691 		/* LINTED fallthrough on case statement */
3692 	case LGRP_MEM_POLICY_RANDOM_PSET:
3693 
3694 		if (!stat)
3695 			stat = LGRP_NUM_RANDOM_PSET;
3696 
3697 		if (klgrpset_isempty(lgrpset)) {
3698 			/*
3699 			 * Grab copy of bitmask of lgroups spanned by
3700 			 * this processor set
3701 			 */
3702 			kpreempt_disable();
3703 			klgrpset_copy(lgrpset,
3704 			    curthread->t_cpupart->cp_lgrpset);
3705 			kpreempt_enable();
3706 		}
3707 
3708 		/*
3709 		 * Count how many lgroups are spanned
3710 		 */
3711 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3712 		ASSERT(lgrps_spanned <= nlgrps);
3713 
3714 		/*
3715 		 * Probably lgrps_spanned should be always non-zero, but to be
3716 		 * on the safe side we return lgrp_root if it is empty.
3717 		 */
3718 		if (lgrps_spanned == 0) {
3719 			lgrp = lgrp_root;
3720 			break;
3721 		}
3722 
3723 		/*
3724 		 * Pick a random offset within lgroups spanned
3725 		 * and return lgroup at that offset
3726 		 */
3727 		random = (ushort_t)gethrtime() >> 4;
3728 		off = random % lgrps_spanned;
3729 		ASSERT(off <= lgrp_alloc_max);
3730 
3731 		for (i = 0; i <= lgrp_alloc_max; i++) {
3732 			if (!klgrpset_ismember(lgrpset, i))
3733 				continue;
3734 			if (off)
3735 				off--;
3736 			else {
3737 				lgrp = lgrp_table[i];
3738 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3739 				    1);
3740 				break;
3741 			}
3742 		}
3743 		break;
3744 
3745 	case LGRP_MEM_POLICY_ROUNDROBIN:
3746 
3747 		/*
3748 		 * Use offset within segment to determine
3749 		 * offset from home lgroup to choose for
3750 		 * next lgroup to allocate memory from
3751 		 */
3752 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3753 		    (lgrp_alloc_max + 1);
3754 
3755 		kpreempt_disable();
3756 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3757 		i = lgrp->lgrp_id;
3758 		kpreempt_enable();
3759 
3760 		while (off > 0) {
3761 			i = (i + 1) % (lgrp_alloc_max + 1);
3762 			lgrp = lgrp_table[i];
3763 			if (klgrpset_ismember(lgrpset, i))
3764 				off--;
3765 		}
3766 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3767 
3768 		break;
3769 	}
3770 
3771 	ASSERT(lgrp != NULL);
3772 	return (lgrp);
3773 }
3774 
3775 /*
3776  * Return the number of pages in an lgroup
3777  *
3778  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3779  *	 could cause tests that rely on the numat driver to fail....
3780  */
3781 pgcnt_t
3782 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3783 {
3784 	lgrp_t *lgrp;
3785 
3786 	lgrp = lgrp_table[lgrpid];
3787 	if (!LGRP_EXISTS(lgrp) ||
3788 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3789 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3790 		return (0);
3791 
3792 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3793 }
3794 
3795 /*
3796  * Initialize lgroup shared memory allocation policy support
3797  */
3798 void
3799 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3800 {
3801 	lgrp_shm_locality_t	*shm_locality;
3802 
3803 	/*
3804 	 * Initialize locality field in anon_map
3805 	 * Don't need any locks because this is called when anon_map is
3806 	 * allocated, but not used anywhere yet.
3807 	 */
3808 	if (amp) {
3809 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3810 		if (amp->locality == NULL) {
3811 			/*
3812 			 * Allocate and initialize shared memory locality info
3813 			 * and set anon_map locality pointer to it
3814 			 * Drop lock across kmem_alloc(KM_SLEEP)
3815 			 */
3816 			ANON_LOCK_EXIT(&amp->a_rwlock);
3817 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3818 			    KM_SLEEP);
3819 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3820 			    NULL);
3821 			shm_locality->loc_count = 1;	/* not used for amp */
3822 			shm_locality->loc_tree = NULL;
3823 
3824 			/*
3825 			 * Reacquire lock and check to see whether anyone beat
3826 			 * us to initializing the locality info
3827 			 */
3828 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3829 			if (amp->locality != NULL) {
3830 				rw_destroy(&shm_locality->loc_lock);
3831 				kmem_free(shm_locality,
3832 				    sizeof (*shm_locality));
3833 			} else
3834 				amp->locality = shm_locality;
3835 		}
3836 		ANON_LOCK_EXIT(&amp->a_rwlock);
3837 		return;
3838 	}
3839 
3840 	/*
3841 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3842 	 */
3843 	mutex_enter(&vp->v_lock);
3844 	if ((vp->v_flag & V_LOCALITY) == 0) {
3845 		/*
3846 		 * Allocate and initialize shared memory locality info
3847 		 */
3848 		mutex_exit(&vp->v_lock);
3849 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3850 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3851 		shm_locality->loc_count = 1;
3852 		shm_locality->loc_tree = NULL;
3853 
3854 		/*
3855 		 * Point vnode locality field at shared vnode policy info
3856 		 * and set locality aware flag in vnode
3857 		 */
3858 		mutex_enter(&vp->v_lock);
3859 		if ((vp->v_flag & V_LOCALITY) == 0) {
3860 			vp->v_locality = shm_locality;
3861 			vp->v_flag |= V_LOCALITY;
3862 		} else {
3863 			/*
3864 			 * Lost race so free locality info and increment count.
3865 			 */
3866 			rw_destroy(&shm_locality->loc_lock);
3867 			kmem_free(shm_locality, sizeof (*shm_locality));
3868 			shm_locality = vp->v_locality;
3869 			shm_locality->loc_count++;
3870 		}
3871 		mutex_exit(&vp->v_lock);
3872 
3873 		return;
3874 	}
3875 
3876 	/*
3877 	 * Increment reference count of number of segments mapping this vnode
3878 	 * shared
3879 	 */
3880 	shm_locality = vp->v_locality;
3881 	shm_locality->loc_count++;
3882 	mutex_exit(&vp->v_lock);
3883 }
3884 
3885 /*
3886  * Destroy the given shared memory policy segment tree
3887  */
3888 void
3889 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3890 {
3891 	lgrp_shm_policy_seg_t	*cur;
3892 	lgrp_shm_policy_seg_t	*next;
3893 
3894 	if (tree == NULL)
3895 		return;
3896 
3897 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3898 	while (cur != NULL) {
3899 		next = AVL_NEXT(tree, cur);
3900 		avl_remove(tree, cur);
3901 		kmem_free(cur, sizeof (*cur));
3902 		cur = next;
3903 	}
3904 	kmem_free(tree, sizeof (avl_tree_t));
3905 }
3906 
3907 /*
3908  * Uninitialize lgroup shared memory allocation policy support
3909  */
3910 void
3911 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3912 {
3913 	lgrp_shm_locality_t	*shm_locality;
3914 
3915 	/*
3916 	 * For anon_map, deallocate shared memory policy tree and
3917 	 * zero locality field
3918 	 * Don't need any locks because anon_map is being freed
3919 	 */
3920 	if (amp) {
3921 		if (amp->locality == NULL)
3922 			return;
3923 		shm_locality = amp->locality;
3924 		shm_locality->loc_count = 0;	/* not really used for amp */
3925 		rw_destroy(&shm_locality->loc_lock);
3926 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3927 		kmem_free(shm_locality, sizeof (*shm_locality));
3928 		amp->locality = 0;
3929 		return;
3930 	}
3931 
3932 	/*
3933 	 * For vnode, decrement reference count of segments mapping this vnode
3934 	 * shared and delete locality info if reference count drops to 0
3935 	 */
3936 	mutex_enter(&vp->v_lock);
3937 	shm_locality = vp->v_locality;
3938 	shm_locality->loc_count--;
3939 
3940 	if (shm_locality->loc_count == 0) {
3941 		rw_destroy(&shm_locality->loc_lock);
3942 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3943 		kmem_free(shm_locality, sizeof (*shm_locality));
3944 		vp->v_locality = 0;
3945 		vp->v_flag &= ~V_LOCALITY;
3946 	}
3947 	mutex_exit(&vp->v_lock);
3948 }
3949 
3950 /*
3951  * Compare two shared memory policy segments
3952  * Used by AVL tree code for searching
3953  */
3954 int
3955 lgrp_shm_policy_compar(const void *x, const void *y)
3956 {
3957 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3958 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3959 
3960 	if (a->shm_off < b->shm_off)
3961 		return (-1);
3962 	if (a->shm_off >= b->shm_off + b->shm_size)
3963 		return (1);
3964 	return (0);
3965 }
3966 
3967 /*
3968  * Concatenate seg1 with seg2 and remove seg2
3969  */
3970 static int
3971 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
3972     lgrp_shm_policy_seg_t *seg2)
3973 {
3974 	if (!seg1 || !seg2 ||
3975 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
3976 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
3977 		return (-1);
3978 
3979 	seg1->shm_size += seg2->shm_size;
3980 	avl_remove(tree, seg2);
3981 	kmem_free(seg2, sizeof (*seg2));
3982 	return (0);
3983 }
3984 
3985 /*
3986  * Split segment at given offset and return rightmost (uppermost) segment
3987  * Assumes that there are no overlapping segments
3988  */
3989 static lgrp_shm_policy_seg_t *
3990 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
3991     u_offset_t off)
3992 {
3993 	lgrp_shm_policy_seg_t	*newseg;
3994 	avl_index_t		where;
3995 
3996 	ASSERT(seg != NULL);
3997 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
3998 
3999 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4000 	    seg->shm_size)
4001 		return (NULL);
4002 
4003 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4004 		return (seg);
4005 
4006 	/*
4007 	 * Adjust size of left segment and allocate new (right) segment
4008 	 */
4009 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4010 	newseg->shm_policy = seg->shm_policy;
4011 	newseg->shm_off = off;
4012 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4013 	seg->shm_size = off - seg->shm_off;
4014 
4015 	/*
4016 	 * Find where to insert new segment in AVL tree and insert it
4017 	 */
4018 	(void) avl_find(tree, &off, &where);
4019 	avl_insert(tree, newseg, where);
4020 
4021 	return (newseg);
4022 }
4023 
4024 /*
4025  * Set shared memory allocation policy on specified shared object at given
4026  * offset and length
4027  *
4028  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4029  * -1 if can't set policy.
4030  */
4031 int
4032 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4033     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4034 {
4035 	u_offset_t		eoff;
4036 	lgrp_shm_policy_seg_t	*next;
4037 	lgrp_shm_policy_seg_t	*newseg;
4038 	u_offset_t		off;
4039 	u_offset_t		oldeoff;
4040 	lgrp_shm_policy_seg_t	*prev;
4041 	int			retval;
4042 	lgrp_shm_policy_seg_t	*seg;
4043 	lgrp_shm_locality_t	*shm_locality;
4044 	avl_tree_t		*tree;
4045 	avl_index_t		where;
4046 
4047 	ASSERT(amp || vp);
4048 	ASSERT((len & PAGEOFFSET) == 0);
4049 
4050 	if (len == 0)
4051 		return (-1);
4052 
4053 	retval = 0;
4054 
4055 	/*
4056 	 * Get locality info and starting offset into shared object
4057 	 * Try anon map first and then vnode
4058 	 * Assume that no locks need to be held on anon_map or vnode, since
4059 	 * it should be protected by its reference count which must be nonzero
4060 	 * for an existing segment.
4061 	 */
4062 	if (amp) {
4063 		/*
4064 		 * Get policy info from anon_map
4065 		 *
4066 		 */
4067 		ASSERT(amp->refcnt != 0);
4068 		if (amp->locality == NULL)
4069 			lgrp_shm_policy_init(amp, NULL);
4070 		shm_locality = amp->locality;
4071 		off = ptob(anon_index);
4072 	} else if (vp) {
4073 		/*
4074 		 * Get policy info from vnode
4075 		 */
4076 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4077 			lgrp_shm_policy_init(NULL, vp);
4078 		shm_locality = vp->v_locality;
4079 		ASSERT(shm_locality->loc_count != 0);
4080 		off = vn_off;
4081 	} else
4082 		return (-1);
4083 
4084 	ASSERT((off & PAGEOFFSET) == 0);
4085 
4086 	/*
4087 	 * Figure out default policy
4088 	 */
4089 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4090 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4091 
4092 	/*
4093 	 * Create AVL tree if there isn't one yet
4094 	 * and set locality field to point at it
4095 	 */
4096 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4097 	tree = shm_locality->loc_tree;
4098 	if (!tree) {
4099 		rw_exit(&shm_locality->loc_lock);
4100 
4101 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4102 
4103 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4104 		if (shm_locality->loc_tree == NULL) {
4105 			avl_create(tree, lgrp_shm_policy_compar,
4106 			    sizeof (lgrp_shm_policy_seg_t),
4107 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4108 			shm_locality->loc_tree = tree;
4109 		} else {
4110 			/*
4111 			 * Another thread managed to set up the tree
4112 			 * before we could. Free the tree we allocated
4113 			 * and use the one that's already there.
4114 			 */
4115 			kmem_free(tree, sizeof (*tree));
4116 			tree = shm_locality->loc_tree;
4117 		}
4118 	}
4119 
4120 	/*
4121 	 * Set policy
4122 	 *
4123 	 * Need to maintain hold on writer's lock to keep tree from
4124 	 * changing out from under us
4125 	 */
4126 	while (len != 0) {
4127 		/*
4128 		 * Find policy segment for specified offset into shared object
4129 		 */
4130 		seg = avl_find(tree, &off, &where);
4131 
4132 		/*
4133 		 * Didn't find any existing segment that contains specified
4134 		 * offset, so allocate new segment, insert it, and concatenate
4135 		 * with adjacent segments if possible
4136 		 */
4137 		if (seg == NULL) {
4138 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4139 			    KM_SLEEP);
4140 			newseg->shm_policy.mem_policy = policy;
4141 			newseg->shm_policy.mem_reserved = 0;
4142 			newseg->shm_off = off;
4143 			avl_insert(tree, newseg, where);
4144 
4145 			/*
4146 			 * Check to see whether new segment overlaps with next
4147 			 * one, set length of new segment accordingly, and
4148 			 * calculate remaining length and next offset
4149 			 */
4150 			seg = AVL_NEXT(tree, newseg);
4151 			if (seg == NULL || off + len <= seg->shm_off) {
4152 				newseg->shm_size = len;
4153 				len = 0;
4154 			} else {
4155 				newseg->shm_size = seg->shm_off - off;
4156 				off = seg->shm_off;
4157 				len -= newseg->shm_size;
4158 			}
4159 
4160 			/*
4161 			 * Try to concatenate new segment with next and
4162 			 * previous ones, since they might have the same policy
4163 			 * now.  Grab previous and next segments first because
4164 			 * they will change on concatenation.
4165 			 */
4166 			prev =  AVL_PREV(tree, newseg);
4167 			next = AVL_NEXT(tree, newseg);
4168 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4169 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4170 
4171 			continue;
4172 		}
4173 
4174 		eoff = off + len;
4175 		oldeoff = seg->shm_off + seg->shm_size;
4176 
4177 		/*
4178 		 * Policy set already?
4179 		 */
4180 		if (policy == seg->shm_policy.mem_policy) {
4181 			/*
4182 			 * Nothing left to do if offset and length
4183 			 * fall within this segment
4184 			 */
4185 			if (eoff <= oldeoff) {
4186 				retval = 1;
4187 				break;
4188 			} else {
4189 				len = eoff - oldeoff;
4190 				off = oldeoff;
4191 				continue;
4192 			}
4193 		}
4194 
4195 		/*
4196 		 * Specified offset and length match existing segment exactly
4197 		 */
4198 		if (off == seg->shm_off && len == seg->shm_size) {
4199 			/*
4200 			 * Set policy and update current length
4201 			 */
4202 			seg->shm_policy.mem_policy = policy;
4203 			seg->shm_policy.mem_reserved = 0;
4204 			len = 0;
4205 
4206 			/*
4207 			 * Try concatenating new segment with previous and next
4208 			 * segments, since they might have the same policy now.
4209 			 * Grab previous and next segments first because they
4210 			 * will change on concatenation.
4211 			 */
4212 			prev =  AVL_PREV(tree, seg);
4213 			next = AVL_NEXT(tree, seg);
4214 			(void) lgrp_shm_policy_concat(tree, seg, next);
4215 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4216 		} else {
4217 			/*
4218 			 * Specified offset and length only apply to part of
4219 			 * existing segment
4220 			 */
4221 
4222 			/*
4223 			 * New segment starts in middle of old one, so split
4224 			 * new one off near beginning of old one
4225 			 */
4226 			newseg = NULL;
4227 			if (off > seg->shm_off) {
4228 				newseg = lgrp_shm_policy_split(tree, seg, off);
4229 
4230 				/*
4231 				 * New segment ends where old one did, so try
4232 				 * to concatenate with next segment
4233 				 */
4234 				if (eoff == oldeoff) {
4235 					newseg->shm_policy.mem_policy = policy;
4236 					newseg->shm_policy.mem_reserved = 0;
4237 					(void) lgrp_shm_policy_concat(tree,
4238 					    newseg, AVL_NEXT(tree, newseg));
4239 					break;
4240 				}
4241 			}
4242 
4243 			/*
4244 			 * New segment ends before old one, so split off end of
4245 			 * old one
4246 			 */
4247 			if (eoff < oldeoff) {
4248 				if (newseg) {
4249 					(void) lgrp_shm_policy_split(tree,
4250 					    newseg, eoff);
4251 					newseg->shm_policy.mem_policy = policy;
4252 					newseg->shm_policy.mem_reserved = 0;
4253 				} else {
4254 					(void) lgrp_shm_policy_split(tree, seg,
4255 					    eoff);
4256 					seg->shm_policy.mem_policy = policy;
4257 					seg->shm_policy.mem_reserved = 0;
4258 				}
4259 
4260 				if (off == seg->shm_off)
4261 					(void) lgrp_shm_policy_concat(tree,
4262 					    AVL_PREV(tree, seg), seg);
4263 				break;
4264 			}
4265 
4266 			/*
4267 			 * Calculate remaining length and next offset
4268 			 */
4269 			len = eoff - oldeoff;
4270 			off = oldeoff;
4271 		}
4272 	}
4273 
4274 	rw_exit(&shm_locality->loc_lock);
4275 	return (retval);
4276 }
4277 
4278 /*
4279  * Return the best memnode from which to allocate memory given
4280  * an lgroup.
4281  *
4282  * "c" is for cookie, which is good enough for me.
4283  * It references a cookie struct that should be zero'ed to initialize.
4284  * The cookie should live on the caller's stack.
4285  *
4286  * The routine returns -1 when:
4287  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4288  *	- traverse is 1, and all the memnodes in the system have been
4289  *	  returned.
4290  */
4291 int
4292 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4293 {
4294 	lgrp_t		*lp = c->lmc_lgrp;
4295 	mnodeset_t	nodes = c->lmc_nodes;
4296 	int		cnt = c->lmc_cnt;
4297 	int		offset, mnode;
4298 
4299 	extern int	max_mem_nodes;
4300 
4301 	/*
4302 	 * If the set is empty, and the caller is willing, traverse
4303 	 * up the hierarchy until we find a non-empty set.
4304 	 */
4305 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4306 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4307 		    ((lp = lp->lgrp_parent) == NULL))
4308 			return (-1);
4309 
4310 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4311 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4312 	}
4313 
4314 	/*
4315 	 * Select a memnode by picking one at a "random" offset.
4316 	 * Because of DR, memnodes can come and go at any time.
4317 	 * This code must be able to cope with the possibility
4318 	 * that the nodes count "cnt" is inconsistent with respect
4319 	 * to the number of elements actually in "nodes", and
4320 	 * therefore that the offset chosen could be greater than
4321 	 * the number of elements in the set (some memnodes may
4322 	 * have dissapeared just before cnt was read).
4323 	 * If this happens, the search simply wraps back to the
4324 	 * beginning of the set.
4325 	 */
4326 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4327 	offset = c->lmc_rand % cnt;
4328 	do {
4329 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4330 			if (nodes & ((mnodeset_t)1 << mnode))
4331 				if (!offset--)
4332 					break;
4333 	} while (mnode >= max_mem_nodes);
4334 
4335 	/* Found a node. Store state before returning. */
4336 	c->lmc_lgrp = lp;
4337 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4338 	c->lmc_cnt = cnt - 1;
4339 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4340 	c->lmc_ntried++;
4341 
4342 	return (mnode);
4343 }
4344