xref: /titanic_51/usr/src/uts/common/os/lgrp.c (revision 2dae3fb5f236a83380b9deea54417c4e1f535121)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Basic NUMA support in terms of locality groups
31  *
32  * Solaris needs to know which CPUs, memory, etc. are near each other to
33  * provide good performance on NUMA machines by optimizing for locality.
34  * In order to do this, a new abstraction called a "locality group (lgroup)"
35  * has been introduced to keep track of which CPU-like and memory-like hardware
36  * resources are close to each other.  Currently, latency is the only measure
37  * used to determine how to group hardware resources into lgroups, but this
38  * does not limit the groupings to be based solely on latency.  Other factors
39  * may be used to determine the groupings in the future.
40  *
41  * Lgroups are organized into a hierarchy or topology that represents the
42  * latency topology of the machine.  There is always at least a root lgroup in
43  * the system.  It represents all the hardware resources in the machine at a
44  * latency big enough that any hardware resource can at least access any other
45  * hardware resource within that latency.  A Uniform Memory Access (UMA)
46  * machine is represented with one lgroup (the root).  In contrast, a NUMA
47  * machine is represented at least by the root lgroup and some number of leaf
48  * lgroups where the leaf lgroups contain the hardware resources within the
49  * least latency of each other and the root lgroup still contains all the
50  * resources in the machine.  Some number of intermediate lgroups may exist
51  * which represent more levels of locality than just the local latency of the
52  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
53  * (eg. root and intermediate lgroups) contain the next nearest resources to
54  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
55  * to the root lgroup shows the hardware resources from closest to farthest
56  * from the leaf lgroup such that each successive ancestor lgroup contains
57  * the next nearest resources at the next level of locality from the previous.
58  *
59  * The kernel uses the lgroup abstraction to know how to allocate resources
60  * near a given process/thread.  At fork() and lwp/thread_create() time, a
61  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
62  * with the lowest load average.  Binding to a processor or processor set will
63  * change the home lgroup for a thread.  The scheduler has been modified to try
64  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
65  * allocation is lgroup aware too, so memory will be allocated from the current
66  * thread's home lgroup if possible.  If the desired resources are not
67  * available, the kernel traverses the lgroup hierarchy going to the parent
68  * lgroup to find resources at the next level of locality until it reaches the
69  * root lgroup.
70  */
71 
72 #include <sys/lgrp.h>
73 #include <sys/lgrp_user.h>
74 #include <sys/types.h>
75 #include <sys/mman.h>
76 #include <sys/param.h>
77 #include <sys/var.h>
78 #include <sys/thread.h>
79 #include <sys/cpuvar.h>
80 #include <sys/cpupart.h>
81 #include <sys/kmem.h>
82 #include <vm/seg.h>
83 #include <vm/seg_kmem.h>
84 #include <vm/seg_spt.h>
85 #include <vm/seg_vn.h>
86 #include <vm/as.h>
87 #include <sys/atomic.h>
88 #include <sys/systm.h>
89 #include <sys/errno.h>
90 #include <sys/cmn_err.h>
91 #include <sys/kstat.h>
92 #include <sys/sysmacros.h>
93 #include <sys/chip.h>
94 #include <sys/promif.h>
95 #include <sys/sdt.h>
96 
97 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
98 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
99 				/* indexed by lgrp_id */
100 int	nlgrps;			/* number of lgroups in machine */
101 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
102 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
103 
104 /*
105  * Kstat data for lgroups.
106  *
107  * Actual kstat data is collected in lgrp_stats array.
108  * The lgrp_kstat_data array of named kstats is used to extract data from
109  * lgrp_stats and present it to kstat framework. It is protected from parallel
110  * modifications by lgrp_kstat_mutex. This may cause some contention when
111  * several kstat commands run in parallel but this is not the
112  * performance-critical path.
113  */
114 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
115 
116 /*
117  * Declare kstat names statically for enums as defined in the header file.
118  */
119 LGRP_KSTAT_NAMES;
120 
121 static void	lgrp_kstat_init(void);
122 static int	lgrp_kstat_extract(kstat_t *, int);
123 static void	lgrp_kstat_reset(lgrp_id_t);
124 
125 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
126 static kmutex_t lgrp_kstat_mutex;
127 
128 
129 /*
130  * max number of lgroups supported by the platform
131  */
132 int	nlgrpsmax = 0;
133 
134 /*
135  * The root lgroup. Represents the set of resources at the system wide
136  * level of locality.
137  */
138 lgrp_t		*lgrp_root = NULL;
139 
140 /*
141  * During system bootstrap cp_default does not contain the list of lgrp load
142  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
143  * on-line when cp_default is initialized by cpupart_initialize_default().
144  * Configuring CPU0 may create a two-level topology with root and one leaf node
145  * containing CPU0. This topology is initially constructed in a special
146  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
147  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
148  * for all lpl operations until cp_default is fully constructed.
149  *
150  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
151  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
152  * the first element of lpl_bootstrap_list.
153  */
154 #define	LPL_BOOTSTRAP_SIZE 2
155 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
156 lpl_t		*lpl_bootstrap;
157 
158 static lgrp_t	lroot;
159 
160 
161 /*
162  * Size, in bytes, beyond which random memory allocation policy is applied
163  * to non-shared memory.  Default is the maximum size, so random memory
164  * allocation won't be used for non-shared memory by default.
165  */
166 size_t	lgrp_privm_random_thresh = (size_t)(-1);
167 
168 /*
169  * Size, in bytes, beyond which random memory allocation policy is applied to
170  * shared memory.  Default is 8MB (2 ISM pages).
171  */
172 size_t	lgrp_shm_random_thresh = 8*1024*1024;
173 
174 /*
175  * Whether to do processor set aware memory allocation by default
176  */
177 int	lgrp_mem_pset_aware = 0;
178 
179 /*
180  * Set the default memory allocation policy for root lgroup
181  */
182 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
183 
184 /*
185  * Set the default memory allocation policy.  For most platforms,
186  * next touch is sufficient, but some platforms may wish to override
187  * this.
188  */
189 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
190 
191 
192 /*
193  * lgroup CPU event handlers
194  */
195 static void	lgrp_cpu_init(struct cpu *);
196 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
197 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
198 
199 static void	lgrp_latency_change(u_longlong_t, u_longlong_t);
200 
201 /*
202  * lgroup memory event handlers
203  */
204 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
205 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
206 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
207 
208 /*
209  * lgroup CPU partition event handlers
210  */
211 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
212 static void	lgrp_part_del_cpu(struct cpu *);
213 
214 static void	lgrp_root_init(void);
215 
216 /*
217  * lpl topology
218  */
219 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
220 static void	lpl_clear(lpl_t *);
221 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
222 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
223 static void	lpl_rset_add(lpl_t *, lpl_t *);
224 static void	lpl_rset_del(lpl_t *, lpl_t *);
225 static int	lpl_rset_contains(lpl_t *, lpl_t *);
226 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
227 static void	lpl_child_update(lpl_t *, struct cpupart *);
228 static int	lpl_pick(lpl_t *, lpl_t *);
229 static void	lpl_verify_wrapper(struct cpupart *);
230 
/*
 * Return codes of the lpl topology verifier.
 *
 * LPL_TOPO_CORRECT is the only success value; lgrp_config() panics when
 * lpl_topo_verify() hands back anything else.  The failure codes are
 * negative and are parenthesized so that they expand safely inside any
 * expression.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		(-1)
#define	LPL_TOPO_CPUS_NOT_EMPTY			(-2)
#define	LPL_TOPO_LGRP_MISMATCH			(-3)
#define	LPL_TOPO_MISSING_PARENT			(-4)
#define	LPL_TOPO_PARENT_MISMATCH		(-5)
#define	LPL_TOPO_BAD_CPUCNT			(-6)
#define	LPL_TOPO_RSET_MISMATCH			(-7)
#define	LPL_TOPO_LPL_ORPHANED			(-8)
#define	LPL_TOPO_LPL_BAD_NCPU			(-9)
#define	LPL_TOPO_RSET_MSSNG_LF			(-10)
#define	LPL_TOPO_CPU_HAS_BAD_LPL		(-11)
#define	LPL_TOPO_BOGUS_HINT			(-12)
#define	LPL_TOPO_NONLEAF_HAS_CPUS		(-13)
#define	LPL_TOPO_LGRP_NOT_LEAF			(-14)
#define	LPL_TOPO_BAD_RSETCNT			(-15)
251 
252 /*
253  * Return whether lgroup optimizations should be enabled on this system
254  */
255 int
256 lgrp_optimizations(void)
257 {
258 	/*
259 	 * System must have more than 2 lgroups to enable lgroup optimizations
260 	 *
261 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
262 	 * with one child lgroup containing all the resources. A 2 lgroup
263 	 * system with a root lgroup directly containing CPUs or memory might
264 	 * need lgroup optimizations with its child lgroup, but there
265 	 * isn't such a machine for now....
266 	 */
267 	if (nlgrps > 2)
268 		return (1);
269 
270 	return (0);
271 }
272 
273 /*
274  * Build full lgroup topology
275  */
276 static void
277 lgrp_root_init(void)
278 {
279 	lgrp_handle_t	hand;
280 	int		i;
281 	lgrp_id_t	id;
282 
283 	/*
284 	 * Create the "root" lgroup
285 	 */
286 	ASSERT(nlgrps == 0);
287 	id = nlgrps++;
288 
289 	lgrp_root = &lroot;
290 
291 	lgrp_root->lgrp_cpu = NULL;
292 	lgrp_root->lgrp_mnodes = 0;
293 	lgrp_root->lgrp_nmnodes = 0;
294 	hand = lgrp_plat_root_hand();
295 	lgrp_root->lgrp_plathand = hand;
296 
297 	lgrp_root->lgrp_id = id;
298 	lgrp_root->lgrp_cpucnt = 0;
299 	lgrp_root->lgrp_childcnt = 0;
300 	klgrpset_clear(lgrp_root->lgrp_children);
301 	klgrpset_clear(lgrp_root->lgrp_leaves);
302 	lgrp_root->lgrp_parent = NULL;
303 	lgrp_root->lgrp_chips = NULL;
304 	lgrp_root->lgrp_chipcnt = 0;
305 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
306 
307 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
308 		klgrpset_clear(lgrp_root->lgrp_set[i]);
309 
310 	lgrp_root->lgrp_kstat = NULL;
311 
312 	lgrp_table[id] = lgrp_root;
313 
314 	/*
315 	 * Setup initial lpl list for CPU0 and initial t0 home.
316 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
317 	 * all topology operations untill cp_default until cp_default is
318 	 * initialized at which point t0.t_lpl will be updated.
319 	 */
320 	lpl_bootstrap = lpl_bootstrap_list;
321 	t0.t_lpl = lpl_bootstrap;
322 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
323 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
324 	cp_default.cp_lgrploads = lpl_bootstrap;
325 }
326 
327 /*
328  * Initialize the lgroup framework and allow the platform to do the same
329  */
330 void
331 lgrp_init(void)
332 {
333 	/*
334 	 * Initialize the platform
335 	 */
336 	lgrp_plat_init();
337 
338 	/*
339 	 * Set max number of lgroups supported on this platform which must be
340 	 * less than the max number of lgroups supported by the common lgroup
341 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
342 	 */
343 	nlgrpsmax = lgrp_plat_max_lgrps();
344 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
345 }
346 
347 /*
348  * Create the root and cpu0's lgroup, and set t0's home.
349  */
350 void
351 lgrp_setup(void)
352 {
353 	/*
354 	 * Setup the root lgroup
355 	 */
356 	lgrp_root_init();
357 
358 	/*
359 	 * Add cpu0 to an lgroup
360 	 */
361 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
362 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
363 }
364 
365 /*
366  * Lgroup initialization is split in two parts. The first part
367  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
368  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
369  * when all CPUs are brought online and all distance information is available.
370  *
371  * When lgrp_main_init() is complete it sets lgrp_initialized. The
372  * lgrp_main_mp_init() sets lgrp_topo_initialized.
373  */
374 
375 /*
376  * true when lgrp initialization has been completed.
377  */
378 int	lgrp_initialized = 0;
379 
380 /*
381  * True when lgrp topology is constructed.
382  */
383 int	lgrp_topo_initialized = 0;
384 
385 /*
386  * Init routine called after startup(), /etc/system has been processed,
387  * and cpu0 has been added to an lgroup.
388  */
389 void
390 lgrp_main_init(void)
391 {
392 	cpu_t		*cp = CPU;
393 	lgrp_id_t	lgrpid;
394 	int		i;
395 	/*
396 	 * Enforce a valid lgrp_mem_default_policy
397 	 */
398 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
399 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
400 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
401 
402 	/*
403 	 * See if mpo should be disabled.
404 	 * This may happen in the case of null proc LPA on Starcat.
405 	 * The platform won't be able to detect null proc LPA until after
406 	 * cpu0 and memory have already been added to lgroups.
407 	 * When and if it is detected, the Starcat platform will return
408 	 * a different platform handle for cpu0 which is what we check for
409 	 * here. If mpo should be disabled move cpu0 to it's rightful place
410 	 * (the root), and destroy the remaining lgroups. This effectively
411 	 * provides an UMA lgroup topology.
412 	 */
413 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
414 	if (lgrp_table[lgrpid]->lgrp_plathand !=
415 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
416 		lgrp_part_del_cpu(cp);
417 		lgrp_cpu_fini(cp, lgrpid);
418 
419 		lgrp_cpu_init(cp);
420 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
421 
422 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
423 
424 		for (i = 0; i <= lgrp_alloc_max; i++) {
425 			if (LGRP_EXISTS(lgrp_table[i]) &&
426 			    lgrp_table[i] != lgrp_root)
427 				lgrp_destroy(lgrp_table[i]);
428 		}
429 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
430 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
431 	}
432 
433 	/*
434 	 * Initialize kstats framework.
435 	 */
436 	lgrp_kstat_init();
437 	/*
438 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
439 	 */
440 	mutex_enter(&cpu_lock);
441 	lgrp_kstat_create(cp);
442 	mutex_exit(&cpu_lock);
443 
444 	lgrp_plat_main_init();
445 	lgrp_initialized = 1;
446 }
447 
448 /*
449  * Finish lgrp initialization after all CPUS are brought on-line.
450  * This routine is called after start_other_cpus().
451  */
452 void
453 lgrp_main_mp_init(void)
454 {
455 	klgrpset_t changed;
456 
457 	/*
458 	 * Update lgroup topology (if necessary)
459 	 */
460 	klgrpset_clear(changed);
461 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
462 	lgrp_topo_initialized = 1;
463 }
464 
465 /*
466  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
467  */
468 void
469 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
470 {
471 	klgrpset_t	changed;
472 	cpu_t		*cp;
473 	lgrp_id_t	id;
474 	int		rc;
475 
476 	switch (event) {
477 	/*
478 	 * The following (re)configuration events are common code
479 	 * initiated. lgrp_plat_config() is called here to inform the
480 	 * platform of the reconfiguration event.
481 	 */
482 	case LGRP_CONFIG_CPU_ADD:
483 		lgrp_plat_config(event, resource);
484 		atomic_add_32(&lgrp_gen, 1);
485 
486 		break;
487 	case LGRP_CONFIG_CPU_DEL:
488 		lgrp_plat_config(event, resource);
489 		atomic_add_32(&lgrp_gen, 1);
490 
491 		break;
492 	case LGRP_CONFIG_CPU_ONLINE:
493 		cp = (cpu_t *)resource;
494 		lgrp_cpu_init(cp);
495 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
496 		rc = lpl_topo_verify(cp->cpu_part);
497 		if (rc != LPL_TOPO_CORRECT) {
498 			panic("lpl_topo_verify failed: %d", rc);
499 		}
500 		lgrp_plat_config(event, resource);
501 		atomic_add_32(&lgrp_gen, 1);
502 
503 		break;
504 	case LGRP_CONFIG_CPU_OFFLINE:
505 		cp = (cpu_t *)resource;
506 		id = cp->cpu_lpl->lpl_lgrpid;
507 		lgrp_part_del_cpu(cp);
508 		lgrp_cpu_fini(cp, id);
509 		rc = lpl_topo_verify(cp->cpu_part);
510 		if (rc != LPL_TOPO_CORRECT) {
511 			panic("lpl_topo_verify failed: %d", rc);
512 		}
513 		lgrp_plat_config(event, resource);
514 		atomic_add_32(&lgrp_gen, 1);
515 
516 		break;
517 	case LGRP_CONFIG_CPUPART_ADD:
518 		cp = (cpu_t *)resource;
519 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
520 		rc = lpl_topo_verify(cp->cpu_part);
521 		if (rc != LPL_TOPO_CORRECT) {
522 			panic("lpl_topo_verify failed: %d", rc);
523 		}
524 		lgrp_plat_config(event, resource);
525 
526 		break;
527 	case LGRP_CONFIG_CPUPART_DEL:
528 		cp = (cpu_t *)resource;
529 		lgrp_part_del_cpu((cpu_t *)resource);
530 		rc = lpl_topo_verify(cp->cpu_part);
531 		if (rc != LPL_TOPO_CORRECT) {
532 			panic("lpl_topo_verify failed: %d", rc);
533 		}
534 		lgrp_plat_config(event, resource);
535 
536 		break;
537 	/*
538 	 * The following events are initiated by the memnode
539 	 * subsystem.
540 	 */
541 	case LGRP_CONFIG_MEM_ADD:
542 		lgrp_mem_init((int)resource, where, B_FALSE);
543 		atomic_add_32(&lgrp_gen, 1);
544 
545 		break;
546 	case LGRP_CONFIG_MEM_DEL:
547 		lgrp_mem_fini((int)resource, where, B_FALSE);
548 		atomic_add_32(&lgrp_gen, 1);
549 
550 		break;
551 	case LGRP_CONFIG_MEM_RENAME: {
552 		lgrp_config_mem_rename_t *ren_arg =
553 		    (lgrp_config_mem_rename_t *)where;
554 
555 		lgrp_mem_rename((int)resource,
556 		    ren_arg->lmem_rename_from,
557 		    ren_arg->lmem_rename_to);
558 		atomic_add_32(&lgrp_gen, 1);
559 
560 		break;
561 	}
562 	case LGRP_CONFIG_GEN_UPDATE:
563 		atomic_add_32(&lgrp_gen, 1);
564 
565 		break;
566 	case LGRP_CONFIG_FLATTEN:
567 		if (where == 0)
568 			lgrp_topo_levels = (int)resource;
569 		else
570 			(void) lgrp_topo_flatten(resource,
571 			    lgrp_table, lgrp_alloc_max, &changed);
572 
573 		break;
574 	/*
575 	 * Initiated by platform latency probing code
576 	 */
577 	case LGRP_CONFIG_LATENCY_CHANGE:
578 		lgrp_latency_change((u_longlong_t)resource,
579 		    (u_longlong_t)where);
580 
581 		break;
582 	case LGRP_CONFIG_NOP:
583 
584 		break;
585 	default:
586 		break;
587 	}
588 
589 }
590 
591 /*
592  * Called to add lgrp info into cpu structure from cpu_add_unit;
593  * do not assume cpu is in cpu[] yet!
594  *
595  * CPUs are brought online with all other CPUs paused so we can't
596  * allocate memory or we could deadlock the system, so we rely on
597  * the platform to statically allocate as much space as we need
598  * for the lgrp structs and stats.
599  */
600 static void
601 lgrp_cpu_init(struct cpu *cp)
602 {
603 	klgrpset_t	changed;
604 	int		count;
605 	lgrp_handle_t	hand;
606 	int		first_cpu;
607 	lgrp_t		*my_lgrp;
608 	lgrp_id_t	lgrpid;
609 	struct cpu	*cptr;
610 	struct chip	*chp;
611 
612 	/*
613 	 * This is the first time through if the resource set
614 	 * for the root lgroup is empty. After cpu0 has been
615 	 * initially added to an lgroup, the root's CPU resource
616 	 * set can never be empty, since the system's last CPU
617 	 * cannot be offlined.
618 	 */
619 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
620 		/*
621 		 * First time through.
622 		 */
623 		first_cpu = 1;
624 	} else {
625 		/*
626 		 * If cpu0 needs to move lgroups, we may come
627 		 * through here again, at which time cpu_lock won't
628 		 * be held, and lgrp_initialized will be false.
629 		 */
630 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
631 		ASSERT(cp->cpu_part != NULL);
632 		first_cpu = 0;
633 	}
634 
635 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
636 	my_lgrp = lgrp_hand_to_lgrp(hand);
637 
638 	if (my_lgrp == NULL) {
639 		/*
640 		 * Create new lgrp and add it to lgroup topology
641 		 */
642 		my_lgrp = lgrp_create();
643 		my_lgrp->lgrp_plathand = hand;
644 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
645 		lgrpid = my_lgrp->lgrp_id;
646 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
647 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
648 
649 		count = 0;
650 		klgrpset_clear(changed);
651 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
652 		    &changed);
653 		/*
654 		 * May have added new intermediate lgroups, so need to add
655 		 * resources other than CPUs which are added below
656 		 */
657 		(void) lgrp_mnode_update(changed, NULL);
658 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
659 	    > 0) {
660 		/*
661 		 * Leaf lgroup was created, but latency wasn't available
662 		 * then.  So, set latency for it and fill in rest of lgroup
663 		 * topology  now that we know how far it is from other leaf
664 		 * lgroups.
665 		 */
666 		lgrpid = my_lgrp->lgrp_id;
667 		klgrpset_clear(changed);
668 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
669 		    lgrpid))
670 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
671 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
672 		    &changed);
673 
674 		/*
675 		 * May have added new intermediate lgroups, so need to add
676 		 * resources other than CPUs which are added below
677 		 */
678 		(void) lgrp_mnode_update(changed, NULL);
679 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
680 	    my_lgrp->lgrp_id)) {
681 		int	i;
682 
683 		/*
684 		 * Update existing lgroup and lgroups containing it with CPU
685 		 * resource
686 		 */
687 		lgrpid = my_lgrp->lgrp_id;
688 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
689 		for (i = 0; i <= lgrp_alloc_max; i++) {
690 			lgrp_t		*lgrp;
691 
692 			lgrp = lgrp_table[i];
693 			if (!LGRP_EXISTS(lgrp) ||
694 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
695 				continue;
696 
697 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
698 		}
699 	}
700 
701 	lgrpid = my_lgrp->lgrp_id;
702 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
703 
704 	/*
705 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
706 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
707 	 * not since none of lgroup IDs in the lpl's have been set yet.
708 	 */
709 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
710 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
711 
712 	/*
713 	 * link the CPU into the lgrp's CPU list
714 	 */
715 	if (my_lgrp->lgrp_cpucnt == 0) {
716 		my_lgrp->lgrp_cpu = cp;
717 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
718 	} else {
719 		cptr = my_lgrp->lgrp_cpu;
720 		cp->cpu_next_lgrp = cptr;
721 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
722 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
723 		cptr->cpu_prev_lgrp = cp;
724 	}
725 	my_lgrp->lgrp_cpucnt++;
726 
727 	/*
728 	 * Add this cpu's chip to the per lgroup list
729 	 * if necessary
730 	 */
731 	if (cp->cpu_chip->chip_lgrp == NULL) {
732 		struct chip *lcpr;
733 
734 		chp = cp->cpu_chip;
735 
736 		if (my_lgrp->lgrp_chipcnt == 0) {
737 			my_lgrp->lgrp_chips = chp;
738 			chp->chip_next_lgrp =
739 			    chp->chip_prev_lgrp = chp;
740 		} else {
741 			lcpr = my_lgrp->lgrp_chips;
742 			chp->chip_next_lgrp = lcpr;
743 			chp->chip_prev_lgrp =
744 			    lcpr->chip_prev_lgrp;
745 			lcpr->chip_prev_lgrp->chip_next_lgrp =
746 			    chp;
747 			lcpr->chip_prev_lgrp = chp;
748 		}
749 		chp->chip_lgrp = my_lgrp;
750 		chp->chip_balance = chp->chip_next_lgrp;
751 		my_lgrp->lgrp_chipcnt++;
752 	}
753 }
754 
755 lgrp_t *
756 lgrp_create(void)
757 {
758 	lgrp_t		*my_lgrp;
759 	lgrp_id_t	lgrpid;
760 	int		i;
761 
762 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
763 
764 	/*
765 	 * Find an open slot in the lgroup table and recycle unused lgroup
766 	 * left there if any
767 	 */
768 	my_lgrp = NULL;
769 	if (lgrp_alloc_hint == -1)
770 		/*
771 		 * Allocate from end when hint not set yet because no lgroups
772 		 * have been deleted yet
773 		 */
774 		lgrpid = nlgrps++;
775 	else {
776 		/*
777 		 * Start looking for next open slot from hint and leave hint
778 		 * at slot allocated
779 		 */
780 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
781 			my_lgrp = lgrp_table[i];
782 			if (!LGRP_EXISTS(my_lgrp)) {
783 				lgrpid = i;
784 				nlgrps++;
785 				break;
786 			}
787 		}
788 		lgrp_alloc_hint = lgrpid;
789 	}
790 
791 	/*
792 	 * Keep track of max lgroup ID allocated so far to cut down on searches
793 	 */
794 	if (lgrpid > lgrp_alloc_max)
795 		lgrp_alloc_max = lgrpid;
796 
797 	/*
798 	 * Need to allocate new lgroup if next open slot didn't have one
799 	 * for recycling
800 	 */
801 	if (my_lgrp == NULL)
802 		my_lgrp = lgrp_plat_alloc(lgrpid);
803 
804 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
805 		panic("Too many lgrps for platform (%d)", nlgrps);
806 
807 	my_lgrp->lgrp_id = lgrpid;
808 	my_lgrp->lgrp_latency = 0;
809 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
810 	my_lgrp->lgrp_parent = NULL;
811 	my_lgrp->lgrp_childcnt = 0;
812 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
813 	my_lgrp->lgrp_nmnodes = 0;
814 	klgrpset_clear(my_lgrp->lgrp_children);
815 	klgrpset_clear(my_lgrp->lgrp_leaves);
816 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
817 		klgrpset_clear(my_lgrp->lgrp_set[i]);
818 
819 	my_lgrp->lgrp_cpu = NULL;
820 	my_lgrp->lgrp_cpucnt = 0;
821 	my_lgrp->lgrp_chips = NULL;
822 	my_lgrp->lgrp_chipcnt = 0;
823 
824 	if (my_lgrp->lgrp_kstat != NULL)
825 		lgrp_kstat_reset(lgrpid);
826 
827 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
828 
829 	return (my_lgrp);
830 }
831 
832 void
833 lgrp_destroy(lgrp_t *lgrp)
834 {
835 	int		i;
836 
837 	/*
838 	 * Unless this lgroup is being destroyed on behalf of
839 	 * the boot CPU, cpu_lock must be held
840 	 */
841 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
842 
843 	if (nlgrps == 1)
844 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
845 
846 	if (!LGRP_EXISTS(lgrp))
847 		return;
848 
849 	/*
850 	 * Set hint to lgroup being deleted and try to keep lower numbered
851 	 * hints to facilitate finding empty slots
852 	 */
853 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
854 		lgrp_alloc_hint = lgrp->lgrp_id;
855 
856 	/*
857 	 * Mark this lgroup to be recycled by setting its lgroup ID to
858 	 * LGRP_NONE and clear relevant fields
859 	 */
860 	lgrp->lgrp_id = LGRP_NONE;
861 	lgrp->lgrp_latency = 0;
862 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
863 	lgrp->lgrp_parent = NULL;
864 	lgrp->lgrp_childcnt = 0;
865 
866 	klgrpset_clear(lgrp->lgrp_children);
867 	klgrpset_clear(lgrp->lgrp_leaves);
868 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
869 		klgrpset_clear(lgrp->lgrp_set[i]);
870 
871 	lgrp->lgrp_mnodes = (mnodeset_t)0;
872 	lgrp->lgrp_nmnodes = 0;
873 
874 	lgrp->lgrp_cpu = NULL;
875 	lgrp->lgrp_cpucnt = 0;
876 	lgrp->lgrp_chipcnt = 0;
877 	lgrp->lgrp_chips = NULL;
878 
879 	nlgrps--;
880 }
881 
882 /*
883  * Initialize kstat data. Called from lgrp intialization code.
884  */
885 static void
886 lgrp_kstat_init(void)
887 {
888 	lgrp_stat_t	stat;
889 
890 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
891 
892 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
893 		kstat_named_init(&lgrp_kstat_data[stat],
894 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
895 }
896 
897 /*
898  * initialize an lgrp's kstats if needed
899  * called with cpu_lock held but not with cpus paused.
900  * we don't tear these down now because we don't know about
901  * memory leaving the lgrp yet...
902  */
903 
904 void
905 lgrp_kstat_create(cpu_t *cp)
906 {
907 	kstat_t		*lgrp_kstat;
908 	lgrp_id_t	lgrpid;
909 	lgrp_t		*my_lgrp;
910 
911 	ASSERT(MUTEX_HELD(&cpu_lock));
912 
913 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
914 	my_lgrp = lgrp_table[lgrpid];
915 
916 	if (my_lgrp->lgrp_kstat != NULL)
917 		return; /* already initialized */
918 
919 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
920 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
921 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
922 
923 	if (lgrp_kstat != NULL) {
924 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
925 		lgrp_kstat->ks_private = my_lgrp;
926 		lgrp_kstat->ks_data = &lgrp_kstat_data;
927 		lgrp_kstat->ks_update = lgrp_kstat_extract;
928 		my_lgrp->lgrp_kstat = lgrp_kstat;
929 		kstat_install(lgrp_kstat);
930 	}
931 }
932 
933 /*
934  * this will do something when we manage to remove now unused lgrps
935  */
936 
937 /* ARGSUSED */
938 void
939 lgrp_kstat_destroy(cpu_t *cp)
940 {
941 	ASSERT(MUTEX_HELD(&cpu_lock));
942 }
943 
944 /*
945  * Called when a CPU is off-lined.
946  */
947 static void
948 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
949 {
950 	lgrp_t *my_lgrp;
951 	struct cpu *prev;
952 	struct cpu *next;
953 	chip_t  *chp;
954 
955 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
956 
957 	prev = cp->cpu_prev_lgrp;
958 	next = cp->cpu_next_lgrp;
959 
960 	prev->cpu_next_lgrp = next;
961 	next->cpu_prev_lgrp = prev;
962 
963 	/*
964 	 * just because I'm paranoid doesn't mean...
965 	 */
966 
967 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
968 
969 	my_lgrp = lgrp_table[lgrpid];
970 	my_lgrp->lgrp_cpucnt--;
971 
972 	/*
973 	 * If the last CPU on it's chip is being offlined
974 	 * then remove this chip from the per lgroup list.
975 	 *
976 	 * This is also done for the boot CPU when it needs
977 	 * to move between lgroups as a consequence of
978 	 * null proc lpa.
979 	 */
980 	chp = cp->cpu_chip;
981 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
982 
983 		chip_t	*chpp;
984 
985 		if (--my_lgrp->lgrp_chipcnt == 0)
986 			my_lgrp->lgrp_chips = NULL;
987 		else if (my_lgrp->lgrp_chips == chp)
988 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
989 
990 		/*
991 		 * Walk this lgroup's chip list looking for chips that
992 		 * may try to balance against the one that's leaving
993 		 */
994 		for (chpp = chp->chip_next_lgrp; chpp != chp;
995 		    chpp = chpp->chip_next_lgrp) {
996 			if (chpp->chip_balance == chp)
997 				chpp->chip_balance = chp->chip_next_lgrp;
998 		}
999 
1000 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1001 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1002 
1003 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1004 		chp->chip_lgrp = NULL;
1005 		chp->chip_balance = NULL;
1006 	}
1007 
1008 	/*
1009 	 * Removing last CPU in lgroup, so update lgroup topology
1010 	 */
1011 	if (my_lgrp->lgrp_cpucnt == 0) {
1012 		klgrpset_t	changed;
1013 		int		count;
1014 		int		i;
1015 
1016 		my_lgrp->lgrp_cpu = NULL;
1017 
1018 		/*
1019 		 * Remove this lgroup from its lgroup CPU resources and remove
1020 		 * lgroup from lgroup topology if it doesn't have any more
1021 		 * resources in it now
1022 		 */
1023 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1024 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1025 			count = 0;
1026 			klgrpset_clear(changed);
1027 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1028 			    lgrp_alloc_max + 1, &changed);
1029 			return;
1030 		}
1031 
1032 		/*
1033 		 * This lgroup isn't empty, so just remove it from CPU
1034 		 * resources of any lgroups that contain it as such
1035 		 */
1036 		for (i = 0; i <= lgrp_alloc_max; i++) {
1037 			lgrp_t		*lgrp;
1038 
1039 			lgrp = lgrp_table[i];
1040 			if (!LGRP_EXISTS(lgrp) ||
1041 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1042 			    lgrpid))
1043 				continue;
1044 
1045 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1046 		}
1047 		return;
1048 	}
1049 
1050 	if (my_lgrp->lgrp_cpu == cp)
1051 		my_lgrp->lgrp_cpu = next;
1052 
1053 }
1054 
1055 /*
1056  * Update memory nodes in target lgroups and return ones that get changed
1057  */
1058 int
1059 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1060 {
1061 	int	count;
1062 	int	i;
1063 	int	j;
1064 	lgrp_t	*lgrp;
1065 	lgrp_t	*lgrp_rsrc;
1066 
1067 	count = 0;
1068 	if (changed)
1069 		klgrpset_clear(*changed);
1070 
1071 	if (klgrpset_isempty(target))
1072 		return (0);
1073 
1074 	/*
1075 	 * Find each lgroup in target lgroups
1076 	 */
1077 	for (i = 0; i <= lgrp_alloc_max; i++) {
1078 		/*
1079 		 * Skip any lgroups that don't exist or aren't in target group
1080 		 */
1081 		lgrp = lgrp_table[i];
1082 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1083 			continue;
1084 		}
1085 
1086 		/*
1087 		 * Initialize memnodes for intermediate lgroups to 0
1088 		 * and update them from scratch since they may have completely
1089 		 * changed
1090 		 */
1091 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1092 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1093 			lgrp->lgrp_nmnodes = 0;
1094 		}
1095 
1096 		/*
1097 		 * Update memory nodes of of target lgroup with memory nodes
1098 		 * from each lgroup in its lgroup memory resource set
1099 		 */
1100 		for (j = 0; j <= lgrp_alloc_max; j++) {
1101 			int	k;
1102 
1103 			/*
1104 			 * Skip any lgroups that don't exist or aren't in
1105 			 * memory resources of target lgroup
1106 			 */
1107 			lgrp_rsrc = lgrp_table[j];
1108 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1109 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1110 			    j))
1111 				continue;
1112 
1113 			/*
1114 			 * Update target lgroup's memnodes to include memnodes
1115 			 * of this lgroup
1116 			 */
1117 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1118 				mnodeset_t	mnode_mask;
1119 
1120 				mnode_mask = (mnodeset_t)1 << k;
1121 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1122 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1123 					lgrp->lgrp_mnodes |= mnode_mask;
1124 					lgrp->lgrp_nmnodes++;
1125 				}
1126 			}
1127 			count++;
1128 			if (changed)
1129 				klgrpset_add(*changed, lgrp->lgrp_id);
1130 		}
1131 	}
1132 
1133 	return (count);
1134 }
1135 
1136 /*
1137  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1138  * is moved from one board to another. The "from" and "to" arguments specify the
1139  * source and the destination of the move.
1140  *
1141  * See plat_lgrp_config() for a detailed description of the copy-rename
1142  * semantics.
1143  *
1144  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1145  * the lgroup topology which is changing as memory moves from one lgroup to
1146  * another. It removes the mnode from the source lgroup and re-inserts it in the
1147  * target lgroup.
1148  *
1149  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1150  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1151  * copy-rename operation.
1152  *
1153  * There is one case which requires special handling. If the system contains
1154  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1155  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1156  * lgrp_mem_init), but there is a window when the system has no memory in the
1157  * lgroup hierarchy. If another thread tries to allocate memory during this
1158  * window, the allocation will fail, although the system has physical memory.
1159  * This may cause a system panic or a deadlock (some sleeping memory allocations
1160  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1161  * the mnode back).
1162  *
1163  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1164  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1165  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1166  * but it updates the rest of the lgroup topology as if the mnode was actually
1167  * removed. The lgrp_mem_init() function recognizes that the mnode being
1168  * inserted represents such a special case and updates the topology
1169  * appropriately.
1170  */
1171 void
1172 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1173 {
1174 	/*
1175 	 * Remove the memory from the source node and add it to the destination
1176 	 * node.
1177 	 */
1178 	lgrp_mem_fini(mnode, from, B_TRUE);
1179 	lgrp_mem_init(mnode, to, B_TRUE);
1180 }
1181 
1182 /*
1183  * Called to indicate that the lgrp with platform handle "hand" now
1184  * contains the memory identified by "mnode".
1185  *
1186  * LOCKING for this routine is a bit tricky. Usually it is called without
1187  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1188  * callers. During DR of the board containing the caged memory it may be called
1189  * with cpu_lock already held and CPUs paused.
1190  *
1191  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1192  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1193  * dealing with the special case of DR copy-rename described in
1194  * lgrp_mem_rename().
1195  */
1196 void
1197 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1198 {
1199 	klgrpset_t	changed;
1200 	int		count;
1201 	int		i;
1202 	lgrp_t		*my_lgrp;
1203 	lgrp_id_t	lgrpid;
1204 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1205 	boolean_t	drop_lock = B_FALSE;
1206 	boolean_t	need_synch = B_FALSE;
1207 
1208 	/*
1209 	 * Grab CPU lock (if we haven't already)
1210 	 */
1211 	if (!MUTEX_HELD(&cpu_lock)) {
1212 		mutex_enter(&cpu_lock);
1213 		drop_lock = B_TRUE;
1214 	}
1215 
1216 	/*
1217 	 * This routine may be called from a context where we already
1218 	 * hold cpu_lock, and have already paused cpus.
1219 	 */
1220 	if (!cpus_paused())
1221 		need_synch = B_TRUE;
1222 
1223 	/*
1224 	 * Check if this mnode is already configured and return immediately if
1225 	 * it is.
1226 	 *
1227 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1228 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1229 	 * recognize this case and continue as usual, but skip the update to
1230 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1231 	 * in topology, temporarily introduced by lgrp_mem_fini().
1232 	 */
1233 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1234 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1235 		if (drop_lock)
1236 			mutex_exit(&cpu_lock);
1237 		return;
1238 	}
1239 
1240 	/*
1241 	 * Update lgroup topology with new memory resources, keeping track of
1242 	 * which lgroups change
1243 	 */
1244 	count = 0;
1245 	klgrpset_clear(changed);
1246 	my_lgrp = lgrp_hand_to_lgrp(hand);
1247 	if (my_lgrp == NULL) {
1248 		/* new lgrp */
1249 		my_lgrp = lgrp_create();
1250 		lgrpid = my_lgrp->lgrp_id;
1251 		my_lgrp->lgrp_plathand = hand;
1252 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1253 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1254 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1255 
1256 		if (need_synch)
1257 			pause_cpus(NULL);
1258 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1259 		    &changed);
1260 		if (need_synch)
1261 			start_cpus();
1262 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1263 	    > 0) {
1264 		/*
1265 		 * Leaf lgroup was created, but latency wasn't available
1266 		 * then.  So, set latency for it and fill in rest of lgroup
1267 		 * topology  now that we know how far it is from other leaf
1268 		 * lgroups.
1269 		 */
1270 		klgrpset_clear(changed);
1271 		lgrpid = my_lgrp->lgrp_id;
1272 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1273 		    lgrpid))
1274 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1275 		if (need_synch)
1276 			pause_cpus(NULL);
1277 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1278 		    &changed);
1279 		if (need_synch)
1280 			start_cpus();
1281 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1282 	    my_lgrp->lgrp_id)) {
1283 		/*
1284 		 * Add new lgroup memory resource to existing lgroup
1285 		 */
1286 		lgrpid = my_lgrp->lgrp_id;
1287 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1288 		klgrpset_add(changed, lgrpid);
1289 		count++;
1290 		for (i = 0; i <= lgrp_alloc_max; i++) {
1291 			lgrp_t		*lgrp;
1292 
1293 			lgrp = lgrp_table[i];
1294 			if (!LGRP_EXISTS(lgrp) ||
1295 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1296 				continue;
1297 
1298 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1299 			klgrpset_add(changed, lgrp->lgrp_id);
1300 			count++;
1301 		}
1302 	}
1303 
1304 	/*
1305 	 * Add memory node to lgroup and remove lgroup from ones that need
1306 	 * to be updated
1307 	 */
1308 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1309 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1310 		my_lgrp->lgrp_nmnodes++;
1311 	}
1312 	klgrpset_del(changed, lgrpid);
1313 
1314 	/*
1315 	 * Update memory node information for all lgroups that changed and
1316 	 * contain new memory node as a resource
1317 	 */
1318 	if (count)
1319 		(void) lgrp_mnode_update(changed, NULL);
1320 
1321 	if (drop_lock)
1322 		mutex_exit(&cpu_lock);
1323 }
1324 
1325 /*
1326  * Called to indicate that the lgroup associated with the platform
1327  * handle "hand" no longer contains given memory node
1328  *
1329  * LOCKING for this routine is a bit tricky. Usually it is called without
1330  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1331  * callers. During DR of the board containing the caged memory it may be called
1332  * with cpu_lock already held and CPUs paused.
1333  *
1334  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1335  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1336  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1337  * the same mnode back into the topology. See lgrp_mem_rename() and
1338  * lgrp_mem_init() for additional details.
1339  */
1340 void
1341 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1342 {
1343 	klgrpset_t	changed;
1344 	int		count;
1345 	int		i;
1346 	lgrp_t		*my_lgrp;
1347 	lgrp_id_t	lgrpid;
1348 	mnodeset_t	mnodes_mask;
1349 	boolean_t	drop_lock = B_FALSE;
1350 	boolean_t	need_synch = B_FALSE;
1351 
1352 	/*
1353 	 * Grab CPU lock (if we haven't already)
1354 	 */
1355 	if (!MUTEX_HELD(&cpu_lock)) {
1356 		mutex_enter(&cpu_lock);
1357 		drop_lock = B_TRUE;
1358 	}
1359 
1360 	/*
1361 	 * This routine may be called from a context where we already
1362 	 * hold cpu_lock and have already paused cpus.
1363 	 */
1364 	if (!cpus_paused())
1365 		need_synch = B_TRUE;
1366 
1367 	my_lgrp = lgrp_hand_to_lgrp(hand);
1368 
1369 	/*
1370 	 * The lgrp *must* be pre-existing
1371 	 */
1372 	ASSERT(my_lgrp != NULL);
1373 
1374 	/*
1375 	 * Delete memory node from lgroups which contain it
1376 	 */
1377 	mnodes_mask = ((mnodeset_t)1 << mnode);
1378 	for (i = 0; i <= lgrp_alloc_max; i++) {
1379 		lgrp_t *lgrp = lgrp_table[i];
1380 		/*
1381 		 * Skip any non-existent lgroups and any lgroups that don't
1382 		 * contain leaf lgroup of memory as a memory resource
1383 		 */
1384 		if (!LGRP_EXISTS(lgrp) ||
1385 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1386 			continue;
1387 
1388 		/*
1389 		 * Avoid removing the last mnode from the root in the DR
1390 		 * copy-rename case. See lgrp_mem_rename() for details.
1391 		 */
1392 		if (is_copy_rename &&
1393 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1394 			continue;
1395 
1396 		/*
1397 		 * Remove memory node from lgroup.
1398 		 */
1399 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1400 		lgrp->lgrp_nmnodes--;
1401 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1402 	}
1403 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1404 
1405 	/*
1406 	 * Don't need to update lgroup topology if this lgroup still has memory.
1407 	 *
1408 	 * In the special case of DR copy-rename with the only mnode being
1409 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1410 	 * still need to update the lgroup topology.
1411 	 */
1412 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1413 	    !(is_copy_rename &&
1414 		(my_lgrp == lgrp_root) &&
1415 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1416 		if (drop_lock)
1417 			mutex_exit(&cpu_lock);
1418 		return;
1419 	}
1420 
1421 	/*
1422 	 * This lgroup does not contain any memory now
1423 	 */
1424 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1425 
1426 	/*
1427 	 * Remove this lgroup from lgroup topology if it does not contain any
1428 	 * resources now
1429 	 */
1430 	lgrpid = my_lgrp->lgrp_id;
1431 	count = 0;
1432 	klgrpset_clear(changed);
1433 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1434 		/*
1435 		 * Delete lgroup when no more resources
1436 		 */
1437 		if (need_synch)
1438 			pause_cpus(NULL);
1439 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1440 		    lgrp_alloc_max + 1, &changed);
1441 		ASSERT(count > 0);
1442 		if (need_synch)
1443 			start_cpus();
1444 	} else {
1445 		/*
1446 		 * Remove lgroup from memory resources of any lgroups that
1447 		 * contain it as such
1448 		 */
1449 		for (i = 0; i <= lgrp_alloc_max; i++) {
1450 			lgrp_t		*lgrp;
1451 
1452 			lgrp = lgrp_table[i];
1453 			if (!LGRP_EXISTS(lgrp) ||
1454 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1455 			    lgrpid))
1456 				continue;
1457 
1458 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1459 		}
1460 	}
1461 	if (drop_lock)
1462 		mutex_exit(&cpu_lock);
1463 }
1464 
1465 /*
1466  * Return lgroup with given platform handle
1467  */
1468 lgrp_t *
1469 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1470 {
1471 	int	i;
1472 	lgrp_t	*lgrp;
1473 
1474 	if (hand == LGRP_NULL_HANDLE)
1475 		return (NULL);
1476 
1477 	for (i = 0; i <= lgrp_alloc_max; i++) {
1478 		lgrp = lgrp_table[i];
1479 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1480 			return (lgrp);
1481 	}
1482 	return (NULL);
1483 }
1484 
1485 /*
1486  * Return the home lgroup of the current thread.
1487  * We must do this with kernel preemption disabled, since we don't want our
1488  * thread to be re-homed while we're poking around with its lpl, and the lpl
1489  * should never be NULL.
1490  *
1491  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1492  * is enabled because of DR.  Callers can use disable kernel preemption
1493  * around this call to guarantee that the lgroup will be valid beyond this
1494  * routine, since kernel preemption can be recursive.
1495  */
1496 lgrp_t *
1497 lgrp_home_lgrp(void)
1498 {
1499 	lgrp_t	*lgrp;
1500 	lpl_t	*lpl;
1501 
1502 	kpreempt_disable();
1503 
1504 	lpl = curthread->t_lpl;
1505 	ASSERT(lpl != NULL);
1506 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1507 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1508 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1509 
1510 	kpreempt_enable();
1511 
1512 	return (lgrp);
1513 }
1514 
1515 /*
1516  * Return ID of home lgroup for given thread
1517  * (See comments for lgrp_home_lgrp() for special care and handling
1518  * instructions)
1519  */
1520 lgrp_id_t
1521 lgrp_home_id(kthread_t *t)
1522 {
1523 	lgrp_id_t	lgrp;
1524 	lpl_t		*lpl;
1525 
1526 	ASSERT(t != NULL);
1527 	/*
1528 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1529 	 * cannot since the HAT layer can call into this routine to
1530 	 * determine the locality for its data structures in the context
1531 	 * of a page fault.
1532 	 */
1533 
1534 	kpreempt_disable();
1535 
1536 	lpl = t->t_lpl;
1537 	ASSERT(lpl != NULL);
1538 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1539 	lgrp = lpl->lpl_lgrpid;
1540 
1541 	kpreempt_enable();
1542 
1543 	return (lgrp);
1544 }
1545 
1546 /*
1547  * Return lgroup containing the physical memory for the given page frame number
1548  */
1549 lgrp_t *
1550 lgrp_pfn_to_lgrp(pfn_t pfn)
1551 {
1552 	lgrp_handle_t	hand;
1553 	int		i;
1554 	lgrp_t		*lgrp;
1555 
1556 	hand = lgrp_plat_pfn_to_hand(pfn);
1557 	if (hand != LGRP_NULL_HANDLE)
1558 		for (i = 0; i <= lgrp_alloc_max; i++) {
1559 			lgrp = lgrp_table[i];
1560 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1561 				return (lgrp);
1562 		}
1563 	return (NULL);
1564 }
1565 
1566 /*
1567  * Return lgroup containing the physical memory for the given page frame number
1568  */
1569 lgrp_t *
1570 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1571 {
1572 	lgrp_handle_t	hand;
1573 	int		i;
1574 	lgrp_t		*lgrp;
1575 	pfn_t		pfn;
1576 
1577 	pfn = btop(physaddr);
1578 	hand = lgrp_plat_pfn_to_hand(pfn);
1579 	if (hand != LGRP_NULL_HANDLE)
1580 		for (i = 0; i <= lgrp_alloc_max; i++) {
1581 			lgrp = lgrp_table[i];
1582 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1583 				return (lgrp);
1584 		}
1585 	return (NULL);
1586 }
1587 
1588 /*
1589  * Return the leaf lgroup containing the given CPU
1590  */
1591 static lgrp_t *
1592 lgrp_cpu_to_lgrp(cpu_t *cpu)
1593 {
1594 	return (cpu->cpu_chip->chip_lgrp);
1595 }
1596 
1597 /*
1598  * Return the sum of the partition loads in an lgrp divided by
1599  * the number of CPUs in the lgrp.  This is our best approximation
1600  * of an 'lgroup load average' for a useful per-lgroup kstat.
1601  */
1602 static uint64_t
1603 lgrp_sum_loadavgs(lgrp_t *lgrp)
1604 {
1605 	cpu_t *cpu;
1606 	int ncpu;
1607 	uint64_t loads = 0;
1608 
1609 	mutex_enter(&cpu_lock);
1610 
1611 	cpu = lgrp->lgrp_cpu;
1612 	ncpu = lgrp->lgrp_cpucnt;
1613 
1614 	if (cpu == NULL || ncpu == 0) {
1615 		mutex_exit(&cpu_lock);
1616 		return (0ull);
1617 	}
1618 
1619 	do {
1620 		loads += cpu->cpu_lpl->lpl_loadavg;
1621 		cpu = cpu->cpu_next_lgrp;
1622 	} while (cpu != lgrp->lgrp_cpu);
1623 
1624 	mutex_exit(&cpu_lock);
1625 
1626 	return (loads / ncpu);
1627 }
1628 
1629 void
1630 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1631 {
1632 	struct lgrp_stats *pstats;
1633 
1634 	/*
1635 	 * Verify that the caller isn't trying to add to
1636 	 * a statistic for an lgroup that has gone away
1637 	 */
1638 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1639 		return;
1640 
1641 	pstats = &lgrp_stats[lgrpid];
1642 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1643 }
1644 
1645 int64_t
1646 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1647 {
1648 	uint64_t val;
1649 	struct lgrp_stats *pstats;
1650 
1651 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1652 		return ((int64_t)0);
1653 
1654 	pstats = &lgrp_stats[lgrpid];
1655 	LGRP_STAT_READ(pstats, stat, val);
1656 	return (val);
1657 }
1658 
1659 /*
1660  * Reset all kstats for lgrp specified by its lgrpid.
1661  */
1662 static void
1663 lgrp_kstat_reset(lgrp_id_t lgrpid)
1664 {
1665 	lgrp_stat_t stat;
1666 
1667 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1668 		return;
1669 
1670 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1671 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1672 	}
1673 }
1674 
1675 /*
1676  * Collect all per-lgrp statistics for the lgrp associated with this
1677  * kstat, and store them in the ks_data array.
1678  *
1679  * The superuser can reset all the running counter statistics for an
1680  * lgrp by writing to any of the lgrp's stats.
1681  */
1682 static int
1683 lgrp_kstat_extract(kstat_t *ksp, int rw)
1684 {
1685 	lgrp_stat_t		stat;
1686 	struct kstat_named	*ksd;
1687 	lgrp_t			*lgrp;
1688 	lgrp_id_t		lgrpid;
1689 
1690 	lgrp = (lgrp_t *)ksp->ks_private;
1691 
1692 	ksd = (struct kstat_named *)ksp->ks_data;
1693 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1694 
1695 	lgrpid = lgrp->lgrp_id;
1696 
1697 	if (lgrpid == LGRP_NONE) {
1698 		/*
1699 		 * Return all zeroes as stats for freed lgrp.
1700 		 */
1701 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1702 			ksd[stat].value.i64 = 0;
1703 		}
1704 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1705 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1706 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1707 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1708 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1709 	} else if (rw != KSTAT_WRITE) {
1710 		/*
1711 		 * Handle counter stats
1712 		 */
1713 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1714 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1715 		}
1716 
1717 		/*
1718 		 * Handle kernel data snapshot stats
1719 		 */
1720 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1721 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1722 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1723 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1724 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1725 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1726 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1727 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1728 	} else {
1729 		lgrp_kstat_reset(lgrpid);
1730 	}
1731 
1732 	return (0);
1733 }
1734 
1735 int
1736 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1737 {
1738 	cpu_t	*cp;
1739 
1740 	mutex_enter(&cpu_lock);
1741 
1742 	if ((cp = cpu_get(id)) == NULL) {
1743 		mutex_exit(&cpu_lock);
1744 		return (EINVAL);
1745 	}
1746 
1747 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1748 		mutex_exit(&cpu_lock);
1749 		return (EINVAL);
1750 	}
1751 
1752 	ASSERT(cp->cpu_lpl != NULL);
1753 
1754 	*lp = cp->cpu_lpl->lpl_lgrpid;
1755 
1756 	mutex_exit(&cpu_lock);
1757 
1758 	return (0);
1759 }
1760 
1761 int
1762 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1763 {
1764 	cpu_t *cp;
1765 
1766 	mutex_enter(&cpu_lock);
1767 
1768 	if ((cp = cpu_get(id)) == NULL) {
1769 		mutex_exit(&cpu_lock);
1770 		return (EINVAL);
1771 	}
1772 
1773 	ASSERT(cp->cpu_lpl != NULL);
1774 
1775 	*lp = cp->cpu_lpl->lpl_loadavg;
1776 
1777 	mutex_exit(&cpu_lock);
1778 
1779 	return (0);
1780 }
1781 
1782 void
1783 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1784 {
1785 	lgrp_t		*lgrp;
1786 	int		i;
1787 
1788 	for (i = 0; i <= lgrp_alloc_max; i++) {
1789 		lgrp = lgrp_table[i];
1790 
1791 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1792 			lgrp->lgrp_latency = (int)newtime;
1793 	}
1794 }
1795 
1796 /*
1797  * Add a resource named by lpl_leaf to rset of lpl_target
1798  *
1799  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1800  * resource. It is adjusted here, as this is presently the only place that we
1801  * can be certain a resource addition has succeeded.
1802  *
1803  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1804  * list in order until it reaches a NULL.  (This list is required to be NULL
1805  * terminated, too).  This is done so that we can mark start pos + 1, so that
1806  * each lpl is traversed sequentially, but in a different order.  We hope this
1807  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1808  */
1809 
1810 void
1811 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1812 {
1813 	int		i;
1814 	int		entry_slot = 0;
1815 
1816 	/* return if leaf is already present */
1817 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1818 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1819 			return;
1820 		}
1821 
1822 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1823 		    lpl_leaf->lpl_lgrpid) {
1824 			break;
1825 		}
1826 	}
1827 
1828 	/* insert leaf, update counts */
1829 	entry_slot = i;
1830 	i = lpl_target->lpl_nrset++;
1831 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1832 		panic("More leaf lgrps in system than are supported!\n");
1833 	}
1834 
1835 	/*
1836 	 * Start at the end of the rset array and work backwards towards the
1837 	 * slot into which the new lpl will be inserted. This effectively
1838 	 * preserves the current ordering by scooting everybody over one entry,
1839 	 * and placing the new entry into the space created.
1840 	 */
1841 
1842 	while (i-- > entry_slot) {
1843 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1844 	}
1845 
1846 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1847 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1848 }
1849 
1850 /*
1851  * Update each of lpl_parent's children with a proper hint and
1852  * a reference to their parent.
1853  * The lgrp topology is used as the reference since it is fully
1854  * consistent and correct at this point.
1855  *
1856  * Each child's hint will reference an element in lpl_parent's
1857  * rset that designates where the child should start searching
1858  * for CPU resources. The hint selected is the highest order leaf present
1859  * in the child's lineage.
1860  *
1861  * This should be called after any potential change in lpl_parent's
1862  * rset.
1863  */
1864 static void
1865 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1866 {
1867 	klgrpset_t	children, leaves;
1868 	lpl_t		*lpl;
1869 	int		hint;
1870 	int		i, j;
1871 
1872 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1873 	if (klgrpset_isempty(children))
1874 		return; /* nothing to do */
1875 
1876 	for (i = 0; i <= lgrp_alloc_max; i++) {
1877 		if (klgrpset_ismember(children, i)) {
1878 
1879 			/*
1880 			 * Given the set of leaves in this child's lineage,
1881 			 * find the highest order leaf present in the parent's
1882 			 * rset. Select this as the hint for the child.
1883 			 */
1884 			leaves = lgrp_table[i]->lgrp_leaves;
1885 			hint = 0;
1886 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1887 				lpl = lpl_parent->lpl_rset[j];
1888 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1889 					hint = j;
1890 			}
1891 			cp->cp_lgrploads[i].lpl_hint = hint;
1892 
1893 			/*
1894 			 * (Re)set the parent. It may be incorrect if
1895 			 * lpl_parent is new in the topology.
1896 			 */
1897 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1898 		}
1899 	}
1900 }
1901 
1902 /*
1903  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1904  *
1905  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1906  * resource. The values are adjusted here, as this is the only place that we can
1907  * be certain a resource was successfully deleted.
1908  */
1909 void
1910 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1911 {
1912 	int i;
1913 
1914 	/* find leaf in intermediate node */
1915 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1916 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1917 			break;
1918 	}
1919 
1920 	/* return if leaf not found */
1921 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1922 		return;
1923 
1924 	/* prune leaf, compress array */
1925 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1926 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1927 	lpl_target->lpl_ncpu--;
1928 	do {
1929 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1930 	} while (i++ < lpl_target->lpl_nrset);
1931 }
1932 
1933 /*
1934  * Check to see if the resource set of the target lpl contains the
1935  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1936  */
1937 
1938 int
1939 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1940 {
1941 	int i;
1942 
1943 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1944 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1945 			return (1);
1946 	}
1947 
1948 	return (0);
1949 }
1950 
1951 /*
1952  * Called when we change cpu lpl membership.  This increments or decrements the
1953  * per-cpu counter in every lpl in which our leaf appears.
1954  */
1955 void
1956 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1957 {
1958 	cpupart_t	*cpupart;
1959 	lgrp_t		*lgrp_leaf;
1960 	lgrp_t		*lgrp_cur;
1961 	lpl_t		*lpl_leaf;
1962 	lpl_t		*lpl_cur;
1963 	int		i;
1964 
1965 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1966 
1967 	cpupart = cp->cpu_part;
1968 	lpl_leaf = cp->cpu_lpl;
1969 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1970 
1971 	for (i = 0; i <= lgrp_alloc_max; i++) {
1972 		lgrp_cur = lgrp_table[i];
1973 
1974 		/*
1975 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1976 		 * for the cpu in question, or if the current lgrp and leaf
1977 		 * don't share the same resources.
1978 		 */
1979 
1980 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1981 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1982 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1983 			continue;
1984 
1985 
1986 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1987 
1988 		if (lpl_cur->lpl_nrset > 0) {
1989 			if (act == LPL_INCREMENT) {
1990 				lpl_cur->lpl_ncpu++;
1991 			} else if (act == LPL_DECREMENT) {
1992 				lpl_cur->lpl_ncpu--;
1993 			}
1994 		}
1995 	}
1996 }
1997 
1998 /*
1999  * Initialize lpl with given resources and specified lgrp
2000  */
2001 
2002 void
2003 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2004 {
2005 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2006 	lpl->lpl_loadavg = 0;
2007 	if (lpl == lpl_leaf)
2008 		lpl->lpl_ncpu = 1;
2009 	else
2010 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2011 	lpl->lpl_nrset = 1;
2012 	lpl->lpl_rset[0] = lpl_leaf;
2013 	lpl->lpl_lgrp = lgrp;
2014 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2015 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2016 }
2017 
2018 /*
2019  * Clear an unused lpl
2020  */
2021 
2022 void
2023 lpl_clear(lpl_t *lpl)
2024 {
2025 	lgrpid_t	lid;
2026 
2027 	/* save lid for debugging purposes */
2028 	lid = lpl->lpl_lgrpid;
2029 	bzero(lpl, sizeof (lpl_t));
2030 	lpl->lpl_lgrpid = lid;
2031 }
2032 
2033 /*
2034  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2035  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2036  * make full use of all of the lgroup topology, but this checks to make sure
2037  * that for the parts that it does use, it has correctly understood the
2038  * relationships that exist. This function returns
2039  * 0 if the topology is correct, and a non-zero error code, for non-debug
2040  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2041  * debugging on a DEBUG kernel.
2042  */
2043 int
2044 lpl_topo_verify(cpupart_t *cpupart)
2045 {
2046 	lgrp_t		*lgrp;
2047 	lpl_t		*lpl;
2048 	klgrpset_t	rset;
2049 	klgrpset_t	cset;
2050 	cpu_t		*cpu;
2051 	cpu_t		*cp_start;
2052 	int		i;
2053 	int		j;
2054 	int		sum;
2055 
2056 	/* topology can't be incorrect if it doesn't exist */
2057 	if (!lgrp_topo_initialized || !lgrp_initialized)
2058 		return (LPL_TOPO_CORRECT);
2059 
2060 	ASSERT(cpupart != NULL);
2061 
2062 	for (i = 0; i <= lgrp_alloc_max; i++) {
2063 		lgrp = lgrp_table[i];
2064 		lpl = NULL;
2065 		/* make sure lpls are allocated */
2066 		ASSERT(cpupart->cp_lgrploads);
2067 		if (!cpupart->cp_lgrploads)
2068 			return (LPL_TOPO_PART_HAS_NO_LPL);
2069 
2070 		lpl = &cpupart->cp_lgrploads[i];
2071 		/* make sure our index is good */
2072 		ASSERT(i < cpupart->cp_nlgrploads);
2073 
2074 		/* if lgroup doesn't exist, make sure lpl is empty */
2075 		if (!LGRP_EXISTS(lgrp)) {
2076 			ASSERT(lpl->lpl_ncpu == 0);
2077 			if (lpl->lpl_ncpu > 0) {
2078 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2079 			} else {
2080 				continue;
2081 			}
2082 		}
2083 
2084 		/* verify that lgroup and lpl are identically numbered */
2085 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2086 
2087 		/* if lgroup isn't in our partition, make sure lpl is empty */
2088 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2089 		    cpupart->cp_lgrpset)) {
2090 			ASSERT(lpl->lpl_ncpu == 0);
2091 			if (lpl->lpl_ncpu > 0) {
2092 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2093 			}
2094 			/*
2095 			 * lpl is empty, and lgroup isn't in partition.  verify
2096 			 * that lpl doesn't show up in anyone else's rsets (in
2097 			 * this partition, anyway)
2098 			 */
2099 
2100 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2101 				lpl_t *i_lpl; /* lpl we're iterating over */
2102 
2103 				i_lpl = &cpupart->cp_lgrploads[j];
2104 
2105 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2106 				if (lpl_rset_contains(i_lpl, lpl)) {
2107 					return (LPL_TOPO_LPL_ORPHANED);
2108 				}
2109 			}
2110 			/* lgroup is empty, and everything is ok. continue */
2111 			continue;
2112 		}
2113 
2114 
2115 		/* lgroup is in this partition, now check it against lpl */
2116 
2117 		/* do both have matching lgrps? */
2118 		ASSERT(lgrp == lpl->lpl_lgrp);
2119 		if (lgrp != lpl->lpl_lgrp) {
2120 			return (LPL_TOPO_LGRP_MISMATCH);
2121 		}
2122 
2123 		/* do the parent lgroups exist and do they match? */
2124 		if (lgrp->lgrp_parent) {
2125 			ASSERT(lpl->lpl_parent);
2126 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2127 				    lpl->lpl_parent->lpl_lgrpid);
2128 
2129 			if (!lpl->lpl_parent) {
2130 				return (LPL_TOPO_MISSING_PARENT);
2131 			} else if (lgrp->lgrp_parent->lgrp_id !=
2132 			    lpl->lpl_parent->lpl_lgrpid) {
2133 				return (LPL_TOPO_PARENT_MISMATCH);
2134 			}
2135 		}
2136 
2137 		/* only leaf lgroups keep a cpucnt, only check leaves */
2138 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2139 
2140 			/* verify that lgrp is also a leaf */
2141 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2142 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2143 			    lpl->lpl_lgrpid)));
2144 
2145 			if ((lgrp->lgrp_childcnt > 0) ||
2146 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2147 			    lpl->lpl_lgrpid))) {
2148 				return (LPL_TOPO_LGRP_NOT_LEAF);
2149 			}
2150 
2151 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2152 			    (lpl->lpl_ncpu > 0));
2153 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2154 				(lpl->lpl_ncpu <= 0)) {
2155 				return (LPL_TOPO_BAD_CPUCNT);
2156 			}
2157 
2158 			/*
2159 			 * Check that lpl_ncpu also matches the number of
2160 			 * cpus in the lpl's linked list.  This only exists in
2161 			 * leaves, but they should always match.
2162 			 */
2163 			j = 0;
2164 			cpu = cp_start = lpl->lpl_cpus;
2165 			while (cpu != NULL) {
2166 				j++;
2167 
2168 				/* check to make sure cpu's lpl is leaf lpl */
2169 				ASSERT(cpu->cpu_lpl == lpl);
2170 				if (cpu->cpu_lpl != lpl) {
2171 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2172 				}
2173 
2174 				/* check next cpu */
2175 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2176 					continue;
2177 				} else {
2178 					cpu = NULL;
2179 				}
2180 			}
2181 
2182 			ASSERT(j == lpl->lpl_ncpu);
2183 			if (j != lpl->lpl_ncpu) {
2184 				return (LPL_TOPO_LPL_BAD_NCPU);
2185 			}
2186 
2187 			/*
2188 			 * Also, check that leaf lpl is contained in all
2189 			 * intermediate lpls that name the leaf as a descendant
2190 			 */
2191 
2192 			for (j = 0; j <= lgrp_alloc_max; j++) {
2193 				klgrpset_t intersect;
2194 				lgrp_t *lgrp_cand;
2195 				lpl_t *lpl_cand;
2196 
2197 				lgrp_cand = lgrp_table[j];
2198 				intersect = klgrpset_intersects(
2199 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2200 				    cpupart->cp_lgrpset);
2201 
2202 				if (!LGRP_EXISTS(lgrp_cand) ||
2203 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2204 				    cpupart->cp_lgrpset) ||
2205 				    (intersect == 0))
2206 					continue;
2207 
2208 				lpl_cand =
2209 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2210 
2211 				if (klgrpset_ismember(intersect,
2212 				    lgrp->lgrp_id)) {
2213 					ASSERT(lpl_rset_contains(lpl_cand,
2214 					    lpl));
2215 
2216 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2217 						return (LPL_TOPO_RSET_MSSNG_LF);
2218 					}
2219 				}
2220 			}
2221 
2222 		} else { /* non-leaf specific checks */
2223 
2224 			/*
2225 			 * Non-leaf lpls should have lpl_cpus == NULL
2226 			 * verify that this is so
2227 			 */
2228 			ASSERT(lpl->lpl_cpus == NULL);
2229 			if (lpl->lpl_cpus != NULL) {
2230 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2231 			}
2232 
2233 			/*
2234 			 * verify that the sum of the cpus in the leaf resources
2235 			 * is equal to the total ncpu in the intermediate
2236 			 */
2237 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2238 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2239 			}
2240 
2241 			ASSERT(sum == lpl->lpl_ncpu);
2242 			if (sum != lpl->lpl_ncpu) {
2243 				return (LPL_TOPO_LPL_BAD_NCPU);
2244 			}
2245 		}
2246 
2247 		/*
2248 		 * check on lpl_hint. Don't check root, since it has no parent.
2249 		 */
2250 		if (lpl->lpl_parent != NULL) {
2251 			int hint;
2252 			lpl_t *hint_lpl;
2253 
2254 			/* make sure hint is within limits of nrset */
2255 			hint = lpl->lpl_hint;
2256 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2257 			if (lpl->lpl_parent->lpl_nrset < hint) {
2258 				return (LPL_TOPO_BOGUS_HINT);
2259 			}
2260 
2261 			/* make sure hint points to valid lpl */
2262 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2263 			ASSERT(hint_lpl->lpl_ncpu > 0);
2264 			if (hint_lpl->lpl_ncpu <= 0) {
2265 				return (LPL_TOPO_BOGUS_HINT);
2266 			}
2267 		}
2268 
2269 		/*
2270 		 * Check the rset of the lpl in question.  Make sure that each
2271 		 * rset contains a subset of the resources in
2272 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2273 		 * sure that each rset doesn't include resources that are
2274 		 * outside of that set.  (Which would be resources somehow not
2275 		 * accounted for).
2276 		 */
2277 
2278 		klgrpset_clear(rset);
2279 		for (j = 0; j < lpl->lpl_nrset; j++) {
2280 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2281 		}
2282 		klgrpset_copy(cset, rset);
2283 		/* make sure lpl rset matches lgrp rset */
2284 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2285 		/* make sure rset is contained with in partition, too */
2286 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2287 
2288 		ASSERT(klgrpset_isempty(rset) &&
2289 			    klgrpset_isempty(cset));
2290 		if (!klgrpset_isempty(rset) ||
2291 		    !klgrpset_isempty(cset)) {
2292 			return (LPL_TOPO_RSET_MISMATCH);
2293 		}
2294 
2295 		/*
2296 		 * check to make sure lpl_nrset matches the number of rsets
2297 		 * contained in the lpl
2298 		 */
2299 
2300 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2301 		    j++);
2302 
2303 		ASSERT(j == lpl->lpl_nrset);
2304 		if (j != lpl->lpl_nrset) {
2305 			return (LPL_TOPO_BAD_RSETCNT);
2306 		}
2307 
2308 	}
2309 	return (LPL_TOPO_CORRECT);
2310 }
2311 
2312 /*
2313  * Flatten lpl topology to given number of levels.  This is presently only
2314  * implemented for a flatten to 2 levels, which will prune out the intermediates
2315  * and home the leaf lpls to the root lpl.
2316  */
2317 int
2318 lpl_topo_flatten(int levels)
2319 {
2320 	int		i;
2321 	uint_t		sum;
2322 	lgrp_t		*lgrp_cur;
2323 	lpl_t		*lpl_cur;
2324 	lpl_t		*lpl_root;
2325 	cpupart_t	*cp;
2326 
2327 	if (levels != 2)
2328 		return (0);
2329 
2330 	/* called w/ cpus paused - grab no locks! */
2331 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2332 	    !lgrp_initialized);
2333 
2334 	cp = cp_list_head;
2335 	do {
2336 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2337 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2338 
2339 		for (i = 0; i <= lgrp_alloc_max; i++) {
2340 			lgrp_cur = lgrp_table[i];
2341 			lpl_cur = &cp->cp_lgrploads[i];
2342 
2343 			if ((lgrp_cur == lgrp_root) ||
2344 			    (!LGRP_EXISTS(lgrp_cur) &&
2345 			    (lpl_cur->lpl_ncpu == 0)))
2346 				continue;
2347 
2348 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2349 				/*
2350 				 * this should be a deleted intermediate, so
2351 				 * clear it
2352 				 */
2353 				lpl_clear(lpl_cur);
2354 			} else if ((lpl_cur->lpl_nrset == 1) &&
2355 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2356 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2357 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2358 				/*
2359 				 * this is a leaf whose parent was deleted, or
2360 				 * whose parent had their lgrp deleted.  (And
2361 				 * whose parent will soon be deleted).  Point
2362 				 * this guy back to the root lpl.
2363 				 */
2364 				lpl_cur->lpl_parent = lpl_root;
2365 				lpl_rset_add(lpl_root, lpl_cur);
2366 			}
2367 
2368 		}
2369 
2370 		/*
2371 		 * Now that we're done, make sure the count on the root lpl is
2372 		 * correct, and update the hints of the children for the sake of
2373 		 * thoroughness
2374 		 */
2375 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2376 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2377 		}
2378 		lpl_root->lpl_ncpu = sum;
2379 		lpl_child_update(lpl_root, cp);
2380 
2381 		cp = cp->cp_next;
2382 	} while (cp != cp_list_head);
2383 
2384 	return (levels);
2385 }
2386 
2387 /*
2388  * Insert a lpl into the resource hierarchy and create any additional lpls that
2389  * are necessary to represent the varying states of locality for the cpu
2390  * resoruces newly added to the partition.
2391  *
2392  * This routine is clever enough that it can correctly add resources from the
2393  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2394  * those for which the lpl is a leaf as opposed to simply a named equally local
2395  * resource).  The one special case that needs additional processing is when a
2396  * new intermediate lpl is introduced.  Since the main loop only traverses
2397  * looking to add the leaf resource where it does not yet exist, additional work
2398  * is necessary to add other leaf resources that may need to exist in the newly
2399  * created intermediate.  This is performed by the second inner loop, and is
2400  * only done when the check for more than one overlapping resource succeeds.
2401  */
2402 
2403 void
2404 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2405 {
2406 	int		i;
2407 	int		j;
2408 	int		hint;
2409 	int		rset_num_intersect;
2410 	lgrp_t		*lgrp_cur;
2411 	lpl_t		*lpl_cur;
2412 	lpl_t		*lpl_parent;
2413 	lgrpid_t	parent_id;
2414 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2415 
2416 	for (i = 0; i <= lgrp_alloc_max; i++) {
2417 		lgrp_cur = lgrp_table[i];
2418 
2419 		/*
2420 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2421 		 * contained within the current lgrp, or if the current lgrp has
2422 		 * no leaves in this partition
2423 		 */
2424 
2425 		if (!LGRP_EXISTS(lgrp_cur) ||
2426 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2427 		    lpl_leaf->lpl_lgrpid) ||
2428 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2429 		    cpupart->cp_lgrpset))
2430 			continue;
2431 
2432 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2433 		if (lgrp_cur->lgrp_parent != NULL) {
2434 			/* if lgrp has a parent, assign it properly */
2435 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2436 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2437 		} else {
2438 			/* if not, make sure parent ptr gets set to null */
2439 			lpl_parent = NULL;
2440 		}
2441 
2442 		if (lpl_cur == lpl_leaf) {
2443 			/*
2444 			 * Almost all leaf state was initialized elsewhere.  The
2445 			 * only thing left to do is to set the parent.
2446 			 */
2447 			lpl_cur->lpl_parent = lpl_parent;
2448 			continue;
2449 		}
2450 
2451 		/*
2452 		 * Initialize intermediate lpl
2453 		 * Save this lpl's hint though. Since we're changing this
2454 		 * lpl's resources, we need to update the hint in this lpl's
2455 		 * children, but the hint in this lpl is unaffected and
2456 		 * should be preserved.
2457 		 */
2458 		hint = lpl_cur->lpl_hint;
2459 
2460 		lpl_clear(lpl_cur);
2461 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2462 
2463 		lpl_cur->lpl_hint = hint;
2464 		lpl_cur->lpl_parent = lpl_parent;
2465 
2466 		/* does new lpl need to be populated with other resources? */
2467 		rset_intersect =
2468 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2469 			cpupart->cp_lgrpset);
2470 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2471 
2472 		if (rset_num_intersect > 1) {
2473 			/*
2474 			 * If so, figure out what lpls have resources that
2475 			 * intersect this one, and add them.
2476 			 */
2477 			for (j = 0; j <= lgrp_alloc_max; j++) {
2478 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2479 				lpl_t	*lpl_cand;	/* candidate lpl */
2480 
2481 				lgrp_cand = lgrp_table[j];
2482 				if (!LGRP_EXISTS(lgrp_cand) ||
2483 				    !klgrpset_ismember(rset_intersect,
2484 					lgrp_cand->lgrp_id))
2485 					continue;
2486 				lpl_cand =
2487 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2488 				lpl_rset_add(lpl_cur, lpl_cand);
2489 			}
2490 		}
2491 		/*
2492 		 * This lpl's rset has changed. Update the hint in it's
2493 		 * children.
2494 		 */
2495 		lpl_child_update(lpl_cur, cpupart);
2496 	}
2497 }
2498 
2499 /*
2500  * remove a lpl from the hierarchy of resources, clearing its state when
2501  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2502  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2503  * delete them as well.
2504  */
2505 
2506 void
2507 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2508 {
2509 	int		i;
2510 	lgrp_t		*lgrp_cur;
2511 	lpl_t		*lpl_cur;
2512 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2513 
2514 	for (i = 0; i <= lgrp_alloc_max; i++) {
2515 		lgrp_cur = lgrp_table[i];
2516 
2517 		/*
2518 		 * Don't attempt to remove from lgrps that aren't there, that
2519 		 * don't contain our leaf, or from the leaf itself. (We do that
2520 		 * later)
2521 		 */
2522 
2523 		if (!LGRP_EXISTS(lgrp_cur))
2524 			continue;
2525 
2526 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2527 
2528 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2529 		    lpl_leaf->lpl_lgrpid) ||
2530 		    (lpl_cur == lpl_leaf)) {
2531 			continue;
2532 		}
2533 
2534 		/*
2535 		 * This is a slightly sleazy simplification in that we have
2536 		 * already marked the cp_lgrpset as no longer containing the
2537 		 * leaf we've deleted.  Any lpls that pass the above checks
2538 		 * based upon lgrp membership but not necessarily cpu-part
2539 		 * membership also get cleared by the checks below.  Currently
2540 		 * this is harmless, as the lpls should be empty anyway.
2541 		 *
2542 		 * In particular, we want to preserve lpls that have additional
2543 		 * leaf resources, even though we don't yet have a processor
2544 		 * architecture that represents resources this way.
2545 		 */
2546 
2547 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2548 		    cpupart->cp_lgrpset);
2549 
2550 		lpl_rset_del(lpl_cur, lpl_leaf);
2551 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2552 			lpl_clear(lpl_cur);
2553 		} else {
2554 			/*
2555 			 * Update this lpl's children
2556 			 */
2557 			lpl_child_update(lpl_cur, cpupart);
2558 		}
2559 	}
2560 	lpl_clear(lpl_leaf);
2561 }
2562 
2563 /*
2564  * add a cpu to a partition in terms of lgrp load avg bookeeping
2565  *
2566  * The lpl (cpu partition load average information) is now arranged in a
2567  * hierarchical fashion whereby resources that are closest, ie. most local, to
2568  * the cpu in question are considered to be leaves in a tree of resources.
2569  * There are two general cases for cpu additon:
2570  *
2571  * 1. A lpl structure that contains resources already in the hierarchy tree.
2572  * In this case, all of the associated lpl relationships have been defined, and
2573  * all that is necessary is that we link the new cpu into the per-lpl list of
2574  * cpus, and increment the ncpu count of all places where this cpu resource will
2575  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2576  * pushing is accomplished by this routine.
2577  *
2578  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2579  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2580  * construct the hierarchy of state necessary to name it's more distant
2581  * resources, if they should exist.  The leaf structure is initialized by this
2582  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2583  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2584  * and builds all of the "ancestoral" state necessary to identify resources at
2585  * differing levels of locality.
2586  */
2587 void
2588 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2589 {
2590 	cpupart_t	*cpupart;
2591 	lgrp_t		*lgrp_leaf;
2592 	lpl_t		*lpl_leaf;
2593 
2594 	/* called sometimes w/ cpus paused - grab no locks */
2595 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2596 
2597 	cpupart = cp->cpu_part;
2598 	lgrp_leaf = lgrp_table[lgrpid];
2599 
2600 	/* don't add non-existent lgrp */
2601 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2602 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2603 	cp->cpu_lpl = lpl_leaf;
2604 
2605 	/* only leaf lpls contain cpus */
2606 
2607 	if (lpl_leaf->lpl_ncpu++ == 0) {
2608 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2609 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2610 		lpl_leaf_insert(lpl_leaf, cpupart);
2611 	} else {
2612 		/*
2613 		 * the lpl should already exist in the parent, so just update
2614 		 * the count of available CPUs
2615 		 */
2616 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2617 	}
2618 
2619 	/* link cpu into list of cpus in lpl */
2620 
2621 	if (lpl_leaf->lpl_cpus) {
2622 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2623 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2624 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2625 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2626 	} else {
2627 		/*
2628 		 * We increment ncpu immediately after we create a new leaf
2629 		 * lpl, so assert that ncpu == 1 for the case where we don't
2630 		 * have any cpu pointers yet.
2631 		 */
2632 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2633 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2634 	}
2635 
2636 }
2637 
2638 
2639 /*
2640  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2641  *
2642  * The lpl (cpu partition load average information) is now arranged in a
2643  * hierarchical fashion whereby resources that are closest, ie. most local, to
2644  * the cpu in question are considered to be leaves in a tree of resources.
2645  * There are two removal cases in question:
2646  *
2647  * 1. Removal of the resource in the leaf leaves other resources remaining in
2648  * that leaf.  (Another cpu still exists at this level of locality).  In this
2649  * case, the count of available cpus is decremented in all assocated lpls by
2650  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2651  * from the per-cpu lpl list.
2652  *
2653  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2654  * empty)  In this case, all of what has occurred for the first step must take
2655  * place; however, additionally we must remove the lpl structure itself, prune
2656  * out any stranded lpls that do not directly name a leaf resource, and mark the
2657  * cpu partition in question as no longer containing resources from the lgrp of
2658  * the lpl that has been delted.  Cpu-partition changes are handled by this
2659  * method, but the lpl_leaf_remove function deals with the details of pruning
2660  * out the empty lpl and any of its orphaned direct ancestors.
2661  */
2662 void
2663 lgrp_part_del_cpu(cpu_t *cp)
2664 {
2665 	lpl_t		*lpl;
2666 	lpl_t		*leaf_lpl;
2667 	lgrp_t		*lgrp_leaf;
2668 
2669 	/* called sometimes w/ cpus paused - grab no locks */
2670 
2671 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2672 
2673 	lpl = leaf_lpl = cp->cpu_lpl;
2674 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2675 
2676 	/* don't delete a leaf that isn't there */
2677 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2678 
2679 	/* no double-deletes */
2680 	ASSERT(lpl->lpl_ncpu);
2681 	if (--lpl->lpl_ncpu == 0) {
2682 		/*
2683 		 * This was the last cpu in this lgroup for this partition,
2684 		 * clear its bit in the partition's lgroup bitmask
2685 		 */
2686 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2687 
2688 		/* eliminate remaning lpl link pointers in cpu, lpl */
2689 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2690 
2691 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2692 	} else {
2693 
2694 		/* unlink cpu from lists of cpus in lpl */
2695 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2696 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2697 		if (lpl->lpl_cpus == cp) {
2698 			lpl->lpl_cpus = cp->cpu_next_lpl;
2699 		}
2700 
2701 		/*
2702 		 * Update the cpu count in the lpls associated with parent
2703 		 * lgroups.
2704 		 */
2705 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2706 
2707 	}
2708 	/* clear cpu's lpl ptr when we're all done */
2709 	cp->cpu_lpl = NULL;
2710 }
2711 
2712 /*
2713  * Recompute load average for the specified partition/lgrp fragment.
2714  *
2715  * We rely on the fact that this routine is called from the clock thread
2716  * at a point before the clock thread can block (i.e. before its first
2717  * lock request).  Since the clock thread can not be preempted (since it
2718  * runs at highest priority), we know that cpu partitions can not change
2719  * (since doing so would require either the repartition requester or the
2720  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2721  * without grabbing cpu_lock.
2722  */
2723 void
2724 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2725 {
2726 	uint_t		ncpu;
2727 	int64_t		old, new, f;
2728 
2729 	/*
2730 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2731 	 */
2732 	static short expval[] = {
2733 	    0, 3196, 1618, 1083,
2734 	    814, 652, 543, 466,
2735 	    408, 363, 326, 297,
2736 	    272, 251, 233, 218,
2737 	    204, 192, 181, 172,
2738 	    163, 155, 148, 142,
2739 	    136, 130, 125, 121,
2740 	    116, 112, 109, 105
2741 	};
2742 
2743 	/* ASSERT (called from clock level) */
2744 
2745 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2746 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2747 		return;
2748 	}
2749 
2750 	for (;;) {
2751 
2752 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2753 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2754 		else
2755 			f = expval[ncpu];
2756 
2757 		/*
2758 		 * Modify the load average atomically to avoid losing
2759 		 * anticipatory load updates (see lgrp_move_thread()).
2760 		 */
2761 		if (ageflag) {
2762 			/*
2763 			 * We're supposed to both update and age the load.
2764 			 * This happens 10 times/sec. per cpu.  We do a
2765 			 * little hoop-jumping to avoid integer overflow.
2766 			 */
2767 			int64_t		q, r;
2768 
2769 			do {
2770 				old = new = lpl->lpl_loadavg;
2771 				q = (old  >> 16) << 7;
2772 				r = (old  & 0xffff) << 7;
2773 				new += ((long long)(nrcpus - q) * f -
2774 				    ((r * f) >> 16)) >> 7;
2775 
2776 				/*
2777 				 * Check for overflow
2778 				 */
2779 				if (new > LGRP_LOADAVG_MAX)
2780 					new = LGRP_LOADAVG_MAX;
2781 				else if (new < 0)
2782 					new = 0;
2783 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2784 			    new) != old);
2785 		} else {
2786 			/*
2787 			 * We're supposed to update the load, but not age it.
2788 			 * This option is used to update the load (which either
2789 			 * has already been aged in this 1/10 sec. interval or
2790 			 * soon will be) to account for a remotely executing
2791 			 * thread.
2792 			 */
2793 			do {
2794 				old = new = lpl->lpl_loadavg;
2795 				new += f;
2796 				/*
2797 				 * Check for overflow
2798 				 * Underflow not possible here
2799 				 */
2800 				if (new < old)
2801 					new = LGRP_LOADAVG_MAX;
2802 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2803 			    new) != old);
2804 		}
2805 
2806 		/*
2807 		 * Do the same for this lpl's parent
2808 		 */
2809 		if ((lpl = lpl->lpl_parent) == NULL)
2810 			break;
2811 		ncpu = lpl->lpl_ncpu;
2812 	}
2813 }
2814 
2815 /*
2816  * Initialize lpl topology in the target based on topology currently present in
2817  * lpl_bootstrap.
2818  *
2819  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2820  * initialize cp_default list of lpls. Up to this point all topology operations
2821  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2822  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2823  * `target' points to the list of lpls in cp_default and `size' is the size of
2824  * this list.
2825  *
2826  * This function walks the lpl topology in lpl_bootstrap and does for things:
2827  *
2828  * 1) Copies all fields from lpl_bootstrap to the target.
2829  *
2830  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2831  *
2832  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2833  *    instead of lpl_bootstrap.
2834  *
2835  * 4) Updates pointers in the resource list of the target to point to the lpls
2836  *    in the target list instead of lpl_bootstrap.
2837  *
2838  * After lpl_topo_bootstrap() completes, target contains the same information
2839  * that would be present there if it were used during boot instead of
2840  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2841  * and it is bzeroed.
2842  */
2843 void
2844 lpl_topo_bootstrap(lpl_t *target, int size)
2845 {
2846 	lpl_t	*lpl = lpl_bootstrap;
2847 	lpl_t	*target_lpl = target;
2848 	int	howmany;
2849 	int	id;
2850 	int	i;
2851 
2852 	/*
2853 	 * The only target that should be passed here is cp_default lpl list.
2854 	 */
2855 	ASSERT(target == cp_default.cp_lgrploads);
2856 	ASSERT(size == cp_default.cp_nlgrploads);
2857 	ASSERT(!lgrp_topo_initialized);
2858 	ASSERT(ncpus == 1);
2859 
2860 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2861 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2862 		/*
2863 		 * Copy all fields from lpl.
2864 		 */
2865 
2866 		*target_lpl = *lpl;
2867 
2868 		/*
2869 		 * Substitute CPU0 lpl pointer with one relative to target.
2870 		 */
2871 		if (lpl->lpl_cpus == CPU) {
2872 			ASSERT(CPU->cpu_lpl == lpl);
2873 			CPU->cpu_lpl = target_lpl;
2874 		}
2875 
2876 		/*
2877 		 * Substitute parent information with parent relative to target.
2878 		 */
2879 		if (lpl->lpl_parent != NULL)
2880 			target_lpl->lpl_parent = (lpl_t *)
2881 			    (((uintptr_t)lpl->lpl_parent -
2882 				(uintptr_t)lpl_bootstrap) +
2883 				(uintptr_t)target);
2884 
2885 		/*
2886 		 * Walk over resource set substituting pointers relative to
2887 		 * lpl_bootstrap to pointers relative to target.
2888 		 */
2889 		ASSERT(lpl->lpl_nrset <= 1);
2890 
2891 		for (id = 0; id < lpl->lpl_nrset; id++) {
2892 			if (lpl->lpl_rset[id] != NULL) {
2893 				target_lpl->lpl_rset[id] =
2894 				    (lpl_t *)
2895 				    (((uintptr_t)lpl->lpl_rset[id] -
2896 					(uintptr_t)lpl_bootstrap) +
2897 					(uintptr_t)target);
2898 			}
2899 		}
2900 	}
2901 
2902 	/*
2903 	 * Topology information in lpl_bootstrap is no longer needed.
2904 	 */
2905 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2906 }
2907 
2908 /* the maximum effect that a single thread can have on it's lgroup's load */
2909 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
2910 	((lgrp_loadavg_max_effect) / (ncpu))
2911 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
2912 
2913 /*
2914  * If the lowest load among the lgroups a process' threads are currently
2915  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2916  * expanding the process to a new lgroup.
2917  */
2918 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2919 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2920 
2921 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2922 	((lgrp_expand_proc_thresh) / (ncpu))
2923 
2924 /*
2925  * A process will be expanded to a new lgroup only if the difference between
2926  * the lowest load on the lgroups the process' thread's are currently spread
2927  * across and the lowest load on the other lgroups in the process' partition
2928  * is greater than lgrp_expand_proc_diff.
2929  */
2930 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2931 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2932 
2933 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2934 	((lgrp_expand_proc_diff) / (ncpu))
2935 
2936 /*
2937  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2938  * be present due to impreciseness of the load average decay algorithm.
2939  *
2940  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2941  * tolerance is scaled by the number of cpus in the lgroup just like
2942  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2943  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2944  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2945  */
2946 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2947 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2948 	((lgrp_loadavg_tolerance) / ncpu)
2949 
2950 /*
2951  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2952  * average is above this threshold
2953  */
2954 uint32_t	lgrp_load_thresh = UINT32_MAX;
2955 
2956 /*
2957  * lgrp_choose() will try to skip any lgroups with less memory
2958  * than this free when choosing a home lgroup
2959  */
2960 pgcnt_t	lgrp_mem_free_thresh = 0;
2961 
2962 /*
2963  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2964  * one based on one of the following policies:
2965  * - Random selection
2966  * - Pseudo round robin placement
2967  * - Longest time since a thread was last placed
2968  */
2969 #define	LGRP_CHOOSE_RANDOM	1
2970 #define	LGRP_CHOOSE_RR		2
2971 #define	LGRP_CHOOSE_TIME	3
2972 
2973 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2974 
2975 /*
2976  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2977  * be bound to a CPU or processor set.
2978  *
2979  * Arguments:
2980  *	t		The thread
2981  *	cpupart		The partition the thread belongs to.
2982  *
2983  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2984  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2985  *	 partitions changing out from under us and assumes that given thread is
2986  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2987  *	 disabled, so don't grab any locks because we should never block under
2988  *	 those conditions.
2989  */
2990 lpl_t *
2991 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2992 {
2993 	lgrp_load_t	bestload, bestrload;
2994 	int		lgrpid_offset, lgrp_count;
2995 	lgrp_id_t	lgrpid, lgrpid_start;
2996 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2997 	klgrpset_t	lgrpset;
2998 	proc_t		*p;
2999 
3000 	ASSERT(t != NULL);
3001 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3002 	    THREAD_LOCK_HELD(t));
3003 	ASSERT(cpupart != NULL);
3004 
3005 	p = t->t_procp;
3006 
3007 	/* A process should always be in an active partition */
3008 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3009 
3010 	bestlpl = bestrlpl = NULL;
3011 	bestload = bestrload = LGRP_LOADAVG_MAX;
3012 	lgrpset = cpupart->cp_lgrpset;
3013 
3014 	switch (lgrp_choose_policy) {
3015 	case LGRP_CHOOSE_RR:
3016 		lgrpid = cpupart->cp_lgrp_hint;
3017 		do {
3018 			if (++lgrpid > lgrp_alloc_max)
3019 				lgrpid = 0;
3020 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3021 
3022 		break;
3023 	default:
3024 	case LGRP_CHOOSE_TIME:
3025 	case LGRP_CHOOSE_RANDOM:
3026 		klgrpset_nlgrps(lgrpset, lgrp_count);
3027 		lgrpid_offset =
3028 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3029 		for (lgrpid = 0; ; lgrpid++) {
3030 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3031 				if (--lgrpid_offset == 0)
3032 					break;
3033 			}
3034 		}
3035 		break;
3036 	}
3037 
3038 	lgrpid_start = lgrpid;
3039 
3040 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3041 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3042 
3043 	/*
3044 	 * Use lgroup affinities (if any) to choose best lgroup
3045 	 *
3046 	 * NOTE: Assumes that thread is protected from going away and its
3047 	 *	 lgroup affinities won't change (ie. p_lock, or
3048 	 *	 thread_lock() being held and/or CPUs paused)
3049 	 */
3050 	if (t->t_lgrp_affinity) {
3051 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3052 		if (lpl != NULL)
3053 			return (lpl);
3054 	}
3055 
3056 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3057 	bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3058 
3059 	do {
3060 		pgcnt_t	npgs;
3061 
3062 		/*
3063 		 * Skip any lgroups outside of thread's pset
3064 		 */
3065 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3066 			if (++lgrpid > lgrp_alloc_max)
3067 				lgrpid = 0;	/* wrap the search */
3068 			continue;
3069 		}
3070 
3071 		/*
3072 		 * Skip any non-leaf lgroups
3073 		 */
3074 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3075 			continue;
3076 
3077 		/*
3078 		 * Skip any lgroups without enough free memory
3079 		 * (when threshold set to nonzero positive value)
3080 		 */
3081 		if (lgrp_mem_free_thresh > 0) {
3082 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3083 			if (npgs < lgrp_mem_free_thresh) {
3084 				if (++lgrpid > lgrp_alloc_max)
3085 					lgrpid = 0;	/* wrap the search */
3086 				continue;
3087 			}
3088 		}
3089 
3090 		lpl = &cpupart->cp_lgrploads[lgrpid];
3091 		if (klgrpset_isempty(p->p_lgrpset) ||
3092 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3093 			/*
3094 			 * Either this is a new process or the process already
3095 			 * has threads on this lgrp, so this is a preferred
3096 			 * lgroup for the thread.
3097 			 */
3098 			if (lpl_pick(lpl, bestlpl)) {
3099 				bestload = lpl->lpl_loadavg;
3100 				bestlpl = lpl;
3101 			}
3102 		} else {
3103 			/*
3104 			 * The process doesn't have any threads on this lgrp,
3105 			 * but we're willing to consider this lgrp if the load
3106 			 * difference is big enough to justify splitting up
3107 			 * the process' threads.
3108 			 */
3109 			if (lpl_pick(lpl, bestrlpl)) {
3110 				bestrload = lpl->lpl_loadavg;
3111 				bestrlpl = lpl;
3112 			}
3113 		}
3114 		if (++lgrpid > lgrp_alloc_max)
3115 			lgrpid = 0;	/* wrap the search */
3116 	} while (lgrpid != lgrpid_start);
3117 
3118 	/*
3119 	 * Return root lgroup if threshold isn't set to maximum value and
3120 	 * lowest lgroup load average more than a certain threshold
3121 	 */
3122 	if (lgrp_load_thresh != UINT32_MAX &&
3123 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3124 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3125 
3126 	/*
3127 	 * If all the lgroups over which the thread's process is spread are
3128 	 * heavily loaded, we'll consider placing the thread on one of the
3129 	 * other leaf lgroups in the thread's partition.
3130 	 */
3131 	if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3132 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3133 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3134 	    bestload)) {
3135 		bestlpl = bestrlpl;
3136 	}
3137 
3138 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3139 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3140 
3141 	ASSERT(bestlpl->lpl_ncpu > 0);
3142 	return (bestlpl);
3143 }
3144 
3145 /*
3146  * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing.
3147  */
3148 static int
3149 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3150 {
3151 	lgrp_load_t	l1, l2;
3152 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3153 
3154 
3155 	if (lpl2 == NULL)
3156 		return (1);
3157 
3158 	l1 = lpl1->lpl_loadavg;
3159 	l2 = lpl2->lpl_loadavg;
3160 
3161 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3162 		/* lpl1 is significantly less loaded than lpl2 */
3163 		return (1);
3164 	}
3165 
3166 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3167 	    l1 + tolerance >= l2 && l1 < l2 &&
3168 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3169 		/*
3170 		 * lpl1's load is within the tolerance of lpl2. We're
3171 		 * willing to consider it be to better however if
3172 		 * it has been longer since we last homed a thread there
3173 		 */
3174 		return (1);
3175 	}
3176 
3177 	return (0);
3178 }
3179 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() adds this value to the thread's t_anttime and compares
 * the sum with gethrtime(); if the thread leaves its lgroup before that much
 * time has passed, the anticipatory load added on its behalf is removed.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3188 
3189 /*
3190  * Routine to change a thread's lgroup affiliation.  This routine updates
3191  * the thread's kthread_t struct and its process' proc_t struct to note the
3192  * thread's new lgroup affiliation, and its lgroup affinities.
3193  *
3194  * Note that this is the only routine that modifies a thread's t_lpl field,
3195  * and that adds in or removes anticipatory load.
3196  *
3197  * If the thread is exiting, newlpl is NULL.
3198  *
3199  * Locking:
3200  * The following lock must be held on entry:
3201  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3202  *		doesn't get removed from t's partition
3203  *
3204  * This routine is not allowed to grab any locks, since it may be called
3205  * with cpus paused (such as from cpu_offline).
3206  */
3207 void
3208 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3209 {
3210 	proc_t		*p;
3211 	lpl_t		*lpl, *oldlpl;
3212 	lgrp_id_t	oldid;
3213 	kthread_t	*tp;
3214 	uint_t		ncpu;
3215 	lgrp_load_t	old, new;
3216 
3217 	ASSERT(t);
3218 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3219 	    THREAD_LOCK_HELD(t));
3220 
3221 	/*
3222 	 * If not changing lpls, just return
3223 	 */
3224 	if ((oldlpl = t->t_lpl) == newlpl)
3225 		return;
3226 
3227 	/*
3228 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3229 	 * associated with process 0 rather than with its original process).
3230 	 */
3231 	if (t->t_proc_flag & TP_LWPEXIT) {
3232 		if (newlpl != NULL) {
3233 			t->t_lpl = newlpl;
3234 		}
3235 		return;
3236 	}
3237 
3238 	p = ttoproc(t);
3239 
3240 	/*
3241 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3242 	 * to account for it being moved from its old lgroup.
3243 	 */
3244 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3245 	    (p->p_tlist != NULL)) {
3246 		oldid = oldlpl->lpl_lgrpid;
3247 
3248 		if (newlpl != NULL)
3249 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3250 
3251 		if ((do_lgrpset_delete) &&
3252 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3253 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3254 				/*
3255 				 * Check if a thread other than the thread
3256 				 * that's moving is assigned to the same
3257 				 * lgroup as the thread that's moving.  Note
3258 				 * that we have to compare lgroup IDs, rather
3259 				 * than simply comparing t_lpl's, since the
3260 				 * threads may belong to different partitions
3261 				 * but be assigned to the same lgroup.
3262 				 */
3263 				ASSERT(tp->t_lpl != NULL);
3264 
3265 				if ((tp != t) &&
3266 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3267 					/*
3268 					 * Another thread is assigned to the
3269 					 * same lgroup as the thread that's
3270 					 * moving, p_lgrpset doesn't change.
3271 					 */
3272 					break;
3273 				} else if (tp == p->p_tlist) {
3274 					/*
3275 					 * No other thread is assigned to the
3276 					 * same lgroup as the exiting thread,
3277 					 * clear the lgroup's bit in p_lgrpset.
3278 					 */
3279 					klgrpset_del(p->p_lgrpset, oldid);
3280 					break;
3281 				}
3282 			}
3283 		}
3284 
3285 		/*
3286 		 * If this thread was assigned to its old lgroup for such a
3287 		 * short amount of time that the anticipatory load that was
3288 		 * added on its behalf has aged very little, remove that
3289 		 * anticipatory load.
3290 		 */
3291 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3292 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3293 			lpl = oldlpl;
3294 			for (;;) {
3295 				do {
3296 					old = new = lpl->lpl_loadavg;
3297 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3298 					if (new > old) {
3299 						/*
3300 						 * this can happen if the load
3301 						 * average was aged since we
3302 						 * added in the anticipatory
3303 						 * load
3304 						 */
3305 						new = 0;
3306 					}
3307 				} while (cas32(
3308 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3309 					    new) != old);
3310 
3311 				lpl = lpl->lpl_parent;
3312 				if (lpl == NULL)
3313 					break;
3314 
3315 				ncpu = lpl->lpl_ncpu;
3316 				ASSERT(ncpu > 0);
3317 			}
3318 		}
3319 	}
3320 	/*
3321 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3322 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3323 	 * to its new lgroup to account for its move to its new lgroup.
3324 	 */
3325 	if (newlpl != NULL) {
3326 		/*
3327 		 * This thread is moving to a new lgroup
3328 		 */
3329 		t->t_lpl = newlpl;
3330 
3331 		/*
3332 		 * Reflect move in load average of new lgroup
3333 		 * unless it is root lgroup
3334 		 */
3335 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3336 			return;
3337 
3338 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3339 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3340 		}
3341 
3342 		/*
3343 		 * It'll take some time for the load on the new lgroup
3344 		 * to reflect this thread's placement on it.  We'd
3345 		 * like not, however, to have all threads between now
3346 		 * and then also piling on to this lgroup.  To avoid
3347 		 * this pileup, we anticipate the load this thread
3348 		 * will generate on its new lgroup.  The goal is to
3349 		 * make the lgroup's load appear as though the thread
3350 		 * had been there all along.  We're very conservative
3351 		 * in calculating this anticipatory load, we assume
3352 		 * the worst case case (100% CPU-bound thread).  This
3353 		 * may be modified in the future to be more accurate.
3354 		 */
3355 		lpl = newlpl;
3356 		for (;;) {
3357 			ncpu = lpl->lpl_ncpu;
3358 			ASSERT(ncpu > 0);
3359 			do {
3360 				old = new = lpl->lpl_loadavg;
3361 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3362 				/*
3363 				 * Check for overflow
3364 				 * Underflow not possible here
3365 				 */
3366 				if (new < old)
3367 					new = UINT32_MAX;
3368 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3369 			    new) != old);
3370 
3371 			lpl = lpl->lpl_parent;
3372 			if (lpl == NULL)
3373 				break;
3374 		}
3375 		t->t_anttime = gethrtime();
3376 	}
3377 }
3378 
3379 /*
3380  * Return lgroup memory allocation policy given advice from madvise(3C)
3381  */
3382 lgrp_mem_policy_t
3383 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3384 {
3385 	switch (advice) {
3386 	case MADV_ACCESS_LWP:
3387 		return (LGRP_MEM_POLICY_NEXT);
3388 	case MADV_ACCESS_MANY:
3389 		return (LGRP_MEM_POLICY_RANDOM);
3390 	default:
3391 		return (lgrp_mem_policy_default(size, type));
3392 	}
3393 }
3394 
3395 /*
3396  * Figure out default policy
3397  */
3398 lgrp_mem_policy_t
3399 lgrp_mem_policy_default(size_t size, int type)
3400 {
3401 	cpupart_t		*cp;
3402 	lgrp_mem_policy_t	policy;
3403 	size_t			pset_mem_size;
3404 
3405 	/*
3406 	 * Randomly allocate memory across lgroups for shared memory
3407 	 * beyond a certain threshold
3408 	 */
3409 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3410 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3411 		/*
3412 		 * Get total memory size of current thread's pset
3413 		 */
3414 		kpreempt_disable();
3415 		cp = curthread->t_cpupart;
3416 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3417 		kpreempt_enable();
3418 
3419 		/*
3420 		 * Choose policy to randomly allocate memory across
3421 		 * lgroups in pset if it will fit and is not default
3422 		 * partition.  Otherwise, allocate memory randomly
3423 		 * across machine.
3424 		 */
3425 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3426 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3427 		else
3428 			policy = LGRP_MEM_POLICY_RANDOM;
3429 	} else
3430 		/*
3431 		 * Apply default policy for private memory and
3432 		 * shared memory under the respective random
3433 		 * threshold.
3434 		 */
3435 		policy = lgrp_mem_default_policy;
3436 
3437 	return (policy);
3438 }
3439 
3440 /*
3441  * Get memory allocation policy for this segment
3442  */
3443 lgrp_mem_policy_info_t *
3444 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3445 {
3446 	lgrp_mem_policy_info_t	*policy_info;
3447 	extern struct seg_ops	segspt_ops;
3448 	extern struct seg_ops	segspt_shmops;
3449 
3450 	/*
3451 	 * This is for binary compatibility to protect against third party
3452 	 * segment drivers which haven't recompiled to allow for
3453 	 * SEGOP_GETPOLICY()
3454 	 */
3455 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3456 	    seg->s_ops != &segspt_shmops)
3457 		return (NULL);
3458 
3459 	policy_info = NULL;
3460 	if (seg->s_ops->getpolicy != NULL)
3461 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3462 
3463 	return (policy_info);
3464 }
3465 
3466 /*
3467  * Set policy for allocating private memory given desired policy, policy info,
3468  * size in bytes of memory that policy is being applied.
3469  * Return 0 if policy wasn't set already and 1 if policy was set already
3470  */
3471 int
3472 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3473     lgrp_mem_policy_info_t *policy_info, size_t size)
3474 {
3475 
3476 	ASSERT(policy_info != NULL);
3477 
3478 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3479 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3480 
3481 	/*
3482 	 * Policy set already?
3483 	 */
3484 	if (policy == policy_info->mem_policy)
3485 		return (1);
3486 
3487 	/*
3488 	 * Set policy
3489 	 */
3490 	policy_info->mem_policy = policy;
3491 	policy_info->mem_reserved = 0;
3492 
3493 	return (0);
3494 }
3495 
3496 
3497 /*
3498  * Get shared memory allocation policy with given tree and offset
3499  */
3500 lgrp_mem_policy_info_t *
3501 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3502     u_offset_t vn_off)
3503 {
3504 	u_offset_t		off;
3505 	lgrp_mem_policy_info_t	*policy_info;
3506 	lgrp_shm_policy_seg_t	*policy_seg;
3507 	lgrp_shm_locality_t	*shm_locality;
3508 	avl_tree_t		*tree;
3509 	avl_index_t		where;
3510 
3511 	/*
3512 	 * Get policy segment tree from anon_map or vnode and use specified
3513 	 * anon index or vnode offset as offset
3514 	 *
3515 	 * Assume that no lock needs to be held on anon_map or vnode, since
3516 	 * they should be protected by their reference count which must be
3517 	 * nonzero for an existing segment
3518 	 */
3519 	if (amp) {
3520 		ASSERT(amp->refcnt != 0);
3521 		shm_locality = amp->locality;
3522 		if (shm_locality == NULL)
3523 			return (NULL);
3524 		tree = shm_locality->loc_tree;
3525 		off = ptob(anon_index);
3526 	} else if (vp) {
3527 		shm_locality = vp->v_locality;
3528 		if (shm_locality == NULL)
3529 			return (NULL);
3530 		ASSERT(shm_locality->loc_count != 0);
3531 		tree = shm_locality->loc_tree;
3532 		off = vn_off;
3533 	}
3534 
3535 	if (tree == NULL)
3536 		return (NULL);
3537 
3538 	/*
3539 	 * Lookup policy segment for offset into shared object and return
3540 	 * policy info
3541 	 */
3542 	rw_enter(&shm_locality->loc_lock, RW_READER);
3543 	policy_info = NULL;
3544 	policy_seg = avl_find(tree, &off, &where);
3545 	if (policy_seg)
3546 		policy_info = &policy_seg->shm_policy;
3547 	rw_exit(&shm_locality->loc_lock);
3548 
3549 	return (policy_info);
3550 }
3551 
3552 /*
3553  * Return lgroup to use for allocating memory
3554  * given the segment and address
3555  *
3556  * There isn't any mutual exclusion that exists between calls
3557  * to this routine and DR, so this routine and whomever calls it
3558  * should be mindful of the possibility that the lgrp returned
3559  * may be deleted. If this happens, dereferences of the lgrp
3560  * pointer will still be safe, but the resources in the lgrp will
3561  * be gone, and LGRP_EXISTS() will no longer be true.
3562  */
3563 lgrp_t *
3564 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3565 {
3566 	int			i;
3567 	lgrp_t			*lgrp;
3568 	klgrpset_t		lgrpset;
3569 	int			lgrps_spanned;
3570 	unsigned long		off;
3571 	lgrp_mem_policy_t	policy;
3572 	lgrp_mem_policy_info_t	*policy_info;
3573 	ushort_t		random;
3574 	int			stat = 0;
3575 
3576 	/*
3577 	 * Just return null if the lgrp framework hasn't finished
3578 	 * initializing or if this is a UMA machine.
3579 	 */
3580 	if (nlgrps == 1 || !lgrp_initialized)
3581 		return (lgrp_root);
3582 
3583 	/*
3584 	 * Get memory allocation policy for this segment
3585 	 */
3586 	policy = lgrp_mem_default_policy;
3587 	if (seg != NULL) {
3588 		if (seg->s_as == &kas) {
3589 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3590 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3591 				policy = LGRP_MEM_POLICY_RANDOM;
3592 		} else {
3593 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3594 			if (policy_info != NULL)
3595 				policy = policy_info->mem_policy;
3596 		}
3597 	}
3598 	lgrpset = 0;
3599 
3600 	/*
3601 	 * Initialize lgroup to home by default
3602 	 */
3603 	lgrp = lgrp_home_lgrp();
3604 
3605 	/*
3606 	 * When homing threads on root lgrp, override default memory
3607 	 * allocation policies with root lgroup memory allocation policy
3608 	 */
3609 	if (lgrp == lgrp_root)
3610 		policy = lgrp_mem_policy_root;
3611 
3612 	/*
3613 	 * Implement policy
3614 	 */
3615 	switch (policy) {
3616 	case LGRP_MEM_POLICY_NEXT_CPU:
3617 
3618 		/*
3619 		 * Return lgroup of current CPU which faulted on memory
3620 		 */
3621 		lgrp = lgrp_cpu_to_lgrp(CPU);
3622 		break;
3623 
3624 	case LGRP_MEM_POLICY_NEXT:
3625 	case LGRP_MEM_POLICY_DEFAULT:
3626 	default:
3627 
3628 		/*
3629 		 * Just return current thread's home lgroup
3630 		 * for default policy (next touch)
3631 		 * If the thread is homed to the root,
3632 		 * then the default policy is random across lgroups.
3633 		 * Fallthrough to the random case.
3634 		 */
3635 		if (lgrp != lgrp_root) {
3636 			if (policy == LGRP_MEM_POLICY_NEXT)
3637 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3638 			else
3639 				lgrp_stat_add(lgrp->lgrp_id,
3640 				    LGRP_NUM_DEFAULT, 1);
3641 			break;
3642 		}
3643 		/* LINTED fallthrough on case statement */
3644 	case LGRP_MEM_POLICY_RANDOM:
3645 
3646 		/*
3647 		 * Return a random leaf lgroup with memory
3648 		 */
3649 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3650 		/*
3651 		 * Count how many lgroups are spanned
3652 		 */
3653 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3654 
3655 		/*
3656 		 * There may be no memnodes in the root lgroup during DR copy
3657 		 * rename on a system with only two boards (memnodes)
3658 		 * configured. In this case just return the root lgrp.
3659 		 */
3660 		if (lgrps_spanned == 0) {
3661 			lgrp = lgrp_root;
3662 			break;
3663 		}
3664 
3665 		/*
3666 		 * Pick a random offset within lgroups spanned
3667 		 * and return lgroup at that offset
3668 		 */
3669 		random = (ushort_t)gethrtime() >> 4;
3670 		off = random % lgrps_spanned;
3671 		ASSERT(off <= lgrp_alloc_max);
3672 
3673 		for (i = 0; i <= lgrp_alloc_max; i++) {
3674 			if (!klgrpset_ismember(lgrpset, i))
3675 				continue;
3676 			if (off)
3677 				off--;
3678 			else {
3679 				lgrp = lgrp_table[i];
3680 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3681 				    1);
3682 				break;
3683 			}
3684 		}
3685 		break;
3686 
3687 	case LGRP_MEM_POLICY_RANDOM_PROC:
3688 
3689 		/*
3690 		 * Grab copy of bitmask of lgroups spanned by
3691 		 * this process
3692 		 */
3693 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3694 		stat = LGRP_NUM_RANDOM_PROC;
3695 
3696 		/* LINTED fallthrough on case statement */
3697 	case LGRP_MEM_POLICY_RANDOM_PSET:
3698 
3699 		if (!stat)
3700 			stat = LGRP_NUM_RANDOM_PSET;
3701 
3702 		if (klgrpset_isempty(lgrpset)) {
3703 			/*
3704 			 * Grab copy of bitmask of lgroups spanned by
3705 			 * this processor set
3706 			 */
3707 			kpreempt_disable();
3708 			klgrpset_copy(lgrpset,
3709 			    curthread->t_cpupart->cp_lgrpset);
3710 			kpreempt_enable();
3711 		}
3712 
3713 		/*
3714 		 * Count how many lgroups are spanned
3715 		 */
3716 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3717 		ASSERT(lgrps_spanned <= nlgrps);
3718 
3719 		/*
3720 		 * Probably lgrps_spanned should be always non-zero, but to be
3721 		 * on the safe side we return lgrp_root if it is empty.
3722 		 */
3723 		if (lgrps_spanned == 0) {
3724 			lgrp = lgrp_root;
3725 			break;
3726 		}
3727 
3728 		/*
3729 		 * Pick a random offset within lgroups spanned
3730 		 * and return lgroup at that offset
3731 		 */
3732 		random = (ushort_t)gethrtime() >> 4;
3733 		off = random % lgrps_spanned;
3734 		ASSERT(off <= lgrp_alloc_max);
3735 
3736 		for (i = 0; i <= lgrp_alloc_max; i++) {
3737 			if (!klgrpset_ismember(lgrpset, i))
3738 				continue;
3739 			if (off)
3740 				off--;
3741 			else {
3742 				lgrp = lgrp_table[i];
3743 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3744 				    1);
3745 				break;
3746 			}
3747 		}
3748 		break;
3749 
3750 	case LGRP_MEM_POLICY_ROUNDROBIN:
3751 
3752 		/*
3753 		 * Use offset within segment to determine
3754 		 * offset from home lgroup to choose for
3755 		 * next lgroup to allocate memory from
3756 		 */
3757 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3758 		    (lgrp_alloc_max + 1);
3759 
3760 		kpreempt_disable();
3761 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3762 		i = lgrp->lgrp_id;
3763 		kpreempt_enable();
3764 
3765 		while (off > 0) {
3766 			i = (i + 1) % (lgrp_alloc_max + 1);
3767 			lgrp = lgrp_table[i];
3768 			if (klgrpset_ismember(lgrpset, i))
3769 				off--;
3770 		}
3771 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3772 
3773 		break;
3774 	}
3775 
3776 	ASSERT(lgrp != NULL);
3777 	return (lgrp);
3778 }
3779 
3780 /*
3781  * Return the number of pages in an lgroup
3782  *
3783  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3784  *	 could cause tests that rely on the numat driver to fail....
3785  */
3786 pgcnt_t
3787 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3788 {
3789 	lgrp_t *lgrp;
3790 
3791 	lgrp = lgrp_table[lgrpid];
3792 	if (!LGRP_EXISTS(lgrp) ||
3793 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3794 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3795 		return (0);
3796 
3797 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3798 }
3799 
3800 /*
3801  * Initialize lgroup shared memory allocation policy support
3802  */
3803 void
3804 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3805 {
3806 	lgrp_shm_locality_t	*shm_locality;
3807 
3808 	/*
3809 	 * Initialize locality field in anon_map
3810 	 * Don't need any locks because this is called when anon_map is
3811 	 * allocated, but not used anywhere yet.
3812 	 */
3813 	if (amp) {
3814 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3815 		if (amp->locality == NULL) {
3816 			/*
3817 			 * Allocate and initialize shared memory locality info
3818 			 * and set anon_map locality pointer to it
3819 			 * Drop lock across kmem_alloc(KM_SLEEP)
3820 			 */
3821 			ANON_LOCK_EXIT(&amp->a_rwlock);
3822 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3823 			    KM_SLEEP);
3824 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3825 			    NULL);
3826 			shm_locality->loc_count = 1;	/* not used for amp */
3827 			shm_locality->loc_tree = NULL;
3828 
3829 			/*
3830 			 * Reacquire lock and check to see whether anyone beat
3831 			 * us to initializing the locality info
3832 			 */
3833 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3834 			if (amp->locality != NULL) {
3835 				rw_destroy(&shm_locality->loc_lock);
3836 				kmem_free(shm_locality,
3837 				    sizeof (*shm_locality));
3838 			} else
3839 				amp->locality = shm_locality;
3840 		}
3841 		ANON_LOCK_EXIT(&amp->a_rwlock);
3842 		return;
3843 	}
3844 
3845 	/*
3846 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3847 	 */
3848 	mutex_enter(&vp->v_lock);
3849 	if ((vp->v_flag & V_LOCALITY) == 0) {
3850 		/*
3851 		 * Allocate and initialize shared memory locality info
3852 		 */
3853 		mutex_exit(&vp->v_lock);
3854 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3855 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3856 		shm_locality->loc_count = 1;
3857 		shm_locality->loc_tree = NULL;
3858 
3859 		/*
3860 		 * Point vnode locality field at shared vnode policy info
3861 		 * and set locality aware flag in vnode
3862 		 */
3863 		mutex_enter(&vp->v_lock);
3864 		if ((vp->v_flag & V_LOCALITY) == 0) {
3865 			vp->v_locality = shm_locality;
3866 			vp->v_flag |= V_LOCALITY;
3867 		} else {
3868 			/*
3869 			 * Lost race so free locality info and increment count.
3870 			 */
3871 			rw_destroy(&shm_locality->loc_lock);
3872 			kmem_free(shm_locality, sizeof (*shm_locality));
3873 			shm_locality = vp->v_locality;
3874 			shm_locality->loc_count++;
3875 		}
3876 		mutex_exit(&vp->v_lock);
3877 
3878 		return;
3879 	}
3880 
3881 	/*
3882 	 * Increment reference count of number of segments mapping this vnode
3883 	 * shared
3884 	 */
3885 	shm_locality = vp->v_locality;
3886 	shm_locality->loc_count++;
3887 	mutex_exit(&vp->v_lock);
3888 }
3889 
3890 /*
3891  * Destroy the given shared memory policy segment tree
3892  */
3893 void
3894 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3895 {
3896 	lgrp_shm_policy_seg_t	*cur;
3897 	lgrp_shm_policy_seg_t	*next;
3898 
3899 	if (tree == NULL)
3900 		return;
3901 
3902 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3903 	while (cur != NULL) {
3904 		next = AVL_NEXT(tree, cur);
3905 		avl_remove(tree, cur);
3906 		kmem_free(cur, sizeof (*cur));
3907 		cur = next;
3908 	}
3909 	kmem_free(tree, sizeof (avl_tree_t));
3910 }
3911 
3912 /*
3913  * Uninitialize lgroup shared memory allocation policy support
3914  */
3915 void
3916 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3917 {
3918 	lgrp_shm_locality_t	*shm_locality;
3919 
3920 	/*
3921 	 * For anon_map, deallocate shared memory policy tree and
3922 	 * zero locality field
3923 	 * Don't need any locks because anon_map is being freed
3924 	 */
3925 	if (amp) {
3926 		if (amp->locality == NULL)
3927 			return;
3928 		shm_locality = amp->locality;
3929 		shm_locality->loc_count = 0;	/* not really used for amp */
3930 		rw_destroy(&shm_locality->loc_lock);
3931 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3932 		kmem_free(shm_locality, sizeof (*shm_locality));
3933 		amp->locality = 0;
3934 		return;
3935 	}
3936 
3937 	/*
3938 	 * For vnode, decrement reference count of segments mapping this vnode
3939 	 * shared and delete locality info if reference count drops to 0
3940 	 */
3941 	mutex_enter(&vp->v_lock);
3942 	shm_locality = vp->v_locality;
3943 	shm_locality->loc_count--;
3944 
3945 	if (shm_locality->loc_count == 0) {
3946 		rw_destroy(&shm_locality->loc_lock);
3947 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3948 		kmem_free(shm_locality, sizeof (*shm_locality));
3949 		vp->v_locality = 0;
3950 		vp->v_flag &= ~V_LOCALITY;
3951 	}
3952 	mutex_exit(&vp->v_lock);
3953 }
3954 
3955 /*
3956  * Compare two shared memory policy segments
3957  * Used by AVL tree code for searching
3958  */
3959 int
3960 lgrp_shm_policy_compar(const void *x, const void *y)
3961 {
3962 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3963 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3964 
3965 	if (a->shm_off < b->shm_off)
3966 		return (-1);
3967 	if (a->shm_off >= b->shm_off + b->shm_size)
3968 		return (1);
3969 	return (0);
3970 }
3971 
3972 /*
3973  * Concatenate seg1 with seg2 and remove seg2
3974  */
3975 static int
3976 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
3977     lgrp_shm_policy_seg_t *seg2)
3978 {
3979 	if (!seg1 || !seg2 ||
3980 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
3981 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
3982 		return (-1);
3983 
3984 	seg1->shm_size += seg2->shm_size;
3985 	avl_remove(tree, seg2);
3986 	kmem_free(seg2, sizeof (*seg2));
3987 	return (0);
3988 }
3989 
3990 /*
3991  * Split segment at given offset and return rightmost (uppermost) segment
3992  * Assumes that there are no overlapping segments
3993  */
3994 static lgrp_shm_policy_seg_t *
3995 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
3996     u_offset_t off)
3997 {
3998 	lgrp_shm_policy_seg_t	*newseg;
3999 	avl_index_t		where;
4000 
4001 	ASSERT(seg != NULL);
4002 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4003 
4004 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4005 	    seg->shm_size)
4006 		return (NULL);
4007 
4008 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4009 		return (seg);
4010 
4011 	/*
4012 	 * Adjust size of left segment and allocate new (right) segment
4013 	 */
4014 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4015 	newseg->shm_policy = seg->shm_policy;
4016 	newseg->shm_off = off;
4017 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4018 	seg->shm_size = off - seg->shm_off;
4019 
4020 	/*
4021 	 * Find where to insert new segment in AVL tree and insert it
4022 	 */
4023 	(void) avl_find(tree, &off, &where);
4024 	avl_insert(tree, newseg, where);
4025 
4026 	return (newseg);
4027 }
4028 
4029 /*
4030  * Set shared memory allocation policy on specified shared object at given
4031  * offset and length
4032  *
4033  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4034  * -1 if can't set policy.
4035  */
4036 int
4037 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4038     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4039 {
4040 	u_offset_t		eoff;
4041 	lgrp_shm_policy_seg_t	*next;
4042 	lgrp_shm_policy_seg_t	*newseg;
4043 	u_offset_t		off;
4044 	u_offset_t		oldeoff;
4045 	lgrp_shm_policy_seg_t	*prev;
4046 	int			retval;
4047 	lgrp_shm_policy_seg_t	*seg;
4048 	lgrp_shm_locality_t	*shm_locality;
4049 	avl_tree_t		*tree;
4050 	avl_index_t		where;
4051 
4052 	ASSERT(amp || vp);
4053 	ASSERT((len & PAGEOFFSET) == 0);
4054 
4055 	if (len == 0)
4056 		return (-1);
4057 
4058 	retval = 0;
4059 
4060 	/*
4061 	 * Get locality info and starting offset into shared object
4062 	 * Try anon map first and then vnode
4063 	 * Assume that no locks need to be held on anon_map or vnode, since
4064 	 * it should be protected by its reference count which must be nonzero
4065 	 * for an existing segment.
4066 	 */
4067 	if (amp) {
4068 		/*
4069 		 * Get policy info from anon_map
4070 		 *
4071 		 */
4072 		ASSERT(amp->refcnt != 0);
4073 		if (amp->locality == NULL)
4074 			lgrp_shm_policy_init(amp, NULL);
4075 		shm_locality = amp->locality;
4076 		off = ptob(anon_index);
4077 	} else if (vp) {
4078 		/*
4079 		 * Get policy info from vnode
4080 		 */
4081 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4082 			lgrp_shm_policy_init(NULL, vp);
4083 		shm_locality = vp->v_locality;
4084 		ASSERT(shm_locality->loc_count != 0);
4085 		off = vn_off;
4086 	} else
4087 		return (-1);
4088 
4089 	ASSERT((off & PAGEOFFSET) == 0);
4090 
4091 	/*
4092 	 * Figure out default policy
4093 	 */
4094 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4095 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4096 
4097 	/*
4098 	 * Create AVL tree if there isn't one yet
4099 	 * and set locality field to point at it
4100 	 */
4101 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4102 	tree = shm_locality->loc_tree;
4103 	if (!tree) {
4104 		rw_exit(&shm_locality->loc_lock);
4105 
4106 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4107 
4108 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4109 		if (shm_locality->loc_tree == NULL) {
4110 			avl_create(tree, lgrp_shm_policy_compar,
4111 			    sizeof (lgrp_shm_policy_seg_t),
4112 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4113 			shm_locality->loc_tree = tree;
4114 		} else {
4115 			/*
4116 			 * Another thread managed to set up the tree
4117 			 * before we could. Free the tree we allocated
4118 			 * and use the one that's already there.
4119 			 */
4120 			kmem_free(tree, sizeof (*tree));
4121 			tree = shm_locality->loc_tree;
4122 		}
4123 	}
4124 
4125 	/*
4126 	 * Set policy
4127 	 *
4128 	 * Need to maintain hold on writer's lock to keep tree from
4129 	 * changing out from under us
4130 	 */
4131 	while (len != 0) {
4132 		/*
4133 		 * Find policy segment for specified offset into shared object
4134 		 */
4135 		seg = avl_find(tree, &off, &where);
4136 
4137 		/*
4138 		 * Didn't find any existing segment that contains specified
4139 		 * offset, so allocate new segment, insert it, and concatenate
4140 		 * with adjacent segments if possible
4141 		 */
4142 		if (seg == NULL) {
4143 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4144 			    KM_SLEEP);
4145 			newseg->shm_policy.mem_policy = policy;
4146 			newseg->shm_policy.mem_reserved = 0;
4147 			newseg->shm_off = off;
4148 			avl_insert(tree, newseg, where);
4149 
4150 			/*
4151 			 * Check to see whether new segment overlaps with next
4152 			 * one, set length of new segment accordingly, and
4153 			 * calculate remaining length and next offset
4154 			 */
4155 			seg = AVL_NEXT(tree, newseg);
4156 			if (seg == NULL || off + len <= seg->shm_off) {
4157 				newseg->shm_size = len;
4158 				len = 0;
4159 			} else {
4160 				newseg->shm_size = seg->shm_off - off;
4161 				off = seg->shm_off;
4162 				len -= newseg->shm_size;
4163 			}
4164 
4165 			/*
4166 			 * Try to concatenate new segment with next and
4167 			 * previous ones, since they might have the same policy
4168 			 * now.  Grab previous and next segments first because
4169 			 * they will change on concatenation.
4170 			 */
4171 			prev =  AVL_PREV(tree, newseg);
4172 			next = AVL_NEXT(tree, newseg);
4173 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4174 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4175 
4176 			continue;
4177 		}
4178 
4179 		eoff = off + len;
4180 		oldeoff = seg->shm_off + seg->shm_size;
4181 
4182 		/*
4183 		 * Policy set already?
4184 		 */
4185 		if (policy == seg->shm_policy.mem_policy) {
4186 			/*
4187 			 * Nothing left to do if offset and length
4188 			 * fall within this segment
4189 			 */
4190 			if (eoff <= oldeoff) {
4191 				retval = 1;
4192 				break;
4193 			} else {
4194 				len = eoff - oldeoff;
4195 				off = oldeoff;
4196 				continue;
4197 			}
4198 		}
4199 
4200 		/*
4201 		 * Specified offset and length match existing segment exactly
4202 		 */
4203 		if (off == seg->shm_off && len == seg->shm_size) {
4204 			/*
4205 			 * Set policy and update current length
4206 			 */
4207 			seg->shm_policy.mem_policy = policy;
4208 			seg->shm_policy.mem_reserved = 0;
4209 			len = 0;
4210 
4211 			/*
4212 			 * Try concatenating new segment with previous and next
4213 			 * segments, since they might have the same policy now.
4214 			 * Grab previous and next segments first because they
4215 			 * will change on concatenation.
4216 			 */
4217 			prev =  AVL_PREV(tree, seg);
4218 			next = AVL_NEXT(tree, seg);
4219 			(void) lgrp_shm_policy_concat(tree, seg, next);
4220 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4221 		} else {
4222 			/*
4223 			 * Specified offset and length only apply to part of
4224 			 * existing segment
4225 			 */
4226 
4227 			/*
4228 			 * New segment starts in middle of old one, so split
4229 			 * new one off near beginning of old one
4230 			 */
4231 			newseg = NULL;
4232 			if (off > seg->shm_off) {
4233 				newseg = lgrp_shm_policy_split(tree, seg, off);
4234 
4235 				/*
4236 				 * New segment ends where old one did, so try
4237 				 * to concatenate with next segment
4238 				 */
4239 				if (eoff == oldeoff) {
4240 					newseg->shm_policy.mem_policy = policy;
4241 					newseg->shm_policy.mem_reserved = 0;
4242 					(void) lgrp_shm_policy_concat(tree,
4243 					    newseg, AVL_NEXT(tree, newseg));
4244 					break;
4245 				}
4246 			}
4247 
4248 			/*
4249 			 * New segment ends before old one, so split off end of
4250 			 * old one
4251 			 */
4252 			if (eoff < oldeoff) {
4253 				if (newseg) {
4254 					(void) lgrp_shm_policy_split(tree,
4255 					    newseg, eoff);
4256 					newseg->shm_policy.mem_policy = policy;
4257 					newseg->shm_policy.mem_reserved = 0;
4258 				} else {
4259 					(void) lgrp_shm_policy_split(tree, seg,
4260 					    eoff);
4261 					seg->shm_policy.mem_policy = policy;
4262 					seg->shm_policy.mem_reserved = 0;
4263 				}
4264 
4265 				if (off == seg->shm_off)
4266 					(void) lgrp_shm_policy_concat(tree,
4267 					    AVL_PREV(tree, seg), seg);
4268 				break;
4269 			}
4270 
4271 			/*
4272 			 * Calculate remaining length and next offset
4273 			 */
4274 			len = eoff - oldeoff;
4275 			off = oldeoff;
4276 		}
4277 	}
4278 
4279 	rw_exit(&shm_locality->loc_lock);
4280 	return (retval);
4281 }
4282 
4283 /*
4284  * Return the best memnode from which to allocate memory given
4285  * an lgroup.
4286  *
4287  * "c" is for cookie, which is good enough for me.
4288  * It references a cookie struct that should be zero'ed to initialize.
4289  * The cookie should live on the caller's stack.
4290  *
4291  * The routine returns -1 when:
4292  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4293  *	- traverse is 1, and all the memnodes in the system have been
4294  *	  returned.
4295  */
4296 int
4297 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4298 {
4299 	lgrp_t		*lp = c->lmc_lgrp;
4300 	mnodeset_t	nodes = c->lmc_nodes;
4301 	int		cnt = c->lmc_cnt;
4302 	int		offset, mnode;
4303 
4304 	extern int	max_mem_nodes;
4305 
4306 	/*
4307 	 * If the set is empty, and the caller is willing, traverse
4308 	 * up the hierarchy until we find a non-empty set.
4309 	 */
4310 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4311 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4312 		    ((lp = lp->lgrp_parent) == NULL))
4313 			return (-1);
4314 
4315 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4316 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4317 	}
4318 
4319 	/*
4320 	 * Select a memnode by picking one at a "random" offset.
4321 	 * Because of DR, memnodes can come and go at any time.
4322 	 * This code must be able to cope with the possibility
4323 	 * that the nodes count "cnt" is inconsistent with respect
4324 	 * to the number of elements actually in "nodes", and
4325 	 * therefore that the offset chosen could be greater than
4326 	 * the number of elements in the set (some memnodes may
4327 	 * have dissapeared just before cnt was read).
4328 	 * If this happens, the search simply wraps back to the
4329 	 * beginning of the set.
4330 	 */
4331 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4332 	offset = c->lmc_rand % cnt;
4333 	do {
4334 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4335 			if (nodes & ((mnodeset_t)1 << mnode))
4336 				if (!offset--)
4337 					break;
4338 	} while (mnode >= max_mem_nodes);
4339 
4340 	/* Found a node. Store state before returning. */
4341 	c->lmc_lgrp = lp;
4342 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4343 	c->lmc_cnt = cnt - 1;
4344 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4345 	c->lmc_ntried++;
4346 
4347 	return (mnode);
4348 }
4349