xref: /titanic_50/usr/src/uts/common/os/lgrp.c (revision a93a1f58a8763fa69172980b98e3d24720c1136e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/pg.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to kstat framework. It is protected from parallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168 
169 static lgrp_t	lroot;
170 
171 /*
172  * Size, in bytes, beyond which random memory allocation policy is applied
173  * to non-shared memory.  Default is the maximum size, so random memory
174  * allocation won't be used for non-shared memory by default.
175  */
176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
177 
178 /* the maximum effect that a single thread can have on its lgroup's load */
179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 	((lgrp_loadavg_max_effect) / (ncpu))
181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 
183 
184 /*
185  * Size, in bytes, beyond which random memory allocation policy is applied to
186  * shared memory.  Default is 8MB (2 ISM pages).
187  */
188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
189 
190 /*
191  * Whether to do processor set aware memory allocation by default
192  */
193 int	lgrp_mem_pset_aware = 0;
194 
195 /*
196  * Set the default memory allocation policy for root lgroup
197  */
198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 
200 /*
201  * Set the default memory allocation policy.  For most platforms,
202  * next touch is sufficient, but some platforms may wish to override
203  * this.
204  */
205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
208 /*
209  * lgroup CPU event handlers
210  */
211 static void	lgrp_cpu_init(struct cpu *);
212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214 
215 /*
216  * lgroup memory event handlers
217  */
218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 
222 /*
223  * lgroup CPU partition event handlers
224  */
225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 static void	lgrp_part_del_cpu(struct cpu *);
227 
228 static void	lgrp_root_init(void);
229 
230 /*
231  * lpl topology
232  */
233 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234 static void	lpl_clear(lpl_t *);
235 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237 static void	lpl_rset_add(lpl_t *, lpl_t *);
238 static void	lpl_rset_del(lpl_t *, lpl_t *);
239 static int	lpl_rset_contains(lpl_t *, lpl_t *);
240 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241 static void	lpl_child_update(lpl_t *, struct cpupart *);
242 static int	lpl_pick(lpl_t *, lpl_t *);
243 static void	lpl_verify_wrapper(struct cpupart *);
244 
245 /*
246  * defines for lpl topology verifier return codes
247  */
248 
249 #define	LPL_TOPO_CORRECT			0
250 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
251 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
252 #define	LPL_TOPO_LGRP_MISMATCH			-3
253 #define	LPL_TOPO_MISSING_PARENT			-4
254 #define	LPL_TOPO_PARENT_MISMATCH		-5
255 #define	LPL_TOPO_BAD_CPUCNT			-6
256 #define	LPL_TOPO_RSET_MISMATCH			-7
257 #define	LPL_TOPO_LPL_ORPHANED			-8
258 #define	LPL_TOPO_LPL_BAD_NCPU			-9
259 #define	LPL_TOPO_RSET_MSSNG_LF			-10
260 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
261 #define	LPL_TOPO_BOGUS_HINT			-12
262 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
263 #define	LPL_TOPO_LGRP_NOT_LEAF			-14
264 #define	LPL_TOPO_BAD_RSETCNT			-15
265 
266 /*
267  * Return whether lgroup optimizations should be enabled on this system
268  */
269 int
270 lgrp_optimizations(void)
271 {
272 	/*
273 	 * System must have more than 2 lgroups to enable lgroup optimizations
274 	 *
275 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
276 	 * with one child lgroup containing all the resources. A 2 lgroup
277 	 * system with a root lgroup directly containing CPUs or memory might
278 	 * need lgroup optimizations with its child lgroup, but there
279 	 * isn't such a machine for now....
280 	 */
281 	if (nlgrps > 2)
282 		return (1);
283 
284 	return (0);
285 }
286 
287 /*
288  * Build full lgroup topology
289  */
290 static void
291 lgrp_root_init(void)
292 {
293 	lgrp_handle_t	hand;
294 	int		i;
295 	lgrp_id_t	id;
296 
297 	/*
298 	 * Create the "root" lgroup
299 	 */
300 	ASSERT(nlgrps == 0);
301 	id = nlgrps++;
302 
303 	lgrp_root = &lroot;
304 
305 	lgrp_root->lgrp_cpu = NULL;
306 	lgrp_root->lgrp_mnodes = 0;
307 	lgrp_root->lgrp_nmnodes = 0;
308 	hand = lgrp_plat_root_hand();
309 	lgrp_root->lgrp_plathand = hand;
310 
311 	lgrp_root->lgrp_id = id;
312 	lgrp_root->lgrp_cpucnt = 0;
313 	lgrp_root->lgrp_childcnt = 0;
314 	klgrpset_clear(lgrp_root->lgrp_children);
315 	klgrpset_clear(lgrp_root->lgrp_leaves);
316 	lgrp_root->lgrp_parent = NULL;
317 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318 
319 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320 		klgrpset_clear(lgrp_root->lgrp_set[i]);
321 
322 	lgrp_root->lgrp_kstat = NULL;
323 
324 	lgrp_table[id] = lgrp_root;
325 
326 	/*
327 	 * Setup initial lpl list for CPU0 and initial t0 home.
328 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329 	 * all topology operations until cp_default is initialized at which
330 	 * point t0.t_lpl will be updated.
331 	 */
332 	lpl_bootstrap = lpl_bootstrap_list;
333 	t0.t_lpl = lpl_bootstrap;
334 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336 	cp_default.cp_lgrploads = lpl_bootstrap;
337 }
338 
/*
 * Initialize the lgroup framework and allow the platform to do the same.
 *
 * Runs the platform's lgroup initialization and then records in nlgrpsmax
 * the number of lgroups the platform reports via lgrp_plat_max_lgrps().
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform, which must
	 * not exceed the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.).
	 * The bound is only enforced by the ASSERT below.
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}
358 
/*
 * Create the root and cpu0's lgroup, and set t0's home.
 *
 * The root lgroup (and t0's initial lpl) is set up by lgrp_root_init();
 * the current CPU is then pushed through the CPU_ADD and CPU_ONLINE
 * configuration events.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup: CPU_ADD hands it the bootstrap lpl, and
	 * CPU_ONLINE runs lgrp_cpu_init() and lgrp_part_add_cpu() on it
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}
376 
377 /*
378  * Lgroup initialization is split in two parts. The first part
379  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
380  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
381  * when all CPUs are brought online and all distance information is available.
382  *
383  * When lgrp_main_init() is complete it sets lgrp_initialized. The
384  * lgrp_main_mp_init() sets lgrp_topo_initialized.
385  */
386 
387 /*
388  * true when lgrp initialization has been completed.
389  */
390 int	lgrp_initialized = 0;
391 
392 /*
393  * True when lgrp topology is constructed.
394  */
395 int	lgrp_topo_initialized = 0;
396 
/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 *
 * Validates the tunable default memory policy, moves cpu0 to the root
 * lgroup (destroying all other lgroups) if the platform now reports a
 * different handle for it, creates the kstats for cpu0's lgroup, calls the
 * platform's main init hook, and finally sets lgrp_initialized.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	extern void	pg_cpu0_reinit();

	/*
	 * Enforce a valid lgrp_mem_default_policy: anything out of range, or
	 * LGRP_MEM_POLICY_NEXT_SEG, is replaced by LGRP_MEM_POLICY_NEXT
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides an UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		/*
		 * Take cpu0 out of its current lpl and lgroup, then add it
		 * back so that it lands in the lgroup matching its new handle
		 */
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Notify the PG subsystem that the CPU's lgrp
		 * association has changed
		 */
		pg_cpu0_reinit();

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 * (lgrp_kstat_create() asserts that cpu_lock is held)
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}
480 
/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 *
 * Runs one topology update over every allocated lgroup table slot and then
 * sets lgrp_topo_initialized.
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary).  The set of changed
	 * lgroups and the return value are not used here.
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}
497 
498 /*
499  * Change latency of lgroup with specified lgroup platform handle (if one is
500  * given) or change all lgroups with old latency to new latency
501  */
502 void
503 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
504     u_longlong_t newtime)
505 {
506 	lgrp_t		*lgrp;
507 	int		i;
508 
509 	for (i = 0; i <= lgrp_alloc_max; i++) {
510 		lgrp = lgrp_table[i];
511 
512 		if (!LGRP_EXISTS(lgrp))
513 			continue;
514 
515 		if ((hand == LGRP_NULL_HANDLE &&
516 		    lgrp->lgrp_latency == oldtime) ||
517 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
518 			lgrp->lgrp_latency = (int)newtime;
519 	}
520 }
521 
/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 *
 * event selects the action; resource and where are interpreted per event:
 *   CPU_* and CPUPART_* events:  resource is a cpu_t pointer; where is the
 *				  lgroup ID for CPUPART_ADD and unused
 *				  otherwise.
 *   MEM_ADD / MEM_DEL:		  resource is cast to int and where is passed
 *				  through as an lgrp_handle_t to the memory
 *				  handlers.
 *   MEM_RENAME:		  resource is cast to int; where points to an
 *				  lgrp_config_mem_rename_t.
 *   FLATTEN:			  resource is the number of levels.
 *   LAT_CHANGE_ALL:		  resource is the old latency, where the new.
 *   LAT_CHANGE:		  resource is a platform handle, where the
 *				  new latency.
 *
 * Unknown events are silently ignored.  The CPU online/offline and CPU
 * partition events panic if lpl_topo_verify() reports a bad topology.
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
		 */
		cp->cpu_next_lpl = cp;
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lgrp = cp;
		cp->cpu_prev_lgrp = cp;
		cp->cpu_lpl = lpl_bootstrap;

		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_DEL:
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_ONLINE:
		/*
		 * Put the CPU in its lgroup first, then in the lpl of its
		 * partition for that lgroup, and check the result
		 */
		cp = (cpu_t *)resource;
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_OFFLINE:
		/*
		 * The lgroup ID is read from the CPU's lpl before the CPU is
		 * removed from it and handed to lgrp_cpu_fini() afterwards
		 */
		cp = (cpu_t *)resource;
		id = cp->cpu_lpl->lpl_lgrpid;
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, id);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	/*
	 * The two CPU partition events below do not bump lgrp_gen.
	 */
	case LGRP_CONFIG_CPUPART_ADD:
		cp = (cpu_t *)resource;
		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	case LGRP_CONFIG_CPUPART_DEL:
		cp = (cpu_t *)resource;
		lgrp_part_del_cpu((cpu_t *)resource);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	/*
	 * The following events are initiated by the memnode
	 * subsystem.
	 */
	case LGRP_CONFIG_MEM_ADD:
		lgrp_mem_init((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_DEL:
		lgrp_mem_fini((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_RENAME: {
		lgrp_config_mem_rename_t *ren_arg =
		    (lgrp_config_mem_rename_t *)where;

		lgrp_mem_rename((int)resource,
		    ren_arg->lmem_rename_from,
		    ren_arg->lmem_rename_to);
		atomic_add_32(&lgrp_gen, 1);

		break;
	}
	/*
	 * Only advance the lgroup hierarchy generation number
	 */
	case LGRP_CONFIG_GEN_UPDATE:
		atomic_add_32(&lgrp_gen, 1);

		break;
	/*
	 * With where == 0 only the lgrp_topo_levels setting is recorded;
	 * otherwise the existing topology is flattened right away.
	 *
	 * NOTE(review): "changed" is passed without being cleared first and
	 * is not examined afterwards, and lgrp_alloc_max is passed here where
	 * other callers in this file pass lgrp_alloc_max + 1 as the count --
	 * confirm both against lgrp_topo_flatten().
	 */
	case LGRP_CONFIG_FLATTEN:
		if (where == 0)
			lgrp_topo_levels = (int)resource;
		else
			(void) lgrp_topo_flatten(resource,
			    lgrp_table, lgrp_alloc_max, &changed);

		break;
	/*
	 * Update any lgroups with old latency to new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE_ALL:
		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
		    (u_longlong_t)where);

		break;
	/*
	 * Update lgroup with specified lgroup platform handle to have
	 * new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE:
		lgrp_latency_change((lgrp_handle_t)resource, 0,
		    (u_longlong_t)where);

		break;
	case LGRP_CONFIG_NOP:

		break;
	default:
		break;
	}

}
669 
/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 *
 * Finds (or creates) the lgroup matching the CPU's platform handle, makes
 * sure that lgroup is recorded as a CPU resource, points cp->cpu_lpl at the
 * lpl for that lgroup in the CPU's partition, and links the CPU onto the
 * lgroup's circular CPU list.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
	klgrpset_t	changed;
	int		count;		/* assigned below but never read */
	lgrp_handle_t	hand;
	int		first_cpu;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	struct cpu	*cptr;

	/*
	 * This is the first time through if the resource set
	 * for the root lgroup is empty. After cpu0 has been
	 * initially added to an lgroup, the root's CPU resource
	 * set can never be empty, since the system's last CPU
	 * cannot be offlined.
	 */
	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
		/*
		 * First time through.
		 */
		first_cpu = 1;
	} else {
		/*
		 * If cpu0 needs to move lgroups, we may come
		 * through here again, at which time cpu_lock won't
		 * be held, and lgrp_initialized will be false.
		 */
		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
		ASSERT(cp->cpu_part != NULL);
		first_cpu = 0;
	}

	/*
	 * Map the CPU to its platform handle and from there to an lgroup,
	 * if one exists for that handle already
	 */
	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	my_lgrp = lgrp_hand_to_lgrp(hand);

	if (my_lgrp == NULL) {
		/*
		 * Create new lgrp and add it to lgroup topology
		 */
		my_lgrp = lgrp_create();
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

		count = 0;
		klgrpset_clear(changed);
		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then.  So, set latency for it and fill in rest of lgroup
		 * topology  now that we know how far it is from other leaf
		 * lgroups.
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_clear(changed);
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);

		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    my_lgrp->lgrp_id)) {
		int	i;

		/*
		 * Update existing lgroup and lgroups containing it with CPU
		 * resource
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
	}

	/*
	 * Point the CPU at the lpl for its lgroup within its partition
	 */
	lgrpid = my_lgrp->lgrp_id;
	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

	/*
	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
	 * not since none of lgroup IDs in the lpl's have been set yet.
	 */
	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
		cp->cpu_lpl->lpl_lgrpid = lgrpid;

	/*
	 * link the CPU into the lgrp's CPU list: either as the sole member
	 * of a new circular list, or just before the current list head
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		my_lgrp->lgrp_cpu = cp;
		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
	} else {
		cptr = my_lgrp->lgrp_cpu;
		cp->cpu_next_lgrp = cptr;
		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
		cptr->cpu_prev_lgrp = cp;
	}
	my_lgrp->lgrp_cpucnt++;
}
805 
806 lgrp_t *
807 lgrp_create(void)
808 {
809 	lgrp_t		*my_lgrp;
810 	lgrp_id_t	lgrpid;
811 	int		i;
812 
813 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
814 
815 	/*
816 	 * Find an open slot in the lgroup table and recycle unused lgroup
817 	 * left there if any
818 	 */
819 	my_lgrp = NULL;
820 	if (lgrp_alloc_hint == -1)
821 		/*
822 		 * Allocate from end when hint not set yet because no lgroups
823 		 * have been deleted yet
824 		 */
825 		lgrpid = nlgrps++;
826 	else {
827 		/*
828 		 * Start looking for next open slot from hint and leave hint
829 		 * at slot allocated
830 		 */
831 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
832 			my_lgrp = lgrp_table[i];
833 			if (!LGRP_EXISTS(my_lgrp)) {
834 				lgrpid = i;
835 				nlgrps++;
836 				break;
837 			}
838 		}
839 		lgrp_alloc_hint = lgrpid;
840 	}
841 
842 	/*
843 	 * Keep track of max lgroup ID allocated so far to cut down on searches
844 	 */
845 	if (lgrpid > lgrp_alloc_max)
846 		lgrp_alloc_max = lgrpid;
847 
848 	/*
849 	 * Need to allocate new lgroup if next open slot didn't have one
850 	 * for recycling
851 	 */
852 	if (my_lgrp == NULL)
853 		my_lgrp = lgrp_plat_alloc(lgrpid);
854 
855 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
856 		panic("Too many lgrps for platform (%d)", nlgrps);
857 
858 	my_lgrp->lgrp_id = lgrpid;
859 	my_lgrp->lgrp_latency = 0;
860 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
861 	my_lgrp->lgrp_parent = NULL;
862 	my_lgrp->lgrp_childcnt = 0;
863 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
864 	my_lgrp->lgrp_nmnodes = 0;
865 	klgrpset_clear(my_lgrp->lgrp_children);
866 	klgrpset_clear(my_lgrp->lgrp_leaves);
867 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
868 		klgrpset_clear(my_lgrp->lgrp_set[i]);
869 
870 	my_lgrp->lgrp_cpu = NULL;
871 	my_lgrp->lgrp_cpucnt = 0;
872 
873 	if (my_lgrp->lgrp_kstat != NULL)
874 		lgrp_kstat_reset(lgrpid);
875 
876 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
877 
878 	return (my_lgrp);
879 }
880 
881 void
882 lgrp_destroy(lgrp_t *lgrp)
883 {
884 	int		i;
885 
886 	/*
887 	 * Unless this lgroup is being destroyed on behalf of
888 	 * the boot CPU, cpu_lock must be held
889 	 */
890 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
891 
892 	if (nlgrps == 1)
893 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
894 
895 	if (!LGRP_EXISTS(lgrp))
896 		return;
897 
898 	/*
899 	 * Set hint to lgroup being deleted and try to keep lower numbered
900 	 * hints to facilitate finding empty slots
901 	 */
902 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
903 		lgrp_alloc_hint = lgrp->lgrp_id;
904 
905 	/*
906 	 * Mark this lgroup to be recycled by setting its lgroup ID to
907 	 * LGRP_NONE and clear relevant fields
908 	 */
909 	lgrp->lgrp_id = LGRP_NONE;
910 	lgrp->lgrp_latency = 0;
911 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
912 	lgrp->lgrp_parent = NULL;
913 	lgrp->lgrp_childcnt = 0;
914 
915 	klgrpset_clear(lgrp->lgrp_children);
916 	klgrpset_clear(lgrp->lgrp_leaves);
917 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
918 		klgrpset_clear(lgrp->lgrp_set[i]);
919 
920 	lgrp->lgrp_mnodes = (mnodeset_t)0;
921 	lgrp->lgrp_nmnodes = 0;
922 
923 	lgrp->lgrp_cpu = NULL;
924 	lgrp->lgrp_cpucnt = 0;
925 
926 	nlgrps--;
927 }
928 
929 /*
930  * Initialize kstat data. Called from lgrp intialization code.
931  */
932 static void
933 lgrp_kstat_init(void)
934 {
935 	lgrp_stat_t	stat;
936 
937 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
938 
939 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
940 		kstat_named_init(&lgrp_kstat_data[stat],
941 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
942 }
943 
944 /*
945  * initialize an lgrp's kstats if needed
946  * called with cpu_lock held but not with cpus paused.
947  * we don't tear these down now because we don't know about
948  * memory leaving the lgrp yet...
949  */
950 
951 void
952 lgrp_kstat_create(cpu_t *cp)
953 {
954 	kstat_t		*lgrp_kstat;
955 	lgrp_id_t	lgrpid;
956 	lgrp_t		*my_lgrp;
957 
958 	ASSERT(MUTEX_HELD(&cpu_lock));
959 
960 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
961 	my_lgrp = lgrp_table[lgrpid];
962 
963 	if (my_lgrp->lgrp_kstat != NULL)
964 		return; /* already initialized */
965 
966 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
967 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
968 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
969 
970 	if (lgrp_kstat != NULL) {
971 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
972 		lgrp_kstat->ks_private = my_lgrp;
973 		lgrp_kstat->ks_data = &lgrp_kstat_data;
974 		lgrp_kstat->ks_update = lgrp_kstat_extract;
975 		my_lgrp->lgrp_kstat = lgrp_kstat;
976 		kstat_install(lgrp_kstat);
977 	}
978 }
979 
/*
 * this will do something when we manage to remove now unused lgrps
 *
 * For now it is a placeholder that only checks that the caller holds
 * cpu_lock; the cp argument is unused.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}
990 
991 /*
992  * Called when a CPU is off-lined.
993  */
994 static void
995 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
996 {
997 	lgrp_t *my_lgrp;
998 	struct cpu *prev;
999 	struct cpu *next;
1000 
1001 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1002 
1003 	prev = cp->cpu_prev_lgrp;
1004 	next = cp->cpu_next_lgrp;
1005 
1006 	prev->cpu_next_lgrp = next;
1007 	next->cpu_prev_lgrp = prev;
1008 
1009 	/*
1010 	 * just because I'm paranoid doesn't mean...
1011 	 */
1012 
1013 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1014 
1015 	my_lgrp = lgrp_table[lgrpid];
1016 	my_lgrp->lgrp_cpucnt--;
1017 
1018 	/*
1019 	 * Removing last CPU in lgroup, so update lgroup topology
1020 	 */
1021 	if (my_lgrp->lgrp_cpucnt == 0) {
1022 		klgrpset_t	changed;
1023 		int		count;
1024 		int		i;
1025 
1026 		my_lgrp->lgrp_cpu = NULL;
1027 
1028 		/*
1029 		 * Remove this lgroup from its lgroup CPU resources and remove
1030 		 * lgroup from lgroup topology if it doesn't have any more
1031 		 * resources in it now
1032 		 */
1033 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1034 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1035 			count = 0;
1036 			klgrpset_clear(changed);
1037 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1038 			    lgrp_alloc_max + 1, &changed);
1039 			return;
1040 		}
1041 
1042 		/*
1043 		 * This lgroup isn't empty, so just remove it from CPU
1044 		 * resources of any lgroups that contain it as such
1045 		 */
1046 		for (i = 0; i <= lgrp_alloc_max; i++) {
1047 			lgrp_t		*lgrp;
1048 
1049 			lgrp = lgrp_table[i];
1050 			if (!LGRP_EXISTS(lgrp) ||
1051 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1052 			    lgrpid))
1053 				continue;
1054 
1055 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1056 		}
1057 		return;
1058 	}
1059 
1060 	if (my_lgrp->lgrp_cpu == cp)
1061 		my_lgrp->lgrp_cpu = next;
1062 
1063 }
1064 
1065 /*
1066  * Update memory nodes in target lgroups and return ones that get changed
1067  */
1068 int
1069 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1070 {
1071 	int	count;
1072 	int	i;
1073 	int	j;
1074 	lgrp_t	*lgrp;
1075 	lgrp_t	*lgrp_rsrc;
1076 
1077 	count = 0;
1078 	if (changed)
1079 		klgrpset_clear(*changed);
1080 
1081 	if (klgrpset_isempty(target))
1082 		return (0);
1083 
1084 	/*
1085 	 * Find each lgroup in target lgroups
1086 	 */
1087 	for (i = 0; i <= lgrp_alloc_max; i++) {
1088 		/*
1089 		 * Skip any lgroups that don't exist or aren't in target group
1090 		 */
1091 		lgrp = lgrp_table[i];
1092 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1093 			continue;
1094 		}
1095 
1096 		/*
1097 		 * Initialize memnodes for intermediate lgroups to 0
1098 		 * and update them from scratch since they may have completely
1099 		 * changed
1100 		 */
1101 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1102 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1103 			lgrp->lgrp_nmnodes = 0;
1104 		}
1105 
1106 		/*
1107 		 * Update memory nodes of of target lgroup with memory nodes
1108 		 * from each lgroup in its lgroup memory resource set
1109 		 */
1110 		for (j = 0; j <= lgrp_alloc_max; j++) {
1111 			int	k;
1112 
1113 			/*
1114 			 * Skip any lgroups that don't exist or aren't in
1115 			 * memory resources of target lgroup
1116 			 */
1117 			lgrp_rsrc = lgrp_table[j];
1118 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1119 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1120 			    j))
1121 				continue;
1122 
1123 			/*
1124 			 * Update target lgroup's memnodes to include memnodes
1125 			 * of this lgroup
1126 			 */
1127 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1128 				mnodeset_t	mnode_mask;
1129 
1130 				mnode_mask = (mnodeset_t)1 << k;
1131 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1132 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1133 					lgrp->lgrp_mnodes |= mnode_mask;
1134 					lgrp->lgrp_nmnodes++;
1135 				}
1136 			}
1137 			count++;
1138 			if (changed)
1139 				klgrpset_add(*changed, lgrp->lgrp_id);
1140 		}
1141 	}
1142 
1143 	return (count);
1144 }
1145 
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the
 * lgrp_root->lgrp_mnodes, but it updates the rest of the lgroup topology as if
 * the mnode was actually removed. The lgrp_mem_init() function recognizes that
 * the mnode being inserted represents such a special case and updates the
 * topology appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.  B_TRUE marks both steps as part of a copy-rename.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
1191 
1192 /*
1193  * Called to indicate that the lgrp with platform handle "hand" now
1194  * contains the memory identified by "mnode".
1195  *
1196  * LOCKING for this routine is a bit tricky. Usually it is called without
1197  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1198  * callers. During DR of the board containing the caged memory it may be called
1199  * with cpu_lock already held and CPUs paused.
1200  *
1201  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1202  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1203  * dealing with the special case of DR copy-rename described in
1204  * lgrp_mem_rename().
1205  */
1206 void
1207 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1208 {
1209 	klgrpset_t	changed;
1210 	int		count;
1211 	int		i;
1212 	lgrp_t		*my_lgrp;
1213 	lgrp_id_t	lgrpid;
1214 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1215 	boolean_t	drop_lock = B_FALSE;
1216 	boolean_t	need_synch = B_FALSE;
1217 
1218 	/*
1219 	 * Grab CPU lock (if we haven't already)
1220 	 */
1221 	if (!MUTEX_HELD(&cpu_lock)) {
1222 		mutex_enter(&cpu_lock);
1223 		drop_lock = B_TRUE;
1224 	}
1225 
1226 	/*
1227 	 * This routine may be called from a context where we already
1228 	 * hold cpu_lock, and have already paused cpus.
1229 	 */
1230 	if (!cpus_paused())
1231 		need_synch = B_TRUE;
1232 
1233 	/*
1234 	 * Check if this mnode is already configured and return immediately if
1235 	 * it is.
1236 	 *
1237 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1238 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1239 	 * recognize this case and continue as usual, but skip the update to
1240 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1241 	 * in topology, temporarily introduced by lgrp_mem_fini().
1242 	 */
1243 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1244 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1245 		if (drop_lock)
1246 			mutex_exit(&cpu_lock);
1247 		return;
1248 	}
1249 
1250 	/*
1251 	 * Update lgroup topology with new memory resources, keeping track of
1252 	 * which lgroups change
1253 	 */
1254 	count = 0;
1255 	klgrpset_clear(changed);
1256 	my_lgrp = lgrp_hand_to_lgrp(hand);
1257 	if (my_lgrp == NULL) {
1258 		/* new lgrp */
1259 		my_lgrp = lgrp_create();
1260 		lgrpid = my_lgrp->lgrp_id;
1261 		my_lgrp->lgrp_plathand = hand;
1262 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1263 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1264 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1265 
1266 		if (need_synch)
1267 			pause_cpus(NULL);
1268 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1269 		    &changed);
1270 		if (need_synch)
1271 			start_cpus();
1272 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1273 	    > 0) {
1274 		/*
1275 		 * Leaf lgroup was created, but latency wasn't available
1276 		 * then.  So, set latency for it and fill in rest of lgroup
1277 		 * topology  now that we know how far it is from other leaf
1278 		 * lgroups.
1279 		 */
1280 		klgrpset_clear(changed);
1281 		lgrpid = my_lgrp->lgrp_id;
1282 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1283 		    lgrpid))
1284 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1285 		if (need_synch)
1286 			pause_cpus(NULL);
1287 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1288 		    &changed);
1289 		if (need_synch)
1290 			start_cpus();
1291 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1292 	    my_lgrp->lgrp_id)) {
1293 		/*
1294 		 * Add new lgroup memory resource to existing lgroup
1295 		 */
1296 		lgrpid = my_lgrp->lgrp_id;
1297 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1298 		klgrpset_add(changed, lgrpid);
1299 		count++;
1300 		for (i = 0; i <= lgrp_alloc_max; i++) {
1301 			lgrp_t		*lgrp;
1302 
1303 			lgrp = lgrp_table[i];
1304 			if (!LGRP_EXISTS(lgrp) ||
1305 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1306 				continue;
1307 
1308 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1309 			klgrpset_add(changed, lgrp->lgrp_id);
1310 			count++;
1311 		}
1312 	}
1313 
1314 	/*
1315 	 * Add memory node to lgroup and remove lgroup from ones that need
1316 	 * to be updated
1317 	 */
1318 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1319 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1320 		my_lgrp->lgrp_nmnodes++;
1321 	}
1322 	klgrpset_del(changed, lgrpid);
1323 
1324 	/*
1325 	 * Update memory node information for all lgroups that changed and
1326 	 * contain new memory node as a resource
1327 	 */
1328 	if (count)
1329 		(void) lgrp_mnode_update(changed, NULL);
1330 
1331 	if (drop_lock)
1332 		mutex_exit(&cpu_lock);
1333 }
1334 
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * have this memory node in their lgrp_mnodes set
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	/* The root is expected to always retain at least one memory node */
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename &&
		(my_lgrp == lgrp_root) &&
		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources.  CPUs are paused
		 * around the topology change unless the caller has already
		 * paused them.
		 */
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		/* count is only consulted by this sanity check */
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1474 
1475 /*
1476  * Return lgroup with given platform handle
1477  */
1478 lgrp_t *
1479 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1480 {
1481 	int	i;
1482 	lgrp_t	*lgrp;
1483 
1484 	if (hand == LGRP_NULL_HANDLE)
1485 		return (NULL);
1486 
1487 	for (i = 0; i <= lgrp_alloc_max; i++) {
1488 		lgrp = lgrp_table[i];
1489 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1490 			return (lgrp);
1491 	}
1492 	return (NULL);
1493 }
1494 
1495 /*
1496  * Return the home lgroup of the current thread.
1497  * We must do this with kernel preemption disabled, since we don't want our
1498  * thread to be re-homed while we're poking around with its lpl, and the lpl
1499  * should never be NULL.
1500  *
1501  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1502  * is enabled because of DR.  Callers can use disable kernel preemption
1503  * around this call to guarantee that the lgroup will be valid beyond this
1504  * routine, since kernel preemption can be recursive.
1505  */
1506 lgrp_t *
1507 lgrp_home_lgrp(void)
1508 {
1509 	lgrp_t	*lgrp;
1510 	lpl_t	*lpl;
1511 
1512 	kpreempt_disable();
1513 
1514 	lpl = curthread->t_lpl;
1515 	ASSERT(lpl != NULL);
1516 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1517 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1518 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1519 
1520 	kpreempt_enable();
1521 
1522 	return (lgrp);
1523 }
1524 
1525 /*
1526  * Return ID of home lgroup for given thread
1527  * (See comments for lgrp_home_lgrp() for special care and handling
1528  * instructions)
1529  */
1530 lgrp_id_t
1531 lgrp_home_id(kthread_t *t)
1532 {
1533 	lgrp_id_t	lgrp;
1534 	lpl_t		*lpl;
1535 
1536 	ASSERT(t != NULL);
1537 	/*
1538 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1539 	 * cannot since the HAT layer can call into this routine to
1540 	 * determine the locality for its data structures in the context
1541 	 * of a page fault.
1542 	 */
1543 
1544 	kpreempt_disable();
1545 
1546 	lpl = t->t_lpl;
1547 	ASSERT(lpl != NULL);
1548 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1549 	lgrp = lpl->lpl_lgrpid;
1550 
1551 	kpreempt_enable();
1552 
1553 	return (lgrp);
1554 }
1555 
1556 /*
1557  * Return lgroup containing the physical memory for the given page frame number
1558  */
1559 lgrp_t *
1560 lgrp_pfn_to_lgrp(pfn_t pfn)
1561 {
1562 	lgrp_handle_t	hand;
1563 	int		i;
1564 	lgrp_t		*lgrp;
1565 
1566 	hand = lgrp_plat_pfn_to_hand(pfn);
1567 	if (hand != LGRP_NULL_HANDLE)
1568 		for (i = 0; i <= lgrp_alloc_max; i++) {
1569 			lgrp = lgrp_table[i];
1570 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1571 				return (lgrp);
1572 		}
1573 	return (NULL);
1574 }
1575 
1576 /*
1577  * Return lgroup containing the physical memory for the given page frame number
1578  */
1579 lgrp_t *
1580 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1581 {
1582 	lgrp_handle_t	hand;
1583 	int		i;
1584 	lgrp_t		*lgrp;
1585 	pfn_t		pfn;
1586 
1587 	pfn = btop(physaddr);
1588 	hand = lgrp_plat_pfn_to_hand(pfn);
1589 	if (hand != LGRP_NULL_HANDLE)
1590 		for (i = 0; i <= lgrp_alloc_max; i++) {
1591 			lgrp = lgrp_table[i];
1592 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1593 				return (lgrp);
1594 		}
1595 	return (NULL);
1596 }
1597 
1598 /*
1599  * Return the leaf lgroup containing the given CPU
1600  *
1601  * The caller needs to take precautions necessary to prevent
1602  * "cpu", and it's lpl from going away across a call to this function.
1603  * hint: kpreempt_disable()/kpreempt_enable()
1604  */
1605 static lgrp_t *
1606 lgrp_cpu_to_lgrp(cpu_t *cpu)
1607 {
1608 	return (cpu->cpu_lpl->lpl_lgrp);
1609 }
1610 
1611 /*
1612  * Return the sum of the partition loads in an lgrp divided by
1613  * the number of CPUs in the lgrp.  This is our best approximation
1614  * of an 'lgroup load average' for a useful per-lgroup kstat.
1615  */
1616 static uint64_t
1617 lgrp_sum_loadavgs(lgrp_t *lgrp)
1618 {
1619 	cpu_t *cpu;
1620 	int ncpu;
1621 	uint64_t loads = 0;
1622 
1623 	mutex_enter(&cpu_lock);
1624 
1625 	cpu = lgrp->lgrp_cpu;
1626 	ncpu = lgrp->lgrp_cpucnt;
1627 
1628 	if (cpu == NULL || ncpu == 0) {
1629 		mutex_exit(&cpu_lock);
1630 		return (0ull);
1631 	}
1632 
1633 	do {
1634 		loads += cpu->cpu_lpl->lpl_loadavg;
1635 		cpu = cpu->cpu_next_lgrp;
1636 	} while (cpu != lgrp->lgrp_cpu);
1637 
1638 	mutex_exit(&cpu_lock);
1639 
1640 	return (loads / ncpu);
1641 }
1642 
1643 void
1644 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1645 {
1646 	struct lgrp_stats *pstats;
1647 
1648 	/*
1649 	 * Verify that the caller isn't trying to add to
1650 	 * a statistic for an lgroup that has gone away
1651 	 */
1652 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1653 		return;
1654 
1655 	pstats = &lgrp_stats[lgrpid];
1656 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1657 }
1658 
1659 int64_t
1660 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1661 {
1662 	uint64_t val;
1663 	struct lgrp_stats *pstats;
1664 
1665 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1666 		return ((int64_t)0);
1667 
1668 	pstats = &lgrp_stats[lgrpid];
1669 	LGRP_STAT_READ(pstats, stat, val);
1670 	return (val);
1671 }
1672 
1673 /*
1674  * Reset all kstats for lgrp specified by its lgrpid.
1675  */
1676 static void
1677 lgrp_kstat_reset(lgrp_id_t lgrpid)
1678 {
1679 	lgrp_stat_t stat;
1680 
1681 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1682 		return;
1683 
1684 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1685 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1686 	}
1687 }
1688 
1689 /*
1690  * Collect all per-lgrp statistics for the lgrp associated with this
1691  * kstat, and store them in the ks_data array.
1692  *
1693  * The superuser can reset all the running counter statistics for an
1694  * lgrp by writing to any of the lgrp's stats.
1695  */
1696 static int
1697 lgrp_kstat_extract(kstat_t *ksp, int rw)
1698 {
1699 	lgrp_stat_t		stat;
1700 	struct kstat_named	*ksd;
1701 	lgrp_t			*lgrp;
1702 	lgrp_id_t		lgrpid;
1703 
1704 	lgrp = (lgrp_t *)ksp->ks_private;
1705 
1706 	ksd = (struct kstat_named *)ksp->ks_data;
1707 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1708 
1709 	lgrpid = lgrp->lgrp_id;
1710 
1711 	if (lgrpid == LGRP_NONE) {
1712 		/*
1713 		 * Return all zeroes as stats for freed lgrp.
1714 		 */
1715 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1716 			ksd[stat].value.i64 = 0;
1717 		}
1718 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1719 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1720 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1721 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1722 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1723 	} else if (rw != KSTAT_WRITE) {
1724 		/*
1725 		 * Handle counter stats
1726 		 */
1727 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1728 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1729 		}
1730 
1731 		/*
1732 		 * Handle kernel data snapshot stats
1733 		 */
1734 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1735 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1736 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1737 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1738 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1739 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1740 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1741 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1742 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1743 		    lgrp_loadavg_max_effect;
1744 	} else {
1745 		lgrp_kstat_reset(lgrpid);
1746 	}
1747 
1748 	return (0);
1749 }
1750 
1751 int
1752 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1753 {
1754 	cpu_t	*cp;
1755 
1756 	mutex_enter(&cpu_lock);
1757 
1758 	if ((cp = cpu_get(id)) == NULL) {
1759 		mutex_exit(&cpu_lock);
1760 		return (EINVAL);
1761 	}
1762 
1763 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1764 		mutex_exit(&cpu_lock);
1765 		return (EINVAL);
1766 	}
1767 
1768 	ASSERT(cp->cpu_lpl != NULL);
1769 
1770 	*lp = cp->cpu_lpl->lpl_lgrpid;
1771 
1772 	mutex_exit(&cpu_lock);
1773 
1774 	return (0);
1775 }
1776 
1777 int
1778 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1779 {
1780 	cpu_t *cp;
1781 
1782 	mutex_enter(&cpu_lock);
1783 
1784 	if ((cp = cpu_get(id)) == NULL) {
1785 		mutex_exit(&cpu_lock);
1786 		return (EINVAL);
1787 	}
1788 
1789 	ASSERT(cp->cpu_lpl != NULL);
1790 
1791 	*lp = cp->cpu_lpl->lpl_loadavg;
1792 
1793 	mutex_exit(&cpu_lock);
1794 
1795 	return (0);
1796 }
1797 
1798 /*
1799  * Add a resource named by lpl_leaf to rset of lpl_target
1800  *
1801  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1802  * resource. It is adjusted here, as this is presently the only place that we
1803  * can be certain a resource addition has succeeded.
1804  *
1805  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1806  * list in order until it reaches a NULL.  (This list is required to be NULL
1807  * terminated, too).  This is done so that we can mark start pos + 1, so that
1808  * each lpl is traversed sequentially, but in a different order.  We hope this
1809  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1810  */
1811 
1812 void
1813 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1814 {
1815 	int		i;
1816 	int		entry_slot = 0;
1817 
1818 	/* return if leaf is already present */
1819 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1820 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1821 			return;
1822 		}
1823 
1824 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1825 		    lpl_leaf->lpl_lgrpid) {
1826 			break;
1827 		}
1828 	}
1829 
1830 	/* insert leaf, update counts */
1831 	entry_slot = i;
1832 	i = lpl_target->lpl_nrset++;
1833 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1834 		panic("More leaf lgrps in system than are supported!\n");
1835 	}
1836 
1837 	/*
1838 	 * Start at the end of the rset array and work backwards towards the
1839 	 * slot into which the new lpl will be inserted. This effectively
1840 	 * preserves the current ordering by scooting everybody over one entry,
1841 	 * and placing the new entry into the space created.
1842 	 */
1843 
1844 	while (i-- > entry_slot) {
1845 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1846 	}
1847 
1848 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1849 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1850 }
1851 
1852 /*
1853  * Update each of lpl_parent's children with a proper hint and
1854  * a reference to their parent.
1855  * The lgrp topology is used as the reference since it is fully
1856  * consistent and correct at this point.
1857  *
1858  * Each child's hint will reference an element in lpl_parent's
1859  * rset that designates where the child should start searching
1860  * for CPU resources. The hint selected is the highest order leaf present
1861  * in the child's lineage.
1862  *
1863  * This should be called after any potential change in lpl_parent's
1864  * rset.
1865  */
1866 static void
1867 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1868 {
1869 	klgrpset_t	children, leaves;
1870 	lpl_t		*lpl;
1871 	int		hint;
1872 	int		i, j;
1873 
1874 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1875 	if (klgrpset_isempty(children))
1876 		return; /* nothing to do */
1877 
1878 	for (i = 0; i <= lgrp_alloc_max; i++) {
1879 		if (klgrpset_ismember(children, i)) {
1880 
1881 			/*
1882 			 * Given the set of leaves in this child's lineage,
1883 			 * find the highest order leaf present in the parent's
1884 			 * rset. Select this as the hint for the child.
1885 			 */
1886 			leaves = lgrp_table[i]->lgrp_leaves;
1887 			hint = 0;
1888 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1889 				lpl = lpl_parent->lpl_rset[j];
1890 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1891 					hint = j;
1892 			}
1893 			cp->cp_lgrploads[i].lpl_hint = hint;
1894 
1895 			/*
1896 			 * (Re)set the parent. It may be incorrect if
1897 			 * lpl_parent is new in the topology.
1898 			 */
1899 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1900 		}
1901 	}
1902 }
1903 
/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 * If the leaf is not in the rset, nothing is changed.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 *
 * NOTE(review): lpl_rset_add() adds lpl_leaf->lpl_ncpu to the target's ncpu,
 * while this routine subtracts exactly one.  Presumably a leaf is only
 * deleted when it accounts for a single CPU -- confirm against callers.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/*
	 * return if leaf not found
	 *
	 * When the search above runs off the end, i equals lpl_nrset, so this
	 * reads the entry just past the last member (presumably the NULL
	 * terminator the rset is required to have).
	 */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/*
	 * prune leaf, compress array
	 *
	 * The entry just past the last member is set to NULL and nrset is
	 * decremented; the loop then moves every entry after the deleted one
	 * down by one, up to and including that NULL.
	 */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}
1934 
1935 /*
1936  * Check to see if the resource set of the target lpl contains the
1937  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1938  */
1939 
1940 int
1941 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1942 {
1943 	int i;
1944 
1945 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1946 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1947 			return (1);
1948 	}
1949 
1950 	return (0);
1951 }
1952 
1953 /*
1954  * Called when we change cpu lpl membership.  This increments or decrements the
1955  * per-cpu counter in every lpl in which our leaf appears.
1956  */
1957 void
1958 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1959 {
1960 	cpupart_t	*cpupart;
1961 	lgrp_t		*lgrp_leaf;
1962 	lgrp_t		*lgrp_cur;
1963 	lpl_t		*lpl_leaf;
1964 	lpl_t		*lpl_cur;
1965 	int		i;
1966 
1967 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1968 
1969 	cpupart = cp->cpu_part;
1970 	lpl_leaf = cp->cpu_lpl;
1971 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1972 
1973 	for (i = 0; i <= lgrp_alloc_max; i++) {
1974 		lgrp_cur = lgrp_table[i];
1975 
1976 		/*
1977 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1978 		 * for the cpu in question, or if the current lgrp and leaf
1979 		 * don't share the same resources.
1980 		 */
1981 
1982 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1983 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1984 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1985 			continue;
1986 
1987 
1988 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1989 
1990 		if (lpl_cur->lpl_nrset > 0) {
1991 			if (act == LPL_INCREMENT) {
1992 				lpl_cur->lpl_ncpu++;
1993 			} else if (act == LPL_DECREMENT) {
1994 				lpl_cur->lpl_ncpu--;
1995 			}
1996 		}
1997 	}
1998 }
1999 
2000 /*
2001  * Initialize lpl with given resources and specified lgrp
2002  */
2003 
2004 void
2005 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2006 {
2007 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2008 	lpl->lpl_loadavg = 0;
2009 	if (lpl == lpl_leaf)
2010 		lpl->lpl_ncpu = 1;
2011 	else
2012 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2013 	lpl->lpl_nrset = 1;
2014 	lpl->lpl_rset[0] = lpl_leaf;
2015 	lpl->lpl_lgrp = lgrp;
2016 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2017 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2018 }
2019 
2020 /*
2021  * Clear an unused lpl
2022  */
2023 
2024 void
2025 lpl_clear(lpl_t *lpl)
2026 {
2027 	lgrp_id_t	lid;
2028 
2029 	/* save lid for debugging purposes */
2030 	lid = lpl->lpl_lgrpid;
2031 	bzero(lpl, sizeof (lpl_t));
2032 	lpl->lpl_lgrpid = lid;
2033 }
2034 
2035 /*
2036  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may not
2038  * make full use of all of the lgroup topology, but this checks to make sure
2039  * that for the parts that it does use, it has correctly understood the
2040  * relationships that exist. This function returns
2041  * 0 if the topology is correct, and a non-zero error code, for non-debug
2042  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2043  * debugging on a DEBUG kernel.
2044  */
2045 int
2046 lpl_topo_verify(cpupart_t *cpupart)
2047 {
2048 	lgrp_t		*lgrp;
2049 	lpl_t		*lpl;
2050 	klgrpset_t	rset;
2051 	klgrpset_t	cset;
2052 	cpu_t		*cpu;
2053 	cpu_t		*cp_start;
2054 	int		i;
2055 	int		j;
2056 	int		sum;
2057 
2058 	/* topology can't be incorrect if it doesn't exist */
2059 	if (!lgrp_topo_initialized || !lgrp_initialized)
2060 		return (LPL_TOPO_CORRECT);
2061 
2062 	ASSERT(cpupart != NULL);
2063 
2064 	for (i = 0; i <= lgrp_alloc_max; i++) {
2065 		lgrp = lgrp_table[i];
2066 		lpl = NULL;
2067 		/* make sure lpls are allocated */
2068 		ASSERT(cpupart->cp_lgrploads);
2069 		if (!cpupart->cp_lgrploads)
2070 			return (LPL_TOPO_PART_HAS_NO_LPL);
2071 
2072 		lpl = &cpupart->cp_lgrploads[i];
2073 		/* make sure our index is good */
2074 		ASSERT(i < cpupart->cp_nlgrploads);
2075 
2076 		/* if lgroup doesn't exist, make sure lpl is empty */
2077 		if (!LGRP_EXISTS(lgrp)) {
2078 			ASSERT(lpl->lpl_ncpu == 0);
2079 			if (lpl->lpl_ncpu > 0) {
2080 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2081 			} else {
2082 				continue;
2083 			}
2084 		}
2085 
2086 		/* verify that lgroup and lpl are identically numbered */
2087 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2088 
2089 		/* if lgroup isn't in our partition, make sure lpl is empty */
2090 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2091 		    cpupart->cp_lgrpset)) {
2092 			ASSERT(lpl->lpl_ncpu == 0);
2093 			if (lpl->lpl_ncpu > 0) {
2094 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2095 			}
2096 			/*
2097 			 * lpl is empty, and lgroup isn't in partition.  verify
2098 			 * that lpl doesn't show up in anyone else's rsets (in
2099 			 * this partition, anyway)
2100 			 */
2101 
2102 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2103 				lpl_t *i_lpl; /* lpl we're iterating over */
2104 
2105 				i_lpl = &cpupart->cp_lgrploads[j];
2106 
2107 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2108 				if (lpl_rset_contains(i_lpl, lpl)) {
2109 					return (LPL_TOPO_LPL_ORPHANED);
2110 				}
2111 			}
2112 			/* lgroup is empty, and everything is ok. continue */
2113 			continue;
2114 		}
2115 
2116 
2117 		/* lgroup is in this partition, now check it against lpl */
2118 
2119 		/* do both have matching lgrps? */
2120 		ASSERT(lgrp == lpl->lpl_lgrp);
2121 		if (lgrp != lpl->lpl_lgrp) {
2122 			return (LPL_TOPO_LGRP_MISMATCH);
2123 		}
2124 
2125 		/* do the parent lgroups exist and do they match? */
2126 		if (lgrp->lgrp_parent) {
2127 			ASSERT(lpl->lpl_parent);
2128 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2129 				    lpl->lpl_parent->lpl_lgrpid);
2130 
2131 			if (!lpl->lpl_parent) {
2132 				return (LPL_TOPO_MISSING_PARENT);
2133 			} else if (lgrp->lgrp_parent->lgrp_id !=
2134 			    lpl->lpl_parent->lpl_lgrpid) {
2135 				return (LPL_TOPO_PARENT_MISMATCH);
2136 			}
2137 		}
2138 
2139 		/* only leaf lgroups keep a cpucnt, only check leaves */
2140 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2141 
2142 			/* verify that lgrp is also a leaf */
2143 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2144 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2145 			    lpl->lpl_lgrpid)));
2146 
2147 			if ((lgrp->lgrp_childcnt > 0) ||
2148 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2149 			    lpl->lpl_lgrpid))) {
2150 				return (LPL_TOPO_LGRP_NOT_LEAF);
2151 			}
2152 
2153 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2154 			    (lpl->lpl_ncpu > 0));
2155 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2156 				(lpl->lpl_ncpu <= 0)) {
2157 				return (LPL_TOPO_BAD_CPUCNT);
2158 			}
2159 
2160 			/*
2161 			 * Check that lpl_ncpu also matches the number of
2162 			 * cpus in the lpl's linked list.  This only exists in
2163 			 * leaves, but they should always match.
2164 			 */
2165 			j = 0;
2166 			cpu = cp_start = lpl->lpl_cpus;
2167 			while (cpu != NULL) {
2168 				j++;
2169 
2170 				/* check to make sure cpu's lpl is leaf lpl */
2171 				ASSERT(cpu->cpu_lpl == lpl);
2172 				if (cpu->cpu_lpl != lpl) {
2173 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2174 				}
2175 
2176 				/* check next cpu */
2177 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2178 					continue;
2179 				} else {
2180 					cpu = NULL;
2181 				}
2182 			}
2183 
2184 			ASSERT(j == lpl->lpl_ncpu);
2185 			if (j != lpl->lpl_ncpu) {
2186 				return (LPL_TOPO_LPL_BAD_NCPU);
2187 			}
2188 
2189 			/*
2190 			 * Also, check that leaf lpl is contained in all
2191 			 * intermediate lpls that name the leaf as a descendant
2192 			 */
2193 
2194 			for (j = 0; j <= lgrp_alloc_max; j++) {
2195 				klgrpset_t intersect;
2196 				lgrp_t *lgrp_cand;
2197 				lpl_t *lpl_cand;
2198 
2199 				lgrp_cand = lgrp_table[j];
2200 				intersect = klgrpset_intersects(
2201 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2202 				    cpupart->cp_lgrpset);
2203 
2204 				if (!LGRP_EXISTS(lgrp_cand) ||
2205 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2206 				    cpupart->cp_lgrpset) ||
2207 				    (intersect == 0))
2208 					continue;
2209 
2210 				lpl_cand =
2211 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2212 
2213 				if (klgrpset_ismember(intersect,
2214 				    lgrp->lgrp_id)) {
2215 					ASSERT(lpl_rset_contains(lpl_cand,
2216 					    lpl));
2217 
2218 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2219 						return (LPL_TOPO_RSET_MSSNG_LF);
2220 					}
2221 				}
2222 			}
2223 
2224 		} else { /* non-leaf specific checks */
2225 
2226 			/*
2227 			 * Non-leaf lpls should have lpl_cpus == NULL
2228 			 * verify that this is so
2229 			 */
2230 			ASSERT(lpl->lpl_cpus == NULL);
2231 			if (lpl->lpl_cpus != NULL) {
2232 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2233 			}
2234 
2235 			/*
2236 			 * verify that the sum of the cpus in the leaf resources
2237 			 * is equal to the total ncpu in the intermediate
2238 			 */
2239 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2240 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2241 			}
2242 
2243 			ASSERT(sum == lpl->lpl_ncpu);
2244 			if (sum != lpl->lpl_ncpu) {
2245 				return (LPL_TOPO_LPL_BAD_NCPU);
2246 			}
2247 		}
2248 
2249 		/*
2250 		 * check on lpl_hint. Don't check root, since it has no parent.
2251 		 */
2252 		if (lpl->lpl_parent != NULL) {
2253 			int hint;
2254 			lpl_t *hint_lpl;
2255 
2256 			/* make sure hint is within limits of nrset */
2257 			hint = lpl->lpl_hint;
2258 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2259 			if (lpl->lpl_parent->lpl_nrset < hint) {
2260 				return (LPL_TOPO_BOGUS_HINT);
2261 			}
2262 
2263 			/* make sure hint points to valid lpl */
2264 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2265 			ASSERT(hint_lpl->lpl_ncpu > 0);
2266 			if (hint_lpl->lpl_ncpu <= 0) {
2267 				return (LPL_TOPO_BOGUS_HINT);
2268 			}
2269 		}
2270 
2271 		/*
2272 		 * Check the rset of the lpl in question.  Make sure that each
2273 		 * rset contains a subset of the resources in
2274 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2275 		 * sure that each rset doesn't include resources that are
2276 		 * outside of that set.  (Which would be resources somehow not
2277 		 * accounted for).
2278 		 */
2279 
2280 		klgrpset_clear(rset);
2281 		for (j = 0; j < lpl->lpl_nrset; j++) {
2282 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2283 		}
2284 		klgrpset_copy(cset, rset);
2285 		/* make sure lpl rset matches lgrp rset */
2286 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2287 		/* make sure rset is contained with in partition, too */
2288 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2289 
2290 		ASSERT(klgrpset_isempty(rset) &&
2291 			    klgrpset_isempty(cset));
2292 		if (!klgrpset_isempty(rset) ||
2293 		    !klgrpset_isempty(cset)) {
2294 			return (LPL_TOPO_RSET_MISMATCH);
2295 		}
2296 
2297 		/*
2298 		 * check to make sure lpl_nrset matches the number of rsets
2299 		 * contained in the lpl
2300 		 */
2301 
2302 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2303 		    j++);
2304 
2305 		ASSERT(j == lpl->lpl_nrset);
2306 		if (j != lpl->lpl_nrset) {
2307 			return (LPL_TOPO_BAD_RSETCNT);
2308 		}
2309 
2310 	}
2311 	return (LPL_TOPO_CORRECT);
2312 }
2313 
2314 /*
2315  * Flatten lpl topology to given number of levels.  This is presently only
2316  * implemented for a flatten to 2 levels, which will prune out the intermediates
2317  * and home the leaf lpls to the root lpl.
2318  */
2319 int
2320 lpl_topo_flatten(int levels)
2321 {
2322 	int		i;
2323 	uint_t		sum;
2324 	lgrp_t		*lgrp_cur;
2325 	lpl_t		*lpl_cur;
2326 	lpl_t		*lpl_root;
2327 	cpupart_t	*cp;
2328 
2329 	if (levels != 2)
2330 		return (0);
2331 
2332 	/* called w/ cpus paused - grab no locks! */
2333 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2334 	    !lgrp_initialized);
2335 
2336 	cp = cp_list_head;
2337 	do {
2338 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2339 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2340 
2341 		for (i = 0; i <= lgrp_alloc_max; i++) {
2342 			lgrp_cur = lgrp_table[i];
2343 			lpl_cur = &cp->cp_lgrploads[i];
2344 
2345 			if ((lgrp_cur == lgrp_root) ||
2346 			    (!LGRP_EXISTS(lgrp_cur) &&
2347 			    (lpl_cur->lpl_ncpu == 0)))
2348 				continue;
2349 
2350 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2351 				/*
2352 				 * this should be a deleted intermediate, so
2353 				 * clear it
2354 				 */
2355 				lpl_clear(lpl_cur);
2356 			} else if ((lpl_cur->lpl_nrset == 1) &&
2357 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2358 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2359 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2360 				/*
2361 				 * this is a leaf whose parent was deleted, or
2362 				 * whose parent had their lgrp deleted.  (And
2363 				 * whose parent will soon be deleted).  Point
2364 				 * this guy back to the root lpl.
2365 				 */
2366 				lpl_cur->lpl_parent = lpl_root;
2367 				lpl_rset_add(lpl_root, lpl_cur);
2368 			}
2369 
2370 		}
2371 
2372 		/*
2373 		 * Now that we're done, make sure the count on the root lpl is
2374 		 * correct, and update the hints of the children for the sake of
2375 		 * thoroughness
2376 		 */
2377 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2378 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2379 		}
2380 		lpl_root->lpl_ncpu = sum;
2381 		lpl_child_update(lpl_root, cp);
2382 
2383 		cp = cp->cp_next;
2384 	} while (cp != cp_list_head);
2385 
2386 	return (levels);
2387 }
2388 
2389 /*
2390  * Insert a lpl into the resource hierarchy and create any additional lpls that
2391  * are necessary to represent the varying states of locality for the cpu
2392  * resoruces newly added to the partition.
2393  *
2394  * This routine is clever enough that it can correctly add resources from the
2395  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2396  * those for which the lpl is a leaf as opposed to simply a named equally local
2397  * resource).  The one special case that needs additional processing is when a
2398  * new intermediate lpl is introduced.  Since the main loop only traverses
2399  * looking to add the leaf resource where it does not yet exist, additional work
2400  * is necessary to add other leaf resources that may need to exist in the newly
2401  * created intermediate.  This is performed by the second inner loop, and is
2402  * only done when the check for more than one overlapping resource succeeds.
2403  */
2404 
2405 void
2406 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2407 {
2408 	int		i;
2409 	int		j;
2410 	int		hint;
2411 	int		rset_num_intersect;
2412 	lgrp_t		*lgrp_cur;
2413 	lpl_t		*lpl_cur;
2414 	lpl_t		*lpl_parent;
2415 	lgrp_id_t	parent_id;
2416 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2417 
2418 	for (i = 0; i <= lgrp_alloc_max; i++) {
2419 		lgrp_cur = lgrp_table[i];
2420 
2421 		/*
2422 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2423 		 * contained within the current lgrp, or if the current lgrp has
2424 		 * no leaves in this partition
2425 		 */
2426 
2427 		if (!LGRP_EXISTS(lgrp_cur) ||
2428 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2429 		    lpl_leaf->lpl_lgrpid) ||
2430 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2431 		    cpupart->cp_lgrpset))
2432 			continue;
2433 
2434 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2435 		if (lgrp_cur->lgrp_parent != NULL) {
2436 			/* if lgrp has a parent, assign it properly */
2437 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2438 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2439 		} else {
2440 			/* if not, make sure parent ptr gets set to null */
2441 			lpl_parent = NULL;
2442 		}
2443 
2444 		if (lpl_cur == lpl_leaf) {
2445 			/*
2446 			 * Almost all leaf state was initialized elsewhere.  The
2447 			 * only thing left to do is to set the parent.
2448 			 */
2449 			lpl_cur->lpl_parent = lpl_parent;
2450 			continue;
2451 		}
2452 
2453 		/*
2454 		 * Initialize intermediate lpl
2455 		 * Save this lpl's hint though. Since we're changing this
2456 		 * lpl's resources, we need to update the hint in this lpl's
2457 		 * children, but the hint in this lpl is unaffected and
2458 		 * should be preserved.
2459 		 */
2460 		hint = lpl_cur->lpl_hint;
2461 
2462 		lpl_clear(lpl_cur);
2463 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2464 
2465 		lpl_cur->lpl_hint = hint;
2466 		lpl_cur->lpl_parent = lpl_parent;
2467 
2468 		/* does new lpl need to be populated with other resources? */
2469 		rset_intersect =
2470 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2471 			cpupart->cp_lgrpset);
2472 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2473 
2474 		if (rset_num_intersect > 1) {
2475 			/*
2476 			 * If so, figure out what lpls have resources that
2477 			 * intersect this one, and add them.
2478 			 */
2479 			for (j = 0; j <= lgrp_alloc_max; j++) {
2480 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2481 				lpl_t	*lpl_cand;	/* candidate lpl */
2482 
2483 				lgrp_cand = lgrp_table[j];
2484 				if (!LGRP_EXISTS(lgrp_cand) ||
2485 				    !klgrpset_ismember(rset_intersect,
2486 					lgrp_cand->lgrp_id))
2487 					continue;
2488 				lpl_cand =
2489 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2490 				lpl_rset_add(lpl_cur, lpl_cand);
2491 			}
2492 		}
2493 		/*
2494 		 * This lpl's rset has changed. Update the hint in it's
2495 		 * children.
2496 		 */
2497 		lpl_child_update(lpl_cur, cpupart);
2498 	}
2499 }
2500 
2501 /*
2502  * remove a lpl from the hierarchy of resources, clearing its state when
2503  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2504  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2505  * delete them as well.
2506  */
2507 
2508 void
2509 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2510 {
2511 	int		i;
2512 	lgrp_t		*lgrp_cur;
2513 	lpl_t		*lpl_cur;
2514 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2515 
2516 	for (i = 0; i <= lgrp_alloc_max; i++) {
2517 		lgrp_cur = lgrp_table[i];
2518 
2519 		/*
2520 		 * Don't attempt to remove from lgrps that aren't there, that
2521 		 * don't contain our leaf, or from the leaf itself. (We do that
2522 		 * later)
2523 		 */
2524 
2525 		if (!LGRP_EXISTS(lgrp_cur))
2526 			continue;
2527 
2528 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2529 
2530 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2531 		    lpl_leaf->lpl_lgrpid) ||
2532 		    (lpl_cur == lpl_leaf)) {
2533 			continue;
2534 		}
2535 
2536 		/*
2537 		 * This is a slightly sleazy simplification in that we have
2538 		 * already marked the cp_lgrpset as no longer containing the
2539 		 * leaf we've deleted.  Any lpls that pass the above checks
2540 		 * based upon lgrp membership but not necessarily cpu-part
2541 		 * membership also get cleared by the checks below.  Currently
2542 		 * this is harmless, as the lpls should be empty anyway.
2543 		 *
2544 		 * In particular, we want to preserve lpls that have additional
2545 		 * leaf resources, even though we don't yet have a processor
2546 		 * architecture that represents resources this way.
2547 		 */
2548 
2549 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2550 		    cpupart->cp_lgrpset);
2551 
2552 		lpl_rset_del(lpl_cur, lpl_leaf);
2553 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2554 			lpl_clear(lpl_cur);
2555 		} else {
2556 			/*
2557 			 * Update this lpl's children
2558 			 */
2559 			lpl_child_update(lpl_cur, cpupart);
2560 		}
2561 	}
2562 	lpl_clear(lpl_leaf);
2563 }
2564 
2565 /*
2566  * add a cpu to a partition in terms of lgrp load avg bookeeping
2567  *
2568  * The lpl (cpu partition load average information) is now arranged in a
2569  * hierarchical fashion whereby resources that are closest, ie. most local, to
2570  * the cpu in question are considered to be leaves in a tree of resources.
2571  * There are two general cases for cpu additon:
2572  *
2573  * 1. A lpl structure that contains resources already in the hierarchy tree.
2574  * In this case, all of the associated lpl relationships have been defined, and
2575  * all that is necessary is that we link the new cpu into the per-lpl list of
2576  * cpus, and increment the ncpu count of all places where this cpu resource will
2577  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2578  * pushing is accomplished by this routine.
2579  *
2580  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2581  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2582  * construct the hierarchy of state necessary to name it's more distant
2583  * resources, if they should exist.  The leaf structure is initialized by this
2584  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2585  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2586  * and builds all of the "ancestoral" state necessary to identify resources at
2587  * differing levels of locality.
2588  */
2589 void
2590 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2591 {
2592 	cpupart_t	*cpupart;
2593 	lgrp_t		*lgrp_leaf;
2594 	lpl_t		*lpl_leaf;
2595 
2596 	/* called sometimes w/ cpus paused - grab no locks */
2597 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2598 
2599 	cpupart = cp->cpu_part;
2600 	lgrp_leaf = lgrp_table[lgrpid];
2601 
2602 	/* don't add non-existent lgrp */
2603 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2604 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2605 	cp->cpu_lpl = lpl_leaf;
2606 
2607 	/* only leaf lpls contain cpus */
2608 
2609 	if (lpl_leaf->lpl_ncpu++ == 0) {
2610 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2611 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2612 		lpl_leaf_insert(lpl_leaf, cpupart);
2613 	} else {
2614 		/*
2615 		 * the lpl should already exist in the parent, so just update
2616 		 * the count of available CPUs
2617 		 */
2618 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2619 	}
2620 
2621 	/* link cpu into list of cpus in lpl */
2622 
2623 	if (lpl_leaf->lpl_cpus) {
2624 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2625 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2626 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2627 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2628 	} else {
2629 		/*
2630 		 * We increment ncpu immediately after we create a new leaf
2631 		 * lpl, so assert that ncpu == 1 for the case where we don't
2632 		 * have any cpu pointers yet.
2633 		 */
2634 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2635 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2636 	}
2637 
2638 }
2639 
2640 
2641 /*
2642  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2643  *
2644  * The lpl (cpu partition load average information) is now arranged in a
2645  * hierarchical fashion whereby resources that are closest, ie. most local, to
2646  * the cpu in question are considered to be leaves in a tree of resources.
2647  * There are two removal cases in question:
2648  *
2649  * 1. Removal of the resource in the leaf leaves other resources remaining in
2650  * that leaf.  (Another cpu still exists at this level of locality).  In this
2651  * case, the count of available cpus is decremented in all assocated lpls by
2652  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2653  * from the per-cpu lpl list.
2654  *
2655  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2656  * empty)  In this case, all of what has occurred for the first step must take
2657  * place; however, additionally we must remove the lpl structure itself, prune
2658  * out any stranded lpls that do not directly name a leaf resource, and mark the
2659  * cpu partition in question as no longer containing resources from the lgrp of
2660  * the lpl that has been delted.  Cpu-partition changes are handled by this
2661  * method, but the lpl_leaf_remove function deals with the details of pruning
2662  * out the empty lpl and any of its orphaned direct ancestors.
2663  */
2664 void
2665 lgrp_part_del_cpu(cpu_t *cp)
2666 {
2667 	lpl_t		*lpl;
2668 	lpl_t		*leaf_lpl;
2669 	lgrp_t		*lgrp_leaf;
2670 
2671 	/* called sometimes w/ cpus paused - grab no locks */
2672 
2673 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2674 
2675 	lpl = leaf_lpl = cp->cpu_lpl;
2676 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2677 
2678 	/* don't delete a leaf that isn't there */
2679 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2680 
2681 	/* no double-deletes */
2682 	ASSERT(lpl->lpl_ncpu);
2683 	if (--lpl->lpl_ncpu == 0) {
2684 		/*
2685 		 * This was the last cpu in this lgroup for this partition,
2686 		 * clear its bit in the partition's lgroup bitmask
2687 		 */
2688 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2689 
2690 		/* eliminate remaning lpl link pointers in cpu, lpl */
2691 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2692 
2693 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2694 	} else {
2695 
2696 		/* unlink cpu from lists of cpus in lpl */
2697 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2698 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2699 		if (lpl->lpl_cpus == cp) {
2700 			lpl->lpl_cpus = cp->cpu_next_lpl;
2701 		}
2702 
2703 		/*
2704 		 * Update the cpu count in the lpls associated with parent
2705 		 * lgroups.
2706 		 */
2707 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2708 
2709 	}
2710 	/* clear cpu's lpl ptr when we're all done */
2711 	cp->cpu_lpl = NULL;
2712 }
2713 
2714 /*
2715  * Recompute load average for the specified partition/lgrp fragment.
2716  *
2717  * We rely on the fact that this routine is called from the clock thread
2718  * at a point before the clock thread can block (i.e. before its first
2719  * lock request).  Since the clock thread can not be preempted (since it
2720  * runs at highest priority), we know that cpu partitions can not change
2721  * (since doing so would require either the repartition requester or the
2722  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2723  * without grabbing cpu_lock.
2724  */
2725 void
2726 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2727 {
2728 	uint_t		ncpu;
2729 	int64_t		old, new, f;
2730 
2731 	/*
2732 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2733 	 */
2734 	static short expval[] = {
2735 	    0, 3196, 1618, 1083,
2736 	    814, 652, 543, 466,
2737 	    408, 363, 326, 297,
2738 	    272, 251, 233, 218,
2739 	    204, 192, 181, 172,
2740 	    163, 155, 148, 142,
2741 	    136, 130, 125, 121,
2742 	    116, 112, 109, 105
2743 	};
2744 
2745 	/* ASSERT (called from clock level) */
2746 
2747 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2748 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2749 		return;
2750 	}
2751 
2752 	for (;;) {
2753 
2754 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2755 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2756 		else
2757 			f = expval[ncpu];
2758 
2759 		/*
2760 		 * Modify the load average atomically to avoid losing
2761 		 * anticipatory load updates (see lgrp_move_thread()).
2762 		 */
2763 		if (ageflag) {
2764 			/*
2765 			 * We're supposed to both update and age the load.
2766 			 * This happens 10 times/sec. per cpu.  We do a
2767 			 * little hoop-jumping to avoid integer overflow.
2768 			 */
2769 			int64_t		q, r;
2770 
2771 			do {
2772 				old = new = lpl->lpl_loadavg;
2773 				q = (old  >> 16) << 7;
2774 				r = (old  & 0xffff) << 7;
2775 				new += ((long long)(nrcpus - q) * f -
2776 				    ((r * f) >> 16)) >> 7;
2777 
2778 				/*
2779 				 * Check for overflow
2780 				 */
2781 				if (new > LGRP_LOADAVG_MAX)
2782 					new = LGRP_LOADAVG_MAX;
2783 				else if (new < 0)
2784 					new = 0;
2785 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2786 			    new) != old);
2787 		} else {
2788 			/*
2789 			 * We're supposed to update the load, but not age it.
2790 			 * This option is used to update the load (which either
2791 			 * has already been aged in this 1/10 sec. interval or
2792 			 * soon will be) to account for a remotely executing
2793 			 * thread.
2794 			 */
2795 			do {
2796 				old = new = lpl->lpl_loadavg;
2797 				new += f;
2798 				/*
2799 				 * Check for overflow
2800 				 * Underflow not possible here
2801 				 */
2802 				if (new < old)
2803 					new = LGRP_LOADAVG_MAX;
2804 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2805 			    new) != old);
2806 		}
2807 
2808 		/*
2809 		 * Do the same for this lpl's parent
2810 		 */
2811 		if ((lpl = lpl->lpl_parent) == NULL)
2812 			break;
2813 		ncpu = lpl->lpl_ncpu;
2814 	}
2815 }
2816 
2817 /*
2818  * Initialize lpl topology in the target based on topology currently present in
2819  * lpl_bootstrap.
2820  *
2821  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2822  * initialize cp_default list of lpls. Up to this point all topology operations
2823  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2824  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2825  * `target' points to the list of lpls in cp_default and `size' is the size of
2826  * this list.
2827  *
2828  * This function walks the lpl topology in lpl_bootstrap and does for things:
2829  *
2830  * 1) Copies all fields from lpl_bootstrap to the target.
2831  *
2832  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2833  *
2834  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2835  *    instead of lpl_bootstrap.
2836  *
2837  * 4) Updates pointers in the resource list of the target to point to the lpls
2838  *    in the target list instead of lpl_bootstrap.
2839  *
2840  * After lpl_topo_bootstrap() completes, target contains the same information
2841  * that would be present there if it were used during boot instead of
2842  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2843  * and it is bzeroed.
2844  */
2845 void
2846 lpl_topo_bootstrap(lpl_t *target, int size)
2847 {
2848 	lpl_t	*lpl = lpl_bootstrap;
2849 	lpl_t	*target_lpl = target;
2850 	int	howmany;
2851 	int	id;
2852 	int	i;
2853 
2854 	/*
2855 	 * The only target that should be passed here is cp_default lpl list.
2856 	 */
2857 	ASSERT(target == cp_default.cp_lgrploads);
2858 	ASSERT(size == cp_default.cp_nlgrploads);
2859 	ASSERT(!lgrp_topo_initialized);
2860 	ASSERT(ncpus == 1);
2861 
2862 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2863 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2864 		/*
2865 		 * Copy all fields from lpl.
2866 		 */
2867 
2868 		*target_lpl = *lpl;
2869 
2870 		/*
2871 		 * Substitute CPU0 lpl pointer with one relative to target.
2872 		 */
2873 		if (lpl->lpl_cpus == CPU) {
2874 			ASSERT(CPU->cpu_lpl == lpl);
2875 			CPU->cpu_lpl = target_lpl;
2876 		}
2877 
2878 		/*
2879 		 * Substitute parent information with parent relative to target.
2880 		 */
2881 		if (lpl->lpl_parent != NULL)
2882 			target_lpl->lpl_parent = (lpl_t *)
2883 			    (((uintptr_t)lpl->lpl_parent -
2884 				(uintptr_t)lpl_bootstrap) +
2885 				(uintptr_t)target);
2886 
2887 		/*
2888 		 * Walk over resource set substituting pointers relative to
2889 		 * lpl_bootstrap to pointers relative to target.
2890 		 */
2891 		ASSERT(lpl->lpl_nrset <= 1);
2892 
2893 		for (id = 0; id < lpl->lpl_nrset; id++) {
2894 			if (lpl->lpl_rset[id] != NULL) {
2895 				target_lpl->lpl_rset[id] =
2896 				    (lpl_t *)
2897 				    (((uintptr_t)lpl->lpl_rset[id] -
2898 					(uintptr_t)lpl_bootstrap) +
2899 					(uintptr_t)target);
2900 			}
2901 		}
2902 	}
2903 
2904 	/*
2905 	 * Topology information in lpl_bootstrap is no longer needed.
2906 	 */
2907 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2908 }
2909 
2910 /*
2911  * If the lowest load among the lgroups a process' threads are currently
2912  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2913  * expanding the process to a new lgroup.
2914  */
2915 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2916 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2917 
2918 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2919 	((lgrp_expand_proc_thresh) / (ncpu))
2920 
2921 /*
2922  * A process will be expanded to a new lgroup only if the difference between
2923  * the lowest load on the lgroups the process' thread's are currently spread
2924  * across and the lowest load on the other lgroups in the process' partition
2925  * is greater than lgrp_expand_proc_diff.
2926  */
2927 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2928 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2929 
2930 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2931 	((lgrp_expand_proc_diff) / (ncpu))
2932 
2933 /*
2934  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2935  * be present due to impreciseness of the load average decay algorithm.
2936  *
2937  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2938  * tolerance is scaled by the number of cpus in the lgroup just like
2939  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2940  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2941  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2942  */
2943 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2944 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2945 	((lgrp_loadavg_tolerance) / ncpu)
2946 
2947 /*
2948  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2949  * average is above this threshold
2950  */
2951 uint32_t	lgrp_load_thresh = UINT32_MAX;
2952 
2953 /*
2954  * lgrp_choose() will try to skip any lgroups with less memory
2955  * than this free when choosing a home lgroup
2956  */
2957 pgcnt_t	lgrp_mem_free_thresh = 0;
2958 
2959 /*
2960  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2961  * one based on one of the following policies:
2962  * - Random selection
2963  * - Pseudo round robin placement
2964  * - Longest time since a thread was last placed
2965  */
2966 #define	LGRP_CHOOSE_RANDOM	1
2967 #define	LGRP_CHOOSE_RR		2
2968 #define	LGRP_CHOOSE_TIME	3
2969 
2970 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2971 
2972 /*
2973  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2974  * be bound to a CPU or processor set.
2975  *
2976  * Arguments:
2977  *	t		The thread
2978  *	cpupart		The partition the thread belongs to.
2979  *
2980  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2981  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2982  *	 partitions changing out from under us and assumes that given thread is
2983  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2984  *	 disabled, so don't grab any locks because we should never block under
2985  *	 those conditions.
2986  */
2987 lpl_t *
2988 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2989 {
2990 	lgrp_load_t	bestload, bestrload;
2991 	int		lgrpid_offset, lgrp_count;
2992 	lgrp_id_t	lgrpid, lgrpid_start;
2993 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2994 	klgrpset_t	lgrpset;
2995 	proc_t		*p;
2996 
2997 	ASSERT(t != NULL);
2998 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2999 	    THREAD_LOCK_HELD(t));
3000 	ASSERT(cpupart != NULL);
3001 
3002 	p = t->t_procp;
3003 
3004 	/* A process should always be in an active partition */
3005 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3006 
3007 	bestlpl = bestrlpl = NULL;
3008 	bestload = bestrload = LGRP_LOADAVG_MAX;
3009 	lgrpset = cpupart->cp_lgrpset;
3010 
3011 	switch (lgrp_choose_policy) {
3012 	case LGRP_CHOOSE_RR:
3013 		lgrpid = cpupart->cp_lgrp_hint;
3014 		do {
3015 			if (++lgrpid > lgrp_alloc_max)
3016 				lgrpid = 0;
3017 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3018 
3019 		break;
3020 	default:
3021 	case LGRP_CHOOSE_TIME:
3022 	case LGRP_CHOOSE_RANDOM:
3023 		klgrpset_nlgrps(lgrpset, lgrp_count);
3024 		lgrpid_offset =
3025 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3026 		for (lgrpid = 0; ; lgrpid++) {
3027 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3028 				if (--lgrpid_offset == 0)
3029 					break;
3030 			}
3031 		}
3032 		break;
3033 	}
3034 
3035 	lgrpid_start = lgrpid;
3036 
3037 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3038 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3039 
3040 	/*
3041 	 * Use lgroup affinities (if any) to choose best lgroup
3042 	 *
3043 	 * NOTE: Assumes that thread is protected from going away and its
3044 	 *	 lgroup affinities won't change (ie. p_lock, or
3045 	 *	 thread_lock() being held and/or CPUs paused)
3046 	 */
3047 	if (t->t_lgrp_affinity) {
3048 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3049 		if (lpl != NULL)
3050 			return (lpl);
3051 	}
3052 
3053 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3054 
3055 	do {
3056 		pgcnt_t	npgs;
3057 
3058 		/*
3059 		 * Skip any lgroups outside of thread's pset
3060 		 */
3061 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3062 			if (++lgrpid > lgrp_alloc_max)
3063 				lgrpid = 0;	/* wrap the search */
3064 			continue;
3065 		}
3066 
3067 		/*
3068 		 * Skip any non-leaf lgroups
3069 		 */
3070 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3071 			continue;
3072 
3073 		/*
3074 		 * Skip any lgroups without enough free memory
3075 		 * (when threshold set to nonzero positive value)
3076 		 */
3077 		if (lgrp_mem_free_thresh > 0) {
3078 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3079 			if (npgs < lgrp_mem_free_thresh) {
3080 				if (++lgrpid > lgrp_alloc_max)
3081 					lgrpid = 0;	/* wrap the search */
3082 				continue;
3083 			}
3084 		}
3085 
3086 		lpl = &cpupart->cp_lgrploads[lgrpid];
3087 		if (klgrpset_isempty(p->p_lgrpset) ||
3088 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3089 			/*
3090 			 * Either this is a new process or the process already
3091 			 * has threads on this lgrp, so this is a preferred
3092 			 * lgroup for the thread.
3093 			 */
3094 			if (bestlpl == NULL ||
3095 			    lpl_pick(lpl, bestlpl)) {
3096 				bestload = lpl->lpl_loadavg;
3097 				bestlpl = lpl;
3098 			}
3099 		} else {
3100 			/*
3101 			 * The process doesn't have any threads on this lgrp,
3102 			 * but we're willing to consider this lgrp if the load
3103 			 * difference is big enough to justify splitting up
3104 			 * the process' threads.
3105 			 */
3106 			if (bestrlpl == NULL ||
3107 			    lpl_pick(lpl, bestrlpl)) {
3108 				bestrload = lpl->lpl_loadavg;
3109 				bestrlpl = lpl;
3110 			}
3111 		}
3112 		if (++lgrpid > lgrp_alloc_max)
3113 			lgrpid = 0;	/* wrap the search */
3114 	} while (lgrpid != lgrpid_start);
3115 
3116 	/*
3117 	 * Return root lgroup if threshold isn't set to maximum value and
3118 	 * lowest lgroup load average more than a certain threshold
3119 	 */
3120 	if (lgrp_load_thresh != UINT32_MAX &&
3121 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3122 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3123 
3124 	/*
3125 	 * If all the lgroups over which the thread's process is spread are
3126 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3127 	 * the thread on one of the other leaf lgroups in the thread's
3128 	 * partition.
3129 	 */
3130 	if ((bestlpl == NULL) ||
3131 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3132 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3133 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3134 	    bestload))) {
3135 		bestlpl = bestrlpl;
3136 	}
3137 
3138 	if (bestlpl == NULL) {
3139 		/*
3140 		 * No lgroup looked particularly good, but we still
3141 		 * have to pick something. Go with the randomly selected
3142 		 * legal lgroup we started with above.
3143 		 */
3144 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3145 	}
3146 
3147 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3148 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3149 
3150 	ASSERT(bestlpl->lpl_ncpu > 0);
3151 	return (bestlpl);
3152 }
3153 
3154 /*
3155  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3156  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3157  */
3158 static int
3159 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3160 {
3161 	lgrp_load_t	l1, l2;
3162 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3163 
3164 	l1 = lpl1->lpl_loadavg;
3165 	l2 = lpl2->lpl_loadavg;
3166 
3167 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3168 		/* lpl1 is significantly less loaded than lpl2 */
3169 		return (1);
3170 	}
3171 
3172 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3173 	    l1 + tolerance >= l2 && l1 < l2 &&
3174 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3175 		/*
3176 		 * lpl1's load is within the tolerance of lpl2. We're
3177 		 * willing to consider it be to better however if
3178 		 * it has been longer since we last homed a thread there
3179 		 */
3180 		return (1);
3181 	}
3182 
3183 	return (0);
3184 }
3185 
/*
 * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
 * a process that uses text replication changed home lgrp.  This info is used
 * by the segvn asynchronous thread to detect whether it needs to recheck
 * which lgrps should be used for text replication.
 */
static uint64_t lgrp_trthr_moves;

/*
 * Return the current value of the text replication migration counter.
 */
uint64_t
lgrp_get_trthr_migrations(void)
{
	uint64_t	nmoves = lgrp_trthr_moves;

	return (nmoves);
}
3199 
/*
 * Add "incr" to the lgrp_trthr_moves counter.  The addition is done with
 * atomic_add_64() and takes no lock.
 */
void
lgrp_update_trthr_migrations(uint64_t incr)
{
	atomic_add_64(&lgrp_trthr_moves, incr);
}
3205 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() adds this value to the thread's t_anttime and compares
 * the sum with gethrtime(); a thread leaving its lgroup sooner than that has
 * its anticipatory load taken back out of the old lgroup.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3214 
/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * Arguments:
 *	t			The thread being moved
 *	newlpl			The lpl the thread is moving to; NULL if the
 *				thread is exiting
 *	do_lgrpset_delete	When nonzero, the old lgroup is removed from
 *				the process' p_lgrpset if no other thread in
 *				the process is assigned to that lgroup
 *
 * Nothing is done if newlpl is already the thread's lpl.  If the thread's
 * lwp has exited (TP_LWPEXIT), only t_lpl is updated.
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only the thread's lpl pointer is updated; no process
	 * or load average bookkeeping is done.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		/* Only a move to another lgroup (not an exit) is a migration */
		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/*
			 * Walk the process' circular thread list, starting
			 * just past its head and ending once the head itself
			 * has been examined.
			 */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			/*
			 * Subtract the load from the old lpl and from each of
			 * its ancestors, scaling by each lpl's own CPU count.
			 * Each update is a compare-and-swap that is retried
			 * until it succeeds, since no lock may be taken here.
			 */
			lpl = oldlpl;
			for (;;) {
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (cas32(
					(lgrp_load_t *)&lpl->lpl_loadavg, old,
					    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;

		/*
		 * For the thread with t_tid 1, record its new home lgroup in
		 * the process and bump the text replication migration count
		 * when the process has a text replication lgroup set that
		 * differs from the new home.
		 */
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup
		 *
		 * Note that returning here also leaves p_lgrpset and
		 * t_anttime untouched for a move to the root.
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 *
		 * The load is added to the new lpl and to each of its
		 * ancestors, again using compare-and-swap retry loops.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}

		/* Timestamp used later to decide whether to undo this load */
		t->t_anttime = gethrtime();
	}
}
3412 
3413 /*
3414  * Return lgroup memory allocation policy given advice from madvise(3C)
3415  */
3416 lgrp_mem_policy_t
3417 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3418 {
3419 	switch (advice) {
3420 	case MADV_ACCESS_LWP:
3421 		return (LGRP_MEM_POLICY_NEXT);
3422 	case MADV_ACCESS_MANY:
3423 		return (LGRP_MEM_POLICY_RANDOM);
3424 	default:
3425 		return (lgrp_mem_policy_default(size, type));
3426 	}
3427 }
3428 
3429 /*
3430  * Figure out default policy
3431  */
3432 lgrp_mem_policy_t
3433 lgrp_mem_policy_default(size_t size, int type)
3434 {
3435 	cpupart_t		*cp;
3436 	lgrp_mem_policy_t	policy;
3437 	size_t			pset_mem_size;
3438 
3439 	/*
3440 	 * Randomly allocate memory across lgroups for shared memory
3441 	 * beyond a certain threshold
3442 	 */
3443 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3444 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3445 		/*
3446 		 * Get total memory size of current thread's pset
3447 		 */
3448 		kpreempt_disable();
3449 		cp = curthread->t_cpupart;
3450 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3451 		kpreempt_enable();
3452 
3453 		/*
3454 		 * Choose policy to randomly allocate memory across
3455 		 * lgroups in pset if it will fit and is not default
3456 		 * partition.  Otherwise, allocate memory randomly
3457 		 * across machine.
3458 		 */
3459 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3460 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3461 		else
3462 			policy = LGRP_MEM_POLICY_RANDOM;
3463 	} else
3464 		/*
3465 		 * Apply default policy for private memory and
3466 		 * shared memory under the respective random
3467 		 * threshold.
3468 		 */
3469 		policy = lgrp_mem_default_policy;
3470 
3471 	return (policy);
3472 }
3473 
3474 /*
3475  * Get memory allocation policy for this segment
3476  */
3477 lgrp_mem_policy_info_t *
3478 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3479 {
3480 	lgrp_mem_policy_info_t	*policy_info;
3481 	extern struct seg_ops	segspt_ops;
3482 	extern struct seg_ops	segspt_shmops;
3483 
3484 	/*
3485 	 * This is for binary compatibility to protect against third party
3486 	 * segment drivers which haven't recompiled to allow for
3487 	 * SEGOP_GETPOLICY()
3488 	 */
3489 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3490 	    seg->s_ops != &segspt_shmops)
3491 		return (NULL);
3492 
3493 	policy_info = NULL;
3494 	if (seg->s_ops->getpolicy != NULL)
3495 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3496 
3497 	return (policy_info);
3498 }
3499 
3500 /*
3501  * Set policy for allocating private memory given desired policy, policy info,
3502  * size in bytes of memory that policy is being applied.
3503  * Return 0 if policy wasn't set already and 1 if policy was set already
3504  */
3505 int
3506 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3507     lgrp_mem_policy_info_t *policy_info, size_t size)
3508 {
3509 
3510 	ASSERT(policy_info != NULL);
3511 
3512 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3513 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3514 
3515 	/*
3516 	 * Policy set already?
3517 	 */
3518 	if (policy == policy_info->mem_policy)
3519 		return (1);
3520 
3521 	/*
3522 	 * Set policy
3523 	 */
3524 	policy_info->mem_policy = policy;
3525 	policy_info->mem_lgrpid = LGRP_NONE;
3526 
3527 	return (0);
3528 }
3529 
3530 
3531 /*
3532  * Get shared memory allocation policy with given tree and offset
3533  */
3534 lgrp_mem_policy_info_t *
3535 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3536     u_offset_t vn_off)
3537 {
3538 	u_offset_t		off;
3539 	lgrp_mem_policy_info_t	*policy_info;
3540 	lgrp_shm_policy_seg_t	*policy_seg;
3541 	lgrp_shm_locality_t	*shm_locality;
3542 	avl_tree_t		*tree;
3543 	avl_index_t		where;
3544 
3545 	/*
3546 	 * Get policy segment tree from anon_map or vnode and use specified
3547 	 * anon index or vnode offset as offset
3548 	 *
3549 	 * Assume that no lock needs to be held on anon_map or vnode, since
3550 	 * they should be protected by their reference count which must be
3551 	 * nonzero for an existing segment
3552 	 */
3553 	if (amp) {
3554 		ASSERT(amp->refcnt != 0);
3555 		shm_locality = amp->locality;
3556 		if (shm_locality == NULL)
3557 			return (NULL);
3558 		tree = shm_locality->loc_tree;
3559 		off = ptob(anon_index);
3560 	} else if (vp) {
3561 		shm_locality = vp->v_locality;
3562 		if (shm_locality == NULL)
3563 			return (NULL);
3564 		ASSERT(shm_locality->loc_count != 0);
3565 		tree = shm_locality->loc_tree;
3566 		off = vn_off;
3567 	}
3568 
3569 	if (tree == NULL)
3570 		return (NULL);
3571 
3572 	/*
3573 	 * Lookup policy segment for offset into shared object and return
3574 	 * policy info
3575 	 */
3576 	rw_enter(&shm_locality->loc_lock, RW_READER);
3577 	policy_info = NULL;
3578 	policy_seg = avl_find(tree, &off, &where);
3579 	if (policy_seg)
3580 		policy_info = &policy_seg->shm_policy;
3581 	rw_exit(&shm_locality->loc_lock);
3582 
3583 	return (policy_info);
3584 }
3585 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() uses this in place of lgrp_mem_default_policy when the
 * segment it is given is segkmap.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3590 
3591 /*
3592  * Return lgroup to use for allocating memory
3593  * given the segment and address
3594  *
3595  * There isn't any mutual exclusion that exists between calls
3596  * to this routine and DR, so this routine and whomever calls it
3597  * should be mindful of the possibility that the lgrp returned
3598  * may be deleted. If this happens, dereferences of the lgrp
3599  * pointer will still be safe, but the resources in the lgrp will
3600  * be gone, and LGRP_EXISTS() will no longer be true.
3601  */
3602 lgrp_t *
3603 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3604 {
3605 	int			i;
3606 	lgrp_t			*lgrp;
3607 	klgrpset_t		lgrpset;
3608 	int			lgrps_spanned;
3609 	unsigned long		off;
3610 	lgrp_mem_policy_t	policy;
3611 	lgrp_mem_policy_info_t	*policy_info;
3612 	ushort_t		random;
3613 	int			stat = 0;
3614 	extern struct seg	*segkmap;
3615 
3616 	/*
3617 	 * Just return null if the lgrp framework hasn't finished
3618 	 * initializing or if this is a UMA machine.
3619 	 */
3620 	if (nlgrps == 1 || !lgrp_initialized)
3621 		return (lgrp_root);
3622 
3623 	/*
3624 	 * Get memory allocation policy for this segment
3625 	 */
3626 	policy = lgrp_mem_default_policy;
3627 	if (seg != NULL) {
3628 		if (seg->s_as == &kas) {
3629 			if (seg == segkmap)
3630 				policy = lgrp_segmap_default_policy;
3631 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3632 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3633 				policy = LGRP_MEM_POLICY_RANDOM;
3634 		} else {
3635 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3636 			if (policy_info != NULL) {
3637 				policy = policy_info->mem_policy;
3638 				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3639 					lgrp_id_t id = policy_info->mem_lgrpid;
3640 					ASSERT(id != LGRP_NONE);
3641 					ASSERT(id < NLGRPS_MAX);
3642 					lgrp = lgrp_table[id];
3643 					if (!LGRP_EXISTS(lgrp)) {
3644 						policy = LGRP_MEM_POLICY_NEXT;
3645 					} else {
3646 						lgrp_stat_add(id,
3647 						    LGRP_NUM_NEXT_SEG, 1);
3648 						return (lgrp);
3649 					}
3650 				}
3651 			}
3652 		}
3653 	}
3654 	lgrpset = 0;
3655 
3656 	/*
3657 	 * Initialize lgroup to home by default
3658 	 */
3659 	lgrp = lgrp_home_lgrp();
3660 
3661 	/*
3662 	 * When homing threads on root lgrp, override default memory
3663 	 * allocation policies with root lgroup memory allocation policy
3664 	 */
3665 	if (lgrp == lgrp_root)
3666 		policy = lgrp_mem_policy_root;
3667 
3668 	/*
3669 	 * Implement policy
3670 	 */
3671 	switch (policy) {
3672 	case LGRP_MEM_POLICY_NEXT_CPU:
3673 
3674 		/*
3675 		 * Return lgroup of current CPU which faulted on memory
3676 		 * If the CPU isn't currently in an lgrp, then opt to
3677 		 * allocate from the root.
3678 		 *
3679 		 * Kernel preemption needs to be disabled here to prevent
3680 		 * the current CPU from going away before lgrp is found.
3681 		 */
3682 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3683 			lgrp = lgrp_root;
3684 		} else {
3685 			kpreempt_disable();
3686 			lgrp = lgrp_cpu_to_lgrp(CPU);
3687 			kpreempt_enable();
3688 		}
3689 		break;
3690 
3691 	case LGRP_MEM_POLICY_NEXT:
3692 	case LGRP_MEM_POLICY_DEFAULT:
3693 	default:
3694 
3695 		/*
3696 		 * Just return current thread's home lgroup
3697 		 * for default policy (next touch)
3698 		 * If the thread is homed to the root,
3699 		 * then the default policy is random across lgroups.
3700 		 * Fallthrough to the random case.
3701 		 */
3702 		if (lgrp != lgrp_root) {
3703 			if (policy == LGRP_MEM_POLICY_NEXT)
3704 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3705 			else
3706 				lgrp_stat_add(lgrp->lgrp_id,
3707 				    LGRP_NUM_DEFAULT, 1);
3708 			break;
3709 		}
3710 		/* LINTED fallthrough on case statement */
3711 	case LGRP_MEM_POLICY_RANDOM:
3712 
3713 		/*
3714 		 * Return a random leaf lgroup with memory
3715 		 */
3716 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3717 		/*
3718 		 * Count how many lgroups are spanned
3719 		 */
3720 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3721 
3722 		/*
3723 		 * There may be no memnodes in the root lgroup during DR copy
3724 		 * rename on a system with only two boards (memnodes)
3725 		 * configured. In this case just return the root lgrp.
3726 		 */
3727 		if (lgrps_spanned == 0) {
3728 			lgrp = lgrp_root;
3729 			break;
3730 		}
3731 
3732 		/*
3733 		 * Pick a random offset within lgroups spanned
3734 		 * and return lgroup at that offset
3735 		 */
3736 		random = (ushort_t)gethrtime() >> 4;
3737 		off = random % lgrps_spanned;
3738 		ASSERT(off <= lgrp_alloc_max);
3739 
3740 		for (i = 0; i <= lgrp_alloc_max; i++) {
3741 			if (!klgrpset_ismember(lgrpset, i))
3742 				continue;
3743 			if (off)
3744 				off--;
3745 			else {
3746 				lgrp = lgrp_table[i];
3747 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3748 				    1);
3749 				break;
3750 			}
3751 		}
3752 		break;
3753 
3754 	case LGRP_MEM_POLICY_RANDOM_PROC:
3755 
3756 		/*
3757 		 * Grab copy of bitmask of lgroups spanned by
3758 		 * this process
3759 		 */
3760 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3761 		stat = LGRP_NUM_RANDOM_PROC;
3762 
3763 		/* LINTED fallthrough on case statement */
3764 	case LGRP_MEM_POLICY_RANDOM_PSET:
3765 
3766 		if (!stat)
3767 			stat = LGRP_NUM_RANDOM_PSET;
3768 
3769 		if (klgrpset_isempty(lgrpset)) {
3770 			/*
3771 			 * Grab copy of bitmask of lgroups spanned by
3772 			 * this processor set
3773 			 */
3774 			kpreempt_disable();
3775 			klgrpset_copy(lgrpset,
3776 			    curthread->t_cpupart->cp_lgrpset);
3777 			kpreempt_enable();
3778 		}
3779 
3780 		/*
3781 		 * Count how many lgroups are spanned
3782 		 */
3783 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3784 		ASSERT(lgrps_spanned <= nlgrps);
3785 
3786 		/*
3787 		 * Probably lgrps_spanned should be always non-zero, but to be
3788 		 * on the safe side we return lgrp_root if it is empty.
3789 		 */
3790 		if (lgrps_spanned == 0) {
3791 			lgrp = lgrp_root;
3792 			break;
3793 		}
3794 
3795 		/*
3796 		 * Pick a random offset within lgroups spanned
3797 		 * and return lgroup at that offset
3798 		 */
3799 		random = (ushort_t)gethrtime() >> 4;
3800 		off = random % lgrps_spanned;
3801 		ASSERT(off <= lgrp_alloc_max);
3802 
3803 		for (i = 0; i <= lgrp_alloc_max; i++) {
3804 			if (!klgrpset_ismember(lgrpset, i))
3805 				continue;
3806 			if (off)
3807 				off--;
3808 			else {
3809 				lgrp = lgrp_table[i];
3810 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3811 				    1);
3812 				break;
3813 			}
3814 		}
3815 		break;
3816 
3817 	case LGRP_MEM_POLICY_ROUNDROBIN:
3818 
3819 		/*
3820 		 * Use offset within segment to determine
3821 		 * offset from home lgroup to choose for
3822 		 * next lgroup to allocate memory from
3823 		 */
3824 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3825 		    (lgrp_alloc_max + 1);
3826 
3827 		kpreempt_disable();
3828 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3829 		i = lgrp->lgrp_id;
3830 		kpreempt_enable();
3831 
3832 		while (off > 0) {
3833 			i = (i + 1) % (lgrp_alloc_max + 1);
3834 			lgrp = lgrp_table[i];
3835 			if (klgrpset_ismember(lgrpset, i))
3836 				off--;
3837 		}
3838 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3839 
3840 		break;
3841 	}
3842 
3843 	ASSERT(lgrp != NULL);
3844 	return (lgrp);
3845 }
3846 
3847 /*
3848  * Return the number of pages in an lgroup
3849  *
3850  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3851  *	 could cause tests that rely on the numat driver to fail....
3852  */
3853 pgcnt_t
3854 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3855 {
3856 	lgrp_t *lgrp;
3857 
3858 	lgrp = lgrp_table[lgrpid];
3859 	if (!LGRP_EXISTS(lgrp) ||
3860 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3861 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3862 		return (0);
3863 
3864 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3865 }
3866 
3867 /*
3868  * Initialize lgroup shared memory allocation policy support
3869  */
3870 void
3871 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3872 {
3873 	lgrp_shm_locality_t	*shm_locality;
3874 
3875 	/*
3876 	 * Initialize locality field in anon_map
3877 	 * Don't need any locks because this is called when anon_map is
3878 	 * allocated, but not used anywhere yet.
3879 	 */
3880 	if (amp) {
3881 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3882 		if (amp->locality == NULL) {
3883 			/*
3884 			 * Allocate and initialize shared memory locality info
3885 			 * and set anon_map locality pointer to it
3886 			 * Drop lock across kmem_alloc(KM_SLEEP)
3887 			 */
3888 			ANON_LOCK_EXIT(&amp->a_rwlock);
3889 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3890 			    KM_SLEEP);
3891 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3892 			    NULL);
3893 			shm_locality->loc_count = 1;	/* not used for amp */
3894 			shm_locality->loc_tree = NULL;
3895 
3896 			/*
3897 			 * Reacquire lock and check to see whether anyone beat
3898 			 * us to initializing the locality info
3899 			 */
3900 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3901 			if (amp->locality != NULL) {
3902 				rw_destroy(&shm_locality->loc_lock);
3903 				kmem_free(shm_locality,
3904 				    sizeof (*shm_locality));
3905 			} else
3906 				amp->locality = shm_locality;
3907 		}
3908 		ANON_LOCK_EXIT(&amp->a_rwlock);
3909 		return;
3910 	}
3911 
3912 	/*
3913 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3914 	 */
3915 	mutex_enter(&vp->v_lock);
3916 	if ((vp->v_flag & V_LOCALITY) == 0) {
3917 		/*
3918 		 * Allocate and initialize shared memory locality info
3919 		 */
3920 		mutex_exit(&vp->v_lock);
3921 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3922 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3923 		shm_locality->loc_count = 1;
3924 		shm_locality->loc_tree = NULL;
3925 
3926 		/*
3927 		 * Point vnode locality field at shared vnode policy info
3928 		 * and set locality aware flag in vnode
3929 		 */
3930 		mutex_enter(&vp->v_lock);
3931 		if ((vp->v_flag & V_LOCALITY) == 0) {
3932 			vp->v_locality = shm_locality;
3933 			vp->v_flag |= V_LOCALITY;
3934 		} else {
3935 			/*
3936 			 * Lost race so free locality info and increment count.
3937 			 */
3938 			rw_destroy(&shm_locality->loc_lock);
3939 			kmem_free(shm_locality, sizeof (*shm_locality));
3940 			shm_locality = vp->v_locality;
3941 			shm_locality->loc_count++;
3942 		}
3943 		mutex_exit(&vp->v_lock);
3944 
3945 		return;
3946 	}
3947 
3948 	/*
3949 	 * Increment reference count of number of segments mapping this vnode
3950 	 * shared
3951 	 */
3952 	shm_locality = vp->v_locality;
3953 	shm_locality->loc_count++;
3954 	mutex_exit(&vp->v_lock);
3955 }
3956 
3957 /*
3958  * Destroy the given shared memory policy segment tree
3959  */
3960 void
3961 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3962 {
3963 	lgrp_shm_policy_seg_t	*cur;
3964 	lgrp_shm_policy_seg_t	*next;
3965 
3966 	if (tree == NULL)
3967 		return;
3968 
3969 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3970 	while (cur != NULL) {
3971 		next = AVL_NEXT(tree, cur);
3972 		avl_remove(tree, cur);
3973 		kmem_free(cur, sizeof (*cur));
3974 		cur = next;
3975 	}
3976 	kmem_free(tree, sizeof (avl_tree_t));
3977 }
3978 
3979 /*
3980  * Uninitialize lgroup shared memory allocation policy support
3981  */
3982 void
3983 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3984 {
3985 	lgrp_shm_locality_t	*shm_locality;
3986 
3987 	/*
3988 	 * For anon_map, deallocate shared memory policy tree and
3989 	 * zero locality field
3990 	 * Don't need any locks because anon_map is being freed
3991 	 */
3992 	if (amp) {
3993 		if (amp->locality == NULL)
3994 			return;
3995 		shm_locality = amp->locality;
3996 		shm_locality->loc_count = 0;	/* not really used for amp */
3997 		rw_destroy(&shm_locality->loc_lock);
3998 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3999 		kmem_free(shm_locality, sizeof (*shm_locality));
4000 		amp->locality = 0;
4001 		return;
4002 	}
4003 
4004 	/*
4005 	 * For vnode, decrement reference count of segments mapping this vnode
4006 	 * shared and delete locality info if reference count drops to 0
4007 	 */
4008 	mutex_enter(&vp->v_lock);
4009 	shm_locality = vp->v_locality;
4010 	shm_locality->loc_count--;
4011 
4012 	if (shm_locality->loc_count == 0) {
4013 		rw_destroy(&shm_locality->loc_lock);
4014 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4015 		kmem_free(shm_locality, sizeof (*shm_locality));
4016 		vp->v_locality = 0;
4017 		vp->v_flag &= ~V_LOCALITY;
4018 	}
4019 	mutex_exit(&vp->v_lock);
4020 }
4021 
4022 /*
4023  * Compare two shared memory policy segments
4024  * Used by AVL tree code for searching
4025  */
4026 int
4027 lgrp_shm_policy_compar(const void *x, const void *y)
4028 {
4029 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4030 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4031 
4032 	if (a->shm_off < b->shm_off)
4033 		return (-1);
4034 	if (a->shm_off >= b->shm_off + b->shm_size)
4035 		return (1);
4036 	return (0);
4037 }
4038 
4039 /*
4040  * Concatenate seg1 with seg2 and remove seg2
4041  */
4042 static int
4043 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4044     lgrp_shm_policy_seg_t *seg2)
4045 {
4046 	if (!seg1 || !seg2 ||
4047 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4048 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4049 		return (-1);
4050 
4051 	seg1->shm_size += seg2->shm_size;
4052 	avl_remove(tree, seg2);
4053 	kmem_free(seg2, sizeof (*seg2));
4054 	return (0);
4055 }
4056 
4057 /*
4058  * Split segment at given offset and return rightmost (uppermost) segment
4059  * Assumes that there are no overlapping segments
4060  */
4061 static lgrp_shm_policy_seg_t *
4062 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4063     u_offset_t off)
4064 {
4065 	lgrp_shm_policy_seg_t	*newseg;
4066 	avl_index_t		where;
4067 
4068 	ASSERT(seg != NULL);
4069 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4070 
4071 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4072 	    seg->shm_size)
4073 		return (NULL);
4074 
4075 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4076 		return (seg);
4077 
4078 	/*
4079 	 * Adjust size of left segment and allocate new (right) segment
4080 	 */
4081 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4082 	newseg->shm_policy = seg->shm_policy;
4083 	newseg->shm_off = off;
4084 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4085 	seg->shm_size = off - seg->shm_off;
4086 
4087 	/*
4088 	 * Find where to insert new segment in AVL tree and insert it
4089 	 */
4090 	(void) avl_find(tree, &off, &where);
4091 	avl_insert(tree, newseg, where);
4092 
4093 	return (newseg);
4094 }
4095 
4096 /*
4097  * Set shared memory allocation policy on specified shared object at given
4098  * offset and length
4099  *
4100  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4101  * -1 if can't set policy.
4102  */
4103 int
4104 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4105     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4106 {
4107 	u_offset_t		eoff;
4108 	lgrp_shm_policy_seg_t	*next;
4109 	lgrp_shm_policy_seg_t	*newseg;
4110 	u_offset_t		off;
4111 	u_offset_t		oldeoff;
4112 	lgrp_shm_policy_seg_t	*prev;
4113 	int			retval;
4114 	lgrp_shm_policy_seg_t	*seg;
4115 	lgrp_shm_locality_t	*shm_locality;
4116 	avl_tree_t		*tree;
4117 	avl_index_t		where;
4118 
4119 	ASSERT(amp || vp);
4120 	ASSERT((len & PAGEOFFSET) == 0);
4121 
4122 	if (len == 0)
4123 		return (-1);
4124 
4125 	retval = 0;
4126 
4127 	/*
4128 	 * Get locality info and starting offset into shared object
4129 	 * Try anon map first and then vnode
4130 	 * Assume that no locks need to be held on anon_map or vnode, since
4131 	 * it should be protected by its reference count which must be nonzero
4132 	 * for an existing segment.
4133 	 */
4134 	if (amp) {
4135 		/*
4136 		 * Get policy info from anon_map
4137 		 *
4138 		 */
4139 		ASSERT(amp->refcnt != 0);
4140 		if (amp->locality == NULL)
4141 			lgrp_shm_policy_init(amp, NULL);
4142 		shm_locality = amp->locality;
4143 		off = ptob(anon_index);
4144 	} else if (vp) {
4145 		/*
4146 		 * Get policy info from vnode
4147 		 */
4148 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4149 			lgrp_shm_policy_init(NULL, vp);
4150 		shm_locality = vp->v_locality;
4151 		ASSERT(shm_locality->loc_count != 0);
4152 		off = vn_off;
4153 	} else
4154 		return (-1);
4155 
4156 	ASSERT((off & PAGEOFFSET) == 0);
4157 
4158 	/*
4159 	 * Figure out default policy
4160 	 */
4161 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4162 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4163 
4164 	/*
4165 	 * Create AVL tree if there isn't one yet
4166 	 * and set locality field to point at it
4167 	 */
4168 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4169 	tree = shm_locality->loc_tree;
4170 	if (!tree) {
4171 		rw_exit(&shm_locality->loc_lock);
4172 
4173 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4174 
4175 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4176 		if (shm_locality->loc_tree == NULL) {
4177 			avl_create(tree, lgrp_shm_policy_compar,
4178 			    sizeof (lgrp_shm_policy_seg_t),
4179 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4180 			shm_locality->loc_tree = tree;
4181 		} else {
4182 			/*
4183 			 * Another thread managed to set up the tree
4184 			 * before we could. Free the tree we allocated
4185 			 * and use the one that's already there.
4186 			 */
4187 			kmem_free(tree, sizeof (*tree));
4188 			tree = shm_locality->loc_tree;
4189 		}
4190 	}
4191 
4192 	/*
4193 	 * Set policy
4194 	 *
4195 	 * Need to maintain hold on writer's lock to keep tree from
4196 	 * changing out from under us
4197 	 */
4198 	while (len != 0) {
4199 		/*
4200 		 * Find policy segment for specified offset into shared object
4201 		 */
4202 		seg = avl_find(tree, &off, &where);
4203 
4204 		/*
4205 		 * Didn't find any existing segment that contains specified
4206 		 * offset, so allocate new segment, insert it, and concatenate
4207 		 * with adjacent segments if possible
4208 		 */
4209 		if (seg == NULL) {
4210 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4211 			    KM_SLEEP);
4212 			newseg->shm_policy.mem_policy = policy;
4213 			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4214 			newseg->shm_off = off;
4215 			avl_insert(tree, newseg, where);
4216 
4217 			/*
4218 			 * Check to see whether new segment overlaps with next
4219 			 * one, set length of new segment accordingly, and
4220 			 * calculate remaining length and next offset
4221 			 */
4222 			seg = AVL_NEXT(tree, newseg);
4223 			if (seg == NULL || off + len <= seg->shm_off) {
4224 				newseg->shm_size = len;
4225 				len = 0;
4226 			} else {
4227 				newseg->shm_size = seg->shm_off - off;
4228 				off = seg->shm_off;
4229 				len -= newseg->shm_size;
4230 			}
4231 
4232 			/*
4233 			 * Try to concatenate new segment with next and
4234 			 * previous ones, since they might have the same policy
4235 			 * now.  Grab previous and next segments first because
4236 			 * they will change on concatenation.
4237 			 */
4238 			prev =  AVL_PREV(tree, newseg);
4239 			next = AVL_NEXT(tree, newseg);
4240 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4241 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4242 
4243 			continue;
4244 		}
4245 
4246 		eoff = off + len;
4247 		oldeoff = seg->shm_off + seg->shm_size;
4248 
4249 		/*
4250 		 * Policy set already?
4251 		 */
4252 		if (policy == seg->shm_policy.mem_policy) {
4253 			/*
4254 			 * Nothing left to do if offset and length
4255 			 * fall within this segment
4256 			 */
4257 			if (eoff <= oldeoff) {
4258 				retval = 1;
4259 				break;
4260 			} else {
4261 				len = eoff - oldeoff;
4262 				off = oldeoff;
4263 				continue;
4264 			}
4265 		}
4266 
4267 		/*
4268 		 * Specified offset and length match existing segment exactly
4269 		 */
4270 		if (off == seg->shm_off && len == seg->shm_size) {
4271 			/*
4272 			 * Set policy and update current length
4273 			 */
4274 			seg->shm_policy.mem_policy = policy;
4275 			seg->shm_policy.mem_lgrpid = LGRP_NONE;
4276 			len = 0;
4277 
4278 			/*
4279 			 * Try concatenating new segment with previous and next
4280 			 * segments, since they might have the same policy now.
4281 			 * Grab previous and next segments first because they
4282 			 * will change on concatenation.
4283 			 */
4284 			prev =  AVL_PREV(tree, seg);
4285 			next = AVL_NEXT(tree, seg);
4286 			(void) lgrp_shm_policy_concat(tree, seg, next);
4287 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4288 		} else {
4289 			/*
4290 			 * Specified offset and length only apply to part of
4291 			 * existing segment
4292 			 */
4293 
4294 			/*
4295 			 * New segment starts in middle of old one, so split
4296 			 * new one off near beginning of old one
4297 			 */
4298 			newseg = NULL;
4299 			if (off > seg->shm_off) {
4300 				newseg = lgrp_shm_policy_split(tree, seg, off);
4301 
4302 				/*
4303 				 * New segment ends where old one did, so try
4304 				 * to concatenate with next segment
4305 				 */
4306 				if (eoff == oldeoff) {
4307 					newseg->shm_policy.mem_policy = policy;
4308 					newseg->shm_policy.mem_lgrpid =
4309 					    LGRP_NONE;
4310 					(void) lgrp_shm_policy_concat(tree,
4311 					    newseg, AVL_NEXT(tree, newseg));
4312 					break;
4313 				}
4314 			}
4315 
4316 			/*
4317 			 * New segment ends before old one, so split off end of
4318 			 * old one
4319 			 */
4320 			if (eoff < oldeoff) {
4321 				if (newseg) {
4322 					(void) lgrp_shm_policy_split(tree,
4323 					    newseg, eoff);
4324 					newseg->shm_policy.mem_policy = policy;
4325 					newseg->shm_policy.mem_lgrpid =
4326 					    LGRP_NONE;
4327 				} else {
4328 					(void) lgrp_shm_policy_split(tree, seg,
4329 					    eoff);
4330 					seg->shm_policy.mem_policy = policy;
4331 					seg->shm_policy.mem_lgrpid = LGRP_NONE;
4332 				}
4333 
4334 				if (off == seg->shm_off)
4335 					(void) lgrp_shm_policy_concat(tree,
4336 					    AVL_PREV(tree, seg), seg);
4337 				break;
4338 			}
4339 
4340 			/*
4341 			 * Calculate remaining length and next offset
4342 			 */
4343 			len = eoff - oldeoff;
4344 			off = oldeoff;
4345 		}
4346 	}
4347 
4348 	rw_exit(&shm_locality->loc_lock);
4349 	return (retval);
4350 }
4351 
4352 /*
4353  * Return the best memnode from which to allocate memory given
4354  * an lgroup.
4355  *
4356  * "c" is for cookie, which is good enough for me.
4357  * It references a cookie struct that should be zero'ed to initialize.
4358  * The cookie should live on the caller's stack.
4359  *
4360  * The routine returns -1 when:
4361  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4362  *	- traverse is 1, and all the memnodes in the system have been
4363  *	  returned.
4364  */
4365 int
4366 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4367 {
4368 	lgrp_t		*lp = c->lmc_lgrp;
4369 	mnodeset_t	nodes = c->lmc_nodes;
4370 	int		cnt = c->lmc_cnt;
4371 	int		offset, mnode;
4372 
4373 	extern int	max_mem_nodes;
4374 
4375 	/*
4376 	 * If the set is empty, and the caller is willing, traverse
4377 	 * up the hierarchy until we find a non-empty set.
4378 	 */
4379 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4380 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4381 		    ((lp = lp->lgrp_parent) == NULL))
4382 			return (-1);
4383 
4384 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4385 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4386 	}
4387 
4388 	/*
4389 	 * Select a memnode by picking one at a "random" offset.
4390 	 * Because of DR, memnodes can come and go at any time.
4391 	 * This code must be able to cope with the possibility
4392 	 * that the nodes count "cnt" is inconsistent with respect
4393 	 * to the number of elements actually in "nodes", and
4394 	 * therefore that the offset chosen could be greater than
4395 	 * the number of elements in the set (some memnodes may
4396 	 * have dissapeared just before cnt was read).
4397 	 * If this happens, the search simply wraps back to the
4398 	 * beginning of the set.
4399 	 */
4400 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4401 	offset = c->lmc_rand % cnt;
4402 	do {
4403 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4404 			if (nodes & ((mnodeset_t)1 << mnode))
4405 				if (!offset--)
4406 					break;
4407 	} while (mnode >= max_mem_nodes);
4408 
4409 	/* Found a node. Store state before returning. */
4410 	c->lmc_lgrp = lp;
4411 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4412 	c->lmc_cnt = cnt - 1;
4413 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4414 	c->lmc_ntried++;
4415 
4416 	return (mnode);
4417 }
4418