xref: /illumos-gate/usr/src/uts/common/os/lgrp.c (revision 78801af7286cd73dbc996d470f789e75993cf15d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 /*
28  * Basic NUMA support in terms of locality groups
29  *
30  * Solaris needs to know which CPUs, memory, etc. are near each other to
31  * provide good performance on NUMA machines by optimizing for locality.
32  * In order to do this, a new abstraction called a "locality group (lgroup)"
33  * has been introduced to keep track of which CPU-like and memory-like hardware
34  * resources are close to each other.  Currently, latency is the only measure
35  * used to determine how to group hardware resources into lgroups, but this
36  * does not limit the groupings to be based solely on latency.  Other factors
37  * may be used to determine the groupings in the future.
38  *
39  * Lgroups are organized into a hierarchy or topology that represents the
40  * latency topology of the machine.  There is always at least a root lgroup in
41  * the system.  It represents all the hardware resources in the machine at a
42  * latency big enough that any hardware resource can at least access any other
43  * hardware resource within that latency.  A Uniform Memory Access (UMA)
44  * machine is represented with one lgroup (the root).  In contrast, a NUMA
45  * machine is represented at least by the root lgroup and some number of leaf
46  * lgroups where the leaf lgroups contain the hardware resources within the
47  * least latency of each other and the root lgroup still contains all the
48  * resources in the machine.  Some number of intermediate lgroups may exist
49  * which represent more levels of locality than just the local latency of the
50  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
51  * (eg. root and intermediate lgroups) contain the next nearest resources to
52  * their children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
53  * to the root lgroup shows the hardware resources from closest to farthest
54  * from the leaf lgroup such that each successive ancestor lgroup contains
55  * the next nearest resources at the next level of locality from the previous.
56  *
57  * The kernel uses the lgroup abstraction to know how to allocate resources
58  * near a given process/thread.  At fork() and lwp/thread_create() time, a
59  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
60  * with the lowest load average.  Binding to a processor or processor set will
61  * change the home lgroup for a thread.  The scheduler has been modified to try
62  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
63  * allocation is lgroup aware too, so memory will be allocated from the current
64  * thread's home lgroup if possible.  If the desired resources are not
65  * available, the kernel traverses the lgroup hierarchy going to the parent
66  * lgroup to find resources at the next level of locality until it reaches the
67  * root lgroup.
68  */
69 
70 #include <sys/lgrp.h>
71 #include <sys/lgrp_user.h>
72 #include <sys/types.h>
73 #include <sys/mman.h>
74 #include <sys/param.h>
75 #include <sys/var.h>
76 #include <sys/thread.h>
77 #include <sys/cpuvar.h>
78 #include <sys/cpupart.h>
79 #include <sys/kmem.h>
80 #include <vm/seg.h>
81 #include <vm/seg_kmem.h>
82 #include <vm/seg_spt.h>
83 #include <vm/seg_vn.h>
84 #include <vm/as.h>
85 #include <sys/atomic.h>
86 #include <sys/systm.h>
87 #include <sys/errno.h>
88 #include <sys/cmn_err.h>
89 #include <sys/kstat.h>
90 #include <sys/sysmacros.h>
91 #include <sys/pg.h>
92 #include <sys/promif.h>
93 #include <sys/sdt.h>
94 #include <sys/smt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to kstat framework. It is protected from parallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
162 static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
163 
164 /*
165  * If cp still references the bootstrap lpl, it has not yet been added to
166  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
167  * a thread is trying to allocate memory close to a CPU that has no lgrp.
168  */
169 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
170 
171 static lgrp_t	lroot;
172 
173 /*
174  * Size, in bytes, beyond which random memory allocation policy is applied
175  * to non-shared memory.  Default is the maximum size, so random memory
176  * allocation won't be used for non-shared memory by default.
177  */
178 size_t	lgrp_privm_random_thresh = (size_t)(-1);
179 
180 /* the maximum effect that a single thread can have on its lgroup's load */
181 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
182 	((lgrp_loadavg_max_effect) / (ncpu))
183 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
184 
185 
186 /*
187  * Size, in bytes, beyond which random memory allocation policy is applied to
188  * shared memory.  Default is 8MB (2 ISM pages).
189  */
190 size_t	lgrp_shm_random_thresh = 8*1024*1024;
191 
192 /*
193  * Whether to do processor set aware memory allocation by default
194  */
195 int	lgrp_mem_pset_aware = 0;
196 
197 /*
198  * Set the default memory allocation policy for root lgroup
199  */
200 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
201 
202 /*
203  * Set the default memory allocation policy.  For most platforms,
204  * next touch is sufficient, but some platforms may wish to override
205  * this.
206  */
207 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
208 
209 
210 /*
211  * lgroup CPU event handlers
212  */
213 static void	lgrp_cpu_init(struct cpu *);
214 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
215 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
216 
217 /*
218  * lgroup memory event handlers
219  */
220 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
221 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
222 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
223 
224 /*
225  * lgroup CPU partition event handlers
226  */
227 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
228 static void	lgrp_part_del_cpu(struct cpu *);
229 
230 /*
231  * lgroup framework initialization
232  */
233 static void	lgrp_main_init(void);
234 static void	lgrp_main_mp_init(void);
235 static void	lgrp_root_init(void);
236 static void	lgrp_setup(void);
237 
238 /*
239  * lpl topology
240  */
241 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
242 static void	lpl_clear(lpl_t *);
243 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
244 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
245 static void	lpl_rset_add(lpl_t *, lpl_t *);
246 static void	lpl_rset_del(lpl_t *, lpl_t *);
247 static int	lpl_rset_contains(lpl_t *, lpl_t *);
248 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
249 static void	lpl_child_update(lpl_t *, struct cpupart *);
250 static int	lpl_pick(lpl_t *, lpl_t *);
251 static void	lpl_verify_wrapper(struct cpupart *);
252 
253 /*
254  * defines for lpl topology verifier return codes
255  */
256 
257 #define	LPL_TOPO_CORRECT			0
258 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
259 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
260 #define	LPL_TOPO_LGRP_MISMATCH			-3
261 #define	LPL_TOPO_MISSING_PARENT			-4
262 #define	LPL_TOPO_PARENT_MISMATCH		-5
263 #define	LPL_TOPO_BAD_CPUCNT			-6
264 #define	LPL_TOPO_RSET_MISMATCH			-7
265 #define	LPL_TOPO_LPL_ORPHANED			-8
266 #define	LPL_TOPO_LPL_BAD_NCPU			-9
267 #define	LPL_TOPO_RSET_MSSNG_LF			-10
268 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
269 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-12
270 #define	LPL_TOPO_LGRP_NOT_LEAF			-13
271 #define	LPL_TOPO_BAD_RSETCNT			-14
272 
273 /*
274  * Return whether lgroup optimizations should be enabled on this system
275  */
276 int
277 lgrp_optimizations(void)
278 {
279 	/*
280 	 * System must have more than 2 lgroups to enable lgroup optimizations
281 	 *
282 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
283 	 * with one child lgroup containing all the resources. A 2 lgroup
284 	 * system with a root lgroup directly containing CPUs or memory might
285 	 * need lgroup optimizations with its child lgroup, but there
286 	 * isn't such a machine for now....
287 	 */
288 	if (nlgrps > 2)
289 		return (1);
290 
291 	return (0);
292 }
293 
294 /*
295  * Setup root lgroup
296  */
297 static void
298 lgrp_root_init(void)
299 {
300 	lgrp_handle_t	hand;
301 	int		i;
302 	lgrp_id_t	id;
303 
304 	/*
305 	 * Create the "root" lgroup
306 	 */
307 	ASSERT(nlgrps == 0);
308 	id = nlgrps++;
309 
310 	lgrp_root = &lroot;
311 
312 	lgrp_root->lgrp_cpu = NULL;
313 	lgrp_root->lgrp_mnodes = 0;
314 	lgrp_root->lgrp_nmnodes = 0;
315 	hand = lgrp_plat_root_hand();
316 	lgrp_root->lgrp_plathand = hand;
317 
318 	lgrp_root->lgrp_id = id;
319 	lgrp_root->lgrp_cpucnt = 0;
320 	lgrp_root->lgrp_childcnt = 0;
321 	klgrpset_clear(lgrp_root->lgrp_children);
322 	klgrpset_clear(lgrp_root->lgrp_leaves);
323 	lgrp_root->lgrp_parent = NULL;
324 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
325 
326 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
327 		klgrpset_clear(lgrp_root->lgrp_set[i]);
328 
329 	lgrp_root->lgrp_kstat = NULL;
330 
331 	lgrp_table[id] = lgrp_root;
332 
333 	/*
334 	 * Setup initial lpl list for CPU0 and initial t0 home.
335 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
336 	 * all topology operations until cp_default is initialized at which
337 	 * point t0.t_lpl will be updated.
338 	 */
339 	lpl_bootstrap = lpl_bootstrap_list;
340 	t0.t_lpl = lpl_bootstrap;
341 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
342 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
343 
344 	/*
345 	 * Set up the bootstrap rset
346 	 * Since the bootstrap toplogy has just the root, and a leaf,
347 	 * the rset contains just the leaf, and both lpls can use the same rset
348 	 */
349 	lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
350 	lpl_bootstrap_list[0].lpl_rset_sz = 1;
351 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
352 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
353 
354 	lpl_bootstrap_list[1].lpl_rset_sz = 1;
355 	lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
356 	lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
357 
358 	cp_default.cp_lgrploads = lpl_bootstrap;
359 }
360 
361 /*
362  * Initialize the lgroup framework and allow the platform to do the same
363  *
364  * This happens in stages during boot and is all funnelled through this routine
365  * (see definition of lgrp_init_stages_t to see what happens at each stage and
366  * when)
367  */
368 void
369 lgrp_init(lgrp_init_stages_t stage)
370 {
371 	/*
372 	 * Initialize the platform
373 	 */
374 	lgrp_plat_init(stage);
375 
376 	switch (stage) {
377 	case LGRP_INIT_STAGE1:
378 		/*
379 		 * Set max number of lgroups supported on this platform which
380 		 * must be less than the max number of lgroups supported by the
381 		 * common lgroup framework (eg. NLGRPS_MAX is max elements in
382 		 * lgrp_table[], etc.)
383 		 */
384 		nlgrpsmax = lgrp_plat_max_lgrps();
385 		ASSERT(nlgrpsmax <= NLGRPS_MAX);
386 		break;
387 
388 	case LGRP_INIT_STAGE2:
389 		lgrp_setup();
390 		break;
391 
392 	case LGRP_INIT_STAGE4:
393 		lgrp_main_init();
394 		break;
395 
396 	case LGRP_INIT_STAGE5:
397 		lgrp_main_mp_init();
398 		break;
399 
400 	default:
401 		break;
402 	}
403 }
404 
405 /*
406  * Create the root and cpu0's lgroup, and set t0's home.
407  */
408 static void
409 lgrp_setup(void)
410 {
411 	/*
412 	 * Setup the root lgroup
413 	 */
414 	lgrp_root_init();
415 
416 	/*
417 	 * Add cpu0 to an lgroup
418 	 */
419 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
420 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
421 }
422 
423 /*
424  * Non-zero once lgrp_main_init() has completed (set at the end of it).
425  */
426 int	lgrp_initialized = 0;
427 
428 /*
429  * Non-zero once lgrp_main_mp_init() has updated the lgroup topology.
430  */
431 int	lgrp_topo_initialized = 0;
432 
433 /*
434  * Init routine called after startup(), /etc/system has been processed,
435  * and cpu0 has been added to an lgroup.
436  */
437 static void
438 lgrp_main_init(void)
439 {
440 	cpu_t		*cp = CPU;
441 	lgrp_id_t	lgrpid;
442 	int		i;
443 	extern void	pg_cpu0_reinit();
444 
445 	/*
446 	 * Enforce a valid lgrp_mem_default_policy
447 	 */
448 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
449 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
450 	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
451 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
452 
453 	/*
454 	 * See if mpo should be disabled.
455 	 * This may happen in the case of null proc LPA on Starcat.
456 	 * The platform won't be able to detect null proc LPA until after
457 	 * cpu0 and memory have already been added to lgroups.
458 	 * When and if it is detected, the Starcat platform will return
459 	 * a different platform handle for cpu0 which is what we check for
460 	 * here. If mpo should be disabled move cpu0 to it's rightful place
461 	 * (the root), and destroy the remaining lgroups. This effectively
462 	 * provides an UMA lgroup topology.
463 	 */
464 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
465 	if (lgrp_table[lgrpid]->lgrp_plathand !=
466 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
467 		lgrp_part_del_cpu(cp);
468 		lgrp_cpu_fini(cp, lgrpid);
469 
470 		lgrp_cpu_init(cp);
471 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
472 
473 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
474 
475 		/*
476 		 * Notify the PG subsystem that the CPU's lgrp
477 		 * association has changed
478 		 */
479 		pg_cpu0_reinit();
480 
481 		/*
482 		 * Destroy all lgroups except for root
483 		 */
484 		for (i = 0; i <= lgrp_alloc_max; i++) {
485 			if (LGRP_EXISTS(lgrp_table[i]) &&
486 			    lgrp_table[i] != lgrp_root)
487 				lgrp_destroy(lgrp_table[i]);
488 		}
489 
490 		/*
491 		 * Fix up root to point at itself for leaves and resources
492 		 * and not have any children
493 		 */
494 		lgrp_root->lgrp_childcnt = 0;
495 		klgrpset_clear(lgrp_root->lgrp_children);
496 		klgrpset_clear(lgrp_root->lgrp_leaves);
497 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
498 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
499 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
500 	}
501 
502 	/*
503 	 * Initialize kstats framework.
504 	 */
505 	lgrp_kstat_init();
506 	/*
507 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
508 	 */
509 	mutex_enter(&cpu_lock);
510 	lgrp_kstat_create(cp);
511 	mutex_exit(&cpu_lock);
512 
513 	lgrp_initialized = 1;
514 }
515 
516 /*
517  * Finish lgrp initialization after all CPUS are brought on-line.
518  * This routine is called after start_other_cpus().
519  */
520 static void
521 lgrp_main_mp_init(void)
522 {
523 	klgrpset_t changed;
524 
525 	smt_init();
526 
527 	/*
528 	 * Update lgroup topology (if necessary)
529 	 */
530 	klgrpset_clear(changed);
531 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
532 	lgrp_topo_initialized = 1;
533 }
534 
535 /*
536  * Change latency of lgroup with specified lgroup platform handle (if one is
537  * given) or change all lgroups with old latency to new latency
538  */
539 void
540 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
541     u_longlong_t newtime)
542 {
543 	lgrp_t		*lgrp;
544 	int		i;
545 
546 	for (i = 0; i <= lgrp_alloc_max; i++) {
547 		lgrp = lgrp_table[i];
548 
549 		if (!LGRP_EXISTS(lgrp))
550 			continue;
551 
552 		if ((hand == LGRP_NULL_HANDLE &&
553 		    lgrp->lgrp_latency == oldtime) ||
554 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
555 			lgrp->lgrp_latency = (int)newtime;
556 	}
557 }
558 
559 /*
560  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
561  */
562 void
563 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
564 {
565 	klgrpset_t	changed;
566 	cpu_t		*cp;
567 	lgrp_id_t	id;
568 	int		rc;
569 
570 	switch (event) {
571 	/*
572 	 * The following (re)configuration events are common code
573 	 * initiated. lgrp_plat_config() is called here to inform the
574 	 * platform of the reconfiguration event.
575 	 */
576 	case LGRP_CONFIG_CPU_ADD:
577 		cp = (cpu_t *)resource;
578 
579 		/*
580 		 * Initialize the new CPU's lgrp related next/prev
581 		 * links, and give it a bootstrap lpl so that it can
582 		 * survive should it need to enter the dispatcher.
583 		 */
584 		cp->cpu_next_lpl = cp;
585 		cp->cpu_prev_lpl = cp;
586 		cp->cpu_next_lgrp = cp;
587 		cp->cpu_prev_lgrp = cp;
588 		cp->cpu_lpl = lpl_bootstrap;
589 
590 		lgrp_plat_config(event, resource);
591 		atomic_inc_32(&lgrp_gen);
592 
593 		break;
594 	case LGRP_CONFIG_CPU_DEL:
595 		lgrp_plat_config(event, resource);
596 		atomic_inc_32(&lgrp_gen);
597 
598 		break;
599 	case LGRP_CONFIG_CPU_ONLINE:
600 		cp = (cpu_t *)resource;
601 		lgrp_cpu_init(cp);
602 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
603 		rc = lpl_topo_verify(cp->cpu_part);
604 		if (rc != LPL_TOPO_CORRECT) {
605 			panic("lpl_topo_verify failed: %d", rc);
606 		}
607 		lgrp_plat_config(event, resource);
608 		atomic_inc_32(&lgrp_gen);
609 
610 		break;
611 	case LGRP_CONFIG_CPU_OFFLINE:
612 		cp = (cpu_t *)resource;
613 		id = cp->cpu_lpl->lpl_lgrpid;
614 		lgrp_part_del_cpu(cp);
615 		lgrp_cpu_fini(cp, id);
616 		rc = lpl_topo_verify(cp->cpu_part);
617 		if (rc != LPL_TOPO_CORRECT) {
618 			panic("lpl_topo_verify failed: %d", rc);
619 		}
620 		lgrp_plat_config(event, resource);
621 		atomic_inc_32(&lgrp_gen);
622 
623 		break;
624 	case LGRP_CONFIG_CPUPART_ADD:
625 		cp = (cpu_t *)resource;
626 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
627 		rc = lpl_topo_verify(cp->cpu_part);
628 		if (rc != LPL_TOPO_CORRECT) {
629 			panic("lpl_topo_verify failed: %d", rc);
630 		}
631 		lgrp_plat_config(event, resource);
632 
633 		break;
634 	case LGRP_CONFIG_CPUPART_DEL:
635 		cp = (cpu_t *)resource;
636 		lgrp_part_del_cpu((cpu_t *)resource);
637 		rc = lpl_topo_verify(cp->cpu_part);
638 		if (rc != LPL_TOPO_CORRECT) {
639 			panic("lpl_topo_verify failed: %d", rc);
640 		}
641 		lgrp_plat_config(event, resource);
642 
643 		break;
644 	/*
645 	 * The following events are initiated by the memnode
646 	 * subsystem.
647 	 */
648 	case LGRP_CONFIG_MEM_ADD:
649 		lgrp_mem_init((int)resource, where, B_FALSE);
650 		atomic_inc_32(&lgrp_gen);
651 
652 		break;
653 	case LGRP_CONFIG_MEM_DEL:
654 		lgrp_mem_fini((int)resource, where, B_FALSE);
655 		atomic_inc_32(&lgrp_gen);
656 
657 		break;
658 	case LGRP_CONFIG_MEM_RENAME: {
659 		lgrp_config_mem_rename_t *ren_arg =
660 		    (lgrp_config_mem_rename_t *)where;
661 
662 		lgrp_mem_rename((int)resource,
663 		    ren_arg->lmem_rename_from,
664 		    ren_arg->lmem_rename_to);
665 		atomic_inc_32(&lgrp_gen);
666 
667 		break;
668 	}
669 	case LGRP_CONFIG_GEN_UPDATE:
670 		atomic_inc_32(&lgrp_gen);
671 
672 		break;
673 	case LGRP_CONFIG_FLATTEN:
674 		if (where == 0)
675 			lgrp_topo_levels = (int)resource;
676 		else
677 			(void) lgrp_topo_flatten(resource,
678 			    lgrp_table, lgrp_alloc_max, &changed);
679 
680 		break;
681 	/*
682 	 * Update any lgroups with old latency to new latency
683 	 */
684 	case LGRP_CONFIG_LAT_CHANGE_ALL:
685 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
686 		    (u_longlong_t)where);
687 
688 		break;
689 	/*
690 	 * Update lgroup with specified lgroup platform handle to have
691 	 * new latency
692 	 */
693 	case LGRP_CONFIG_LAT_CHANGE:
694 		lgrp_latency_change((lgrp_handle_t)resource, 0,
695 		    (u_longlong_t)where);
696 
697 		break;
698 	case LGRP_CONFIG_NOP:
699 
700 		break;
701 	default:
702 		break;
703 	}
704 
705 }
706 
707 /*
708  * Called to add lgrp info into cpu structure from cpu_add_unit;
709  * do not assume cpu is in cpu[] yet!
710  *
711  * CPUs are brought online with all other CPUs paused so we can't
712  * allocate memory or we could deadlock the system, so we rely on
713  * the platform to statically allocate as much space as we need
714  * for the lgrp structs and stats.
715  */
716 static void
717 lgrp_cpu_init(struct cpu *cp)
718 {
719 	klgrpset_t	changed;
720 	int		count;
721 	lgrp_handle_t	hand;
722 	int		first_cpu;
723 	lgrp_t		*my_lgrp;
724 	lgrp_id_t	lgrpid;
725 	struct cpu	*cptr;
726 
727 	/*
728 	 * This is the first time through if the resource set
729 	 * for the root lgroup is empty. After cpu0 has been
730 	 * initially added to an lgroup, the root's CPU resource
731 	 * set can never be empty, since the system's last CPU
732 	 * cannot be offlined.
733 	 */
734 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
735 		/*
736 		 * First time through.
737 		 */
738 		first_cpu = 1;
739 	} else {
740 		/*
741 		 * If cpu0 needs to move lgroups, we may come
742 		 * through here again, at which time cpu_lock won't
743 		 * be held, and lgrp_initialized will be false.
744 		 */
745 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
746 		ASSERT(cp->cpu_part != NULL);
747 		first_cpu = 0;
748 	}
749 
750 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
751 	my_lgrp = lgrp_hand_to_lgrp(hand);
752 
753 	if (my_lgrp == NULL) {
754 		/*
755 		 * Create new lgrp and add it to lgroup topology
756 		 */
757 		my_lgrp = lgrp_create();
758 		my_lgrp->lgrp_plathand = hand;
759 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
760 		lgrpid = my_lgrp->lgrp_id;
761 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
762 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
763 
764 		count = 0;
765 		klgrpset_clear(changed);
766 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
767 		    &changed);
768 		/*
769 		 * May have added new intermediate lgroups, so need to add
770 		 * resources other than CPUs which are added below
771 		 */
772 		(void) lgrp_mnode_update(changed, NULL);
773 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
774 	    > 0) {
775 		/*
776 		 * Leaf lgroup was created, but latency wasn't available
777 		 * then.  So, set latency for it and fill in rest of lgroup
778 		 * topology  now that we know how far it is from other leaf
779 		 * lgroups.
780 		 */
781 		lgrpid = my_lgrp->lgrp_id;
782 		klgrpset_clear(changed);
783 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
784 		    lgrpid))
785 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
786 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
787 		    &changed);
788 
789 		/*
790 		 * May have added new intermediate lgroups, so need to add
791 		 * resources other than CPUs which are added below
792 		 */
793 		(void) lgrp_mnode_update(changed, NULL);
794 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
795 	    my_lgrp->lgrp_id)) {
796 		int	i;
797 
798 		/*
799 		 * Update existing lgroup and lgroups containing it with CPU
800 		 * resource
801 		 */
802 		lgrpid = my_lgrp->lgrp_id;
803 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
804 		for (i = 0; i <= lgrp_alloc_max; i++) {
805 			lgrp_t		*lgrp;
806 
807 			lgrp = lgrp_table[i];
808 			if (!LGRP_EXISTS(lgrp) ||
809 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
810 				continue;
811 
812 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
813 		}
814 	}
815 
816 	lgrpid = my_lgrp->lgrp_id;
817 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
818 
819 	/*
820 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
821 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
822 	 * not since none of lgroup IDs in the lpl's have been set yet.
823 	 */
824 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
825 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
826 
827 	/*
828 	 * link the CPU into the lgrp's CPU list
829 	 */
830 	if (my_lgrp->lgrp_cpucnt == 0) {
831 		my_lgrp->lgrp_cpu = cp;
832 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
833 	} else {
834 		cptr = my_lgrp->lgrp_cpu;
835 		cp->cpu_next_lgrp = cptr;
836 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
837 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
838 		cptr->cpu_prev_lgrp = cp;
839 	}
840 	my_lgrp->lgrp_cpucnt++;
841 }
842 
843 lgrp_t *
844 lgrp_create(void)
845 {
846 	lgrp_t		*my_lgrp;
847 	lgrp_id_t	lgrpid;
848 	int		i;
849 
850 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
851 	lgrpid = 0;
852 
853 	/*
854 	 * Find an open slot in the lgroup table and recycle unused lgroup
855 	 * left there if any
856 	 */
857 	my_lgrp = NULL;
858 	if (lgrp_alloc_hint == -1)
859 		/*
860 		 * Allocate from end when hint not set yet because no lgroups
861 		 * have been deleted yet
862 		 */
863 		lgrpid = nlgrps++;
864 	else {
865 		/*
866 		 * Start looking for next open slot from hint and leave hint
867 		 * at slot allocated
868 		 */
869 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
870 			my_lgrp = lgrp_table[i];
871 			if (!LGRP_EXISTS(my_lgrp)) {
872 				lgrpid = i;
873 				nlgrps++;
874 				break;
875 			}
876 		}
877 		lgrp_alloc_hint = lgrpid;
878 	}
879 
880 	/*
881 	 * Keep track of max lgroup ID allocated so far to cut down on searches
882 	 */
883 	if (lgrpid > lgrp_alloc_max)
884 		lgrp_alloc_max = lgrpid;
885 
886 	/*
887 	 * Need to allocate new lgroup if next open slot didn't have one
888 	 * for recycling
889 	 */
890 	if (my_lgrp == NULL)
891 		my_lgrp = lgrp_plat_alloc(lgrpid);
892 
893 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
894 		panic("Too many lgrps for platform (%d)", nlgrps);
895 
896 	my_lgrp->lgrp_id = lgrpid;
897 	my_lgrp->lgrp_latency = 0;
898 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
899 	my_lgrp->lgrp_parent = NULL;
900 	my_lgrp->lgrp_childcnt = 0;
901 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
902 	my_lgrp->lgrp_nmnodes = 0;
903 	klgrpset_clear(my_lgrp->lgrp_children);
904 	klgrpset_clear(my_lgrp->lgrp_leaves);
905 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
906 		klgrpset_clear(my_lgrp->lgrp_set[i]);
907 
908 	my_lgrp->lgrp_cpu = NULL;
909 	my_lgrp->lgrp_cpucnt = 0;
910 
911 	if (my_lgrp->lgrp_kstat != NULL)
912 		lgrp_kstat_reset(lgrpid);
913 
914 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
915 
916 	return (my_lgrp);
917 }
918 
919 void
920 lgrp_destroy(lgrp_t *lgrp)
921 {
922 	int		i;
923 
924 	/*
925 	 * Unless this lgroup is being destroyed on behalf of
926 	 * the boot CPU, cpu_lock must be held
927 	 */
928 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
929 
930 	if (nlgrps == 1)
931 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
932 
933 	if (!LGRP_EXISTS(lgrp))
934 		return;
935 
936 	/*
937 	 * Set hint to lgroup being deleted and try to keep lower numbered
938 	 * hints to facilitate finding empty slots
939 	 */
940 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
941 		lgrp_alloc_hint = lgrp->lgrp_id;
942 
943 	/*
944 	 * Mark this lgroup to be recycled by setting its lgroup ID to
945 	 * LGRP_NONE and clear relevant fields
946 	 */
947 	lgrp->lgrp_id = LGRP_NONE;
948 	lgrp->lgrp_latency = 0;
949 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
950 	lgrp->lgrp_parent = NULL;
951 	lgrp->lgrp_childcnt = 0;
952 
953 	klgrpset_clear(lgrp->lgrp_children);
954 	klgrpset_clear(lgrp->lgrp_leaves);
955 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
956 		klgrpset_clear(lgrp->lgrp_set[i]);
957 
958 	lgrp->lgrp_mnodes = (mnodeset_t)0;
959 	lgrp->lgrp_nmnodes = 0;
960 
961 	lgrp->lgrp_cpu = NULL;
962 	lgrp->lgrp_cpucnt = 0;
963 
964 	nlgrps--;
965 }
966 
967 /*
968  * Initialize kstat data. Called from lgrp intialization code.
969  */
970 static void
971 lgrp_kstat_init(void)
972 {
973 	lgrp_stat_t	stat;
974 
975 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
976 
977 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
978 		kstat_named_init(&lgrp_kstat_data[stat],
979 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
980 }
981 
982 /*
983  * initialize an lgrp's kstats if needed
984  * called with cpu_lock held but not with cpus paused.
985  * we don't tear these down now because we don't know about
986  * memory leaving the lgrp yet...
987  */
988 
989 void
990 lgrp_kstat_create(cpu_t *cp)
991 {
992 	kstat_t		*lgrp_kstat;
993 	lgrp_id_t	lgrpid;
994 	lgrp_t		*my_lgrp;
995 
996 	ASSERT(MUTEX_HELD(&cpu_lock));
997 
998 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
999 	my_lgrp = lgrp_table[lgrpid];
1000 
1001 	if (my_lgrp->lgrp_kstat != NULL)
1002 		return; /* already initialized */
1003 
1004 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1005 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1006 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1007 
1008 	if (lgrp_kstat != NULL) {
1009 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1010 		lgrp_kstat->ks_private = my_lgrp;
1011 		lgrp_kstat->ks_data = &lgrp_kstat_data;
1012 		lgrp_kstat->ks_update = lgrp_kstat_extract;
1013 		my_lgrp->lgrp_kstat = lgrp_kstat;
1014 		kstat_install(lgrp_kstat);
1015 	}
1016 }
1017 
1018 /*
1019  * this will do something when we manage to remove now unused lgrps
1020  */
1021 
1022 /* ARGSUSED */
1023 void
1024 lgrp_kstat_destroy(cpu_t *cp)
1025 {
1026 	ASSERT(MUTEX_HELD(&cpu_lock));
1027 }
1028 
1029 /*
1030  * Called when a CPU is off-lined.
1031  */
1032 static void
1033 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1034 {
1035 	lgrp_t *my_lgrp;
1036 	struct cpu *prev;
1037 	struct cpu *next;
1038 
1039 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1040 
1041 	prev = cp->cpu_prev_lgrp;
1042 	next = cp->cpu_next_lgrp;
1043 
1044 	prev->cpu_next_lgrp = next;
1045 	next->cpu_prev_lgrp = prev;
1046 
1047 	/*
1048 	 * just because I'm paranoid doesn't mean...
1049 	 */
1050 
1051 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1052 
1053 	my_lgrp = lgrp_table[lgrpid];
1054 	my_lgrp->lgrp_cpucnt--;
1055 
1056 	/*
1057 	 * Removing last CPU in lgroup, so update lgroup topology
1058 	 */
1059 	if (my_lgrp->lgrp_cpucnt == 0) {
1060 		klgrpset_t	changed;
1061 		int		count;
1062 		int		i;
1063 
1064 		my_lgrp->lgrp_cpu = NULL;
1065 
1066 		/*
1067 		 * Remove this lgroup from its lgroup CPU resources and remove
1068 		 * lgroup from lgroup topology if it doesn't have any more
1069 		 * resources in it now
1070 		 */
1071 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1072 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1073 			count = 0;
1074 			klgrpset_clear(changed);
1075 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1076 			    lgrp_alloc_max + 1, &changed);
1077 			return;
1078 		}
1079 
1080 		/*
1081 		 * This lgroup isn't empty, so just remove it from CPU
1082 		 * resources of any lgroups that contain it as such
1083 		 */
1084 		for (i = 0; i <= lgrp_alloc_max; i++) {
1085 			lgrp_t		*lgrp;
1086 
1087 			lgrp = lgrp_table[i];
1088 			if (!LGRP_EXISTS(lgrp) ||
1089 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1090 			    lgrpid))
1091 				continue;
1092 
1093 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1094 		}
1095 		return;
1096 	}
1097 
1098 	if (my_lgrp->lgrp_cpu == cp)
1099 		my_lgrp->lgrp_cpu = next;
1100 
1101 }
1102 
1103 /*
1104  * Update memory nodes in target lgroups and return ones that get changed
1105  */
1106 int
1107 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1108 {
1109 	int	count;
1110 	int	i;
1111 	int	j;
1112 	lgrp_t	*lgrp;
1113 	lgrp_t	*lgrp_rsrc;
1114 
1115 	count = 0;
1116 	if (changed)
1117 		klgrpset_clear(*changed);
1118 
1119 	if (klgrpset_isempty(target))
1120 		return (0);
1121 
1122 	/*
1123 	 * Find each lgroup in target lgroups
1124 	 */
1125 	for (i = 0; i <= lgrp_alloc_max; i++) {
1126 		/*
1127 		 * Skip any lgroups that don't exist or aren't in target group
1128 		 */
1129 		lgrp = lgrp_table[i];
1130 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1131 			continue;
1132 		}
1133 
1134 		/*
1135 		 * Initialize memnodes for intermediate lgroups to 0
1136 		 * and update them from scratch since they may have completely
1137 		 * changed
1138 		 */
1139 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1140 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1141 			lgrp->lgrp_nmnodes = 0;
1142 		}
1143 
1144 		/*
1145 		 * Update memory nodes of of target lgroup with memory nodes
1146 		 * from each lgroup in its lgroup memory resource set
1147 		 */
1148 		for (j = 0; j <= lgrp_alloc_max; j++) {
1149 			int	k;
1150 
1151 			/*
1152 			 * Skip any lgroups that don't exist or aren't in
1153 			 * memory resources of target lgroup
1154 			 */
1155 			lgrp_rsrc = lgrp_table[j];
1156 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1157 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1158 			    j))
1159 				continue;
1160 
1161 			/*
1162 			 * Update target lgroup's memnodes to include memnodes
1163 			 * of this lgroup
1164 			 */
1165 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1166 				mnodeset_t	mnode_mask;
1167 
1168 				mnode_mask = (mnodeset_t)1 << k;
1169 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1170 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1171 					lgrp->lgrp_mnodes |= mnode_mask;
1172 					lgrp->lgrp_nmnodes++;
1173 				}
1174 			}
1175 			count++;
1176 			if (changed)
1177 				klgrpset_add(*changed, lgrp->lgrp_id);
1178 		}
1179 	}
1180 
1181 	return (count);
1182 }
1183 
1184 /*
1185  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1186  * is moved from one board to another. The "from" and "to" arguments specify the
1187  * source and the destination of the move.
1188  *
1189  * See plat_lgrp_config() for a detailed description of the copy-rename
1190  * semantics.
1191  *
1192  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1193  * the lgroup topology which is changing as memory moves from one lgroup to
1194  * another. It removes the mnode from the source lgroup and re-inserts it in the
1195  * target lgroup.
1196  *
1197  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1198  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1199  * copy-rename operation.
1200  *
1201  * There is one case which requires special handling. If the system contains
1202  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1203  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1204  * lgrp_mem_init), but there is a window when the system has no memory in the
1205  * lgroup hierarchy. If another thread tries to allocate memory during this
1206  * window, the allocation will fail, although the system has physical memory.
1207  * This may cause a system panic or a deadlock (some sleeping memory allocations
1208  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1209  * the mnode back).
1210  *
1211  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1212  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1213  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1214  * but it updates the rest of the lgroup topology as if the mnode was actually
1215  * removed. The lgrp_mem_init() function recognizes that the mnode being
1216  * inserted represents such a special case and updates the topology
1217  * appropriately.
1218  */
1219 void
1220 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1221 {
1222 	/*
1223 	 * Remove the memory from the source node and add it to the destination
1224 	 * node.
1225 	 */
1226 	lgrp_mem_fini(mnode, from, B_TRUE);
1227 	lgrp_mem_init(mnode, to, B_TRUE);
1228 }
1229 
1230 /*
1231  * Called to indicate that the lgrp with platform handle "hand" now
1232  * contains the memory identified by "mnode".
1233  *
1234  * LOCKING for this routine is a bit tricky. Usually it is called without
1235  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1236  * callers. During DR of the board containing the caged memory it may be called
1237  * with cpu_lock already held and CPUs paused.
1238  *
1239  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1240  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1241  * dealing with the special case of DR copy-rename described in
1242  * lgrp_mem_rename().
1243  */
1244 void
1245 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1246 {
1247 	klgrpset_t	changed;
1248 	int		count;
1249 	int		i;
1250 	lgrp_t		*my_lgrp;
1251 	lgrp_id_t	lgrpid;
1252 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1253 	boolean_t	drop_lock = B_FALSE;
1254 	boolean_t	need_synch = B_FALSE;
1255 
1256 	/*
1257 	 * Grab CPU lock (if we haven't already)
1258 	 */
1259 	if (!MUTEX_HELD(&cpu_lock)) {
1260 		mutex_enter(&cpu_lock);
1261 		drop_lock = B_TRUE;
1262 	}
1263 
1264 	/*
1265 	 * This routine may be called from a context where we already
1266 	 * hold cpu_lock, and have already paused cpus.
1267 	 */
1268 	if (!cpus_paused())
1269 		need_synch = B_TRUE;
1270 
1271 	/*
1272 	 * Check if this mnode is already configured and return immediately if
1273 	 * it is.
1274 	 *
1275 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1276 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1277 	 * recognize this case and continue as usual, but skip the update to
1278 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1279 	 * in topology, temporarily introduced by lgrp_mem_fini().
1280 	 */
1281 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1282 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1283 		if (drop_lock)
1284 			mutex_exit(&cpu_lock);
1285 		return;
1286 	}
1287 
1288 	/*
1289 	 * Update lgroup topology with new memory resources, keeping track of
1290 	 * which lgroups change
1291 	 */
1292 	count = 0;
1293 	klgrpset_clear(changed);
1294 	my_lgrp = lgrp_hand_to_lgrp(hand);
1295 	if (my_lgrp == NULL) {
1296 		/* new lgrp */
1297 		my_lgrp = lgrp_create();
1298 		lgrpid = my_lgrp->lgrp_id;
1299 		my_lgrp->lgrp_plathand = hand;
1300 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1301 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1302 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1303 
1304 		if (need_synch)
1305 			pause_cpus(NULL, NULL);
1306 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1307 		    &changed);
1308 		if (need_synch)
1309 			start_cpus();
1310 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1311 	    > 0) {
1312 		/*
1313 		 * Leaf lgroup was created, but latency wasn't available
1314 		 * then.  So, set latency for it and fill in rest of lgroup
1315 		 * topology  now that we know how far it is from other leaf
1316 		 * lgroups.
1317 		 */
1318 		klgrpset_clear(changed);
1319 		lgrpid = my_lgrp->lgrp_id;
1320 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1321 		    lgrpid))
1322 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1323 		if (need_synch)
1324 			pause_cpus(NULL, NULL);
1325 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1326 		    &changed);
1327 		if (need_synch)
1328 			start_cpus();
1329 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1330 	    my_lgrp->lgrp_id)) {
1331 		/*
1332 		 * Add new lgroup memory resource to existing lgroup
1333 		 */
1334 		lgrpid = my_lgrp->lgrp_id;
1335 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1336 		klgrpset_add(changed, lgrpid);
1337 		count++;
1338 		for (i = 0; i <= lgrp_alloc_max; i++) {
1339 			lgrp_t		*lgrp;
1340 
1341 			lgrp = lgrp_table[i];
1342 			if (!LGRP_EXISTS(lgrp) ||
1343 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1344 				continue;
1345 
1346 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1347 			klgrpset_add(changed, lgrp->lgrp_id);
1348 			count++;
1349 		}
1350 	} else {
1351 		if (drop_lock)
1352 			mutex_exit(&cpu_lock);
1353 		return;
1354 	}
1355 
1356 	/*
1357 	 * Add memory node to lgroup and remove lgroup from ones that need
1358 	 * to be updated
1359 	 */
1360 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1361 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1362 		my_lgrp->lgrp_nmnodes++;
1363 	}
1364 	klgrpset_del(changed, lgrpid);
1365 
1366 	/*
1367 	 * Update memory node information for all lgroups that changed and
1368 	 * contain new memory node as a resource
1369 	 */
1370 	if (count)
1371 		(void) lgrp_mnode_update(changed, NULL);
1372 
1373 	if (drop_lock)
1374 		mutex_exit(&cpu_lock);
1375 }
1376 
1377 /*
1378  * Called to indicate that the lgroup associated with the platform
1379  * handle "hand" no longer contains given memory node
1380  *
1381  * LOCKING for this routine is a bit tricky. Usually it is called without
1382  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1383  * callers. During DR of the board containing the caged memory it may be called
1384  * with cpu_lock already held and CPUs paused.
1385  *
1386  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1387  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1388  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1389  * the same mnode back into the topology. See lgrp_mem_rename() and
1390  * lgrp_mem_init() for additional details.
1391  */
1392 void
1393 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1394 {
1395 	klgrpset_t	changed;
1396 	int		count;
1397 	int		i;
1398 	lgrp_t		*my_lgrp;
1399 	lgrp_id_t	lgrpid;
1400 	mnodeset_t	mnodes_mask;
1401 	boolean_t	drop_lock = B_FALSE;
1402 	boolean_t	need_synch = B_FALSE;
1403 
1404 	/*
1405 	 * Grab CPU lock (if we haven't already)
1406 	 */
1407 	if (!MUTEX_HELD(&cpu_lock)) {
1408 		mutex_enter(&cpu_lock);
1409 		drop_lock = B_TRUE;
1410 	}
1411 
1412 	/*
1413 	 * This routine may be called from a context where we already
1414 	 * hold cpu_lock and have already paused cpus.
1415 	 */
1416 	if (!cpus_paused())
1417 		need_synch = B_TRUE;
1418 
1419 	my_lgrp = lgrp_hand_to_lgrp(hand);
1420 
1421 	/*
1422 	 * The lgrp *must* be pre-existing
1423 	 */
1424 	ASSERT(my_lgrp != NULL);
1425 
1426 	/*
1427 	 * Delete memory node from lgroups which contain it
1428 	 */
1429 	mnodes_mask = ((mnodeset_t)1 << mnode);
1430 	for (i = 0; i <= lgrp_alloc_max; i++) {
1431 		lgrp_t *lgrp = lgrp_table[i];
1432 		/*
1433 		 * Skip any non-existent lgroups and any lgroups that don't
1434 		 * contain leaf lgroup of memory as a memory resource
1435 		 */
1436 		if (!LGRP_EXISTS(lgrp) ||
1437 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1438 			continue;
1439 
1440 		/*
1441 		 * Avoid removing the last mnode from the root in the DR
1442 		 * copy-rename case. See lgrp_mem_rename() for details.
1443 		 */
1444 		if (is_copy_rename &&
1445 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1446 			continue;
1447 
1448 		/*
1449 		 * Remove memory node from lgroup.
1450 		 */
1451 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1452 		lgrp->lgrp_nmnodes--;
1453 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1454 	}
1455 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1456 
1457 	/*
1458 	 * Don't need to update lgroup topology if this lgroup still has memory.
1459 	 *
1460 	 * In the special case of DR copy-rename with the only mnode being
1461 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1462 	 * still need to update the lgroup topology.
1463 	 */
1464 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1465 	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
1466 	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
1467 		if (drop_lock)
1468 			mutex_exit(&cpu_lock);
1469 		return;
1470 	}
1471 
1472 	/*
1473 	 * This lgroup does not contain any memory now
1474 	 */
1475 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1476 
1477 	/*
1478 	 * Remove this lgroup from lgroup topology if it does not contain any
1479 	 * resources now
1480 	 */
1481 	lgrpid = my_lgrp->lgrp_id;
1482 	count = 0;
1483 	klgrpset_clear(changed);
1484 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1485 		/*
1486 		 * Delete lgroup when no more resources
1487 		 */
1488 		if (need_synch)
1489 			pause_cpus(NULL, NULL);
1490 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1491 		    lgrp_alloc_max + 1, &changed);
1492 		ASSERT(count > 0);
1493 		if (need_synch)
1494 			start_cpus();
1495 	} else {
1496 		/*
1497 		 * Remove lgroup from memory resources of any lgroups that
1498 		 * contain it as such
1499 		 */
1500 		for (i = 0; i <= lgrp_alloc_max; i++) {
1501 			lgrp_t		*lgrp;
1502 
1503 			lgrp = lgrp_table[i];
1504 			if (!LGRP_EXISTS(lgrp) ||
1505 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1506 			    lgrpid))
1507 				continue;
1508 
1509 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1510 		}
1511 	}
1512 	if (drop_lock)
1513 		mutex_exit(&cpu_lock);
1514 }
1515 
1516 /*
1517  * Return lgroup with given platform handle
1518  */
1519 lgrp_t *
1520 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1521 {
1522 	int	i;
1523 	lgrp_t	*lgrp;
1524 
1525 	if (hand == LGRP_NULL_HANDLE)
1526 		return (NULL);
1527 
1528 	for (i = 0; i <= lgrp_alloc_max; i++) {
1529 		lgrp = lgrp_table[i];
1530 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1531 			return (lgrp);
1532 	}
1533 	return (NULL);
1534 }
1535 
1536 /*
1537  * Return the home lgroup of the current thread.
1538  * We must do this with kernel preemption disabled, since we don't want our
1539  * thread to be re-homed while we're poking around with its lpl, and the lpl
1540  * should never be NULL.
1541  *
1542  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1543  * is enabled because of DR.  Callers can use disable kernel preemption
1544  * around this call to guarantee that the lgroup will be valid beyond this
1545  * routine, since kernel preemption can be recursive.
1546  */
1547 lgrp_t *
1548 lgrp_home_lgrp(void)
1549 {
1550 	lgrp_t	*lgrp;
1551 	lpl_t	*lpl;
1552 
1553 	kpreempt_disable();
1554 
1555 	lpl = curthread->t_lpl;
1556 	ASSERT(lpl != NULL);
1557 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1558 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1559 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1560 
1561 	kpreempt_enable();
1562 
1563 	return (lgrp);
1564 }
1565 
1566 /*
1567  * Return ID of home lgroup for given thread
1568  * (See comments for lgrp_home_lgrp() for special care and handling
1569  * instructions)
1570  */
1571 lgrp_id_t
1572 lgrp_home_id(kthread_t *t)
1573 {
1574 	lgrp_id_t	lgrp;
1575 	lpl_t		*lpl;
1576 
1577 	ASSERT(t != NULL);
1578 	/*
1579 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1580 	 * cannot since the HAT layer can call into this routine to
1581 	 * determine the locality for its data structures in the context
1582 	 * of a page fault.
1583 	 */
1584 
1585 	kpreempt_disable();
1586 
1587 	lpl = t->t_lpl;
1588 	ASSERT(lpl != NULL);
1589 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1590 	lgrp = lpl->lpl_lgrpid;
1591 
1592 	kpreempt_enable();
1593 
1594 	return (lgrp);
1595 }
1596 
1597 /*
1598  * Return lgroup containing the physical memory for the given page frame number
1599  */
1600 lgrp_t *
1601 lgrp_pfn_to_lgrp(pfn_t pfn)
1602 {
1603 	lgrp_handle_t	hand;
1604 	int		i;
1605 	lgrp_t		*lgrp;
1606 
1607 	hand = lgrp_plat_pfn_to_hand(pfn);
1608 	if (hand != LGRP_NULL_HANDLE)
1609 		for (i = 0; i <= lgrp_alloc_max; i++) {
1610 			lgrp = lgrp_table[i];
1611 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1612 				return (lgrp);
1613 		}
1614 	return (NULL);
1615 }
1616 
1617 /*
1618  * Return lgroup containing the physical memory for the given page frame number
1619  */
1620 lgrp_t *
1621 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1622 {
1623 	lgrp_handle_t	hand;
1624 	int		i;
1625 	lgrp_t		*lgrp;
1626 	pfn_t		pfn;
1627 
1628 	pfn = btop(physaddr);
1629 	hand = lgrp_plat_pfn_to_hand(pfn);
1630 	if (hand != LGRP_NULL_HANDLE)
1631 		for (i = 0; i <= lgrp_alloc_max; i++) {
1632 			lgrp = lgrp_table[i];
1633 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1634 				return (lgrp);
1635 		}
1636 	return (NULL);
1637 }
1638 
1639 /*
1640  * Return the leaf lgroup containing the given CPU
1641  *
1642  * The caller needs to take precautions necessary to prevent
1643  * "cpu", and it's lpl from going away across a call to this function.
1644  * hint: kpreempt_disable()/kpreempt_enable()
1645  */
1646 static lgrp_t *
1647 lgrp_cpu_to_lgrp(cpu_t *cpu)
1648 {
1649 	return (cpu->cpu_lpl->lpl_lgrp);
1650 }
1651 
1652 /*
1653  * Return the sum of the partition loads in an lgrp divided by
1654  * the number of CPUs in the lgrp.  This is our best approximation
1655  * of an 'lgroup load average' for a useful per-lgroup kstat.
1656  */
1657 static uint64_t
1658 lgrp_sum_loadavgs(lgrp_t *lgrp)
1659 {
1660 	cpu_t *cpu;
1661 	int ncpu;
1662 	uint64_t loads = 0;
1663 
1664 	mutex_enter(&cpu_lock);
1665 
1666 	cpu = lgrp->lgrp_cpu;
1667 	ncpu = lgrp->lgrp_cpucnt;
1668 
1669 	if (cpu == NULL || ncpu == 0) {
1670 		mutex_exit(&cpu_lock);
1671 		return (0ull);
1672 	}
1673 
1674 	do {
1675 		loads += cpu->cpu_lpl->lpl_loadavg;
1676 		cpu = cpu->cpu_next_lgrp;
1677 	} while (cpu != lgrp->lgrp_cpu);
1678 
1679 	mutex_exit(&cpu_lock);
1680 
1681 	return (loads / ncpu);
1682 }
1683 
1684 void
1685 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1686 {
1687 	struct lgrp_stats *pstats;
1688 
1689 	/*
1690 	 * Verify that the caller isn't trying to add to
1691 	 * a statistic for an lgroup that has gone away
1692 	 */
1693 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1694 		return;
1695 
1696 	pstats = &lgrp_stats[lgrpid];
1697 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1698 }
1699 
1700 int64_t
1701 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1702 {
1703 	uint64_t val;
1704 	struct lgrp_stats *pstats;
1705 
1706 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1707 		return ((int64_t)0);
1708 
1709 	pstats = &lgrp_stats[lgrpid];
1710 	LGRP_STAT_READ(pstats, stat, val);
1711 	return (val);
1712 }
1713 
1714 /*
1715  * Reset all kstats for lgrp specified by its lgrpid.
1716  */
1717 static void
1718 lgrp_kstat_reset(lgrp_id_t lgrpid)
1719 {
1720 	lgrp_stat_t stat;
1721 
1722 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1723 		return;
1724 
1725 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1726 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1727 	}
1728 }
1729 
1730 /*
1731  * Collect all per-lgrp statistics for the lgrp associated with this
1732  * kstat, and store them in the ks_data array.
1733  *
1734  * The superuser can reset all the running counter statistics for an
1735  * lgrp by writing to any of the lgrp's stats.
1736  */
1737 static int
1738 lgrp_kstat_extract(kstat_t *ksp, int rw)
1739 {
1740 	lgrp_stat_t		stat;
1741 	struct kstat_named	*ksd;
1742 	lgrp_t			*lgrp;
1743 	lgrp_id_t		lgrpid;
1744 
1745 	lgrp = (lgrp_t *)ksp->ks_private;
1746 
1747 	ksd = (struct kstat_named *)ksp->ks_data;
1748 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1749 
1750 	lgrpid = lgrp->lgrp_id;
1751 
1752 	if (lgrpid == LGRP_NONE) {
1753 		/*
1754 		 * Return all zeroes as stats for freed lgrp.
1755 		 */
1756 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1757 			ksd[stat].value.i64 = 0;
1758 		}
1759 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1760 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1761 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1762 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1763 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1764 	} else if (rw != KSTAT_WRITE) {
1765 		/*
1766 		 * Handle counter stats
1767 		 */
1768 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1769 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1770 		}
1771 
1772 		/*
1773 		 * Handle kernel data snapshot stats
1774 		 */
1775 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1776 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1777 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1778 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1779 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1780 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1781 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1782 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1783 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1784 		    lgrp_loadavg_max_effect;
1785 	} else {
1786 		lgrp_kstat_reset(lgrpid);
1787 	}
1788 
1789 	return (0);
1790 }
1791 
1792 int
1793 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1794 {
1795 	cpu_t	*cp;
1796 
1797 	mutex_enter(&cpu_lock);
1798 
1799 	if ((cp = cpu_get(id)) == NULL) {
1800 		mutex_exit(&cpu_lock);
1801 		return (EINVAL);
1802 	}
1803 
1804 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1805 		mutex_exit(&cpu_lock);
1806 		return (EINVAL);
1807 	}
1808 
1809 	ASSERT(cp->cpu_lpl != NULL);
1810 
1811 	*lp = cp->cpu_lpl->lpl_lgrpid;
1812 
1813 	mutex_exit(&cpu_lock);
1814 
1815 	return (0);
1816 }
1817 
1818 int
1819 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1820 {
1821 	cpu_t *cp;
1822 
1823 	mutex_enter(&cpu_lock);
1824 
1825 	if ((cp = cpu_get(id)) == NULL) {
1826 		mutex_exit(&cpu_lock);
1827 		return (EINVAL);
1828 	}
1829 
1830 	ASSERT(cp->cpu_lpl != NULL);
1831 
1832 	*lp = cp->cpu_lpl->lpl_loadavg;
1833 
1834 	mutex_exit(&cpu_lock);
1835 
1836 	return (0);
1837 }
1838 
1839 /*
1840  * Add a resource named by lpl_leaf to rset of lpl_target
1841  *
1842  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1843  * resource. It is adjusted here, as this is presently the only place that we
1844  * can be certain a resource addition has succeeded.
1845  *
1846  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1847  * list in order until it reaches a NULL.  (This list is required to be NULL
1848  * terminated, too).  This is done so that we can mark start pos + 1, so that
1849  * each lpl is traversed sequentially, but in a different order.  We hope this
1850  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1851  */
1852 
1853 void
1854 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1855 {
1856 	int		i;
1857 	int		entry_slot = 0;
1858 
1859 	/* return if leaf is already present */
1860 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1861 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1862 			return;
1863 		}
1864 
1865 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1866 		    lpl_leaf->lpl_lgrpid) {
1867 			break;
1868 		}
1869 	}
1870 
1871 	/* insert leaf, update counts */
1872 	entry_slot = i;
1873 	i = lpl_target->lpl_nrset++;
1874 
1875 	/*
1876 	 * Start at the end of the rset array and work backwards towards the
1877 	 * slot into which the new lpl will be inserted. This effectively
1878 	 * preserves the current ordering by scooting everybody over one entry,
1879 	 * and placing the new entry into the space created.
1880 	 */
1881 	while (i-- > entry_slot) {
1882 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1883 		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
1884 		    i + 1;
1885 	}
1886 
1887 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1888 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
1889 
1890 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1891 }
1892 
1893 /*
1894  * Update each of lpl_parent's children with a reference to their parent.
1895  * The lgrp topology is used as the reference since it is fully
1896  * consistent and correct at this point.
1897  * This should be called after any potential change in lpl_parent's
1898  * rset.
1899  */
1900 static void
1901 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1902 {
1903 	klgrpset_t	children;
1904 	int		i;
1905 
1906 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1907 	if (klgrpset_isempty(children))
1908 		return; /* nothing to do */
1909 
1910 	for (i = 0; i <= lgrp_alloc_max; i++) {
1911 		if (klgrpset_ismember(children, i)) {
1912 			/*
1913 			 * (Re)set the parent. It may be incorrect if
1914 			 * lpl_parent is new in the topology.
1915 			 */
1916 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1917 		}
1918 	}
1919 }
1920 
1921 /*
1922  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1923  *
1924  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1925  * resource. The values are adjusted here, as this is the only place that we can
1926  * be certain a resource was successfully deleted.
1927  */
1928 void
1929 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1930 {
1931 	int i;
1932 	lpl_t *leaf;
1933 
1934 	if (lpl_target->lpl_nrset == 0)
1935 		return;
1936 
1937 	/* find leaf in intermediate node */
1938 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1939 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1940 			break;
1941 	}
1942 
1943 	/* return if leaf not found */
1944 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1945 		return;
1946 
1947 	/* prune leaf, compress array */
1948 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1949 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
1950 	lpl_target->lpl_ncpu--;
1951 	do {
1952 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1953 		/*
1954 		 * Update the lgrp id <=> rset mapping
1955 		 */
1956 		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
1957 			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
1958 		}
1959 	} while (i++ < lpl_target->lpl_nrset);
1960 }
1961 
1962 /*
1963  * Check to see if the resource set of the target lpl contains the
1964  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1965  */
1966 
1967 int
1968 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1969 {
1970 	int i;
1971 
1972 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1973 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1974 			return (1);
1975 	}
1976 
1977 	return (0);
1978 }
1979 
1980 /*
1981  * Called when we change cpu lpl membership.  This increments or decrements the
1982  * per-cpu counter in every lpl in which our leaf appears.
1983  */
1984 void
1985 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1986 {
1987 	cpupart_t	*cpupart;
1988 	lgrp_t		*lgrp_leaf;
1989 	lgrp_t		*lgrp_cur;
1990 	lpl_t		*lpl_leaf;
1991 	lpl_t		*lpl_cur;
1992 	int		i;
1993 
1994 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1995 
1996 	cpupart = cp->cpu_part;
1997 	lpl_leaf = cp->cpu_lpl;
1998 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1999 
2000 	for (i = 0; i <= lgrp_alloc_max; i++) {
2001 		lgrp_cur = lgrp_table[i];
2002 
2003 		/*
2004 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2005 		 * for the cpu in question, or if the current lgrp and leaf
2006 		 * don't share the same resources.
2007 		 */
2008 
2009 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2010 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2011 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2012 			continue;
2013 
2014 
2015 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2016 
2017 		if (lpl_cur->lpl_nrset > 0) {
2018 			if (act == LPL_INCREMENT) {
2019 				lpl_cur->lpl_ncpu++;
2020 			} else if (act == LPL_DECREMENT) {
2021 				lpl_cur->lpl_ncpu--;
2022 			}
2023 		}
2024 	}
2025 }
2026 
2027 /*
2028  * Initialize lpl with given resources and specified lgrp
2029  */
2030 void
2031 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2032 {
2033 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2034 	lpl->lpl_loadavg = 0;
2035 	if (lpl == lpl_leaf)
2036 		lpl->lpl_ncpu = 1;
2037 	else
2038 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2039 	lpl->lpl_nrset = 1;
2040 	lpl->lpl_rset[0] = lpl_leaf;
2041 	lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2042 	lpl->lpl_lgrp = lgrp;
2043 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2044 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2045 }
2046 
2047 /*
2048  * Clear an unused lpl
2049  */
2050 void
2051 lpl_clear(lpl_t *lpl)
2052 {
2053 	/*
2054 	 * Clear out all fields in the lpl except:
2055 	 *    lpl_lgrpid - to facilitate debugging
2056 	 *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2057 	 *
2058 	 * Note that the lpl's rset and id2rset mapping are cleared as well.
2059 	 */
2060 	lpl->lpl_loadavg = 0;
2061 	lpl->lpl_ncpu = 0;
2062 	lpl->lpl_lgrp = NULL;
2063 	lpl->lpl_parent = NULL;
2064 	lpl->lpl_cpus = NULL;
2065 	lpl->lpl_nrset = 0;
2066 	lpl->lpl_homed_time = 0;
2067 	bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2068 	bzero(lpl->lpl_id2rset,
2069 	    sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2070 }
2071 
2072 /*
2073  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2074  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2075  * make full use of all of the lgroup topology, but this checks to make sure
2076  * that for the parts that it does use, it has correctly understood the
2077  * relationships that exist. This function returns
2078  * 0 if the topology is correct, and a non-zero error code, for non-debug
2079  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2080  * debugging on a DEBUG kernel.
2081  */
2082 int
2083 lpl_topo_verify(cpupart_t *cpupart)
2084 {
2085 	lgrp_t		*lgrp;
2086 	lpl_t		*lpl;
2087 	klgrpset_t	rset;
2088 	klgrpset_t	cset;
2089 	cpu_t		*cpu;
2090 	cpu_t		*cp_start;
2091 	int		i;
2092 	int		j;
2093 	int		sum;
2094 
2095 	/* topology can't be incorrect if it doesn't exist */
2096 	if (!lgrp_topo_initialized || !lgrp_initialized)
2097 		return (LPL_TOPO_CORRECT);
2098 
2099 	ASSERT(cpupart != NULL);
2100 
2101 	for (i = 0; i <= lgrp_alloc_max; i++) {
2102 		lgrp = lgrp_table[i];
2103 		lpl = NULL;
2104 		/* make sure lpls are allocated */
2105 		ASSERT(cpupart->cp_lgrploads);
2106 		if (!cpupart->cp_lgrploads)
2107 			return (LPL_TOPO_PART_HAS_NO_LPL);
2108 
2109 		lpl = &cpupart->cp_lgrploads[i];
2110 		/* make sure our index is good */
2111 		ASSERT(i < cpupart->cp_nlgrploads);
2112 
2113 		/* if lgroup doesn't exist, make sure lpl is empty */
2114 		if (!LGRP_EXISTS(lgrp)) {
2115 			ASSERT(lpl->lpl_ncpu == 0);
2116 			if (lpl->lpl_ncpu > 0) {
2117 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2118 			} else {
2119 				continue;
2120 			}
2121 		}
2122 
2123 		/* verify that lgroup and lpl are identically numbered */
2124 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2125 
2126 		/* if lgroup isn't in our partition, make sure lpl is empty */
2127 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2128 		    cpupart->cp_lgrpset)) {
2129 			ASSERT(lpl->lpl_ncpu == 0);
2130 			if (lpl->lpl_ncpu > 0) {
2131 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2132 			}
2133 			/*
2134 			 * lpl is empty, and lgroup isn't in partition.  verify
2135 			 * that lpl doesn't show up in anyone else's rsets (in
2136 			 * this partition, anyway)
2137 			 */
2138 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2139 				lpl_t *i_lpl; /* lpl we're iterating over */
2140 
2141 				i_lpl = &cpupart->cp_lgrploads[j];
2142 
2143 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2144 				if (lpl_rset_contains(i_lpl, lpl)) {
2145 					return (LPL_TOPO_LPL_ORPHANED);
2146 				}
2147 			}
2148 			/* lgroup is empty, and everything is ok. continue */
2149 			continue;
2150 		}
2151 
2152 
2153 		/* lgroup is in this partition, now check it against lpl */
2154 
2155 		/* do both have matching lgrps? */
2156 		ASSERT(lgrp == lpl->lpl_lgrp);
2157 		if (lgrp != lpl->lpl_lgrp) {
2158 			return (LPL_TOPO_LGRP_MISMATCH);
2159 		}
2160 
2161 		/* do the parent lgroups exist and do they match? */
2162 		if (lgrp->lgrp_parent) {
2163 			ASSERT(lpl->lpl_parent);
2164 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2165 			    lpl->lpl_parent->lpl_lgrpid);
2166 
2167 			if (!lpl->lpl_parent) {
2168 				return (LPL_TOPO_MISSING_PARENT);
2169 			} else if (lgrp->lgrp_parent->lgrp_id !=
2170 			    lpl->lpl_parent->lpl_lgrpid) {
2171 				return (LPL_TOPO_PARENT_MISMATCH);
2172 			}
2173 		}
2174 
2175 		/* only leaf lgroups keep a cpucnt, only check leaves */
2176 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2177 
2178 			/* verify that lgrp is also a leaf */
2179 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2180 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2181 			    lpl->lpl_lgrpid)));
2182 
2183 			if ((lgrp->lgrp_childcnt > 0) ||
2184 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2185 			    lpl->lpl_lgrpid))) {
2186 				return (LPL_TOPO_LGRP_NOT_LEAF);
2187 			}
2188 
2189 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2190 			    (lpl->lpl_ncpu > 0));
2191 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2192 			    (lpl->lpl_ncpu <= 0)) {
2193 				return (LPL_TOPO_BAD_CPUCNT);
2194 			}
2195 
2196 			/*
2197 			 * Check that lpl_ncpu also matches the number of
2198 			 * cpus in the lpl's linked list.  This only exists in
2199 			 * leaves, but they should always match.
2200 			 */
2201 			j = 0;
2202 			cpu = cp_start = lpl->lpl_cpus;
2203 			while (cpu != NULL) {
2204 				j++;
2205 
2206 				/* check to make sure cpu's lpl is leaf lpl */
2207 				ASSERT(cpu->cpu_lpl == lpl);
2208 				if (cpu->cpu_lpl != lpl) {
2209 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2210 				}
2211 
2212 				/* check next cpu */
2213 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2214 					continue;
2215 				} else {
2216 					cpu = NULL;
2217 				}
2218 			}
2219 
2220 			ASSERT(j == lpl->lpl_ncpu);
2221 			if (j != lpl->lpl_ncpu) {
2222 				return (LPL_TOPO_LPL_BAD_NCPU);
2223 			}
2224 
2225 			/*
2226 			 * Also, check that leaf lpl is contained in all
2227 			 * intermediate lpls that name the leaf as a descendant
2228 			 */
2229 			for (j = 0; j <= lgrp_alloc_max; j++) {
2230 				klgrpset_t intersect;
2231 				lgrp_t *lgrp_cand;
2232 				lpl_t *lpl_cand;
2233 
2234 				lgrp_cand = lgrp_table[j];
2235 				intersect = klgrpset_intersects(
2236 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2237 				    cpupart->cp_lgrpset);
2238 
2239 				if (!LGRP_EXISTS(lgrp_cand) ||
2240 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2241 				    cpupart->cp_lgrpset) ||
2242 				    (intersect == 0))
2243 					continue;
2244 
2245 				lpl_cand =
2246 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2247 
2248 				if (klgrpset_ismember(intersect,
2249 				    lgrp->lgrp_id)) {
2250 					ASSERT(lpl_rset_contains(lpl_cand,
2251 					    lpl));
2252 
2253 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2254 						return (LPL_TOPO_RSET_MSSNG_LF);
2255 					}
2256 				}
2257 			}
2258 
2259 		} else { /* non-leaf specific checks */
2260 
2261 			/*
2262 			 * Non-leaf lpls should have lpl_cpus == NULL
2263 			 * verify that this is so
2264 			 */
2265 			ASSERT(lpl->lpl_cpus == NULL);
2266 			if (lpl->lpl_cpus != NULL) {
2267 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2268 			}
2269 
2270 			/*
2271 			 * verify that the sum of the cpus in the leaf resources
2272 			 * is equal to the total ncpu in the intermediate
2273 			 */
2274 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2275 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2276 			}
2277 
2278 			ASSERT(sum == lpl->lpl_ncpu);
2279 			if (sum != lpl->lpl_ncpu) {
2280 				return (LPL_TOPO_LPL_BAD_NCPU);
2281 			}
2282 		}
2283 
2284 		/*
2285 		 * Check the rset of the lpl in question.  Make sure that each
2286 		 * rset contains a subset of the resources in
2287 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2288 		 * sure that each rset doesn't include resources that are
2289 		 * outside of that set.  (Which would be resources somehow not
2290 		 * accounted for).
2291 		 */
2292 		klgrpset_clear(rset);
2293 		for (j = 0; j < lpl->lpl_nrset; j++) {
2294 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2295 		}
2296 		klgrpset_copy(cset, rset);
2297 		/* make sure lpl rset matches lgrp rset */
2298 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2299 		/* make sure rset is contained with in partition, too */
2300 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2301 
2302 		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
2303 		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
2304 			return (LPL_TOPO_RSET_MISMATCH);
2305 		}
2306 
2307 		/*
2308 		 * check to make sure lpl_nrset matches the number of rsets
2309 		 * contained in the lpl
2310 		 */
2311 		for (j = 0; j < lpl->lpl_nrset; j++) {
2312 			if (lpl->lpl_rset[j] == NULL)
2313 				break;
2314 		}
2315 
2316 		ASSERT(j == lpl->lpl_nrset);
2317 		if (j != lpl->lpl_nrset) {
2318 			return (LPL_TOPO_BAD_RSETCNT);
2319 		}
2320 
2321 	}
2322 	return (LPL_TOPO_CORRECT);
2323 }
2324 
2325 /*
2326  * Flatten lpl topology to given number of levels.  This is presently only
2327  * implemented for a flatten to 2 levels, which will prune out the intermediates
2328  * and home the leaf lpls to the root lpl.
2329  */
2330 int
2331 lpl_topo_flatten(int levels)
2332 {
2333 	int		i;
2334 	uint_t		sum;
2335 	lgrp_t		*lgrp_cur;
2336 	lpl_t		*lpl_cur;
2337 	lpl_t		*lpl_root;
2338 	cpupart_t	*cp;
2339 
2340 	if (levels != 2)
2341 		return (0);
2342 
2343 	/* called w/ cpus paused - grab no locks! */
2344 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2345 	    !lgrp_initialized);
2346 
2347 	cp = cp_list_head;
2348 	do {
2349 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2350 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2351 
2352 		for (i = 0; i <= lgrp_alloc_max; i++) {
2353 			lgrp_cur = lgrp_table[i];
2354 			lpl_cur = &cp->cp_lgrploads[i];
2355 
2356 			if ((lgrp_cur == lgrp_root) ||
2357 			    (!LGRP_EXISTS(lgrp_cur) &&
2358 			    (lpl_cur->lpl_ncpu == 0)))
2359 				continue;
2360 
2361 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2362 				/*
2363 				 * this should be a deleted intermediate, so
2364 				 * clear it
2365 				 */
2366 				lpl_clear(lpl_cur);
2367 			} else if ((lpl_cur->lpl_nrset == 1) &&
2368 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2369 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2370 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2371 				/*
2372 				 * this is a leaf whose parent was deleted, or
2373 				 * whose parent had their lgrp deleted.  (And
2374 				 * whose parent will soon be deleted).  Point
2375 				 * this guy back to the root lpl.
2376 				 */
2377 				lpl_cur->lpl_parent = lpl_root;
2378 				lpl_rset_add(lpl_root, lpl_cur);
2379 			}
2380 
2381 		}
2382 
2383 		/*
2384 		 * Now that we're done, make sure the count on the root lpl is
2385 		 * correct, and update the hints of the children for the sake of
2386 		 * thoroughness
2387 		 */
2388 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2389 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2390 		}
2391 		lpl_root->lpl_ncpu = sum;
2392 		lpl_child_update(lpl_root, cp);
2393 
2394 		cp = cp->cp_next;
2395 	} while (cp != cp_list_head);
2396 
2397 	return (levels);
2398 }
2399 
2400 /*
2401  * Insert a lpl into the resource hierarchy and create any additional lpls that
2402  * are necessary to represent the varying states of locality for the cpu
2403  * resoruces newly added to the partition.
2404  *
2405  * This routine is clever enough that it can correctly add resources from the
2406  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2407  * those for which the lpl is a leaf as opposed to simply a named equally local
2408  * resource).  The one special case that needs additional processing is when a
2409  * new intermediate lpl is introduced.  Since the main loop only traverses
2410  * looking to add the leaf resource where it does not yet exist, additional work
2411  * is necessary to add other leaf resources that may need to exist in the newly
2412  * created intermediate.  This is performed by the second inner loop, and is
2413  * only done when the check for more than one overlapping resource succeeds.
2414  */
2415 
2416 void
2417 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2418 {
2419 	int		i;
2420 	int		j;
2421 	int		rset_num_intersect;
2422 	lgrp_t		*lgrp_cur;
2423 	lpl_t		*lpl_cur;
2424 	lpl_t		*lpl_parent;
2425 	lgrp_id_t	parent_id;
2426 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2427 
2428 	for (i = 0; i <= lgrp_alloc_max; i++) {
2429 		lgrp_cur = lgrp_table[i];
2430 
2431 		/*
2432 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2433 		 * contained within the current lgrp, or if the current lgrp has
2434 		 * no leaves in this partition
2435 		 */
2436 
2437 		if (!LGRP_EXISTS(lgrp_cur) ||
2438 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2439 		    lpl_leaf->lpl_lgrpid) ||
2440 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2441 		    cpupart->cp_lgrpset))
2442 			continue;
2443 
2444 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2445 		if (lgrp_cur->lgrp_parent != NULL) {
2446 			/* if lgrp has a parent, assign it properly */
2447 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2448 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2449 		} else {
2450 			/* if not, make sure parent ptr gets set to null */
2451 			lpl_parent = NULL;
2452 		}
2453 
2454 		if (lpl_cur == lpl_leaf) {
2455 			/*
2456 			 * Almost all leaf state was initialized elsewhere.  The
2457 			 * only thing left to do is to set the parent.
2458 			 */
2459 			lpl_cur->lpl_parent = lpl_parent;
2460 			continue;
2461 		}
2462 
2463 		lpl_clear(lpl_cur);
2464 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2465 
2466 		lpl_cur->lpl_parent = lpl_parent;
2467 
2468 		/* does new lpl need to be populated with other resources? */
2469 		rset_intersect =
2470 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2471 		    cpupart->cp_lgrpset);
2472 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2473 
2474 		if (rset_num_intersect > 1) {
2475 			/*
2476 			 * If so, figure out what lpls have resources that
2477 			 * intersect this one, and add them.
2478 			 */
2479 			for (j = 0; j <= lgrp_alloc_max; j++) {
2480 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2481 				lpl_t	*lpl_cand;	/* candidate lpl */
2482 
2483 				lgrp_cand = lgrp_table[j];
2484 				if (!LGRP_EXISTS(lgrp_cand) ||
2485 				    !klgrpset_ismember(rset_intersect,
2486 				    lgrp_cand->lgrp_id))
2487 					continue;
2488 				lpl_cand =
2489 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2490 				lpl_rset_add(lpl_cur, lpl_cand);
2491 			}
2492 		}
2493 		/*
2494 		 * This lpl's rset has changed. Update the hint in it's
2495 		 * children.
2496 		 */
2497 		lpl_child_update(lpl_cur, cpupart);
2498 	}
2499 }
2500 
2501 /*
2502  * remove a lpl from the hierarchy of resources, clearing its state when
2503  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2504  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2505  * delete them as well.
2506  */
2507 
2508 void
2509 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2510 {
2511 	int		i;
2512 	lgrp_t		*lgrp_cur;
2513 	lpl_t		*lpl_cur;
2514 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2515 
2516 	for (i = 0; i <= lgrp_alloc_max; i++) {
2517 		lgrp_cur = lgrp_table[i];
2518 
2519 		/*
2520 		 * Don't attempt to remove from lgrps that aren't there, that
2521 		 * don't contain our leaf, or from the leaf itself. (We do that
2522 		 * later)
2523 		 */
2524 
2525 		if (!LGRP_EXISTS(lgrp_cur))
2526 			continue;
2527 
2528 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2529 
2530 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2531 		    lpl_leaf->lpl_lgrpid) ||
2532 		    (lpl_cur == lpl_leaf)) {
2533 			continue;
2534 		}
2535 
2536 		/*
2537 		 * This is a slightly sleazy simplification in that we have
2538 		 * already marked the cp_lgrpset as no longer containing the
2539 		 * leaf we've deleted.  Any lpls that pass the above checks
2540 		 * based upon lgrp membership but not necessarily cpu-part
2541 		 * membership also get cleared by the checks below.  Currently
2542 		 * this is harmless, as the lpls should be empty anyway.
2543 		 *
2544 		 * In particular, we want to preserve lpls that have additional
2545 		 * leaf resources, even though we don't yet have a processor
2546 		 * architecture that represents resources this way.
2547 		 */
2548 
2549 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2550 		    cpupart->cp_lgrpset);
2551 
2552 		lpl_rset_del(lpl_cur, lpl_leaf);
2553 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2554 			lpl_clear(lpl_cur);
2555 		} else {
2556 			/*
2557 			 * Update this lpl's children
2558 			 */
2559 			lpl_child_update(lpl_cur, cpupart);
2560 		}
2561 	}
2562 	lpl_clear(lpl_leaf);
2563 }
2564 
2565 /*
2566  * add a cpu to a partition in terms of lgrp load avg bookeeping
2567  *
2568  * The lpl (cpu partition load average information) is now arranged in a
2569  * hierarchical fashion whereby resources that are closest, ie. most local, to
2570  * the cpu in question are considered to be leaves in a tree of resources.
2571  * There are two general cases for cpu additon:
2572  *
2573  * 1. A lpl structure that contains resources already in the hierarchy tree.
2574  * In this case, all of the associated lpl relationships have been defined, and
2575  * all that is necessary is that we link the new cpu into the per-lpl list of
2576  * cpus, and increment the ncpu count of all places where this cpu resource will
2577  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2578  * pushing is accomplished by this routine.
2579  *
2580  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2581  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2582  * construct the hierarchy of state necessary to name it's more distant
2583  * resources, if they should exist.  The leaf structure is initialized by this
2584  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2585  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2586  * and builds all of the "ancestoral" state necessary to identify resources at
2587  * differing levels of locality.
2588  */
2589 void
2590 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2591 {
2592 	cpupart_t	*cpupart;
2593 	lgrp_t		*lgrp_leaf;
2594 	lpl_t		*lpl_leaf;
2595 
2596 	/* called sometimes w/ cpus paused - grab no locks */
2597 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2598 
2599 	cpupart = cp->cpu_part;
2600 	lgrp_leaf = lgrp_table[lgrpid];
2601 
2602 	/* don't add non-existent lgrp */
2603 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2604 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2605 	cp->cpu_lpl = lpl_leaf;
2606 
2607 	/* only leaf lpls contain cpus */
2608 
2609 	if (lpl_leaf->lpl_ncpu++ == 0) {
2610 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2611 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2612 		lpl_leaf_insert(lpl_leaf, cpupart);
2613 	} else {
2614 		/*
2615 		 * the lpl should already exist in the parent, so just update
2616 		 * the count of available CPUs
2617 		 */
2618 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2619 	}
2620 
2621 	/* link cpu into list of cpus in lpl */
2622 
2623 	if (lpl_leaf->lpl_cpus) {
2624 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2625 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2626 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2627 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2628 	} else {
2629 		/*
2630 		 * We increment ncpu immediately after we create a new leaf
2631 		 * lpl, so assert that ncpu == 1 for the case where we don't
2632 		 * have any cpu pointers yet.
2633 		 */
2634 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2635 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2636 	}
2637 
2638 }
2639 
2640 
2641 /*
2642  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2643  *
2644  * The lpl (cpu partition load average information) is now arranged in a
2645  * hierarchical fashion whereby resources that are closest, ie. most local, to
2646  * the cpu in question are considered to be leaves in a tree of resources.
2647  * There are two removal cases in question:
2648  *
2649  * 1. Removal of the resource in the leaf leaves other resources remaining in
2650  * that leaf.  (Another cpu still exists at this level of locality).  In this
2651  * case, the count of available cpus is decremented in all assocated lpls by
2652  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2653  * from the per-cpu lpl list.
2654  *
2655  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2656  * empty)  In this case, all of what has occurred for the first step must take
2657  * place; however, additionally we must remove the lpl structure itself, prune
2658  * out any stranded lpls that do not directly name a leaf resource, and mark the
2659  * cpu partition in question as no longer containing resources from the lgrp of
2660  * the lpl that has been delted.  Cpu-partition changes are handled by this
2661  * method, but the lpl_leaf_remove function deals with the details of pruning
2662  * out the empty lpl and any of its orphaned direct ancestors.
2663  */
2664 void
2665 lgrp_part_del_cpu(cpu_t *cp)
2666 {
2667 	lpl_t		*lpl;
2668 	lpl_t		*leaf_lpl;
2669 	lgrp_t		*lgrp_leaf;
2670 
2671 	/* called sometimes w/ cpus paused - grab no locks */
2672 
2673 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2674 
2675 	lpl = leaf_lpl = cp->cpu_lpl;
2676 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2677 
2678 	/* don't delete a leaf that isn't there */
2679 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2680 
2681 	/* no double-deletes */
2682 	ASSERT(lpl->lpl_ncpu);
2683 	if (--lpl->lpl_ncpu == 0) {
2684 		/*
2685 		 * This was the last cpu in this lgroup for this partition,
2686 		 * clear its bit in the partition's lgroup bitmask
2687 		 */
2688 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2689 
2690 		/* eliminate remaning lpl link pointers in cpu, lpl */
2691 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2692 
2693 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2694 	} else {
2695 
2696 		/* unlink cpu from lists of cpus in lpl */
2697 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2698 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2699 		if (lpl->lpl_cpus == cp) {
2700 			lpl->lpl_cpus = cp->cpu_next_lpl;
2701 		}
2702 
2703 		/*
2704 		 * Update the cpu count in the lpls associated with parent
2705 		 * lgroups.
2706 		 */
2707 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2708 
2709 	}
2710 	/* clear cpu's lpl ptr when we're all done */
2711 	cp->cpu_lpl = NULL;
2712 }
2713 
2714 /*
2715  * Recompute load average for the specified partition/lgrp fragment.
2716  *
2717  * We rely on the fact that this routine is called from the clock thread
2718  * at a point before the clock thread can block (i.e. before its first
2719  * lock request).  Since the clock thread can not be preempted (since it
2720  * runs at highest priority), we know that cpu partitions can not change
2721  * (since doing so would require either the repartition requester or the
2722  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2723  * without grabbing cpu_lock.
2724  */
2725 void
2726 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2727 {
2728 	uint_t		ncpu;
2729 	int64_t		old, new, f;
2730 
2731 	/*
2732 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2733 	 */
2734 	static short expval[] = {
2735 	    0, 3196, 1618, 1083,
2736 	    814, 652, 543, 466,
2737 	    408, 363, 326, 297,
2738 	    272, 251, 233, 218,
2739 	    204, 192, 181, 172,
2740 	    163, 155, 148, 142,
2741 	    136, 130, 125, 121,
2742 	    116, 112, 109, 105
2743 	};
2744 
2745 	/* ASSERT (called from clock level) */
2746 
2747 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2748 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2749 		return;
2750 	}
2751 
2752 	for (;;) {
2753 
2754 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2755 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2756 		else
2757 			f = expval[ncpu];
2758 
2759 		/*
2760 		 * Modify the load average atomically to avoid losing
2761 		 * anticipatory load updates (see lgrp_move_thread()).
2762 		 */
2763 		if (ageflag) {
2764 			/*
2765 			 * We're supposed to both update and age the load.
2766 			 * This happens 10 times/sec. per cpu.  We do a
2767 			 * little hoop-jumping to avoid integer overflow.
2768 			 */
2769 			int64_t		q, r;
2770 
2771 			do {
2772 				old = new = lpl->lpl_loadavg;
2773 				q = (old  >> 16) << 7;
2774 				r = (old  & 0xffff) << 7;
2775 				new += ((long long)(nrcpus - q) * f -
2776 				    ((r * f) >> 16)) >> 7;
2777 
2778 				/*
2779 				 * Check for overflow
2780 				 */
2781 				if (new > LGRP_LOADAVG_MAX)
2782 					new = LGRP_LOADAVG_MAX;
2783 				else if (new < 0)
2784 					new = 0;
2785 			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2786 			    old, new) != old);
2787 		} else {
2788 			/*
2789 			 * We're supposed to update the load, but not age it.
2790 			 * This option is used to update the load (which either
2791 			 * has already been aged in this 1/10 sec. interval or
2792 			 * soon will be) to account for a remotely executing
2793 			 * thread.
2794 			 */
2795 			do {
2796 				old = new = lpl->lpl_loadavg;
2797 				new += f;
2798 				/*
2799 				 * Check for overflow
2800 				 * Underflow not possible here
2801 				 */
2802 				if (new < old)
2803 					new = LGRP_LOADAVG_MAX;
2804 			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2805 			    old, new) != old);
2806 		}
2807 
2808 		/*
2809 		 * Do the same for this lpl's parent
2810 		 */
2811 		if ((lpl = lpl->lpl_parent) == NULL)
2812 			break;
2813 		ncpu = lpl->lpl_ncpu;
2814 	}
2815 }
2816 
2817 /*
2818  * Initialize lpl topology in the target based on topology currently present in
2819  * lpl_bootstrap.
2820  *
2821  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2822  * initialize cp_default list of lpls. Up to this point all topology operations
2823  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2824  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2825  * `target' points to the list of lpls in cp_default and `size' is the size of
2826  * this list.
2827  *
2828  * This function walks the lpl topology in lpl_bootstrap and does for things:
2829  *
2830  * 1) Copies all fields from lpl_bootstrap to the target.
2831  *
2832  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2833  *
2834  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2835  *    instead of lpl_bootstrap.
2836  *
2837  * 4) Updates pointers in the resource list of the target to point to the lpls
2838  *    in the target list instead of lpl_bootstrap.
2839  *
2840  * After lpl_topo_bootstrap() completes, target contains the same information
2841  * that would be present there if it were used during boot instead of
2842  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2843  * and it is bzeroed.
2844  */
2845 void
2846 lpl_topo_bootstrap(lpl_t *target, int size)
2847 {
2848 	lpl_t	*lpl = lpl_bootstrap;
2849 	lpl_t	*target_lpl = target;
2850 	lpl_t	**rset;
2851 	int	*id2rset;
2852 	int	sz;
2853 	int	howmany;
2854 	int	id;
2855 	int	i;
2856 
2857 	/*
2858 	 * The only target that should be passed here is cp_default lpl list.
2859 	 */
2860 	ASSERT(target == cp_default.cp_lgrploads);
2861 	ASSERT(size == cp_default.cp_nlgrploads);
2862 	ASSERT(!lgrp_topo_initialized);
2863 	ASSERT(ncpus == 1);
2864 
2865 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2866 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2867 		/*
2868 		 * Copy all fields from lpl, except for the rset,
2869 		 * lgrp id <=> rset mapping storage,
2870 		 * and amount of storage
2871 		 */
2872 		rset = target_lpl->lpl_rset;
2873 		id2rset = target_lpl->lpl_id2rset;
2874 		sz = target_lpl->lpl_rset_sz;
2875 
2876 		*target_lpl = *lpl;
2877 
2878 		target_lpl->lpl_rset_sz = sz;
2879 		target_lpl->lpl_rset = rset;
2880 		target_lpl->lpl_id2rset = id2rset;
2881 
2882 		/*
2883 		 * Substitute CPU0 lpl pointer with one relative to target.
2884 		 */
2885 		if (lpl->lpl_cpus == CPU) {
2886 			ASSERT(CPU->cpu_lpl == lpl);
2887 			CPU->cpu_lpl = target_lpl;
2888 		}
2889 
2890 		/*
2891 		 * Substitute parent information with parent relative to target.
2892 		 */
2893 		if (lpl->lpl_parent != NULL)
2894 			target_lpl->lpl_parent = (lpl_t *)
2895 			    (((uintptr_t)lpl->lpl_parent -
2896 			    (uintptr_t)lpl_bootstrap) +
2897 			    (uintptr_t)target);
2898 
2899 		/*
2900 		 * Walk over resource set substituting pointers relative to
2901 		 * lpl_bootstrap's rset to pointers relative to target's
2902 		 */
2903 		ASSERT(lpl->lpl_nrset <= 1);
2904 
2905 		for (id = 0; id < lpl->lpl_nrset; id++) {
2906 			if (lpl->lpl_rset[id] != NULL) {
2907 				target_lpl->lpl_rset[id] = (lpl_t *)
2908 				    (((uintptr_t)lpl->lpl_rset[id] -
2909 				    (uintptr_t)lpl_bootstrap) +
2910 				    (uintptr_t)target);
2911 			}
2912 			target_lpl->lpl_id2rset[id] =
2913 			    lpl->lpl_id2rset[id];
2914 		}
2915 	}
2916 
2917 	/*
2918 	 * Clean up the bootstrap lpls since we have switched over to the
2919 	 * actual lpl array in the default cpu partition.
2920 	 *
2921 	 * We still need to keep one empty lpl around for newly starting
2922 	 * slave CPUs to reference should they need to make it through the
2923 	 * dispatcher prior to their lgrp/lpl initialization.
2924 	 *
2925 	 * The lpl related dispatcher code has been designed to work properly
2926 	 * (and without extra checks) for this special case of a zero'ed
2927 	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
2928 	 * with lgrpid 0 and an empty resource set. Iteration over the rset
2929 	 * array by the dispatcher is also NULL terminated for this reason.
2930 	 *
2931 	 * This provides the desired behaviour for an uninitialized CPU.
2932 	 * It shouldn't see any other CPU to either dispatch to or steal
2933 	 * from until it is properly initialized.
2934 	 */
2935 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2936 	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
2937 	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
2938 
2939 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
2940 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
2941 }
2942 
2943 /*
2944  * If the lowest load among the lgroups a process' threads are currently
2945  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2946  * expanding the process to a new lgroup.
2947  */
2948 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2949 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2950 
2951 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2952 	((lgrp_expand_proc_thresh) / (ncpu))
2953 
2954 /*
2955  * A process will be expanded to a new lgroup only if the difference between
2956  * the lowest load on the lgroups the process' thread's are currently spread
2957  * across and the lowest load on the other lgroups in the process' partition
2958  * is greater than lgrp_expand_proc_diff.
2959  */
2960 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2961 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2962 
2963 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2964 	((lgrp_expand_proc_diff) / (ncpu))
2965 
2966 /*
2967  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2968  * be present due to impreciseness of the load average decay algorithm.
2969  *
2970  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2971  * tolerance is scaled by the number of cpus in the lgroup just like
2972  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2973  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2974  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2975  */
2976 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2977 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2978 	((lgrp_loadavg_tolerance) / ncpu)
2979 
2980 /*
2981  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2982  * average is above this threshold
2983  */
2984 uint32_t	lgrp_load_thresh = UINT32_MAX;
2985 
2986 /*
2987  * lgrp_choose() will try to skip any lgroups with less memory
2988  * than this free when choosing a home lgroup
2989  */
2990 pgcnt_t	lgrp_mem_free_thresh = 0;
2991 
2992 /*
2993  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2994  * one based on one of the following policies:
2995  * - Random selection
2996  * - Pseudo round robin placement
2997  * - Longest time since a thread was last placed
2998  */
2999 #define	LGRP_CHOOSE_RANDOM	1
3000 #define	LGRP_CHOOSE_RR		2
3001 #define	LGRP_CHOOSE_TIME	3
3002 
3003 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3004 
3005 /*
3006  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3007  * be bound to a CPU or processor set.
3008  *
3009  * Arguments:
3010  *	t		The thread
3011  *	cpupart		The partition the thread belongs to.
3012  *
3013  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3014  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3015  *	 partitions changing out from under us and assumes that given thread is
3016  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3017  *	 disabled, so don't grab any locks because we should never block under
3018  *	 those conditions.
3019  */
3020 lpl_t *
3021 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3022 {
3023 	lgrp_load_t	bestload, bestrload;
3024 	int		lgrpid_offset, lgrp_count;
3025 	lgrp_id_t	lgrpid, lgrpid_start;
3026 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3027 	klgrpset_t	lgrpset;
3028 	proc_t		*p;
3029 
3030 	ASSERT(t != NULL);
3031 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3032 	    THREAD_LOCK_HELD(t));
3033 	ASSERT(cpupart != NULL);
3034 
3035 	p = t->t_procp;
3036 
3037 	/* A process should always be in an active partition */
3038 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3039 
3040 	bestlpl = bestrlpl = NULL;
3041 	bestload = bestrload = LGRP_LOADAVG_MAX;
3042 	lgrpset = cpupart->cp_lgrpset;
3043 
3044 	switch (lgrp_choose_policy) {
3045 	case LGRP_CHOOSE_RR:
3046 		lgrpid = cpupart->cp_lgrp_hint;
3047 		do {
3048 			if (++lgrpid > lgrp_alloc_max)
3049 				lgrpid = 0;
3050 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3051 
3052 		break;
3053 	default:
3054 	case LGRP_CHOOSE_TIME:
3055 	case LGRP_CHOOSE_RANDOM:
3056 		klgrpset_nlgrps(lgrpset, lgrp_count);
3057 		lgrpid_offset =
3058 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3059 		for (lgrpid = 0; ; lgrpid++) {
3060 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3061 				if (--lgrpid_offset == 0)
3062 					break;
3063 			}
3064 		}
3065 		break;
3066 	}
3067 
3068 	lgrpid_start = lgrpid;
3069 
3070 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3071 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3072 
3073 	/*
3074 	 * Use lgroup affinities (if any) to choose best lgroup
3075 	 *
3076 	 * NOTE: Assumes that thread is protected from going away and its
3077 	 *	 lgroup affinities won't change (ie. p_lock, or
3078 	 *	 thread_lock() being held and/or CPUs paused)
3079 	 */
3080 	if (t->t_lgrp_affinity) {
3081 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3082 		if (lpl != NULL)
3083 			return (lpl);
3084 	}
3085 
3086 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3087 
3088 	do {
3089 		pgcnt_t	npgs;
3090 
3091 		/*
3092 		 * Skip any lgroups outside of thread's pset
3093 		 */
3094 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3095 			if (++lgrpid > lgrp_alloc_max)
3096 				lgrpid = 0;	/* wrap the search */
3097 			continue;
3098 		}
3099 
3100 		/*
3101 		 * Skip any non-leaf lgroups
3102 		 */
3103 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3104 			continue;
3105 
3106 		/*
3107 		 * Skip any lgroups without enough free memory
3108 		 * (when threshold set to nonzero positive value)
3109 		 */
3110 		if (lgrp_mem_free_thresh > 0) {
3111 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3112 			if (npgs < lgrp_mem_free_thresh) {
3113 				if (++lgrpid > lgrp_alloc_max)
3114 					lgrpid = 0;	/* wrap the search */
3115 				continue;
3116 			}
3117 		}
3118 
3119 		lpl = &cpupart->cp_lgrploads[lgrpid];
3120 		if (klgrpset_isempty(p->p_lgrpset) ||
3121 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3122 			/*
3123 			 * Either this is a new process or the process already
3124 			 * has threads on this lgrp, so this is a preferred
3125 			 * lgroup for the thread.
3126 			 */
3127 			if (bestlpl == NULL ||
3128 			    lpl_pick(lpl, bestlpl)) {
3129 				bestload = lpl->lpl_loadavg;
3130 				bestlpl = lpl;
3131 			}
3132 		} else {
3133 			/*
3134 			 * The process doesn't have any threads on this lgrp,
3135 			 * but we're willing to consider this lgrp if the load
3136 			 * difference is big enough to justify splitting up
3137 			 * the process' threads.
3138 			 */
3139 			if (bestrlpl == NULL ||
3140 			    lpl_pick(lpl, bestrlpl)) {
3141 				bestrload = lpl->lpl_loadavg;
3142 				bestrlpl = lpl;
3143 			}
3144 		}
3145 		if (++lgrpid > lgrp_alloc_max)
3146 			lgrpid = 0;	/* wrap the search */
3147 	} while (lgrpid != lgrpid_start);
3148 
3149 	/*
3150 	 * Return root lgroup if threshold isn't set to maximum value and
3151 	 * lowest lgroup load average more than a certain threshold
3152 	 */
3153 	if (lgrp_load_thresh != UINT32_MAX &&
3154 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3155 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3156 
3157 	/*
3158 	 * If all the lgroups over which the thread's process is spread are
3159 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3160 	 * the thread on one of the other leaf lgroups in the thread's
3161 	 * partition.
3162 	 */
3163 	if ((bestlpl == NULL) ||
3164 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3165 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3166 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3167 	    bestload))) {
3168 		bestlpl = bestrlpl;
3169 	}
3170 
3171 	if (bestlpl == NULL) {
3172 		/*
3173 		 * No lgroup looked particularly good, but we still
3174 		 * have to pick something. Go with the randomly selected
3175 		 * legal lgroup we started with above.
3176 		 */
3177 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3178 	}
3179 
3180 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3181 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3182 
3183 	ASSERT(bestlpl->lpl_ncpu > 0);
3184 	return (bestlpl);
3185 }
3186 
3187 /*
3188  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3189  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3190  */
3191 static int
3192 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3193 {
3194 	lgrp_load_t	l1, l2;
3195 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3196 
3197 	l1 = lpl1->lpl_loadavg;
3198 	l2 = lpl2->lpl_loadavg;
3199 
3200 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3201 		/* lpl1 is significantly less loaded than lpl2 */
3202 		return (1);
3203 	}
3204 
3205 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3206 	    l1 + tolerance >= l2 && l1 < l2 &&
3207 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3208 		/*
3209 		 * lpl1's load is within the tolerance of lpl2. We're
3210 		 * willing to consider it be to better however if
3211 		 * it has been longer since we last homed a thread there
3212 		 */
3213 		return (1);
3214 	}
3215 
3216 	return (0);
3217 }
3218 
/*
 * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
 * a process that uses text replication changed home lgrp.  This info is used
 * by the segvn asynchronous thread to detect if it needs to recheck what
 * lgrps should be used for text replication.
 */
static uint64_t lgrp_trthr_moves = 0;

/*
 * Return the current value of the text replication migration counter.
 */
uint64_t
lgrp_get_trthr_migrations(void)
{
	uint64_t	nmoves = lgrp_trthr_moves;

	return (nmoves);
}

/*
 * Atomically add incr to the text replication migration counter.
 */
void
lgrp_update_trthr_migrations(uint64_t incr)
{
	atomic_add_64(&lgrp_trthr_moves, incr);
}
3238 
3239 /*
3240  * An LWP is expected to be assigned to an lgroup for at least this long
3241  * for its anticipatory load to be justified.  NOTE that this value should
3242  * not be set extremely huge (say, larger than 100 years), to avoid problems
3243  * with overflow in the calculation that uses it.
3244  */
3245 #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3246 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3247 
3248 /*
3249  * Routine to change a thread's lgroup affiliation.  This routine updates
3250  * the thread's kthread_t struct and its process' proc_t struct to note the
3251  * thread's new lgroup affiliation, and its lgroup affinities.
3252  *
3253  * Note that this is the only routine that modifies a thread's t_lpl field,
3254  * and that adds in or removes anticipatory load.
3255  *
3256  * If the thread is exiting, newlpl is NULL.
3257  *
3258  * Locking:
3259  * The following lock must be held on entry:
3260  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3261  *		doesn't get removed from t's partition
3262  *
3263  * This routine is not allowed to grab any locks, since it may be called
3264  * with cpus paused (such as from cpu_offline).
3265  */
3266 void
3267 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3268 {
3269 	proc_t		*p;
3270 	lpl_t		*lpl, *oldlpl;
3271 	lgrp_id_t	oldid;
3272 	kthread_t	*tp;
3273 	uint_t		ncpu;
3274 	lgrp_load_t	old, new;
3275 
3276 	ASSERT(t);
3277 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3278 	    THREAD_LOCK_HELD(t));
3279 
3280 	/*
3281 	 * If not changing lpls, just return
3282 	 */
3283 	if ((oldlpl = t->t_lpl) == newlpl)
3284 		return;
3285 
3286 	/*
3287 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3288 	 * associated with process 0 rather than with its original process).
3289 	 */
3290 	if (t->t_proc_flag & TP_LWPEXIT) {
3291 		if (newlpl != NULL) {
3292 			t->t_lpl = newlpl;
3293 		}
3294 		return;
3295 	}
3296 
3297 	p = ttoproc(t);
3298 
3299 	/*
3300 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3301 	 * to account for it being moved from its old lgroup.
3302 	 */
3303 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3304 	    (p->p_tlist != NULL)) {
3305 		oldid = oldlpl->lpl_lgrpid;
3306 
3307 		if (newlpl != NULL)
3308 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3309 
3310 		if ((do_lgrpset_delete) &&
3311 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3312 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3313 				/*
3314 				 * Check if a thread other than the thread
3315 				 * that's moving is assigned to the same
3316 				 * lgroup as the thread that's moving.  Note
3317 				 * that we have to compare lgroup IDs, rather
3318 				 * than simply comparing t_lpl's, since the
3319 				 * threads may belong to different partitions
3320 				 * but be assigned to the same lgroup.
3321 				 */
3322 				ASSERT(tp->t_lpl != NULL);
3323 
3324 				if ((tp != t) &&
3325 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3326 					/*
3327 					 * Another thread is assigned to the
3328 					 * same lgroup as the thread that's
3329 					 * moving, p_lgrpset doesn't change.
3330 					 */
3331 					break;
3332 				} else if (tp == p->p_tlist) {
3333 					/*
3334 					 * No other thread is assigned to the
3335 					 * same lgroup as the exiting thread,
3336 					 * clear the lgroup's bit in p_lgrpset.
3337 					 */
3338 					klgrpset_del(p->p_lgrpset, oldid);
3339 					break;
3340 				}
3341 			}
3342 		}
3343 
3344 		/*
3345 		 * If this thread was assigned to its old lgroup for such a
3346 		 * short amount of time that the anticipatory load that was
3347 		 * added on its behalf has aged very little, remove that
3348 		 * anticipatory load.
3349 		 */
3350 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3351 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3352 			lpl = oldlpl;
3353 			for (;;) {
3354 				do {
3355 					old = new = lpl->lpl_loadavg;
3356 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3357 					if (new > old) {
3358 						/*
3359 						 * this can happen if the load
3360 						 * average was aged since we
3361 						 * added in the anticipatory
3362 						 * load
3363 						 */
3364 						new = 0;
3365 					}
3366 				} while (atomic_cas_32(
3367 				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
3368 				    new) != old);
3369 
3370 				lpl = lpl->lpl_parent;
3371 				if (lpl == NULL)
3372 					break;
3373 
3374 				ncpu = lpl->lpl_ncpu;
3375 				ASSERT(ncpu > 0);
3376 			}
3377 		}
3378 	}
3379 	/*
3380 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3381 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3382 	 * to its new lgroup to account for its move to its new lgroup.
3383 	 */
3384 	if (newlpl != NULL) {
3385 		/*
3386 		 * This thread is moving to a new lgroup
3387 		 */
3388 		t->t_lpl = newlpl;
3389 		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
3390 			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
3391 			membar_producer();
3392 			if (p->p_tr_lgrpid != LGRP_NONE &&
3393 			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
3394 				lgrp_update_trthr_migrations(1);
3395 			}
3396 		}
3397 
3398 		/*
3399 		 * Reflect move in load average of new lgroup
3400 		 * unless it is root lgroup
3401 		 */
3402 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3403 			return;
3404 
3405 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3406 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3407 		}
3408 
3409 		/*
3410 		 * It'll take some time for the load on the new lgroup
3411 		 * to reflect this thread's placement on it.  We'd
3412 		 * like not, however, to have all threads between now
3413 		 * and then also piling on to this lgroup.  To avoid
3414 		 * this pileup, we anticipate the load this thread
3415 		 * will generate on its new lgroup.  The goal is to
3416 		 * make the lgroup's load appear as though the thread
3417 		 * had been there all along.  We're very conservative
3418 		 * in calculating this anticipatory load, we assume
3419 		 * the worst case case (100% CPU-bound thread).  This
3420 		 * may be modified in the future to be more accurate.
3421 		 */
3422 		lpl = newlpl;
3423 		for (;;) {
3424 			ncpu = lpl->lpl_ncpu;
3425 			ASSERT(ncpu > 0);
3426 			do {
3427 				old = new = lpl->lpl_loadavg;
3428 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3429 				/*
3430 				 * Check for overflow
3431 				 * Underflow not possible here
3432 				 */
3433 				if (new < old)
3434 					new = UINT32_MAX;
3435 			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
3436 			    old, new) != old);
3437 
3438 			lpl = lpl->lpl_parent;
3439 			if (lpl == NULL)
3440 				break;
3441 		}
3442 		t->t_anttime = gethrtime();
3443 	}
3444 }
3445 
3446 /*
3447  * Return lgroup memory allocation policy given advice from madvise(3C)
3448  */
3449 lgrp_mem_policy_t
3450 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3451 {
3452 	switch (advice) {
3453 	case MADV_ACCESS_LWP:
3454 		return (LGRP_MEM_POLICY_NEXT);
3455 	case MADV_ACCESS_MANY:
3456 		return (LGRP_MEM_POLICY_RANDOM);
3457 	default:
3458 		return (lgrp_mem_policy_default(size, type));
3459 	}
3460 }
3461 
3462 /*
3463  * Figure out default policy
3464  */
3465 lgrp_mem_policy_t
3466 lgrp_mem_policy_default(size_t size, int type)
3467 {
3468 	cpupart_t		*cp;
3469 	lgrp_mem_policy_t	policy;
3470 	size_t			pset_mem_size;
3471 
3472 	/*
3473 	 * Randomly allocate memory across lgroups for shared memory
3474 	 * beyond a certain threshold
3475 	 */
3476 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3477 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3478 		/*
3479 		 * Get total memory size of current thread's pset
3480 		 */
3481 		kpreempt_disable();
3482 		cp = curthread->t_cpupart;
3483 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3484 		kpreempt_enable();
3485 
3486 		/*
3487 		 * Choose policy to randomly allocate memory across
3488 		 * lgroups in pset if it will fit and is not default
3489 		 * partition.  Otherwise, allocate memory randomly
3490 		 * across machine.
3491 		 */
3492 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3493 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3494 		else
3495 			policy = LGRP_MEM_POLICY_RANDOM;
3496 	} else
3497 		/*
3498 		 * Apply default policy for private memory and
3499 		 * shared memory under the respective random
3500 		 * threshold.
3501 		 */
3502 		policy = lgrp_mem_default_policy;
3503 
3504 	return (policy);
3505 }
3506 
3507 /*
3508  * Get memory allocation policy for this segment
3509  */
3510 lgrp_mem_policy_info_t *
3511 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3512 {
3513 	lgrp_mem_policy_info_t	*policy_info;
3514 	extern struct seg_ops	segspt_ops;
3515 	extern struct seg_ops	segspt_shmops;
3516 
3517 	/*
3518 	 * This is for binary compatibility to protect against third party
3519 	 * segment drivers which haven't recompiled to allow for
3520 	 * SEGOP_GETPOLICY()
3521 	 */
3522 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3523 	    seg->s_ops != &segspt_shmops)
3524 		return (NULL);
3525 
3526 	policy_info = NULL;
3527 	if (seg->s_ops->getpolicy != NULL)
3528 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3529 
3530 	return (policy_info);
3531 }
3532 
3533 /*
3534  * Set policy for allocating private memory given desired policy, policy info,
3535  * size in bytes of memory that policy is being applied.
3536  * Return 0 if policy wasn't set already and 1 if policy was set already
3537  */
3538 int
3539 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3540     lgrp_mem_policy_info_t *policy_info, size_t size)
3541 {
3542 
3543 	ASSERT(policy_info != NULL);
3544 
3545 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3546 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3547 
3548 	/*
3549 	 * Policy set already?
3550 	 */
3551 	if (policy == policy_info->mem_policy)
3552 		return (1);
3553 
3554 	/*
3555 	 * Set policy
3556 	 */
3557 	policy_info->mem_policy = policy;
3558 	policy_info->mem_lgrpid = LGRP_NONE;
3559 
3560 	return (0);
3561 }
3562 
3563 
3564 /*
3565  * Get shared memory allocation policy with given tree and offset
3566  */
3567 lgrp_mem_policy_info_t *
3568 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3569     u_offset_t vn_off)
3570 {
3571 	u_offset_t		off;
3572 	lgrp_mem_policy_info_t	*policy_info;
3573 	lgrp_shm_policy_seg_t	*policy_seg;
3574 	lgrp_shm_locality_t	*shm_locality;
3575 	avl_tree_t		*tree;
3576 	avl_index_t		where;
3577 
3578 	shm_locality = NULL;
3579 	tree = NULL;
3580 	/*
3581 	 * Get policy segment tree from anon_map or vnode and use specified
3582 	 * anon index or vnode offset as offset
3583 	 *
3584 	 * Assume that no lock needs to be held on anon_map or vnode, since
3585 	 * they should be protected by their reference count which must be
3586 	 * nonzero for an existing segment
3587 	 */
3588 	if (amp) {
3589 		ASSERT(amp->refcnt != 0);
3590 		shm_locality = amp->locality;
3591 		if (shm_locality == NULL)
3592 			return (NULL);
3593 		tree = shm_locality->loc_tree;
3594 		off = ptob(anon_index);
3595 	} else if (vp) {
3596 		shm_locality = vp->v_locality;
3597 		if (shm_locality == NULL)
3598 			return (NULL);
3599 		ASSERT(shm_locality->loc_count != 0);
3600 		tree = shm_locality->loc_tree;
3601 		off = vn_off;
3602 	}
3603 
3604 	if (tree == NULL)
3605 		return (NULL);
3606 
3607 	/*
3608 	 * Lookup policy segment for offset into shared object and return
3609 	 * policy info
3610 	 */
3611 	rw_enter(&shm_locality->loc_lock, RW_READER);
3612 	policy_info = NULL;
3613 	policy_seg = avl_find(tree, &off, &where);
3614 	if (policy_seg)
3615 		policy_info = &policy_seg->shm_policy;
3616 	rw_exit(&shm_locality->loc_lock);
3617 
3618 	return (policy_info);
3619 }
3620 
3621 /*
3622  * Default memory allocation policy for kernel segmap pages
3623  */
3624 lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3625 
3626 /*
3627  * Return lgroup to use for allocating memory
3628  * given the segment and address
3629  *
3630  * There isn't any mutual exclusion that exists between calls
3631  * to this routine and DR, so this routine and whomever calls it
3632  * should be mindful of the possibility that the lgrp returned
3633  * may be deleted. If this happens, dereferences of the lgrp
3634  * pointer will still be safe, but the resources in the lgrp will
3635  * be gone, and LGRP_EXISTS() will no longer be true.
3636  */
3637 lgrp_t *
3638 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3639 {
3640 	int			i;
3641 	lgrp_t			*lgrp;
3642 	klgrpset_t		lgrpset;
3643 	int			lgrps_spanned;
3644 	unsigned long		off;
3645 	lgrp_mem_policy_t	policy;
3646 	lgrp_mem_policy_info_t	*policy_info;
3647 	ushort_t		random;
3648 	int			stat = 0;
3649 	extern struct seg	*segkmap;
3650 
3651 	/*
3652 	 * Just return null if the lgrp framework hasn't finished
3653 	 * initializing or if this is a UMA machine.
3654 	 */
3655 	if (nlgrps == 1 || !lgrp_initialized)
3656 		return (lgrp_root);
3657 
3658 	/*
3659 	 * Get memory allocation policy for this segment
3660 	 */
3661 	policy = lgrp_mem_default_policy;
3662 	if (seg != NULL) {
3663 		if (seg->s_as == &kas) {
3664 			if (seg == segkmap)
3665 				policy = lgrp_segmap_default_policy;
3666 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3667 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3668 				policy = LGRP_MEM_POLICY_RANDOM;
3669 		} else {
3670 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3671 			if (policy_info != NULL) {
3672 				policy = policy_info->mem_policy;
3673 				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3674 					lgrp_id_t id = policy_info->mem_lgrpid;
3675 					ASSERT(id != LGRP_NONE);
3676 					ASSERT(id < NLGRPS_MAX);
3677 					lgrp = lgrp_table[id];
3678 					if (!LGRP_EXISTS(lgrp)) {
3679 						policy = LGRP_MEM_POLICY_NEXT;
3680 					} else {
3681 						lgrp_stat_add(id,
3682 						    LGRP_NUM_NEXT_SEG, 1);
3683 						return (lgrp);
3684 					}
3685 				}
3686 			}
3687 		}
3688 	}
3689 	lgrpset = 0;
3690 
3691 	/*
3692 	 * Initialize lgroup to home by default
3693 	 */
3694 	lgrp = lgrp_home_lgrp();
3695 
3696 	/*
3697 	 * When homing threads on root lgrp, override default memory
3698 	 * allocation policies with root lgroup memory allocation policy
3699 	 */
3700 	if (lgrp == lgrp_root)
3701 		policy = lgrp_mem_policy_root;
3702 
3703 	/*
3704 	 * Implement policy
3705 	 */
3706 	switch (policy) {
3707 	case LGRP_MEM_POLICY_NEXT_CPU:
3708 
3709 		/*
3710 		 * Return lgroup of current CPU which faulted on memory
3711 		 * If the CPU isn't currently in an lgrp, then opt to
3712 		 * allocate from the root.
3713 		 *
3714 		 * Kernel preemption needs to be disabled here to prevent
3715 		 * the current CPU from going away before lgrp is found.
3716 		 */
3717 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3718 			lgrp = lgrp_root;
3719 		} else {
3720 			kpreempt_disable();
3721 			lgrp = lgrp_cpu_to_lgrp(CPU);
3722 			kpreempt_enable();
3723 		}
3724 		break;
3725 
3726 	case LGRP_MEM_POLICY_NEXT:
3727 	case LGRP_MEM_POLICY_DEFAULT:
3728 	default:
3729 
3730 		/*
3731 		 * Just return current thread's home lgroup
3732 		 * for default policy (next touch)
3733 		 * If the thread is homed to the root,
3734 		 * then the default policy is random across lgroups.
3735 		 * Fallthrough to the random case.
3736 		 */
3737 		if (lgrp != lgrp_root) {
3738 			if (policy == LGRP_MEM_POLICY_NEXT)
3739 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3740 			else
3741 				lgrp_stat_add(lgrp->lgrp_id,
3742 				    LGRP_NUM_DEFAULT, 1);
3743 			break;
3744 		}
3745 		/* FALLTHROUGH */
3746 	case LGRP_MEM_POLICY_RANDOM:
3747 
3748 		/*
3749 		 * Return a random leaf lgroup with memory
3750 		 */
3751 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3752 		/*
3753 		 * Count how many lgroups are spanned
3754 		 */
3755 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3756 
3757 		/*
3758 		 * There may be no memnodes in the root lgroup during DR copy
3759 		 * rename on a system with only two boards (memnodes)
3760 		 * configured. In this case just return the root lgrp.
3761 		 */
3762 		if (lgrps_spanned == 0) {
3763 			lgrp = lgrp_root;
3764 			break;
3765 		}
3766 
3767 		/*
3768 		 * Pick a random offset within lgroups spanned
3769 		 * and return lgroup at that offset
3770 		 */
3771 		random = (ushort_t)gethrtime() >> 4;
3772 		off = random % lgrps_spanned;
3773 		ASSERT(off <= lgrp_alloc_max);
3774 
3775 		for (i = 0; i <= lgrp_alloc_max; i++) {
3776 			if (!klgrpset_ismember(lgrpset, i))
3777 				continue;
3778 			if (off)
3779 				off--;
3780 			else {
3781 				lgrp = lgrp_table[i];
3782 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3783 				    1);
3784 				break;
3785 			}
3786 		}
3787 		break;
3788 
3789 	case LGRP_MEM_POLICY_RANDOM_PROC:
3790 
3791 		/*
3792 		 * Grab copy of bitmask of lgroups spanned by
3793 		 * this process
3794 		 */
3795 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3796 		stat = LGRP_NUM_RANDOM_PROC;
3797 
3798 		/* FALLTHROUGH */
3799 	case LGRP_MEM_POLICY_RANDOM_PSET:
3800 
3801 		if (!stat)
3802 			stat = LGRP_NUM_RANDOM_PSET;
3803 
3804 		if (klgrpset_isempty(lgrpset)) {
3805 			/*
3806 			 * Grab copy of bitmask of lgroups spanned by
3807 			 * this processor set
3808 			 */
3809 			kpreempt_disable();
3810 			klgrpset_copy(lgrpset,
3811 			    curthread->t_cpupart->cp_lgrpset);
3812 			kpreempt_enable();
3813 		}
3814 
3815 		/*
3816 		 * Count how many lgroups are spanned
3817 		 */
3818 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3819 		ASSERT(lgrps_spanned <= nlgrps);
3820 
3821 		/*
3822 		 * Probably lgrps_spanned should be always non-zero, but to be
3823 		 * on the safe side we return lgrp_root if it is empty.
3824 		 */
3825 		if (lgrps_spanned == 0) {
3826 			lgrp = lgrp_root;
3827 			break;
3828 		}
3829 
3830 		/*
3831 		 * Pick a random offset within lgroups spanned
3832 		 * and return lgroup at that offset
3833 		 */
3834 		random = (ushort_t)gethrtime() >> 4;
3835 		off = random % lgrps_spanned;
3836 		ASSERT(off <= lgrp_alloc_max);
3837 
3838 		for (i = 0; i <= lgrp_alloc_max; i++) {
3839 			if (!klgrpset_ismember(lgrpset, i))
3840 				continue;
3841 			if (off)
3842 				off--;
3843 			else {
3844 				lgrp = lgrp_table[i];
3845 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3846 				    1);
3847 				break;
3848 			}
3849 		}
3850 		break;
3851 
3852 	case LGRP_MEM_POLICY_ROUNDROBIN:
3853 
3854 		/*
3855 		 * Use offset within segment to determine
3856 		 * offset from home lgroup to choose for
3857 		 * next lgroup to allocate memory from
3858 		 */
3859 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3860 		    (lgrp_alloc_max + 1);
3861 
3862 		kpreempt_disable();
3863 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3864 		i = lgrp->lgrp_id;
3865 		kpreempt_enable();
3866 
3867 		while (off > 0) {
3868 			i = (i + 1) % (lgrp_alloc_max + 1);
3869 			lgrp = lgrp_table[i];
3870 			if (klgrpset_ismember(lgrpset, i))
3871 				off--;
3872 		}
3873 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3874 
3875 		break;
3876 	}
3877 
3878 	ASSERT(lgrp != NULL);
3879 	return (lgrp);
3880 }
3881 
3882 /*
3883  * Return the number of pages in an lgroup
3884  *
3885  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3886  *	 could cause tests that rely on the numat driver to fail....
3887  */
3888 pgcnt_t
3889 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3890 {
3891 	lgrp_t *lgrp;
3892 
3893 	lgrp = lgrp_table[lgrpid];
3894 	if (!LGRP_EXISTS(lgrp) ||
3895 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3896 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3897 		return (0);
3898 
3899 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3900 }
3901 
3902 /*
3903  * Initialize lgroup shared memory allocation policy support
3904  */
3905 void
3906 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3907 {
3908 	lgrp_shm_locality_t	*shm_locality;
3909 
3910 	/*
3911 	 * Initialize locality field in anon_map
3912 	 * Don't need any locks because this is called when anon_map is
3913 	 * allocated, but not used anywhere yet.
3914 	 */
3915 	if (amp) {
3916 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3917 		if (amp->locality == NULL) {
3918 			/*
3919 			 * Allocate and initialize shared memory locality info
3920 			 * and set anon_map locality pointer to it
3921 			 * Drop lock across kmem_alloc(KM_SLEEP)
3922 			 */
3923 			ANON_LOCK_EXIT(&amp->a_rwlock);
3924 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3925 			    KM_SLEEP);
3926 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3927 			    NULL);
3928 			shm_locality->loc_count = 1;	/* not used for amp */
3929 			shm_locality->loc_tree = NULL;
3930 
3931 			/*
3932 			 * Reacquire lock and check to see whether anyone beat
3933 			 * us to initializing the locality info
3934 			 */
3935 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3936 			if (amp->locality != NULL) {
3937 				rw_destroy(&shm_locality->loc_lock);
3938 				kmem_free(shm_locality,
3939 				    sizeof (*shm_locality));
3940 			} else
3941 				amp->locality = shm_locality;
3942 		}
3943 		ANON_LOCK_EXIT(&amp->a_rwlock);
3944 		return;
3945 	}
3946 
3947 	/*
3948 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3949 	 */
3950 	mutex_enter(&vp->v_lock);
3951 	if ((vp->v_flag & V_LOCALITY) == 0) {
3952 		/*
3953 		 * Allocate and initialize shared memory locality info
3954 		 */
3955 		mutex_exit(&vp->v_lock);
3956 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3957 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3958 		shm_locality->loc_count = 1;
3959 		shm_locality->loc_tree = NULL;
3960 
3961 		/*
3962 		 * Point vnode locality field at shared vnode policy info
3963 		 * and set locality aware flag in vnode
3964 		 */
3965 		mutex_enter(&vp->v_lock);
3966 		if ((vp->v_flag & V_LOCALITY) == 0) {
3967 			vp->v_locality = shm_locality;
3968 			vp->v_flag |= V_LOCALITY;
3969 		} else {
3970 			/*
3971 			 * Lost race so free locality info and increment count.
3972 			 */
3973 			rw_destroy(&shm_locality->loc_lock);
3974 			kmem_free(shm_locality, sizeof (*shm_locality));
3975 			shm_locality = vp->v_locality;
3976 			shm_locality->loc_count++;
3977 		}
3978 		mutex_exit(&vp->v_lock);
3979 
3980 		return;
3981 	}
3982 
3983 	/*
3984 	 * Increment reference count of number of segments mapping this vnode
3985 	 * shared
3986 	 */
3987 	shm_locality = vp->v_locality;
3988 	shm_locality->loc_count++;
3989 	mutex_exit(&vp->v_lock);
3990 }
3991 
3992 /*
3993  * Destroy the given shared memory policy segment tree
3994  */
3995 void
3996 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3997 {
3998 	lgrp_shm_policy_seg_t	*cur;
3999 	lgrp_shm_policy_seg_t	*next;
4000 
4001 	if (tree == NULL)
4002 		return;
4003 
4004 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
4005 	while (cur != NULL) {
4006 		next = AVL_NEXT(tree, cur);
4007 		avl_remove(tree, cur);
4008 		kmem_free(cur, sizeof (*cur));
4009 		cur = next;
4010 	}
4011 	kmem_free(tree, sizeof (avl_tree_t));
4012 }
4013 
4014 /*
4015  * Uninitialize lgroup shared memory allocation policy support
4016  */
4017 void
4018 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4019 {
4020 	lgrp_shm_locality_t	*shm_locality;
4021 
4022 	/*
4023 	 * For anon_map, deallocate shared memory policy tree and
4024 	 * zero locality field
4025 	 * Don't need any locks because anon_map is being freed
4026 	 */
4027 	if (amp) {
4028 		if (amp->locality == NULL)
4029 			return;
4030 		shm_locality = amp->locality;
4031 		shm_locality->loc_count = 0;	/* not really used for amp */
4032 		rw_destroy(&shm_locality->loc_lock);
4033 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4034 		kmem_free(shm_locality, sizeof (*shm_locality));
4035 		amp->locality = 0;
4036 		return;
4037 	}
4038 
4039 	/*
4040 	 * For vnode, decrement reference count of segments mapping this vnode
4041 	 * shared and delete locality info if reference count drops to 0
4042 	 */
4043 	mutex_enter(&vp->v_lock);
4044 	shm_locality = vp->v_locality;
4045 	shm_locality->loc_count--;
4046 
4047 	if (shm_locality->loc_count == 0) {
4048 		rw_destroy(&shm_locality->loc_lock);
4049 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4050 		kmem_free(shm_locality, sizeof (*shm_locality));
4051 		vp->v_locality = 0;
4052 		vp->v_flag &= ~V_LOCALITY;
4053 	}
4054 	mutex_exit(&vp->v_lock);
4055 }
4056 
4057 /*
4058  * Compare two shared memory policy segments
4059  * Used by AVL tree code for searching
4060  */
4061 int
4062 lgrp_shm_policy_compar(const void *x, const void *y)
4063 {
4064 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4065 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4066 
4067 	if (a->shm_off < b->shm_off)
4068 		return (-1);
4069 	if (a->shm_off >= b->shm_off + b->shm_size)
4070 		return (1);
4071 	return (0);
4072 }
4073 
4074 /*
4075  * Concatenate seg1 with seg2 and remove seg2
4076  */
4077 static int
4078 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4079     lgrp_shm_policy_seg_t *seg2)
4080 {
4081 	if (!seg1 || !seg2 ||
4082 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4083 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4084 		return (-1);
4085 
4086 	seg1->shm_size += seg2->shm_size;
4087 	avl_remove(tree, seg2);
4088 	kmem_free(seg2, sizeof (*seg2));
4089 	return (0);
4090 }
4091 
4092 /*
4093  * Split segment at given offset and return rightmost (uppermost) segment
4094  * Assumes that there are no overlapping segments
4095  */
4096 static lgrp_shm_policy_seg_t *
4097 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4098     u_offset_t off)
4099 {
4100 	lgrp_shm_policy_seg_t	*newseg;
4101 	avl_index_t		where;
4102 
4103 	ASSERT(seg != NULL);
4104 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4105 
4106 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4107 	    seg->shm_size)
4108 		return (NULL);
4109 
4110 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4111 		return (seg);
4112 
4113 	/*
4114 	 * Adjust size of left segment and allocate new (right) segment
4115 	 */
4116 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4117 	newseg->shm_policy = seg->shm_policy;
4118 	newseg->shm_off = off;
4119 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4120 	seg->shm_size = off - seg->shm_off;
4121 
4122 	/*
4123 	 * Find where to insert new segment in AVL tree and insert it
4124 	 */
4125 	(void) avl_find(tree, &off, &where);
4126 	avl_insert(tree, newseg, where);
4127 
4128 	return (newseg);
4129 }
4130 
4131 /*
4132  * Set shared memory allocation policy on specified shared object at given
4133  * offset and length
4134  *
4135  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4136  * -1 if can't set policy.
4137  */
4138 int
4139 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4140     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4141 {
4142 	u_offset_t		eoff;
4143 	lgrp_shm_policy_seg_t	*next;
4144 	lgrp_shm_policy_seg_t	*newseg;
4145 	u_offset_t		off;
4146 	u_offset_t		oldeoff;
4147 	lgrp_shm_policy_seg_t	*prev;
4148 	int			retval;
4149 	lgrp_shm_policy_seg_t	*seg;
4150 	lgrp_shm_locality_t	*shm_locality;
4151 	avl_tree_t		*tree;
4152 	avl_index_t		where;
4153 
4154 	ASSERT(amp || vp);
4155 	ASSERT((len & PAGEOFFSET) == 0);
4156 
4157 	if (len == 0)
4158 		return (-1);
4159 
4160 	retval = 0;
4161 
4162 	/*
4163 	 * Get locality info and starting offset into shared object
4164 	 * Try anon map first and then vnode
4165 	 * Assume that no locks need to be held on anon_map or vnode, since
4166 	 * it should be protected by its reference count which must be nonzero
4167 	 * for an existing segment.
4168 	 */
4169 	if (amp) {
4170 		/*
4171 		 * Get policy info from anon_map
4172 		 *
4173 		 */
4174 		ASSERT(amp->refcnt != 0);
4175 		if (amp->locality == NULL)
4176 			lgrp_shm_policy_init(amp, NULL);
4177 		shm_locality = amp->locality;
4178 		off = ptob(anon_index);
4179 	} else if (vp) {
4180 		/*
4181 		 * Get policy info from vnode
4182 		 */
4183 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4184 			lgrp_shm_policy_init(NULL, vp);
4185 		shm_locality = vp->v_locality;
4186 		ASSERT(shm_locality->loc_count != 0);
4187 		off = vn_off;
4188 	} else
4189 		return (-1);
4190 
4191 	ASSERT((off & PAGEOFFSET) == 0);
4192 
4193 	/*
4194 	 * Figure out default policy
4195 	 */
4196 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4197 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4198 
4199 	/*
4200 	 * Create AVL tree if there isn't one yet
4201 	 * and set locality field to point at it
4202 	 */
4203 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4204 	tree = shm_locality->loc_tree;
4205 	if (!tree) {
4206 		rw_exit(&shm_locality->loc_lock);
4207 
4208 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4209 
4210 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4211 		if (shm_locality->loc_tree == NULL) {
4212 			avl_create(tree, lgrp_shm_policy_compar,
4213 			    sizeof (lgrp_shm_policy_seg_t),
4214 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4215 			shm_locality->loc_tree = tree;
4216 		} else {
4217 			/*
4218 			 * Another thread managed to set up the tree
4219 			 * before we could. Free the tree we allocated
4220 			 * and use the one that's already there.
4221 			 */
4222 			kmem_free(tree, sizeof (*tree));
4223 			tree = shm_locality->loc_tree;
4224 		}
4225 	}
4226 
4227 	/*
4228 	 * Set policy
4229 	 *
4230 	 * Need to maintain hold on writer's lock to keep tree from
4231 	 * changing out from under us
4232 	 */
4233 	while (len != 0) {
4234 		/*
4235 		 * Find policy segment for specified offset into shared object
4236 		 */
4237 		seg = avl_find(tree, &off, &where);
4238 
4239 		/*
4240 		 * Didn't find any existing segment that contains specified
4241 		 * offset, so allocate new segment, insert it, and concatenate
4242 		 * with adjacent segments if possible
4243 		 */
4244 		if (seg == NULL) {
4245 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4246 			    KM_SLEEP);
4247 			newseg->shm_policy.mem_policy = policy;
4248 			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4249 			newseg->shm_off = off;
4250 			avl_insert(tree, newseg, where);
4251 
4252 			/*
4253 			 * Check to see whether new segment overlaps with next
4254 			 * one, set length of new segment accordingly, and
4255 			 * calculate remaining length and next offset
4256 			 */
4257 			seg = AVL_NEXT(tree, newseg);
4258 			if (seg == NULL || off + len <= seg->shm_off) {
4259 				newseg->shm_size = len;
4260 				len = 0;
4261 			} else {
4262 				newseg->shm_size = seg->shm_off - off;
4263 				off = seg->shm_off;
4264 				len -= newseg->shm_size;
4265 			}
4266 
4267 			/*
4268 			 * Try to concatenate new segment with next and
4269 			 * previous ones, since they might have the same policy
4270 			 * now.  Grab previous and next segments first because
4271 			 * they will change on concatenation.
4272 			 */
4273 			prev =  AVL_PREV(tree, newseg);
4274 			next = AVL_NEXT(tree, newseg);
4275 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4276 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4277 
4278 			continue;
4279 		}
4280 
4281 		eoff = off + len;
4282 		oldeoff = seg->shm_off + seg->shm_size;
4283 
4284 		/*
4285 		 * Policy set already?
4286 		 */
4287 		if (policy == seg->shm_policy.mem_policy) {
4288 			/*
4289 			 * Nothing left to do if offset and length
4290 			 * fall within this segment
4291 			 */
4292 			if (eoff <= oldeoff) {
4293 				retval = 1;
4294 				break;
4295 			} else {
4296 				len = eoff - oldeoff;
4297 				off = oldeoff;
4298 				continue;
4299 			}
4300 		}
4301 
4302 		/*
4303 		 * Specified offset and length match existing segment exactly
4304 		 */
4305 		if (off == seg->shm_off && len == seg->shm_size) {
4306 			/*
4307 			 * Set policy and update current length
4308 			 */
4309 			seg->shm_policy.mem_policy = policy;
4310 			seg->shm_policy.mem_lgrpid = LGRP_NONE;
4311 			len = 0;
4312 
4313 			/*
4314 			 * Try concatenating new segment with previous and next
4315 			 * segments, since they might have the same policy now.
4316 			 * Grab previous and next segments first because they
4317 			 * will change on concatenation.
4318 			 */
4319 			prev =  AVL_PREV(tree, seg);
4320 			next = AVL_NEXT(tree, seg);
4321 			(void) lgrp_shm_policy_concat(tree, seg, next);
4322 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4323 		} else {
4324 			/*
4325 			 * Specified offset and length only apply to part of
4326 			 * existing segment
4327 			 */
4328 
4329 			/*
4330 			 * New segment starts in middle of old one, so split
4331 			 * new one off near beginning of old one
4332 			 */
4333 			newseg = NULL;
4334 			if (off > seg->shm_off) {
4335 				newseg = lgrp_shm_policy_split(tree, seg, off);
4336 
4337 				/*
4338 				 * New segment ends where old one did, so try
4339 				 * to concatenate with next segment
4340 				 */
4341 				if (eoff == oldeoff) {
4342 					newseg->shm_policy.mem_policy = policy;
4343 					newseg->shm_policy.mem_lgrpid =
4344 					    LGRP_NONE;
4345 					(void) lgrp_shm_policy_concat(tree,
4346 					    newseg, AVL_NEXT(tree, newseg));
4347 					break;
4348 				}
4349 			}
4350 
4351 			/*
4352 			 * New segment ends before old one, so split off end of
4353 			 * old one
4354 			 */
4355 			if (eoff < oldeoff) {
4356 				if (newseg) {
4357 					(void) lgrp_shm_policy_split(tree,
4358 					    newseg, eoff);
4359 					newseg->shm_policy.mem_policy = policy;
4360 					newseg->shm_policy.mem_lgrpid =
4361 					    LGRP_NONE;
4362 				} else {
4363 					(void) lgrp_shm_policy_split(tree, seg,
4364 					    eoff);
4365 					seg->shm_policy.mem_policy = policy;
4366 					seg->shm_policy.mem_lgrpid = LGRP_NONE;
4367 				}
4368 
4369 				if (off == seg->shm_off)
4370 					(void) lgrp_shm_policy_concat(tree,
4371 					    AVL_PREV(tree, seg), seg);
4372 				break;
4373 			}
4374 
4375 			/*
4376 			 * Calculate remaining length and next offset
4377 			 */
4378 			len = eoff - oldeoff;
4379 			off = oldeoff;
4380 		}
4381 	}
4382 
4383 	rw_exit(&shm_locality->loc_lock);
4384 	return (retval);
4385 }
4386 
4387 /*
4388  * Return the best memnode from which to allocate memory given
4389  * an lgroup.
4390  *
4391  * "c" is for cookie, which is good enough for me.
4392  * It references a cookie struct that should be zero'ed to initialize.
4393  * The cookie should live on the caller's stack.
4394  *
4395  * The routine returns -1 when:
4396  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4397  *	- traverse is 1, and all the memnodes in the system have been
4398  *	  returned.
4399  */
4400 int
4401 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4402 {
4403 	lgrp_t		*lp = c->lmc_lgrp;
4404 	mnodeset_t	nodes = c->lmc_nodes;
4405 	int		cnt = c->lmc_cnt;
4406 	int		offset, mnode;
4407 
4408 	extern int	max_mem_nodes;
4409 
4410 	/*
4411 	 * If the set is empty, and the caller is willing, traverse
4412 	 * up the hierarchy until we find a non-empty set.
4413 	 */
4414 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4415 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4416 		    ((lp = lp->lgrp_parent) == NULL))
4417 			return (-1);
4418 
4419 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4420 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4421 	}
4422 
4423 	/*
4424 	 * Select a memnode by picking one at a "random" offset.
4425 	 * Because of DR, memnodes can come and go at any time.
4426 	 * This code must be able to cope with the possibility
4427 	 * that the nodes count "cnt" is inconsistent with respect
4428 	 * to the number of elements actually in "nodes", and
4429 	 * therefore that the offset chosen could be greater than
4430 	 * the number of elements in the set (some memnodes may
4431 	 * have dissapeared just before cnt was read).
4432 	 * If this happens, the search simply wraps back to the
4433 	 * beginning of the set.
4434 	 */
4435 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4436 	offset = c->lmc_rand % cnt;
4437 	do {
4438 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4439 			if (nodes & ((mnodeset_t)1 << mnode))
4440 				if (!offset--)
4441 					break;
4442 	} while (mnode >= max_mem_nodes);
4443 
4444 	/* Found a node. Store state before returning. */
4445 	c->lmc_lgrp = lp;
4446 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4447 	c->lmc_cnt = cnt - 1;
4448 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4449 	c->lmc_ntried++;
4450 
4451 	return (mnode);
4452 }
4453