xref: /titanic_51/usr/src/uts/common/os/lgrp.c (revision 791a814c934fcd4deb13b26c1f116ff283272a0d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Basic NUMA support in terms of locality groups
28  *
29  * Solaris needs to know which CPUs, memory, etc. are near each other to
30  * provide good performance on NUMA machines by optimizing for locality.
31  * In order to do this, a new abstraction called a "locality group (lgroup)"
32  * has been introduced to keep track of which CPU-like and memory-like hardware
33  * resources are close to each other.  Currently, latency is the only measure
34  * used to determine how to group hardware resources into lgroups, but this
35  * does not limit the groupings to be based solely on latency.  Other factors
36  * may be used to determine the groupings in the future.
37  *
38  * Lgroups are organized into a hierarchy or topology that represents the
39  * latency topology of the machine.  There is always at least a root lgroup in
40  * the system.  It represents all the hardware resources in the machine at a
41  * latency big enough that any hardware resource can at least access any other
42  * hardware resource within that latency.  A Uniform Memory Access (UMA)
43  * machine is represented with one lgroup (the root).  In contrast, a NUMA
44  * machine is represented at least by the root lgroup and some number of leaf
45  * lgroups where the leaf lgroups contain the hardware resources within the
46  * least latency of each other and the root lgroup still contains all the
47  * resources in the machine.  Some number of intermediate lgroups may exist
48  * which represent more levels of locality than just the local latency of the
49  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
50  * (eg. root and intermediate lgroups) contain the next nearest resources to
51  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
52  * to the root lgroup shows the hardware resources from closest to farthest
53  * from the leaf lgroup such that each successive ancestor lgroup contains
54  * the next nearest resources at the next level of locality from the previous.
55  *
56  * The kernel uses the lgroup abstraction to know how to allocate resources
57  * near a given process/thread.  At fork() and lwp/thread_create() time, a
58  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
59  * with the lowest load average.  Binding to a processor or processor set will
60  * change the home lgroup for a thread.  The scheduler has been modified to try
61  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
62  * allocation is lgroup aware too, so memory will be allocated from the current
63  * thread's home lgroup if possible.  If the desired resources are not
64  * available, the kernel traverses the lgroup hierarchy going to the parent
65  * lgroup to find resources at the next level of locality until it reaches the
66  * root lgroup.
67  */
68 
69 #include <sys/lgrp.h>
70 #include <sys/lgrp_user.h>
71 #include <sys/types.h>
72 #include <sys/mman.h>
73 #include <sys/param.h>
74 #include <sys/var.h>
75 #include <sys/thread.h>
76 #include <sys/cpuvar.h>
77 #include <sys/cpupart.h>
78 #include <sys/kmem.h>
79 #include <vm/seg.h>
80 #include <vm/seg_kmem.h>
81 #include <vm/seg_spt.h>
82 #include <vm/seg_vn.h>
83 #include <vm/as.h>
84 #include <sys/atomic.h>
85 #include <sys/systm.h>
86 #include <sys/errno.h>
87 #include <sys/cmn_err.h>
88 #include <sys/kstat.h>
89 #include <sys/sysmacros.h>
90 #include <sys/pg.h>
91 #include <sys/promif.h>
92 #include <sys/sdt.h>
93 
94 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
95 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
96 				/* indexed by lgrp_id */
97 int	nlgrps;			/* number of lgroups in machine */
98 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
99 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
100 
101 /*
102  * Kstat data for lgroups.
103  *
104  * Actual kstat data is collected in lgrp_stats array.
105  * The lgrp_kstat_data array of named kstats is used to extract data from
106  * lgrp_stats and present it to kstat framework. It is protected from parallel
107  * modifications by lgrp_kstat_mutex. This may cause some contention when
108  * several kstat commands run in parallel but this is not the
109  * performance-critical path.
110  */
111 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
112 
113 /*
114  * Declare kstat names statically for enums as defined in the header file.
115  */
116 LGRP_KSTAT_NAMES;
117 
118 static void	lgrp_kstat_init(void);
119 static int	lgrp_kstat_extract(kstat_t *, int);
120 static void	lgrp_kstat_reset(lgrp_id_t);
121 
122 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
123 static kmutex_t lgrp_kstat_mutex;
124 
125 
126 /*
127  * max number of lgroups supported by the platform
128  */
129 int	nlgrpsmax = 0;
130 
131 /*
132  * The root lgroup. Represents the set of resources at the system wide
133  * level of locality.
134  */
135 lgrp_t		*lgrp_root = NULL;
136 
137 /*
138  * During system bootstrap cp_default does not contain the list of lgrp load
139  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
140  * on-line when cp_default is initialized by cpupart_initialize_default().
141  * Configuring CPU0 may create a two-level topology with root and one leaf node
142  * containing CPU0. This topology is initially constructed in a special
143  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
144  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
145  * for all lpl operations until cp_default is fully constructed.
146  *
147  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
148  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
149  * the first element of lpl_bootstrap_list.
150  *
151  * CPUs that are added to the system, but have not yet been assigned to an
152  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
153  * on some architectures (x86) it's possible for the slave CPU startup thread
154  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
155  */
156 #define	LPL_BOOTSTRAP_SIZE 2
157 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
158 lpl_t		*lpl_bootstrap;
159 static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
160 static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168 
169 static lgrp_t	lroot;
170 
171 /*
172  * Size, in bytes, beyond which random memory allocation policy is applied
173  * to non-shared memory.  Default is the maximum size, so random memory
174  * allocation won't be used for non-shared memory by default.
175  */
176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
177 
178 /* the maximum effect that a single thread can have on its lgroup's load */
179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 	((lgrp_loadavg_max_effect) / (ncpu))
181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 
183 
184 /*
185  * Size, in bytes, beyond which random memory allocation policy is applied to
186  * shared memory.  Default is 8MB (2 ISM pages).
187  */
188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
189 
190 /*
191  * Whether to do processor set aware memory allocation by default
192  */
193 int	lgrp_mem_pset_aware = 0;
194 
195 /*
196  * Set the default memory allocation policy for root lgroup
197  */
198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 
200 /*
201  * Set the default memory allocation policy.  For most platforms,
202  * next touch is sufficient, but some platforms may wish to override
203  * this.
204  */
205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 
207 
208 /*
209  * lgroup CPU event handlers
210  */
211 static void	lgrp_cpu_init(struct cpu *);
212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214 
215 /*
216  * lgroup memory event handlers
217  */
218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 
222 /*
223  * lgroup CPU partition event handlers
224  */
225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 static void	lgrp_part_del_cpu(struct cpu *);
227 
228 static void	lgrp_root_init(void);
229 
230 /*
231  * lpl topology
232  */
233 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234 static void	lpl_clear(lpl_t *);
235 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237 static void	lpl_rset_add(lpl_t *, lpl_t *);
238 static void	lpl_rset_del(lpl_t *, lpl_t *);
239 static int	lpl_rset_contains(lpl_t *, lpl_t *);
240 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241 static void	lpl_child_update(lpl_t *, struct cpupart *);
242 static int	lpl_pick(lpl_t *, lpl_t *);
243 static void	lpl_verify_wrapper(struct cpupart *);
244 
245 /*
246  * defines for lpl topology verifier return codes
247  */
248 
249 #define	LPL_TOPO_CORRECT			0
250 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
251 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
252 #define	LPL_TOPO_LGRP_MISMATCH			-3
253 #define	LPL_TOPO_MISSING_PARENT			-4
254 #define	LPL_TOPO_PARENT_MISMATCH		-5
255 #define	LPL_TOPO_BAD_CPUCNT			-6
256 #define	LPL_TOPO_RSET_MISMATCH			-7
257 #define	LPL_TOPO_LPL_ORPHANED			-8
258 #define	LPL_TOPO_LPL_BAD_NCPU			-9
259 #define	LPL_TOPO_RSET_MSSNG_LF			-10
260 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
261 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-12
262 #define	LPL_TOPO_LGRP_NOT_LEAF			-13
263 #define	LPL_TOPO_BAD_RSETCNT			-14
264 
265 /*
266  * Return whether lgroup optimizations should be enabled on this system
267  */
268 int
269 lgrp_optimizations(void)
270 {
271 	/*
272 	 * System must have more than 2 lgroups to enable lgroup optimizations
273 	 *
274 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
275 	 * with one child lgroup containing all the resources. A 2 lgroup
276 	 * system with a root lgroup directly containing CPUs or memory might
277 	 * need lgroup optimizations with its child lgroup, but there
278 	 * isn't such a machine for now....
279 	 */
280 	if (nlgrps > 2)
281 		return (1);
282 
283 	return (0);
284 }
285 
286 /*
287  * Build full lgroup topology
288  */
289 static void
290 lgrp_root_init(void)
291 {
292 	lgrp_handle_t	hand;
293 	int		i;
294 	lgrp_id_t	id;
295 
296 	/*
297 	 * Create the "root" lgroup
298 	 */
299 	ASSERT(nlgrps == 0);
300 	id = nlgrps++;
301 
302 	lgrp_root = &lroot;
303 
304 	lgrp_root->lgrp_cpu = NULL;
305 	lgrp_root->lgrp_mnodes = 0;
306 	lgrp_root->lgrp_nmnodes = 0;
307 	hand = lgrp_plat_root_hand();
308 	lgrp_root->lgrp_plathand = hand;
309 
310 	lgrp_root->lgrp_id = id;
311 	lgrp_root->lgrp_cpucnt = 0;
312 	lgrp_root->lgrp_childcnt = 0;
313 	klgrpset_clear(lgrp_root->lgrp_children);
314 	klgrpset_clear(lgrp_root->lgrp_leaves);
315 	lgrp_root->lgrp_parent = NULL;
316 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
317 
318 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
319 		klgrpset_clear(lgrp_root->lgrp_set[i]);
320 
321 	lgrp_root->lgrp_kstat = NULL;
322 
323 	lgrp_table[id] = lgrp_root;
324 
325 	/*
326 	 * Setup initial lpl list for CPU0 and initial t0 home.
327 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
328 	 * all topology operations until cp_default is initialized at which
329 	 * point t0.t_lpl will be updated.
330 	 */
331 	lpl_bootstrap = lpl_bootstrap_list;
332 	t0.t_lpl = lpl_bootstrap;
333 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
334 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
335 
336 	/*
337 	 * Set up the bootstrap rset
338 	 * Since the bootstrap toplogy has just the root, and a leaf,
339 	 * the rset contains just the leaf, and both lpls can use the same rset
340 	 */
341 	lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
342 	lpl_bootstrap_list[0].lpl_rset_sz = 1;
343 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
344 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
345 
346 	lpl_bootstrap_list[1].lpl_rset_sz = 1;
347 	lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
348 	lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
349 
350 	cp_default.cp_lgrploads = lpl_bootstrap;
351 }
352 
353 /*
354  * Initialize the lgroup framework and allow the platform to do the same
355  */
356 void
357 lgrp_init(void)
358 {
359 	/*
360 	 * Initialize the platform
361 	 */
362 	lgrp_plat_init();
363 
364 	/*
365 	 * Set max number of lgroups supported on this platform which must be
366 	 * less than the max number of lgroups supported by the common lgroup
367 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
368 	 */
369 	nlgrpsmax = lgrp_plat_max_lgrps();
370 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
371 }
372 
373 /*
374  * Create the root and cpu0's lgroup, and set t0's home.
375  */
376 void
377 lgrp_setup(void)
378 {
379 	/*
380 	 * Setup the root lgroup
381 	 */
382 	lgrp_root_init();
383 
384 	/*
385 	 * Add cpu0 to an lgroup
386 	 */
387 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
388 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
389 }
390 
391 /*
392  * Lgroup initialization is split in two parts. The first part
393  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
394  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
395  * when all CPUs are brought online and all distance information is available.
396  *
397  * When lgrp_main_init() is complete it sets lgrp_initialized. The
398  * lgrp_main_mp_init() sets lgrp_topo_initialized.
399  */
400 
401 /*
402  * true when lgrp initialization has been completed.
403  */
404 int	lgrp_initialized = 0;
405 
406 /*
407  * True when lgrp topology is constructed.
408  */
409 int	lgrp_topo_initialized = 0;
410 
411 /*
412  * Init routine called after startup(), /etc/system has been processed,
413  * and cpu0 has been added to an lgroup.
414  */
415 void
416 lgrp_main_init(void)
417 {
418 	cpu_t		*cp = CPU;
419 	lgrp_id_t	lgrpid;
420 	int		i;
421 	extern void	pg_cpu0_reinit();
422 
423 	/*
424 	 * Enforce a valid lgrp_mem_default_policy
425 	 */
426 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
427 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
428 	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
429 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
430 
431 	/*
432 	 * See if mpo should be disabled.
433 	 * This may happen in the case of null proc LPA on Starcat.
434 	 * The platform won't be able to detect null proc LPA until after
435 	 * cpu0 and memory have already been added to lgroups.
436 	 * When and if it is detected, the Starcat platform will return
437 	 * a different platform handle for cpu0 which is what we check for
438 	 * here. If mpo should be disabled move cpu0 to it's rightful place
439 	 * (the root), and destroy the remaining lgroups. This effectively
440 	 * provides an UMA lgroup topology.
441 	 */
442 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
443 	if (lgrp_table[lgrpid]->lgrp_plathand !=
444 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
445 		lgrp_part_del_cpu(cp);
446 		lgrp_cpu_fini(cp, lgrpid);
447 
448 		lgrp_cpu_init(cp);
449 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
450 
451 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
452 
453 		/*
454 		 * Notify the PG subsystem that the CPU's lgrp
455 		 * association has changed
456 		 */
457 		pg_cpu0_reinit();
458 
459 		/*
460 		 * Destroy all lgroups except for root
461 		 */
462 		for (i = 0; i <= lgrp_alloc_max; i++) {
463 			if (LGRP_EXISTS(lgrp_table[i]) &&
464 			    lgrp_table[i] != lgrp_root)
465 				lgrp_destroy(lgrp_table[i]);
466 		}
467 
468 		/*
469 		 * Fix up root to point at itself for leaves and resources
470 		 * and not have any children
471 		 */
472 		lgrp_root->lgrp_childcnt = 0;
473 		klgrpset_clear(lgrp_root->lgrp_children);
474 		klgrpset_clear(lgrp_root->lgrp_leaves);
475 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
476 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
477 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
478 	}
479 
480 	/*
481 	 * Initialize kstats framework.
482 	 */
483 	lgrp_kstat_init();
484 	/*
485 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
486 	 */
487 	mutex_enter(&cpu_lock);
488 	lgrp_kstat_create(cp);
489 	mutex_exit(&cpu_lock);
490 
491 	lgrp_plat_main_init();
492 	lgrp_initialized = 1;
493 }
494 
495 /*
496  * Finish lgrp initialization after all CPUS are brought on-line.
497  * This routine is called after start_other_cpus().
498  */
499 void
500 lgrp_main_mp_init(void)
501 {
502 	klgrpset_t changed;
503 
504 	/*
505 	 * Update lgroup topology (if necessary)
506 	 */
507 	klgrpset_clear(changed);
508 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
509 	lgrp_topo_initialized = 1;
510 }
511 
512 /*
513  * Change latency of lgroup with specified lgroup platform handle (if one is
514  * given) or change all lgroups with old latency to new latency
515  */
516 void
517 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
518     u_longlong_t newtime)
519 {
520 	lgrp_t		*lgrp;
521 	int		i;
522 
523 	for (i = 0; i <= lgrp_alloc_max; i++) {
524 		lgrp = lgrp_table[i];
525 
526 		if (!LGRP_EXISTS(lgrp))
527 			continue;
528 
529 		if ((hand == LGRP_NULL_HANDLE &&
530 		    lgrp->lgrp_latency == oldtime) ||
531 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
532 			lgrp->lgrp_latency = (int)newtime;
533 	}
534 }
535 
536 /*
537  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
538  */
539 void
540 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
541 {
542 	klgrpset_t	changed;
543 	cpu_t		*cp;
544 	lgrp_id_t	id;
545 	int		rc;
546 
547 	switch (event) {
548 	/*
549 	 * The following (re)configuration events are common code
550 	 * initiated. lgrp_plat_config() is called here to inform the
551 	 * platform of the reconfiguration event.
552 	 */
553 	case LGRP_CONFIG_CPU_ADD:
554 		cp = (cpu_t *)resource;
555 
556 		/*
557 		 * Initialize the new CPU's lgrp related next/prev
558 		 * links, and give it a bootstrap lpl so that it can
559 		 * survive should it need to enter the dispatcher.
560 		 */
561 		cp->cpu_next_lpl = cp;
562 		cp->cpu_prev_lpl = cp;
563 		cp->cpu_next_lgrp = cp;
564 		cp->cpu_prev_lgrp = cp;
565 		cp->cpu_lpl = lpl_bootstrap;
566 
567 		lgrp_plat_config(event, resource);
568 		atomic_add_32(&lgrp_gen, 1);
569 
570 		break;
571 	case LGRP_CONFIG_CPU_DEL:
572 		lgrp_plat_config(event, resource);
573 		atomic_add_32(&lgrp_gen, 1);
574 
575 		break;
576 	case LGRP_CONFIG_CPU_ONLINE:
577 		cp = (cpu_t *)resource;
578 		lgrp_cpu_init(cp);
579 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
580 		rc = lpl_topo_verify(cp->cpu_part);
581 		if (rc != LPL_TOPO_CORRECT) {
582 			panic("lpl_topo_verify failed: %d", rc);
583 		}
584 		lgrp_plat_config(event, resource);
585 		atomic_add_32(&lgrp_gen, 1);
586 
587 		break;
588 	case LGRP_CONFIG_CPU_OFFLINE:
589 		cp = (cpu_t *)resource;
590 		id = cp->cpu_lpl->lpl_lgrpid;
591 		lgrp_part_del_cpu(cp);
592 		lgrp_cpu_fini(cp, id);
593 		rc = lpl_topo_verify(cp->cpu_part);
594 		if (rc != LPL_TOPO_CORRECT) {
595 			panic("lpl_topo_verify failed: %d", rc);
596 		}
597 		lgrp_plat_config(event, resource);
598 		atomic_add_32(&lgrp_gen, 1);
599 
600 		break;
601 	case LGRP_CONFIG_CPUPART_ADD:
602 		cp = (cpu_t *)resource;
603 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
604 		rc = lpl_topo_verify(cp->cpu_part);
605 		if (rc != LPL_TOPO_CORRECT) {
606 			panic("lpl_topo_verify failed: %d", rc);
607 		}
608 		lgrp_plat_config(event, resource);
609 
610 		break;
611 	case LGRP_CONFIG_CPUPART_DEL:
612 		cp = (cpu_t *)resource;
613 		lgrp_part_del_cpu((cpu_t *)resource);
614 		rc = lpl_topo_verify(cp->cpu_part);
615 		if (rc != LPL_TOPO_CORRECT) {
616 			panic("lpl_topo_verify failed: %d", rc);
617 		}
618 		lgrp_plat_config(event, resource);
619 
620 		break;
621 	/*
622 	 * The following events are initiated by the memnode
623 	 * subsystem.
624 	 */
625 	case LGRP_CONFIG_MEM_ADD:
626 		lgrp_mem_init((int)resource, where, B_FALSE);
627 		atomic_add_32(&lgrp_gen, 1);
628 
629 		break;
630 	case LGRP_CONFIG_MEM_DEL:
631 		lgrp_mem_fini((int)resource, where, B_FALSE);
632 		atomic_add_32(&lgrp_gen, 1);
633 
634 		break;
635 	case LGRP_CONFIG_MEM_RENAME: {
636 		lgrp_config_mem_rename_t *ren_arg =
637 		    (lgrp_config_mem_rename_t *)where;
638 
639 		lgrp_mem_rename((int)resource,
640 		    ren_arg->lmem_rename_from,
641 		    ren_arg->lmem_rename_to);
642 		atomic_add_32(&lgrp_gen, 1);
643 
644 		break;
645 	}
646 	case LGRP_CONFIG_GEN_UPDATE:
647 		atomic_add_32(&lgrp_gen, 1);
648 
649 		break;
650 	case LGRP_CONFIG_FLATTEN:
651 		if (where == 0)
652 			lgrp_topo_levels = (int)resource;
653 		else
654 			(void) lgrp_topo_flatten(resource,
655 			    lgrp_table, lgrp_alloc_max, &changed);
656 
657 		break;
658 	/*
659 	 * Update any lgroups with old latency to new latency
660 	 */
661 	case LGRP_CONFIG_LAT_CHANGE_ALL:
662 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
663 		    (u_longlong_t)where);
664 
665 		break;
666 	/*
667 	 * Update lgroup with specified lgroup platform handle to have
668 	 * new latency
669 	 */
670 	case LGRP_CONFIG_LAT_CHANGE:
671 		lgrp_latency_change((lgrp_handle_t)resource, 0,
672 		    (u_longlong_t)where);
673 
674 		break;
675 	case LGRP_CONFIG_NOP:
676 
677 		break;
678 	default:
679 		break;
680 	}
681 
682 }
683 
684 /*
685  * Called to add lgrp info into cpu structure from cpu_add_unit;
686  * do not assume cpu is in cpu[] yet!
687  *
688  * CPUs are brought online with all other CPUs paused so we can't
689  * allocate memory or we could deadlock the system, so we rely on
690  * the platform to statically allocate as much space as we need
691  * for the lgrp structs and stats.
692  */
693 static void
694 lgrp_cpu_init(struct cpu *cp)
695 {
696 	klgrpset_t	changed;
697 	int		count;
698 	lgrp_handle_t	hand;
699 	int		first_cpu;
700 	lgrp_t		*my_lgrp;
701 	lgrp_id_t	lgrpid;
702 	struct cpu	*cptr;
703 
704 	/*
705 	 * This is the first time through if the resource set
706 	 * for the root lgroup is empty. After cpu0 has been
707 	 * initially added to an lgroup, the root's CPU resource
708 	 * set can never be empty, since the system's last CPU
709 	 * cannot be offlined.
710 	 */
711 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
712 		/*
713 		 * First time through.
714 		 */
715 		first_cpu = 1;
716 	} else {
717 		/*
718 		 * If cpu0 needs to move lgroups, we may come
719 		 * through here again, at which time cpu_lock won't
720 		 * be held, and lgrp_initialized will be false.
721 		 */
722 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
723 		ASSERT(cp->cpu_part != NULL);
724 		first_cpu = 0;
725 	}
726 
727 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
728 	my_lgrp = lgrp_hand_to_lgrp(hand);
729 
730 	if (my_lgrp == NULL) {
731 		/*
732 		 * Create new lgrp and add it to lgroup topology
733 		 */
734 		my_lgrp = lgrp_create();
735 		my_lgrp->lgrp_plathand = hand;
736 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
737 		lgrpid = my_lgrp->lgrp_id;
738 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
739 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
740 
741 		count = 0;
742 		klgrpset_clear(changed);
743 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
744 		    &changed);
745 		/*
746 		 * May have added new intermediate lgroups, so need to add
747 		 * resources other than CPUs which are added below
748 		 */
749 		(void) lgrp_mnode_update(changed, NULL);
750 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
751 	    > 0) {
752 		/*
753 		 * Leaf lgroup was created, but latency wasn't available
754 		 * then.  So, set latency for it and fill in rest of lgroup
755 		 * topology  now that we know how far it is from other leaf
756 		 * lgroups.
757 		 */
758 		lgrpid = my_lgrp->lgrp_id;
759 		klgrpset_clear(changed);
760 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
761 		    lgrpid))
762 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
763 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
764 		    &changed);
765 
766 		/*
767 		 * May have added new intermediate lgroups, so need to add
768 		 * resources other than CPUs which are added below
769 		 */
770 		(void) lgrp_mnode_update(changed, NULL);
771 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
772 	    my_lgrp->lgrp_id)) {
773 		int	i;
774 
775 		/*
776 		 * Update existing lgroup and lgroups containing it with CPU
777 		 * resource
778 		 */
779 		lgrpid = my_lgrp->lgrp_id;
780 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
781 		for (i = 0; i <= lgrp_alloc_max; i++) {
782 			lgrp_t		*lgrp;
783 
784 			lgrp = lgrp_table[i];
785 			if (!LGRP_EXISTS(lgrp) ||
786 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
787 				continue;
788 
789 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
790 		}
791 	}
792 
793 	lgrpid = my_lgrp->lgrp_id;
794 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
795 
796 	/*
797 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
798 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
799 	 * not since none of lgroup IDs in the lpl's have been set yet.
800 	 */
801 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
802 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
803 
804 	/*
805 	 * link the CPU into the lgrp's CPU list
806 	 */
807 	if (my_lgrp->lgrp_cpucnt == 0) {
808 		my_lgrp->lgrp_cpu = cp;
809 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
810 	} else {
811 		cptr = my_lgrp->lgrp_cpu;
812 		cp->cpu_next_lgrp = cptr;
813 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
814 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
815 		cptr->cpu_prev_lgrp = cp;
816 	}
817 	my_lgrp->lgrp_cpucnt++;
818 }
819 
820 lgrp_t *
821 lgrp_create(void)
822 {
823 	lgrp_t		*my_lgrp;
824 	lgrp_id_t	lgrpid;
825 	int		i;
826 
827 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
828 
829 	/*
830 	 * Find an open slot in the lgroup table and recycle unused lgroup
831 	 * left there if any
832 	 */
833 	my_lgrp = NULL;
834 	if (lgrp_alloc_hint == -1)
835 		/*
836 		 * Allocate from end when hint not set yet because no lgroups
837 		 * have been deleted yet
838 		 */
839 		lgrpid = nlgrps++;
840 	else {
841 		/*
842 		 * Start looking for next open slot from hint and leave hint
843 		 * at slot allocated
844 		 */
845 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
846 			my_lgrp = lgrp_table[i];
847 			if (!LGRP_EXISTS(my_lgrp)) {
848 				lgrpid = i;
849 				nlgrps++;
850 				break;
851 			}
852 		}
853 		lgrp_alloc_hint = lgrpid;
854 	}
855 
856 	/*
857 	 * Keep track of max lgroup ID allocated so far to cut down on searches
858 	 */
859 	if (lgrpid > lgrp_alloc_max)
860 		lgrp_alloc_max = lgrpid;
861 
862 	/*
863 	 * Need to allocate new lgroup if next open slot didn't have one
864 	 * for recycling
865 	 */
866 	if (my_lgrp == NULL)
867 		my_lgrp = lgrp_plat_alloc(lgrpid);
868 
869 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
870 		panic("Too many lgrps for platform (%d)", nlgrps);
871 
872 	my_lgrp->lgrp_id = lgrpid;
873 	my_lgrp->lgrp_latency = 0;
874 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
875 	my_lgrp->lgrp_parent = NULL;
876 	my_lgrp->lgrp_childcnt = 0;
877 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
878 	my_lgrp->lgrp_nmnodes = 0;
879 	klgrpset_clear(my_lgrp->lgrp_children);
880 	klgrpset_clear(my_lgrp->lgrp_leaves);
881 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
882 		klgrpset_clear(my_lgrp->lgrp_set[i]);
883 
884 	my_lgrp->lgrp_cpu = NULL;
885 	my_lgrp->lgrp_cpucnt = 0;
886 
887 	if (my_lgrp->lgrp_kstat != NULL)
888 		lgrp_kstat_reset(lgrpid);
889 
890 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
891 
892 	return (my_lgrp);
893 }
894 
895 void
896 lgrp_destroy(lgrp_t *lgrp)
897 {
898 	int		i;
899 
900 	/*
901 	 * Unless this lgroup is being destroyed on behalf of
902 	 * the boot CPU, cpu_lock must be held
903 	 */
904 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
905 
906 	if (nlgrps == 1)
907 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
908 
909 	if (!LGRP_EXISTS(lgrp))
910 		return;
911 
912 	/*
913 	 * Set hint to lgroup being deleted and try to keep lower numbered
914 	 * hints to facilitate finding empty slots
915 	 */
916 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
917 		lgrp_alloc_hint = lgrp->lgrp_id;
918 
919 	/*
920 	 * Mark this lgroup to be recycled by setting its lgroup ID to
921 	 * LGRP_NONE and clear relevant fields
922 	 */
923 	lgrp->lgrp_id = LGRP_NONE;
924 	lgrp->lgrp_latency = 0;
925 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
926 	lgrp->lgrp_parent = NULL;
927 	lgrp->lgrp_childcnt = 0;
928 
929 	klgrpset_clear(lgrp->lgrp_children);
930 	klgrpset_clear(lgrp->lgrp_leaves);
931 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
932 		klgrpset_clear(lgrp->lgrp_set[i]);
933 
934 	lgrp->lgrp_mnodes = (mnodeset_t)0;
935 	lgrp->lgrp_nmnodes = 0;
936 
937 	lgrp->lgrp_cpu = NULL;
938 	lgrp->lgrp_cpucnt = 0;
939 
940 	nlgrps--;
941 }
942 
943 /*
944  * Initialize kstat data. Called from lgrp intialization code.
945  */
946 static void
947 lgrp_kstat_init(void)
948 {
949 	lgrp_stat_t	stat;
950 
951 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
952 
953 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
954 		kstat_named_init(&lgrp_kstat_data[stat],
955 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
956 }
957 
958 /*
959  * initialize an lgrp's kstats if needed
960  * called with cpu_lock held but not with cpus paused.
961  * we don't tear these down now because we don't know about
962  * memory leaving the lgrp yet...
963  */
964 
965 void
966 lgrp_kstat_create(cpu_t *cp)
967 {
968 	kstat_t		*lgrp_kstat;
969 	lgrp_id_t	lgrpid;
970 	lgrp_t		*my_lgrp;
971 
972 	ASSERT(MUTEX_HELD(&cpu_lock));
973 
974 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
975 	my_lgrp = lgrp_table[lgrpid];
976 
977 	if (my_lgrp->lgrp_kstat != NULL)
978 		return; /* already initialized */
979 
980 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
981 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
982 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
983 
984 	if (lgrp_kstat != NULL) {
985 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
986 		lgrp_kstat->ks_private = my_lgrp;
987 		lgrp_kstat->ks_data = &lgrp_kstat_data;
988 		lgrp_kstat->ks_update = lgrp_kstat_extract;
989 		my_lgrp->lgrp_kstat = lgrp_kstat;
990 		kstat_install(lgrp_kstat);
991 	}
992 }
993 
994 /*
995  * this will do something when we manage to remove now unused lgrps
996  */
997 
998 /* ARGSUSED */
999 void
1000 lgrp_kstat_destroy(cpu_t *cp)
1001 {
1002 	ASSERT(MUTEX_HELD(&cpu_lock));
1003 }
1004 
1005 /*
1006  * Called when a CPU is off-lined.
1007  */
1008 static void
1009 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1010 {
1011 	lgrp_t *my_lgrp;
1012 	struct cpu *prev;
1013 	struct cpu *next;
1014 
1015 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1016 
1017 	prev = cp->cpu_prev_lgrp;
1018 	next = cp->cpu_next_lgrp;
1019 
1020 	prev->cpu_next_lgrp = next;
1021 	next->cpu_prev_lgrp = prev;
1022 
1023 	/*
1024 	 * just because I'm paranoid doesn't mean...
1025 	 */
1026 
1027 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1028 
1029 	my_lgrp = lgrp_table[lgrpid];
1030 	my_lgrp->lgrp_cpucnt--;
1031 
1032 	/*
1033 	 * Removing last CPU in lgroup, so update lgroup topology
1034 	 */
1035 	if (my_lgrp->lgrp_cpucnt == 0) {
1036 		klgrpset_t	changed;
1037 		int		count;
1038 		int		i;
1039 
1040 		my_lgrp->lgrp_cpu = NULL;
1041 
1042 		/*
1043 		 * Remove this lgroup from its lgroup CPU resources and remove
1044 		 * lgroup from lgroup topology if it doesn't have any more
1045 		 * resources in it now
1046 		 */
1047 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1048 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1049 			count = 0;
1050 			klgrpset_clear(changed);
1051 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1052 			    lgrp_alloc_max + 1, &changed);
1053 			return;
1054 		}
1055 
1056 		/*
1057 		 * This lgroup isn't empty, so just remove it from CPU
1058 		 * resources of any lgroups that contain it as such
1059 		 */
1060 		for (i = 0; i <= lgrp_alloc_max; i++) {
1061 			lgrp_t		*lgrp;
1062 
1063 			lgrp = lgrp_table[i];
1064 			if (!LGRP_EXISTS(lgrp) ||
1065 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1066 			    lgrpid))
1067 				continue;
1068 
1069 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1070 		}
1071 		return;
1072 	}
1073 
1074 	if (my_lgrp->lgrp_cpu == cp)
1075 		my_lgrp->lgrp_cpu = next;
1076 
1077 }
1078 
1079 /*
1080  * Update memory nodes in target lgroups and return ones that get changed
1081  */
1082 int
1083 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1084 {
1085 	int	count;
1086 	int	i;
1087 	int	j;
1088 	lgrp_t	*lgrp;
1089 	lgrp_t	*lgrp_rsrc;
1090 
1091 	count = 0;
1092 	if (changed)
1093 		klgrpset_clear(*changed);
1094 
1095 	if (klgrpset_isempty(target))
1096 		return (0);
1097 
1098 	/*
1099 	 * Find each lgroup in target lgroups
1100 	 */
1101 	for (i = 0; i <= lgrp_alloc_max; i++) {
1102 		/*
1103 		 * Skip any lgroups that don't exist or aren't in target group
1104 		 */
1105 		lgrp = lgrp_table[i];
1106 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1107 			continue;
1108 		}
1109 
1110 		/*
1111 		 * Initialize memnodes for intermediate lgroups to 0
1112 		 * and update them from scratch since they may have completely
1113 		 * changed
1114 		 */
1115 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1116 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1117 			lgrp->lgrp_nmnodes = 0;
1118 		}
1119 
1120 		/*
1121 		 * Update memory nodes of of target lgroup with memory nodes
1122 		 * from each lgroup in its lgroup memory resource set
1123 		 */
1124 		for (j = 0; j <= lgrp_alloc_max; j++) {
1125 			int	k;
1126 
1127 			/*
1128 			 * Skip any lgroups that don't exist or aren't in
1129 			 * memory resources of target lgroup
1130 			 */
1131 			lgrp_rsrc = lgrp_table[j];
1132 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1133 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1134 			    j))
1135 				continue;
1136 
1137 			/*
1138 			 * Update target lgroup's memnodes to include memnodes
1139 			 * of this lgroup
1140 			 */
1141 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1142 				mnodeset_t	mnode_mask;
1143 
1144 				mnode_mask = (mnodeset_t)1 << k;
1145 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1146 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1147 					lgrp->lgrp_mnodes |= mnode_mask;
1148 					lgrp->lgrp_nmnodes++;
1149 				}
1150 			}
1151 			count++;
1152 			if (changed)
1153 				klgrpset_add(*changed, lgrp->lgrp_id);
1154 		}
1155 	}
1156 
1157 	return (count);
1158 }
1159 
1160 /*
1161  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1162  * is moved from one board to another. The "from" and "to" arguments specify the
1163  * source and the destination of the move.
1164  *
1165  * See plat_lgrp_config() for a detailed description of the copy-rename
1166  * semantics.
1167  *
1168  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1169  * the lgroup topology which is changing as memory moves from one lgroup to
1170  * another. It removes the mnode from the source lgroup and re-inserts it in the
1171  * target lgroup.
1172  *
1173  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1174  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1175  * copy-rename operation.
1176  *
1177  * There is one case which requires special handling. If the system contains
1178  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1179  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1180  * lgrp_mem_init), but there is a window when the system has no memory in the
1181  * lgroup hierarchy. If another thread tries to allocate memory during this
1182  * window, the allocation will fail, although the system has physical memory.
1183  * This may cause a system panic or a deadlock (some sleeping memory allocations
1184  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1185  * the mnode back).
1186  *
1187  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1188  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1189  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1190  * but it updates the rest of the lgroup topology as if the mnode was actually
1191  * removed. The lgrp_mem_init() function recognizes that the mnode being
1192  * inserted represents such a special case and updates the topology
1193  * appropriately.
1194  */
1195 void
1196 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1197 {
1198 	/*
1199 	 * Remove the memory from the source node and add it to the destination
1200 	 * node.
1201 	 */
1202 	lgrp_mem_fini(mnode, from, B_TRUE);
1203 	lgrp_mem_init(mnode, to, B_TRUE);
1204 }
1205 
1206 /*
1207  * Called to indicate that the lgrp with platform handle "hand" now
1208  * contains the memory identified by "mnode".
1209  *
1210  * LOCKING for this routine is a bit tricky. Usually it is called without
1211  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1212  * callers. During DR of the board containing the caged memory it may be called
1213  * with cpu_lock already held and CPUs paused.
1214  *
1215  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1216  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1217  * dealing with the special case of DR copy-rename described in
1218  * lgrp_mem_rename().
1219  */
1220 void
1221 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1222 {
1223 	klgrpset_t	changed;
1224 	int		count;
1225 	int		i;
1226 	lgrp_t		*my_lgrp;
1227 	lgrp_id_t	lgrpid;
1228 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1229 	boolean_t	drop_lock = B_FALSE;
1230 	boolean_t	need_synch = B_FALSE;
1231 
1232 	/*
1233 	 * Grab CPU lock (if we haven't already)
1234 	 */
1235 	if (!MUTEX_HELD(&cpu_lock)) {
1236 		mutex_enter(&cpu_lock);
1237 		drop_lock = B_TRUE;
1238 	}
1239 
1240 	/*
1241 	 * This routine may be called from a context where we already
1242 	 * hold cpu_lock, and have already paused cpus.
1243 	 */
1244 	if (!cpus_paused())
1245 		need_synch = B_TRUE;
1246 
1247 	/*
1248 	 * Check if this mnode is already configured and return immediately if
1249 	 * it is.
1250 	 *
1251 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1252 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1253 	 * recognize this case and continue as usual, but skip the update to
1254 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1255 	 * in topology, temporarily introduced by lgrp_mem_fini().
1256 	 */
1257 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1258 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1259 		if (drop_lock)
1260 			mutex_exit(&cpu_lock);
1261 		return;
1262 	}
1263 
1264 	/*
1265 	 * Update lgroup topology with new memory resources, keeping track of
1266 	 * which lgroups change
1267 	 */
1268 	count = 0;
1269 	klgrpset_clear(changed);
1270 	my_lgrp = lgrp_hand_to_lgrp(hand);
1271 	if (my_lgrp == NULL) {
1272 		/* new lgrp */
1273 		my_lgrp = lgrp_create();
1274 		lgrpid = my_lgrp->lgrp_id;
1275 		my_lgrp->lgrp_plathand = hand;
1276 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1277 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1278 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1279 
1280 		if (need_synch)
1281 			pause_cpus(NULL);
1282 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1283 		    &changed);
1284 		if (need_synch)
1285 			start_cpus();
1286 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1287 	    > 0) {
1288 		/*
1289 		 * Leaf lgroup was created, but latency wasn't available
1290 		 * then.  So, set latency for it and fill in rest of lgroup
1291 		 * topology  now that we know how far it is from other leaf
1292 		 * lgroups.
1293 		 */
1294 		klgrpset_clear(changed);
1295 		lgrpid = my_lgrp->lgrp_id;
1296 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1297 		    lgrpid))
1298 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1299 		if (need_synch)
1300 			pause_cpus(NULL);
1301 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1302 		    &changed);
1303 		if (need_synch)
1304 			start_cpus();
1305 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1306 	    my_lgrp->lgrp_id)) {
1307 		/*
1308 		 * Add new lgroup memory resource to existing lgroup
1309 		 */
1310 		lgrpid = my_lgrp->lgrp_id;
1311 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1312 		klgrpset_add(changed, lgrpid);
1313 		count++;
1314 		for (i = 0; i <= lgrp_alloc_max; i++) {
1315 			lgrp_t		*lgrp;
1316 
1317 			lgrp = lgrp_table[i];
1318 			if (!LGRP_EXISTS(lgrp) ||
1319 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1320 				continue;
1321 
1322 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1323 			klgrpset_add(changed, lgrp->lgrp_id);
1324 			count++;
1325 		}
1326 	}
1327 
1328 	/*
1329 	 * Add memory node to lgroup and remove lgroup from ones that need
1330 	 * to be updated
1331 	 */
1332 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1333 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1334 		my_lgrp->lgrp_nmnodes++;
1335 	}
1336 	klgrpset_del(changed, lgrpid);
1337 
1338 	/*
1339 	 * Update memory node information for all lgroups that changed and
1340 	 * contain new memory node as a resource
1341 	 */
1342 	if (count)
1343 		(void) lgrp_mnode_update(changed, NULL);
1344 
1345 	if (drop_lock)
1346 		mutex_exit(&cpu_lock);
1347 }
1348 
1349 /*
1350  * Called to indicate that the lgroup associated with the platform
1351  * handle "hand" no longer contains given memory node
1352  *
1353  * LOCKING for this routine is a bit tricky. Usually it is called without
1354  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1355  * callers. During DR of the board containing the caged memory it may be called
1356  * with cpu_lock already held and CPUs paused.
1357  *
1358  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1359  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1360  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1361  * the same mnode back into the topology. See lgrp_mem_rename() and
1362  * lgrp_mem_init() for additional details.
1363  */
1364 void
1365 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1366 {
1367 	klgrpset_t	changed;
1368 	int		count;
1369 	int		i;
1370 	lgrp_t		*my_lgrp;
1371 	lgrp_id_t	lgrpid;
1372 	mnodeset_t	mnodes_mask;
1373 	boolean_t	drop_lock = B_FALSE;
1374 	boolean_t	need_synch = B_FALSE;
1375 
1376 	/*
1377 	 * Grab CPU lock (if we haven't already)
1378 	 */
1379 	if (!MUTEX_HELD(&cpu_lock)) {
1380 		mutex_enter(&cpu_lock);
1381 		drop_lock = B_TRUE;
1382 	}
1383 
1384 	/*
1385 	 * This routine may be called from a context where we already
1386 	 * hold cpu_lock and have already paused cpus.
1387 	 */
1388 	if (!cpus_paused())
1389 		need_synch = B_TRUE;
1390 
1391 	my_lgrp = lgrp_hand_to_lgrp(hand);
1392 
1393 	/*
1394 	 * The lgrp *must* be pre-existing
1395 	 */
1396 	ASSERT(my_lgrp != NULL);
1397 
1398 	/*
1399 	 * Delete memory node from lgroups which contain it
1400 	 */
1401 	mnodes_mask = ((mnodeset_t)1 << mnode);
1402 	for (i = 0; i <= lgrp_alloc_max; i++) {
1403 		lgrp_t *lgrp = lgrp_table[i];
1404 		/*
1405 		 * Skip any non-existent lgroups and any lgroups that don't
1406 		 * contain leaf lgroup of memory as a memory resource
1407 		 */
1408 		if (!LGRP_EXISTS(lgrp) ||
1409 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1410 			continue;
1411 
1412 		/*
1413 		 * Avoid removing the last mnode from the root in the DR
1414 		 * copy-rename case. See lgrp_mem_rename() for details.
1415 		 */
1416 		if (is_copy_rename &&
1417 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1418 			continue;
1419 
1420 		/*
1421 		 * Remove memory node from lgroup.
1422 		 */
1423 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1424 		lgrp->lgrp_nmnodes--;
1425 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1426 	}
1427 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1428 
1429 	/*
1430 	 * Don't need to update lgroup topology if this lgroup still has memory.
1431 	 *
1432 	 * In the special case of DR copy-rename with the only mnode being
1433 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1434 	 * still need to update the lgroup topology.
1435 	 */
1436 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1437 	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
1438 	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
1439 		if (drop_lock)
1440 			mutex_exit(&cpu_lock);
1441 		return;
1442 	}
1443 
1444 	/*
1445 	 * This lgroup does not contain any memory now
1446 	 */
1447 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1448 
1449 	/*
1450 	 * Remove this lgroup from lgroup topology if it does not contain any
1451 	 * resources now
1452 	 */
1453 	lgrpid = my_lgrp->lgrp_id;
1454 	count = 0;
1455 	klgrpset_clear(changed);
1456 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1457 		/*
1458 		 * Delete lgroup when no more resources
1459 		 */
1460 		if (need_synch)
1461 			pause_cpus(NULL);
1462 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1463 		    lgrp_alloc_max + 1, &changed);
1464 		ASSERT(count > 0);
1465 		if (need_synch)
1466 			start_cpus();
1467 	} else {
1468 		/*
1469 		 * Remove lgroup from memory resources of any lgroups that
1470 		 * contain it as such
1471 		 */
1472 		for (i = 0; i <= lgrp_alloc_max; i++) {
1473 			lgrp_t		*lgrp;
1474 
1475 			lgrp = lgrp_table[i];
1476 			if (!LGRP_EXISTS(lgrp) ||
1477 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1478 			    lgrpid))
1479 				continue;
1480 
1481 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1482 		}
1483 	}
1484 	if (drop_lock)
1485 		mutex_exit(&cpu_lock);
1486 }
1487 
1488 /*
1489  * Return lgroup with given platform handle
1490  */
1491 lgrp_t *
1492 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1493 {
1494 	int	i;
1495 	lgrp_t	*lgrp;
1496 
1497 	if (hand == LGRP_NULL_HANDLE)
1498 		return (NULL);
1499 
1500 	for (i = 0; i <= lgrp_alloc_max; i++) {
1501 		lgrp = lgrp_table[i];
1502 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1503 			return (lgrp);
1504 	}
1505 	return (NULL);
1506 }
1507 
1508 /*
1509  * Return the home lgroup of the current thread.
1510  * We must do this with kernel preemption disabled, since we don't want our
1511  * thread to be re-homed while we're poking around with its lpl, and the lpl
1512  * should never be NULL.
1513  *
1514  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1515  * is enabled because of DR.  Callers can use disable kernel preemption
1516  * around this call to guarantee that the lgroup will be valid beyond this
1517  * routine, since kernel preemption can be recursive.
1518  */
1519 lgrp_t *
1520 lgrp_home_lgrp(void)
1521 {
1522 	lgrp_t	*lgrp;
1523 	lpl_t	*lpl;
1524 
1525 	kpreempt_disable();
1526 
1527 	lpl = curthread->t_lpl;
1528 	ASSERT(lpl != NULL);
1529 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1530 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1531 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1532 
1533 	kpreempt_enable();
1534 
1535 	return (lgrp);
1536 }
1537 
1538 /*
1539  * Return ID of home lgroup for given thread
1540  * (See comments for lgrp_home_lgrp() for special care and handling
1541  * instructions)
1542  */
1543 lgrp_id_t
1544 lgrp_home_id(kthread_t *t)
1545 {
1546 	lgrp_id_t	lgrp;
1547 	lpl_t		*lpl;
1548 
1549 	ASSERT(t != NULL);
1550 	/*
1551 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1552 	 * cannot since the HAT layer can call into this routine to
1553 	 * determine the locality for its data structures in the context
1554 	 * of a page fault.
1555 	 */
1556 
1557 	kpreempt_disable();
1558 
1559 	lpl = t->t_lpl;
1560 	ASSERT(lpl != NULL);
1561 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1562 	lgrp = lpl->lpl_lgrpid;
1563 
1564 	kpreempt_enable();
1565 
1566 	return (lgrp);
1567 }
1568 
1569 /*
1570  * Return lgroup containing the physical memory for the given page frame number
1571  */
1572 lgrp_t *
1573 lgrp_pfn_to_lgrp(pfn_t pfn)
1574 {
1575 	lgrp_handle_t	hand;
1576 	int		i;
1577 	lgrp_t		*lgrp;
1578 
1579 	hand = lgrp_plat_pfn_to_hand(pfn);
1580 	if (hand != LGRP_NULL_HANDLE)
1581 		for (i = 0; i <= lgrp_alloc_max; i++) {
1582 			lgrp = lgrp_table[i];
1583 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1584 				return (lgrp);
1585 		}
1586 	return (NULL);
1587 }
1588 
1589 /*
1590  * Return lgroup containing the physical memory for the given page frame number
1591  */
1592 lgrp_t *
1593 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1594 {
1595 	lgrp_handle_t	hand;
1596 	int		i;
1597 	lgrp_t		*lgrp;
1598 	pfn_t		pfn;
1599 
1600 	pfn = btop(physaddr);
1601 	hand = lgrp_plat_pfn_to_hand(pfn);
1602 	if (hand != LGRP_NULL_HANDLE)
1603 		for (i = 0; i <= lgrp_alloc_max; i++) {
1604 			lgrp = lgrp_table[i];
1605 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1606 				return (lgrp);
1607 		}
1608 	return (NULL);
1609 }
1610 
1611 /*
1612  * Return the leaf lgroup containing the given CPU
1613  *
1614  * The caller needs to take precautions necessary to prevent
1615  * "cpu", and it's lpl from going away across a call to this function.
1616  * hint: kpreempt_disable()/kpreempt_enable()
1617  */
1618 static lgrp_t *
1619 lgrp_cpu_to_lgrp(cpu_t *cpu)
1620 {
1621 	return (cpu->cpu_lpl->lpl_lgrp);
1622 }
1623 
1624 /*
1625  * Return the sum of the partition loads in an lgrp divided by
1626  * the number of CPUs in the lgrp.  This is our best approximation
1627  * of an 'lgroup load average' for a useful per-lgroup kstat.
1628  */
1629 static uint64_t
1630 lgrp_sum_loadavgs(lgrp_t *lgrp)
1631 {
1632 	cpu_t *cpu;
1633 	int ncpu;
1634 	uint64_t loads = 0;
1635 
1636 	mutex_enter(&cpu_lock);
1637 
1638 	cpu = lgrp->lgrp_cpu;
1639 	ncpu = lgrp->lgrp_cpucnt;
1640 
1641 	if (cpu == NULL || ncpu == 0) {
1642 		mutex_exit(&cpu_lock);
1643 		return (0ull);
1644 	}
1645 
1646 	do {
1647 		loads += cpu->cpu_lpl->lpl_loadavg;
1648 		cpu = cpu->cpu_next_lgrp;
1649 	} while (cpu != lgrp->lgrp_cpu);
1650 
1651 	mutex_exit(&cpu_lock);
1652 
1653 	return (loads / ncpu);
1654 }
1655 
1656 void
1657 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1658 {
1659 	struct lgrp_stats *pstats;
1660 
1661 	/*
1662 	 * Verify that the caller isn't trying to add to
1663 	 * a statistic for an lgroup that has gone away
1664 	 */
1665 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1666 		return;
1667 
1668 	pstats = &lgrp_stats[lgrpid];
1669 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1670 }
1671 
1672 int64_t
1673 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1674 {
1675 	uint64_t val;
1676 	struct lgrp_stats *pstats;
1677 
1678 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1679 		return ((int64_t)0);
1680 
1681 	pstats = &lgrp_stats[lgrpid];
1682 	LGRP_STAT_READ(pstats, stat, val);
1683 	return (val);
1684 }
1685 
1686 /*
1687  * Reset all kstats for lgrp specified by its lgrpid.
1688  */
1689 static void
1690 lgrp_kstat_reset(lgrp_id_t lgrpid)
1691 {
1692 	lgrp_stat_t stat;
1693 
1694 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1695 		return;
1696 
1697 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1698 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1699 	}
1700 }
1701 
1702 /*
1703  * Collect all per-lgrp statistics for the lgrp associated with this
1704  * kstat, and store them in the ks_data array.
1705  *
1706  * The superuser can reset all the running counter statistics for an
1707  * lgrp by writing to any of the lgrp's stats.
1708  */
1709 static int
1710 lgrp_kstat_extract(kstat_t *ksp, int rw)
1711 {
1712 	lgrp_stat_t		stat;
1713 	struct kstat_named	*ksd;
1714 	lgrp_t			*lgrp;
1715 	lgrp_id_t		lgrpid;
1716 
1717 	lgrp = (lgrp_t *)ksp->ks_private;
1718 
1719 	ksd = (struct kstat_named *)ksp->ks_data;
1720 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1721 
1722 	lgrpid = lgrp->lgrp_id;
1723 
1724 	if (lgrpid == LGRP_NONE) {
1725 		/*
1726 		 * Return all zeroes as stats for freed lgrp.
1727 		 */
1728 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1729 			ksd[stat].value.i64 = 0;
1730 		}
1731 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1732 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1733 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1734 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1735 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1736 	} else if (rw != KSTAT_WRITE) {
1737 		/*
1738 		 * Handle counter stats
1739 		 */
1740 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1741 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1742 		}
1743 
1744 		/*
1745 		 * Handle kernel data snapshot stats
1746 		 */
1747 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1748 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1749 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1750 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1751 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1752 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1753 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1754 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1755 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1756 		    lgrp_loadavg_max_effect;
1757 	} else {
1758 		lgrp_kstat_reset(lgrpid);
1759 	}
1760 
1761 	return (0);
1762 }
1763 
1764 int
1765 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1766 {
1767 	cpu_t	*cp;
1768 
1769 	mutex_enter(&cpu_lock);
1770 
1771 	if ((cp = cpu_get(id)) == NULL) {
1772 		mutex_exit(&cpu_lock);
1773 		return (EINVAL);
1774 	}
1775 
1776 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1777 		mutex_exit(&cpu_lock);
1778 		return (EINVAL);
1779 	}
1780 
1781 	ASSERT(cp->cpu_lpl != NULL);
1782 
1783 	*lp = cp->cpu_lpl->lpl_lgrpid;
1784 
1785 	mutex_exit(&cpu_lock);
1786 
1787 	return (0);
1788 }
1789 
1790 int
1791 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1792 {
1793 	cpu_t *cp;
1794 
1795 	mutex_enter(&cpu_lock);
1796 
1797 	if ((cp = cpu_get(id)) == NULL) {
1798 		mutex_exit(&cpu_lock);
1799 		return (EINVAL);
1800 	}
1801 
1802 	ASSERT(cp->cpu_lpl != NULL);
1803 
1804 	*lp = cp->cpu_lpl->lpl_loadavg;
1805 
1806 	mutex_exit(&cpu_lock);
1807 
1808 	return (0);
1809 }
1810 
1811 /*
1812  * Add a resource named by lpl_leaf to rset of lpl_target
1813  *
1814  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1815  * resource. It is adjusted here, as this is presently the only place that we
1816  * can be certain a resource addition has succeeded.
1817  *
1818  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1819  * list in order until it reaches a NULL.  (This list is required to be NULL
1820  * terminated, too).  This is done so that we can mark start pos + 1, so that
1821  * each lpl is traversed sequentially, but in a different order.  We hope this
1822  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1823  */
1824 
1825 void
1826 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1827 {
1828 	int		i;
1829 	int		entry_slot = 0;
1830 
1831 	/* return if leaf is already present */
1832 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1833 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1834 			return;
1835 		}
1836 
1837 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1838 		    lpl_leaf->lpl_lgrpid) {
1839 			break;
1840 		}
1841 	}
1842 
1843 	/* insert leaf, update counts */
1844 	entry_slot = i;
1845 	i = lpl_target->lpl_nrset++;
1846 
1847 	/*
1848 	 * Start at the end of the rset array and work backwards towards the
1849 	 * slot into which the new lpl will be inserted. This effectively
1850 	 * preserves the current ordering by scooting everybody over one entry,
1851 	 * and placing the new entry into the space created.
1852 	 */
1853 	while (i-- > entry_slot) {
1854 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1855 		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
1856 		    i + 1;
1857 	}
1858 
1859 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1860 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
1861 
1862 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1863 }
1864 
1865 /*
1866  * Update each of lpl_parent's children with a reference to their parent.
1867  * The lgrp topology is used as the reference since it is fully
1868  * consistent and correct at this point.
1869  * This should be called after any potential change in lpl_parent's
1870  * rset.
1871  */
1872 static void
1873 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1874 {
1875 	klgrpset_t	children;
1876 	int		i;
1877 
1878 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1879 	if (klgrpset_isempty(children))
1880 		return; /* nothing to do */
1881 
1882 	for (i = 0; i <= lgrp_alloc_max; i++) {
1883 		if (klgrpset_ismember(children, i)) {
1884 			/*
1885 			 * (Re)set the parent. It may be incorrect if
1886 			 * lpl_parent is new in the topology.
1887 			 */
1888 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1889 		}
1890 	}
1891 }
1892 
1893 /*
1894  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1895  *
1896  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1897  * resource. The values are adjusted here, as this is the only place that we can
1898  * be certain a resource was successfully deleted.
1899  */
1900 void
1901 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1902 {
1903 	int i;
1904 	lpl_t *leaf;
1905 
1906 	if (lpl_target->lpl_nrset == 0)
1907 		return;
1908 
1909 	/* find leaf in intermediate node */
1910 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1911 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1912 			break;
1913 	}
1914 
1915 	/* return if leaf not found */
1916 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1917 		return;
1918 
1919 	/* prune leaf, compress array */
1920 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1921 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
1922 	lpl_target->lpl_ncpu--;
1923 	do {
1924 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1925 		/*
1926 		 * Update the lgrp id <=> rset mapping
1927 		 */
1928 		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
1929 			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
1930 		}
1931 	} while (i++ < lpl_target->lpl_nrset);
1932 }
1933 
1934 /*
1935  * Check to see if the resource set of the target lpl contains the
1936  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1937  */
1938 
1939 int
1940 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1941 {
1942 	int i;
1943 
1944 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1945 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1946 			return (1);
1947 	}
1948 
1949 	return (0);
1950 }
1951 
1952 /*
1953  * Called when we change cpu lpl membership.  This increments or decrements the
1954  * per-cpu counter in every lpl in which our leaf appears.
1955  */
1956 void
1957 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1958 {
1959 	cpupart_t	*cpupart;
1960 	lgrp_t		*lgrp_leaf;
1961 	lgrp_t		*lgrp_cur;
1962 	lpl_t		*lpl_leaf;
1963 	lpl_t		*lpl_cur;
1964 	int		i;
1965 
1966 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1967 
1968 	cpupart = cp->cpu_part;
1969 	lpl_leaf = cp->cpu_lpl;
1970 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1971 
1972 	for (i = 0; i <= lgrp_alloc_max; i++) {
1973 		lgrp_cur = lgrp_table[i];
1974 
1975 		/*
1976 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1977 		 * for the cpu in question, or if the current lgrp and leaf
1978 		 * don't share the same resources.
1979 		 */
1980 
1981 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1982 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1983 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1984 			continue;
1985 
1986 
1987 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1988 
1989 		if (lpl_cur->lpl_nrset > 0) {
1990 			if (act == LPL_INCREMENT) {
1991 				lpl_cur->lpl_ncpu++;
1992 			} else if (act == LPL_DECREMENT) {
1993 				lpl_cur->lpl_ncpu--;
1994 			}
1995 		}
1996 	}
1997 }
1998 
1999 /*
2000  * Initialize lpl with given resources and specified lgrp
2001  */
2002 void
2003 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2004 {
2005 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2006 	lpl->lpl_loadavg = 0;
2007 	if (lpl == lpl_leaf)
2008 		lpl->lpl_ncpu = 1;
2009 	else
2010 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2011 	lpl->lpl_nrset = 1;
2012 	lpl->lpl_rset[0] = lpl_leaf;
2013 	lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2014 	lpl->lpl_lgrp = lgrp;
2015 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2016 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2017 }
2018 
2019 /*
2020  * Clear an unused lpl
2021  */
2022 void
2023 lpl_clear(lpl_t *lpl)
2024 {
2025 	/*
2026 	 * Clear out all fields in the lpl except:
2027 	 *    lpl_lgrpid - to facilitate debugging
2028 	 *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2029 	 *
2030 	 * Note that the lpl's rset and id2rset mapping are cleared as well.
2031 	 */
2032 	lpl->lpl_loadavg = 0;
2033 	lpl->lpl_ncpu = 0;
2034 	lpl->lpl_lgrp = NULL;
2035 	lpl->lpl_parent = NULL;
2036 	lpl->lpl_cpus = NULL;
2037 	lpl->lpl_nrset = 0;
2038 	lpl->lpl_homed_time = 0;
2039 	bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2040 	bzero(lpl->lpl_id2rset,
2041 	    sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2042 }
2043 
2044 /*
2045  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2046  * is in sync with the lgroup topology in the system.  The lpl topology may not
2047  * make full use of all of the lgroup topology, but this checks to make sure
2048  * that for the parts that it does use, it has correctly understood the
2049  * relationships that exist. This function returns
2050  * 0 if the topology is correct, and a non-zero error code, for non-debug
2051  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2052  * debugging on a DEBUG kernel.
2053  */
2054 int
2055 lpl_topo_verify(cpupart_t *cpupart)
2056 {
2057 	lgrp_t		*lgrp;
2058 	lpl_t		*lpl;
2059 	klgrpset_t	rset;
2060 	klgrpset_t	cset;
2061 	cpu_t		*cpu;
2062 	cpu_t		*cp_start;
2063 	int		i;
2064 	int		j;
2065 	int		sum;
2066 
2067 	/* topology can't be incorrect if it doesn't exist */
2068 	if (!lgrp_topo_initialized || !lgrp_initialized)
2069 		return (LPL_TOPO_CORRECT);
2070 
2071 	ASSERT(cpupart != NULL);
2072 
2073 	for (i = 0; i <= lgrp_alloc_max; i++) {
2074 		lgrp = lgrp_table[i];
2075 		lpl = NULL;
2076 		/* make sure lpls are allocated */
2077 		ASSERT(cpupart->cp_lgrploads);
2078 		if (!cpupart->cp_lgrploads)
2079 			return (LPL_TOPO_PART_HAS_NO_LPL);
2080 
2081 		lpl = &cpupart->cp_lgrploads[i];
2082 		/* make sure our index is good */
2083 		ASSERT(i < cpupart->cp_nlgrploads);
2084 
2085 		/* if lgroup doesn't exist, make sure lpl is empty */
2086 		if (!LGRP_EXISTS(lgrp)) {
2087 			ASSERT(lpl->lpl_ncpu == 0);
2088 			if (lpl->lpl_ncpu > 0) {
2089 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2090 			} else {
2091 				continue;
2092 			}
2093 		}
2094 
2095 		/* verify that lgroup and lpl are identically numbered */
2096 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2097 
2098 		/* if lgroup isn't in our partition, make sure lpl is empty */
2099 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2100 		    cpupart->cp_lgrpset)) {
2101 			ASSERT(lpl->lpl_ncpu == 0);
2102 			if (lpl->lpl_ncpu > 0) {
2103 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2104 			}
2105 			/*
2106 			 * lpl is empty, and lgroup isn't in partition.  verify
2107 			 * that lpl doesn't show up in anyone else's rsets (in
2108 			 * this partition, anyway)
2109 			 */
2110 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2111 				lpl_t *i_lpl; /* lpl we're iterating over */
2112 
2113 				i_lpl = &cpupart->cp_lgrploads[j];
2114 
2115 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2116 				if (lpl_rset_contains(i_lpl, lpl)) {
2117 					return (LPL_TOPO_LPL_ORPHANED);
2118 				}
2119 			}
2120 			/* lgroup is empty, and everything is ok. continue */
2121 			continue;
2122 		}
2123 
2124 
2125 		/* lgroup is in this partition, now check it against lpl */
2126 
2127 		/* do both have matching lgrps? */
2128 		ASSERT(lgrp == lpl->lpl_lgrp);
2129 		if (lgrp != lpl->lpl_lgrp) {
2130 			return (LPL_TOPO_LGRP_MISMATCH);
2131 		}
2132 
2133 		/* do the parent lgroups exist and do they match? */
2134 		if (lgrp->lgrp_parent) {
2135 			ASSERT(lpl->lpl_parent);
2136 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2137 			    lpl->lpl_parent->lpl_lgrpid);
2138 
2139 			if (!lpl->lpl_parent) {
2140 				return (LPL_TOPO_MISSING_PARENT);
2141 			} else if (lgrp->lgrp_parent->lgrp_id !=
2142 			    lpl->lpl_parent->lpl_lgrpid) {
2143 				return (LPL_TOPO_PARENT_MISMATCH);
2144 			}
2145 		}
2146 
2147 		/* only leaf lgroups keep a cpucnt, only check leaves */
2148 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2149 
2150 			/* verify that lgrp is also a leaf */
2151 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2152 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2153 			    lpl->lpl_lgrpid)));
2154 
2155 			if ((lgrp->lgrp_childcnt > 0) ||
2156 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2157 			    lpl->lpl_lgrpid))) {
2158 				return (LPL_TOPO_LGRP_NOT_LEAF);
2159 			}
2160 
2161 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2162 			    (lpl->lpl_ncpu > 0));
2163 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2164 			    (lpl->lpl_ncpu <= 0)) {
2165 				return (LPL_TOPO_BAD_CPUCNT);
2166 			}
2167 
2168 			/*
2169 			 * Check that lpl_ncpu also matches the number of
2170 			 * cpus in the lpl's linked list.  This only exists in
2171 			 * leaves, but they should always match.
2172 			 */
2173 			j = 0;
2174 			cpu = cp_start = lpl->lpl_cpus;
2175 			while (cpu != NULL) {
2176 				j++;
2177 
2178 				/* check to make sure cpu's lpl is leaf lpl */
2179 				ASSERT(cpu->cpu_lpl == lpl);
2180 				if (cpu->cpu_lpl != lpl) {
2181 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2182 				}
2183 
2184 				/* check next cpu */
2185 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2186 					continue;
2187 				} else {
2188 					cpu = NULL;
2189 				}
2190 			}
2191 
2192 			ASSERT(j == lpl->lpl_ncpu);
2193 			if (j != lpl->lpl_ncpu) {
2194 				return (LPL_TOPO_LPL_BAD_NCPU);
2195 			}
2196 
2197 			/*
2198 			 * Also, check that leaf lpl is contained in all
2199 			 * intermediate lpls that name the leaf as a descendant
2200 			 */
2201 			for (j = 0; j <= lgrp_alloc_max; j++) {
2202 				klgrpset_t intersect;
2203 				lgrp_t *lgrp_cand;
2204 				lpl_t *lpl_cand;
2205 
2206 				lgrp_cand = lgrp_table[j];
2207 				intersect = klgrpset_intersects(
2208 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2209 				    cpupart->cp_lgrpset);
2210 
2211 				if (!LGRP_EXISTS(lgrp_cand) ||
2212 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2213 				    cpupart->cp_lgrpset) ||
2214 				    (intersect == 0))
2215 					continue;
2216 
2217 				lpl_cand =
2218 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2219 
2220 				if (klgrpset_ismember(intersect,
2221 				    lgrp->lgrp_id)) {
2222 					ASSERT(lpl_rset_contains(lpl_cand,
2223 					    lpl));
2224 
2225 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2226 						return (LPL_TOPO_RSET_MSSNG_LF);
2227 					}
2228 				}
2229 			}
2230 
2231 		} else { /* non-leaf specific checks */
2232 
2233 			/*
2234 			 * Non-leaf lpls should have lpl_cpus == NULL
2235 			 * verify that this is so
2236 			 */
2237 			ASSERT(lpl->lpl_cpus == NULL);
2238 			if (lpl->lpl_cpus != NULL) {
2239 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2240 			}
2241 
2242 			/*
2243 			 * verify that the sum of the cpus in the leaf resources
2244 			 * is equal to the total ncpu in the intermediate
2245 			 */
2246 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2247 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2248 			}
2249 
2250 			ASSERT(sum == lpl->lpl_ncpu);
2251 			if (sum != lpl->lpl_ncpu) {
2252 				return (LPL_TOPO_LPL_BAD_NCPU);
2253 			}
2254 		}
2255 
2256 		/*
2257 		 * Check the rset of the lpl in question.  Make sure that each
2258 		 * rset contains a subset of the resources in
2259 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2260 		 * sure that each rset doesn't include resources that are
2261 		 * outside of that set.  (Which would be resources somehow not
2262 		 * accounted for).
2263 		 */
2264 		klgrpset_clear(rset);
2265 		for (j = 0; j < lpl->lpl_nrset; j++) {
2266 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2267 		}
2268 		klgrpset_copy(cset, rset);
2269 		/* make sure lpl rset matches lgrp rset */
2270 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2271 		/* make sure rset is contained with in partition, too */
2272 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2273 
2274 		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
2275 		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
2276 			return (LPL_TOPO_RSET_MISMATCH);
2277 		}
2278 
2279 		/*
2280 		 * check to make sure lpl_nrset matches the number of rsets
2281 		 * contained in the lpl
2282 		 */
2283 		for (j = 0; j < lpl->lpl_nrset; j++) {
2284 			if (lpl->lpl_rset[j] == NULL)
2285 				break;
2286 		}
2287 
2288 		ASSERT(j == lpl->lpl_nrset);
2289 		if (j != lpl->lpl_nrset) {
2290 			return (LPL_TOPO_BAD_RSETCNT);
2291 		}
2292 
2293 	}
2294 	return (LPL_TOPO_CORRECT);
2295 }
2296 
2297 /*
2298  * Flatten lpl topology to given number of levels.  This is presently only
2299  * implemented for a flatten to 2 levels, which will prune out the intermediates
2300  * and home the leaf lpls to the root lpl.
2301  */
2302 int
2303 lpl_topo_flatten(int levels)
2304 {
2305 	int		i;
2306 	uint_t		sum;
2307 	lgrp_t		*lgrp_cur;
2308 	lpl_t		*lpl_cur;
2309 	lpl_t		*lpl_root;
2310 	cpupart_t	*cp;
2311 
2312 	if (levels != 2)
2313 		return (0);
2314 
2315 	/* called w/ cpus paused - grab no locks! */
2316 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2317 	    !lgrp_initialized);
2318 
2319 	cp = cp_list_head;
2320 	do {
2321 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2322 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2323 
2324 		for (i = 0; i <= lgrp_alloc_max; i++) {
2325 			lgrp_cur = lgrp_table[i];
2326 			lpl_cur = &cp->cp_lgrploads[i];
2327 
2328 			if ((lgrp_cur == lgrp_root) ||
2329 			    (!LGRP_EXISTS(lgrp_cur) &&
2330 			    (lpl_cur->lpl_ncpu == 0)))
2331 				continue;
2332 
2333 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2334 				/*
2335 				 * this should be a deleted intermediate, so
2336 				 * clear it
2337 				 */
2338 				lpl_clear(lpl_cur);
2339 			} else if ((lpl_cur->lpl_nrset == 1) &&
2340 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2341 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2342 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2343 				/*
2344 				 * this is a leaf whose parent was deleted, or
2345 				 * whose parent had their lgrp deleted.  (And
2346 				 * whose parent will soon be deleted).  Point
2347 				 * this guy back to the root lpl.
2348 				 */
2349 				lpl_cur->lpl_parent = lpl_root;
2350 				lpl_rset_add(lpl_root, lpl_cur);
2351 			}
2352 
2353 		}
2354 
2355 		/*
2356 		 * Now that we're done, make sure the count on the root lpl is
2357 		 * correct, and update the hints of the children for the sake of
2358 		 * thoroughness
2359 		 */
2360 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2361 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2362 		}
2363 		lpl_root->lpl_ncpu = sum;
2364 		lpl_child_update(lpl_root, cp);
2365 
2366 		cp = cp->cp_next;
2367 	} while (cp != cp_list_head);
2368 
2369 	return (levels);
2370 }
2371 
2372 /*
2373  * Insert a lpl into the resource hierarchy and create any additional lpls that
2374  * are necessary to represent the varying states of locality for the cpu
2375  * resoruces newly added to the partition.
2376  *
2377  * This routine is clever enough that it can correctly add resources from the
2378  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2379  * those for which the lpl is a leaf as opposed to simply a named equally local
2380  * resource).  The one special case that needs additional processing is when a
2381  * new intermediate lpl is introduced.  Since the main loop only traverses
2382  * looking to add the leaf resource where it does not yet exist, additional work
2383  * is necessary to add other leaf resources that may need to exist in the newly
2384  * created intermediate.  This is performed by the second inner loop, and is
2385  * only done when the check for more than one overlapping resource succeeds.
2386  */
2387 
2388 void
2389 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2390 {
2391 	int		i;
2392 	int		j;
2393 	int		rset_num_intersect;
2394 	lgrp_t		*lgrp_cur;
2395 	lpl_t		*lpl_cur;
2396 	lpl_t		*lpl_parent;
2397 	lgrp_id_t	parent_id;
2398 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2399 
2400 	for (i = 0; i <= lgrp_alloc_max; i++) {
2401 		lgrp_cur = lgrp_table[i];
2402 
2403 		/*
2404 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2405 		 * contained within the current lgrp, or if the current lgrp has
2406 		 * no leaves in this partition
2407 		 */
2408 
2409 		if (!LGRP_EXISTS(lgrp_cur) ||
2410 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2411 		    lpl_leaf->lpl_lgrpid) ||
2412 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2413 		    cpupart->cp_lgrpset))
2414 			continue;
2415 
2416 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2417 		if (lgrp_cur->lgrp_parent != NULL) {
2418 			/* if lgrp has a parent, assign it properly */
2419 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2420 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2421 		} else {
2422 			/* if not, make sure parent ptr gets set to null */
2423 			lpl_parent = NULL;
2424 		}
2425 
2426 		if (lpl_cur == lpl_leaf) {
2427 			/*
2428 			 * Almost all leaf state was initialized elsewhere.  The
2429 			 * only thing left to do is to set the parent.
2430 			 */
2431 			lpl_cur->lpl_parent = lpl_parent;
2432 			continue;
2433 		}
2434 
2435 		lpl_clear(lpl_cur);
2436 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2437 
2438 		lpl_cur->lpl_parent = lpl_parent;
2439 
2440 		/* does new lpl need to be populated with other resources? */
2441 		rset_intersect =
2442 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2443 		    cpupart->cp_lgrpset);
2444 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2445 
2446 		if (rset_num_intersect > 1) {
2447 			/*
2448 			 * If so, figure out what lpls have resources that
2449 			 * intersect this one, and add them.
2450 			 */
2451 			for (j = 0; j <= lgrp_alloc_max; j++) {
2452 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2453 				lpl_t	*lpl_cand;	/* candidate lpl */
2454 
2455 				lgrp_cand = lgrp_table[j];
2456 				if (!LGRP_EXISTS(lgrp_cand) ||
2457 				    !klgrpset_ismember(rset_intersect,
2458 				    lgrp_cand->lgrp_id))
2459 					continue;
2460 				lpl_cand =
2461 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2462 				lpl_rset_add(lpl_cur, lpl_cand);
2463 			}
2464 		}
2465 		/*
2466 		 * This lpl's rset has changed. Update the hint in it's
2467 		 * children.
2468 		 */
2469 		lpl_child_update(lpl_cur, cpupart);
2470 	}
2471 }
2472 
2473 /*
2474  * remove a lpl from the hierarchy of resources, clearing its state when
2475  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2476  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2477  * delete them as well.
2478  */
2479 
2480 void
2481 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2482 {
2483 	int		i;
2484 	lgrp_t		*lgrp_cur;
2485 	lpl_t		*lpl_cur;
2486 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2487 
2488 	for (i = 0; i <= lgrp_alloc_max; i++) {
2489 		lgrp_cur = lgrp_table[i];
2490 
2491 		/*
2492 		 * Don't attempt to remove from lgrps that aren't there, that
2493 		 * don't contain our leaf, or from the leaf itself. (We do that
2494 		 * later)
2495 		 */
2496 
2497 		if (!LGRP_EXISTS(lgrp_cur))
2498 			continue;
2499 
2500 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2501 
2502 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2503 		    lpl_leaf->lpl_lgrpid) ||
2504 		    (lpl_cur == lpl_leaf)) {
2505 			continue;
2506 		}
2507 
2508 		/*
2509 		 * This is a slightly sleazy simplification in that we have
2510 		 * already marked the cp_lgrpset as no longer containing the
2511 		 * leaf we've deleted.  Any lpls that pass the above checks
2512 		 * based upon lgrp membership but not necessarily cpu-part
2513 		 * membership also get cleared by the checks below.  Currently
2514 		 * this is harmless, as the lpls should be empty anyway.
2515 		 *
2516 		 * In particular, we want to preserve lpls that have additional
2517 		 * leaf resources, even though we don't yet have a processor
2518 		 * architecture that represents resources this way.
2519 		 */
2520 
2521 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2522 		    cpupart->cp_lgrpset);
2523 
2524 		lpl_rset_del(lpl_cur, lpl_leaf);
2525 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2526 			lpl_clear(lpl_cur);
2527 		} else {
2528 			/*
2529 			 * Update this lpl's children
2530 			 */
2531 			lpl_child_update(lpl_cur, cpupart);
2532 		}
2533 	}
2534 	lpl_clear(lpl_leaf);
2535 }
2536 
2537 /*
2538  * add a cpu to a partition in terms of lgrp load avg bookeeping
2539  *
2540  * The lpl (cpu partition load average information) is now arranged in a
2541  * hierarchical fashion whereby resources that are closest, ie. most local, to
2542  * the cpu in question are considered to be leaves in a tree of resources.
2543  * There are two general cases for cpu additon:
2544  *
2545  * 1. A lpl structure that contains resources already in the hierarchy tree.
2546  * In this case, all of the associated lpl relationships have been defined, and
2547  * all that is necessary is that we link the new cpu into the per-lpl list of
2548  * cpus, and increment the ncpu count of all places where this cpu resource will
2549  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2550  * pushing is accomplished by this routine.
2551  *
2552  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2553  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2554  * construct the hierarchy of state necessary to name it's more distant
2555  * resources, if they should exist.  The leaf structure is initialized by this
2556  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2557  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2558  * and builds all of the "ancestoral" state necessary to identify resources at
2559  * differing levels of locality.
2560  */
2561 void
2562 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2563 {
2564 	cpupart_t	*cpupart;
2565 	lgrp_t		*lgrp_leaf;
2566 	lpl_t		*lpl_leaf;
2567 
2568 	/* called sometimes w/ cpus paused - grab no locks */
2569 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2570 
2571 	cpupart = cp->cpu_part;
2572 	lgrp_leaf = lgrp_table[lgrpid];
2573 
2574 	/* don't add non-existent lgrp */
2575 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2576 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2577 	cp->cpu_lpl = lpl_leaf;
2578 
2579 	/* only leaf lpls contain cpus */
2580 
2581 	if (lpl_leaf->lpl_ncpu++ == 0) {
2582 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2583 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2584 		lpl_leaf_insert(lpl_leaf, cpupart);
2585 	} else {
2586 		/*
2587 		 * the lpl should already exist in the parent, so just update
2588 		 * the count of available CPUs
2589 		 */
2590 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2591 	}
2592 
2593 	/* link cpu into list of cpus in lpl */
2594 
2595 	if (lpl_leaf->lpl_cpus) {
2596 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2597 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2598 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2599 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2600 	} else {
2601 		/*
2602 		 * We increment ncpu immediately after we create a new leaf
2603 		 * lpl, so assert that ncpu == 1 for the case where we don't
2604 		 * have any cpu pointers yet.
2605 		 */
2606 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2607 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2608 	}
2609 
2610 }
2611 
2612 
2613 /*
2614  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2615  *
2616  * The lpl (cpu partition load average information) is now arranged in a
2617  * hierarchical fashion whereby resources that are closest, ie. most local, to
2618  * the cpu in question are considered to be leaves in a tree of resources.
2619  * There are two removal cases in question:
2620  *
2621  * 1. Removal of the resource in the leaf leaves other resources remaining in
2622  * that leaf.  (Another cpu still exists at this level of locality).  In this
2623  * case, the count of available cpus is decremented in all assocated lpls by
2624  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2625  * from the per-cpu lpl list.
2626  *
2627  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2628  * empty)  In this case, all of what has occurred for the first step must take
2629  * place; however, additionally we must remove the lpl structure itself, prune
2630  * out any stranded lpls that do not directly name a leaf resource, and mark the
2631  * cpu partition in question as no longer containing resources from the lgrp of
2632  * the lpl that has been delted.  Cpu-partition changes are handled by this
2633  * method, but the lpl_leaf_remove function deals with the details of pruning
2634  * out the empty lpl and any of its orphaned direct ancestors.
2635  */
2636 void
2637 lgrp_part_del_cpu(cpu_t *cp)
2638 {
2639 	lpl_t		*lpl;
2640 	lpl_t		*leaf_lpl;
2641 	lgrp_t		*lgrp_leaf;
2642 
2643 	/* called sometimes w/ cpus paused - grab no locks */
2644 
2645 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2646 
2647 	lpl = leaf_lpl = cp->cpu_lpl;
2648 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2649 
2650 	/* don't delete a leaf that isn't there */
2651 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2652 
2653 	/* no double-deletes */
2654 	ASSERT(lpl->lpl_ncpu);
2655 	if (--lpl->lpl_ncpu == 0) {
2656 		/*
2657 		 * This was the last cpu in this lgroup for this partition,
2658 		 * clear its bit in the partition's lgroup bitmask
2659 		 */
2660 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2661 
2662 		/* eliminate remaning lpl link pointers in cpu, lpl */
2663 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2664 
2665 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2666 	} else {
2667 
2668 		/* unlink cpu from lists of cpus in lpl */
2669 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2670 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2671 		if (lpl->lpl_cpus == cp) {
2672 			lpl->lpl_cpus = cp->cpu_next_lpl;
2673 		}
2674 
2675 		/*
2676 		 * Update the cpu count in the lpls associated with parent
2677 		 * lgroups.
2678 		 */
2679 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2680 
2681 	}
2682 	/* clear cpu's lpl ptr when we're all done */
2683 	cp->cpu_lpl = NULL;
2684 }
2685 
2686 /*
2687  * Recompute load average for the specified partition/lgrp fragment.
2688  *
2689  * We rely on the fact that this routine is called from the clock thread
2690  * at a point before the clock thread can block (i.e. before its first
2691  * lock request).  Since the clock thread can not be preempted (since it
2692  * runs at highest priority), we know that cpu partitions can not change
2693  * (since doing so would require either the repartition requester or the
2694  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2695  * without grabbing cpu_lock.
2696  */
2697 void
2698 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2699 {
2700 	uint_t		ncpu;
2701 	int64_t		old, new, f;
2702 
2703 	/*
2704 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2705 	 */
2706 	static short expval[] = {
2707 	    0, 3196, 1618, 1083,
2708 	    814, 652, 543, 466,
2709 	    408, 363, 326, 297,
2710 	    272, 251, 233, 218,
2711 	    204, 192, 181, 172,
2712 	    163, 155, 148, 142,
2713 	    136, 130, 125, 121,
2714 	    116, 112, 109, 105
2715 	};
2716 
2717 	/* ASSERT (called from clock level) */
2718 
2719 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2720 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2721 		return;
2722 	}
2723 
2724 	for (;;) {
2725 
2726 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2727 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2728 		else
2729 			f = expval[ncpu];
2730 
2731 		/*
2732 		 * Modify the load average atomically to avoid losing
2733 		 * anticipatory load updates (see lgrp_move_thread()).
2734 		 */
2735 		if (ageflag) {
2736 			/*
2737 			 * We're supposed to both update and age the load.
2738 			 * This happens 10 times/sec. per cpu.  We do a
2739 			 * little hoop-jumping to avoid integer overflow.
2740 			 */
2741 			int64_t		q, r;
2742 
2743 			do {
2744 				old = new = lpl->lpl_loadavg;
2745 				q = (old  >> 16) << 7;
2746 				r = (old  & 0xffff) << 7;
2747 				new += ((long long)(nrcpus - q) * f -
2748 				    ((r * f) >> 16)) >> 7;
2749 
2750 				/*
2751 				 * Check for overflow
2752 				 */
2753 				if (new > LGRP_LOADAVG_MAX)
2754 					new = LGRP_LOADAVG_MAX;
2755 				else if (new < 0)
2756 					new = 0;
2757 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2758 			    new) != old);
2759 		} else {
2760 			/*
2761 			 * We're supposed to update the load, but not age it.
2762 			 * This option is used to update the load (which either
2763 			 * has already been aged in this 1/10 sec. interval or
2764 			 * soon will be) to account for a remotely executing
2765 			 * thread.
2766 			 */
2767 			do {
2768 				old = new = lpl->lpl_loadavg;
2769 				new += f;
2770 				/*
2771 				 * Check for overflow
2772 				 * Underflow not possible here
2773 				 */
2774 				if (new < old)
2775 					new = LGRP_LOADAVG_MAX;
2776 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2777 			    new) != old);
2778 		}
2779 
2780 		/*
2781 		 * Do the same for this lpl's parent
2782 		 */
2783 		if ((lpl = lpl->lpl_parent) == NULL)
2784 			break;
2785 		ncpu = lpl->lpl_ncpu;
2786 	}
2787 }
2788 
2789 /*
2790  * Initialize lpl topology in the target based on topology currently present in
2791  * lpl_bootstrap.
2792  *
2793  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2794  * initialize cp_default list of lpls. Up to this point all topology operations
2795  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2796  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2797  * `target' points to the list of lpls in cp_default and `size' is the size of
2798  * this list.
2799  *
2800  * This function walks the lpl topology in lpl_bootstrap and does for things:
2801  *
2802  * 1) Copies all fields from lpl_bootstrap to the target.
2803  *
2804  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2805  *
2806  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2807  *    instead of lpl_bootstrap.
2808  *
2809  * 4) Updates pointers in the resource list of the target to point to the lpls
2810  *    in the target list instead of lpl_bootstrap.
2811  *
2812  * After lpl_topo_bootstrap() completes, target contains the same information
2813  * that would be present there if it were used during boot instead of
2814  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2815  * and it is bzeroed.
2816  */
2817 void
2818 lpl_topo_bootstrap(lpl_t *target, int size)
2819 {
2820 	lpl_t	*lpl = lpl_bootstrap;
2821 	lpl_t	*target_lpl = target;
2822 	lpl_t	**rset;
2823 	int	*id2rset;
2824 	int	sz;
2825 	int	howmany;
2826 	int	id;
2827 	int	i;
2828 
2829 	/*
2830 	 * The only target that should be passed here is cp_default lpl list.
2831 	 */
2832 	ASSERT(target == cp_default.cp_lgrploads);
2833 	ASSERT(size == cp_default.cp_nlgrploads);
2834 	ASSERT(!lgrp_topo_initialized);
2835 	ASSERT(ncpus == 1);
2836 
2837 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2838 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2839 		/*
2840 		 * Copy all fields from lpl, except for the rset,
2841 		 * lgrp id <=> rset mapping storage,
2842 		 * and amount of storage
2843 		 */
2844 		rset = target_lpl->lpl_rset;
2845 		id2rset = target_lpl->lpl_id2rset;
2846 		sz = target_lpl->lpl_rset_sz;
2847 
2848 		*target_lpl = *lpl;
2849 
2850 		target_lpl->lpl_rset_sz = sz;
2851 		target_lpl->lpl_rset = rset;
2852 		target_lpl->lpl_id2rset = id2rset;
2853 
2854 		/*
2855 		 * Substitute CPU0 lpl pointer with one relative to target.
2856 		 */
2857 		if (lpl->lpl_cpus == CPU) {
2858 			ASSERT(CPU->cpu_lpl == lpl);
2859 			CPU->cpu_lpl = target_lpl;
2860 		}
2861 
2862 		/*
2863 		 * Substitute parent information with parent relative to target.
2864 		 */
2865 		if (lpl->lpl_parent != NULL)
2866 			target_lpl->lpl_parent = (lpl_t *)
2867 			    (((uintptr_t)lpl->lpl_parent -
2868 			    (uintptr_t)lpl_bootstrap) +
2869 			    (uintptr_t)target);
2870 
2871 		/*
2872 		 * Walk over resource set substituting pointers relative to
2873 		 * lpl_bootstrap's rset to pointers relative to target's
2874 		 */
2875 		ASSERT(lpl->lpl_nrset <= 1);
2876 
2877 		for (id = 0; id < lpl->lpl_nrset; id++) {
2878 			if (lpl->lpl_rset[id] != NULL) {
2879 				target_lpl->lpl_rset[id] = (lpl_t *)
2880 				    (((uintptr_t)lpl->lpl_rset[id] -
2881 				    (uintptr_t)lpl_bootstrap) +
2882 				    (uintptr_t)target);
2883 			}
2884 			target_lpl->lpl_id2rset[id] =
2885 			    lpl->lpl_id2rset[id];
2886 		}
2887 	}
2888 
2889 	/*
2890 	 * Clean up the bootstrap lpls since we have switched over to the
2891 	 * actual lpl array in the default cpu partition.
2892 	 *
2893 	 * We still need to keep one empty lpl around for newly starting
2894 	 * slave CPUs to reference should they need to make it through the
2895 	 * dispatcher prior to their lgrp/lpl initialization.
2896 	 *
2897 	 * The lpl related dispatcher code has been designed to work properly
2898 	 * (and without extra checks) for this special case of a zero'ed
2899 	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
2900 	 * with lgrpid 0 and an empty resource set. Iteration over the rset
2901 	 * array by the dispatcher is also NULL terminated for this reason.
2902 	 *
2903 	 * This provides the desired behaviour for an uninitialized CPU.
2904 	 * It shouldn't see any other CPU to either dispatch to or steal
2905 	 * from until it is properly initialized.
2906 	 */
2907 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2908 	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
2909 	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
2910 
2911 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
2912 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
2913 }
2914 
2915 /*
2916  * If the lowest load among the lgroups a process' threads are currently
2917  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2918  * expanding the process to a new lgroup.
2919  */
2920 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2921 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2922 
2923 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2924 	((lgrp_expand_proc_thresh) / (ncpu))
2925 
2926 /*
2927  * A process will be expanded to a new lgroup only if the difference between
2928  * the lowest load on the lgroups the process' thread's are currently spread
2929  * across and the lowest load on the other lgroups in the process' partition
2930  * is greater than lgrp_expand_proc_diff.
2931  */
2932 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2933 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2934 
2935 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2936 	((lgrp_expand_proc_diff) / (ncpu))
2937 
2938 /*
2939  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2940  * be present due to impreciseness of the load average decay algorithm.
2941  *
2942  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2943  * tolerance is scaled by the number of cpus in the lgroup just like
2944  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2945  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2946  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2947  */
2948 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
/*
 * lgrp_loadavg_tolerance divided by ncpu.  The argument is parenthesized so
 * that an expression such as `a + b' divides by the whole expression.
 */
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / (ncpu))
2951 
/*
 * When the lowest lgroup load average is above this threshold,
 * lgrp_choose() picks the root lgroup as home.
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;
2957 
2958 /*
2959  * lgrp_choose() will try to skip any lgroups with less memory
2960  * than this free when choosing a home lgroup
2961  */
2962 pgcnt_t	lgrp_mem_free_thresh = 0;
2963 
/*
 * Policy lgrp_choose() applies to pick between similarly loaded lgroups:
 * - LGRP_CHOOSE_RANDOM	random selection
 * - LGRP_CHOOSE_RR	pseudo round robin placement
 * - LGRP_CHOOSE_TIME	longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2976 
2977 /*
2978  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2979  * be bound to a CPU or processor set.
2980  *
2981  * Arguments:
2982  *	t		The thread
2983  *	cpupart		The partition the thread belongs to.
2984  *
2985  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2986  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2987  *	 partitions changing out from under us and assumes that given thread is
2988  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2989  *	 disabled, so don't grab any locks because we should never block under
2990  *	 those conditions.
2991  */
2992 lpl_t *
2993 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2994 {
2995 	lgrp_load_t	bestload, bestrload;
2996 	int		lgrpid_offset, lgrp_count;
2997 	lgrp_id_t	lgrpid, lgrpid_start;
2998 	lpl_t		*lpl, *bestlpl, *bestrlpl;
2999 	klgrpset_t	lgrpset;
3000 	proc_t		*p;
3001 
3002 	ASSERT(t != NULL);
3003 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3004 	    THREAD_LOCK_HELD(t));
3005 	ASSERT(cpupart != NULL);
3006 
3007 	p = t->t_procp;
3008 
3009 	/* A process should always be in an active partition */
3010 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3011 
3012 	bestlpl = bestrlpl = NULL;
3013 	bestload = bestrload = LGRP_LOADAVG_MAX;
3014 	lgrpset = cpupart->cp_lgrpset;
3015 
3016 	switch (lgrp_choose_policy) {
3017 	case LGRP_CHOOSE_RR:
3018 		lgrpid = cpupart->cp_lgrp_hint;
3019 		do {
3020 			if (++lgrpid > lgrp_alloc_max)
3021 				lgrpid = 0;
3022 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3023 
3024 		break;
3025 	default:
3026 	case LGRP_CHOOSE_TIME:
3027 	case LGRP_CHOOSE_RANDOM:
3028 		klgrpset_nlgrps(lgrpset, lgrp_count);
3029 		lgrpid_offset =
3030 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3031 		for (lgrpid = 0; ; lgrpid++) {
3032 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3033 				if (--lgrpid_offset == 0)
3034 					break;
3035 			}
3036 		}
3037 		break;
3038 	}
3039 
3040 	lgrpid_start = lgrpid;
3041 
3042 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3043 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3044 
3045 	/*
3046 	 * Use lgroup affinities (if any) to choose best lgroup
3047 	 *
3048 	 * NOTE: Assumes that thread is protected from going away and its
3049 	 *	 lgroup affinities won't change (ie. p_lock, or
3050 	 *	 thread_lock() being held and/or CPUs paused)
3051 	 */
3052 	if (t->t_lgrp_affinity) {
3053 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3054 		if (lpl != NULL)
3055 			return (lpl);
3056 	}
3057 
3058 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3059 
3060 	do {
3061 		pgcnt_t	npgs;
3062 
3063 		/*
3064 		 * Skip any lgroups outside of thread's pset
3065 		 */
3066 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3067 			if (++lgrpid > lgrp_alloc_max)
3068 				lgrpid = 0;	/* wrap the search */
3069 			continue;
3070 		}
3071 
3072 		/*
3073 		 * Skip any non-leaf lgroups
3074 		 */
3075 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3076 			continue;
3077 
3078 		/*
3079 		 * Skip any lgroups without enough free memory
3080 		 * (when threshold set to nonzero positive value)
3081 		 */
3082 		if (lgrp_mem_free_thresh > 0) {
3083 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3084 			if (npgs < lgrp_mem_free_thresh) {
3085 				if (++lgrpid > lgrp_alloc_max)
3086 					lgrpid = 0;	/* wrap the search */
3087 				continue;
3088 			}
3089 		}
3090 
3091 		lpl = &cpupart->cp_lgrploads[lgrpid];
3092 		if (klgrpset_isempty(p->p_lgrpset) ||
3093 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3094 			/*
3095 			 * Either this is a new process or the process already
3096 			 * has threads on this lgrp, so this is a preferred
3097 			 * lgroup for the thread.
3098 			 */
3099 			if (bestlpl == NULL ||
3100 			    lpl_pick(lpl, bestlpl)) {
3101 				bestload = lpl->lpl_loadavg;
3102 				bestlpl = lpl;
3103 			}
3104 		} else {
3105 			/*
3106 			 * The process doesn't have any threads on this lgrp,
3107 			 * but we're willing to consider this lgrp if the load
3108 			 * difference is big enough to justify splitting up
3109 			 * the process' threads.
3110 			 */
3111 			if (bestrlpl == NULL ||
3112 			    lpl_pick(lpl, bestrlpl)) {
3113 				bestrload = lpl->lpl_loadavg;
3114 				bestrlpl = lpl;
3115 			}
3116 		}
3117 		if (++lgrpid > lgrp_alloc_max)
3118 			lgrpid = 0;	/* wrap the search */
3119 	} while (lgrpid != lgrpid_start);
3120 
3121 	/*
3122 	 * Return root lgroup if threshold isn't set to maximum value and
3123 	 * lowest lgroup load average more than a certain threshold
3124 	 */
3125 	if (lgrp_load_thresh != UINT32_MAX &&
3126 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3127 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3128 
3129 	/*
3130 	 * If all the lgroups over which the thread's process is spread are
3131 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3132 	 * the thread on one of the other leaf lgroups in the thread's
3133 	 * partition.
3134 	 */
3135 	if ((bestlpl == NULL) ||
3136 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3137 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3138 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3139 	    bestload))) {
3140 		bestlpl = bestrlpl;
3141 	}
3142 
3143 	if (bestlpl == NULL) {
3144 		/*
3145 		 * No lgroup looked particularly good, but we still
3146 		 * have to pick something. Go with the randomly selected
3147 		 * legal lgroup we started with above.
3148 		 */
3149 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3150 	}
3151 
3152 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3153 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3154 
3155 	ASSERT(bestlpl->lpl_ncpu > 0);
3156 	return (bestlpl);
3157 }
3158 
3159 /*
3160  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3161  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3162  */
3163 static int
3164 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3165 {
3166 	lgrp_load_t	l1, l2;
3167 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3168 
3169 	l1 = lpl1->lpl_loadavg;
3170 	l2 = lpl2->lpl_loadavg;
3171 
3172 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3173 		/* lpl1 is significantly less loaded than lpl2 */
3174 		return (1);
3175 	}
3176 
3177 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3178 	    l1 + tolerance >= l2 && l1 < l2 &&
3179 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3180 		/*
3181 		 * lpl1's load is within the tolerance of lpl2. We're
3182 		 * willing to consider it be to better however if
3183 		 * it has been longer since we last homed a thread there
3184 		 */
3185 		return (1);
3186 	}
3187 
3188 	return (0);
3189 }
3190 
/*
 * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
 * a process that uses text replication changed home lgrp. This info is used
 * by the segvn asynchronous thread to detect if it needs to recheck what
 * lgrps should be used for text replication.
 */
static uint64_t lgrp_trthr_moves = 0;

/*
 * Return the current value of the lgrp_trthr_moves counter.
 */
uint64_t
lgrp_get_trthr_migrations(void)
{
	uint64_t	moves;

	moves = lgrp_trthr_moves;
	return (moves);
}
3204 
3205 void
3206 lgrp_update_trthr_migrations(uint64_t incr)
3207 {
3208 	atomic_add_64(&lgrp_trthr_moves, incr);
3209 }
3210 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * (in nanoseconds) for its anticipatory load to be justified.
 * lgrp_move_thread() takes the anticipatory load back out when a thread
 * leaves its lgroup sooner than this.  NOTE that this value should not be
 * set extremely huge (say, larger than 100 years), to avoid problems with
 * overflow in the calculation that uses it (it is added to a thread's
 * t_anttime and compared against gethrtime()).
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3219 
/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * Arguments:
 *	t			The thread being moved
 *	newlpl			The lpl the thread is moving to, or NULL if
 *				the thread is exiting
 *	do_lgrpset_delete	When non-zero, clear the old lgroup's bit in
 *				the process' p_lgrpset if no other thread of
 *				the process is still assigned to that lgroup
 *
 * Nothing is done when newlpl is already the thread's lpl.
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).  Load averages are therefore
 * updated with compare-and-swap retry loops rather than under a lock.
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only t_lpl is updated; no process or load accounting
	 * is done.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		/* Count a migration only for a real move, not for an exit */
		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/*
			 * Walk the process' circular thread list, starting
			 * just after the list head and ending at the head.
			 */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * The whole list has been visited and
					 * no other thread is assigned to the
					 * same lgroup as the moving thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			/*
			 * Subtract the load from the old lpl and from each
			 * of its ancestors, scaled by each lpl's CPU count.
			 */
			lpl = oldlpl;
			for (;;) {
				/* Retry until the swap sees an unchanged value */
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load; clamp at zero rather
						 * than wrapping around
						 */
						new = 0;
					}
				} while (cas32(
				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
				    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;
		/*
		 * Track the home lgroup of the process' main thread
		 * (t_tid == 1).  The new p_t1_lgrpid is published before
		 * p_tr_lgrpid is examined, and a move is counted when
		 * p_tr_lgrpid is set to something other than the main
		 * thread's new lgroup.
		 */
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup.  Note that for the root lgroup
		 * neither p_lgrpset nor t_anttime is updated.
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 *
		 * The load is added to the new lpl and to each of its
		 * ancestors, scaled by each lpl's CPU count.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			/* Retry until the swap sees an unchanged value */
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* Remember when the anticipatory load was applied */
		t->t_anttime = gethrtime();
	}
}
3417 
3418 /*
3419  * Return lgroup memory allocation policy given advice from madvise(3C)
3420  */
3421 lgrp_mem_policy_t
3422 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3423 {
3424 	switch (advice) {
3425 	case MADV_ACCESS_LWP:
3426 		return (LGRP_MEM_POLICY_NEXT);
3427 	case MADV_ACCESS_MANY:
3428 		return (LGRP_MEM_POLICY_RANDOM);
3429 	default:
3430 		return (lgrp_mem_policy_default(size, type));
3431 	}
3432 }
3433 
3434 /*
3435  * Figure out default policy
3436  */
3437 lgrp_mem_policy_t
3438 lgrp_mem_policy_default(size_t size, int type)
3439 {
3440 	cpupart_t		*cp;
3441 	lgrp_mem_policy_t	policy;
3442 	size_t			pset_mem_size;
3443 
3444 	/*
3445 	 * Randomly allocate memory across lgroups for shared memory
3446 	 * beyond a certain threshold
3447 	 */
3448 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3449 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3450 		/*
3451 		 * Get total memory size of current thread's pset
3452 		 */
3453 		kpreempt_disable();
3454 		cp = curthread->t_cpupart;
3455 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3456 		kpreempt_enable();
3457 
3458 		/*
3459 		 * Choose policy to randomly allocate memory across
3460 		 * lgroups in pset if it will fit and is not default
3461 		 * partition.  Otherwise, allocate memory randomly
3462 		 * across machine.
3463 		 */
3464 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3465 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3466 		else
3467 			policy = LGRP_MEM_POLICY_RANDOM;
3468 	} else
3469 		/*
3470 		 * Apply default policy for private memory and
3471 		 * shared memory under the respective random
3472 		 * threshold.
3473 		 */
3474 		policy = lgrp_mem_default_policy;
3475 
3476 	return (policy);
3477 }
3478 
3479 /*
3480  * Get memory allocation policy for this segment
3481  */
3482 lgrp_mem_policy_info_t *
3483 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3484 {
3485 	lgrp_mem_policy_info_t	*policy_info;
3486 	extern struct seg_ops	segspt_ops;
3487 	extern struct seg_ops	segspt_shmops;
3488 
3489 	/*
3490 	 * This is for binary compatibility to protect against third party
3491 	 * segment drivers which haven't recompiled to allow for
3492 	 * SEGOP_GETPOLICY()
3493 	 */
3494 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3495 	    seg->s_ops != &segspt_shmops)
3496 		return (NULL);
3497 
3498 	policy_info = NULL;
3499 	if (seg->s_ops->getpolicy != NULL)
3500 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3501 
3502 	return (policy_info);
3503 }
3504 
3505 /*
3506  * Set policy for allocating private memory given desired policy, policy info,
3507  * size in bytes of memory that policy is being applied.
3508  * Return 0 if policy wasn't set already and 1 if policy was set already
3509  */
3510 int
3511 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3512     lgrp_mem_policy_info_t *policy_info, size_t size)
3513 {
3514 
3515 	ASSERT(policy_info != NULL);
3516 
3517 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3518 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3519 
3520 	/*
3521 	 * Policy set already?
3522 	 */
3523 	if (policy == policy_info->mem_policy)
3524 		return (1);
3525 
3526 	/*
3527 	 * Set policy
3528 	 */
3529 	policy_info->mem_policy = policy;
3530 	policy_info->mem_lgrpid = LGRP_NONE;
3531 
3532 	return (0);
3533 }
3534 
3535 
3536 /*
3537  * Get shared memory allocation policy with given tree and offset
3538  */
3539 lgrp_mem_policy_info_t *
3540 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3541     u_offset_t vn_off)
3542 {
3543 	u_offset_t		off;
3544 	lgrp_mem_policy_info_t	*policy_info;
3545 	lgrp_shm_policy_seg_t	*policy_seg;
3546 	lgrp_shm_locality_t	*shm_locality;
3547 	avl_tree_t		*tree;
3548 	avl_index_t		where;
3549 
3550 	/*
3551 	 * Get policy segment tree from anon_map or vnode and use specified
3552 	 * anon index or vnode offset as offset
3553 	 *
3554 	 * Assume that no lock needs to be held on anon_map or vnode, since
3555 	 * they should be protected by their reference count which must be
3556 	 * nonzero for an existing segment
3557 	 */
3558 	if (amp) {
3559 		ASSERT(amp->refcnt != 0);
3560 		shm_locality = amp->locality;
3561 		if (shm_locality == NULL)
3562 			return (NULL);
3563 		tree = shm_locality->loc_tree;
3564 		off = ptob(anon_index);
3565 	} else if (vp) {
3566 		shm_locality = vp->v_locality;
3567 		if (shm_locality == NULL)
3568 			return (NULL);
3569 		ASSERT(shm_locality->loc_count != 0);
3570 		tree = shm_locality->loc_tree;
3571 		off = vn_off;
3572 	}
3573 
3574 	if (tree == NULL)
3575 		return (NULL);
3576 
3577 	/*
3578 	 * Lookup policy segment for offset into shared object and return
3579 	 * policy info
3580 	 */
3581 	rw_enter(&shm_locality->loc_lock, RW_READER);
3582 	policy_info = NULL;
3583 	policy_seg = avl_find(tree, &off, &where);
3584 	if (policy_seg)
3585 		policy_info = &policy_seg->shm_policy;
3586 	rw_exit(&shm_locality->loc_lock);
3587 
3588 	return (policy_info);
3589 }
3590 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() uses it in place of the general default policy when
 * the segment is segkmap; for kernel segments, the per-process and per-pset
 * random policies are replaced there by plain random allocation.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3595 
3596 /*
3597  * Return lgroup to use for allocating memory
3598  * given the segment and address
3599  *
3600  * There isn't any mutual exclusion that exists between calls
3601  * to this routine and DR, so this routine and whomever calls it
3602  * should be mindful of the possibility that the lgrp returned
3603  * may be deleted. If this happens, dereferences of the lgrp
3604  * pointer will still be safe, but the resources in the lgrp will
3605  * be gone, and LGRP_EXISTS() will no longer be true.
3606  */
3607 lgrp_t *
3608 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3609 {
3610 	int			i;
3611 	lgrp_t			*lgrp;
3612 	klgrpset_t		lgrpset;
3613 	int			lgrps_spanned;
3614 	unsigned long		off;
3615 	lgrp_mem_policy_t	policy;
3616 	lgrp_mem_policy_info_t	*policy_info;
3617 	ushort_t		random;
3618 	int			stat = 0;
3619 	extern struct seg	*segkmap;
3620 
3621 	/*
3622 	 * Just return null if the lgrp framework hasn't finished
3623 	 * initializing or if this is a UMA machine.
3624 	 */
3625 	if (nlgrps == 1 || !lgrp_initialized)
3626 		return (lgrp_root);
3627 
3628 	/*
3629 	 * Get memory allocation policy for this segment
3630 	 */
3631 	policy = lgrp_mem_default_policy;
3632 	if (seg != NULL) {
3633 		if (seg->s_as == &kas) {
3634 			if (seg == segkmap)
3635 				policy = lgrp_segmap_default_policy;
3636 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3637 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3638 				policy = LGRP_MEM_POLICY_RANDOM;
3639 		} else {
3640 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3641 			if (policy_info != NULL) {
3642 				policy = policy_info->mem_policy;
3643 				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3644 					lgrp_id_t id = policy_info->mem_lgrpid;
3645 					ASSERT(id != LGRP_NONE);
3646 					ASSERT(id < NLGRPS_MAX);
3647 					lgrp = lgrp_table[id];
3648 					if (!LGRP_EXISTS(lgrp)) {
3649 						policy = LGRP_MEM_POLICY_NEXT;
3650 					} else {
3651 						lgrp_stat_add(id,
3652 						    LGRP_NUM_NEXT_SEG, 1);
3653 						return (lgrp);
3654 					}
3655 				}
3656 			}
3657 		}
3658 	}
3659 	lgrpset = 0;
3660 
3661 	/*
3662 	 * Initialize lgroup to home by default
3663 	 */
3664 	lgrp = lgrp_home_lgrp();
3665 
3666 	/*
3667 	 * When homing threads on root lgrp, override default memory
3668 	 * allocation policies with root lgroup memory allocation policy
3669 	 */
3670 	if (lgrp == lgrp_root)
3671 		policy = lgrp_mem_policy_root;
3672 
3673 	/*
3674 	 * Implement policy
3675 	 */
3676 	switch (policy) {
3677 	case LGRP_MEM_POLICY_NEXT_CPU:
3678 
3679 		/*
3680 		 * Return lgroup of current CPU which faulted on memory
3681 		 * If the CPU isn't currently in an lgrp, then opt to
3682 		 * allocate from the root.
3683 		 *
3684 		 * Kernel preemption needs to be disabled here to prevent
3685 		 * the current CPU from going away before lgrp is found.
3686 		 */
3687 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3688 			lgrp = lgrp_root;
3689 		} else {
3690 			kpreempt_disable();
3691 			lgrp = lgrp_cpu_to_lgrp(CPU);
3692 			kpreempt_enable();
3693 		}
3694 		break;
3695 
3696 	case LGRP_MEM_POLICY_NEXT:
3697 	case LGRP_MEM_POLICY_DEFAULT:
3698 	default:
3699 
3700 		/*
3701 		 * Just return current thread's home lgroup
3702 		 * for default policy (next touch)
3703 		 * If the thread is homed to the root,
3704 		 * then the default policy is random across lgroups.
3705 		 * Fallthrough to the random case.
3706 		 */
3707 		if (lgrp != lgrp_root) {
3708 			if (policy == LGRP_MEM_POLICY_NEXT)
3709 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3710 			else
3711 				lgrp_stat_add(lgrp->lgrp_id,
3712 				    LGRP_NUM_DEFAULT, 1);
3713 			break;
3714 		}
3715 		/* LINTED fallthrough on case statement */
3716 	case LGRP_MEM_POLICY_RANDOM:
3717 
3718 		/*
3719 		 * Return a random leaf lgroup with memory
3720 		 */
3721 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3722 		/*
3723 		 * Count how many lgroups are spanned
3724 		 */
3725 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3726 
3727 		/*
3728 		 * There may be no memnodes in the root lgroup during DR copy
3729 		 * rename on a system with only two boards (memnodes)
3730 		 * configured. In this case just return the root lgrp.
3731 		 */
3732 		if (lgrps_spanned == 0) {
3733 			lgrp = lgrp_root;
3734 			break;
3735 		}
3736 
3737 		/*
3738 		 * Pick a random offset within lgroups spanned
3739 		 * and return lgroup at that offset
3740 		 */
3741 		random = (ushort_t)gethrtime() >> 4;
3742 		off = random % lgrps_spanned;
3743 		ASSERT(off <= lgrp_alloc_max);
3744 
3745 		for (i = 0; i <= lgrp_alloc_max; i++) {
3746 			if (!klgrpset_ismember(lgrpset, i))
3747 				continue;
3748 			if (off)
3749 				off--;
3750 			else {
3751 				lgrp = lgrp_table[i];
3752 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3753 				    1);
3754 				break;
3755 			}
3756 		}
3757 		break;
3758 
3759 	case LGRP_MEM_POLICY_RANDOM_PROC:
3760 
3761 		/*
3762 		 * Grab copy of bitmask of lgroups spanned by
3763 		 * this process
3764 		 */
3765 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3766 		stat = LGRP_NUM_RANDOM_PROC;
3767 
3768 		/* LINTED fallthrough on case statement */
3769 	case LGRP_MEM_POLICY_RANDOM_PSET:
3770 
3771 		if (!stat)
3772 			stat = LGRP_NUM_RANDOM_PSET;
3773 
3774 		if (klgrpset_isempty(lgrpset)) {
3775 			/*
3776 			 * Grab copy of bitmask of lgroups spanned by
3777 			 * this processor set
3778 			 */
3779 			kpreempt_disable();
3780 			klgrpset_copy(lgrpset,
3781 			    curthread->t_cpupart->cp_lgrpset);
3782 			kpreempt_enable();
3783 		}
3784 
3785 		/*
3786 		 * Count how many lgroups are spanned
3787 		 */
3788 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3789 		ASSERT(lgrps_spanned <= nlgrps);
3790 
3791 		/*
3792 		 * Probably lgrps_spanned should be always non-zero, but to be
3793 		 * on the safe side we return lgrp_root if it is empty.
3794 		 */
3795 		if (lgrps_spanned == 0) {
3796 			lgrp = lgrp_root;
3797 			break;
3798 		}
3799 
3800 		/*
3801 		 * Pick a random offset within lgroups spanned
3802 		 * and return lgroup at that offset
3803 		 */
3804 		random = (ushort_t)gethrtime() >> 4;
3805 		off = random % lgrps_spanned;
3806 		ASSERT(off <= lgrp_alloc_max);
3807 
3808 		for (i = 0; i <= lgrp_alloc_max; i++) {
3809 			if (!klgrpset_ismember(lgrpset, i))
3810 				continue;
3811 			if (off)
3812 				off--;
3813 			else {
3814 				lgrp = lgrp_table[i];
3815 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3816 				    1);
3817 				break;
3818 			}
3819 		}
3820 		break;
3821 
3822 	case LGRP_MEM_POLICY_ROUNDROBIN:
3823 
3824 		/*
3825 		 * Use offset within segment to determine
3826 		 * offset from home lgroup to choose for
3827 		 * next lgroup to allocate memory from
3828 		 */
3829 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3830 		    (lgrp_alloc_max + 1);
3831 
3832 		kpreempt_disable();
3833 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3834 		i = lgrp->lgrp_id;
3835 		kpreempt_enable();
3836 
3837 		while (off > 0) {
3838 			i = (i + 1) % (lgrp_alloc_max + 1);
3839 			lgrp = lgrp_table[i];
3840 			if (klgrpset_ismember(lgrpset, i))
3841 				off--;
3842 		}
3843 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3844 
3845 		break;
3846 	}
3847 
3848 	ASSERT(lgrp != NULL);
3849 	return (lgrp);
3850 }
3851 
3852 /*
3853  * Return the number of pages in an lgroup
3854  *
3855  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3856  *	 could cause tests that rely on the numat driver to fail....
3857  */
3858 pgcnt_t
3859 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3860 {
3861 	lgrp_t *lgrp;
3862 
3863 	lgrp = lgrp_table[lgrpid];
3864 	if (!LGRP_EXISTS(lgrp) ||
3865 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3866 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3867 		return (0);
3868 
3869 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3870 }
3871 
3872 /*
3873  * Initialize lgroup shared memory allocation policy support
3874  */
3875 void
3876 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3877 {
3878 	lgrp_shm_locality_t	*shm_locality;
3879 
3880 	/*
3881 	 * Initialize locality field in anon_map
3882 	 * Don't need any locks because this is called when anon_map is
3883 	 * allocated, but not used anywhere yet.
3884 	 */
3885 	if (amp) {
3886 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3887 		if (amp->locality == NULL) {
3888 			/*
3889 			 * Allocate and initialize shared memory locality info
3890 			 * and set anon_map locality pointer to it
3891 			 * Drop lock across kmem_alloc(KM_SLEEP)
3892 			 */
3893 			ANON_LOCK_EXIT(&amp->a_rwlock);
3894 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3895 			    KM_SLEEP);
3896 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3897 			    NULL);
3898 			shm_locality->loc_count = 1;	/* not used for amp */
3899 			shm_locality->loc_tree = NULL;
3900 
3901 			/*
3902 			 * Reacquire lock and check to see whether anyone beat
3903 			 * us to initializing the locality info
3904 			 */
3905 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3906 			if (amp->locality != NULL) {
3907 				rw_destroy(&shm_locality->loc_lock);
3908 				kmem_free(shm_locality,
3909 				    sizeof (*shm_locality));
3910 			} else
3911 				amp->locality = shm_locality;
3912 		}
3913 		ANON_LOCK_EXIT(&amp->a_rwlock);
3914 		return;
3915 	}
3916 
3917 	/*
3918 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3919 	 */
3920 	mutex_enter(&vp->v_lock);
3921 	if ((vp->v_flag & V_LOCALITY) == 0) {
3922 		/*
3923 		 * Allocate and initialize shared memory locality info
3924 		 */
3925 		mutex_exit(&vp->v_lock);
3926 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3927 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3928 		shm_locality->loc_count = 1;
3929 		shm_locality->loc_tree = NULL;
3930 
3931 		/*
3932 		 * Point vnode locality field at shared vnode policy info
3933 		 * and set locality aware flag in vnode
3934 		 */
3935 		mutex_enter(&vp->v_lock);
3936 		if ((vp->v_flag & V_LOCALITY) == 0) {
3937 			vp->v_locality = shm_locality;
3938 			vp->v_flag |= V_LOCALITY;
3939 		} else {
3940 			/*
3941 			 * Lost race so free locality info and increment count.
3942 			 */
3943 			rw_destroy(&shm_locality->loc_lock);
3944 			kmem_free(shm_locality, sizeof (*shm_locality));
3945 			shm_locality = vp->v_locality;
3946 			shm_locality->loc_count++;
3947 		}
3948 		mutex_exit(&vp->v_lock);
3949 
3950 		return;
3951 	}
3952 
3953 	/*
3954 	 * Increment reference count of number of segments mapping this vnode
3955 	 * shared
3956 	 */
3957 	shm_locality = vp->v_locality;
3958 	shm_locality->loc_count++;
3959 	mutex_exit(&vp->v_lock);
3960 }
3961 
3962 /*
3963  * Destroy the given shared memory policy segment tree
3964  */
3965 void
3966 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3967 {
3968 	lgrp_shm_policy_seg_t	*cur;
3969 	lgrp_shm_policy_seg_t	*next;
3970 
3971 	if (tree == NULL)
3972 		return;
3973 
3974 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3975 	while (cur != NULL) {
3976 		next = AVL_NEXT(tree, cur);
3977 		avl_remove(tree, cur);
3978 		kmem_free(cur, sizeof (*cur));
3979 		cur = next;
3980 	}
3981 	kmem_free(tree, sizeof (avl_tree_t));
3982 }
3983 
3984 /*
3985  * Uninitialize lgroup shared memory allocation policy support
3986  */
3987 void
3988 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3989 {
3990 	lgrp_shm_locality_t	*shm_locality;
3991 
3992 	/*
3993 	 * For anon_map, deallocate shared memory policy tree and
3994 	 * zero locality field
3995 	 * Don't need any locks because anon_map is being freed
3996 	 */
3997 	if (amp) {
3998 		if (amp->locality == NULL)
3999 			return;
4000 		shm_locality = amp->locality;
4001 		shm_locality->loc_count = 0;	/* not really used for amp */
4002 		rw_destroy(&shm_locality->loc_lock);
4003 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4004 		kmem_free(shm_locality, sizeof (*shm_locality));
4005 		amp->locality = 0;
4006 		return;
4007 	}
4008 
4009 	/*
4010 	 * For vnode, decrement reference count of segments mapping this vnode
4011 	 * shared and delete locality info if reference count drops to 0
4012 	 */
4013 	mutex_enter(&vp->v_lock);
4014 	shm_locality = vp->v_locality;
4015 	shm_locality->loc_count--;
4016 
4017 	if (shm_locality->loc_count == 0) {
4018 		rw_destroy(&shm_locality->loc_lock);
4019 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4020 		kmem_free(shm_locality, sizeof (*shm_locality));
4021 		vp->v_locality = 0;
4022 		vp->v_flag &= ~V_LOCALITY;
4023 	}
4024 	mutex_exit(&vp->v_lock);
4025 }
4026 
4027 /*
4028  * Compare two shared memory policy segments
4029  * Used by AVL tree code for searching
4030  */
4031 int
4032 lgrp_shm_policy_compar(const void *x, const void *y)
4033 {
4034 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4035 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4036 
4037 	if (a->shm_off < b->shm_off)
4038 		return (-1);
4039 	if (a->shm_off >= b->shm_off + b->shm_size)
4040 		return (1);
4041 	return (0);
4042 }
4043 
4044 /*
4045  * Concatenate seg1 with seg2 and remove seg2
4046  */
4047 static int
4048 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4049     lgrp_shm_policy_seg_t *seg2)
4050 {
4051 	if (!seg1 || !seg2 ||
4052 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4053 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4054 		return (-1);
4055 
4056 	seg1->shm_size += seg2->shm_size;
4057 	avl_remove(tree, seg2);
4058 	kmem_free(seg2, sizeof (*seg2));
4059 	return (0);
4060 }
4061 
4062 /*
4063  * Split segment at given offset and return rightmost (uppermost) segment
4064  * Assumes that there are no overlapping segments
4065  */
4066 static lgrp_shm_policy_seg_t *
4067 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4068     u_offset_t off)
4069 {
4070 	lgrp_shm_policy_seg_t	*newseg;
4071 	avl_index_t		where;
4072 
4073 	ASSERT(seg != NULL);
4074 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4075 
4076 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4077 	    seg->shm_size)
4078 		return (NULL);
4079 
4080 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4081 		return (seg);
4082 
4083 	/*
4084 	 * Adjust size of left segment and allocate new (right) segment
4085 	 */
4086 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4087 	newseg->shm_policy = seg->shm_policy;
4088 	newseg->shm_off = off;
4089 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4090 	seg->shm_size = off - seg->shm_off;
4091 
4092 	/*
4093 	 * Find where to insert new segment in AVL tree and insert it
4094 	 */
4095 	(void) avl_find(tree, &off, &where);
4096 	avl_insert(tree, newseg, where);
4097 
4098 	return (newseg);
4099 }
4100 
4101 /*
4102  * Set shared memory allocation policy on specified shared object at given
4103  * offset and length
4104  *
4105  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4106  * -1 if can't set policy.
4107  */
4108 int
4109 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4110     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4111 {
4112 	u_offset_t		eoff;
4113 	lgrp_shm_policy_seg_t	*next;
4114 	lgrp_shm_policy_seg_t	*newseg;
4115 	u_offset_t		off;
4116 	u_offset_t		oldeoff;
4117 	lgrp_shm_policy_seg_t	*prev;
4118 	int			retval;
4119 	lgrp_shm_policy_seg_t	*seg;
4120 	lgrp_shm_locality_t	*shm_locality;
4121 	avl_tree_t		*tree;
4122 	avl_index_t		where;
4123 
4124 	ASSERT(amp || vp);
4125 	ASSERT((len & PAGEOFFSET) == 0);
4126 
4127 	if (len == 0)
4128 		return (-1);
4129 
4130 	retval = 0;
4131 
4132 	/*
4133 	 * Get locality info and starting offset into shared object
4134 	 * Try anon map first and then vnode
4135 	 * Assume that no locks need to be held on anon_map or vnode, since
4136 	 * it should be protected by its reference count which must be nonzero
4137 	 * for an existing segment.
4138 	 */
4139 	if (amp) {
4140 		/*
4141 		 * Get policy info from anon_map
4142 		 *
4143 		 */
4144 		ASSERT(amp->refcnt != 0);
4145 		if (amp->locality == NULL)
4146 			lgrp_shm_policy_init(amp, NULL);
4147 		shm_locality = amp->locality;
4148 		off = ptob(anon_index);
4149 	} else if (vp) {
4150 		/*
4151 		 * Get policy info from vnode
4152 		 */
4153 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4154 			lgrp_shm_policy_init(NULL, vp);
4155 		shm_locality = vp->v_locality;
4156 		ASSERT(shm_locality->loc_count != 0);
4157 		off = vn_off;
4158 	} else
4159 		return (-1);
4160 
4161 	ASSERT((off & PAGEOFFSET) == 0);
4162 
4163 	/*
4164 	 * Figure out default policy
4165 	 */
4166 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4167 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4168 
4169 	/*
4170 	 * Create AVL tree if there isn't one yet
4171 	 * and set locality field to point at it
4172 	 */
4173 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4174 	tree = shm_locality->loc_tree;
4175 	if (!tree) {
4176 		rw_exit(&shm_locality->loc_lock);
4177 
4178 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4179 
4180 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4181 		if (shm_locality->loc_tree == NULL) {
4182 			avl_create(tree, lgrp_shm_policy_compar,
4183 			    sizeof (lgrp_shm_policy_seg_t),
4184 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4185 			shm_locality->loc_tree = tree;
4186 		} else {
4187 			/*
4188 			 * Another thread managed to set up the tree
4189 			 * before we could. Free the tree we allocated
4190 			 * and use the one that's already there.
4191 			 */
4192 			kmem_free(tree, sizeof (*tree));
4193 			tree = shm_locality->loc_tree;
4194 		}
4195 	}
4196 
4197 	/*
4198 	 * Set policy
4199 	 *
4200 	 * Need to maintain hold on writer's lock to keep tree from
4201 	 * changing out from under us
4202 	 */
4203 	while (len != 0) {
4204 		/*
4205 		 * Find policy segment for specified offset into shared object
4206 		 */
4207 		seg = avl_find(tree, &off, &where);
4208 
4209 		/*
4210 		 * Didn't find any existing segment that contains specified
4211 		 * offset, so allocate new segment, insert it, and concatenate
4212 		 * with adjacent segments if possible
4213 		 */
4214 		if (seg == NULL) {
4215 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4216 			    KM_SLEEP);
4217 			newseg->shm_policy.mem_policy = policy;
4218 			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4219 			newseg->shm_off = off;
4220 			avl_insert(tree, newseg, where);
4221 
4222 			/*
4223 			 * Check to see whether new segment overlaps with next
4224 			 * one, set length of new segment accordingly, and
4225 			 * calculate remaining length and next offset
4226 			 */
4227 			seg = AVL_NEXT(tree, newseg);
4228 			if (seg == NULL || off + len <= seg->shm_off) {
4229 				newseg->shm_size = len;
4230 				len = 0;
4231 			} else {
4232 				newseg->shm_size = seg->shm_off - off;
4233 				off = seg->shm_off;
4234 				len -= newseg->shm_size;
4235 			}
4236 
4237 			/*
4238 			 * Try to concatenate new segment with next and
4239 			 * previous ones, since they might have the same policy
4240 			 * now.  Grab previous and next segments first because
4241 			 * they will change on concatenation.
4242 			 */
4243 			prev =  AVL_PREV(tree, newseg);
4244 			next = AVL_NEXT(tree, newseg);
4245 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4246 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4247 
4248 			continue;
4249 		}
4250 
4251 		eoff = off + len;
4252 		oldeoff = seg->shm_off + seg->shm_size;
4253 
4254 		/*
4255 		 * Policy set already?
4256 		 */
4257 		if (policy == seg->shm_policy.mem_policy) {
4258 			/*
4259 			 * Nothing left to do if offset and length
4260 			 * fall within this segment
4261 			 */
4262 			if (eoff <= oldeoff) {
4263 				retval = 1;
4264 				break;
4265 			} else {
4266 				len = eoff - oldeoff;
4267 				off = oldeoff;
4268 				continue;
4269 			}
4270 		}
4271 
4272 		/*
4273 		 * Specified offset and length match existing segment exactly
4274 		 */
4275 		if (off == seg->shm_off && len == seg->shm_size) {
4276 			/*
4277 			 * Set policy and update current length
4278 			 */
4279 			seg->shm_policy.mem_policy = policy;
4280 			seg->shm_policy.mem_lgrpid = LGRP_NONE;
4281 			len = 0;
4282 
4283 			/*
4284 			 * Try concatenating new segment with previous and next
4285 			 * segments, since they might have the same policy now.
4286 			 * Grab previous and next segments first because they
4287 			 * will change on concatenation.
4288 			 */
4289 			prev =  AVL_PREV(tree, seg);
4290 			next = AVL_NEXT(tree, seg);
4291 			(void) lgrp_shm_policy_concat(tree, seg, next);
4292 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4293 		} else {
4294 			/*
4295 			 * Specified offset and length only apply to part of
4296 			 * existing segment
4297 			 */
4298 
4299 			/*
4300 			 * New segment starts in middle of old one, so split
4301 			 * new one off near beginning of old one
4302 			 */
4303 			newseg = NULL;
4304 			if (off > seg->shm_off) {
4305 				newseg = lgrp_shm_policy_split(tree, seg, off);
4306 
4307 				/*
4308 				 * New segment ends where old one did, so try
4309 				 * to concatenate with next segment
4310 				 */
4311 				if (eoff == oldeoff) {
4312 					newseg->shm_policy.mem_policy = policy;
4313 					newseg->shm_policy.mem_lgrpid =
4314 					    LGRP_NONE;
4315 					(void) lgrp_shm_policy_concat(tree,
4316 					    newseg, AVL_NEXT(tree, newseg));
4317 					break;
4318 				}
4319 			}
4320 
4321 			/*
4322 			 * New segment ends before old one, so split off end of
4323 			 * old one
4324 			 */
4325 			if (eoff < oldeoff) {
4326 				if (newseg) {
4327 					(void) lgrp_shm_policy_split(tree,
4328 					    newseg, eoff);
4329 					newseg->shm_policy.mem_policy = policy;
4330 					newseg->shm_policy.mem_lgrpid =
4331 					    LGRP_NONE;
4332 				} else {
4333 					(void) lgrp_shm_policy_split(tree, seg,
4334 					    eoff);
4335 					seg->shm_policy.mem_policy = policy;
4336 					seg->shm_policy.mem_lgrpid = LGRP_NONE;
4337 				}
4338 
4339 				if (off == seg->shm_off)
4340 					(void) lgrp_shm_policy_concat(tree,
4341 					    AVL_PREV(tree, seg), seg);
4342 				break;
4343 			}
4344 
4345 			/*
4346 			 * Calculate remaining length and next offset
4347 			 */
4348 			len = eoff - oldeoff;
4349 			off = oldeoff;
4350 		}
4351 	}
4352 
4353 	rw_exit(&shm_locality->loc_lock);
4354 	return (retval);
4355 }
4356 
4357 /*
4358  * Return the best memnode from which to allocate memory given
4359  * an lgroup.
4360  *
4361  * "c" is for cookie, which is good enough for me.
4362  * It references a cookie struct that should be zero'ed to initialize.
4363  * The cookie should live on the caller's stack.
4364  *
4365  * The routine returns -1 when:
4366  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4367  *	- traverse is 1, and all the memnodes in the system have been
4368  *	  returned.
4369  */
4370 int
4371 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4372 {
4373 	lgrp_t		*lp = c->lmc_lgrp;
4374 	mnodeset_t	nodes = c->lmc_nodes;
4375 	int		cnt = c->lmc_cnt;
4376 	int		offset, mnode;
4377 
4378 	extern int	max_mem_nodes;
4379 
4380 	/*
4381 	 * If the set is empty, and the caller is willing, traverse
4382 	 * up the hierarchy until we find a non-empty set.
4383 	 */
4384 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4385 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4386 		    ((lp = lp->lgrp_parent) == NULL))
4387 			return (-1);
4388 
4389 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4390 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4391 	}
4392 
4393 	/*
4394 	 * Select a memnode by picking one at a "random" offset.
4395 	 * Because of DR, memnodes can come and go at any time.
4396 	 * This code must be able to cope with the possibility
4397 	 * that the nodes count "cnt" is inconsistent with respect
4398 	 * to the number of elements actually in "nodes", and
4399 	 * therefore that the offset chosen could be greater than
4400 	 * the number of elements in the set (some memnodes may
4401 	 * have dissapeared just before cnt was read).
4402 	 * If this happens, the search simply wraps back to the
4403 	 * beginning of the set.
4404 	 */
4405 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4406 	offset = c->lmc_rand % cnt;
4407 	do {
4408 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4409 			if (nodes & ((mnodeset_t)1 << mnode))
4410 				if (!offset--)
4411 					break;
4412 	} while (mnode >= max_mem_nodes);
4413 
4414 	/* Found a node. Store state before returning. */
4415 	c->lmc_lgrp = lp;
4416 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4417 	c->lmc_cnt = cnt - 1;
4418 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4419 	c->lmc_ntried++;
4420 
4421 	return (mnode);
4422 }
4423