xref: /titanic_44/usr/src/uts/common/os/lgrp.c (revision 37b407883a45b878109bb872aba0145ef4101cec)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Basic NUMA support in terms of locality groups
31  *
32  * Solaris needs to know which CPUs, memory, etc. are near each other to
33  * provide good performance on NUMA machines by optimizing for locality.
34  * In order to do this, a new abstraction called a "locality group (lgroup)"
35  * has been introduced to keep track of which CPU-like and memory-like hardware
36  * resources are close to each other.  Currently, latency is the only measure
37  * used to determine how to group hardware resources into lgroups, but this
38  * does not limit the groupings to be based solely on latency.  Other factors
39  * may be used to determine the groupings in the future.
40  *
41  * Lgroups are organized into a hierarchy or topology that represents the
42  * latency topology of the machine.  There is always at least a root lgroup in
43  * the system.  It represents all the hardware resources in the machine at a
44  * latency big enough that any hardware resource can at least access any other
45  * hardware resource within that latency.  A Uniform Memory Access (UMA)
46  * machine is represented with one lgroup (the root).  In contrast, a NUMA
47  * machine is represented at least by the root lgroup and some number of leaf
48  * lgroups where the leaf lgroups contain the hardware resources within the
49  * least latency of each other and the root lgroup still contains all the
50  * resources in the machine.  Some number of intermediate lgroups may exist
51  * which represent more levels of locality than just the local latency of the
52  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
53  * (eg. root and intermediate lgroups) contain the next nearest resources to
54  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
55  * to the root lgroup shows the hardware resources from closest to farthest
56  * from the leaf lgroup such that each successive ancestor lgroup contains
57  * the next nearest resources at the next level of locality from the previous.
58  *
59  * The kernel uses the lgroup abstraction to know how to allocate resources
60  * near a given process/thread.  At fork() and lwp/thread_create() time, a
61  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
62  * with the lowest load average.  Binding to a processor or processor set will
63  * change the home lgroup for a thread.  The scheduler has been modified to try
64  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
65  * allocation is lgroup aware too, so memory will be allocated from the current
66  * thread's home lgroup if possible.  If the desired resources are not
67  * available, the kernel traverses the lgroup hierarchy going to the parent
68  * lgroup to find resources at the next level of locality until it reaches the
69  * root lgroup.
70  */
71 
72 #include <sys/lgrp.h>
73 #include <sys/lgrp_user.h>
74 #include <sys/types.h>
75 #include <sys/mman.h>
76 #include <sys/param.h>
77 #include <sys/var.h>
78 #include <sys/thread.h>
79 #include <sys/cpuvar.h>
80 #include <sys/cpupart.h>
81 #include <sys/kmem.h>
82 #include <vm/seg.h>
83 #include <vm/seg_kmem.h>
84 #include <vm/seg_spt.h>
85 #include <vm/seg_vn.h>
86 #include <vm/as.h>
87 #include <sys/atomic.h>
88 #include <sys/systm.h>
89 #include <sys/errno.h>
90 #include <sys/cmn_err.h>
91 #include <sys/kstat.h>
92 #include <sys/sysmacros.h>
93 #include <sys/chip.h>
94 #include <sys/promif.h>
95 #include <sys/sdt.h>
96 
/*
 * Global lgroup bookkeeping shared by the routines in this file.
 */
lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
					/* (bumped by lgrp_config() events) */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
				/* (-1 until an lgroup has been destroyed) */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
103 
/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in the lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework.  It is protected from
 * parallel modifications by lgrp_kstat_mutex.  This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

/*
 * Kstat helpers private to this file.
 */
static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

/*
 * A single named-kstat buffer is shared by every lgroup's kstat (each kstat's
 * ks_data points here), with lgrp_kstat_mutex installed as the ks_lock.
 */
static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;
127 
128 
/*
 * max number of lgroups supported by the platform
 * (set by lgrp_init() from lgrp_plat_max_lgrps())
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.  NULL until lgrp_root_init() points it at the
 * statically allocated root.
 */
lgrp_t		*lgrp_root = NULL;
139 
/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;	/* set by lgrp_root_init() */

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

/*
 * Statically allocated storage for the root lgroup; lgrp_root points here
 * once lgrp_root_init() has run.
 */
static lgrp_t	lroot;
171 
172 
/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.  lgrp_main_init() resets an out-of-range value back to
 * LGRP_MEM_POLICY_NEXT.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
202 
203 
/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup latency change handler (LGRP_CONFIG_LATENCY_CHANGE)
 */
static void	lgrp_latency_change(u_longlong_t, u_longlong_t);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

/*
 * root lgroup and bootstrap lpl setup
 */
static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);
242 
/*
 * defines for lpl topology verifier return codes
 *
 * lgrp_config() compares the result of lpl_topo_verify() against
 * LPL_TOPO_CORRECT and panics, reporting the code, on any other value.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		-1
#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
#define	LPL_TOPO_LGRP_MISMATCH			-3
#define	LPL_TOPO_MISSING_PARENT			-4
#define	LPL_TOPO_PARENT_MISMATCH		-5
#define	LPL_TOPO_BAD_CPUCNT			-6
#define	LPL_TOPO_RSET_MISMATCH			-7
#define	LPL_TOPO_LPL_ORPHANED			-8
#define	LPL_TOPO_LPL_BAD_NCPU			-9
#define	LPL_TOPO_RSET_MSSNG_LF			-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
#define	LPL_TOPO_BOGUS_HINT			-12
#define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
#define	LPL_TOPO_LGRP_NOT_LEAF			-14
#define	LPL_TOPO_BAD_RSETCNT			-15
263 
264 /*
265  * Return whether lgroup optimizations should be enabled on this system
266  */
267 int
268 lgrp_optimizations(void)
269 {
270 	/*
271 	 * System must have more than 2 lgroups to enable lgroup optimizations
272 	 *
273 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
274 	 * with one child lgroup containing all the resources. A 2 lgroup
275 	 * system with a root lgroup directly containing CPUs or memory might
276 	 * need lgroup optimizations with its child lgroup, but there
277 	 * isn't such a machine for now....
278 	 */
279 	if (nlgrps > 2)
280 		return (1);
281 
282 	return (0);
283 }
284 
285 /*
286  * Build full lgroup topology
287  */
288 static void
289 lgrp_root_init(void)
290 {
291 	lgrp_handle_t	hand;
292 	int		i;
293 	lgrp_id_t	id;
294 
295 	/*
296 	 * Create the "root" lgroup
297 	 */
298 	ASSERT(nlgrps == 0);
299 	id = nlgrps++;
300 
301 	lgrp_root = &lroot;
302 
303 	lgrp_root->lgrp_cpu = NULL;
304 	lgrp_root->lgrp_mnodes = 0;
305 	lgrp_root->lgrp_nmnodes = 0;
306 	hand = lgrp_plat_root_hand();
307 	lgrp_root->lgrp_plathand = hand;
308 
309 	lgrp_root->lgrp_id = id;
310 	lgrp_root->lgrp_cpucnt = 0;
311 	lgrp_root->lgrp_childcnt = 0;
312 	klgrpset_clear(lgrp_root->lgrp_children);
313 	klgrpset_clear(lgrp_root->lgrp_leaves);
314 	lgrp_root->lgrp_parent = NULL;
315 	lgrp_root->lgrp_chips = NULL;
316 	lgrp_root->lgrp_chipcnt = 0;
317 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318 
319 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320 		klgrpset_clear(lgrp_root->lgrp_set[i]);
321 
322 	lgrp_root->lgrp_kstat = NULL;
323 
324 	lgrp_table[id] = lgrp_root;
325 
326 	/*
327 	 * Setup initial lpl list for CPU0 and initial t0 home.
328 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329 	 * all topology operations until cp_default is initialized at which
330 	 * point t0.t_lpl will be updated.
331 	 */
332 	lpl_bootstrap = lpl_bootstrap_list;
333 	t0.t_lpl = lpl_bootstrap;
334 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336 	cp_default.cp_lgrploads = lpl_bootstrap;
337 }
338 
339 /*
340  * Initialize the lgroup framework and allow the platform to do the same
341  */
342 void
343 lgrp_init(void)
344 {
345 	/*
346 	 * Initialize the platform
347 	 */
348 	lgrp_plat_init();
349 
350 	/*
351 	 * Set max number of lgroups supported on this platform which must be
352 	 * less than the max number of lgroups supported by the common lgroup
353 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
354 	 */
355 	nlgrpsmax = lgrp_plat_max_lgrps();
356 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
357 }
358 
359 /*
360  * Create the root and cpu0's lgroup, and set t0's home.
361  */
362 void
363 lgrp_setup(void)
364 {
365 	/*
366 	 * Setup the root lgroup
367 	 */
368 	lgrp_root_init();
369 
370 	/*
371 	 * Add cpu0 to an lgroup
372 	 */
373 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
374 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
375 }
376 
/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized. The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * True when lgrp initialization has been completed.  Until then, several
 * routines in this file accept being called without cpu_lock held.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;
396 
397 /*
398  * Init routine called after startup(), /etc/system has been processed,
399  * and cpu0 has been added to an lgroup.
400  */
401 void
402 lgrp_main_init(void)
403 {
404 	cpu_t		*cp = CPU;
405 	lgrp_id_t	lgrpid;
406 	int		i;
407 	/*
408 	 * Enforce a valid lgrp_mem_default_policy
409 	 */
410 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
411 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
412 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
413 
414 	/*
415 	 * See if mpo should be disabled.
416 	 * This may happen in the case of null proc LPA on Starcat.
417 	 * The platform won't be able to detect null proc LPA until after
418 	 * cpu0 and memory have already been added to lgroups.
419 	 * When and if it is detected, the Starcat platform will return
420 	 * a different platform handle for cpu0 which is what we check for
421 	 * here. If mpo should be disabled move cpu0 to it's rightful place
422 	 * (the root), and destroy the remaining lgroups. This effectively
423 	 * provides an UMA lgroup topology.
424 	 */
425 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
426 	if (lgrp_table[lgrpid]->lgrp_plathand !=
427 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
428 		lgrp_part_del_cpu(cp);
429 		lgrp_cpu_fini(cp, lgrpid);
430 
431 		lgrp_cpu_init(cp);
432 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
433 
434 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
435 
436 		for (i = 0; i <= lgrp_alloc_max; i++) {
437 			if (LGRP_EXISTS(lgrp_table[i]) &&
438 			    lgrp_table[i] != lgrp_root)
439 				lgrp_destroy(lgrp_table[i]);
440 		}
441 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
442 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
443 	}
444 
445 	/*
446 	 * Initialize kstats framework.
447 	 */
448 	lgrp_kstat_init();
449 	/*
450 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
451 	 */
452 	mutex_enter(&cpu_lock);
453 	lgrp_kstat_create(cp);
454 	mutex_exit(&cpu_lock);
455 
456 	lgrp_plat_main_init();
457 	lgrp_initialized = 1;
458 }
459 
460 /*
461  * Finish lgrp initialization after all CPUS are brought on-line.
462  * This routine is called after start_other_cpus().
463  */
464 void
465 lgrp_main_mp_init(void)
466 {
467 	klgrpset_t changed;
468 
469 	/*
470 	 * Update lgroup topology (if necessary)
471 	 */
472 	klgrpset_clear(changed);
473 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
474 	lgrp_topo_initialized = 1;
475 }
476 
477 /*
478  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
479  */
480 void
481 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
482 {
483 	klgrpset_t	changed;
484 	cpu_t		*cp;
485 	lgrp_id_t	id;
486 	int		rc;
487 
488 	switch (event) {
489 	/*
490 	 * The following (re)configuration events are common code
491 	 * initiated. lgrp_plat_config() is called here to inform the
492 	 * platform of the reconfiguration event.
493 	 */
494 	case LGRP_CONFIG_CPU_ADD:
495 		cp = (cpu_t *)resource;
496 
497 		/*
498 		 * Initialize the new CPU's lgrp related next/prev
499 		 * links, and give it a bootstrap lpl so that it can
500 		 * survive should it need to enter the dispatcher.
501 		 */
502 		cp->cpu_next_lpl = cp;
503 		cp->cpu_prev_lpl = cp;
504 		cp->cpu_next_lgrp = cp;
505 		cp->cpu_prev_lgrp = cp;
506 		cp->cpu_lpl = lpl_bootstrap;
507 
508 		lgrp_plat_config(event, resource);
509 		atomic_add_32(&lgrp_gen, 1);
510 
511 		break;
512 	case LGRP_CONFIG_CPU_DEL:
513 		lgrp_plat_config(event, resource);
514 		atomic_add_32(&lgrp_gen, 1);
515 
516 		break;
517 	case LGRP_CONFIG_CPU_ONLINE:
518 		cp = (cpu_t *)resource;
519 		lgrp_cpu_init(cp);
520 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
521 		rc = lpl_topo_verify(cp->cpu_part);
522 		if (rc != LPL_TOPO_CORRECT) {
523 			panic("lpl_topo_verify failed: %d", rc);
524 		}
525 		lgrp_plat_config(event, resource);
526 		atomic_add_32(&lgrp_gen, 1);
527 
528 		break;
529 	case LGRP_CONFIG_CPU_OFFLINE:
530 		cp = (cpu_t *)resource;
531 		id = cp->cpu_lpl->lpl_lgrpid;
532 		lgrp_part_del_cpu(cp);
533 		lgrp_cpu_fini(cp, id);
534 		rc = lpl_topo_verify(cp->cpu_part);
535 		if (rc != LPL_TOPO_CORRECT) {
536 			panic("lpl_topo_verify failed: %d", rc);
537 		}
538 		lgrp_plat_config(event, resource);
539 		atomic_add_32(&lgrp_gen, 1);
540 
541 		break;
542 	case LGRP_CONFIG_CPUPART_ADD:
543 		cp = (cpu_t *)resource;
544 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
545 		rc = lpl_topo_verify(cp->cpu_part);
546 		if (rc != LPL_TOPO_CORRECT) {
547 			panic("lpl_topo_verify failed: %d", rc);
548 		}
549 		lgrp_plat_config(event, resource);
550 
551 		break;
552 	case LGRP_CONFIG_CPUPART_DEL:
553 		cp = (cpu_t *)resource;
554 		lgrp_part_del_cpu((cpu_t *)resource);
555 		rc = lpl_topo_verify(cp->cpu_part);
556 		if (rc != LPL_TOPO_CORRECT) {
557 			panic("lpl_topo_verify failed: %d", rc);
558 		}
559 		lgrp_plat_config(event, resource);
560 
561 		break;
562 	/*
563 	 * The following events are initiated by the memnode
564 	 * subsystem.
565 	 */
566 	case LGRP_CONFIG_MEM_ADD:
567 		lgrp_mem_init((int)resource, where, B_FALSE);
568 		atomic_add_32(&lgrp_gen, 1);
569 
570 		break;
571 	case LGRP_CONFIG_MEM_DEL:
572 		lgrp_mem_fini((int)resource, where, B_FALSE);
573 		atomic_add_32(&lgrp_gen, 1);
574 
575 		break;
576 	case LGRP_CONFIG_MEM_RENAME: {
577 		lgrp_config_mem_rename_t *ren_arg =
578 		    (lgrp_config_mem_rename_t *)where;
579 
580 		lgrp_mem_rename((int)resource,
581 		    ren_arg->lmem_rename_from,
582 		    ren_arg->lmem_rename_to);
583 		atomic_add_32(&lgrp_gen, 1);
584 
585 		break;
586 	}
587 	case LGRP_CONFIG_GEN_UPDATE:
588 		atomic_add_32(&lgrp_gen, 1);
589 
590 		break;
591 	case LGRP_CONFIG_FLATTEN:
592 		if (where == 0)
593 			lgrp_topo_levels = (int)resource;
594 		else
595 			(void) lgrp_topo_flatten(resource,
596 			    lgrp_table, lgrp_alloc_max, &changed);
597 
598 		break;
599 	/*
600 	 * Initiated by platform latency probing code
601 	 */
602 	case LGRP_CONFIG_LATENCY_CHANGE:
603 		lgrp_latency_change((u_longlong_t)resource,
604 		    (u_longlong_t)where);
605 
606 		break;
607 	case LGRP_CONFIG_NOP:
608 
609 		break;
610 	default:
611 		break;
612 	}
613 
614 }
615 
616 /*
617  * Called to add lgrp info into cpu structure from cpu_add_unit;
618  * do not assume cpu is in cpu[] yet!
619  *
620  * CPUs are brought online with all other CPUs paused so we can't
621  * allocate memory or we could deadlock the system, so we rely on
622  * the platform to statically allocate as much space as we need
623  * for the lgrp structs and stats.
624  */
625 static void
626 lgrp_cpu_init(struct cpu *cp)
627 {
628 	klgrpset_t	changed;
629 	int		count;
630 	lgrp_handle_t	hand;
631 	int		first_cpu;
632 	lgrp_t		*my_lgrp;
633 	lgrp_id_t	lgrpid;
634 	struct cpu	*cptr;
635 	struct chip	*chp;
636 
637 	/*
638 	 * This is the first time through if the resource set
639 	 * for the root lgroup is empty. After cpu0 has been
640 	 * initially added to an lgroup, the root's CPU resource
641 	 * set can never be empty, since the system's last CPU
642 	 * cannot be offlined.
643 	 */
644 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
645 		/*
646 		 * First time through.
647 		 */
648 		first_cpu = 1;
649 	} else {
650 		/*
651 		 * If cpu0 needs to move lgroups, we may come
652 		 * through here again, at which time cpu_lock won't
653 		 * be held, and lgrp_initialized will be false.
654 		 */
655 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
656 		ASSERT(cp->cpu_part != NULL);
657 		first_cpu = 0;
658 	}
659 
660 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
661 	my_lgrp = lgrp_hand_to_lgrp(hand);
662 
663 	if (my_lgrp == NULL) {
664 		/*
665 		 * Create new lgrp and add it to lgroup topology
666 		 */
667 		my_lgrp = lgrp_create();
668 		my_lgrp->lgrp_plathand = hand;
669 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
670 		lgrpid = my_lgrp->lgrp_id;
671 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
672 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
673 
674 		count = 0;
675 		klgrpset_clear(changed);
676 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
677 		    &changed);
678 		/*
679 		 * May have added new intermediate lgroups, so need to add
680 		 * resources other than CPUs which are added below
681 		 */
682 		(void) lgrp_mnode_update(changed, NULL);
683 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
684 	    > 0) {
685 		/*
686 		 * Leaf lgroup was created, but latency wasn't available
687 		 * then.  So, set latency for it and fill in rest of lgroup
688 		 * topology  now that we know how far it is from other leaf
689 		 * lgroups.
690 		 */
691 		lgrpid = my_lgrp->lgrp_id;
692 		klgrpset_clear(changed);
693 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
694 		    lgrpid))
695 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
696 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
697 		    &changed);
698 
699 		/*
700 		 * May have added new intermediate lgroups, so need to add
701 		 * resources other than CPUs which are added below
702 		 */
703 		(void) lgrp_mnode_update(changed, NULL);
704 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
705 	    my_lgrp->lgrp_id)) {
706 		int	i;
707 
708 		/*
709 		 * Update existing lgroup and lgroups containing it with CPU
710 		 * resource
711 		 */
712 		lgrpid = my_lgrp->lgrp_id;
713 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
714 		for (i = 0; i <= lgrp_alloc_max; i++) {
715 			lgrp_t		*lgrp;
716 
717 			lgrp = lgrp_table[i];
718 			if (!LGRP_EXISTS(lgrp) ||
719 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
720 				continue;
721 
722 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
723 		}
724 	}
725 
726 	lgrpid = my_lgrp->lgrp_id;
727 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
728 
729 	/*
730 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
731 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
732 	 * not since none of lgroup IDs in the lpl's have been set yet.
733 	 */
734 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
735 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
736 
737 	/*
738 	 * link the CPU into the lgrp's CPU list
739 	 */
740 	if (my_lgrp->lgrp_cpucnt == 0) {
741 		my_lgrp->lgrp_cpu = cp;
742 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
743 	} else {
744 		cptr = my_lgrp->lgrp_cpu;
745 		cp->cpu_next_lgrp = cptr;
746 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
747 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
748 		cptr->cpu_prev_lgrp = cp;
749 	}
750 	my_lgrp->lgrp_cpucnt++;
751 
752 	/*
753 	 * Add this cpu's chip to the per lgroup list
754 	 * if necessary
755 	 */
756 	if (cp->cpu_chip->chip_lgrp == NULL) {
757 		struct chip *lcpr;
758 
759 		chp = cp->cpu_chip;
760 
761 		if (my_lgrp->lgrp_chipcnt == 0) {
762 			my_lgrp->lgrp_chips = chp;
763 			chp->chip_next_lgrp =
764 			    chp->chip_prev_lgrp = chp;
765 		} else {
766 			lcpr = my_lgrp->lgrp_chips;
767 			chp->chip_next_lgrp = lcpr;
768 			chp->chip_prev_lgrp =
769 			    lcpr->chip_prev_lgrp;
770 			lcpr->chip_prev_lgrp->chip_next_lgrp =
771 			    chp;
772 			lcpr->chip_prev_lgrp = chp;
773 		}
774 		chp->chip_lgrp = my_lgrp;
775 		chp->chip_balance = chp->chip_next_lgrp;
776 		my_lgrp->lgrp_chipcnt++;
777 	}
778 }
779 
780 lgrp_t *
781 lgrp_create(void)
782 {
783 	lgrp_t		*my_lgrp;
784 	lgrp_id_t	lgrpid;
785 	int		i;
786 
787 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
788 
789 	/*
790 	 * Find an open slot in the lgroup table and recycle unused lgroup
791 	 * left there if any
792 	 */
793 	my_lgrp = NULL;
794 	if (lgrp_alloc_hint == -1)
795 		/*
796 		 * Allocate from end when hint not set yet because no lgroups
797 		 * have been deleted yet
798 		 */
799 		lgrpid = nlgrps++;
800 	else {
801 		/*
802 		 * Start looking for next open slot from hint and leave hint
803 		 * at slot allocated
804 		 */
805 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
806 			my_lgrp = lgrp_table[i];
807 			if (!LGRP_EXISTS(my_lgrp)) {
808 				lgrpid = i;
809 				nlgrps++;
810 				break;
811 			}
812 		}
813 		lgrp_alloc_hint = lgrpid;
814 	}
815 
816 	/*
817 	 * Keep track of max lgroup ID allocated so far to cut down on searches
818 	 */
819 	if (lgrpid > lgrp_alloc_max)
820 		lgrp_alloc_max = lgrpid;
821 
822 	/*
823 	 * Need to allocate new lgroup if next open slot didn't have one
824 	 * for recycling
825 	 */
826 	if (my_lgrp == NULL)
827 		my_lgrp = lgrp_plat_alloc(lgrpid);
828 
829 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
830 		panic("Too many lgrps for platform (%d)", nlgrps);
831 
832 	my_lgrp->lgrp_id = lgrpid;
833 	my_lgrp->lgrp_latency = 0;
834 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
835 	my_lgrp->lgrp_parent = NULL;
836 	my_lgrp->lgrp_childcnt = 0;
837 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
838 	my_lgrp->lgrp_nmnodes = 0;
839 	klgrpset_clear(my_lgrp->lgrp_children);
840 	klgrpset_clear(my_lgrp->lgrp_leaves);
841 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
842 		klgrpset_clear(my_lgrp->lgrp_set[i]);
843 
844 	my_lgrp->lgrp_cpu = NULL;
845 	my_lgrp->lgrp_cpucnt = 0;
846 	my_lgrp->lgrp_chips = NULL;
847 	my_lgrp->lgrp_chipcnt = 0;
848 
849 	if (my_lgrp->lgrp_kstat != NULL)
850 		lgrp_kstat_reset(lgrpid);
851 
852 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
853 
854 	return (my_lgrp);
855 }
856 
857 void
858 lgrp_destroy(lgrp_t *lgrp)
859 {
860 	int		i;
861 
862 	/*
863 	 * Unless this lgroup is being destroyed on behalf of
864 	 * the boot CPU, cpu_lock must be held
865 	 */
866 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
867 
868 	if (nlgrps == 1)
869 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
870 
871 	if (!LGRP_EXISTS(lgrp))
872 		return;
873 
874 	/*
875 	 * Set hint to lgroup being deleted and try to keep lower numbered
876 	 * hints to facilitate finding empty slots
877 	 */
878 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
879 		lgrp_alloc_hint = lgrp->lgrp_id;
880 
881 	/*
882 	 * Mark this lgroup to be recycled by setting its lgroup ID to
883 	 * LGRP_NONE and clear relevant fields
884 	 */
885 	lgrp->lgrp_id = LGRP_NONE;
886 	lgrp->lgrp_latency = 0;
887 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
888 	lgrp->lgrp_parent = NULL;
889 	lgrp->lgrp_childcnt = 0;
890 
891 	klgrpset_clear(lgrp->lgrp_children);
892 	klgrpset_clear(lgrp->lgrp_leaves);
893 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
894 		klgrpset_clear(lgrp->lgrp_set[i]);
895 
896 	lgrp->lgrp_mnodes = (mnodeset_t)0;
897 	lgrp->lgrp_nmnodes = 0;
898 
899 	lgrp->lgrp_cpu = NULL;
900 	lgrp->lgrp_cpucnt = 0;
901 	lgrp->lgrp_chipcnt = 0;
902 	lgrp->lgrp_chips = NULL;
903 
904 	nlgrps--;
905 }
906 
907 /*
908  * Initialize kstat data. Called from lgrp intialization code.
909  */
910 static void
911 lgrp_kstat_init(void)
912 {
913 	lgrp_stat_t	stat;
914 
915 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
916 
917 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
918 		kstat_named_init(&lgrp_kstat_data[stat],
919 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
920 }
921 
922 /*
923  * initialize an lgrp's kstats if needed
924  * called with cpu_lock held but not with cpus paused.
925  * we don't tear these down now because we don't know about
926  * memory leaving the lgrp yet...
927  */
928 
929 void
930 lgrp_kstat_create(cpu_t *cp)
931 {
932 	kstat_t		*lgrp_kstat;
933 	lgrp_id_t	lgrpid;
934 	lgrp_t		*my_lgrp;
935 
936 	ASSERT(MUTEX_HELD(&cpu_lock));
937 
938 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
939 	my_lgrp = lgrp_table[lgrpid];
940 
941 	if (my_lgrp->lgrp_kstat != NULL)
942 		return; /* already initialized */
943 
944 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
945 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
946 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
947 
948 	if (lgrp_kstat != NULL) {
949 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
950 		lgrp_kstat->ks_private = my_lgrp;
951 		lgrp_kstat->ks_data = &lgrp_kstat_data;
952 		lgrp_kstat->ks_update = lgrp_kstat_extract;
953 		my_lgrp->lgrp_kstat = lgrp_kstat;
954 		kstat_install(lgrp_kstat);
955 	}
956 }
957 
958 /*
959  * this will do something when we manage to remove now unused lgrps
960  */
961 
962 /* ARGSUSED */
963 void
964 lgrp_kstat_destroy(cpu_t *cp)
965 {
966 	ASSERT(MUTEX_HELD(&cpu_lock));
967 }
968 
969 /*
970  * Called when a CPU is off-lined.
971  */
972 static void
973 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
974 {
975 	lgrp_t *my_lgrp;
976 	struct cpu *prev;
977 	struct cpu *next;
978 	chip_t  *chp;
979 
980 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
981 
982 	prev = cp->cpu_prev_lgrp;
983 	next = cp->cpu_next_lgrp;
984 
985 	prev->cpu_next_lgrp = next;
986 	next->cpu_prev_lgrp = prev;
987 
988 	/*
989 	 * just because I'm paranoid doesn't mean...
990 	 */
991 
992 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
993 
994 	my_lgrp = lgrp_table[lgrpid];
995 	my_lgrp->lgrp_cpucnt--;
996 
997 	/*
998 	 * If the last CPU on it's chip is being offlined
999 	 * then remove this chip from the per lgroup list.
1000 	 *
1001 	 * This is also done for the boot CPU when it needs
1002 	 * to move between lgroups as a consequence of
1003 	 * null proc lpa.
1004 	 */
1005 	chp = cp->cpu_chip;
1006 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
1007 
1008 		chip_t	*chpp;
1009 
1010 		if (--my_lgrp->lgrp_chipcnt == 0)
1011 			my_lgrp->lgrp_chips = NULL;
1012 		else if (my_lgrp->lgrp_chips == chp)
1013 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
1014 
1015 		/*
1016 		 * Walk this lgroup's chip list looking for chips that
1017 		 * may try to balance against the one that's leaving
1018 		 */
1019 		for (chpp = chp->chip_next_lgrp; chpp != chp;
1020 		    chpp = chpp->chip_next_lgrp) {
1021 			if (chpp->chip_balance == chp)
1022 				chpp->chip_balance = chp->chip_next_lgrp;
1023 		}
1024 
1025 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1026 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1027 
1028 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1029 		chp->chip_lgrp = NULL;
1030 		chp->chip_balance = NULL;
1031 	}
1032 
1033 	/*
1034 	 * Removing last CPU in lgroup, so update lgroup topology
1035 	 */
1036 	if (my_lgrp->lgrp_cpucnt == 0) {
1037 		klgrpset_t	changed;
1038 		int		count;
1039 		int		i;
1040 
1041 		my_lgrp->lgrp_cpu = NULL;
1042 
1043 		/*
1044 		 * Remove this lgroup from its lgroup CPU resources and remove
1045 		 * lgroup from lgroup topology if it doesn't have any more
1046 		 * resources in it now
1047 		 */
1048 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1049 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1050 			count = 0;
1051 			klgrpset_clear(changed);
1052 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1053 			    lgrp_alloc_max + 1, &changed);
1054 			return;
1055 		}
1056 
1057 		/*
1058 		 * This lgroup isn't empty, so just remove it from CPU
1059 		 * resources of any lgroups that contain it as such
1060 		 */
1061 		for (i = 0; i <= lgrp_alloc_max; i++) {
1062 			lgrp_t		*lgrp;
1063 
1064 			lgrp = lgrp_table[i];
1065 			if (!LGRP_EXISTS(lgrp) ||
1066 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1067 			    lgrpid))
1068 				continue;
1069 
1070 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1071 		}
1072 		return;
1073 	}
1074 
1075 	if (my_lgrp->lgrp_cpu == cp)
1076 		my_lgrp->lgrp_cpu = next;
1077 
1078 }
1079 
1080 /*
1081  * Update memory nodes in target lgroups and return ones that get changed
1082  */
1083 int
1084 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1085 {
1086 	int	count;
1087 	int	i;
1088 	int	j;
1089 	lgrp_t	*lgrp;
1090 	lgrp_t	*lgrp_rsrc;
1091 
1092 	count = 0;
1093 	if (changed)
1094 		klgrpset_clear(*changed);
1095 
1096 	if (klgrpset_isempty(target))
1097 		return (0);
1098 
1099 	/*
1100 	 * Find each lgroup in target lgroups
1101 	 */
1102 	for (i = 0; i <= lgrp_alloc_max; i++) {
1103 		/*
1104 		 * Skip any lgroups that don't exist or aren't in target group
1105 		 */
1106 		lgrp = lgrp_table[i];
1107 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1108 			continue;
1109 		}
1110 
1111 		/*
1112 		 * Initialize memnodes for intermediate lgroups to 0
1113 		 * and update them from scratch since they may have completely
1114 		 * changed
1115 		 */
1116 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1117 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1118 			lgrp->lgrp_nmnodes = 0;
1119 		}
1120 
1121 		/*
1122 		 * Update memory nodes of of target lgroup with memory nodes
1123 		 * from each lgroup in its lgroup memory resource set
1124 		 */
1125 		for (j = 0; j <= lgrp_alloc_max; j++) {
1126 			int	k;
1127 
1128 			/*
1129 			 * Skip any lgroups that don't exist or aren't in
1130 			 * memory resources of target lgroup
1131 			 */
1132 			lgrp_rsrc = lgrp_table[j];
1133 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1134 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1135 			    j))
1136 				continue;
1137 
1138 			/*
1139 			 * Update target lgroup's memnodes to include memnodes
1140 			 * of this lgroup
1141 			 */
1142 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1143 				mnodeset_t	mnode_mask;
1144 
1145 				mnode_mask = (mnodeset_t)1 << k;
1146 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1147 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1148 					lgrp->lgrp_mnodes |= mnode_mask;
1149 					lgrp->lgrp_nmnodes++;
1150 				}
1151 			}
1152 			count++;
1153 			if (changed)
1154 				klgrpset_add(*changed, lgrp->lgrp_id);
1155 		}
1156 	}
1157 
1158 	return (count);
1159 }
1160 
1161 /*
1162  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1163  * is moved from one board to another. The "from" and "to" arguments specify the
1164  * source and the destination of the move.
1165  *
1166  * See plat_lgrp_config() for a detailed description of the copy-rename
1167  * semantics.
1168  *
1169  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1170  * the lgroup topology which is changing as memory moves from one lgroup to
1171  * another. It removes the mnode from the source lgroup and re-inserts it in the
1172  * target lgroup.
1173  *
1174  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1175  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1176  * copy-rename operation.
1177  *
1178  * There is one case which requires special handling. If the system contains
1179  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1180  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1181  * lgrp_mem_init), but there is a window when the system has no memory in the
1182  * lgroup hierarchy. If another thread tries to allocate memory during this
1183  * window, the allocation will fail, although the system has physical memory.
1184  * This may cause a system panic or a deadlock (some sleeping memory allocations
1185  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1186  * the mnode back).
1187  *
1188  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1189  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1190  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1191  * but it updates the rest of the lgroup topology as if the mnode was actually
1192  * removed. The lgrp_mem_init() function recognizes that the mnode being
1193  * inserted represents such a special case and updates the topology
1194  * appropriately.
1195  */
1196 void
1197 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1198 {
1199 	/*
1200 	 * Remove the memory from the source node and add it to the destination
1201 	 * node.
1202 	 */
1203 	lgrp_mem_fini(mnode, from, B_TRUE);
1204 	lgrp_mem_init(mnode, to, B_TRUE);
1205 }
1206 
1207 /*
1208  * Called to indicate that the lgrp with platform handle "hand" now
1209  * contains the memory identified by "mnode".
1210  *
1211  * LOCKING for this routine is a bit tricky. Usually it is called without
1212  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1213  * callers. During DR of the board containing the caged memory it may be called
1214  * with cpu_lock already held and CPUs paused.
1215  *
1216  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1217  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1218  * dealing with the special case of DR copy-rename described in
1219  * lgrp_mem_rename().
1220  */
1221 void
1222 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1223 {
1224 	klgrpset_t	changed;
1225 	int		count;
1226 	int		i;
1227 	lgrp_t		*my_lgrp;
1228 	lgrp_id_t	lgrpid;
1229 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1230 	boolean_t	drop_lock = B_FALSE;
1231 	boolean_t	need_synch = B_FALSE;
1232 
1233 	/*
1234 	 * Grab CPU lock (if we haven't already)
1235 	 */
1236 	if (!MUTEX_HELD(&cpu_lock)) {
1237 		mutex_enter(&cpu_lock);
1238 		drop_lock = B_TRUE;
1239 	}
1240 
1241 	/*
1242 	 * This routine may be called from a context where we already
1243 	 * hold cpu_lock, and have already paused cpus.
1244 	 */
1245 	if (!cpus_paused())
1246 		need_synch = B_TRUE;
1247 
1248 	/*
1249 	 * Check if this mnode is already configured and return immediately if
1250 	 * it is.
1251 	 *
1252 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1253 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1254 	 * recognize this case and continue as usual, but skip the update to
1255 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1256 	 * in topology, temporarily introduced by lgrp_mem_fini().
1257 	 */
1258 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1259 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1260 		if (drop_lock)
1261 			mutex_exit(&cpu_lock);
1262 		return;
1263 	}
1264 
1265 	/*
1266 	 * Update lgroup topology with new memory resources, keeping track of
1267 	 * which lgroups change
1268 	 */
1269 	count = 0;
1270 	klgrpset_clear(changed);
1271 	my_lgrp = lgrp_hand_to_lgrp(hand);
1272 	if (my_lgrp == NULL) {
1273 		/* new lgrp */
1274 		my_lgrp = lgrp_create();
1275 		lgrpid = my_lgrp->lgrp_id;
1276 		my_lgrp->lgrp_plathand = hand;
1277 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1278 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1279 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1280 
1281 		if (need_synch)
1282 			pause_cpus(NULL);
1283 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1284 		    &changed);
1285 		if (need_synch)
1286 			start_cpus();
1287 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1288 	    > 0) {
1289 		/*
1290 		 * Leaf lgroup was created, but latency wasn't available
1291 		 * then.  So, set latency for it and fill in rest of lgroup
1292 		 * topology  now that we know how far it is from other leaf
1293 		 * lgroups.
1294 		 */
1295 		klgrpset_clear(changed);
1296 		lgrpid = my_lgrp->lgrp_id;
1297 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1298 		    lgrpid))
1299 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1300 		if (need_synch)
1301 			pause_cpus(NULL);
1302 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1303 		    &changed);
1304 		if (need_synch)
1305 			start_cpus();
1306 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1307 	    my_lgrp->lgrp_id)) {
1308 		/*
1309 		 * Add new lgroup memory resource to existing lgroup
1310 		 */
1311 		lgrpid = my_lgrp->lgrp_id;
1312 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1313 		klgrpset_add(changed, lgrpid);
1314 		count++;
1315 		for (i = 0; i <= lgrp_alloc_max; i++) {
1316 			lgrp_t		*lgrp;
1317 
1318 			lgrp = lgrp_table[i];
1319 			if (!LGRP_EXISTS(lgrp) ||
1320 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1321 				continue;
1322 
1323 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1324 			klgrpset_add(changed, lgrp->lgrp_id);
1325 			count++;
1326 		}
1327 	}
1328 
1329 	/*
1330 	 * Add memory node to lgroup and remove lgroup from ones that need
1331 	 * to be updated
1332 	 */
1333 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1334 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1335 		my_lgrp->lgrp_nmnodes++;
1336 	}
1337 	klgrpset_del(changed, lgrpid);
1338 
1339 	/*
1340 	 * Update memory node information for all lgroups that changed and
1341 	 * contain new memory node as a resource
1342 	 */
1343 	if (count)
1344 		(void) lgrp_mnode_update(changed, NULL);
1345 
1346 	if (drop_lock)
1347 		mutex_exit(&cpu_lock);
1348 }
1349 
1350 /*
1351  * Called to indicate that the lgroup associated with the platform
1352  * handle "hand" no longer contains given memory node
1353  *
1354  * LOCKING for this routine is a bit tricky. Usually it is called without
1355  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1356  * callers. During DR of the board containing the caged memory it may be called
1357  * with cpu_lock already held and CPUs paused.
1358  *
1359  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1360  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1361  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1362  * the same mnode back into the topology. See lgrp_mem_rename() and
1363  * lgrp_mem_init() for additional details.
1364  */
1365 void
1366 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1367 {
1368 	klgrpset_t	changed;
1369 	int		count;
1370 	int		i;
1371 	lgrp_t		*my_lgrp;
1372 	lgrp_id_t	lgrpid;
1373 	mnodeset_t	mnodes_mask;
1374 	boolean_t	drop_lock = B_FALSE;
1375 	boolean_t	need_synch = B_FALSE;
1376 
1377 	/*
1378 	 * Grab CPU lock (if we haven't already)
1379 	 */
1380 	if (!MUTEX_HELD(&cpu_lock)) {
1381 		mutex_enter(&cpu_lock);
1382 		drop_lock = B_TRUE;
1383 	}
1384 
1385 	/*
1386 	 * This routine may be called from a context where we already
1387 	 * hold cpu_lock and have already paused cpus.
1388 	 */
1389 	if (!cpus_paused())
1390 		need_synch = B_TRUE;
1391 
1392 	my_lgrp = lgrp_hand_to_lgrp(hand);
1393 
1394 	/*
1395 	 * The lgrp *must* be pre-existing
1396 	 */
1397 	ASSERT(my_lgrp != NULL);
1398 
1399 	/*
1400 	 * Delete memory node from lgroups which contain it
1401 	 */
1402 	mnodes_mask = ((mnodeset_t)1 << mnode);
1403 	for (i = 0; i <= lgrp_alloc_max; i++) {
1404 		lgrp_t *lgrp = lgrp_table[i];
1405 		/*
1406 		 * Skip any non-existent lgroups and any lgroups that don't
1407 		 * contain leaf lgroup of memory as a memory resource
1408 		 */
1409 		if (!LGRP_EXISTS(lgrp) ||
1410 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1411 			continue;
1412 
1413 		/*
1414 		 * Avoid removing the last mnode from the root in the DR
1415 		 * copy-rename case. See lgrp_mem_rename() for details.
1416 		 */
1417 		if (is_copy_rename &&
1418 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1419 			continue;
1420 
1421 		/*
1422 		 * Remove memory node from lgroup.
1423 		 */
1424 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1425 		lgrp->lgrp_nmnodes--;
1426 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1427 	}
1428 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1429 
1430 	/*
1431 	 * Don't need to update lgroup topology if this lgroup still has memory.
1432 	 *
1433 	 * In the special case of DR copy-rename with the only mnode being
1434 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1435 	 * still need to update the lgroup topology.
1436 	 */
1437 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1438 	    !(is_copy_rename &&
1439 		(my_lgrp == lgrp_root) &&
1440 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1441 		if (drop_lock)
1442 			mutex_exit(&cpu_lock);
1443 		return;
1444 	}
1445 
1446 	/*
1447 	 * This lgroup does not contain any memory now
1448 	 */
1449 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1450 
1451 	/*
1452 	 * Remove this lgroup from lgroup topology if it does not contain any
1453 	 * resources now
1454 	 */
1455 	lgrpid = my_lgrp->lgrp_id;
1456 	count = 0;
1457 	klgrpset_clear(changed);
1458 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1459 		/*
1460 		 * Delete lgroup when no more resources
1461 		 */
1462 		if (need_synch)
1463 			pause_cpus(NULL);
1464 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1465 		    lgrp_alloc_max + 1, &changed);
1466 		ASSERT(count > 0);
1467 		if (need_synch)
1468 			start_cpus();
1469 	} else {
1470 		/*
1471 		 * Remove lgroup from memory resources of any lgroups that
1472 		 * contain it as such
1473 		 */
1474 		for (i = 0; i <= lgrp_alloc_max; i++) {
1475 			lgrp_t		*lgrp;
1476 
1477 			lgrp = lgrp_table[i];
1478 			if (!LGRP_EXISTS(lgrp) ||
1479 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1480 			    lgrpid))
1481 				continue;
1482 
1483 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1484 		}
1485 	}
1486 	if (drop_lock)
1487 		mutex_exit(&cpu_lock);
1488 }
1489 
1490 /*
1491  * Return lgroup with given platform handle
1492  */
1493 lgrp_t *
1494 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1495 {
1496 	int	i;
1497 	lgrp_t	*lgrp;
1498 
1499 	if (hand == LGRP_NULL_HANDLE)
1500 		return (NULL);
1501 
1502 	for (i = 0; i <= lgrp_alloc_max; i++) {
1503 		lgrp = lgrp_table[i];
1504 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1505 			return (lgrp);
1506 	}
1507 	return (NULL);
1508 }
1509 
1510 /*
1511  * Return the home lgroup of the current thread.
1512  * We must do this with kernel preemption disabled, since we don't want our
1513  * thread to be re-homed while we're poking around with its lpl, and the lpl
1514  * should never be NULL.
1515  *
1516  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1517  * is enabled because of DR.  Callers can use disable kernel preemption
1518  * around this call to guarantee that the lgroup will be valid beyond this
1519  * routine, since kernel preemption can be recursive.
1520  */
1521 lgrp_t *
1522 lgrp_home_lgrp(void)
1523 {
1524 	lgrp_t	*lgrp;
1525 	lpl_t	*lpl;
1526 
1527 	kpreempt_disable();
1528 
1529 	lpl = curthread->t_lpl;
1530 	ASSERT(lpl != NULL);
1531 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1532 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1533 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1534 
1535 	kpreempt_enable();
1536 
1537 	return (lgrp);
1538 }
1539 
1540 /*
1541  * Return ID of home lgroup for given thread
1542  * (See comments for lgrp_home_lgrp() for special care and handling
1543  * instructions)
1544  */
1545 lgrp_id_t
1546 lgrp_home_id(kthread_t *t)
1547 {
1548 	lgrp_id_t	lgrp;
1549 	lpl_t		*lpl;
1550 
1551 	ASSERT(t != NULL);
1552 	/*
1553 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1554 	 * cannot since the HAT layer can call into this routine to
1555 	 * determine the locality for its data structures in the context
1556 	 * of a page fault.
1557 	 */
1558 
1559 	kpreempt_disable();
1560 
1561 	lpl = t->t_lpl;
1562 	ASSERT(lpl != NULL);
1563 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1564 	lgrp = lpl->lpl_lgrpid;
1565 
1566 	kpreempt_enable();
1567 
1568 	return (lgrp);
1569 }
1570 
1571 /*
1572  * Return lgroup containing the physical memory for the given page frame number
1573  */
1574 lgrp_t *
1575 lgrp_pfn_to_lgrp(pfn_t pfn)
1576 {
1577 	lgrp_handle_t	hand;
1578 	int		i;
1579 	lgrp_t		*lgrp;
1580 
1581 	hand = lgrp_plat_pfn_to_hand(pfn);
1582 	if (hand != LGRP_NULL_HANDLE)
1583 		for (i = 0; i <= lgrp_alloc_max; i++) {
1584 			lgrp = lgrp_table[i];
1585 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1586 				return (lgrp);
1587 		}
1588 	return (NULL);
1589 }
1590 
1591 /*
1592  * Return lgroup containing the physical memory for the given page frame number
1593  */
1594 lgrp_t *
1595 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1596 {
1597 	lgrp_handle_t	hand;
1598 	int		i;
1599 	lgrp_t		*lgrp;
1600 	pfn_t		pfn;
1601 
1602 	pfn = btop(physaddr);
1603 	hand = lgrp_plat_pfn_to_hand(pfn);
1604 	if (hand != LGRP_NULL_HANDLE)
1605 		for (i = 0; i <= lgrp_alloc_max; i++) {
1606 			lgrp = lgrp_table[i];
1607 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1608 				return (lgrp);
1609 		}
1610 	return (NULL);
1611 }
1612 
1613 /*
1614  * Return the leaf lgroup containing the given CPU
1615  *
1616  * The caller needs to take precautions necessary to prevent
1617  * "cpu" from going away across a call to this function.
1618  * hint: kpreempt_disable()/kpreempt_enable()
1619  */
1620 static lgrp_t *
1621 lgrp_cpu_to_lgrp(cpu_t *cpu)
1622 {
1623 	return (cpu->cpu_chip->chip_lgrp);
1624 }
1625 
1626 /*
1627  * Return the sum of the partition loads in an lgrp divided by
1628  * the number of CPUs in the lgrp.  This is our best approximation
1629  * of an 'lgroup load average' for a useful per-lgroup kstat.
1630  */
1631 static uint64_t
1632 lgrp_sum_loadavgs(lgrp_t *lgrp)
1633 {
1634 	cpu_t *cpu;
1635 	int ncpu;
1636 	uint64_t loads = 0;
1637 
1638 	mutex_enter(&cpu_lock);
1639 
1640 	cpu = lgrp->lgrp_cpu;
1641 	ncpu = lgrp->lgrp_cpucnt;
1642 
1643 	if (cpu == NULL || ncpu == 0) {
1644 		mutex_exit(&cpu_lock);
1645 		return (0ull);
1646 	}
1647 
1648 	do {
1649 		loads += cpu->cpu_lpl->lpl_loadavg;
1650 		cpu = cpu->cpu_next_lgrp;
1651 	} while (cpu != lgrp->lgrp_cpu);
1652 
1653 	mutex_exit(&cpu_lock);
1654 
1655 	return (loads / ncpu);
1656 }
1657 
1658 void
1659 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1660 {
1661 	struct lgrp_stats *pstats;
1662 
1663 	/*
1664 	 * Verify that the caller isn't trying to add to
1665 	 * a statistic for an lgroup that has gone away
1666 	 */
1667 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1668 		return;
1669 
1670 	pstats = &lgrp_stats[lgrpid];
1671 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1672 }
1673 
1674 int64_t
1675 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1676 {
1677 	uint64_t val;
1678 	struct lgrp_stats *pstats;
1679 
1680 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1681 		return ((int64_t)0);
1682 
1683 	pstats = &lgrp_stats[lgrpid];
1684 	LGRP_STAT_READ(pstats, stat, val);
1685 	return (val);
1686 }
1687 
1688 /*
1689  * Reset all kstats for lgrp specified by its lgrpid.
1690  */
1691 static void
1692 lgrp_kstat_reset(lgrp_id_t lgrpid)
1693 {
1694 	lgrp_stat_t stat;
1695 
1696 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1697 		return;
1698 
1699 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1700 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1701 	}
1702 }
1703 
1704 /*
1705  * Collect all per-lgrp statistics for the lgrp associated with this
1706  * kstat, and store them in the ks_data array.
1707  *
1708  * The superuser can reset all the running counter statistics for an
1709  * lgrp by writing to any of the lgrp's stats.
1710  */
1711 static int
1712 lgrp_kstat_extract(kstat_t *ksp, int rw)
1713 {
1714 	lgrp_stat_t		stat;
1715 	struct kstat_named	*ksd;
1716 	lgrp_t			*lgrp;
1717 	lgrp_id_t		lgrpid;
1718 
1719 	lgrp = (lgrp_t *)ksp->ks_private;
1720 
1721 	ksd = (struct kstat_named *)ksp->ks_data;
1722 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1723 
1724 	lgrpid = lgrp->lgrp_id;
1725 
1726 	if (lgrpid == LGRP_NONE) {
1727 		/*
1728 		 * Return all zeroes as stats for freed lgrp.
1729 		 */
1730 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1731 			ksd[stat].value.i64 = 0;
1732 		}
1733 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1734 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1735 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1736 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1737 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1738 	} else if (rw != KSTAT_WRITE) {
1739 		/*
1740 		 * Handle counter stats
1741 		 */
1742 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1743 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1744 		}
1745 
1746 		/*
1747 		 * Handle kernel data snapshot stats
1748 		 */
1749 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1750 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1751 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1752 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1753 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1754 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1755 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1756 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1757 	} else {
1758 		lgrp_kstat_reset(lgrpid);
1759 	}
1760 
1761 	return (0);
1762 }
1763 
1764 int
1765 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1766 {
1767 	cpu_t	*cp;
1768 
1769 	mutex_enter(&cpu_lock);
1770 
1771 	if ((cp = cpu_get(id)) == NULL) {
1772 		mutex_exit(&cpu_lock);
1773 		return (EINVAL);
1774 	}
1775 
1776 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1777 		mutex_exit(&cpu_lock);
1778 		return (EINVAL);
1779 	}
1780 
1781 	ASSERT(cp->cpu_lpl != NULL);
1782 
1783 	*lp = cp->cpu_lpl->lpl_lgrpid;
1784 
1785 	mutex_exit(&cpu_lock);
1786 
1787 	return (0);
1788 }
1789 
1790 int
1791 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1792 {
1793 	cpu_t *cp;
1794 
1795 	mutex_enter(&cpu_lock);
1796 
1797 	if ((cp = cpu_get(id)) == NULL) {
1798 		mutex_exit(&cpu_lock);
1799 		return (EINVAL);
1800 	}
1801 
1802 	ASSERT(cp->cpu_lpl != NULL);
1803 
1804 	*lp = cp->cpu_lpl->lpl_loadavg;
1805 
1806 	mutex_exit(&cpu_lock);
1807 
1808 	return (0);
1809 }
1810 
1811 void
1812 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1813 {
1814 	lgrp_t		*lgrp;
1815 	int		i;
1816 
1817 	for (i = 0; i <= lgrp_alloc_max; i++) {
1818 		lgrp = lgrp_table[i];
1819 
1820 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1821 			lgrp->lgrp_latency = (int)newtime;
1822 	}
1823 }
1824 
1825 /*
1826  * Add a resource named by lpl_leaf to rset of lpl_target
1827  *
1828  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1829  * resource. It is adjusted here, as this is presently the only place that we
1830  * can be certain a resource addition has succeeded.
1831  *
1832  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1833  * list in order until it reaches a NULL.  (This list is required to be NULL
1834  * terminated, too).  This is done so that we can mark start pos + 1, so that
1835  * each lpl is traversed sequentially, but in a different order.  We hope this
1836  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1837  */
1838 
1839 void
1840 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1841 {
1842 	int		i;
1843 	int		entry_slot = 0;
1844 
1845 	/* return if leaf is already present */
1846 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1847 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1848 			return;
1849 		}
1850 
1851 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1852 		    lpl_leaf->lpl_lgrpid) {
1853 			break;
1854 		}
1855 	}
1856 
1857 	/* insert leaf, update counts */
1858 	entry_slot = i;
1859 	i = lpl_target->lpl_nrset++;
1860 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1861 		panic("More leaf lgrps in system than are supported!\n");
1862 	}
1863 
1864 	/*
1865 	 * Start at the end of the rset array and work backwards towards the
1866 	 * slot into which the new lpl will be inserted. This effectively
1867 	 * preserves the current ordering by scooting everybody over one entry,
1868 	 * and placing the new entry into the space created.
1869 	 */
1870 
1871 	while (i-- > entry_slot) {
1872 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1873 	}
1874 
1875 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1876 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1877 }
1878 
1879 /*
1880  * Update each of lpl_parent's children with a proper hint and
1881  * a reference to their parent.
1882  * The lgrp topology is used as the reference since it is fully
1883  * consistent and correct at this point.
1884  *
1885  * Each child's hint will reference an element in lpl_parent's
1886  * rset that designates where the child should start searching
1887  * for CPU resources. The hint selected is the highest order leaf present
1888  * in the child's lineage.
1889  *
1890  * This should be called after any potential change in lpl_parent's
1891  * rset.
1892  */
1893 static void
1894 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1895 {
1896 	klgrpset_t	children, leaves;
1897 	lpl_t		*lpl;
1898 	int		hint;
1899 	int		i, j;
1900 
1901 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1902 	if (klgrpset_isempty(children))
1903 		return; /* nothing to do */
1904 
1905 	for (i = 0; i <= lgrp_alloc_max; i++) {
1906 		if (klgrpset_ismember(children, i)) {
1907 
1908 			/*
1909 			 * Given the set of leaves in this child's lineage,
1910 			 * find the highest order leaf present in the parent's
1911 			 * rset. Select this as the hint for the child.
1912 			 */
1913 			leaves = lgrp_table[i]->lgrp_leaves;
1914 			hint = 0;
1915 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1916 				lpl = lpl_parent->lpl_rset[j];
1917 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1918 					hint = j;
1919 			}
1920 			cp->cp_lgrploads[i].lpl_hint = hint;
1921 
1922 			/*
1923 			 * (Re)set the parent. It may be incorrect if
1924 			 * lpl_parent is new in the topology.
1925 			 */
1926 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1927 		}
1928 	}
1929 }
1930 
1931 /*
1932  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1933  *
1934  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1935  * resource. The values are adjusted here, as this is the only place that we can
1936  * be certain a resource was successfully deleted.
1937  */
1938 void
1939 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1940 {
1941 	int i;
1942 
1943 	/* find leaf in intermediate node */
1944 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1945 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1946 			break;
1947 	}
1948 
1949 	/* return if leaf not found */
1950 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1951 		return;
1952 
1953 	/* prune leaf, compress array */
1954 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1955 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1956 	lpl_target->lpl_ncpu--;
1957 	do {
1958 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1959 	} while (i++ < lpl_target->lpl_nrset);
1960 }
1961 
1962 /*
1963  * Check to see if the resource set of the target lpl contains the
1964  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1965  */
1966 
1967 int
1968 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1969 {
1970 	int i;
1971 
1972 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1973 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1974 			return (1);
1975 	}
1976 
1977 	return (0);
1978 }
1979 
1980 /*
1981  * Called when we change cpu lpl membership.  This increments or decrements the
1982  * per-cpu counter in every lpl in which our leaf appears.
1983  */
1984 void
1985 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1986 {
1987 	cpupart_t	*cpupart;
1988 	lgrp_t		*lgrp_leaf;
1989 	lgrp_t		*lgrp_cur;
1990 	lpl_t		*lpl_leaf;
1991 	lpl_t		*lpl_cur;
1992 	int		i;
1993 
1994 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1995 
1996 	cpupart = cp->cpu_part;
1997 	lpl_leaf = cp->cpu_lpl;
1998 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1999 
2000 	for (i = 0; i <= lgrp_alloc_max; i++) {
2001 		lgrp_cur = lgrp_table[i];
2002 
2003 		/*
2004 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2005 		 * for the cpu in question, or if the current lgrp and leaf
2006 		 * don't share the same resources.
2007 		 */
2008 
2009 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2010 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2011 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2012 			continue;
2013 
2014 
2015 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2016 
2017 		if (lpl_cur->lpl_nrset > 0) {
2018 			if (act == LPL_INCREMENT) {
2019 				lpl_cur->lpl_ncpu++;
2020 			} else if (act == LPL_DECREMENT) {
2021 				lpl_cur->lpl_ncpu--;
2022 			}
2023 		}
2024 	}
2025 }
2026 
2027 /*
2028  * Initialize lpl with given resources and specified lgrp
2029  */
2030 
2031 void
2032 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2033 {
2034 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2035 	lpl->lpl_loadavg = 0;
2036 	if (lpl == lpl_leaf)
2037 		lpl->lpl_ncpu = 1;
2038 	else
2039 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2040 	lpl->lpl_nrset = 1;
2041 	lpl->lpl_rset[0] = lpl_leaf;
2042 	lpl->lpl_lgrp = lgrp;
2043 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2044 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2045 }
2046 
2047 /*
2048  * Clear an unused lpl
2049  */
2050 
2051 void
2052 lpl_clear(lpl_t *lpl)
2053 {
2054 	lgrpid_t	lid;
2055 
2056 	/* save lid for debugging purposes */
2057 	lid = lpl->lpl_lgrpid;
2058 	bzero(lpl, sizeof (lpl_t));
2059 	lpl->lpl_lgrpid = lid;
2060 }
2061 
2062 /*
2063  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may not
2065  * make full use of all of the lgroup topology, but this checks to make sure
2066  * that for the parts that it does use, it has correctly understood the
2067  * relationships that exist. This function returns
2068  * 0 if the topology is correct, and a non-zero error code, for non-debug
2069  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2070  * debugging on a DEBUG kernel.
2071  */
2072 int
2073 lpl_topo_verify(cpupart_t *cpupart)
2074 {
2075 	lgrp_t		*lgrp;
2076 	lpl_t		*lpl;
2077 	klgrpset_t	rset;
2078 	klgrpset_t	cset;
2079 	cpu_t		*cpu;
2080 	cpu_t		*cp_start;
2081 	int		i;
2082 	int		j;
2083 	int		sum;
2084 
2085 	/* topology can't be incorrect if it doesn't exist */
2086 	if (!lgrp_topo_initialized || !lgrp_initialized)
2087 		return (LPL_TOPO_CORRECT);
2088 
2089 	ASSERT(cpupart != NULL);
2090 
2091 	for (i = 0; i <= lgrp_alloc_max; i++) {
2092 		lgrp = lgrp_table[i];
2093 		lpl = NULL;
2094 		/* make sure lpls are allocated */
2095 		ASSERT(cpupart->cp_lgrploads);
2096 		if (!cpupart->cp_lgrploads)
2097 			return (LPL_TOPO_PART_HAS_NO_LPL);
2098 
2099 		lpl = &cpupart->cp_lgrploads[i];
2100 		/* make sure our index is good */
2101 		ASSERT(i < cpupart->cp_nlgrploads);
2102 
2103 		/* if lgroup doesn't exist, make sure lpl is empty */
2104 		if (!LGRP_EXISTS(lgrp)) {
2105 			ASSERT(lpl->lpl_ncpu == 0);
2106 			if (lpl->lpl_ncpu > 0) {
2107 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2108 			} else {
2109 				continue;
2110 			}
2111 		}
2112 
2113 		/* verify that lgroup and lpl are identically numbered */
2114 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2115 
2116 		/* if lgroup isn't in our partition, make sure lpl is empty */
2117 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2118 		    cpupart->cp_lgrpset)) {
2119 			ASSERT(lpl->lpl_ncpu == 0);
2120 			if (lpl->lpl_ncpu > 0) {
2121 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2122 			}
2123 			/*
2124 			 * lpl is empty, and lgroup isn't in partition.  verify
2125 			 * that lpl doesn't show up in anyone else's rsets (in
2126 			 * this partition, anyway)
2127 			 */
2128 
2129 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2130 				lpl_t *i_lpl; /* lpl we're iterating over */
2131 
2132 				i_lpl = &cpupart->cp_lgrploads[j];
2133 
2134 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2135 				if (lpl_rset_contains(i_lpl, lpl)) {
2136 					return (LPL_TOPO_LPL_ORPHANED);
2137 				}
2138 			}
2139 			/* lgroup is empty, and everything is ok. continue */
2140 			continue;
2141 		}
2142 
2143 
2144 		/* lgroup is in this partition, now check it against lpl */
2145 
2146 		/* do both have matching lgrps? */
2147 		ASSERT(lgrp == lpl->lpl_lgrp);
2148 		if (lgrp != lpl->lpl_lgrp) {
2149 			return (LPL_TOPO_LGRP_MISMATCH);
2150 		}
2151 
2152 		/* do the parent lgroups exist and do they match? */
2153 		if (lgrp->lgrp_parent) {
2154 			ASSERT(lpl->lpl_parent);
2155 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2156 				    lpl->lpl_parent->lpl_lgrpid);
2157 
2158 			if (!lpl->lpl_parent) {
2159 				return (LPL_TOPO_MISSING_PARENT);
2160 			} else if (lgrp->lgrp_parent->lgrp_id !=
2161 			    lpl->lpl_parent->lpl_lgrpid) {
2162 				return (LPL_TOPO_PARENT_MISMATCH);
2163 			}
2164 		}
2165 
2166 		/* only leaf lgroups keep a cpucnt, only check leaves */
2167 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2168 
2169 			/* verify that lgrp is also a leaf */
2170 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2171 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2172 			    lpl->lpl_lgrpid)));
2173 
2174 			if ((lgrp->lgrp_childcnt > 0) ||
2175 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2176 			    lpl->lpl_lgrpid))) {
2177 				return (LPL_TOPO_LGRP_NOT_LEAF);
2178 			}
2179 
2180 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2181 			    (lpl->lpl_ncpu > 0));
2182 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2183 				(lpl->lpl_ncpu <= 0)) {
2184 				return (LPL_TOPO_BAD_CPUCNT);
2185 			}
2186 
2187 			/*
2188 			 * Check that lpl_ncpu also matches the number of
2189 			 * cpus in the lpl's linked list.  This only exists in
2190 			 * leaves, but they should always match.
2191 			 */
2192 			j = 0;
2193 			cpu = cp_start = lpl->lpl_cpus;
2194 			while (cpu != NULL) {
2195 				j++;
2196 
2197 				/* check to make sure cpu's lpl is leaf lpl */
2198 				ASSERT(cpu->cpu_lpl == lpl);
2199 				if (cpu->cpu_lpl != lpl) {
2200 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2201 				}
2202 
2203 				/* check next cpu */
2204 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2205 					continue;
2206 				} else {
2207 					cpu = NULL;
2208 				}
2209 			}
2210 
2211 			ASSERT(j == lpl->lpl_ncpu);
2212 			if (j != lpl->lpl_ncpu) {
2213 				return (LPL_TOPO_LPL_BAD_NCPU);
2214 			}
2215 
2216 			/*
2217 			 * Also, check that leaf lpl is contained in all
2218 			 * intermediate lpls that name the leaf as a descendant
2219 			 */
2220 
2221 			for (j = 0; j <= lgrp_alloc_max; j++) {
2222 				klgrpset_t intersect;
2223 				lgrp_t *lgrp_cand;
2224 				lpl_t *lpl_cand;
2225 
2226 				lgrp_cand = lgrp_table[j];
2227 				intersect = klgrpset_intersects(
2228 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2229 				    cpupart->cp_lgrpset);
2230 
2231 				if (!LGRP_EXISTS(lgrp_cand) ||
2232 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2233 				    cpupart->cp_lgrpset) ||
2234 				    (intersect == 0))
2235 					continue;
2236 
2237 				lpl_cand =
2238 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2239 
2240 				if (klgrpset_ismember(intersect,
2241 				    lgrp->lgrp_id)) {
2242 					ASSERT(lpl_rset_contains(lpl_cand,
2243 					    lpl));
2244 
2245 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2246 						return (LPL_TOPO_RSET_MSSNG_LF);
2247 					}
2248 				}
2249 			}
2250 
2251 		} else { /* non-leaf specific checks */
2252 
2253 			/*
2254 			 * Non-leaf lpls should have lpl_cpus == NULL
2255 			 * verify that this is so
2256 			 */
2257 			ASSERT(lpl->lpl_cpus == NULL);
2258 			if (lpl->lpl_cpus != NULL) {
2259 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2260 			}
2261 
2262 			/*
2263 			 * verify that the sum of the cpus in the leaf resources
2264 			 * is equal to the total ncpu in the intermediate
2265 			 */
2266 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2267 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2268 			}
2269 
2270 			ASSERT(sum == lpl->lpl_ncpu);
2271 			if (sum != lpl->lpl_ncpu) {
2272 				return (LPL_TOPO_LPL_BAD_NCPU);
2273 			}
2274 		}
2275 
2276 		/*
2277 		 * check on lpl_hint. Don't check root, since it has no parent.
2278 		 */
2279 		if (lpl->lpl_parent != NULL) {
2280 			int hint;
2281 			lpl_t *hint_lpl;
2282 
2283 			/* make sure hint is within limits of nrset */
2284 			hint = lpl->lpl_hint;
2285 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2286 			if (lpl->lpl_parent->lpl_nrset < hint) {
2287 				return (LPL_TOPO_BOGUS_HINT);
2288 			}
2289 
2290 			/* make sure hint points to valid lpl */
2291 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2292 			ASSERT(hint_lpl->lpl_ncpu > 0);
2293 			if (hint_lpl->lpl_ncpu <= 0) {
2294 				return (LPL_TOPO_BOGUS_HINT);
2295 			}
2296 		}
2297 
2298 		/*
2299 		 * Check the rset of the lpl in question.  Make sure that each
2300 		 * rset contains a subset of the resources in
2301 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2302 		 * sure that each rset doesn't include resources that are
2303 		 * outside of that set.  (Which would be resources somehow not
2304 		 * accounted for).
2305 		 */
2306 
2307 		klgrpset_clear(rset);
2308 		for (j = 0; j < lpl->lpl_nrset; j++) {
2309 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2310 		}
2311 		klgrpset_copy(cset, rset);
2312 		/* make sure lpl rset matches lgrp rset */
2313 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2314 		/* make sure rset is contained with in partition, too */
2315 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2316 
2317 		ASSERT(klgrpset_isempty(rset) &&
2318 			    klgrpset_isempty(cset));
2319 		if (!klgrpset_isempty(rset) ||
2320 		    !klgrpset_isempty(cset)) {
2321 			return (LPL_TOPO_RSET_MISMATCH);
2322 		}
2323 
2324 		/*
2325 		 * check to make sure lpl_nrset matches the number of rsets
2326 		 * contained in the lpl
2327 		 */
2328 
2329 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2330 		    j++);
2331 
2332 		ASSERT(j == lpl->lpl_nrset);
2333 		if (j != lpl->lpl_nrset) {
2334 			return (LPL_TOPO_BAD_RSETCNT);
2335 		}
2336 
2337 	}
2338 	return (LPL_TOPO_CORRECT);
2339 }
2340 
2341 /*
2342  * Flatten lpl topology to given number of levels.  This is presently only
2343  * implemented for a flatten to 2 levels, which will prune out the intermediates
2344  * and home the leaf lpls to the root lpl.
2345  */
2346 int
2347 lpl_topo_flatten(int levels)
2348 {
2349 	int		i;
2350 	uint_t		sum;
2351 	lgrp_t		*lgrp_cur;
2352 	lpl_t		*lpl_cur;
2353 	lpl_t		*lpl_root;
2354 	cpupart_t	*cp;
2355 
2356 	if (levels != 2)
2357 		return (0);
2358 
2359 	/* called w/ cpus paused - grab no locks! */
2360 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2361 	    !lgrp_initialized);
2362 
2363 	cp = cp_list_head;
2364 	do {
2365 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2366 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2367 
2368 		for (i = 0; i <= lgrp_alloc_max; i++) {
2369 			lgrp_cur = lgrp_table[i];
2370 			lpl_cur = &cp->cp_lgrploads[i];
2371 
2372 			if ((lgrp_cur == lgrp_root) ||
2373 			    (!LGRP_EXISTS(lgrp_cur) &&
2374 			    (lpl_cur->lpl_ncpu == 0)))
2375 				continue;
2376 
2377 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2378 				/*
2379 				 * this should be a deleted intermediate, so
2380 				 * clear it
2381 				 */
2382 				lpl_clear(lpl_cur);
2383 			} else if ((lpl_cur->lpl_nrset == 1) &&
2384 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2385 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2386 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2387 				/*
2388 				 * this is a leaf whose parent was deleted, or
2389 				 * whose parent had their lgrp deleted.  (And
2390 				 * whose parent will soon be deleted).  Point
2391 				 * this guy back to the root lpl.
2392 				 */
2393 				lpl_cur->lpl_parent = lpl_root;
2394 				lpl_rset_add(lpl_root, lpl_cur);
2395 			}
2396 
2397 		}
2398 
2399 		/*
2400 		 * Now that we're done, make sure the count on the root lpl is
2401 		 * correct, and update the hints of the children for the sake of
2402 		 * thoroughness
2403 		 */
2404 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2405 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2406 		}
2407 		lpl_root->lpl_ncpu = sum;
2408 		lpl_child_update(lpl_root, cp);
2409 
2410 		cp = cp->cp_next;
2411 	} while (cp != cp_list_head);
2412 
2413 	return (levels);
2414 }
2415 
2416 /*
2417  * Insert a lpl into the resource hierarchy and create any additional lpls that
2418  * are necessary to represent the varying states of locality for the cpu
2419  * resoruces newly added to the partition.
2420  *
2421  * This routine is clever enough that it can correctly add resources from the
2422  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2423  * those for which the lpl is a leaf as opposed to simply a named equally local
2424  * resource).  The one special case that needs additional processing is when a
2425  * new intermediate lpl is introduced.  Since the main loop only traverses
2426  * looking to add the leaf resource where it does not yet exist, additional work
2427  * is necessary to add other leaf resources that may need to exist in the newly
2428  * created intermediate.  This is performed by the second inner loop, and is
2429  * only done when the check for more than one overlapping resource succeeds.
2430  */
2431 
2432 void
2433 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2434 {
2435 	int		i;
2436 	int		j;
2437 	int		hint;
2438 	int		rset_num_intersect;
2439 	lgrp_t		*lgrp_cur;
2440 	lpl_t		*lpl_cur;
2441 	lpl_t		*lpl_parent;
2442 	lgrpid_t	parent_id;
2443 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2444 
2445 	for (i = 0; i <= lgrp_alloc_max; i++) {
2446 		lgrp_cur = lgrp_table[i];
2447 
2448 		/*
2449 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2450 		 * contained within the current lgrp, or if the current lgrp has
2451 		 * no leaves in this partition
2452 		 */
2453 
2454 		if (!LGRP_EXISTS(lgrp_cur) ||
2455 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2456 		    lpl_leaf->lpl_lgrpid) ||
2457 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2458 		    cpupart->cp_lgrpset))
2459 			continue;
2460 
2461 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2462 		if (lgrp_cur->lgrp_parent != NULL) {
2463 			/* if lgrp has a parent, assign it properly */
2464 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2465 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2466 		} else {
2467 			/* if not, make sure parent ptr gets set to null */
2468 			lpl_parent = NULL;
2469 		}
2470 
2471 		if (lpl_cur == lpl_leaf) {
2472 			/*
2473 			 * Almost all leaf state was initialized elsewhere.  The
2474 			 * only thing left to do is to set the parent.
2475 			 */
2476 			lpl_cur->lpl_parent = lpl_parent;
2477 			continue;
2478 		}
2479 
2480 		/*
2481 		 * Initialize intermediate lpl
2482 		 * Save this lpl's hint though. Since we're changing this
2483 		 * lpl's resources, we need to update the hint in this lpl's
2484 		 * children, but the hint in this lpl is unaffected and
2485 		 * should be preserved.
2486 		 */
2487 		hint = lpl_cur->lpl_hint;
2488 
2489 		lpl_clear(lpl_cur);
2490 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2491 
2492 		lpl_cur->lpl_hint = hint;
2493 		lpl_cur->lpl_parent = lpl_parent;
2494 
2495 		/* does new lpl need to be populated with other resources? */
2496 		rset_intersect =
2497 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2498 			cpupart->cp_lgrpset);
2499 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2500 
2501 		if (rset_num_intersect > 1) {
2502 			/*
2503 			 * If so, figure out what lpls have resources that
2504 			 * intersect this one, and add them.
2505 			 */
2506 			for (j = 0; j <= lgrp_alloc_max; j++) {
2507 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2508 				lpl_t	*lpl_cand;	/* candidate lpl */
2509 
2510 				lgrp_cand = lgrp_table[j];
2511 				if (!LGRP_EXISTS(lgrp_cand) ||
2512 				    !klgrpset_ismember(rset_intersect,
2513 					lgrp_cand->lgrp_id))
2514 					continue;
2515 				lpl_cand =
2516 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2517 				lpl_rset_add(lpl_cur, lpl_cand);
2518 			}
2519 		}
2520 		/*
2521 		 * This lpl's rset has changed. Update the hint in it's
2522 		 * children.
2523 		 */
2524 		lpl_child_update(lpl_cur, cpupart);
2525 	}
2526 }
2527 
2528 /*
2529  * remove a lpl from the hierarchy of resources, clearing its state when
2530  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2531  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2532  * delete them as well.
2533  */
2534 
2535 void
2536 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2537 {
2538 	int		i;
2539 	lgrp_t		*lgrp_cur;
2540 	lpl_t		*lpl_cur;
2541 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2542 
2543 	for (i = 0; i <= lgrp_alloc_max; i++) {
2544 		lgrp_cur = lgrp_table[i];
2545 
2546 		/*
2547 		 * Don't attempt to remove from lgrps that aren't there, that
2548 		 * don't contain our leaf, or from the leaf itself. (We do that
2549 		 * later)
2550 		 */
2551 
2552 		if (!LGRP_EXISTS(lgrp_cur))
2553 			continue;
2554 
2555 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2556 
2557 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2558 		    lpl_leaf->lpl_lgrpid) ||
2559 		    (lpl_cur == lpl_leaf)) {
2560 			continue;
2561 		}
2562 
2563 		/*
2564 		 * This is a slightly sleazy simplification in that we have
2565 		 * already marked the cp_lgrpset as no longer containing the
2566 		 * leaf we've deleted.  Any lpls that pass the above checks
2567 		 * based upon lgrp membership but not necessarily cpu-part
2568 		 * membership also get cleared by the checks below.  Currently
2569 		 * this is harmless, as the lpls should be empty anyway.
2570 		 *
2571 		 * In particular, we want to preserve lpls that have additional
2572 		 * leaf resources, even though we don't yet have a processor
2573 		 * architecture that represents resources this way.
2574 		 */
2575 
2576 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2577 		    cpupart->cp_lgrpset);
2578 
2579 		lpl_rset_del(lpl_cur, lpl_leaf);
2580 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2581 			lpl_clear(lpl_cur);
2582 		} else {
2583 			/*
2584 			 * Update this lpl's children
2585 			 */
2586 			lpl_child_update(lpl_cur, cpupart);
2587 		}
2588 	}
2589 	lpl_clear(lpl_leaf);
2590 }
2591 
2592 /*
2593  * add a cpu to a partition in terms of lgrp load avg bookeeping
2594  *
2595  * The lpl (cpu partition load average information) is now arranged in a
2596  * hierarchical fashion whereby resources that are closest, ie. most local, to
2597  * the cpu in question are considered to be leaves in a tree of resources.
2598  * There are two general cases for cpu additon:
2599  *
2600  * 1. A lpl structure that contains resources already in the hierarchy tree.
2601  * In this case, all of the associated lpl relationships have been defined, and
2602  * all that is necessary is that we link the new cpu into the per-lpl list of
2603  * cpus, and increment the ncpu count of all places where this cpu resource will
2604  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2605  * pushing is accomplished by this routine.
2606  *
2607  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2608  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2609  * construct the hierarchy of state necessary to name it's more distant
2610  * resources, if they should exist.  The leaf structure is initialized by this
2611  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2612  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2613  * and builds all of the "ancestoral" state necessary to identify resources at
2614  * differing levels of locality.
2615  */
2616 void
2617 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2618 {
2619 	cpupart_t	*cpupart;
2620 	lgrp_t		*lgrp_leaf;
2621 	lpl_t		*lpl_leaf;
2622 
2623 	/* called sometimes w/ cpus paused - grab no locks */
2624 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2625 
2626 	cpupart = cp->cpu_part;
2627 	lgrp_leaf = lgrp_table[lgrpid];
2628 
2629 	/* don't add non-existent lgrp */
2630 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2631 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2632 	cp->cpu_lpl = lpl_leaf;
2633 
2634 	/* only leaf lpls contain cpus */
2635 
2636 	if (lpl_leaf->lpl_ncpu++ == 0) {
2637 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2638 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2639 		lpl_leaf_insert(lpl_leaf, cpupart);
2640 	} else {
2641 		/*
2642 		 * the lpl should already exist in the parent, so just update
2643 		 * the count of available CPUs
2644 		 */
2645 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2646 	}
2647 
2648 	/* link cpu into list of cpus in lpl */
2649 
2650 	if (lpl_leaf->lpl_cpus) {
2651 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2652 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2653 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2654 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2655 	} else {
2656 		/*
2657 		 * We increment ncpu immediately after we create a new leaf
2658 		 * lpl, so assert that ncpu == 1 for the case where we don't
2659 		 * have any cpu pointers yet.
2660 		 */
2661 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2662 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2663 	}
2664 
2665 }
2666 
2667 
2668 /*
2669  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2670  *
2671  * The lpl (cpu partition load average information) is now arranged in a
2672  * hierarchical fashion whereby resources that are closest, ie. most local, to
2673  * the cpu in question are considered to be leaves in a tree of resources.
2674  * There are two removal cases in question:
2675  *
2676  * 1. Removal of the resource in the leaf leaves other resources remaining in
2677  * that leaf.  (Another cpu still exists at this level of locality).  In this
2678  * case, the count of available cpus is decremented in all assocated lpls by
2679  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2680  * from the per-cpu lpl list.
2681  *
2682  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2683  * empty)  In this case, all of what has occurred for the first step must take
2684  * place; however, additionally we must remove the lpl structure itself, prune
2685  * out any stranded lpls that do not directly name a leaf resource, and mark the
2686  * cpu partition in question as no longer containing resources from the lgrp of
2687  * the lpl that has been delted.  Cpu-partition changes are handled by this
2688  * method, but the lpl_leaf_remove function deals with the details of pruning
2689  * out the empty lpl and any of its orphaned direct ancestors.
2690  */
2691 void
2692 lgrp_part_del_cpu(cpu_t *cp)
2693 {
2694 	lpl_t		*lpl;
2695 	lpl_t		*leaf_lpl;
2696 	lgrp_t		*lgrp_leaf;
2697 
2698 	/* called sometimes w/ cpus paused - grab no locks */
2699 
2700 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2701 
2702 	lpl = leaf_lpl = cp->cpu_lpl;
2703 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2704 
2705 	/* don't delete a leaf that isn't there */
2706 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2707 
2708 	/* no double-deletes */
2709 	ASSERT(lpl->lpl_ncpu);
2710 	if (--lpl->lpl_ncpu == 0) {
2711 		/*
2712 		 * This was the last cpu in this lgroup for this partition,
2713 		 * clear its bit in the partition's lgroup bitmask
2714 		 */
2715 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2716 
2717 		/* eliminate remaning lpl link pointers in cpu, lpl */
2718 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2719 
2720 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2721 	} else {
2722 
2723 		/* unlink cpu from lists of cpus in lpl */
2724 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2725 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2726 		if (lpl->lpl_cpus == cp) {
2727 			lpl->lpl_cpus = cp->cpu_next_lpl;
2728 		}
2729 
2730 		/*
2731 		 * Update the cpu count in the lpls associated with parent
2732 		 * lgroups.
2733 		 */
2734 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2735 
2736 	}
2737 	/* clear cpu's lpl ptr when we're all done */
2738 	cp->cpu_lpl = NULL;
2739 }
2740 
2741 /*
2742  * Recompute load average for the specified partition/lgrp fragment.
2743  *
2744  * We rely on the fact that this routine is called from the clock thread
2745  * at a point before the clock thread can block (i.e. before its first
2746  * lock request).  Since the clock thread can not be preempted (since it
2747  * runs at highest priority), we know that cpu partitions can not change
2748  * (since doing so would require either the repartition requester or the
2749  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2750  * without grabbing cpu_lock.
2751  */
2752 void
2753 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2754 {
2755 	uint_t		ncpu;
2756 	int64_t		old, new, f;
2757 
2758 	/*
2759 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2760 	 */
2761 	static short expval[] = {
2762 	    0, 3196, 1618, 1083,
2763 	    814, 652, 543, 466,
2764 	    408, 363, 326, 297,
2765 	    272, 251, 233, 218,
2766 	    204, 192, 181, 172,
2767 	    163, 155, 148, 142,
2768 	    136, 130, 125, 121,
2769 	    116, 112, 109, 105
2770 	};
2771 
2772 	/* ASSERT (called from clock level) */
2773 
2774 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2775 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2776 		return;
2777 	}
2778 
2779 	for (;;) {
2780 
2781 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2782 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2783 		else
2784 			f = expval[ncpu];
2785 
2786 		/*
2787 		 * Modify the load average atomically to avoid losing
2788 		 * anticipatory load updates (see lgrp_move_thread()).
2789 		 */
2790 		if (ageflag) {
2791 			/*
2792 			 * We're supposed to both update and age the load.
2793 			 * This happens 10 times/sec. per cpu.  We do a
2794 			 * little hoop-jumping to avoid integer overflow.
2795 			 */
2796 			int64_t		q, r;
2797 
2798 			do {
2799 				old = new = lpl->lpl_loadavg;
2800 				q = (old  >> 16) << 7;
2801 				r = (old  & 0xffff) << 7;
2802 				new += ((long long)(nrcpus - q) * f -
2803 				    ((r * f) >> 16)) >> 7;
2804 
2805 				/*
2806 				 * Check for overflow
2807 				 */
2808 				if (new > LGRP_LOADAVG_MAX)
2809 					new = LGRP_LOADAVG_MAX;
2810 				else if (new < 0)
2811 					new = 0;
2812 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2813 			    new) != old);
2814 		} else {
2815 			/*
2816 			 * We're supposed to update the load, but not age it.
2817 			 * This option is used to update the load (which either
2818 			 * has already been aged in this 1/10 sec. interval or
2819 			 * soon will be) to account for a remotely executing
2820 			 * thread.
2821 			 */
2822 			do {
2823 				old = new = lpl->lpl_loadavg;
2824 				new += f;
2825 				/*
2826 				 * Check for overflow
2827 				 * Underflow not possible here
2828 				 */
2829 				if (new < old)
2830 					new = LGRP_LOADAVG_MAX;
2831 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2832 			    new) != old);
2833 		}
2834 
2835 		/*
2836 		 * Do the same for this lpl's parent
2837 		 */
2838 		if ((lpl = lpl->lpl_parent) == NULL)
2839 			break;
2840 		ncpu = lpl->lpl_ncpu;
2841 	}
2842 }
2843 
2844 /*
2845  * Initialize lpl topology in the target based on topology currently present in
2846  * lpl_bootstrap.
2847  *
2848  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2849  * initialize cp_default list of lpls. Up to this point all topology operations
2850  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2851  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2852  * `target' points to the list of lpls in cp_default and `size' is the size of
2853  * this list.
2854  *
2855  * This function walks the lpl topology in lpl_bootstrap and does for things:
2856  *
2857  * 1) Copies all fields from lpl_bootstrap to the target.
2858  *
2859  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2860  *
2861  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2862  *    instead of lpl_bootstrap.
2863  *
2864  * 4) Updates pointers in the resource list of the target to point to the lpls
2865  *    in the target list instead of lpl_bootstrap.
2866  *
2867  * After lpl_topo_bootstrap() completes, target contains the same information
2868  * that would be present there if it were used during boot instead of
2869  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2870  * and it is bzeroed.
2871  */
2872 void
2873 lpl_topo_bootstrap(lpl_t *target, int size)
2874 {
2875 	lpl_t	*lpl = lpl_bootstrap;
2876 	lpl_t	*target_lpl = target;
2877 	int	howmany;
2878 	int	id;
2879 	int	i;
2880 
2881 	/*
2882 	 * The only target that should be passed here is cp_default lpl list.
2883 	 */
2884 	ASSERT(target == cp_default.cp_lgrploads);
2885 	ASSERT(size == cp_default.cp_nlgrploads);
2886 	ASSERT(!lgrp_topo_initialized);
2887 	ASSERT(ncpus == 1);
2888 
2889 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2890 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2891 		/*
2892 		 * Copy all fields from lpl.
2893 		 */
2894 
2895 		*target_lpl = *lpl;
2896 
2897 		/*
2898 		 * Substitute CPU0 lpl pointer with one relative to target.
2899 		 */
2900 		if (lpl->lpl_cpus == CPU) {
2901 			ASSERT(CPU->cpu_lpl == lpl);
2902 			CPU->cpu_lpl = target_lpl;
2903 		}
2904 
2905 		/*
2906 		 * Substitute parent information with parent relative to target.
2907 		 */
2908 		if (lpl->lpl_parent != NULL)
2909 			target_lpl->lpl_parent = (lpl_t *)
2910 			    (((uintptr_t)lpl->lpl_parent -
2911 				(uintptr_t)lpl_bootstrap) +
2912 				(uintptr_t)target);
2913 
2914 		/*
2915 		 * Walk over resource set substituting pointers relative to
2916 		 * lpl_bootstrap to pointers relative to target.
2917 		 */
2918 		ASSERT(lpl->lpl_nrset <= 1);
2919 
2920 		for (id = 0; id < lpl->lpl_nrset; id++) {
2921 			if (lpl->lpl_rset[id] != NULL) {
2922 				target_lpl->lpl_rset[id] =
2923 				    (lpl_t *)
2924 				    (((uintptr_t)lpl->lpl_rset[id] -
2925 					(uintptr_t)lpl_bootstrap) +
2926 					(uintptr_t)target);
2927 			}
2928 		}
2929 	}
2930 
2931 	/*
2932 	 * Topology information in lpl_bootstrap is no longer needed.
2933 	 */
2934 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2935 }
2936 
2937 /* the maximum effect that a single thread can have on it's lgroup's load */
2938 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
2939 	((lgrp_loadavg_max_effect) / (ncpu))
2940 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
2941 
2942 /*
2943  * If the lowest load among the lgroups a process' threads are currently
2944  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2945  * expanding the process to a new lgroup.
2946  */
2947 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2948 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2949 
2950 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2951 	((lgrp_expand_proc_thresh) / (ncpu))
2952 
2953 /*
2954  * A process will be expanded to a new lgroup only if the difference between
2955  * the lowest load on the lgroups the process' thread's are currently spread
2956  * across and the lowest load on the other lgroups in the process' partition
2957  * is greater than lgrp_expand_proc_diff.
2958  */
2959 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2960 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2961 
2962 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2963 	((lgrp_expand_proc_diff) / (ncpu))
2964 
2965 /*
2966  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2967  * be present due to impreciseness of the load average decay algorithm.
2968  *
2969  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2970  * tolerance is scaled by the number of cpus in the lgroup just like
2971  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2972  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2973  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2974  */
2975 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2976 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2977 	((lgrp_loadavg_tolerance) / ncpu)
2978 
2979 /*
2980  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2981  * average is above this threshold
2982  */
2983 uint32_t	lgrp_load_thresh = UINT32_MAX;
2984 
2985 /*
2986  * lgrp_choose() will try to skip any lgroups with less memory
2987  * than this free when choosing a home lgroup
2988  */
2989 pgcnt_t	lgrp_mem_free_thresh = 0;
2990 
2991 /*
2992  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2993  * one based on one of the following policies:
2994  * - Random selection
2995  * - Pseudo round robin placement
2996  * - Longest time since a thread was last placed
2997  */
2998 #define	LGRP_CHOOSE_RANDOM	1
2999 #define	LGRP_CHOOSE_RR		2
3000 #define	LGRP_CHOOSE_TIME	3
3001 
3002 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3003 
3004 /*
3005  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3006  * be bound to a CPU or processor set.
3007  *
3008  * Arguments:
3009  *	t		The thread
3010  *	cpupart		The partition the thread belongs to.
3011  *
3012  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3013  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3014  *	 partitions changing out from under us and assumes that given thread is
3015  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3016  *	 disabled, so don't grab any locks because we should never block under
3017  *	 those conditions.
3018  */
3019 lpl_t *
3020 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3021 {
3022 	lgrp_load_t	bestload, bestrload;
3023 	int		lgrpid_offset, lgrp_count;
3024 	lgrp_id_t	lgrpid, lgrpid_start;
3025 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3026 	klgrpset_t	lgrpset;
3027 	proc_t		*p;
3028 
3029 	ASSERT(t != NULL);
3030 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3031 	    THREAD_LOCK_HELD(t));
3032 	ASSERT(cpupart != NULL);
3033 
3034 	p = t->t_procp;
3035 
3036 	/* A process should always be in an active partition */
3037 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3038 
3039 	bestlpl = bestrlpl = NULL;
3040 	bestload = bestrload = LGRP_LOADAVG_MAX;
3041 	lgrpset = cpupart->cp_lgrpset;
3042 
3043 	switch (lgrp_choose_policy) {
3044 	case LGRP_CHOOSE_RR:
3045 		lgrpid = cpupart->cp_lgrp_hint;
3046 		do {
3047 			if (++lgrpid > lgrp_alloc_max)
3048 				lgrpid = 0;
3049 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3050 
3051 		break;
3052 	default:
3053 	case LGRP_CHOOSE_TIME:
3054 	case LGRP_CHOOSE_RANDOM:
3055 		klgrpset_nlgrps(lgrpset, lgrp_count);
3056 		lgrpid_offset =
3057 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3058 		for (lgrpid = 0; ; lgrpid++) {
3059 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3060 				if (--lgrpid_offset == 0)
3061 					break;
3062 			}
3063 		}
3064 		break;
3065 	}
3066 
3067 	lgrpid_start = lgrpid;
3068 
3069 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3070 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3071 
3072 	/*
3073 	 * Use lgroup affinities (if any) to choose best lgroup
3074 	 *
3075 	 * NOTE: Assumes that thread is protected from going away and its
3076 	 *	 lgroup affinities won't change (ie. p_lock, or
3077 	 *	 thread_lock() being held and/or CPUs paused)
3078 	 */
3079 	if (t->t_lgrp_affinity) {
3080 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3081 		if (lpl != NULL)
3082 			return (lpl);
3083 	}
3084 
3085 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3086 	bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3087 
3088 	do {
3089 		pgcnt_t	npgs;
3090 
3091 		/*
3092 		 * Skip any lgroups outside of thread's pset
3093 		 */
3094 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3095 			if (++lgrpid > lgrp_alloc_max)
3096 				lgrpid = 0;	/* wrap the search */
3097 			continue;
3098 		}
3099 
3100 		/*
3101 		 * Skip any non-leaf lgroups
3102 		 */
3103 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3104 			continue;
3105 
3106 		/*
3107 		 * Skip any lgroups without enough free memory
3108 		 * (when threshold set to nonzero positive value)
3109 		 */
3110 		if (lgrp_mem_free_thresh > 0) {
3111 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3112 			if (npgs < lgrp_mem_free_thresh) {
3113 				if (++lgrpid > lgrp_alloc_max)
3114 					lgrpid = 0;	/* wrap the search */
3115 				continue;
3116 			}
3117 		}
3118 
3119 		lpl = &cpupart->cp_lgrploads[lgrpid];
3120 		if (klgrpset_isempty(p->p_lgrpset) ||
3121 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3122 			/*
3123 			 * Either this is a new process or the process already
3124 			 * has threads on this lgrp, so this is a preferred
3125 			 * lgroup for the thread.
3126 			 */
3127 			if (lpl_pick(lpl, bestlpl)) {
3128 				bestload = lpl->lpl_loadavg;
3129 				bestlpl = lpl;
3130 			}
3131 		} else {
3132 			/*
3133 			 * The process doesn't have any threads on this lgrp,
3134 			 * but we're willing to consider this lgrp if the load
3135 			 * difference is big enough to justify splitting up
3136 			 * the process' threads.
3137 			 */
3138 			if (lpl_pick(lpl, bestrlpl)) {
3139 				bestrload = lpl->lpl_loadavg;
3140 				bestrlpl = lpl;
3141 			}
3142 		}
3143 		if (++lgrpid > lgrp_alloc_max)
3144 			lgrpid = 0;	/* wrap the search */
3145 	} while (lgrpid != lgrpid_start);
3146 
3147 	/*
3148 	 * Return root lgroup if threshold isn't set to maximum value and
3149 	 * lowest lgroup load average more than a certain threshold
3150 	 */
3151 	if (lgrp_load_thresh != UINT32_MAX &&
3152 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3153 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3154 
3155 	/*
3156 	 * If all the lgroups over which the thread's process is spread are
3157 	 * heavily loaded, we'll consider placing the thread on one of the
3158 	 * other leaf lgroups in the thread's partition.
3159 	 */
3160 	if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3161 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3162 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3163 	    bestload)) {
3164 		bestlpl = bestrlpl;
3165 	}
3166 
3167 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3168 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3169 
3170 	ASSERT(bestlpl->lpl_ncpu > 0);
3171 	return (bestlpl);
3172 }
3173 
3174 /*
3175  * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing.
3176  */
3177 static int
3178 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3179 {
3180 	lgrp_load_t	l1, l2;
3181 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3182 
3183 
3184 	if (lpl2 == NULL)
3185 		return (1);
3186 
3187 	l1 = lpl1->lpl_loadavg;
3188 	l2 = lpl2->lpl_loadavg;
3189 
3190 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3191 		/* lpl1 is significantly less loaded than lpl2 */
3192 		return (1);
3193 	}
3194 
3195 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3196 	    l1 + tolerance >= l2 && l1 < l2 &&
3197 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3198 		/*
3199 		 * lpl1's load is within the tolerance of lpl2. We're
3200 		 * willing to consider it be to better however if
3201 		 * it has been longer since we last homed a thread there
3202 		 */
3203 		return (1);
3204 	}
3205 
3206 	return (0);
3207 }
3208 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() adds this to the thread's t_anttime and compares the
 * sum with gethrtime(); the value is therefore in nanoseconds.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3217 
3218 /*
3219  * Routine to change a thread's lgroup affiliation.  This routine updates
3220  * the thread's kthread_t struct and its process' proc_t struct to note the
3221  * thread's new lgroup affiliation, and its lgroup affinities.
3222  *
3223  * Note that this is the only routine that modifies a thread's t_lpl field,
3224  * and that adds in or removes anticipatory load.
3225  *
3226  * If the thread is exiting, newlpl is NULL.
3227  *
3228  * Locking:
3229  * The following lock must be held on entry:
3230  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3231  *		doesn't get removed from t's partition
3232  *
3233  * This routine is not allowed to grab any locks, since it may be called
3234  * with cpus paused (such as from cpu_offline).
3235  */
3236 void
3237 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3238 {
3239 	proc_t		*p;
3240 	lpl_t		*lpl, *oldlpl;
3241 	lgrp_id_t	oldid;
3242 	kthread_t	*tp;
3243 	uint_t		ncpu;
3244 	lgrp_load_t	old, new;
3245 
3246 	ASSERT(t);
3247 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3248 	    THREAD_LOCK_HELD(t));
3249 
3250 	/*
3251 	 * If not changing lpls, just return
3252 	 */
3253 	if ((oldlpl = t->t_lpl) == newlpl)
3254 		return;
3255 
3256 	/*
3257 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3258 	 * associated with process 0 rather than with its original process).
3259 	 */
3260 	if (t->t_proc_flag & TP_LWPEXIT) {
3261 		if (newlpl != NULL) {
3262 			t->t_lpl = newlpl;
3263 		}
3264 		return;
3265 	}
3266 
3267 	p = ttoproc(t);
3268 
3269 	/*
3270 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3271 	 * to account for it being moved from its old lgroup.
3272 	 */
3273 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3274 	    (p->p_tlist != NULL)) {
3275 		oldid = oldlpl->lpl_lgrpid;
3276 
3277 		if (newlpl != NULL)
3278 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3279 
3280 		if ((do_lgrpset_delete) &&
3281 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3282 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3283 				/*
3284 				 * Check if a thread other than the thread
3285 				 * that's moving is assigned to the same
3286 				 * lgroup as the thread that's moving.  Note
3287 				 * that we have to compare lgroup IDs, rather
3288 				 * than simply comparing t_lpl's, since the
3289 				 * threads may belong to different partitions
3290 				 * but be assigned to the same lgroup.
3291 				 */
3292 				ASSERT(tp->t_lpl != NULL);
3293 
3294 				if ((tp != t) &&
3295 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3296 					/*
3297 					 * Another thread is assigned to the
3298 					 * same lgroup as the thread that's
3299 					 * moving, p_lgrpset doesn't change.
3300 					 */
3301 					break;
3302 				} else if (tp == p->p_tlist) {
3303 					/*
3304 					 * No other thread is assigned to the
3305 					 * same lgroup as the exiting thread,
3306 					 * clear the lgroup's bit in p_lgrpset.
3307 					 */
3308 					klgrpset_del(p->p_lgrpset, oldid);
3309 					break;
3310 				}
3311 			}
3312 		}
3313 
3314 		/*
3315 		 * If this thread was assigned to its old lgroup for such a
3316 		 * short amount of time that the anticipatory load that was
3317 		 * added on its behalf has aged very little, remove that
3318 		 * anticipatory load.
3319 		 */
3320 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3321 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3322 			lpl = oldlpl;
3323 			for (;;) {
3324 				do {
3325 					old = new = lpl->lpl_loadavg;
3326 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3327 					if (new > old) {
3328 						/*
3329 						 * this can happen if the load
3330 						 * average was aged since we
3331 						 * added in the anticipatory
3332 						 * load
3333 						 */
3334 						new = 0;
3335 					}
3336 				} while (cas32(
3337 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3338 					    new) != old);
3339 
3340 				lpl = lpl->lpl_parent;
3341 				if (lpl == NULL)
3342 					break;
3343 
3344 				ncpu = lpl->lpl_ncpu;
3345 				ASSERT(ncpu > 0);
3346 			}
3347 		}
3348 	}
3349 	/*
3350 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3351 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3352 	 * to its new lgroup to account for its move to its new lgroup.
3353 	 */
3354 	if (newlpl != NULL) {
3355 		/*
3356 		 * This thread is moving to a new lgroup
3357 		 */
3358 		t->t_lpl = newlpl;
3359 
3360 		/*
3361 		 * Reflect move in load average of new lgroup
3362 		 * unless it is root lgroup
3363 		 */
3364 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3365 			return;
3366 
3367 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3368 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3369 		}
3370 
3371 		/*
3372 		 * It'll take some time for the load on the new lgroup
3373 		 * to reflect this thread's placement on it.  We'd
3374 		 * like not, however, to have all threads between now
3375 		 * and then also piling on to this lgroup.  To avoid
3376 		 * this pileup, we anticipate the load this thread
3377 		 * will generate on its new lgroup.  The goal is to
3378 		 * make the lgroup's load appear as though the thread
3379 		 * had been there all along.  We're very conservative
3380 		 * in calculating this anticipatory load, we assume
3381 		 * the worst case case (100% CPU-bound thread).  This
3382 		 * may be modified in the future to be more accurate.
3383 		 */
3384 		lpl = newlpl;
3385 		for (;;) {
3386 			ncpu = lpl->lpl_ncpu;
3387 			ASSERT(ncpu > 0);
3388 			do {
3389 				old = new = lpl->lpl_loadavg;
3390 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3391 				/*
3392 				 * Check for overflow
3393 				 * Underflow not possible here
3394 				 */
3395 				if (new < old)
3396 					new = UINT32_MAX;
3397 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3398 			    new) != old);
3399 
3400 			lpl = lpl->lpl_parent;
3401 			if (lpl == NULL)
3402 				break;
3403 		}
3404 		t->t_anttime = gethrtime();
3405 	}
3406 }
3407 
3408 /*
3409  * Return lgroup memory allocation policy given advice from madvise(3C)
3410  */
3411 lgrp_mem_policy_t
3412 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3413 {
3414 	switch (advice) {
3415 	case MADV_ACCESS_LWP:
3416 		return (LGRP_MEM_POLICY_NEXT);
3417 	case MADV_ACCESS_MANY:
3418 		return (LGRP_MEM_POLICY_RANDOM);
3419 	default:
3420 		return (lgrp_mem_policy_default(size, type));
3421 	}
3422 }
3423 
3424 /*
3425  * Figure out default policy
3426  */
3427 lgrp_mem_policy_t
3428 lgrp_mem_policy_default(size_t size, int type)
3429 {
3430 	cpupart_t		*cp;
3431 	lgrp_mem_policy_t	policy;
3432 	size_t			pset_mem_size;
3433 
3434 	/*
3435 	 * Randomly allocate memory across lgroups for shared memory
3436 	 * beyond a certain threshold
3437 	 */
3438 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3439 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3440 		/*
3441 		 * Get total memory size of current thread's pset
3442 		 */
3443 		kpreempt_disable();
3444 		cp = curthread->t_cpupart;
3445 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3446 		kpreempt_enable();
3447 
3448 		/*
3449 		 * Choose policy to randomly allocate memory across
3450 		 * lgroups in pset if it will fit and is not default
3451 		 * partition.  Otherwise, allocate memory randomly
3452 		 * across machine.
3453 		 */
3454 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3455 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3456 		else
3457 			policy = LGRP_MEM_POLICY_RANDOM;
3458 	} else
3459 		/*
3460 		 * Apply default policy for private memory and
3461 		 * shared memory under the respective random
3462 		 * threshold.
3463 		 */
3464 		policy = lgrp_mem_default_policy;
3465 
3466 	return (policy);
3467 }
3468 
3469 /*
3470  * Get memory allocation policy for this segment
3471  */
3472 lgrp_mem_policy_info_t *
3473 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3474 {
3475 	lgrp_mem_policy_info_t	*policy_info;
3476 	extern struct seg_ops	segspt_ops;
3477 	extern struct seg_ops	segspt_shmops;
3478 
3479 	/*
3480 	 * This is for binary compatibility to protect against third party
3481 	 * segment drivers which haven't recompiled to allow for
3482 	 * SEGOP_GETPOLICY()
3483 	 */
3484 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3485 	    seg->s_ops != &segspt_shmops)
3486 		return (NULL);
3487 
3488 	policy_info = NULL;
3489 	if (seg->s_ops->getpolicy != NULL)
3490 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3491 
3492 	return (policy_info);
3493 }
3494 
3495 /*
3496  * Set policy for allocating private memory given desired policy, policy info,
3497  * size in bytes of memory that policy is being applied.
3498  * Return 0 if policy wasn't set already and 1 if policy was set already
3499  */
3500 int
3501 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3502     lgrp_mem_policy_info_t *policy_info, size_t size)
3503 {
3504 
3505 	ASSERT(policy_info != NULL);
3506 
3507 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3508 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3509 
3510 	/*
3511 	 * Policy set already?
3512 	 */
3513 	if (policy == policy_info->mem_policy)
3514 		return (1);
3515 
3516 	/*
3517 	 * Set policy
3518 	 */
3519 	policy_info->mem_policy = policy;
3520 	policy_info->mem_reserved = 0;
3521 
3522 	return (0);
3523 }
3524 
3525 
3526 /*
3527  * Get shared memory allocation policy with given tree and offset
3528  */
3529 lgrp_mem_policy_info_t *
3530 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3531     u_offset_t vn_off)
3532 {
3533 	u_offset_t		off;
3534 	lgrp_mem_policy_info_t	*policy_info;
3535 	lgrp_shm_policy_seg_t	*policy_seg;
3536 	lgrp_shm_locality_t	*shm_locality;
3537 	avl_tree_t		*tree;
3538 	avl_index_t		where;
3539 
3540 	/*
3541 	 * Get policy segment tree from anon_map or vnode and use specified
3542 	 * anon index or vnode offset as offset
3543 	 *
3544 	 * Assume that no lock needs to be held on anon_map or vnode, since
3545 	 * they should be protected by their reference count which must be
3546 	 * nonzero for an existing segment
3547 	 */
3548 	if (amp) {
3549 		ASSERT(amp->refcnt != 0);
3550 		shm_locality = amp->locality;
3551 		if (shm_locality == NULL)
3552 			return (NULL);
3553 		tree = shm_locality->loc_tree;
3554 		off = ptob(anon_index);
3555 	} else if (vp) {
3556 		shm_locality = vp->v_locality;
3557 		if (shm_locality == NULL)
3558 			return (NULL);
3559 		ASSERT(shm_locality->loc_count != 0);
3560 		tree = shm_locality->loc_tree;
3561 		off = vn_off;
3562 	}
3563 
3564 	if (tree == NULL)
3565 		return (NULL);
3566 
3567 	/*
3568 	 * Lookup policy segment for offset into shared object and return
3569 	 * policy info
3570 	 */
3571 	rw_enter(&shm_locality->loc_lock, RW_READER);
3572 	policy_info = NULL;
3573 	policy_seg = avl_find(tree, &off, &where);
3574 	if (policy_seg)
3575 		policy_info = &policy_seg->shm_policy;
3576 	rw_exit(&shm_locality->loc_lock);
3577 
3578 	return (policy_info);
3579 }
3580 
3581 /*
3582  * Return lgroup to use for allocating memory
3583  * given the segment and address
3584  *
3585  * There isn't any mutual exclusion that exists between calls
3586  * to this routine and DR, so this routine and whomever calls it
3587  * should be mindful of the possibility that the lgrp returned
3588  * may be deleted. If this happens, dereferences of the lgrp
3589  * pointer will still be safe, but the resources in the lgrp will
3590  * be gone, and LGRP_EXISTS() will no longer be true.
3591  */
3592 lgrp_t *
3593 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3594 {
3595 	int			i;
3596 	lgrp_t			*lgrp;
3597 	klgrpset_t		lgrpset;
3598 	int			lgrps_spanned;
3599 	unsigned long		off;
3600 	lgrp_mem_policy_t	policy;
3601 	lgrp_mem_policy_info_t	*policy_info;
3602 	ushort_t		random;
3603 	int			stat = 0;
3604 
3605 	/*
3606 	 * Just return null if the lgrp framework hasn't finished
3607 	 * initializing or if this is a UMA machine.
3608 	 */
3609 	if (nlgrps == 1 || !lgrp_initialized)
3610 		return (lgrp_root);
3611 
3612 	/*
3613 	 * Get memory allocation policy for this segment
3614 	 */
3615 	policy = lgrp_mem_default_policy;
3616 	if (seg != NULL) {
3617 		if (seg->s_as == &kas) {
3618 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3619 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3620 				policy = LGRP_MEM_POLICY_RANDOM;
3621 		} else {
3622 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3623 			if (policy_info != NULL)
3624 				policy = policy_info->mem_policy;
3625 		}
3626 	}
3627 	lgrpset = 0;
3628 
3629 	/*
3630 	 * Initialize lgroup to home by default
3631 	 */
3632 	lgrp = lgrp_home_lgrp();
3633 
3634 	/*
3635 	 * When homing threads on root lgrp, override default memory
3636 	 * allocation policies with root lgroup memory allocation policy
3637 	 */
3638 	if (lgrp == lgrp_root)
3639 		policy = lgrp_mem_policy_root;
3640 
3641 	/*
3642 	 * Implement policy
3643 	 */
3644 	switch (policy) {
3645 	case LGRP_MEM_POLICY_NEXT_CPU:
3646 
3647 		/*
3648 		 * Return lgroup of current CPU which faulted on memory
3649 		 * If the CPU isn't currently in an lgrp, then opt to
3650 		 * allocate from the root.
3651 		 *
3652 		 * Kernel preemption needs to be disabled here to prevent
3653 		 * the current CPU from going away before lgrp is found.
3654 		 */
3655 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3656 			lgrp = lgrp_root;
3657 		} else {
3658 			kpreempt_disable();
3659 			lgrp = lgrp_cpu_to_lgrp(CPU);
3660 			kpreempt_enable();
3661 		}
3662 		break;
3663 
3664 	case LGRP_MEM_POLICY_NEXT:
3665 	case LGRP_MEM_POLICY_DEFAULT:
3666 	default:
3667 
3668 		/*
3669 		 * Just return current thread's home lgroup
3670 		 * for default policy (next touch)
3671 		 * If the thread is homed to the root,
3672 		 * then the default policy is random across lgroups.
3673 		 * Fallthrough to the random case.
3674 		 */
3675 		if (lgrp != lgrp_root) {
3676 			if (policy == LGRP_MEM_POLICY_NEXT)
3677 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3678 			else
3679 				lgrp_stat_add(lgrp->lgrp_id,
3680 				    LGRP_NUM_DEFAULT, 1);
3681 			break;
3682 		}
3683 		/* LINTED fallthrough on case statement */
3684 	case LGRP_MEM_POLICY_RANDOM:
3685 
3686 		/*
3687 		 * Return a random leaf lgroup with memory
3688 		 */
3689 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3690 		/*
3691 		 * Count how many lgroups are spanned
3692 		 */
3693 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3694 
3695 		/*
3696 		 * There may be no memnodes in the root lgroup during DR copy
3697 		 * rename on a system with only two boards (memnodes)
3698 		 * configured. In this case just return the root lgrp.
3699 		 */
3700 		if (lgrps_spanned == 0) {
3701 			lgrp = lgrp_root;
3702 			break;
3703 		}
3704 
3705 		/*
3706 		 * Pick a random offset within lgroups spanned
3707 		 * and return lgroup at that offset
3708 		 */
3709 		random = (ushort_t)gethrtime() >> 4;
3710 		off = random % lgrps_spanned;
3711 		ASSERT(off <= lgrp_alloc_max);
3712 
3713 		for (i = 0; i <= lgrp_alloc_max; i++) {
3714 			if (!klgrpset_ismember(lgrpset, i))
3715 				continue;
3716 			if (off)
3717 				off--;
3718 			else {
3719 				lgrp = lgrp_table[i];
3720 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3721 				    1);
3722 				break;
3723 			}
3724 		}
3725 		break;
3726 
3727 	case LGRP_MEM_POLICY_RANDOM_PROC:
3728 
3729 		/*
3730 		 * Grab copy of bitmask of lgroups spanned by
3731 		 * this process
3732 		 */
3733 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3734 		stat = LGRP_NUM_RANDOM_PROC;
3735 
3736 		/* LINTED fallthrough on case statement */
3737 	case LGRP_MEM_POLICY_RANDOM_PSET:
3738 
3739 		if (!stat)
3740 			stat = LGRP_NUM_RANDOM_PSET;
3741 
3742 		if (klgrpset_isempty(lgrpset)) {
3743 			/*
3744 			 * Grab copy of bitmask of lgroups spanned by
3745 			 * this processor set
3746 			 */
3747 			kpreempt_disable();
3748 			klgrpset_copy(lgrpset,
3749 			    curthread->t_cpupart->cp_lgrpset);
3750 			kpreempt_enable();
3751 		}
3752 
3753 		/*
3754 		 * Count how many lgroups are spanned
3755 		 */
3756 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3757 		ASSERT(lgrps_spanned <= nlgrps);
3758 
3759 		/*
3760 		 * Probably lgrps_spanned should be always non-zero, but to be
3761 		 * on the safe side we return lgrp_root if it is empty.
3762 		 */
3763 		if (lgrps_spanned == 0) {
3764 			lgrp = lgrp_root;
3765 			break;
3766 		}
3767 
3768 		/*
3769 		 * Pick a random offset within lgroups spanned
3770 		 * and return lgroup at that offset
3771 		 */
3772 		random = (ushort_t)gethrtime() >> 4;
3773 		off = random % lgrps_spanned;
3774 		ASSERT(off <= lgrp_alloc_max);
3775 
3776 		for (i = 0; i <= lgrp_alloc_max; i++) {
3777 			if (!klgrpset_ismember(lgrpset, i))
3778 				continue;
3779 			if (off)
3780 				off--;
3781 			else {
3782 				lgrp = lgrp_table[i];
3783 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3784 				    1);
3785 				break;
3786 			}
3787 		}
3788 		break;
3789 
3790 	case LGRP_MEM_POLICY_ROUNDROBIN:
3791 
3792 		/*
3793 		 * Use offset within segment to determine
3794 		 * offset from home lgroup to choose for
3795 		 * next lgroup to allocate memory from
3796 		 */
3797 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3798 		    (lgrp_alloc_max + 1);
3799 
3800 		kpreempt_disable();
3801 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3802 		i = lgrp->lgrp_id;
3803 		kpreempt_enable();
3804 
3805 		while (off > 0) {
3806 			i = (i + 1) % (lgrp_alloc_max + 1);
3807 			lgrp = lgrp_table[i];
3808 			if (klgrpset_ismember(lgrpset, i))
3809 				off--;
3810 		}
3811 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3812 
3813 		break;
3814 	}
3815 
3816 	ASSERT(lgrp != NULL);
3817 	return (lgrp);
3818 }
3819 
3820 /*
3821  * Return the number of pages in an lgroup
3822  *
3823  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3824  *	 could cause tests that rely on the numat driver to fail....
3825  */
3826 pgcnt_t
3827 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3828 {
3829 	lgrp_t *lgrp;
3830 
3831 	lgrp = lgrp_table[lgrpid];
3832 	if (!LGRP_EXISTS(lgrp) ||
3833 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3834 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3835 		return (0);
3836 
3837 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3838 }
3839 
3840 /*
3841  * Initialize lgroup shared memory allocation policy support
3842  */
3843 void
3844 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3845 {
3846 	lgrp_shm_locality_t	*shm_locality;
3847 
3848 	/*
3849 	 * Initialize locality field in anon_map
3850 	 * Don't need any locks because this is called when anon_map is
3851 	 * allocated, but not used anywhere yet.
3852 	 */
3853 	if (amp) {
3854 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3855 		if (amp->locality == NULL) {
3856 			/*
3857 			 * Allocate and initialize shared memory locality info
3858 			 * and set anon_map locality pointer to it
3859 			 * Drop lock across kmem_alloc(KM_SLEEP)
3860 			 */
3861 			ANON_LOCK_EXIT(&amp->a_rwlock);
3862 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3863 			    KM_SLEEP);
3864 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3865 			    NULL);
3866 			shm_locality->loc_count = 1;	/* not used for amp */
3867 			shm_locality->loc_tree = NULL;
3868 
3869 			/*
3870 			 * Reacquire lock and check to see whether anyone beat
3871 			 * us to initializing the locality info
3872 			 */
3873 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3874 			if (amp->locality != NULL) {
3875 				rw_destroy(&shm_locality->loc_lock);
3876 				kmem_free(shm_locality,
3877 				    sizeof (*shm_locality));
3878 			} else
3879 				amp->locality = shm_locality;
3880 		}
3881 		ANON_LOCK_EXIT(&amp->a_rwlock);
3882 		return;
3883 	}
3884 
3885 	/*
3886 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3887 	 */
3888 	mutex_enter(&vp->v_lock);
3889 	if ((vp->v_flag & V_LOCALITY) == 0) {
3890 		/*
3891 		 * Allocate and initialize shared memory locality info
3892 		 */
3893 		mutex_exit(&vp->v_lock);
3894 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3895 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3896 		shm_locality->loc_count = 1;
3897 		shm_locality->loc_tree = NULL;
3898 
3899 		/*
3900 		 * Point vnode locality field at shared vnode policy info
3901 		 * and set locality aware flag in vnode
3902 		 */
3903 		mutex_enter(&vp->v_lock);
3904 		if ((vp->v_flag & V_LOCALITY) == 0) {
3905 			vp->v_locality = shm_locality;
3906 			vp->v_flag |= V_LOCALITY;
3907 		} else {
3908 			/*
3909 			 * Lost race so free locality info and increment count.
3910 			 */
3911 			rw_destroy(&shm_locality->loc_lock);
3912 			kmem_free(shm_locality, sizeof (*shm_locality));
3913 			shm_locality = vp->v_locality;
3914 			shm_locality->loc_count++;
3915 		}
3916 		mutex_exit(&vp->v_lock);
3917 
3918 		return;
3919 	}
3920 
3921 	/*
3922 	 * Increment reference count of number of segments mapping this vnode
3923 	 * shared
3924 	 */
3925 	shm_locality = vp->v_locality;
3926 	shm_locality->loc_count++;
3927 	mutex_exit(&vp->v_lock);
3928 }
3929 
3930 /*
3931  * Destroy the given shared memory policy segment tree
3932  */
3933 void
3934 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3935 {
3936 	lgrp_shm_policy_seg_t	*cur;
3937 	lgrp_shm_policy_seg_t	*next;
3938 
3939 	if (tree == NULL)
3940 		return;
3941 
3942 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3943 	while (cur != NULL) {
3944 		next = AVL_NEXT(tree, cur);
3945 		avl_remove(tree, cur);
3946 		kmem_free(cur, sizeof (*cur));
3947 		cur = next;
3948 	}
3949 	kmem_free(tree, sizeof (avl_tree_t));
3950 }
3951 
3952 /*
3953  * Uninitialize lgroup shared memory allocation policy support
3954  */
3955 void
3956 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3957 {
3958 	lgrp_shm_locality_t	*shm_locality;
3959 
3960 	/*
3961 	 * For anon_map, deallocate shared memory policy tree and
3962 	 * zero locality field
3963 	 * Don't need any locks because anon_map is being freed
3964 	 */
3965 	if (amp) {
3966 		if (amp->locality == NULL)
3967 			return;
3968 		shm_locality = amp->locality;
3969 		shm_locality->loc_count = 0;	/* not really used for amp */
3970 		rw_destroy(&shm_locality->loc_lock);
3971 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3972 		kmem_free(shm_locality, sizeof (*shm_locality));
3973 		amp->locality = 0;
3974 		return;
3975 	}
3976 
3977 	/*
3978 	 * For vnode, decrement reference count of segments mapping this vnode
3979 	 * shared and delete locality info if reference count drops to 0
3980 	 */
3981 	mutex_enter(&vp->v_lock);
3982 	shm_locality = vp->v_locality;
3983 	shm_locality->loc_count--;
3984 
3985 	if (shm_locality->loc_count == 0) {
3986 		rw_destroy(&shm_locality->loc_lock);
3987 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3988 		kmem_free(shm_locality, sizeof (*shm_locality));
3989 		vp->v_locality = 0;
3990 		vp->v_flag &= ~V_LOCALITY;
3991 	}
3992 	mutex_exit(&vp->v_lock);
3993 }
3994 
3995 /*
3996  * Compare two shared memory policy segments
3997  * Used by AVL tree code for searching
3998  */
3999 int
4000 lgrp_shm_policy_compar(const void *x, const void *y)
4001 {
4002 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4003 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4004 
4005 	if (a->shm_off < b->shm_off)
4006 		return (-1);
4007 	if (a->shm_off >= b->shm_off + b->shm_size)
4008 		return (1);
4009 	return (0);
4010 }
4011 
4012 /*
4013  * Concatenate seg1 with seg2 and remove seg2
4014  */
4015 static int
4016 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4017     lgrp_shm_policy_seg_t *seg2)
4018 {
4019 	if (!seg1 || !seg2 ||
4020 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4021 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4022 		return (-1);
4023 
4024 	seg1->shm_size += seg2->shm_size;
4025 	avl_remove(tree, seg2);
4026 	kmem_free(seg2, sizeof (*seg2));
4027 	return (0);
4028 }
4029 
4030 /*
4031  * Split segment at given offset and return rightmost (uppermost) segment
4032  * Assumes that there are no overlapping segments
4033  */
4034 static lgrp_shm_policy_seg_t *
4035 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4036     u_offset_t off)
4037 {
4038 	lgrp_shm_policy_seg_t	*newseg;
4039 	avl_index_t		where;
4040 
4041 	ASSERT(seg != NULL);
4042 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4043 
4044 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4045 	    seg->shm_size)
4046 		return (NULL);
4047 
4048 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4049 		return (seg);
4050 
4051 	/*
4052 	 * Adjust size of left segment and allocate new (right) segment
4053 	 */
4054 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4055 	newseg->shm_policy = seg->shm_policy;
4056 	newseg->shm_off = off;
4057 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4058 	seg->shm_size = off - seg->shm_off;
4059 
4060 	/*
4061 	 * Find where to insert new segment in AVL tree and insert it
4062 	 */
4063 	(void) avl_find(tree, &off, &where);
4064 	avl_insert(tree, newseg, where);
4065 
4066 	return (newseg);
4067 }
4068 
4069 /*
4070  * Set shared memory allocation policy on specified shared object at given
4071  * offset and length
4072  *
4073  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4074  * -1 if can't set policy.
4075  */
4076 int
4077 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4078     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4079 {
4080 	u_offset_t		eoff;
4081 	lgrp_shm_policy_seg_t	*next;
4082 	lgrp_shm_policy_seg_t	*newseg;
4083 	u_offset_t		off;
4084 	u_offset_t		oldeoff;
4085 	lgrp_shm_policy_seg_t	*prev;
4086 	int			retval;
4087 	lgrp_shm_policy_seg_t	*seg;
4088 	lgrp_shm_locality_t	*shm_locality;
4089 	avl_tree_t		*tree;
4090 	avl_index_t		where;
4091 
4092 	ASSERT(amp || vp);
4093 	ASSERT((len & PAGEOFFSET) == 0);
4094 
4095 	if (len == 0)
4096 		return (-1);
4097 
4098 	retval = 0;
4099 
4100 	/*
4101 	 * Get locality info and starting offset into shared object
4102 	 * Try anon map first and then vnode
4103 	 * Assume that no locks need to be held on anon_map or vnode, since
4104 	 * it should be protected by its reference count which must be nonzero
4105 	 * for an existing segment.
4106 	 */
4107 	if (amp) {
4108 		/*
4109 		 * Get policy info from anon_map
4110 		 *
4111 		 */
4112 		ASSERT(amp->refcnt != 0);
4113 		if (amp->locality == NULL)
4114 			lgrp_shm_policy_init(amp, NULL);
4115 		shm_locality = amp->locality;
4116 		off = ptob(anon_index);
4117 	} else if (vp) {
4118 		/*
4119 		 * Get policy info from vnode
4120 		 */
4121 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4122 			lgrp_shm_policy_init(NULL, vp);
4123 		shm_locality = vp->v_locality;
4124 		ASSERT(shm_locality->loc_count != 0);
4125 		off = vn_off;
4126 	} else
4127 		return (-1);
4128 
4129 	ASSERT((off & PAGEOFFSET) == 0);
4130 
4131 	/*
4132 	 * Figure out default policy
4133 	 */
4134 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4135 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4136 
4137 	/*
4138 	 * Create AVL tree if there isn't one yet
4139 	 * and set locality field to point at it
4140 	 */
4141 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4142 	tree = shm_locality->loc_tree;
4143 	if (!tree) {
4144 		rw_exit(&shm_locality->loc_lock);
4145 
4146 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4147 
4148 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4149 		if (shm_locality->loc_tree == NULL) {
4150 			avl_create(tree, lgrp_shm_policy_compar,
4151 			    sizeof (lgrp_shm_policy_seg_t),
4152 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4153 			shm_locality->loc_tree = tree;
4154 		} else {
4155 			/*
4156 			 * Another thread managed to set up the tree
4157 			 * before we could. Free the tree we allocated
4158 			 * and use the one that's already there.
4159 			 */
4160 			kmem_free(tree, sizeof (*tree));
4161 			tree = shm_locality->loc_tree;
4162 		}
4163 	}
4164 
4165 	/*
4166 	 * Set policy
4167 	 *
4168 	 * Need to maintain hold on writer's lock to keep tree from
4169 	 * changing out from under us
4170 	 */
4171 	while (len != 0) {
4172 		/*
4173 		 * Find policy segment for specified offset into shared object
4174 		 */
4175 		seg = avl_find(tree, &off, &where);
4176 
4177 		/*
4178 		 * Didn't find any existing segment that contains specified
4179 		 * offset, so allocate new segment, insert it, and concatenate
4180 		 * with adjacent segments if possible
4181 		 */
4182 		if (seg == NULL) {
4183 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4184 			    KM_SLEEP);
4185 			newseg->shm_policy.mem_policy = policy;
4186 			newseg->shm_policy.mem_reserved = 0;
4187 			newseg->shm_off = off;
4188 			avl_insert(tree, newseg, where);
4189 
4190 			/*
4191 			 * Check to see whether new segment overlaps with next
4192 			 * one, set length of new segment accordingly, and
4193 			 * calculate remaining length and next offset
4194 			 */
4195 			seg = AVL_NEXT(tree, newseg);
4196 			if (seg == NULL || off + len <= seg->shm_off) {
4197 				newseg->shm_size = len;
4198 				len = 0;
4199 			} else {
4200 				newseg->shm_size = seg->shm_off - off;
4201 				off = seg->shm_off;
4202 				len -= newseg->shm_size;
4203 			}
4204 
4205 			/*
4206 			 * Try to concatenate new segment with next and
4207 			 * previous ones, since they might have the same policy
4208 			 * now.  Grab previous and next segments first because
4209 			 * they will change on concatenation.
4210 			 */
4211 			prev =  AVL_PREV(tree, newseg);
4212 			next = AVL_NEXT(tree, newseg);
4213 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4214 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4215 
4216 			continue;
4217 		}
4218 
4219 		eoff = off + len;
4220 		oldeoff = seg->shm_off + seg->shm_size;
4221 
4222 		/*
4223 		 * Policy set already?
4224 		 */
4225 		if (policy == seg->shm_policy.mem_policy) {
4226 			/*
4227 			 * Nothing left to do if offset and length
4228 			 * fall within this segment
4229 			 */
4230 			if (eoff <= oldeoff) {
4231 				retval = 1;
4232 				break;
4233 			} else {
4234 				len = eoff - oldeoff;
4235 				off = oldeoff;
4236 				continue;
4237 			}
4238 		}
4239 
4240 		/*
4241 		 * Specified offset and length match existing segment exactly
4242 		 */
4243 		if (off == seg->shm_off && len == seg->shm_size) {
4244 			/*
4245 			 * Set policy and update current length
4246 			 */
4247 			seg->shm_policy.mem_policy = policy;
4248 			seg->shm_policy.mem_reserved = 0;
4249 			len = 0;
4250 
4251 			/*
4252 			 * Try concatenating new segment with previous and next
4253 			 * segments, since they might have the same policy now.
4254 			 * Grab previous and next segments first because they
4255 			 * will change on concatenation.
4256 			 */
4257 			prev =  AVL_PREV(tree, seg);
4258 			next = AVL_NEXT(tree, seg);
4259 			(void) lgrp_shm_policy_concat(tree, seg, next);
4260 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4261 		} else {
4262 			/*
4263 			 * Specified offset and length only apply to part of
4264 			 * existing segment
4265 			 */
4266 
4267 			/*
4268 			 * New segment starts in middle of old one, so split
4269 			 * new one off near beginning of old one
4270 			 */
4271 			newseg = NULL;
4272 			if (off > seg->shm_off) {
4273 				newseg = lgrp_shm_policy_split(tree, seg, off);
4274 
4275 				/*
4276 				 * New segment ends where old one did, so try
4277 				 * to concatenate with next segment
4278 				 */
4279 				if (eoff == oldeoff) {
4280 					newseg->shm_policy.mem_policy = policy;
4281 					newseg->shm_policy.mem_reserved = 0;
4282 					(void) lgrp_shm_policy_concat(tree,
4283 					    newseg, AVL_NEXT(tree, newseg));
4284 					break;
4285 				}
4286 			}
4287 
4288 			/*
4289 			 * New segment ends before old one, so split off end of
4290 			 * old one
4291 			 */
4292 			if (eoff < oldeoff) {
4293 				if (newseg) {
4294 					(void) lgrp_shm_policy_split(tree,
4295 					    newseg, eoff);
4296 					newseg->shm_policy.mem_policy = policy;
4297 					newseg->shm_policy.mem_reserved = 0;
4298 				} else {
4299 					(void) lgrp_shm_policy_split(tree, seg,
4300 					    eoff);
4301 					seg->shm_policy.mem_policy = policy;
4302 					seg->shm_policy.mem_reserved = 0;
4303 				}
4304 
4305 				if (off == seg->shm_off)
4306 					(void) lgrp_shm_policy_concat(tree,
4307 					    AVL_PREV(tree, seg), seg);
4308 				break;
4309 			}
4310 
4311 			/*
4312 			 * Calculate remaining length and next offset
4313 			 */
4314 			len = eoff - oldeoff;
4315 			off = oldeoff;
4316 		}
4317 	}
4318 
4319 	rw_exit(&shm_locality->loc_lock);
4320 	return (retval);
4321 }
4322 
4323 /*
4324  * Return the best memnode from which to allocate memory given
4325  * an lgroup.
4326  *
4327  * "c" is for cookie, which is good enough for me.
4328  * It references a cookie struct that should be zero'ed to initialize.
4329  * The cookie should live on the caller's stack.
4330  *
4331  * The routine returns -1 when:
4332  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4333  *	- traverse is 1, and all the memnodes in the system have been
4334  *	  returned.
4335  */
4336 int
4337 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4338 {
4339 	lgrp_t		*lp = c->lmc_lgrp;
4340 	mnodeset_t	nodes = c->lmc_nodes;
4341 	int		cnt = c->lmc_cnt;
4342 	int		offset, mnode;
4343 
4344 	extern int	max_mem_nodes;
4345 
4346 	/*
4347 	 * If the set is empty, and the caller is willing, traverse
4348 	 * up the hierarchy until we find a non-empty set.
4349 	 */
4350 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4351 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4352 		    ((lp = lp->lgrp_parent) == NULL))
4353 			return (-1);
4354 
4355 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4356 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4357 	}
4358 
4359 	/*
4360 	 * Select a memnode by picking one at a "random" offset.
4361 	 * Because of DR, memnodes can come and go at any time.
4362 	 * This code must be able to cope with the possibility
4363 	 * that the nodes count "cnt" is inconsistent with respect
4364 	 * to the number of elements actually in "nodes", and
4365 	 * therefore that the offset chosen could be greater than
4366 	 * the number of elements in the set (some memnodes may
4367 	 * have dissapeared just before cnt was read).
4368 	 * If this happens, the search simply wraps back to the
4369 	 * beginning of the set.
4370 	 */
4371 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4372 	offset = c->lmc_rand % cnt;
4373 	do {
4374 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4375 			if (nodes & ((mnodeset_t)1 << mnode))
4376 				if (!offset--)
4377 					break;
4378 	} while (mnode >= max_mem_nodes);
4379 
4380 	/* Found a node. Store state before returning. */
4381 	c->lmc_lgrp = lp;
4382 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4383 	c->lmc_cnt = cnt - 1;
4384 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4385 	c->lmc_ntried++;
4386 
4387 	return (mnode);
4388 }
4389