xref: /titanic_44/usr/src/uts/common/os/lgrp.c (revision 700c902c445eb3882848aaddc19d13638818cfd6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Basic NUMA support in terms of locality groups
30  *
31  * Solaris needs to know which CPUs, memory, etc. are near each other to
32  * provide good performance on NUMA machines by optimizing for locality.
33  * In order to do this, a new abstraction called a "locality group (lgroup)"
34  * has been introduced to keep track of which CPU-like and memory-like hardware
35  * resources are close to each other.  Currently, latency is the only measure
36  * used to determine how to group hardware resources into lgroups, but this
37  * does not limit the groupings to be based solely on latency.  Other factors
38  * may be used to determine the groupings in the future.
39  *
40  * Lgroups are organized into a hierarchy or topology that represents the
41  * latency topology of the machine.  There is always at least a root lgroup in
42  * the system.  It represents all the hardware resources in the machine at a
43  * latency big enough that any hardware resource can at least access any other
44  * hardware resource within that latency.  A Uniform Memory Access (UMA)
45  * machine is represented with one lgroup (the root).  In contrast, a NUMA
46  * machine is represented at least by the root lgroup and some number of leaf
47  * lgroups where the leaf lgroups contain the hardware resources within the
48  * least latency of each other and the root lgroup still contains all the
49  * resources in the machine.  Some number of intermediate lgroups may exist
50  * which represent more levels of locality than just the local latency of the
51  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52  * (eg. root and intermediate lgroups) contain the next nearest resources to
53  * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54  * to the root lgroup shows the hardware resources from closest to farthest
55  * from the leaf lgroup such that each successive ancestor lgroup contains
56  * the next nearest resources at the next level of locality from the previous.
57  *
58  * The kernel uses the lgroup abstraction to know how to allocate resources
59  * near a given process/thread.  At fork() and lwp/thread_create() time, a
60  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61  * with the lowest load average.  Binding to a processor or processor set will
62  * change the home lgroup for a thread.  The scheduler has been modified to try
63  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64  * allocation is lgroup aware too, so memory will be allocated from the current
65  * thread's home lgroup if possible.  If the desired resources are not
66  * available, the kernel traverses the lgroup hierarchy going to the parent
67  * lgroup to find resources at the next level of locality until it reaches the
68  * root lgroup.
69  */
70 
71 #include <sys/lgrp.h>
72 #include <sys/lgrp_user.h>
73 #include <sys/types.h>
74 #include <sys/mman.h>
75 #include <sys/param.h>
76 #include <sys/var.h>
77 #include <sys/thread.h>
78 #include <sys/cpuvar.h>
79 #include <sys/cpupart.h>
80 #include <sys/kmem.h>
81 #include <vm/seg.h>
82 #include <vm/seg_kmem.h>
83 #include <vm/seg_spt.h>
84 #include <vm/seg_vn.h>
85 #include <vm/as.h>
86 #include <sys/atomic.h>
87 #include <sys/systm.h>
88 #include <sys/errno.h>
89 #include <sys/cmn_err.h>
90 #include <sys/kstat.h>
91 #include <sys/sysmacros.h>
92 #include <sys/chip.h>
93 #include <sys/promif.h>
94 #include <sys/sdt.h>
95 
96 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 				/* indexed by lgrp_id */
99 int	nlgrps;			/* number of lgroups in machine */
100 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102 
103 /*
104  * Kstat data for lgroups.
105  *
106  * Actual kstat data is collected in lgrp_stats array.
107  * The lgrp_kstat_data array of named kstats is used to extract data from
108  * lgrp_stats and present it to kstat framework. It is protected from parallel
109  * modifications by lgrp_kstat_mutex. This may cause some contention when
110  * several kstat commands run in parallel but this is not the
111  * performance-critical path.
112  */
113 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114 
115 /*
116  * Declare kstat names statically for enums as defined in the header file.
117  */
118 LGRP_KSTAT_NAMES;
119 
120 static void	lgrp_kstat_init(void);
121 static int	lgrp_kstat_extract(kstat_t *, int);
122 static void	lgrp_kstat_reset(lgrp_id_t);
123 
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126 
127 
128 /*
129  * max number of lgroups supported by the platform
130  */
131 int	nlgrpsmax = 0;
132 
133 /*
134  * The root lgroup. Represents the set of resources at the system wide
135  * level of locality.
136  */
137 lgrp_t		*lgrp_root = NULL;
138 
139 /*
140  * During system bootstrap cp_default does not contain the list of lgrp load
141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142  * on-line when cp_default is initialized by cpupart_initialize_default().
143  * Configuring CPU0 may create a two-level topology with root and one leaf node
144  * containing CPU0. This topology is initially constructed in a special
145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147  * for all lpl operations until cp_default is fully constructed.
148  *
149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151  * the first element of lpl_bootstrap_list.
152  *
153  * CPUs that are added to the system, but have not yet been assigned to an
154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155  * on some architectures (x86) it's possible for the slave CPU startup thread
156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157  */
158 #define	LPL_BOOTSTRAP_SIZE 2
159 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t		*lpl_bootstrap;
161 
162 /*
163  * If cp still references the bootstrap lpl, it has not yet been added to
164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
166  */
167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
168 
169 static lgrp_t	lroot;
170 
171 
172 /*
173  * Size, in bytes, beyond which random memory allocation policy is applied
174  * to non-shared memory.  Default is the maximum size, so random memory
175  * allocation won't be used for non-shared memory by default.
176  */
177 size_t	lgrp_privm_random_thresh = (size_t)(-1);
178 
179 /*
180  * Size, in bytes, beyond which random memory allocation policy is applied to
181  * shared memory.  Default is 8MB (2 ISM pages).
182  */
183 size_t	lgrp_shm_random_thresh = 8*1024*1024;
184 
185 /*
186  * Whether to do processor set aware memory allocation by default
187  */
188 int	lgrp_mem_pset_aware = 0;
189 
190 /*
191  * Set the default memory allocation policy for root lgroup
192  */
193 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
194 
195 /*
196  * Set the default memory allocation policy.  For most platforms,
197  * next touch is sufficient, but some platforms may wish to override
198  * this.
199  */
200 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
201 
202 
203 /*
204  * lgroup CPU event handlers
205  */
206 static void	lgrp_cpu_init(struct cpu *);
207 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
208 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
209 
210 static void	lgrp_latency_change(u_longlong_t, u_longlong_t);
211 
212 /*
213  * lgroup memory event handlers
214  */
215 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
216 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
217 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
218 
219 /*
220  * lgroup CPU partition event handlers
221  */
222 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
223 static void	lgrp_part_del_cpu(struct cpu *);
224 
225 static void	lgrp_root_init(void);
226 
227 /*
228  * lpl topology
229  */
230 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
231 static void	lpl_clear(lpl_t *);
232 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
233 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
234 static void	lpl_rset_add(lpl_t *, lpl_t *);
235 static void	lpl_rset_del(lpl_t *, lpl_t *);
236 static int	lpl_rset_contains(lpl_t *, lpl_t *);
237 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
238 static void	lpl_child_update(lpl_t *, struct cpupart *);
239 static int	lpl_pick(lpl_t *, lpl_t *);
240 static void	lpl_verify_wrapper(struct cpupart *);
241 
/*
 * Return codes of the lpl topology verifier.
 *
 * LPL_TOPO_CORRECT means the topology checked out; every other code is a
 * distinct negative value naming the inconsistency that was found.  The
 * negative values are parenthesized so that they stay a single operand
 * wherever the macro is expanded.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		(-1)
#define	LPL_TOPO_CPUS_NOT_EMPTY			(-2)
#define	LPL_TOPO_LGRP_MISMATCH			(-3)
#define	LPL_TOPO_MISSING_PARENT			(-4)
#define	LPL_TOPO_PARENT_MISMATCH		(-5)
#define	LPL_TOPO_BAD_CPUCNT			(-6)
#define	LPL_TOPO_RSET_MISMATCH			(-7)
#define	LPL_TOPO_LPL_ORPHANED			(-8)
#define	LPL_TOPO_LPL_BAD_NCPU			(-9)
#define	LPL_TOPO_RSET_MSSNG_LF			(-10)
#define	LPL_TOPO_CPU_HAS_BAD_LPL		(-11)
#define	LPL_TOPO_BOGUS_HINT			(-12)
#define	LPL_TOPO_NONLEAF_HAS_CPUS		(-13)
#define	LPL_TOPO_LGRP_NOT_LEAF			(-14)
#define	LPL_TOPO_BAD_RSETCNT			(-15)
262 
263 /*
264  * Return whether lgroup optimizations should be enabled on this system
265  */
266 int
267 lgrp_optimizations(void)
268 {
269 	/*
270 	 * System must have more than 2 lgroups to enable lgroup optimizations
271 	 *
272 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
273 	 * with one child lgroup containing all the resources. A 2 lgroup
274 	 * system with a root lgroup directly containing CPUs or memory might
275 	 * need lgroup optimizations with its child lgroup, but there
276 	 * isn't such a machine for now....
277 	 */
278 	if (nlgrps > 2)
279 		return (1);
280 
281 	return (0);
282 }
283 
284 /*
285  * Build full lgroup topology
286  */
287 static void
288 lgrp_root_init(void)
289 {
290 	lgrp_handle_t	hand;
291 	int		i;
292 	lgrp_id_t	id;
293 
294 	/*
295 	 * Create the "root" lgroup
296 	 */
297 	ASSERT(nlgrps == 0);
298 	id = nlgrps++;
299 
300 	lgrp_root = &lroot;
301 
302 	lgrp_root->lgrp_cpu = NULL;
303 	lgrp_root->lgrp_mnodes = 0;
304 	lgrp_root->lgrp_nmnodes = 0;
305 	hand = lgrp_plat_root_hand();
306 	lgrp_root->lgrp_plathand = hand;
307 
308 	lgrp_root->lgrp_id = id;
309 	lgrp_root->lgrp_cpucnt = 0;
310 	lgrp_root->lgrp_childcnt = 0;
311 	klgrpset_clear(lgrp_root->lgrp_children);
312 	klgrpset_clear(lgrp_root->lgrp_leaves);
313 	lgrp_root->lgrp_parent = NULL;
314 	lgrp_root->lgrp_chips = NULL;
315 	lgrp_root->lgrp_chipcnt = 0;
316 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
317 
318 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
319 		klgrpset_clear(lgrp_root->lgrp_set[i]);
320 
321 	lgrp_root->lgrp_kstat = NULL;
322 
323 	lgrp_table[id] = lgrp_root;
324 
325 	/*
326 	 * Setup initial lpl list for CPU0 and initial t0 home.
327 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
328 	 * all topology operations until cp_default is initialized at which
329 	 * point t0.t_lpl will be updated.
330 	 */
331 	lpl_bootstrap = lpl_bootstrap_list;
332 	t0.t_lpl = lpl_bootstrap;
333 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
334 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
335 	cp_default.cp_lgrploads = lpl_bootstrap;
336 }
337 
338 /*
339  * Initialize the lgroup framework and allow the platform to do the same
340  */
341 void
342 lgrp_init(void)
343 {
344 	/*
345 	 * Initialize the platform
346 	 */
347 	lgrp_plat_init();
348 
349 	/*
350 	 * Set max number of lgroups supported on this platform which must be
351 	 * less than the max number of lgroups supported by the common lgroup
352 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
353 	 */
354 	nlgrpsmax = lgrp_plat_max_lgrps();
355 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
356 }
357 
358 /*
359  * Create the root and cpu0's lgroup, and set t0's home.
360  */
361 void
362 lgrp_setup(void)
363 {
364 	/*
365 	 * Setup the root lgroup
366 	 */
367 	lgrp_root_init();
368 
369 	/*
370 	 * Add cpu0 to an lgroup
371 	 */
372 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
373 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
374 }
375 
376 /*
377  * Lgroup initialization is split in two parts. The first part
378  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
379  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
380  * when all CPUs are brought online and all distance information is available.
381  *
382  * When lgrp_main_init() is complete it sets lgrp_initialized. The
383  * lgrp_main_mp_init() sets lgrp_topo_initialized.
384  */
385 
/*
 * Nonzero once lgrp_main_init() has completed lgrp initialization.
 */
int	lgrp_initialized = 0;

/*
 * Nonzero once lgrp_main_mp_init() has constructed the lgrp topology.
 */
int	lgrp_topo_initialized = 0;
395 
396 /*
397  * Init routine called after startup(), /etc/system has been processed,
398  * and cpu0 has been added to an lgroup.
399  */
400 void
401 lgrp_main_init(void)
402 {
403 	cpu_t		*cp = CPU;
404 	lgrp_id_t	lgrpid;
405 	int		i;
406 	/*
407 	 * Enforce a valid lgrp_mem_default_policy
408 	 */
409 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
410 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
411 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
412 
413 	/*
414 	 * See if mpo should be disabled.
415 	 * This may happen in the case of null proc LPA on Starcat.
416 	 * The platform won't be able to detect null proc LPA until after
417 	 * cpu0 and memory have already been added to lgroups.
418 	 * When and if it is detected, the Starcat platform will return
419 	 * a different platform handle for cpu0 which is what we check for
420 	 * here. If mpo should be disabled move cpu0 to it's rightful place
421 	 * (the root), and destroy the remaining lgroups. This effectively
422 	 * provides an UMA lgroup topology.
423 	 */
424 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
425 	if (lgrp_table[lgrpid]->lgrp_plathand !=
426 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
427 		lgrp_part_del_cpu(cp);
428 		lgrp_cpu_fini(cp, lgrpid);
429 
430 		lgrp_cpu_init(cp);
431 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
432 
433 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
434 
435 		/*
436 		 * Destroy all lgroups except for root
437 		 */
438 		for (i = 0; i <= lgrp_alloc_max; i++) {
439 			if (LGRP_EXISTS(lgrp_table[i]) &&
440 			    lgrp_table[i] != lgrp_root)
441 				lgrp_destroy(lgrp_table[i]);
442 		}
443 
444 		/*
445 		 * Fix up root to point at itself for leaves and resources
446 		 * and not have any children
447 		 */
448 		lgrp_root->lgrp_childcnt = 0;
449 		klgrpset_clear(lgrp_root->lgrp_children);
450 		klgrpset_clear(lgrp_root->lgrp_leaves);
451 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
452 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
453 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
454 	}
455 
456 	/*
457 	 * Initialize kstats framework.
458 	 */
459 	lgrp_kstat_init();
460 	/*
461 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
462 	 */
463 	mutex_enter(&cpu_lock);
464 	lgrp_kstat_create(cp);
465 	mutex_exit(&cpu_lock);
466 
467 	lgrp_plat_main_init();
468 	lgrp_initialized = 1;
469 }
470 
471 /*
472  * Finish lgrp initialization after all CPUS are brought on-line.
473  * This routine is called after start_other_cpus().
474  */
475 void
476 lgrp_main_mp_init(void)
477 {
478 	klgrpset_t changed;
479 
480 	/*
481 	 * Update lgroup topology (if necessary)
482 	 */
483 	klgrpset_clear(changed);
484 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
485 	lgrp_topo_initialized = 1;
486 }
487 
488 /*
489  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
490  */
491 void
492 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
493 {
494 	klgrpset_t	changed;
495 	cpu_t		*cp;
496 	lgrp_id_t	id;
497 	int		rc;
498 
499 	switch (event) {
500 	/*
501 	 * The following (re)configuration events are common code
502 	 * initiated. lgrp_plat_config() is called here to inform the
503 	 * platform of the reconfiguration event.
504 	 */
505 	case LGRP_CONFIG_CPU_ADD:
506 		cp = (cpu_t *)resource;
507 
508 		/*
509 		 * Initialize the new CPU's lgrp related next/prev
510 		 * links, and give it a bootstrap lpl so that it can
511 		 * survive should it need to enter the dispatcher.
512 		 */
513 		cp->cpu_next_lpl = cp;
514 		cp->cpu_prev_lpl = cp;
515 		cp->cpu_next_lgrp = cp;
516 		cp->cpu_prev_lgrp = cp;
517 		cp->cpu_lpl = lpl_bootstrap;
518 
519 		lgrp_plat_config(event, resource);
520 		atomic_add_32(&lgrp_gen, 1);
521 
522 		break;
523 	case LGRP_CONFIG_CPU_DEL:
524 		lgrp_plat_config(event, resource);
525 		atomic_add_32(&lgrp_gen, 1);
526 
527 		break;
528 	case LGRP_CONFIG_CPU_ONLINE:
529 		cp = (cpu_t *)resource;
530 		lgrp_cpu_init(cp);
531 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
532 		rc = lpl_topo_verify(cp->cpu_part);
533 		if (rc != LPL_TOPO_CORRECT) {
534 			panic("lpl_topo_verify failed: %d", rc);
535 		}
536 		lgrp_plat_config(event, resource);
537 		atomic_add_32(&lgrp_gen, 1);
538 
539 		break;
540 	case LGRP_CONFIG_CPU_OFFLINE:
541 		cp = (cpu_t *)resource;
542 		id = cp->cpu_lpl->lpl_lgrpid;
543 		lgrp_part_del_cpu(cp);
544 		lgrp_cpu_fini(cp, id);
545 		rc = lpl_topo_verify(cp->cpu_part);
546 		if (rc != LPL_TOPO_CORRECT) {
547 			panic("lpl_topo_verify failed: %d", rc);
548 		}
549 		lgrp_plat_config(event, resource);
550 		atomic_add_32(&lgrp_gen, 1);
551 
552 		break;
553 	case LGRP_CONFIG_CPUPART_ADD:
554 		cp = (cpu_t *)resource;
555 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
556 		rc = lpl_topo_verify(cp->cpu_part);
557 		if (rc != LPL_TOPO_CORRECT) {
558 			panic("lpl_topo_verify failed: %d", rc);
559 		}
560 		lgrp_plat_config(event, resource);
561 
562 		break;
563 	case LGRP_CONFIG_CPUPART_DEL:
564 		cp = (cpu_t *)resource;
565 		lgrp_part_del_cpu((cpu_t *)resource);
566 		rc = lpl_topo_verify(cp->cpu_part);
567 		if (rc != LPL_TOPO_CORRECT) {
568 			panic("lpl_topo_verify failed: %d", rc);
569 		}
570 		lgrp_plat_config(event, resource);
571 
572 		break;
573 	/*
574 	 * The following events are initiated by the memnode
575 	 * subsystem.
576 	 */
577 	case LGRP_CONFIG_MEM_ADD:
578 		lgrp_mem_init((int)resource, where, B_FALSE);
579 		atomic_add_32(&lgrp_gen, 1);
580 
581 		break;
582 	case LGRP_CONFIG_MEM_DEL:
583 		lgrp_mem_fini((int)resource, where, B_FALSE);
584 		atomic_add_32(&lgrp_gen, 1);
585 
586 		break;
587 	case LGRP_CONFIG_MEM_RENAME: {
588 		lgrp_config_mem_rename_t *ren_arg =
589 		    (lgrp_config_mem_rename_t *)where;
590 
591 		lgrp_mem_rename((int)resource,
592 		    ren_arg->lmem_rename_from,
593 		    ren_arg->lmem_rename_to);
594 		atomic_add_32(&lgrp_gen, 1);
595 
596 		break;
597 	}
598 	case LGRP_CONFIG_GEN_UPDATE:
599 		atomic_add_32(&lgrp_gen, 1);
600 
601 		break;
602 	case LGRP_CONFIG_FLATTEN:
603 		if (where == 0)
604 			lgrp_topo_levels = (int)resource;
605 		else
606 			(void) lgrp_topo_flatten(resource,
607 			    lgrp_table, lgrp_alloc_max, &changed);
608 
609 		break;
610 	/*
611 	 * Initiated by platform latency probing code
612 	 */
613 	case LGRP_CONFIG_LATENCY_CHANGE:
614 		lgrp_latency_change((u_longlong_t)resource,
615 		    (u_longlong_t)where);
616 
617 		break;
618 	case LGRP_CONFIG_NOP:
619 
620 		break;
621 	default:
622 		break;
623 	}
624 
625 }
626 
627 /*
628  * Called to add lgrp info into cpu structure from cpu_add_unit;
629  * do not assume cpu is in cpu[] yet!
630  *
631  * CPUs are brought online with all other CPUs paused so we can't
632  * allocate memory or we could deadlock the system, so we rely on
633  * the platform to statically allocate as much space as we need
634  * for the lgrp structs and stats.
635  */
636 static void
637 lgrp_cpu_init(struct cpu *cp)
638 {
639 	klgrpset_t	changed;
640 	int		count;
641 	lgrp_handle_t	hand;
642 	int		first_cpu;
643 	lgrp_t		*my_lgrp;
644 	lgrp_id_t	lgrpid;
645 	struct cpu	*cptr;
646 	struct chip	*chp;
647 
648 	/*
649 	 * This is the first time through if the resource set
650 	 * for the root lgroup is empty. After cpu0 has been
651 	 * initially added to an lgroup, the root's CPU resource
652 	 * set can never be empty, since the system's last CPU
653 	 * cannot be offlined.
654 	 */
655 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
656 		/*
657 		 * First time through.
658 		 */
659 		first_cpu = 1;
660 	} else {
661 		/*
662 		 * If cpu0 needs to move lgroups, we may come
663 		 * through here again, at which time cpu_lock won't
664 		 * be held, and lgrp_initialized will be false.
665 		 */
666 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
667 		ASSERT(cp->cpu_part != NULL);
668 		first_cpu = 0;
669 	}
670 
671 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
672 	my_lgrp = lgrp_hand_to_lgrp(hand);
673 
674 	if (my_lgrp == NULL) {
675 		/*
676 		 * Create new lgrp and add it to lgroup topology
677 		 */
678 		my_lgrp = lgrp_create();
679 		my_lgrp->lgrp_plathand = hand;
680 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
681 		lgrpid = my_lgrp->lgrp_id;
682 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
683 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
684 
685 		count = 0;
686 		klgrpset_clear(changed);
687 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
688 		    &changed);
689 		/*
690 		 * May have added new intermediate lgroups, so need to add
691 		 * resources other than CPUs which are added below
692 		 */
693 		(void) lgrp_mnode_update(changed, NULL);
694 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
695 	    > 0) {
696 		/*
697 		 * Leaf lgroup was created, but latency wasn't available
698 		 * then.  So, set latency for it and fill in rest of lgroup
699 		 * topology  now that we know how far it is from other leaf
700 		 * lgroups.
701 		 */
702 		lgrpid = my_lgrp->lgrp_id;
703 		klgrpset_clear(changed);
704 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
705 		    lgrpid))
706 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
707 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
708 		    &changed);
709 
710 		/*
711 		 * May have added new intermediate lgroups, so need to add
712 		 * resources other than CPUs which are added below
713 		 */
714 		(void) lgrp_mnode_update(changed, NULL);
715 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
716 	    my_lgrp->lgrp_id)) {
717 		int	i;
718 
719 		/*
720 		 * Update existing lgroup and lgroups containing it with CPU
721 		 * resource
722 		 */
723 		lgrpid = my_lgrp->lgrp_id;
724 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
725 		for (i = 0; i <= lgrp_alloc_max; i++) {
726 			lgrp_t		*lgrp;
727 
728 			lgrp = lgrp_table[i];
729 			if (!LGRP_EXISTS(lgrp) ||
730 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
731 				continue;
732 
733 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
734 		}
735 	}
736 
737 	lgrpid = my_lgrp->lgrp_id;
738 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
739 
740 	/*
741 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
742 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
743 	 * not since none of lgroup IDs in the lpl's have been set yet.
744 	 */
745 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
746 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
747 
748 	/*
749 	 * link the CPU into the lgrp's CPU list
750 	 */
751 	if (my_lgrp->lgrp_cpucnt == 0) {
752 		my_lgrp->lgrp_cpu = cp;
753 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
754 	} else {
755 		cptr = my_lgrp->lgrp_cpu;
756 		cp->cpu_next_lgrp = cptr;
757 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
758 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
759 		cptr->cpu_prev_lgrp = cp;
760 	}
761 	my_lgrp->lgrp_cpucnt++;
762 
763 	/*
764 	 * Add this cpu's chip to the per lgroup list
765 	 * if necessary
766 	 */
767 	if (cp->cpu_chip->chip_lgrp == NULL) {
768 		struct chip *lcpr;
769 
770 		chp = cp->cpu_chip;
771 
772 		if (my_lgrp->lgrp_chipcnt == 0) {
773 			my_lgrp->lgrp_chips = chp;
774 			chp->chip_next_lgrp =
775 			    chp->chip_prev_lgrp = chp;
776 		} else {
777 			lcpr = my_lgrp->lgrp_chips;
778 			chp->chip_next_lgrp = lcpr;
779 			chp->chip_prev_lgrp =
780 			    lcpr->chip_prev_lgrp;
781 			lcpr->chip_prev_lgrp->chip_next_lgrp =
782 			    chp;
783 			lcpr->chip_prev_lgrp = chp;
784 		}
785 		chp->chip_lgrp = my_lgrp;
786 		chp->chip_balance = chp->chip_next_lgrp;
787 		my_lgrp->lgrp_chipcnt++;
788 	}
789 }
790 
791 lgrp_t *
792 lgrp_create(void)
793 {
794 	lgrp_t		*my_lgrp;
795 	lgrp_id_t	lgrpid;
796 	int		i;
797 
798 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
799 
800 	/*
801 	 * Find an open slot in the lgroup table and recycle unused lgroup
802 	 * left there if any
803 	 */
804 	my_lgrp = NULL;
805 	if (lgrp_alloc_hint == -1)
806 		/*
807 		 * Allocate from end when hint not set yet because no lgroups
808 		 * have been deleted yet
809 		 */
810 		lgrpid = nlgrps++;
811 	else {
812 		/*
813 		 * Start looking for next open slot from hint and leave hint
814 		 * at slot allocated
815 		 */
816 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
817 			my_lgrp = lgrp_table[i];
818 			if (!LGRP_EXISTS(my_lgrp)) {
819 				lgrpid = i;
820 				nlgrps++;
821 				break;
822 			}
823 		}
824 		lgrp_alloc_hint = lgrpid;
825 	}
826 
827 	/*
828 	 * Keep track of max lgroup ID allocated so far to cut down on searches
829 	 */
830 	if (lgrpid > lgrp_alloc_max)
831 		lgrp_alloc_max = lgrpid;
832 
833 	/*
834 	 * Need to allocate new lgroup if next open slot didn't have one
835 	 * for recycling
836 	 */
837 	if (my_lgrp == NULL)
838 		my_lgrp = lgrp_plat_alloc(lgrpid);
839 
840 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
841 		panic("Too many lgrps for platform (%d)", nlgrps);
842 
843 	my_lgrp->lgrp_id = lgrpid;
844 	my_lgrp->lgrp_latency = 0;
845 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
846 	my_lgrp->lgrp_parent = NULL;
847 	my_lgrp->lgrp_childcnt = 0;
848 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
849 	my_lgrp->lgrp_nmnodes = 0;
850 	klgrpset_clear(my_lgrp->lgrp_children);
851 	klgrpset_clear(my_lgrp->lgrp_leaves);
852 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
853 		klgrpset_clear(my_lgrp->lgrp_set[i]);
854 
855 	my_lgrp->lgrp_cpu = NULL;
856 	my_lgrp->lgrp_cpucnt = 0;
857 	my_lgrp->lgrp_chips = NULL;
858 	my_lgrp->lgrp_chipcnt = 0;
859 
860 	if (my_lgrp->lgrp_kstat != NULL)
861 		lgrp_kstat_reset(lgrpid);
862 
863 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
864 
865 	return (my_lgrp);
866 }
867 
868 void
869 lgrp_destroy(lgrp_t *lgrp)
870 {
871 	int		i;
872 
873 	/*
874 	 * Unless this lgroup is being destroyed on behalf of
875 	 * the boot CPU, cpu_lock must be held
876 	 */
877 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
878 
879 	if (nlgrps == 1)
880 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
881 
882 	if (!LGRP_EXISTS(lgrp))
883 		return;
884 
885 	/*
886 	 * Set hint to lgroup being deleted and try to keep lower numbered
887 	 * hints to facilitate finding empty slots
888 	 */
889 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
890 		lgrp_alloc_hint = lgrp->lgrp_id;
891 
892 	/*
893 	 * Mark this lgroup to be recycled by setting its lgroup ID to
894 	 * LGRP_NONE and clear relevant fields
895 	 */
896 	lgrp->lgrp_id = LGRP_NONE;
897 	lgrp->lgrp_latency = 0;
898 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
899 	lgrp->lgrp_parent = NULL;
900 	lgrp->lgrp_childcnt = 0;
901 
902 	klgrpset_clear(lgrp->lgrp_children);
903 	klgrpset_clear(lgrp->lgrp_leaves);
904 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
905 		klgrpset_clear(lgrp->lgrp_set[i]);
906 
907 	lgrp->lgrp_mnodes = (mnodeset_t)0;
908 	lgrp->lgrp_nmnodes = 0;
909 
910 	lgrp->lgrp_cpu = NULL;
911 	lgrp->lgrp_cpucnt = 0;
912 	lgrp->lgrp_chipcnt = 0;
913 	lgrp->lgrp_chips = NULL;
914 
915 	nlgrps--;
916 }
917 
918 /*
919  * Initialize kstat data. Called from lgrp intialization code.
920  */
921 static void
922 lgrp_kstat_init(void)
923 {
924 	lgrp_stat_t	stat;
925 
926 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
927 
928 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
929 		kstat_named_init(&lgrp_kstat_data[stat],
930 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
931 }
932 
933 /*
934  * initialize an lgrp's kstats if needed
935  * called with cpu_lock held but not with cpus paused.
936  * we don't tear these down now because we don't know about
937  * memory leaving the lgrp yet...
938  */
939 
940 void
941 lgrp_kstat_create(cpu_t *cp)
942 {
943 	kstat_t		*lgrp_kstat;
944 	lgrp_id_t	lgrpid;
945 	lgrp_t		*my_lgrp;
946 
947 	ASSERT(MUTEX_HELD(&cpu_lock));
948 
949 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
950 	my_lgrp = lgrp_table[lgrpid];
951 
952 	if (my_lgrp->lgrp_kstat != NULL)
953 		return; /* already initialized */
954 
955 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
956 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
957 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
958 
959 	if (lgrp_kstat != NULL) {
960 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
961 		lgrp_kstat->ks_private = my_lgrp;
962 		lgrp_kstat->ks_data = &lgrp_kstat_data;
963 		lgrp_kstat->ks_update = lgrp_kstat_extract;
964 		my_lgrp->lgrp_kstat = lgrp_kstat;
965 		kstat_install(lgrp_kstat);
966 	}
967 }
968 
969 /*
970  * this will do something when we manage to remove now unused lgrps
971  */
972 
973 /* ARGSUSED */
974 void
975 lgrp_kstat_destroy(cpu_t *cp)
976 {
977 	ASSERT(MUTEX_HELD(&cpu_lock));
978 }
979 
980 /*
981  * Called when a CPU is off-lined.
982  */
983 static void
984 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
985 {
986 	lgrp_t *my_lgrp;
987 	struct cpu *prev;
988 	struct cpu *next;
989 	chip_t  *chp;
990 
991 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
992 
993 	prev = cp->cpu_prev_lgrp;
994 	next = cp->cpu_next_lgrp;
995 
996 	prev->cpu_next_lgrp = next;
997 	next->cpu_prev_lgrp = prev;
998 
999 	/*
1000 	 * just because I'm paranoid doesn't mean...
1001 	 */
1002 
1003 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1004 
1005 	my_lgrp = lgrp_table[lgrpid];
1006 	my_lgrp->lgrp_cpucnt--;
1007 
1008 	/*
1009 	 * If the last CPU on its chip is being offlined
1010 	 * then remove this chip from the per lgroup list.
1011 	 *
1012 	 * This is also done for the boot CPU when it needs
1013 	 * to move between lgroups as a consequence of
1014 	 * null proc lpa.
1015 	 */
1016 	chp = cp->cpu_chip;
1017 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
1018 
1019 		chip_t	*chpp;
1020 
1021 		if (--my_lgrp->lgrp_chipcnt == 0)
1022 			my_lgrp->lgrp_chips = NULL;
1023 		else if (my_lgrp->lgrp_chips == chp)
1024 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
1025 
1026 		/*
1027 		 * Walk this lgroup's chip list looking for chips that
1028 		 * may try to balance against the one that's leaving
1029 		 */
1030 		for (chpp = chp->chip_next_lgrp; chpp != chp;
1031 		    chpp = chpp->chip_next_lgrp) {
1032 			if (chpp->chip_balance == chp)
1033 				chpp->chip_balance = chp->chip_next_lgrp;
1034 		}
1035 
1036 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1037 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1038 
1039 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1040 		chp->chip_lgrp = NULL;
1041 		chp->chip_balance = NULL;
1042 	}
1043 
1044 	/*
1045 	 * Removing last CPU in lgroup, so update lgroup topology
1046 	 */
1047 	if (my_lgrp->lgrp_cpucnt == 0) {
1048 		klgrpset_t	changed;
1049 		int		count;
1050 		int		i;
1051 
1052 		my_lgrp->lgrp_cpu = NULL;
1053 
1054 		/*
1055 		 * Remove this lgroup from its lgroup CPU resources and remove
1056 		 * lgroup from lgroup topology if it doesn't have any more
1057 		 * resources in it now
1058 		 */
1059 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1060 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1061 			count = 0;
1062 			klgrpset_clear(changed);
1063 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1064 			    lgrp_alloc_max + 1, &changed);
1065 			return;
1066 		}
1067 
1068 		/*
1069 		 * This lgroup isn't empty, so just remove it from CPU
1070 		 * resources of any lgroups that contain it as such
1071 		 */
1072 		for (i = 0; i <= lgrp_alloc_max; i++) {
1073 			lgrp_t		*lgrp;
1074 
1075 			lgrp = lgrp_table[i];
1076 			if (!LGRP_EXISTS(lgrp) ||
1077 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1078 			    lgrpid))
1079 				continue;
1080 
1081 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1082 		}
1083 		return;
1084 	}
1085 
1086 	if (my_lgrp->lgrp_cpu == cp)
1087 		my_lgrp->lgrp_cpu = next;
1088 
1089 }
1090 
1091 /*
1092  * Update memory nodes in target lgroups and return ones that get changed
1093  */
1094 int
1095 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1096 {
1097 	int	count;
1098 	int	i;
1099 	int	j;
1100 	lgrp_t	*lgrp;
1101 	lgrp_t	*lgrp_rsrc;
1102 
1103 	count = 0;
1104 	if (changed)
1105 		klgrpset_clear(*changed);
1106 
1107 	if (klgrpset_isempty(target))
1108 		return (0);
1109 
1110 	/*
1111 	 * Find each lgroup in target lgroups
1112 	 */
1113 	for (i = 0; i <= lgrp_alloc_max; i++) {
1114 		/*
1115 		 * Skip any lgroups that don't exist or aren't in target group
1116 		 */
1117 		lgrp = lgrp_table[i];
1118 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1119 			continue;
1120 		}
1121 
1122 		/*
1123 		 * Initialize memnodes for intermediate lgroups to 0
1124 		 * and update them from scratch since they may have completely
1125 		 * changed
1126 		 */
1127 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1128 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1129 			lgrp->lgrp_nmnodes = 0;
1130 		}
1131 
1132 		/*
1133 		 * Update memory nodes of of target lgroup with memory nodes
1134 		 * from each lgroup in its lgroup memory resource set
1135 		 */
1136 		for (j = 0; j <= lgrp_alloc_max; j++) {
1137 			int	k;
1138 
1139 			/*
1140 			 * Skip any lgroups that don't exist or aren't in
1141 			 * memory resources of target lgroup
1142 			 */
1143 			lgrp_rsrc = lgrp_table[j];
1144 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1145 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1146 			    j))
1147 				continue;
1148 
1149 			/*
1150 			 * Update target lgroup's memnodes to include memnodes
1151 			 * of this lgroup
1152 			 */
1153 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1154 				mnodeset_t	mnode_mask;
1155 
1156 				mnode_mask = (mnodeset_t)1 << k;
1157 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1158 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1159 					lgrp->lgrp_mnodes |= mnode_mask;
1160 					lgrp->lgrp_nmnodes++;
1161 				}
1162 			}
1163 			count++;
1164 			if (changed)
1165 				klgrpset_add(*changed, lgrp->lgrp_id);
1166 		}
1167 	}
1168 
1169 	return (count);
1170 }
1171 
1172 /*
1173  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1174  * is moved from one board to another. The "from" and "to" arguments specify the
1175  * source and the destination of the move.
1176  *
1177  * See plat_lgrp_config() for a detailed description of the copy-rename
1178  * semantics.
1179  *
1180  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1181  * the lgroup topology which is changing as memory moves from one lgroup to
1182  * another. It removes the mnode from the source lgroup and re-inserts it in the
1183  * target lgroup.
1184  *
1185  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1186  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
1187  * copy-rename operation.
1188  *
1189  * There is one case which requires special handling. If the system contains
1190  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1191  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1192  * lgrp_mem_init), but there is a window when the system has no memory in the
1193  * lgroup hierarchy. If another thread tries to allocate memory during this
1194  * window, the allocation will fail, although the system has physical memory.
1195  * This may cause a system panic or a deadlock (some sleeping memory allocations
1196  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1197  * the mnode back).
1198  *
1199  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1200  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1201  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1202  * but it updates the rest of the lgroup topology as if the mnode was actually
1203  * removed. The lgrp_mem_init() function recognizes that the mnode being
1204  * inserted represents such a special case and updates the topology
1205  * appropriately.
1206  */
1207 void
1208 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1209 {
1210 	/*
1211 	 * Remove the memory from the source node and add it to the destination
1212 	 * node.
1213 	 */
1214 	lgrp_mem_fini(mnode, from, B_TRUE);
1215 	lgrp_mem_init(mnode, to, B_TRUE);
1216 }
1217 
1218 /*
1219  * Called to indicate that the lgrp with platform handle "hand" now
1220  * contains the memory identified by "mnode".
1221  *
1222  * LOCKING for this routine is a bit tricky. Usually it is called without
1223  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1224  * callers. During DR of the board containing the caged memory it may be called
1225  * with cpu_lock already held and CPUs paused.
1226  *
1227  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1228  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1229  * dealing with the special case of DR copy-rename described in
1230  * lgrp_mem_rename().
1231  */
1232 void
1233 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1234 {
1235 	klgrpset_t	changed;
1236 	int		count;
1237 	int		i;
1238 	lgrp_t		*my_lgrp;
1239 	lgrp_id_t	lgrpid;
1240 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1241 	boolean_t	drop_lock = B_FALSE;
1242 	boolean_t	need_synch = B_FALSE;
1243 
1244 	/*
1245 	 * Grab CPU lock (if we haven't already)
1246 	 */
1247 	if (!MUTEX_HELD(&cpu_lock)) {
1248 		mutex_enter(&cpu_lock);
1249 		drop_lock = B_TRUE;
1250 	}
1251 
1252 	/*
1253 	 * This routine may be called from a context where we already
1254 	 * hold cpu_lock, and have already paused cpus.
1255 	 */
1256 	if (!cpus_paused())
1257 		need_synch = B_TRUE;
1258 
1259 	/*
1260 	 * Check if this mnode is already configured and return immediately if
1261 	 * it is.
1262 	 *
1263 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1264 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1265 	 * recognize this case and continue as usual, but skip the update to
1266 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1267 	 * in topology, temporarily introduced by lgrp_mem_fini().
1268 	 */
1269 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1270 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1271 		if (drop_lock)
1272 			mutex_exit(&cpu_lock);
1273 		return;
1274 	}
1275 
1276 	/*
1277 	 * Update lgroup topology with new memory resources, keeping track of
1278 	 * which lgroups change
1279 	 */
1280 	count = 0;
1281 	klgrpset_clear(changed);
1282 	my_lgrp = lgrp_hand_to_lgrp(hand);
1283 	if (my_lgrp == NULL) {
1284 		/* new lgrp */
1285 		my_lgrp = lgrp_create();
1286 		lgrpid = my_lgrp->lgrp_id;
1287 		my_lgrp->lgrp_plathand = hand;
1288 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1289 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1290 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1291 
1292 		if (need_synch)
1293 			pause_cpus(NULL);
1294 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1295 		    &changed);
1296 		if (need_synch)
1297 			start_cpus();
1298 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1299 	    > 0) {
1300 		/*
1301 		 * Leaf lgroup was created, but latency wasn't available
1302 		 * then.  So, set latency for it and fill in rest of lgroup
1303 		 * topology  now that we know how far it is from other leaf
1304 		 * lgroups.
1305 		 */
1306 		klgrpset_clear(changed);
1307 		lgrpid = my_lgrp->lgrp_id;
1308 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1309 		    lgrpid))
1310 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1311 		if (need_synch)
1312 			pause_cpus(NULL);
1313 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1314 		    &changed);
1315 		if (need_synch)
1316 			start_cpus();
1317 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1318 	    my_lgrp->lgrp_id)) {
1319 		/*
1320 		 * Add new lgroup memory resource to existing lgroup
1321 		 */
1322 		lgrpid = my_lgrp->lgrp_id;
1323 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1324 		klgrpset_add(changed, lgrpid);
1325 		count++;
1326 		for (i = 0; i <= lgrp_alloc_max; i++) {
1327 			lgrp_t		*lgrp;
1328 
1329 			lgrp = lgrp_table[i];
1330 			if (!LGRP_EXISTS(lgrp) ||
1331 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1332 				continue;
1333 
1334 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1335 			klgrpset_add(changed, lgrp->lgrp_id);
1336 			count++;
1337 		}
1338 	}
1339 
1340 	/*
1341 	 * Add memory node to lgroup and remove lgroup from ones that need
1342 	 * to be updated
1343 	 */
1344 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1345 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1346 		my_lgrp->lgrp_nmnodes++;
1347 	}
1348 	klgrpset_del(changed, lgrpid);
1349 
1350 	/*
1351 	 * Update memory node information for all lgroups that changed and
1352 	 * contain new memory node as a resource
1353 	 */
1354 	if (count)
1355 		(void) lgrp_mnode_update(changed, NULL);
1356 
1357 	if (drop_lock)
1358 		mutex_exit(&cpu_lock);
1359 }
1360 
1361 /*
1362  * Called to indicate that the lgroup associated with the platform
1363  * handle "hand" no longer contains given memory node
1364  *
1365  * LOCKING for this routine is a bit tricky. Usually it is called without
1366  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1367  * callers. During DR of the board containing the caged memory it may be called
1368  * with cpu_lock already held and CPUs paused.
1369  *
1370  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1371  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1372  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1373  * the same mnode back into the topology. See lgrp_mem_rename() and
1374  * lgrp_mem_init() for additional details.
1375  */
1376 void
1377 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1378 {
1379 	klgrpset_t	changed;
1380 	int		count;
1381 	int		i;
1382 	lgrp_t		*my_lgrp;
1383 	lgrp_id_t	lgrpid;
1384 	mnodeset_t	mnodes_mask;
1385 	boolean_t	drop_lock = B_FALSE;
1386 	boolean_t	need_synch = B_FALSE;
1387 
1388 	/*
1389 	 * Grab CPU lock (if we haven't already)
1390 	 */
1391 	if (!MUTEX_HELD(&cpu_lock)) {
1392 		mutex_enter(&cpu_lock);
1393 		drop_lock = B_TRUE;
1394 	}
1395 
1396 	/*
1397 	 * This routine may be called from a context where we already
1398 	 * hold cpu_lock and have already paused cpus.
1399 	 */
1400 	if (!cpus_paused())
1401 		need_synch = B_TRUE;
1402 
1403 	my_lgrp = lgrp_hand_to_lgrp(hand);
1404 
1405 	/*
1406 	 * The lgrp *must* be pre-existing
1407 	 */
1408 	ASSERT(my_lgrp != NULL);
1409 
1410 	/*
1411 	 * Delete memory node from lgroups which contain it
1412 	 */
1413 	mnodes_mask = ((mnodeset_t)1 << mnode);
1414 	for (i = 0; i <= lgrp_alloc_max; i++) {
1415 		lgrp_t *lgrp = lgrp_table[i];
1416 		/*
1417 		 * Skip any non-existent lgroups and any lgroups that don't
1418 		 * contain leaf lgroup of memory as a memory resource
1419 		 */
1420 		if (!LGRP_EXISTS(lgrp) ||
1421 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1422 			continue;
1423 
1424 		/*
1425 		 * Avoid removing the last mnode from the root in the DR
1426 		 * copy-rename case. See lgrp_mem_rename() for details.
1427 		 */
1428 		if (is_copy_rename &&
1429 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1430 			continue;
1431 
1432 		/*
1433 		 * Remove memory node from lgroup.
1434 		 */
1435 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1436 		lgrp->lgrp_nmnodes--;
1437 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1438 	}
1439 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1440 
1441 	/*
1442 	 * Don't need to update lgroup topology if this lgroup still has memory.
1443 	 *
1444 	 * In the special case of DR copy-rename with the only mnode being
1445 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1446 	 * still need to update the lgroup topology.
1447 	 */
1448 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1449 	    !(is_copy_rename &&
1450 		(my_lgrp == lgrp_root) &&
1451 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1452 		if (drop_lock)
1453 			mutex_exit(&cpu_lock);
1454 		return;
1455 	}
1456 
1457 	/*
1458 	 * This lgroup does not contain any memory now
1459 	 */
1460 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1461 
1462 	/*
1463 	 * Remove this lgroup from lgroup topology if it does not contain any
1464 	 * resources now
1465 	 */
1466 	lgrpid = my_lgrp->lgrp_id;
1467 	count = 0;
1468 	klgrpset_clear(changed);
1469 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1470 		/*
1471 		 * Delete lgroup when no more resources
1472 		 */
1473 		if (need_synch)
1474 			pause_cpus(NULL);
1475 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1476 		    lgrp_alloc_max + 1, &changed);
1477 		ASSERT(count > 0);
1478 		if (need_synch)
1479 			start_cpus();
1480 	} else {
1481 		/*
1482 		 * Remove lgroup from memory resources of any lgroups that
1483 		 * contain it as such
1484 		 */
1485 		for (i = 0; i <= lgrp_alloc_max; i++) {
1486 			lgrp_t		*lgrp;
1487 
1488 			lgrp = lgrp_table[i];
1489 			if (!LGRP_EXISTS(lgrp) ||
1490 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1491 			    lgrpid))
1492 				continue;
1493 
1494 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1495 		}
1496 	}
1497 	if (drop_lock)
1498 		mutex_exit(&cpu_lock);
1499 }
1500 
1501 /*
1502  * Return lgroup with given platform handle
1503  */
1504 lgrp_t *
1505 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1506 {
1507 	int	i;
1508 	lgrp_t	*lgrp;
1509 
1510 	if (hand == LGRP_NULL_HANDLE)
1511 		return (NULL);
1512 
1513 	for (i = 0; i <= lgrp_alloc_max; i++) {
1514 		lgrp = lgrp_table[i];
1515 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1516 			return (lgrp);
1517 	}
1518 	return (NULL);
1519 }
1520 
1521 /*
1522  * Return the home lgroup of the current thread.
1523  * We must do this with kernel preemption disabled, since we don't want our
1524  * thread to be re-homed while we're poking around with its lpl, and the lpl
1525  * should never be NULL.
1526  *
1527  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1528  * is enabled because of DR.  Callers can use disable kernel preemption
1529  * around this call to guarantee that the lgroup will be valid beyond this
1530  * routine, since kernel preemption can be recursive.
1531  */
1532 lgrp_t *
1533 lgrp_home_lgrp(void)
1534 {
1535 	lgrp_t	*lgrp;
1536 	lpl_t	*lpl;
1537 
1538 	kpreempt_disable();
1539 
1540 	lpl = curthread->t_lpl;
1541 	ASSERT(lpl != NULL);
1542 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1543 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1544 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1545 
1546 	kpreempt_enable();
1547 
1548 	return (lgrp);
1549 }
1550 
1551 /*
1552  * Return ID of home lgroup for given thread
1553  * (See comments for lgrp_home_lgrp() for special care and handling
1554  * instructions)
1555  */
1556 lgrp_id_t
1557 lgrp_home_id(kthread_t *t)
1558 {
1559 	lgrp_id_t	lgrp;
1560 	lpl_t		*lpl;
1561 
1562 	ASSERT(t != NULL);
1563 	/*
1564 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1565 	 * cannot since the HAT layer can call into this routine to
1566 	 * determine the locality for its data structures in the context
1567 	 * of a page fault.
1568 	 */
1569 
1570 	kpreempt_disable();
1571 
1572 	lpl = t->t_lpl;
1573 	ASSERT(lpl != NULL);
1574 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1575 	lgrp = lpl->lpl_lgrpid;
1576 
1577 	kpreempt_enable();
1578 
1579 	return (lgrp);
1580 }
1581 
1582 /*
1583  * Return lgroup containing the physical memory for the given page frame number
1584  */
1585 lgrp_t *
1586 lgrp_pfn_to_lgrp(pfn_t pfn)
1587 {
1588 	lgrp_handle_t	hand;
1589 	int		i;
1590 	lgrp_t		*lgrp;
1591 
1592 	hand = lgrp_plat_pfn_to_hand(pfn);
1593 	if (hand != LGRP_NULL_HANDLE)
1594 		for (i = 0; i <= lgrp_alloc_max; i++) {
1595 			lgrp = lgrp_table[i];
1596 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1597 				return (lgrp);
1598 		}
1599 	return (NULL);
1600 }
1601 
1602 /*
1603  * Return lgroup containing the physical memory for the given page frame number
1604  */
1605 lgrp_t *
1606 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1607 {
1608 	lgrp_handle_t	hand;
1609 	int		i;
1610 	lgrp_t		*lgrp;
1611 	pfn_t		pfn;
1612 
1613 	pfn = btop(physaddr);
1614 	hand = lgrp_plat_pfn_to_hand(pfn);
1615 	if (hand != LGRP_NULL_HANDLE)
1616 		for (i = 0; i <= lgrp_alloc_max; i++) {
1617 			lgrp = lgrp_table[i];
1618 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1619 				return (lgrp);
1620 		}
1621 	return (NULL);
1622 }
1623 
1624 /*
1625  * Return the leaf lgroup containing the given CPU
1626  *
1627  * The caller needs to take precautions necessary to prevent
1628  * "cpu" from going away across a call to this function.
1629  * hint: kpreempt_disable()/kpreempt_enable()
1630  */
1631 static lgrp_t *
1632 lgrp_cpu_to_lgrp(cpu_t *cpu)
1633 {
1634 	return (cpu->cpu_lpl->lpl_lgrp);
1635 }
1636 
1637 /*
1638  * Return the sum of the partition loads in an lgrp divided by
1639  * the number of CPUs in the lgrp.  This is our best approximation
1640  * of an 'lgroup load average' for a useful per-lgroup kstat.
1641  */
1642 static uint64_t
1643 lgrp_sum_loadavgs(lgrp_t *lgrp)
1644 {
1645 	cpu_t *cpu;
1646 	int ncpu;
1647 	uint64_t loads = 0;
1648 
1649 	mutex_enter(&cpu_lock);
1650 
1651 	cpu = lgrp->lgrp_cpu;
1652 	ncpu = lgrp->lgrp_cpucnt;
1653 
1654 	if (cpu == NULL || ncpu == 0) {
1655 		mutex_exit(&cpu_lock);
1656 		return (0ull);
1657 	}
1658 
1659 	do {
1660 		loads += cpu->cpu_lpl->lpl_loadavg;
1661 		cpu = cpu->cpu_next_lgrp;
1662 	} while (cpu != lgrp->lgrp_cpu);
1663 
1664 	mutex_exit(&cpu_lock);
1665 
1666 	return (loads / ncpu);
1667 }
1668 
1669 void
1670 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1671 {
1672 	struct lgrp_stats *pstats;
1673 
1674 	/*
1675 	 * Verify that the caller isn't trying to add to
1676 	 * a statistic for an lgroup that has gone away
1677 	 */
1678 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1679 		return;
1680 
1681 	pstats = &lgrp_stats[lgrpid];
1682 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1683 }
1684 
1685 int64_t
1686 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1687 {
1688 	uint64_t val;
1689 	struct lgrp_stats *pstats;
1690 
1691 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1692 		return ((int64_t)0);
1693 
1694 	pstats = &lgrp_stats[lgrpid];
1695 	LGRP_STAT_READ(pstats, stat, val);
1696 	return (val);
1697 }
1698 
1699 /*
1700  * Reset all kstats for lgrp specified by its lgrpid.
1701  */
1702 static void
1703 lgrp_kstat_reset(lgrp_id_t lgrpid)
1704 {
1705 	lgrp_stat_t stat;
1706 
1707 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1708 		return;
1709 
1710 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1711 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1712 	}
1713 }
1714 
1715 /*
1716  * Collect all per-lgrp statistics for the lgrp associated with this
1717  * kstat, and store them in the ks_data array.
1718  *
1719  * The superuser can reset all the running counter statistics for an
1720  * lgrp by writing to any of the lgrp's stats.
1721  */
1722 static int
1723 lgrp_kstat_extract(kstat_t *ksp, int rw)
1724 {
1725 	lgrp_stat_t		stat;
1726 	struct kstat_named	*ksd;
1727 	lgrp_t			*lgrp;
1728 	lgrp_id_t		lgrpid;
1729 
1730 	lgrp = (lgrp_t *)ksp->ks_private;
1731 
1732 	ksd = (struct kstat_named *)ksp->ks_data;
1733 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1734 
1735 	lgrpid = lgrp->lgrp_id;
1736 
1737 	if (lgrpid == LGRP_NONE) {
1738 		/*
1739 		 * Return all zeroes as stats for freed lgrp.
1740 		 */
1741 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1742 			ksd[stat].value.i64 = 0;
1743 		}
1744 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1745 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1746 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1747 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1748 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1749 	} else if (rw != KSTAT_WRITE) {
1750 		/*
1751 		 * Handle counter stats
1752 		 */
1753 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1754 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1755 		}
1756 
1757 		/*
1758 		 * Handle kernel data snapshot stats
1759 		 */
1760 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1761 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1762 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1763 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1764 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1765 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1766 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1767 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1768 	} else {
1769 		lgrp_kstat_reset(lgrpid);
1770 	}
1771 
1772 	return (0);
1773 }
1774 
1775 int
1776 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1777 {
1778 	cpu_t	*cp;
1779 
1780 	mutex_enter(&cpu_lock);
1781 
1782 	if ((cp = cpu_get(id)) == NULL) {
1783 		mutex_exit(&cpu_lock);
1784 		return (EINVAL);
1785 	}
1786 
1787 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1788 		mutex_exit(&cpu_lock);
1789 		return (EINVAL);
1790 	}
1791 
1792 	ASSERT(cp->cpu_lpl != NULL);
1793 
1794 	*lp = cp->cpu_lpl->lpl_lgrpid;
1795 
1796 	mutex_exit(&cpu_lock);
1797 
1798 	return (0);
1799 }
1800 
1801 int
1802 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1803 {
1804 	cpu_t *cp;
1805 
1806 	mutex_enter(&cpu_lock);
1807 
1808 	if ((cp = cpu_get(id)) == NULL) {
1809 		mutex_exit(&cpu_lock);
1810 		return (EINVAL);
1811 	}
1812 
1813 	ASSERT(cp->cpu_lpl != NULL);
1814 
1815 	*lp = cp->cpu_lpl->lpl_loadavg;
1816 
1817 	mutex_exit(&cpu_lock);
1818 
1819 	return (0);
1820 }
1821 
1822 void
1823 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1824 {
1825 	lgrp_t		*lgrp;
1826 	int		i;
1827 
1828 	for (i = 0; i <= lgrp_alloc_max; i++) {
1829 		lgrp = lgrp_table[i];
1830 
1831 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1832 			lgrp->lgrp_latency = (int)newtime;
1833 	}
1834 }
1835 
1836 /*
1837  * Add a resource named by lpl_leaf to rset of lpl_target
1838  *
1839  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1840  * resource. It is adjusted here, as this is presently the only place that we
1841  * can be certain a resource addition has succeeded.
1842  *
1843  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1844  * list in order until it reaches a NULL.  (This list is required to be NULL
1845  * terminated, too).  This is done so that we can mark start pos + 1, so that
1846  * each lpl is traversed sequentially, but in a different order.  We hope this
1847  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1848  */
1849 
1850 void
1851 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1852 {
1853 	int		i;
1854 	int		entry_slot = 0;
1855 
1856 	/* return if leaf is already present */
1857 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1858 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1859 			return;
1860 		}
1861 
1862 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1863 		    lpl_leaf->lpl_lgrpid) {
1864 			break;
1865 		}
1866 	}
1867 
1868 	/* insert leaf, update counts */
1869 	entry_slot = i;
1870 	i = lpl_target->lpl_nrset++;
1871 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1872 		panic("More leaf lgrps in system than are supported!\n");
1873 	}
1874 
1875 	/*
1876 	 * Start at the end of the rset array and work backwards towards the
1877 	 * slot into which the new lpl will be inserted. This effectively
1878 	 * preserves the current ordering by scooting everybody over one entry,
1879 	 * and placing the new entry into the space created.
1880 	 */
1881 
1882 	while (i-- > entry_slot) {
1883 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1884 	}
1885 
1886 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1887 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1888 }
1889 
1890 /*
1891  * Update each of lpl_parent's children with a proper hint and
1892  * a reference to their parent.
1893  * The lgrp topology is used as the reference since it is fully
1894  * consistent and correct at this point.
1895  *
1896  * Each child's hint will reference an element in lpl_parent's
1897  * rset that designates where the child should start searching
1898  * for CPU resources. The hint selected is the highest order leaf present
1899  * in the child's lineage.
1900  *
1901  * This should be called after any potential change in lpl_parent's
1902  * rset.
1903  */
1904 static void
1905 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1906 {
1907 	klgrpset_t	children, leaves;
1908 	lpl_t		*lpl;
1909 	int		hint;
1910 	int		i, j;
1911 
1912 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1913 	if (klgrpset_isempty(children))
1914 		return; /* nothing to do */
1915 
1916 	for (i = 0; i <= lgrp_alloc_max; i++) {
1917 		if (klgrpset_ismember(children, i)) {
1918 
1919 			/*
1920 			 * Given the set of leaves in this child's lineage,
1921 			 * find the highest order leaf present in the parent's
1922 			 * rset. Select this as the hint for the child.
1923 			 */
1924 			leaves = lgrp_table[i]->lgrp_leaves;
1925 			hint = 0;
1926 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1927 				lpl = lpl_parent->lpl_rset[j];
1928 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1929 					hint = j;
1930 			}
1931 			cp->cp_lgrploads[i].lpl_hint = hint;
1932 
1933 			/*
1934 			 * (Re)set the parent. It may be incorrect if
1935 			 * lpl_parent is new in the topology.
1936 			 */
1937 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1938 		}
1939 	}
1940 }
1941 
1942 /*
1943  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1944  *
1945  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1946  * resource. The values are adjusted here, as this is the only place that we can
1947  * be certain a resource was successfully deleted.
1948  */
1949 void
1950 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1951 {
1952 	int i;
1953 
1954 	/* find leaf in intermediate node */
1955 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1956 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1957 			break;
1958 	}
1959 
1960 	/* return if leaf not found */
1961 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1962 		return;
1963 
1964 	/* prune leaf, compress array */
1965 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1966 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1967 	lpl_target->lpl_ncpu--;
1968 	do {
1969 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1970 	} while (i++ < lpl_target->lpl_nrset);
1971 }
1972 
1973 /*
1974  * Check to see if the resource set of the target lpl contains the
1975  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1976  */
1977 
1978 int
1979 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1980 {
1981 	int i;
1982 
1983 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1984 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1985 			return (1);
1986 	}
1987 
1988 	return (0);
1989 }
1990 
1991 /*
1992  * Called when we change cpu lpl membership.  This increments or decrements the
1993  * per-cpu counter in every lpl in which our leaf appears.
1994  */
1995 void
1996 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1997 {
1998 	cpupart_t	*cpupart;
1999 	lgrp_t		*lgrp_leaf;
2000 	lgrp_t		*lgrp_cur;
2001 	lpl_t		*lpl_leaf;
2002 	lpl_t		*lpl_cur;
2003 	int		i;
2004 
2005 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
2006 
2007 	cpupart = cp->cpu_part;
2008 	lpl_leaf = cp->cpu_lpl;
2009 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
2010 
2011 	for (i = 0; i <= lgrp_alloc_max; i++) {
2012 		lgrp_cur = lgrp_table[i];
2013 
2014 		/*
2015 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2016 		 * for the cpu in question, or if the current lgrp and leaf
2017 		 * don't share the same resources.
2018 		 */
2019 
2020 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2021 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2022 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2023 			continue;
2024 
2025 
2026 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2027 
2028 		if (lpl_cur->lpl_nrset > 0) {
2029 			if (act == LPL_INCREMENT) {
2030 				lpl_cur->lpl_ncpu++;
2031 			} else if (act == LPL_DECREMENT) {
2032 				lpl_cur->lpl_ncpu--;
2033 			}
2034 		}
2035 	}
2036 }
2037 
2038 /*
2039  * Initialize lpl with given resources and specified lgrp
2040  */
2041 
2042 void
2043 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2044 {
2045 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2046 	lpl->lpl_loadavg = 0;
2047 	if (lpl == lpl_leaf)
2048 		lpl->lpl_ncpu = 1;
2049 	else
2050 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2051 	lpl->lpl_nrset = 1;
2052 	lpl->lpl_rset[0] = lpl_leaf;
2053 	lpl->lpl_lgrp = lgrp;
2054 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2055 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2056 }
2057 
2058 /*
2059  * Clear an unused lpl
2060  */
2061 
2062 void
2063 lpl_clear(lpl_t *lpl)
2064 {
2065 	lgrp_id_t	lid;
2066 
2067 	/* save lid for debugging purposes */
2068 	lid = lpl->lpl_lgrpid;
2069 	bzero(lpl, sizeof (lpl_t));
2070 	lpl->lpl_lgrpid = lid;
2071 }
2072 
2073 /*
2074  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2075  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
2076  * make full use of all of the lgroup topology, but this checks to make sure
2077  * that for the parts that it does use, it has correctly understood the
2078  * relationships that exist. This function returns
2079  * 0 if the topology is correct, and a non-zero error code, for non-debug
2080  * kernels if incorrect.  Asserts are spread throughout the code to aid in
2081  * debugging on a DEBUG kernel.
2082  */
2083 int
2084 lpl_topo_verify(cpupart_t *cpupart)
2085 {
2086 	lgrp_t		*lgrp;
2087 	lpl_t		*lpl;
2088 	klgrpset_t	rset;
2089 	klgrpset_t	cset;
2090 	cpu_t		*cpu;
2091 	cpu_t		*cp_start;
2092 	int		i;
2093 	int		j;
2094 	int		sum;
2095 
2096 	/* topology can't be incorrect if it doesn't exist */
2097 	if (!lgrp_topo_initialized || !lgrp_initialized)
2098 		return (LPL_TOPO_CORRECT);
2099 
2100 	ASSERT(cpupart != NULL);
2101 
2102 	for (i = 0; i <= lgrp_alloc_max; i++) {
2103 		lgrp = lgrp_table[i];
2104 		lpl = NULL;
2105 		/* make sure lpls are allocated */
2106 		ASSERT(cpupart->cp_lgrploads);
2107 		if (!cpupart->cp_lgrploads)
2108 			return (LPL_TOPO_PART_HAS_NO_LPL);
2109 
2110 		lpl = &cpupart->cp_lgrploads[i];
2111 		/* make sure our index is good */
2112 		ASSERT(i < cpupart->cp_nlgrploads);
2113 
2114 		/* if lgroup doesn't exist, make sure lpl is empty */
2115 		if (!LGRP_EXISTS(lgrp)) {
2116 			ASSERT(lpl->lpl_ncpu == 0);
2117 			if (lpl->lpl_ncpu > 0) {
2118 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2119 			} else {
2120 				continue;
2121 			}
2122 		}
2123 
2124 		/* verify that lgroup and lpl are identically numbered */
2125 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2126 
2127 		/* if lgroup isn't in our partition, make sure lpl is empty */
2128 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2129 		    cpupart->cp_lgrpset)) {
2130 			ASSERT(lpl->lpl_ncpu == 0);
2131 			if (lpl->lpl_ncpu > 0) {
2132 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2133 			}
2134 			/*
2135 			 * lpl is empty, and lgroup isn't in partition.  verify
2136 			 * that lpl doesn't show up in anyone else's rsets (in
2137 			 * this partition, anyway)
2138 			 */
2139 
2140 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2141 				lpl_t *i_lpl; /* lpl we're iterating over */
2142 
2143 				i_lpl = &cpupart->cp_lgrploads[j];
2144 
2145 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2146 				if (lpl_rset_contains(i_lpl, lpl)) {
2147 					return (LPL_TOPO_LPL_ORPHANED);
2148 				}
2149 			}
2150 			/* lgroup is empty, and everything is ok. continue */
2151 			continue;
2152 		}
2153 
2154 
2155 		/* lgroup is in this partition, now check it against lpl */
2156 
2157 		/* do both have matching lgrps? */
2158 		ASSERT(lgrp == lpl->lpl_lgrp);
2159 		if (lgrp != lpl->lpl_lgrp) {
2160 			return (LPL_TOPO_LGRP_MISMATCH);
2161 		}
2162 
2163 		/* do the parent lgroups exist and do they match? */
2164 		if (lgrp->lgrp_parent) {
2165 			ASSERT(lpl->lpl_parent);
2166 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2167 				    lpl->lpl_parent->lpl_lgrpid);
2168 
2169 			if (!lpl->lpl_parent) {
2170 				return (LPL_TOPO_MISSING_PARENT);
2171 			} else if (lgrp->lgrp_parent->lgrp_id !=
2172 			    lpl->lpl_parent->lpl_lgrpid) {
2173 				return (LPL_TOPO_PARENT_MISMATCH);
2174 			}
2175 		}
2176 
2177 		/* only leaf lgroups keep a cpucnt, only check leaves */
2178 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2179 
2180 			/* verify that lgrp is also a leaf */
2181 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2182 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2183 			    lpl->lpl_lgrpid)));
2184 
2185 			if ((lgrp->lgrp_childcnt > 0) ||
2186 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2187 			    lpl->lpl_lgrpid))) {
2188 				return (LPL_TOPO_LGRP_NOT_LEAF);
2189 			}
2190 
2191 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2192 			    (lpl->lpl_ncpu > 0));
2193 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2194 				(lpl->lpl_ncpu <= 0)) {
2195 				return (LPL_TOPO_BAD_CPUCNT);
2196 			}
2197 
2198 			/*
2199 			 * Check that lpl_ncpu also matches the number of
2200 			 * cpus in the lpl's linked list.  This only exists in
2201 			 * leaves, but they should always match.
2202 			 */
2203 			j = 0;
2204 			cpu = cp_start = lpl->lpl_cpus;
2205 			while (cpu != NULL) {
2206 				j++;
2207 
2208 				/* check to make sure cpu's lpl is leaf lpl */
2209 				ASSERT(cpu->cpu_lpl == lpl);
2210 				if (cpu->cpu_lpl != lpl) {
2211 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2212 				}
2213 
2214 				/* check next cpu */
2215 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2216 					continue;
2217 				} else {
2218 					cpu = NULL;
2219 				}
2220 			}
2221 
2222 			ASSERT(j == lpl->lpl_ncpu);
2223 			if (j != lpl->lpl_ncpu) {
2224 				return (LPL_TOPO_LPL_BAD_NCPU);
2225 			}
2226 
2227 			/*
2228 			 * Also, check that leaf lpl is contained in all
2229 			 * intermediate lpls that name the leaf as a descendant
2230 			 */
2231 
2232 			for (j = 0; j <= lgrp_alloc_max; j++) {
2233 				klgrpset_t intersect;
2234 				lgrp_t *lgrp_cand;
2235 				lpl_t *lpl_cand;
2236 
2237 				lgrp_cand = lgrp_table[j];
2238 				intersect = klgrpset_intersects(
2239 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2240 				    cpupart->cp_lgrpset);
2241 
2242 				if (!LGRP_EXISTS(lgrp_cand) ||
2243 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2244 				    cpupart->cp_lgrpset) ||
2245 				    (intersect == 0))
2246 					continue;
2247 
2248 				lpl_cand =
2249 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2250 
2251 				if (klgrpset_ismember(intersect,
2252 				    lgrp->lgrp_id)) {
2253 					ASSERT(lpl_rset_contains(lpl_cand,
2254 					    lpl));
2255 
2256 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2257 						return (LPL_TOPO_RSET_MSSNG_LF);
2258 					}
2259 				}
2260 			}
2261 
2262 		} else { /* non-leaf specific checks */
2263 
2264 			/*
2265 			 * Non-leaf lpls should have lpl_cpus == NULL
2266 			 * verify that this is so
2267 			 */
2268 			ASSERT(lpl->lpl_cpus == NULL);
2269 			if (lpl->lpl_cpus != NULL) {
2270 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2271 			}
2272 
2273 			/*
2274 			 * verify that the sum of the cpus in the leaf resources
2275 			 * is equal to the total ncpu in the intermediate
2276 			 */
2277 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2278 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2279 			}
2280 
2281 			ASSERT(sum == lpl->lpl_ncpu);
2282 			if (sum != lpl->lpl_ncpu) {
2283 				return (LPL_TOPO_LPL_BAD_NCPU);
2284 			}
2285 		}
2286 
2287 		/*
2288 		 * check on lpl_hint. Don't check root, since it has no parent.
2289 		 */
2290 		if (lpl->lpl_parent != NULL) {
2291 			int hint;
2292 			lpl_t *hint_lpl;
2293 
2294 			/* make sure hint is within limits of nrset */
2295 			hint = lpl->lpl_hint;
2296 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2297 			if (lpl->lpl_parent->lpl_nrset < hint) {
2298 				return (LPL_TOPO_BOGUS_HINT);
2299 			}
2300 
2301 			/* make sure hint points to valid lpl */
2302 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2303 			ASSERT(hint_lpl->lpl_ncpu > 0);
2304 			if (hint_lpl->lpl_ncpu <= 0) {
2305 				return (LPL_TOPO_BOGUS_HINT);
2306 			}
2307 		}
2308 
2309 		/*
2310 		 * Check the rset of the lpl in question.  Make sure that each
2311 		 * rset contains a subset of the resources in
2312 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2313 		 * sure that each rset doesn't include resources that are
2314 		 * outside of that set.  (Which would be resources somehow not
2315 		 * accounted for).
2316 		 */
2317 
2318 		klgrpset_clear(rset);
2319 		for (j = 0; j < lpl->lpl_nrset; j++) {
2320 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2321 		}
2322 		klgrpset_copy(cset, rset);
2323 		/* make sure lpl rset matches lgrp rset */
2324 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2325 		/* make sure rset is contained with in partition, too */
2326 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2327 
2328 		ASSERT(klgrpset_isempty(rset) &&
2329 			    klgrpset_isempty(cset));
2330 		if (!klgrpset_isempty(rset) ||
2331 		    !klgrpset_isempty(cset)) {
2332 			return (LPL_TOPO_RSET_MISMATCH);
2333 		}
2334 
2335 		/*
2336 		 * check to make sure lpl_nrset matches the number of rsets
2337 		 * contained in the lpl
2338 		 */
2339 
2340 		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2341 		    j++);
2342 
2343 		ASSERT(j == lpl->lpl_nrset);
2344 		if (j != lpl->lpl_nrset) {
2345 			return (LPL_TOPO_BAD_RSETCNT);
2346 		}
2347 
2348 	}
2349 	return (LPL_TOPO_CORRECT);
2350 }
2351 
2352 /*
2353  * Flatten lpl topology to given number of levels.  This is presently only
2354  * implemented for a flatten to 2 levels, which will prune out the intermediates
2355  * and home the leaf lpls to the root lpl.
2356  */
2357 int
2358 lpl_topo_flatten(int levels)
2359 {
2360 	int		i;
2361 	uint_t		sum;
2362 	lgrp_t		*lgrp_cur;
2363 	lpl_t		*lpl_cur;
2364 	lpl_t		*lpl_root;
2365 	cpupart_t	*cp;
2366 
2367 	if (levels != 2)
2368 		return (0);
2369 
2370 	/* called w/ cpus paused - grab no locks! */
2371 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2372 	    !lgrp_initialized);
2373 
2374 	cp = cp_list_head;
2375 	do {
2376 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2377 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2378 
2379 		for (i = 0; i <= lgrp_alloc_max; i++) {
2380 			lgrp_cur = lgrp_table[i];
2381 			lpl_cur = &cp->cp_lgrploads[i];
2382 
2383 			if ((lgrp_cur == lgrp_root) ||
2384 			    (!LGRP_EXISTS(lgrp_cur) &&
2385 			    (lpl_cur->lpl_ncpu == 0)))
2386 				continue;
2387 
2388 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2389 				/*
2390 				 * this should be a deleted intermediate, so
2391 				 * clear it
2392 				 */
2393 				lpl_clear(lpl_cur);
2394 			} else if ((lpl_cur->lpl_nrset == 1) &&
2395 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2396 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2397 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2398 				/*
2399 				 * this is a leaf whose parent was deleted, or
2400 				 * whose parent had their lgrp deleted.  (And
2401 				 * whose parent will soon be deleted).  Point
2402 				 * this guy back to the root lpl.
2403 				 */
2404 				lpl_cur->lpl_parent = lpl_root;
2405 				lpl_rset_add(lpl_root, lpl_cur);
2406 			}
2407 
2408 		}
2409 
2410 		/*
2411 		 * Now that we're done, make sure the count on the root lpl is
2412 		 * correct, and update the hints of the children for the sake of
2413 		 * thoroughness
2414 		 */
2415 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2416 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2417 		}
2418 		lpl_root->lpl_ncpu = sum;
2419 		lpl_child_update(lpl_root, cp);
2420 
2421 		cp = cp->cp_next;
2422 	} while (cp != cp_list_head);
2423 
2424 	return (levels);
2425 }
2426 
2427 /*
2428  * Insert a lpl into the resource hierarchy and create any additional lpls that
2429  * are necessary to represent the varying states of locality for the cpu
2430  * resoruces newly added to the partition.
2431  *
2432  * This routine is clever enough that it can correctly add resources from the
2433  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2434  * those for which the lpl is a leaf as opposed to simply a named equally local
2435  * resource).  The one special case that needs additional processing is when a
2436  * new intermediate lpl is introduced.  Since the main loop only traverses
2437  * looking to add the leaf resource where it does not yet exist, additional work
2438  * is necessary to add other leaf resources that may need to exist in the newly
2439  * created intermediate.  This is performed by the second inner loop, and is
2440  * only done when the check for more than one overlapping resource succeeds.
2441  */
2442 
2443 void
2444 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2445 {
2446 	int		i;
2447 	int		j;
2448 	int		hint;
2449 	int		rset_num_intersect;
2450 	lgrp_t		*lgrp_cur;
2451 	lpl_t		*lpl_cur;
2452 	lpl_t		*lpl_parent;
2453 	lgrp_id_t	parent_id;
2454 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2455 
2456 	for (i = 0; i <= lgrp_alloc_max; i++) {
2457 		lgrp_cur = lgrp_table[i];
2458 
2459 		/*
2460 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2461 		 * contained within the current lgrp, or if the current lgrp has
2462 		 * no leaves in this partition
2463 		 */
2464 
2465 		if (!LGRP_EXISTS(lgrp_cur) ||
2466 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2467 		    lpl_leaf->lpl_lgrpid) ||
2468 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2469 		    cpupart->cp_lgrpset))
2470 			continue;
2471 
2472 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2473 		if (lgrp_cur->lgrp_parent != NULL) {
2474 			/* if lgrp has a parent, assign it properly */
2475 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2476 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2477 		} else {
2478 			/* if not, make sure parent ptr gets set to null */
2479 			lpl_parent = NULL;
2480 		}
2481 
2482 		if (lpl_cur == lpl_leaf) {
2483 			/*
2484 			 * Almost all leaf state was initialized elsewhere.  The
2485 			 * only thing left to do is to set the parent.
2486 			 */
2487 			lpl_cur->lpl_parent = lpl_parent;
2488 			continue;
2489 		}
2490 
2491 		/*
2492 		 * Initialize intermediate lpl
2493 		 * Save this lpl's hint though. Since we're changing this
2494 		 * lpl's resources, we need to update the hint in this lpl's
2495 		 * children, but the hint in this lpl is unaffected and
2496 		 * should be preserved.
2497 		 */
2498 		hint = lpl_cur->lpl_hint;
2499 
2500 		lpl_clear(lpl_cur);
2501 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2502 
2503 		lpl_cur->lpl_hint = hint;
2504 		lpl_cur->lpl_parent = lpl_parent;
2505 
2506 		/* does new lpl need to be populated with other resources? */
2507 		rset_intersect =
2508 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2509 			cpupart->cp_lgrpset);
2510 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2511 
2512 		if (rset_num_intersect > 1) {
2513 			/*
2514 			 * If so, figure out what lpls have resources that
2515 			 * intersect this one, and add them.
2516 			 */
2517 			for (j = 0; j <= lgrp_alloc_max; j++) {
2518 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2519 				lpl_t	*lpl_cand;	/* candidate lpl */
2520 
2521 				lgrp_cand = lgrp_table[j];
2522 				if (!LGRP_EXISTS(lgrp_cand) ||
2523 				    !klgrpset_ismember(rset_intersect,
2524 					lgrp_cand->lgrp_id))
2525 					continue;
2526 				lpl_cand =
2527 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2528 				lpl_rset_add(lpl_cur, lpl_cand);
2529 			}
2530 		}
2531 		/*
2532 		 * This lpl's rset has changed. Update the hint in it's
2533 		 * children.
2534 		 */
2535 		lpl_child_update(lpl_cur, cpupart);
2536 	}
2537 }
2538 
2539 /*
2540  * remove a lpl from the hierarchy of resources, clearing its state when
2541  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2542  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2543  * delete them as well.
2544  */
2545 
2546 void
2547 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2548 {
2549 	int		i;
2550 	lgrp_t		*lgrp_cur;
2551 	lpl_t		*lpl_cur;
2552 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2553 
2554 	for (i = 0; i <= lgrp_alloc_max; i++) {
2555 		lgrp_cur = lgrp_table[i];
2556 
2557 		/*
2558 		 * Don't attempt to remove from lgrps that aren't there, that
2559 		 * don't contain our leaf, or from the leaf itself. (We do that
2560 		 * later)
2561 		 */
2562 
2563 		if (!LGRP_EXISTS(lgrp_cur))
2564 			continue;
2565 
2566 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2567 
2568 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2569 		    lpl_leaf->lpl_lgrpid) ||
2570 		    (lpl_cur == lpl_leaf)) {
2571 			continue;
2572 		}
2573 
2574 		/*
2575 		 * This is a slightly sleazy simplification in that we have
2576 		 * already marked the cp_lgrpset as no longer containing the
2577 		 * leaf we've deleted.  Any lpls that pass the above checks
2578 		 * based upon lgrp membership but not necessarily cpu-part
2579 		 * membership also get cleared by the checks below.  Currently
2580 		 * this is harmless, as the lpls should be empty anyway.
2581 		 *
2582 		 * In particular, we want to preserve lpls that have additional
2583 		 * leaf resources, even though we don't yet have a processor
2584 		 * architecture that represents resources this way.
2585 		 */
2586 
2587 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2588 		    cpupart->cp_lgrpset);
2589 
2590 		lpl_rset_del(lpl_cur, lpl_leaf);
2591 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2592 			lpl_clear(lpl_cur);
2593 		} else {
2594 			/*
2595 			 * Update this lpl's children
2596 			 */
2597 			lpl_child_update(lpl_cur, cpupart);
2598 		}
2599 	}
2600 	lpl_clear(lpl_leaf);
2601 }
2602 
2603 /*
2604  * add a cpu to a partition in terms of lgrp load avg bookeeping
2605  *
2606  * The lpl (cpu partition load average information) is now arranged in a
2607  * hierarchical fashion whereby resources that are closest, ie. most local, to
2608  * the cpu in question are considered to be leaves in a tree of resources.
2609  * There are two general cases for cpu additon:
2610  *
2611  * 1. A lpl structure that contains resources already in the hierarchy tree.
2612  * In this case, all of the associated lpl relationships have been defined, and
2613  * all that is necessary is that we link the new cpu into the per-lpl list of
2614  * cpus, and increment the ncpu count of all places where this cpu resource will
2615  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2616  * pushing is accomplished by this routine.
2617  *
2618  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2619  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2620  * construct the hierarchy of state necessary to name it's more distant
2621  * resources, if they should exist.  The leaf structure is initialized by this
2622  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2623  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2624  * and builds all of the "ancestoral" state necessary to identify resources at
2625  * differing levels of locality.
2626  */
2627 void
2628 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2629 {
2630 	cpupart_t	*cpupart;
2631 	lgrp_t		*lgrp_leaf;
2632 	lpl_t		*lpl_leaf;
2633 
2634 	/* called sometimes w/ cpus paused - grab no locks */
2635 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2636 
2637 	cpupart = cp->cpu_part;
2638 	lgrp_leaf = lgrp_table[lgrpid];
2639 
2640 	/* don't add non-existent lgrp */
2641 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2642 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2643 	cp->cpu_lpl = lpl_leaf;
2644 
2645 	/* only leaf lpls contain cpus */
2646 
2647 	if (lpl_leaf->lpl_ncpu++ == 0) {
2648 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2649 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2650 		lpl_leaf_insert(lpl_leaf, cpupart);
2651 	} else {
2652 		/*
2653 		 * the lpl should already exist in the parent, so just update
2654 		 * the count of available CPUs
2655 		 */
2656 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2657 	}
2658 
2659 	/* link cpu into list of cpus in lpl */
2660 
2661 	if (lpl_leaf->lpl_cpus) {
2662 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2663 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2664 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2665 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2666 	} else {
2667 		/*
2668 		 * We increment ncpu immediately after we create a new leaf
2669 		 * lpl, so assert that ncpu == 1 for the case where we don't
2670 		 * have any cpu pointers yet.
2671 		 */
2672 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2673 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2674 	}
2675 
2676 }
2677 
2678 
2679 /*
2680  * remove a cpu from a partition in terms of lgrp load avg bookeeping
2681  *
2682  * The lpl (cpu partition load average information) is now arranged in a
2683  * hierarchical fashion whereby resources that are closest, ie. most local, to
2684  * the cpu in question are considered to be leaves in a tree of resources.
2685  * There are two removal cases in question:
2686  *
2687  * 1. Removal of the resource in the leaf leaves other resources remaining in
2688  * that leaf.  (Another cpu still exists at this level of locality).  In this
2689  * case, the count of available cpus is decremented in all assocated lpls by
2690  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2691  * from the per-cpu lpl list.
2692  *
2693  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2694  * empty)  In this case, all of what has occurred for the first step must take
2695  * place; however, additionally we must remove the lpl structure itself, prune
2696  * out any stranded lpls that do not directly name a leaf resource, and mark the
2697  * cpu partition in question as no longer containing resources from the lgrp of
2698  * the lpl that has been delted.  Cpu-partition changes are handled by this
2699  * method, but the lpl_leaf_remove function deals with the details of pruning
2700  * out the empty lpl and any of its orphaned direct ancestors.
2701  */
2702 void
2703 lgrp_part_del_cpu(cpu_t *cp)
2704 {
2705 	lpl_t		*lpl;
2706 	lpl_t		*leaf_lpl;
2707 	lgrp_t		*lgrp_leaf;
2708 
2709 	/* called sometimes w/ cpus paused - grab no locks */
2710 
2711 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2712 
2713 	lpl = leaf_lpl = cp->cpu_lpl;
2714 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2715 
2716 	/* don't delete a leaf that isn't there */
2717 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2718 
2719 	/* no double-deletes */
2720 	ASSERT(lpl->lpl_ncpu);
2721 	if (--lpl->lpl_ncpu == 0) {
2722 		/*
2723 		 * This was the last cpu in this lgroup for this partition,
2724 		 * clear its bit in the partition's lgroup bitmask
2725 		 */
2726 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2727 
2728 		/* eliminate remaning lpl link pointers in cpu, lpl */
2729 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2730 
2731 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2732 	} else {
2733 
2734 		/* unlink cpu from lists of cpus in lpl */
2735 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2736 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2737 		if (lpl->lpl_cpus == cp) {
2738 			lpl->lpl_cpus = cp->cpu_next_lpl;
2739 		}
2740 
2741 		/*
2742 		 * Update the cpu count in the lpls associated with parent
2743 		 * lgroups.
2744 		 */
2745 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2746 
2747 	}
2748 	/* clear cpu's lpl ptr when we're all done */
2749 	cp->cpu_lpl = NULL;
2750 }
2751 
2752 /*
2753  * Recompute load average for the specified partition/lgrp fragment.
2754  *
2755  * We rely on the fact that this routine is called from the clock thread
2756  * at a point before the clock thread can block (i.e. before its first
2757  * lock request).  Since the clock thread can not be preempted (since it
2758  * runs at highest priority), we know that cpu partitions can not change
2759  * (since doing so would require either the repartition requester or the
2760  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2761  * without grabbing cpu_lock.
2762  */
2763 void
2764 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2765 {
2766 	uint_t		ncpu;
2767 	int64_t		old, new, f;
2768 
2769 	/*
2770 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2771 	 */
2772 	static short expval[] = {
2773 	    0, 3196, 1618, 1083,
2774 	    814, 652, 543, 466,
2775 	    408, 363, 326, 297,
2776 	    272, 251, 233, 218,
2777 	    204, 192, 181, 172,
2778 	    163, 155, 148, 142,
2779 	    136, 130, 125, 121,
2780 	    116, 112, 109, 105
2781 	};
2782 
2783 	/* ASSERT (called from clock level) */
2784 
2785 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2786 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2787 		return;
2788 	}
2789 
2790 	for (;;) {
2791 
2792 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2793 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2794 		else
2795 			f = expval[ncpu];
2796 
2797 		/*
2798 		 * Modify the load average atomically to avoid losing
2799 		 * anticipatory load updates (see lgrp_move_thread()).
2800 		 */
2801 		if (ageflag) {
2802 			/*
2803 			 * We're supposed to both update and age the load.
2804 			 * This happens 10 times/sec. per cpu.  We do a
2805 			 * little hoop-jumping to avoid integer overflow.
2806 			 */
2807 			int64_t		q, r;
2808 
2809 			do {
2810 				old = new = lpl->lpl_loadavg;
2811 				q = (old  >> 16) << 7;
2812 				r = (old  & 0xffff) << 7;
2813 				new += ((long long)(nrcpus - q) * f -
2814 				    ((r * f) >> 16)) >> 7;
2815 
2816 				/*
2817 				 * Check for overflow
2818 				 */
2819 				if (new > LGRP_LOADAVG_MAX)
2820 					new = LGRP_LOADAVG_MAX;
2821 				else if (new < 0)
2822 					new = 0;
2823 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2824 			    new) != old);
2825 		} else {
2826 			/*
2827 			 * We're supposed to update the load, but not age it.
2828 			 * This option is used to update the load (which either
2829 			 * has already been aged in this 1/10 sec. interval or
2830 			 * soon will be) to account for a remotely executing
2831 			 * thread.
2832 			 */
2833 			do {
2834 				old = new = lpl->lpl_loadavg;
2835 				new += f;
2836 				/*
2837 				 * Check for overflow
2838 				 * Underflow not possible here
2839 				 */
2840 				if (new < old)
2841 					new = LGRP_LOADAVG_MAX;
2842 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2843 			    new) != old);
2844 		}
2845 
2846 		/*
2847 		 * Do the same for this lpl's parent
2848 		 */
2849 		if ((lpl = lpl->lpl_parent) == NULL)
2850 			break;
2851 		ncpu = lpl->lpl_ncpu;
2852 	}
2853 }
2854 
2855 /*
2856  * Initialize lpl topology in the target based on topology currently present in
2857  * lpl_bootstrap.
2858  *
2859  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2860  * initialize cp_default list of lpls. Up to this point all topology operations
2861  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2862  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2863  * `target' points to the list of lpls in cp_default and `size' is the size of
2864  * this list.
2865  *
2866  * This function walks the lpl topology in lpl_bootstrap and does for things:
2867  *
2868  * 1) Copies all fields from lpl_bootstrap to the target.
2869  *
2870  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2871  *
2872  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2873  *    instead of lpl_bootstrap.
2874  *
2875  * 4) Updates pointers in the resource list of the target to point to the lpls
2876  *    in the target list instead of lpl_bootstrap.
2877  *
2878  * After lpl_topo_bootstrap() completes, target contains the same information
2879  * that would be present there if it were used during boot instead of
2880  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2881  * and it is bzeroed.
2882  */
2883 void
2884 lpl_topo_bootstrap(lpl_t *target, int size)
2885 {
2886 	lpl_t	*lpl = lpl_bootstrap;
2887 	lpl_t	*target_lpl = target;
2888 	int	howmany;
2889 	int	id;
2890 	int	i;
2891 
2892 	/*
2893 	 * The only target that should be passed here is cp_default lpl list.
2894 	 */
2895 	ASSERT(target == cp_default.cp_lgrploads);
2896 	ASSERT(size == cp_default.cp_nlgrploads);
2897 	ASSERT(!lgrp_topo_initialized);
2898 	ASSERT(ncpus == 1);
2899 
2900 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2901 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2902 		/*
2903 		 * Copy all fields from lpl.
2904 		 */
2905 
2906 		*target_lpl = *lpl;
2907 
2908 		/*
2909 		 * Substitute CPU0 lpl pointer with one relative to target.
2910 		 */
2911 		if (lpl->lpl_cpus == CPU) {
2912 			ASSERT(CPU->cpu_lpl == lpl);
2913 			CPU->cpu_lpl = target_lpl;
2914 		}
2915 
2916 		/*
2917 		 * Substitute parent information with parent relative to target.
2918 		 */
2919 		if (lpl->lpl_parent != NULL)
2920 			target_lpl->lpl_parent = (lpl_t *)
2921 			    (((uintptr_t)lpl->lpl_parent -
2922 				(uintptr_t)lpl_bootstrap) +
2923 				(uintptr_t)target);
2924 
2925 		/*
2926 		 * Walk over resource set substituting pointers relative to
2927 		 * lpl_bootstrap to pointers relative to target.
2928 		 */
2929 		ASSERT(lpl->lpl_nrset <= 1);
2930 
2931 		for (id = 0; id < lpl->lpl_nrset; id++) {
2932 			if (lpl->lpl_rset[id] != NULL) {
2933 				target_lpl->lpl_rset[id] =
2934 				    (lpl_t *)
2935 				    (((uintptr_t)lpl->lpl_rset[id] -
2936 					(uintptr_t)lpl_bootstrap) +
2937 					(uintptr_t)target);
2938 			}
2939 		}
2940 	}
2941 
2942 	/*
2943 	 * Topology information in lpl_bootstrap is no longer needed.
2944 	 */
2945 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2946 }
2947 
2948 /* the maximum effect that a single thread can have on it's lgroup's load */
2949 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
2950 	((lgrp_loadavg_max_effect) / (ncpu))
2951 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
2952 
2953 /*
2954  * If the lowest load among the lgroups a process' threads are currently
2955  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2956  * expanding the process to a new lgroup.
2957  */
2958 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2959 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2960 
2961 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2962 	((lgrp_expand_proc_thresh) / (ncpu))
2963 
2964 /*
2965  * A process will be expanded to a new lgroup only if the difference between
2966  * the lowest load on the lgroups the process' thread's are currently spread
2967  * across and the lowest load on the other lgroups in the process' partition
2968  * is greater than lgrp_expand_proc_diff.
2969  */
2970 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2971 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2972 
2973 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2974 	((lgrp_expand_proc_diff) / (ncpu))
2975 
2976 /*
2977  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2978  * be present due to impreciseness of the load average decay algorithm.
2979  *
2980  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2981  * tolerance is scaled by the number of cpus in the lgroup just like
2982  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2983  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2984  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2985  */
2986 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2987 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2988 	((lgrp_loadavg_tolerance) / ncpu)
2989 
/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold.  The check is skipped entirely while the
 * threshold is left at UINT32_MAX (the default).
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup.  A value of 0 (the default)
 * disables the check.
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 *
 * Any value other than LGRP_CHOOSE_RR makes lgrp_choose() start its search
 * at a pseudo-randomly selected lgroup; LGRP_CHOOSE_TIME additionally lets
 * lpl_pick() break near-ties using the time a thread was last homed.
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3014 
3015 /*
3016  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3017  * be bound to a CPU or processor set.
3018  *
3019  * Arguments:
3020  *	t		The thread
3021  *	cpupart		The partition the thread belongs to.
3022  *
3023  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3024  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3025  *	 partitions changing out from under us and assumes that given thread is
3026  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3027  *	 disabled, so don't grab any locks because we should never block under
3028  *	 those conditions.
3029  */
3030 lpl_t *
3031 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3032 {
3033 	lgrp_load_t	bestload, bestrload;
3034 	int		lgrpid_offset, lgrp_count;
3035 	lgrp_id_t	lgrpid, lgrpid_start;
3036 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3037 	klgrpset_t	lgrpset;
3038 	proc_t		*p;
3039 
3040 	ASSERT(t != NULL);
3041 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3042 	    THREAD_LOCK_HELD(t));
3043 	ASSERT(cpupart != NULL);
3044 
3045 	p = t->t_procp;
3046 
3047 	/* A process should always be in an active partition */
3048 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3049 
3050 	bestlpl = bestrlpl = NULL;
3051 	bestload = bestrload = LGRP_LOADAVG_MAX;
3052 	lgrpset = cpupart->cp_lgrpset;
3053 
3054 	switch (lgrp_choose_policy) {
3055 	case LGRP_CHOOSE_RR:
3056 		lgrpid = cpupart->cp_lgrp_hint;
3057 		do {
3058 			if (++lgrpid > lgrp_alloc_max)
3059 				lgrpid = 0;
3060 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3061 
3062 		break;
3063 	default:
3064 	case LGRP_CHOOSE_TIME:
3065 	case LGRP_CHOOSE_RANDOM:
3066 		klgrpset_nlgrps(lgrpset, lgrp_count);
3067 		lgrpid_offset =
3068 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3069 		for (lgrpid = 0; ; lgrpid++) {
3070 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3071 				if (--lgrpid_offset == 0)
3072 					break;
3073 			}
3074 		}
3075 		break;
3076 	}
3077 
3078 	lgrpid_start = lgrpid;
3079 
3080 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3081 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3082 
3083 	/*
3084 	 * Use lgroup affinities (if any) to choose best lgroup
3085 	 *
3086 	 * NOTE: Assumes that thread is protected from going away and its
3087 	 *	 lgroup affinities won't change (ie. p_lock, or
3088 	 *	 thread_lock() being held and/or CPUs paused)
3089 	 */
3090 	if (t->t_lgrp_affinity) {
3091 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3092 		if (lpl != NULL)
3093 			return (lpl);
3094 	}
3095 
3096 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3097 
3098 	do {
3099 		pgcnt_t	npgs;
3100 
3101 		/*
3102 		 * Skip any lgroups outside of thread's pset
3103 		 */
3104 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3105 			if (++lgrpid > lgrp_alloc_max)
3106 				lgrpid = 0;	/* wrap the search */
3107 			continue;
3108 		}
3109 
3110 		/*
3111 		 * Skip any non-leaf lgroups
3112 		 */
3113 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3114 			continue;
3115 
3116 		/*
3117 		 * Skip any lgroups without enough free memory
3118 		 * (when threshold set to nonzero positive value)
3119 		 */
3120 		if (lgrp_mem_free_thresh > 0) {
3121 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3122 			if (npgs < lgrp_mem_free_thresh) {
3123 				if (++lgrpid > lgrp_alloc_max)
3124 					lgrpid = 0;	/* wrap the search */
3125 				continue;
3126 			}
3127 		}
3128 
3129 		lpl = &cpupart->cp_lgrploads[lgrpid];
3130 		if (klgrpset_isempty(p->p_lgrpset) ||
3131 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3132 			/*
3133 			 * Either this is a new process or the process already
3134 			 * has threads on this lgrp, so this is a preferred
3135 			 * lgroup for the thread.
3136 			 */
3137 			if (bestlpl == NULL ||
3138 			    lpl_pick(lpl, bestlpl)) {
3139 				bestload = lpl->lpl_loadavg;
3140 				bestlpl = lpl;
3141 			}
3142 		} else {
3143 			/*
3144 			 * The process doesn't have any threads on this lgrp,
3145 			 * but we're willing to consider this lgrp if the load
3146 			 * difference is big enough to justify splitting up
3147 			 * the process' threads.
3148 			 */
3149 			if (bestrlpl == NULL ||
3150 			    lpl_pick(lpl, bestrlpl)) {
3151 				bestrload = lpl->lpl_loadavg;
3152 				bestrlpl = lpl;
3153 			}
3154 		}
3155 		if (++lgrpid > lgrp_alloc_max)
3156 			lgrpid = 0;	/* wrap the search */
3157 	} while (lgrpid != lgrpid_start);
3158 
3159 	/*
3160 	 * Return root lgroup if threshold isn't set to maximum value and
3161 	 * lowest lgroup load average more than a certain threshold
3162 	 */
3163 	if (lgrp_load_thresh != UINT32_MAX &&
3164 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3165 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3166 
3167 	/*
3168 	 * If all the lgroups over which the thread's process is spread are
3169 	 * heavily loaded, or otherwise undesirable, we'll consider placing
3170 	 * the thread on one of the other leaf lgroups in the thread's
3171 	 * partition.
3172 	 */
3173 	if ((bestlpl == NULL) ||
3174 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3175 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3176 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3177 	    bestload))) {
3178 		bestlpl = bestrlpl;
3179 	}
3180 
3181 	if (bestlpl == NULL) {
3182 		/*
3183 		 * No lgroup looked particularly good, but we still
3184 		 * have to pick something. Go with the randomly selected
3185 		 * legal lgroup we started with above.
3186 		 */
3187 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3188 	}
3189 
3190 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3191 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3192 
3193 	ASSERT(bestlpl->lpl_ncpu > 0);
3194 	return (bestlpl);
3195 }
3196 
3197 /*
3198  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3199  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3200  */
3201 static int
3202 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3203 {
3204 	lgrp_load_t	l1, l2;
3205 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3206 
3207 	l1 = lpl1->lpl_loadavg;
3208 	l2 = lpl2->lpl_loadavg;
3209 
3210 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3211 		/* lpl1 is significantly less loaded than lpl2 */
3212 		return (1);
3213 	}
3214 
3215 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3216 	    l1 + tolerance >= l2 && l1 < l2 &&
3217 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3218 		/*
3219 		 * lpl1's load is within the tolerance of lpl2. We're
3220 		 * willing to consider it be to better however if
3221 		 * it has been longer since we last homed a thread there
3222 		 */
3223 		return (1);
3224 	}
3225 
3226 	return (0);
3227 }
3228 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() adds this value to the thread's t_anttime and compares
 * the sum against gethrtime() to decide whether the anticipatory load should
 * be removed again.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3237 
/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * Arguments:
 *	t			The thread being moved
 *	newlpl			The lpl the thread is moving to, or NULL if
 *				the thread is exiting
 *	do_lgrpset_delete	If nonzero, the old lgroup's bit is cleared
 *				from the process' p_lgrpset when no other
 *				thread of the process is assigned to it
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only t_lpl is updated; no p_lgrpset or load accounting
	 * is done.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		/* Only a move to another lgroup (not an exit) is a migration */
		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/*
			 * Walk the process' circular thread list, starting
			 * with the thread after p_tlist and finishing with
			 * p_tlist itself.
			 */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 *
		 * The load is removed from the old lpl and from each of its
		 * ancestors (following lpl_parent), using each lpl's own CPU
		 * count to scale the amount.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			lpl = oldlpl;
			for (;;) {
				/*
				 * Retry the compare-and-swap until the load
				 * average is updated without interference.
				 */
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (cas32(
					(lgrp_load_t *)&lpl->lpl_loadavg, old,
					    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup.  Note that for the root lgroup
		 * p_lgrpset and t_anttime are left untouched as well.
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
			    new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* Remember when the anticipatory load was applied */
		t->t_anttime = gethrtime();
	}
}
3427 
3428 /*
3429  * Return lgroup memory allocation policy given advice from madvise(3C)
3430  */
3431 lgrp_mem_policy_t
3432 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3433 {
3434 	switch (advice) {
3435 	case MADV_ACCESS_LWP:
3436 		return (LGRP_MEM_POLICY_NEXT);
3437 	case MADV_ACCESS_MANY:
3438 		return (LGRP_MEM_POLICY_RANDOM);
3439 	default:
3440 		return (lgrp_mem_policy_default(size, type));
3441 	}
3442 }
3443 
3444 /*
3445  * Figure out default policy
3446  */
3447 lgrp_mem_policy_t
3448 lgrp_mem_policy_default(size_t size, int type)
3449 {
3450 	cpupart_t		*cp;
3451 	lgrp_mem_policy_t	policy;
3452 	size_t			pset_mem_size;
3453 
3454 	/*
3455 	 * Randomly allocate memory across lgroups for shared memory
3456 	 * beyond a certain threshold
3457 	 */
3458 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3459 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3460 		/*
3461 		 * Get total memory size of current thread's pset
3462 		 */
3463 		kpreempt_disable();
3464 		cp = curthread->t_cpupart;
3465 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3466 		kpreempt_enable();
3467 
3468 		/*
3469 		 * Choose policy to randomly allocate memory across
3470 		 * lgroups in pset if it will fit and is not default
3471 		 * partition.  Otherwise, allocate memory randomly
3472 		 * across machine.
3473 		 */
3474 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3475 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3476 		else
3477 			policy = LGRP_MEM_POLICY_RANDOM;
3478 	} else
3479 		/*
3480 		 * Apply default policy for private memory and
3481 		 * shared memory under the respective random
3482 		 * threshold.
3483 		 */
3484 		policy = lgrp_mem_default_policy;
3485 
3486 	return (policy);
3487 }
3488 
3489 /*
3490  * Get memory allocation policy for this segment
3491  */
3492 lgrp_mem_policy_info_t *
3493 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3494 {
3495 	lgrp_mem_policy_info_t	*policy_info;
3496 	extern struct seg_ops	segspt_ops;
3497 	extern struct seg_ops	segspt_shmops;
3498 
3499 	/*
3500 	 * This is for binary compatibility to protect against third party
3501 	 * segment drivers which haven't recompiled to allow for
3502 	 * SEGOP_GETPOLICY()
3503 	 */
3504 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3505 	    seg->s_ops != &segspt_shmops)
3506 		return (NULL);
3507 
3508 	policy_info = NULL;
3509 	if (seg->s_ops->getpolicy != NULL)
3510 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3511 
3512 	return (policy_info);
3513 }
3514 
3515 /*
3516  * Set policy for allocating private memory given desired policy, policy info,
3517  * size in bytes of memory that policy is being applied.
3518  * Return 0 if policy wasn't set already and 1 if policy was set already
3519  */
3520 int
3521 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3522     lgrp_mem_policy_info_t *policy_info, size_t size)
3523 {
3524 
3525 	ASSERT(policy_info != NULL);
3526 
3527 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3528 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3529 
3530 	/*
3531 	 * Policy set already?
3532 	 */
3533 	if (policy == policy_info->mem_policy)
3534 		return (1);
3535 
3536 	/*
3537 	 * Set policy
3538 	 */
3539 	policy_info->mem_policy = policy;
3540 	policy_info->mem_reserved = 0;
3541 
3542 	return (0);
3543 }
3544 
3545 
3546 /*
3547  * Get shared memory allocation policy with given tree and offset
3548  */
3549 lgrp_mem_policy_info_t *
3550 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3551     u_offset_t vn_off)
3552 {
3553 	u_offset_t		off;
3554 	lgrp_mem_policy_info_t	*policy_info;
3555 	lgrp_shm_policy_seg_t	*policy_seg;
3556 	lgrp_shm_locality_t	*shm_locality;
3557 	avl_tree_t		*tree;
3558 	avl_index_t		where;
3559 
3560 	/*
3561 	 * Get policy segment tree from anon_map or vnode and use specified
3562 	 * anon index or vnode offset as offset
3563 	 *
3564 	 * Assume that no lock needs to be held on anon_map or vnode, since
3565 	 * they should be protected by their reference count which must be
3566 	 * nonzero for an existing segment
3567 	 */
3568 	if (amp) {
3569 		ASSERT(amp->refcnt != 0);
3570 		shm_locality = amp->locality;
3571 		if (shm_locality == NULL)
3572 			return (NULL);
3573 		tree = shm_locality->loc_tree;
3574 		off = ptob(anon_index);
3575 	} else if (vp) {
3576 		shm_locality = vp->v_locality;
3577 		if (shm_locality == NULL)
3578 			return (NULL);
3579 		ASSERT(shm_locality->loc_count != 0);
3580 		tree = shm_locality->loc_tree;
3581 		off = vn_off;
3582 	}
3583 
3584 	if (tree == NULL)
3585 		return (NULL);
3586 
3587 	/*
3588 	 * Lookup policy segment for offset into shared object and return
3589 	 * policy info
3590 	 */
3591 	rw_enter(&shm_locality->loc_lock, RW_READER);
3592 	policy_info = NULL;
3593 	policy_seg = avl_find(tree, &off, &where);
3594 	if (policy_seg)
3595 		policy_info = &policy_seg->shm_policy;
3596 	rw_exit(&shm_locality->loc_lock);
3597 
3598 	return (policy_info);
3599 }
3600 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() uses this in place of lgrp_mem_default_policy when the
 * segment it is given is segkmap.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3605 
3606 /*
3607  * Return lgroup to use for allocating memory
3608  * given the segment and address
3609  *
3610  * There isn't any mutual exclusion that exists between calls
3611  * to this routine and DR, so this routine and whomever calls it
3612  * should be mindful of the possibility that the lgrp returned
3613  * may be deleted. If this happens, dereferences of the lgrp
3614  * pointer will still be safe, but the resources in the lgrp will
3615  * be gone, and LGRP_EXISTS() will no longer be true.
3616  */
3617 lgrp_t *
3618 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3619 {
3620 	int			i;
3621 	lgrp_t			*lgrp;
3622 	klgrpset_t		lgrpset;
3623 	int			lgrps_spanned;
3624 	unsigned long		off;
3625 	lgrp_mem_policy_t	policy;
3626 	lgrp_mem_policy_info_t	*policy_info;
3627 	ushort_t		random;
3628 	int			stat = 0;
3629 	extern struct seg	*segkmap;
3630 
3631 	/*
3632 	 * Just return null if the lgrp framework hasn't finished
3633 	 * initializing or if this is a UMA machine.
3634 	 */
3635 	if (nlgrps == 1 || !lgrp_initialized)
3636 		return (lgrp_root);
3637 
3638 	/*
3639 	 * Get memory allocation policy for this segment
3640 	 */
3641 	policy = lgrp_mem_default_policy;
3642 	if (seg != NULL) {
3643 		if (seg->s_as == &kas) {
3644 			if (seg == segkmap)
3645 				policy = lgrp_segmap_default_policy;
3646 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3647 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3648 				policy = LGRP_MEM_POLICY_RANDOM;
3649 		} else {
3650 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3651 			if (policy_info != NULL)
3652 				policy = policy_info->mem_policy;
3653 		}
3654 	}
3655 	lgrpset = 0;
3656 
3657 	/*
3658 	 * Initialize lgroup to home by default
3659 	 */
3660 	lgrp = lgrp_home_lgrp();
3661 
3662 	/*
3663 	 * When homing threads on root lgrp, override default memory
3664 	 * allocation policies with root lgroup memory allocation policy
3665 	 */
3666 	if (lgrp == lgrp_root)
3667 		policy = lgrp_mem_policy_root;
3668 
3669 	/*
3670 	 * Implement policy
3671 	 */
3672 	switch (policy) {
3673 	case LGRP_MEM_POLICY_NEXT_CPU:
3674 
3675 		/*
3676 		 * Return lgroup of current CPU which faulted on memory
3677 		 * If the CPU isn't currently in an lgrp, then opt to
3678 		 * allocate from the root.
3679 		 *
3680 		 * Kernel preemption needs to be disabled here to prevent
3681 		 * the current CPU from going away before lgrp is found.
3682 		 */
3683 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3684 			lgrp = lgrp_root;
3685 		} else {
3686 			kpreempt_disable();
3687 			lgrp = lgrp_cpu_to_lgrp(CPU);
3688 			kpreempt_enable();
3689 		}
3690 		break;
3691 
3692 	case LGRP_MEM_POLICY_NEXT:
3693 	case LGRP_MEM_POLICY_DEFAULT:
3694 	default:
3695 
3696 		/*
3697 		 * Just return current thread's home lgroup
3698 		 * for default policy (next touch)
3699 		 * If the thread is homed to the root,
3700 		 * then the default policy is random across lgroups.
3701 		 * Fallthrough to the random case.
3702 		 */
3703 		if (lgrp != lgrp_root) {
3704 			if (policy == LGRP_MEM_POLICY_NEXT)
3705 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3706 			else
3707 				lgrp_stat_add(lgrp->lgrp_id,
3708 				    LGRP_NUM_DEFAULT, 1);
3709 			break;
3710 		}
3711 		/* LINTED fallthrough on case statement */
3712 	case LGRP_MEM_POLICY_RANDOM:
3713 
3714 		/*
3715 		 * Return a random leaf lgroup with memory
3716 		 */
3717 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3718 		/*
3719 		 * Count how many lgroups are spanned
3720 		 */
3721 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3722 
3723 		/*
3724 		 * There may be no memnodes in the root lgroup during DR copy
3725 		 * rename on a system with only two boards (memnodes)
3726 		 * configured. In this case just return the root lgrp.
3727 		 */
3728 		if (lgrps_spanned == 0) {
3729 			lgrp = lgrp_root;
3730 			break;
3731 		}
3732 
3733 		/*
3734 		 * Pick a random offset within lgroups spanned
3735 		 * and return lgroup at that offset
3736 		 */
3737 		random = (ushort_t)gethrtime() >> 4;
3738 		off = random % lgrps_spanned;
3739 		ASSERT(off <= lgrp_alloc_max);
3740 
3741 		for (i = 0; i <= lgrp_alloc_max; i++) {
3742 			if (!klgrpset_ismember(lgrpset, i))
3743 				continue;
3744 			if (off)
3745 				off--;
3746 			else {
3747 				lgrp = lgrp_table[i];
3748 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3749 				    1);
3750 				break;
3751 			}
3752 		}
3753 		break;
3754 
3755 	case LGRP_MEM_POLICY_RANDOM_PROC:
3756 
3757 		/*
3758 		 * Grab copy of bitmask of lgroups spanned by
3759 		 * this process
3760 		 */
3761 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3762 		stat = LGRP_NUM_RANDOM_PROC;
3763 
3764 		/* LINTED fallthrough on case statement */
3765 	case LGRP_MEM_POLICY_RANDOM_PSET:
3766 
3767 		if (!stat)
3768 			stat = LGRP_NUM_RANDOM_PSET;
3769 
3770 		if (klgrpset_isempty(lgrpset)) {
3771 			/*
3772 			 * Grab copy of bitmask of lgroups spanned by
3773 			 * this processor set
3774 			 */
3775 			kpreempt_disable();
3776 			klgrpset_copy(lgrpset,
3777 			    curthread->t_cpupart->cp_lgrpset);
3778 			kpreempt_enable();
3779 		}
3780 
3781 		/*
3782 		 * Count how many lgroups are spanned
3783 		 */
3784 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3785 		ASSERT(lgrps_spanned <= nlgrps);
3786 
3787 		/*
3788 		 * Probably lgrps_spanned should be always non-zero, but to be
3789 		 * on the safe side we return lgrp_root if it is empty.
3790 		 */
3791 		if (lgrps_spanned == 0) {
3792 			lgrp = lgrp_root;
3793 			break;
3794 		}
3795 
3796 		/*
3797 		 * Pick a random offset within lgroups spanned
3798 		 * and return lgroup at that offset
3799 		 */
3800 		random = (ushort_t)gethrtime() >> 4;
3801 		off = random % lgrps_spanned;
3802 		ASSERT(off <= lgrp_alloc_max);
3803 
3804 		for (i = 0; i <= lgrp_alloc_max; i++) {
3805 			if (!klgrpset_ismember(lgrpset, i))
3806 				continue;
3807 			if (off)
3808 				off--;
3809 			else {
3810 				lgrp = lgrp_table[i];
3811 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3812 				    1);
3813 				break;
3814 			}
3815 		}
3816 		break;
3817 
3818 	case LGRP_MEM_POLICY_ROUNDROBIN:
3819 
3820 		/*
3821 		 * Use offset within segment to determine
3822 		 * offset from home lgroup to choose for
3823 		 * next lgroup to allocate memory from
3824 		 */
3825 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3826 		    (lgrp_alloc_max + 1);
3827 
3828 		kpreempt_disable();
3829 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3830 		i = lgrp->lgrp_id;
3831 		kpreempt_enable();
3832 
3833 		while (off > 0) {
3834 			i = (i + 1) % (lgrp_alloc_max + 1);
3835 			lgrp = lgrp_table[i];
3836 			if (klgrpset_ismember(lgrpset, i))
3837 				off--;
3838 		}
3839 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3840 
3841 		break;
3842 	}
3843 
3844 	ASSERT(lgrp != NULL);
3845 	return (lgrp);
3846 }
3847 
3848 /*
3849  * Return the number of pages in an lgroup
3850  *
3851  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3852  *	 could cause tests that rely on the numat driver to fail....
3853  */
3854 pgcnt_t
3855 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3856 {
3857 	lgrp_t *lgrp;
3858 
3859 	lgrp = lgrp_table[lgrpid];
3860 	if (!LGRP_EXISTS(lgrp) ||
3861 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3862 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3863 		return (0);
3864 
3865 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3866 }
3867 
3868 /*
3869  * Initialize lgroup shared memory allocation policy support
3870  */
3871 void
3872 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3873 {
3874 	lgrp_shm_locality_t	*shm_locality;
3875 
3876 	/*
3877 	 * Initialize locality field in anon_map
3878 	 * Don't need any locks because this is called when anon_map is
3879 	 * allocated, but not used anywhere yet.
3880 	 */
3881 	if (amp) {
3882 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3883 		if (amp->locality == NULL) {
3884 			/*
3885 			 * Allocate and initialize shared memory locality info
3886 			 * and set anon_map locality pointer to it
3887 			 * Drop lock across kmem_alloc(KM_SLEEP)
3888 			 */
3889 			ANON_LOCK_EXIT(&amp->a_rwlock);
3890 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3891 			    KM_SLEEP);
3892 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3893 			    NULL);
3894 			shm_locality->loc_count = 1;	/* not used for amp */
3895 			shm_locality->loc_tree = NULL;
3896 
3897 			/*
3898 			 * Reacquire lock and check to see whether anyone beat
3899 			 * us to initializing the locality info
3900 			 */
3901 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3902 			if (amp->locality != NULL) {
3903 				rw_destroy(&shm_locality->loc_lock);
3904 				kmem_free(shm_locality,
3905 				    sizeof (*shm_locality));
3906 			} else
3907 				amp->locality = shm_locality;
3908 		}
3909 		ANON_LOCK_EXIT(&amp->a_rwlock);
3910 		return;
3911 	}
3912 
3913 	/*
3914 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3915 	 */
3916 	mutex_enter(&vp->v_lock);
3917 	if ((vp->v_flag & V_LOCALITY) == 0) {
3918 		/*
3919 		 * Allocate and initialize shared memory locality info
3920 		 */
3921 		mutex_exit(&vp->v_lock);
3922 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3923 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3924 		shm_locality->loc_count = 1;
3925 		shm_locality->loc_tree = NULL;
3926 
3927 		/*
3928 		 * Point vnode locality field at shared vnode policy info
3929 		 * and set locality aware flag in vnode
3930 		 */
3931 		mutex_enter(&vp->v_lock);
3932 		if ((vp->v_flag & V_LOCALITY) == 0) {
3933 			vp->v_locality = shm_locality;
3934 			vp->v_flag |= V_LOCALITY;
3935 		} else {
3936 			/*
3937 			 * Lost race so free locality info and increment count.
3938 			 */
3939 			rw_destroy(&shm_locality->loc_lock);
3940 			kmem_free(shm_locality, sizeof (*shm_locality));
3941 			shm_locality = vp->v_locality;
3942 			shm_locality->loc_count++;
3943 		}
3944 		mutex_exit(&vp->v_lock);
3945 
3946 		return;
3947 	}
3948 
3949 	/*
3950 	 * Increment reference count of number of segments mapping this vnode
3951 	 * shared
3952 	 */
3953 	shm_locality = vp->v_locality;
3954 	shm_locality->loc_count++;
3955 	mutex_exit(&vp->v_lock);
3956 }
3957 
3958 /*
3959  * Destroy the given shared memory policy segment tree
3960  */
3961 void
3962 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3963 {
3964 	lgrp_shm_policy_seg_t	*cur;
3965 	lgrp_shm_policy_seg_t	*next;
3966 
3967 	if (tree == NULL)
3968 		return;
3969 
3970 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3971 	while (cur != NULL) {
3972 		next = AVL_NEXT(tree, cur);
3973 		avl_remove(tree, cur);
3974 		kmem_free(cur, sizeof (*cur));
3975 		cur = next;
3976 	}
3977 	kmem_free(tree, sizeof (avl_tree_t));
3978 }
3979 
3980 /*
3981  * Uninitialize lgroup shared memory allocation policy support
3982  */
3983 void
3984 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3985 {
3986 	lgrp_shm_locality_t	*shm_locality;
3987 
3988 	/*
3989 	 * For anon_map, deallocate shared memory policy tree and
3990 	 * zero locality field
3991 	 * Don't need any locks because anon_map is being freed
3992 	 */
3993 	if (amp) {
3994 		if (amp->locality == NULL)
3995 			return;
3996 		shm_locality = amp->locality;
3997 		shm_locality->loc_count = 0;	/* not really used for amp */
3998 		rw_destroy(&shm_locality->loc_lock);
3999 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4000 		kmem_free(shm_locality, sizeof (*shm_locality));
4001 		amp->locality = 0;
4002 		return;
4003 	}
4004 
4005 	/*
4006 	 * For vnode, decrement reference count of segments mapping this vnode
4007 	 * shared and delete locality info if reference count drops to 0
4008 	 */
4009 	mutex_enter(&vp->v_lock);
4010 	shm_locality = vp->v_locality;
4011 	shm_locality->loc_count--;
4012 
4013 	if (shm_locality->loc_count == 0) {
4014 		rw_destroy(&shm_locality->loc_lock);
4015 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4016 		kmem_free(shm_locality, sizeof (*shm_locality));
4017 		vp->v_locality = 0;
4018 		vp->v_flag &= ~V_LOCALITY;
4019 	}
4020 	mutex_exit(&vp->v_lock);
4021 }
4022 
4023 /*
4024  * Compare two shared memory policy segments
4025  * Used by AVL tree code for searching
4026  */
4027 int
4028 lgrp_shm_policy_compar(const void *x, const void *y)
4029 {
4030 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4031 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4032 
4033 	if (a->shm_off < b->shm_off)
4034 		return (-1);
4035 	if (a->shm_off >= b->shm_off + b->shm_size)
4036 		return (1);
4037 	return (0);
4038 }
4039 
4040 /*
4041  * Concatenate seg1 with seg2 and remove seg2
4042  */
4043 static int
4044 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4045     lgrp_shm_policy_seg_t *seg2)
4046 {
4047 	if (!seg1 || !seg2 ||
4048 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4049 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4050 		return (-1);
4051 
4052 	seg1->shm_size += seg2->shm_size;
4053 	avl_remove(tree, seg2);
4054 	kmem_free(seg2, sizeof (*seg2));
4055 	return (0);
4056 }
4057 
4058 /*
4059  * Split segment at given offset and return rightmost (uppermost) segment
4060  * Assumes that there are no overlapping segments
4061  */
4062 static lgrp_shm_policy_seg_t *
4063 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4064     u_offset_t off)
4065 {
4066 	lgrp_shm_policy_seg_t	*newseg;
4067 	avl_index_t		where;
4068 
4069 	ASSERT(seg != NULL);
4070 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4071 
4072 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4073 	    seg->shm_size)
4074 		return (NULL);
4075 
4076 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4077 		return (seg);
4078 
4079 	/*
4080 	 * Adjust size of left segment and allocate new (right) segment
4081 	 */
4082 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4083 	newseg->shm_policy = seg->shm_policy;
4084 	newseg->shm_off = off;
4085 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4086 	seg->shm_size = off - seg->shm_off;
4087 
4088 	/*
4089 	 * Find where to insert new segment in AVL tree and insert it
4090 	 */
4091 	(void) avl_find(tree, &off, &where);
4092 	avl_insert(tree, newseg, where);
4093 
4094 	return (newseg);
4095 }
4096 
4097 /*
4098  * Set shared memory allocation policy on specified shared object at given
4099  * offset and length
4100  *
4101  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4102  * -1 if can't set policy.
4103  */
4104 int
4105 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4106     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4107 {
4108 	u_offset_t		eoff;
4109 	lgrp_shm_policy_seg_t	*next;
4110 	lgrp_shm_policy_seg_t	*newseg;
4111 	u_offset_t		off;
4112 	u_offset_t		oldeoff;
4113 	lgrp_shm_policy_seg_t	*prev;
4114 	int			retval;
4115 	lgrp_shm_policy_seg_t	*seg;
4116 	lgrp_shm_locality_t	*shm_locality;
4117 	avl_tree_t		*tree;
4118 	avl_index_t		where;
4119 
4120 	ASSERT(amp || vp);
4121 	ASSERT((len & PAGEOFFSET) == 0);
4122 
4123 	if (len == 0)
4124 		return (-1);
4125 
4126 	retval = 0;
4127 
4128 	/*
4129 	 * Get locality info and starting offset into shared object
4130 	 * Try anon map first and then vnode
4131 	 * Assume that no locks need to be held on anon_map or vnode, since
4132 	 * it should be protected by its reference count which must be nonzero
4133 	 * for an existing segment.
4134 	 */
4135 	if (amp) {
4136 		/*
4137 		 * Get policy info from anon_map
4138 		 *
4139 		 */
4140 		ASSERT(amp->refcnt != 0);
4141 		if (amp->locality == NULL)
4142 			lgrp_shm_policy_init(amp, NULL);
4143 		shm_locality = amp->locality;
4144 		off = ptob(anon_index);
4145 	} else if (vp) {
4146 		/*
4147 		 * Get policy info from vnode
4148 		 */
4149 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4150 			lgrp_shm_policy_init(NULL, vp);
4151 		shm_locality = vp->v_locality;
4152 		ASSERT(shm_locality->loc_count != 0);
4153 		off = vn_off;
4154 	} else
4155 		return (-1);
4156 
4157 	ASSERT((off & PAGEOFFSET) == 0);
4158 
4159 	/*
4160 	 * Figure out default policy
4161 	 */
4162 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4163 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4164 
4165 	/*
4166 	 * Create AVL tree if there isn't one yet
4167 	 * and set locality field to point at it
4168 	 */
4169 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4170 	tree = shm_locality->loc_tree;
4171 	if (!tree) {
4172 		rw_exit(&shm_locality->loc_lock);
4173 
4174 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4175 
4176 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4177 		if (shm_locality->loc_tree == NULL) {
4178 			avl_create(tree, lgrp_shm_policy_compar,
4179 			    sizeof (lgrp_shm_policy_seg_t),
4180 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4181 			shm_locality->loc_tree = tree;
4182 		} else {
4183 			/*
4184 			 * Another thread managed to set up the tree
4185 			 * before we could. Free the tree we allocated
4186 			 * and use the one that's already there.
4187 			 */
4188 			kmem_free(tree, sizeof (*tree));
4189 			tree = shm_locality->loc_tree;
4190 		}
4191 	}
4192 
4193 	/*
4194 	 * Set policy
4195 	 *
4196 	 * Need to maintain hold on writer's lock to keep tree from
4197 	 * changing out from under us
4198 	 */
4199 	while (len != 0) {
4200 		/*
4201 		 * Find policy segment for specified offset into shared object
4202 		 */
4203 		seg = avl_find(tree, &off, &where);
4204 
4205 		/*
4206 		 * Didn't find any existing segment that contains specified
4207 		 * offset, so allocate new segment, insert it, and concatenate
4208 		 * with adjacent segments if possible
4209 		 */
4210 		if (seg == NULL) {
4211 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4212 			    KM_SLEEP);
4213 			newseg->shm_policy.mem_policy = policy;
4214 			newseg->shm_policy.mem_reserved = 0;
4215 			newseg->shm_off = off;
4216 			avl_insert(tree, newseg, where);
4217 
4218 			/*
4219 			 * Check to see whether new segment overlaps with next
4220 			 * one, set length of new segment accordingly, and
4221 			 * calculate remaining length and next offset
4222 			 */
4223 			seg = AVL_NEXT(tree, newseg);
4224 			if (seg == NULL || off + len <= seg->shm_off) {
4225 				newseg->shm_size = len;
4226 				len = 0;
4227 			} else {
4228 				newseg->shm_size = seg->shm_off - off;
4229 				off = seg->shm_off;
4230 				len -= newseg->shm_size;
4231 			}
4232 
4233 			/*
4234 			 * Try to concatenate new segment with next and
4235 			 * previous ones, since they might have the same policy
4236 			 * now.  Grab previous and next segments first because
4237 			 * they will change on concatenation.
4238 			 */
4239 			prev =  AVL_PREV(tree, newseg);
4240 			next = AVL_NEXT(tree, newseg);
4241 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4242 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4243 
4244 			continue;
4245 		}
4246 
4247 		eoff = off + len;
4248 		oldeoff = seg->shm_off + seg->shm_size;
4249 
4250 		/*
4251 		 * Policy set already?
4252 		 */
4253 		if (policy == seg->shm_policy.mem_policy) {
4254 			/*
4255 			 * Nothing left to do if offset and length
4256 			 * fall within this segment
4257 			 */
4258 			if (eoff <= oldeoff) {
4259 				retval = 1;
4260 				break;
4261 			} else {
4262 				len = eoff - oldeoff;
4263 				off = oldeoff;
4264 				continue;
4265 			}
4266 		}
4267 
4268 		/*
4269 		 * Specified offset and length match existing segment exactly
4270 		 */
4271 		if (off == seg->shm_off && len == seg->shm_size) {
4272 			/*
4273 			 * Set policy and update current length
4274 			 */
4275 			seg->shm_policy.mem_policy = policy;
4276 			seg->shm_policy.mem_reserved = 0;
4277 			len = 0;
4278 
4279 			/*
4280 			 * Try concatenating new segment with previous and next
4281 			 * segments, since they might have the same policy now.
4282 			 * Grab previous and next segments first because they
4283 			 * will change on concatenation.
4284 			 */
4285 			prev =  AVL_PREV(tree, seg);
4286 			next = AVL_NEXT(tree, seg);
4287 			(void) lgrp_shm_policy_concat(tree, seg, next);
4288 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4289 		} else {
4290 			/*
4291 			 * Specified offset and length only apply to part of
4292 			 * existing segment
4293 			 */
4294 
4295 			/*
4296 			 * New segment starts in middle of old one, so split
4297 			 * new one off near beginning of old one
4298 			 */
4299 			newseg = NULL;
4300 			if (off > seg->shm_off) {
4301 				newseg = lgrp_shm_policy_split(tree, seg, off);
4302 
4303 				/*
4304 				 * New segment ends where old one did, so try
4305 				 * to concatenate with next segment
4306 				 */
4307 				if (eoff == oldeoff) {
4308 					newseg->shm_policy.mem_policy = policy;
4309 					newseg->shm_policy.mem_reserved = 0;
4310 					(void) lgrp_shm_policy_concat(tree,
4311 					    newseg, AVL_NEXT(tree, newseg));
4312 					break;
4313 				}
4314 			}
4315 
4316 			/*
4317 			 * New segment ends before old one, so split off end of
4318 			 * old one
4319 			 */
4320 			if (eoff < oldeoff) {
4321 				if (newseg) {
4322 					(void) lgrp_shm_policy_split(tree,
4323 					    newseg, eoff);
4324 					newseg->shm_policy.mem_policy = policy;
4325 					newseg->shm_policy.mem_reserved = 0;
4326 				} else {
4327 					(void) lgrp_shm_policy_split(tree, seg,
4328 					    eoff);
4329 					seg->shm_policy.mem_policy = policy;
4330 					seg->shm_policy.mem_reserved = 0;
4331 				}
4332 
4333 				if (off == seg->shm_off)
4334 					(void) lgrp_shm_policy_concat(tree,
4335 					    AVL_PREV(tree, seg), seg);
4336 				break;
4337 			}
4338 
4339 			/*
4340 			 * Calculate remaining length and next offset
4341 			 */
4342 			len = eoff - oldeoff;
4343 			off = oldeoff;
4344 		}
4345 	}
4346 
4347 	rw_exit(&shm_locality->loc_lock);
4348 	return (retval);
4349 }
4350 
4351 /*
4352  * Return the best memnode from which to allocate memory given
4353  * an lgroup.
4354  *
4355  * "c" is for cookie, which is good enough for me.
4356  * It references a cookie struct that should be zero'ed to initialize.
4357  * The cookie should live on the caller's stack.
4358  *
4359  * The routine returns -1 when:
4360  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4361  *	- traverse is 1, and all the memnodes in the system have been
4362  *	  returned.
4363  */
4364 int
4365 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4366 {
4367 	lgrp_t		*lp = c->lmc_lgrp;
4368 	mnodeset_t	nodes = c->lmc_nodes;
4369 	int		cnt = c->lmc_cnt;
4370 	int		offset, mnode;
4371 
4372 	extern int	max_mem_nodes;
4373 
4374 	/*
4375 	 * If the set is empty, and the caller is willing, traverse
4376 	 * up the hierarchy until we find a non-empty set.
4377 	 */
4378 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4379 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4380 		    ((lp = lp->lgrp_parent) == NULL))
4381 			return (-1);
4382 
4383 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4384 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4385 	}
4386 
4387 	/*
4388 	 * Select a memnode by picking one at a "random" offset.
4389 	 * Because of DR, memnodes can come and go at any time.
4390 	 * This code must be able to cope with the possibility
4391 	 * that the nodes count "cnt" is inconsistent with respect
4392 	 * to the number of elements actually in "nodes", and
4393 	 * therefore that the offset chosen could be greater than
4394 	 * the number of elements in the set (some memnodes may
4395 	 * have dissapeared just before cnt was read).
4396 	 * If this happens, the search simply wraps back to the
4397 	 * beginning of the set.
4398 	 */
4399 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4400 	offset = c->lmc_rand % cnt;
4401 	do {
4402 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4403 			if (nodes & ((mnodeset_t)1 << mnode))
4404 				if (!offset--)
4405 					break;
4406 	} while (mnode >= max_mem_nodes);
4407 
4408 	/* Found a node. Store state before returning. */
4409 	c->lmc_lgrp = lp;
4410 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4411 	c->lmc_cnt = cnt - 1;
4412 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4413 	c->lmc_ntried++;
4414 
4415 	return (mnode);
4416 }
4417