xref: /titanic_44/usr/src/uts/sun4v/os/mpo.c (revision b3c0e203b148ecc85043c9da9d327d45c6e7c470)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/machsystm.h>
32 #include <sys/machparam.h>
33 #include <sys/cmn_err.h>
34 #include <sys/stat.h>
35 #include <sys/mach_descrip.h>
36 #include <sys/memnode.h>
37 #include <sys/mdesc.h>
38 #include <sys/mpo.h>
39 #include <vm/vm_dep.h>
40 #include <vm/hat_sfmmu.h>
41 
42 /*
43  * MPO and the sun4v memory representation
44  * ---------------------------------------
45  *
46  * Latency groups are defined in the sun4v architecture by memory-latency-group
47  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
48  * tie together cpu nodes and mblock nodes, and contain mask and match
49  * properties that identify the portion of an mblock that belongs to the
50  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
51  * but an mblock defines Real Addresses (RA).  To translate, the mblock
52  * includes the property address-congruence-offset, hereafter referred to as
53  * ra_to_pa.  A real address ra is a member of an lgroup if
54  *
55  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56  *
57  * The MD is traversed, and information on all mblocks is kept in the array
58  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
59  * to, is kept in the array mpo_cpu[].
60  *
61  * This implementation makes (and verifies) the simplifying assumption that
62  * the mask bits are the same for all defined lgroups, and that all 1 bits in
63  * the mask are contiguous.  Thus the number of lgroups is bounded by the
64  * number of possible mask values, and the lgrp_handle_t is defined as the
65  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
66  * masks and values are also referred to as "home bits" in the code.
67  *
68  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71  * home bits.  This yields the mem_node.
72  *
73  * Interfaces
74  * ----------
75  *
76  * This file exports the following entry points:
77  *
78  * plat_lgrp_init()
79  * plat_build_mem_nodes()
80  * plat_lgrp_cpu_to_hand()
81  * plat_lgrp_latency()
82  * plat_pfn_to_mem_node()
83  *	These implement the usual platform lgroup interfaces.
84  *
85  * plat_rapfn_to_papfn()
86  *	Recover the PA page coloring bits from an RA.
87  *
88  * plat_mem_node_iterator_init()
89  *	Initialize an iterator to efficiently step through pages in a mem_node.
90  *
91  * plat_mem_node_intersect_range()
92  *	Find the intersection with a mem_node.
93  */
94 
95 int	sun4v_mpo_enable = 1;
96 int	sun4v_mpo_debug = 0;
97 char	sun4v_mpo_status[256] = "";
98 
99 /* Save CPU info from the MD and associate CPUs with lgroups */
100 static	struct cpu_md mpo_cpu[NCPU];
101 
102 /* Save lgroup info from the MD */
103 #define	MAX_MD_LGROUPS 32
104 static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
105 static	int	n_lgrpnodes = 0;
106 static	int	n_locality_groups = 0;
107 static	int	max_locality_groups = 0;
108 
109 /* Save mblocks from the MD */
110 static 	struct	mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
111 static	int	n_mblocks = 0;
112 
113 /* Save mem_node stripes calculated from mblocks and lgroups. */
114 static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
115 static	int	n_mem_stripes = 0;
116 static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
117 static	int	stripe_shift;	/* stride/stripes expressed as a shift */
118 static	pfn_t	mnode_pages;	/* mem_node stripe width */
119 
120 /* Save home mask and shift used to calculate lgrp_handle_t values */
121 static	uint64_t home_mask = 0;
122 static	pfn_t	home_mask_pfn = 0;
123 static	int	home_mask_shift = 0;
124 static	uint_t	home_mask_pfn_shift = 0;
125 
126 /* Save lowest and highest latencies found across all lgroups */
127 static	int	lower_latency = 0;
128 static	int	higher_latency = 0;
129 
130 static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
131 
132 static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
133 static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
134 static	int	fix_interleave(void);
135 
/*
 * Debug support.
 *
 * MPO_DEBUG() prints only in DEBUG (non-lint) builds, and only when the
 * sun4v_mpo_debug tunable is non-zero; otherwise it expands to nothing.
 */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/*
 * Record status message, viewable from mdb.
 *
 * The message is formatted into sun4v_mpo_status and then echoed through
 * MPO_DEBUG().  The echo goes through "%s" because the buffer holds text
 * that has already been formatted: using it as a format string would
 * re-interpret any '%' the rendered message happens to contain.
 */
#define	MPO_STATUS(args...) {						      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
	MPO_DEBUG("%s", sun4v_mpo_status);				      \
}
148 
149 /*
150  * Routine to read a uint64_t from a given md
151  */
152 static	int64_t
153 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
154 {
155 	int err = md_get_prop_val(md, node, propname, val);
156 	return (err);
157 }
158 
159 static int
160 mblock_cmp(const void *a, const void *b)
161 {
162 	struct mblock_md *m1 = (struct mblock_md *)a;
163 	struct mblock_md *m2 = (struct mblock_md *)b;
164 
165 	if (m1->base < m2->base)
166 		return (-1);
167 	else if (m1->base == m2->base)
168 		return (0);
169 	else
170 		return (1);
171 }
172 
173 static void
174 mblock_sort(struct mblock_md *mblocks, int n)
175 {
176 	extern void qsort(void *, size_t, size_t,
177 	    int (*)(const void *, const void *));
178 
179 	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
180 }
181 
182 /*
183  *
184  * Traverse the MD to determine:
185  *
186  *  Number of CPU nodes, lgrp_nodes, and mblocks
187  *  Then for each lgrp_node, obtain the appropriate data.
188  *  For each CPU, determine its home locality and store it.
189  *  For each mblock, retrieve its data and store it.
190  */
191 static	int
192 lgrp_traverse(md_t *md)
193 {
194 	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
195 	uint64_t i, j, k, o, n_nodes;
196 	uint64_t n_lgroups = 0;
197 	uint64_t mem_lg_homeset = 0;
198 	int ret_val = 0;
199 	int result = 0;
200 	int n_cpunodes = 0;
201 	int sub_page_fix;
202 
203 	n_nodes = md_node_count(md);
204 
205 	if (n_nodes <= 0) {
206 		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
207 		ret_val = -1;
208 		goto fail;
209 	}
210 
211 	root = md_root_node(md);
212 
213 	if (root == MDE_INVAL_ELEM_COOKIE) {
214 		MPO_STATUS("lgrp_traverse: Root node is missing\n");
215 		ret_val = -1;
216 		goto fail;
217 	}
218 
219 	/*
220 	 * Build the Memory Nodes.  Do this before any possibility of
221 	 * bailing from this routine so we obtain ra_to_pa (needed for page
222 	 * coloring) even when there are no lgroups defined.
223 	 */
224 
225 	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
226 	    "fwd", &mblocknodes);
227 
228 	if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
229 		MPO_STATUS("lgrp_traverse: No mblock "
230 		    "nodes detected in Machine Descriptor\n");
231 		n_mblocks = 0;
232 		ret_val = -1;
233 		goto fail;
234 	}
235 
236 	for (i = 0; i < n_mblocks; i++) {
237 		mpo_mblock[i].node = mblocknodes[i];
238 
239 		/* Without a base or size value we will fail */
240 		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
241 		    &mpo_mblock[i].base);
242 		if (result < 0) {
243 			MPO_STATUS("lgrp_traverse: "
244 			    "PROP_LG_BASE is missing\n");
245 			n_mblocks = 0;
246 			ret_val = -1;
247 			goto fail;
248 		}
249 
250 		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
251 		    &mpo_mblock[i].size);
252 		if (result < 0) {
253 			MPO_STATUS("lgrp_traverse: "
254 			    "PROP_LG_SIZE is missing\n");
255 			n_mblocks = 0;
256 			ret_val = -1;
257 			goto fail;
258 		}
259 
260 		result = get_int(md, mblocknodes[i],
261 		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
262 
263 		/* If we don't have an ra_pa_offset, just set it to 0 */
264 		if (result < 0)
265 			mpo_mblock[i].ra_to_pa = 0;
266 
267 		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
268 		    "ra_to_pa = %lx\n", i,
269 		    mpo_mblock[i].base,
270 		    mpo_mblock[i].size,
271 		    mpo_mblock[i].ra_to_pa);
272 	}
273 
274 	/* Must sort mblocks by address for mem_node_iterator_init() */
275 	mblock_sort(mpo_mblock, n_mblocks);
276 
277 	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
278 
279 	/* Page coloring hook is required so we can iterate through mnodes */
280 	if (&page_next_pfn_for_color_cpu == NULL) {
281 		MPO_STATUS("lgrp_traverse: No page coloring support\n");
282 		ret_val = -1;
283 		goto fail;
284 	}
285 
286 	/* Global enable for mpo */
287 	if (sun4v_mpo_enable == 0) {
288 		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
289 		ret_val = -1;
290 		goto fail;
291 	}
292 
293 	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
294 	    "fwd", &lgrpnodes);
295 
296 	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
297 		MPO_STATUS("lgrp_traverse: No Lgroups\n");
298 		ret_val = -1;
299 		goto fail;
300 	}
301 
302 	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
303 
304 	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
305 		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
306 		    "in MD\n");
307 		ret_val = -1;
308 		goto fail;
309 	}
310 
311 	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
312 	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
313 	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
314 	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
315 	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
316 	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
317 
318 	for (i = 0; i < n_lgrpnodes; i++) {
319 		mpo_lgroup[i].node = lgrpnodes[i];
320 		mpo_lgroup[i].id = i;
321 		mpo_lgroup[i].ncpu = 0;
322 		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
323 		    &mpo_lgroup[i].addr_mask);
324 		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
325 		    &mpo_lgroup[i].addr_match);
326 
327 		/*
328 		 * If either the mask or match properties are missing, set to 0
329 		 */
330 		if (result < 0) {
331 			mpo_lgroup[i].addr_mask = 0;
332 			mpo_lgroup[i].addr_match = 0;
333 		}
334 
335 		/* Set latency to 0 if property not present */
336 
337 		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
338 		    &mpo_lgroup[i].latency);
339 		if (result < 0)
340 			mpo_lgroup[i].latency = 0;
341 	}
342 
343 	/*
344 	 * Sub-page level interleave is not yet supported.  Check for it,
345 	 * and remove sub-page interleaved lgroups from mpo_lgroup and
346 	 * n_lgrpnodes.  If no lgroups are left, return.
347 	 */
348 
349 	sub_page_fix = fix_interleave();
350 	if (n_lgrpnodes == 0) {
351 		ret_val = -1;
352 		goto fail;
353 	}
354 
355 	/* Ensure that all of the addr_mask values are the same */
356 
357 	for (i = 0; i < n_lgrpnodes; i++) {
358 		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
359 			MPO_STATUS("lgrp_traverse: "
360 			    "addr_mask values are not the same\n");
361 			ret_val = -1;
362 			goto fail;
363 		}
364 	}
365 
366 	/*
367 	 * Ensure that all lgrp nodes see all the mblocks. However, if
368 	 * sub-page interleave is being fixed, they do not, so skip
369 	 * the check.
370 	 */
371 
372 	if (sub_page_fix == 0) {
373 		for (i = 0; i < n_lgrpnodes; i++) {
374 			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
375 			    PROP_LG_MBLOCK, "fwd", &nodes);
376 			md_free_scan_dag(md, &nodes);
377 			if (j != n_mblocks) {
378 				MPO_STATUS("lgrp_traverse: "
379 				    "sub-page interleave is being fixed\n");
380 				ret_val = -1;
381 				goto fail;
382 			}
383 		}
384 	}
385 
386 	/*
387 	 * Use the address mask from the first lgroup node
388 	 * to establish our home_mask.
389 	 */
390 	home_mask = mpo_lgroup[0].addr_mask;
391 	home_mask_pfn = btop(home_mask);
392 	home_mask_shift = lowbit(home_mask) - 1;
393 	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
394 	mnode_pages = btop(1ULL << home_mask_shift);
395 
396 	/*
397 	 * How many values are possible in home mask?  Assume the mask
398 	 * bits are contiguous.
399 	 */
400 	max_locality_groups =
401 	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
402 
403 	/* Now verify the home mask bits are contiguous */
404 
405 	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
406 		MPO_STATUS("lgrp_traverse: "
407 		    "home mask bits are not contiguous\n");
408 		ret_val = -1;
409 		goto fail;
410 	}
411 
412 	/* Record all of the home bits */
413 
414 	for (i = 0; i < n_lgrpnodes; i++) {
415 		HOMESET_ADD(mem_lg_homeset,
416 		    mpo_lgroup[i].addr_match >> home_mask_shift);
417 	}
418 
419 	/* Count the number different "home"  mem_lg's we've discovered */
420 
421 	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
422 
423 	/* If we have only 1 locality group then we can exit */
424 	if (n_locality_groups == 1) {
425 		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
426 		ret_val = -1;
427 		goto fail;
428 	}
429 
430 	/*
431 	 * Set the latencies.  A CPU's lgroup is defined by the lowest
432 	 * latency found.  All other memory is considered remote, and the
433 	 * remote latency is represented by the highest latency found.
434 	 * Thus hierarchical lgroups, if any, are approximated by a
435 	 * two level scheme.
436 	 *
437 	 * The Solaris MPO framework by convention wants to see latencies
438 	 * in units of nano-sec/10. In the MD, the units are defined to be
439 	 * pico-seconds.
440 	 */
441 
442 	lower_latency = mpo_lgroup[0].latency;
443 	higher_latency = mpo_lgroup[0].latency;
444 
445 	for (i = 1; i < n_lgrpnodes; i++) {
446 		if (mpo_lgroup[i].latency < lower_latency) {
447 			lower_latency = mpo_lgroup[i].latency;
448 		}
449 		if (mpo_lgroup[i].latency > higher_latency) {
450 			higher_latency = mpo_lgroup[i].latency;
451 		}
452 	}
453 	lower_latency /= 10000;
454 	higher_latency /= 10000;
455 
456 	/* Clear our CPU data */
457 
458 	for (i = 0; i < NCPU; i++) {
459 		mpo_cpu[i].home = 0;
460 		mpo_cpu[i].latency = (uint_t)(-1);
461 	}
462 
463 	/* Build the CPU nodes */
464 	for (i = 0; i < n_cpunodes; i++) {
465 
466 		/* Read in the lgroup nodes */
467 
468 		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
469 		if (result < 0) {
470 			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
471 			ret_val = -1;
472 			goto fail;
473 		}
474 
475 		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
476 		    "fwd", &nodes);
477 		if (n_lgroups <= 0) {
478 			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
479 			ret_val = -1;
480 			goto fail;
481 		}
482 
483 		/*
484 		 * Find the lgroup this cpu belongs to with the lowest latency.
485 		 * Check all the lgrp nodes connected to this CPU to determine
486 		 * which has the smallest latency.
487 		 */
488 
489 		for (j = 0; j < n_lgroups; j++) {
490 			for (o = 0; o < n_lgrpnodes; o++) {
491 				if (nodes[j] == mpo_lgroup[o].node) {
492 					if (mpo_lgroup[o].latency <
493 					    mpo_cpu[k].latency) {
494 						mpo_cpu[k].home =
495 						    mpo_lgroup[o].addr_match
496 						    >> home_mask_shift;
497 						mpo_cpu[k].latency =
498 						    mpo_lgroup[o].latency;
499 						mpo_lgroup[o].ncpu++;
500 					}
501 				}
502 			}
503 		}
504 		md_free_scan_dag(md, &nodes);
505 	}
506 
507 	/* Validate that no large pages cross mnode boundaries. */
508 	if (valid_pages(md, cpunodes[0]) == 0) {
509 		ret_val = -1;
510 		goto fail;
511 	}
512 
513 fail:
514 	/* MD cookies are no longer valid; ensure they are not used again. */
515 	for (i = 0; i < n_mblocks; i++)
516 		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
517 	for (i = 0; i < n_lgrpnodes; i++)
518 		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
519 
520 	if (n_cpunodes > 0)
521 		md_free_scan_dag(md, &cpunodes);
522 	if (n_lgrpnodes > 0)
523 		md_free_scan_dag(md, &lgrpnodes);
524 	if (n_mblocks > 0)
525 		md_free_scan_dag(md, &mblocknodes);
526 	else
527 		panic("lgrp_traverse: No memory blocks found");
528 
529 	if (ret_val == 0)
530 		MPO_STATUS("MPO feature is enabled.\n");
531 
532 	return (ret_val);
533 }
534 
535 /*
536  *  Determine the number of unique mem_lg's present in our system
537  */
538 static	int
539 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
540 {
541 	int homeid;
542 	int count = 0;
543 
544 	/*
545 	 * Scan the "home" bits of the mem_lgs, count
546 	 * the number that are unique.
547 	 */
548 
549 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
550 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
551 			count++;
552 		}
553 	}
554 
555 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
556 	    mem_lg_homeset);
557 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
558 
559 	/* Default must be at least one */
560 	if (count == 0)
561 		count = 1;
562 
563 	return (count);
564 }
565 
566 /*
567  * Platform specific lgroup initialization
568  */
569 void
570 plat_lgrp_init(void)
571 {
572 	md_t *md;
573 	int i, rc, ncpu_min;
574 
575 	/* Get the Machine Descriptor handle */
576 
577 	md = md_get_handle();
578 
579 	/* If not, we cannot continue */
580 
581 	if (md == NULL) {
582 		panic("cannot access machine descriptor\n");
583 	} else {
584 		rc = lgrp_traverse(md);
585 		(void) md_fini_handle(md);
586 	}
587 
588 	/*
589 	 * If we can't process the MD for lgroups then at least let the
590 	 * system try to boot.  Assume we have one lgroup so that
591 	 * when plat_build_mem_nodes is called, it will attempt to init
592 	 * an mnode based on the supplied memory segment.
593 	 */
594 
595 	if (rc == -1) {
596 		home_mask_pfn = 0;
597 		max_locality_groups = 1;
598 		n_locality_groups = 1;
599 		return;
600 	}
601 
602 	mem_node_pfn_shift = 0;
603 	mem_node_physalign = 0;
604 
605 	/* Use lgroup-aware TSB allocations */
606 	tsb_lgrp_affinity = 1;
607 
608 	/*
609 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
610 	 * this process is currently running on before considering
611 	 * expanding threads to another lgroup.
612 	 *
613 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
614 	 * must be loaded before expanding to it.
615 	 *
616 	 * On sun4v CMT processors, threads share a core pipeline, and
617 	 * at less than 100% utilization, best throughput is obtained by
618 	 * spreading threads across more cores, even if some are in a
619 	 * different lgroup.  Spread threads to a new lgroup if the
620 	 * current group is more than 50% loaded.  Because of virtualization,
621 	 * lgroups may have different numbers of CPUs, but the tunables
622 	 * apply to all lgroups, so find the smallest lgroup and compute
623 	 * 50% loading.
624 	 */
625 
626 	ncpu_min = NCPU;
627 	for (i = 0; i < n_lgrpnodes; i++) {
628 		int ncpu = mpo_lgroup[i].ncpu;
629 		if (ncpu != 0 && ncpu < ncpu_min)
630 			ncpu_min = ncpu;
631 	}
632 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
633 
634 	/* new home may only be half as loaded as the existing home to use it */
635 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
636 
637 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
638 
639 	/* Require that a home lgroup have some memory to be chosen */
640 	lgrp_mem_free_thresh = 1;
641 
642 	/* Standard home-on-next-touch policy */
643 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
644 
645 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
646 	lgrp_load_thresh = UINT32_MAX;
647 }
648 
649 /*
650  *  Helper routine for debugging calls to mem_node_add_slice()
651  */
652 static	void
653 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
654 {
655 #if defined(DEBUG) && !defined(lint)
656 	static int slice_count = 0;
657 
658 	slice_count++;
659 	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
660 	    slice_count, basepfn, endpfn);
661 #endif
662 	mem_node_add_slice(basepfn, endpfn);
663 }
664 
/*
 *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 *
 *  Reports the (lgroup handle, mnode) pair through MPO_DEBUG() and then
 *  forwards both arguments unchanged to plat_assign_lgrphand_to_mem_node().
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}
675 
/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * list is consumed as consecutive (base, length) pairs in bytes; nelems
 * is the total number of array entries, not the number of pairs.
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 *	The stripe width is kept in the global mnode_pages.
 *	The stride is kept in the global mnode_stride.
 *	All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ...	012301230123 ...
 *	  mblock 0		  mblock 1
 */

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		/* Everything belongs to mnode 0 under the default handle */
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		/*
		 * NOTE(review): elem is an int compared against the size_t
		 * nelems, and an odd nelems would read list[nelems] --
		 * confirm callers always pass an even count.
		 */
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		/*
		 * With a single mblock no stripe table is built.  With
		 * more than one, fall through and build the stripes below
		 * (the slices themselves were already added above).
		 */
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, sizeof (mem_stripes));
	/* Work in bytes here; the saved globals are in pfn units */
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		/* Cache the mblock's pfn bounds for the lookup routines */
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		/* Home bits of the mblock's first address name its lgroup */
		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			/* mnodes are 1:1 with lgroup handles */
			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			/*
			 * Advance to the next stripe boundary; only the
			 * first stripe of an mblock can start mid-stripe,
			 * so the offset is zero from here on.
			 */
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}
840 
841 /*
842  * Return the locality group value for the supplied processor
843  */
844 lgrp_handle_t
845 plat_lgrp_cpu_to_hand(processorid_t id)
846 {
847 	if (n_locality_groups > 1) {
848 		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
849 	} else {
850 		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
851 	}
852 }
853 
854 int
855 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
856 {
857 	/*
858 	 * Return min remote latency when there are more than two lgroups
859 	 * (root and child) and getting latency between two different lgroups
860 	 * or root is involved.
861 	 */
862 	if (lgrp_optimizations() && (from != to ||
863 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
864 		return ((int)higher_latency);
865 	} else {
866 		return ((int)lower_latency);
867 	}
868 }
869 
870 int
871 plat_pfn_to_mem_node(pfn_t pfn)
872 {
873 	int i, mnode;
874 	pfn_t ra_to_pa_pfn;
875 	struct mblock_md *mb;
876 
877 	if (n_locality_groups <= 1)
878 		return (0);
879 
880 	/*
881 	 * The mnode is defined to be 1:1 with the lgroup handle, which
882 	 * is taken from from the home bits.  Find the mblock in which
883 	 * the pfn falls to get the ra_to_pa adjustment, and extract
884 	 * the home bits.
885 	 */
886 	mb = &mpo_mblock[0];
887 	for (i = 0; i < n_mblocks; i++) {
888 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
889 			ra_to_pa_pfn = btop(mb->ra_to_pa);
890 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
891 			    home_mask_pfn_shift);
892 			ASSERT(mnode < max_mem_nodes);
893 			return (mnode);
894 		}
895 		mb++;
896 	}
897 
898 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
899 	return (pfn);
900 }
901 
902 /*
903  * plat_rapfn_to_papfn
904  *
905  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
906  * and home mask bits are correct.  The upper bits do not necessarily
907  * match the actual PA, however.
908  */
909 pfn_t
910 plat_rapfn_to_papfn(pfn_t pfn)
911 {
912 	int i;
913 	pfn_t ra_to_pa_pfn;
914 	struct mblock_md *mb;
915 
916 	ASSERT(n_mblocks > 0);
917 	if (n_mblocks == 1)
918 		return (pfn + base_ra_to_pa_pfn);
919 
920 	/*
921 	 * Find the mblock in which the pfn falls
922 	 * in order to get the ra_to_pa adjustment.
923 	 */
924 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
925 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
926 			ra_to_pa_pfn = btop(mb->ra_to_pa);
927 			return (pfn + ra_to_pa_pfn);
928 		}
929 	}
930 
931 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
932 	return (pfn);
933 }
934 
935 /*
936  * plat_mem_node_iterator_init()
937  *	Initialize cookie to iterate over pfn's in an mnode.  There is
938  *	no additional iterator function.  The caller uses the info from
939  *	the iterator structure directly.
940  *
941  *	pfn: starting pfn.
942  * 	mnode: desired mnode.
943  *	init: set to 1 for full init, 0 for continuation
944  *
945  *	Returns the appropriate starting pfn for the iteration
946  *	the same as the input pfn if it falls in an mblock.
947  *	Returns the (pfn_t)-1 value if the input pfn lies past
948  *	the last valid mnode pfn.
949  */
950 pfn_t
951 plat_mem_node_iterator_init(pfn_t pfn, int mnode,
952     mem_node_iterator_t *it, int init)
953 {
954 	int i;
955 	struct mblock_md *mblock;
956 	pfn_t base, end;
957 
958 	ASSERT(it != NULL);
959 	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
960 	ASSERT(n_mblocks > 0);
961 
962 	if (init) {
963 		it->mi_last_mblock = 0;
964 		it->mi_init = 1;
965 	}
966 
967 	/* Check if mpo is not enabled and we only have one mblock */
968 	if (n_locality_groups == 1 && n_mblocks == 1) {
969 		it->mi_mnode = mnode;
970 		it->mi_ra_to_pa = base_ra_to_pa_pfn;
971 		it->mi_mnode_pfn_mask = 0;
972 		it->mi_mnode_pfn_shift = 0;
973 		it->mi_mnode_mask = 0;
974 		it->mi_mblock_base = mem_node_config[mnode].physbase;
975 		it->mi_mblock_end = mem_node_config[mnode].physmax;
976 		if (pfn < it->mi_mblock_base)
977 			pfn = it->mi_mblock_base;
978 		else if (pfn > it->mi_mblock_end)
979 			pfn = (pfn_t)-1;
980 		return (pfn);
981 	}
982 
983 	/*
984 	 * Find mblock that contains pfn, or first mblock after pfn,
985 	 * else pfn is out of bounds, so use the last mblock.
986 	 * mblocks are sorted in ascending address order.
987 	 */
988 	ASSERT(it->mi_last_mblock < n_mblocks);
989 	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
990 	i = init ? 0 : it->mi_last_mblock + 1;
991 	if (i == n_mblocks)
992 		return ((pfn_t)-1);
993 
994 	for (; i < n_mblocks; i++) {
995 		if (pfn <= mpo_mblock[i].end_pfn)
996 			break;
997 	}
998 	if (i == n_mblocks) {
999 		it->mi_last_mblock = i - 1;
1000 		return ((pfn_t)-1);
1001 	}
1002 	it->mi_last_mblock = i;
1003 
1004 	/*
1005 	 * Memory stripes are defined if there is more than one locality
1006 	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
1007 	 */
1008 	mblock = &mpo_mblock[i];
1009 	if (n_mem_stripes > 0) {
1010 		mem_stripe_t *ms =
1011 		    &mem_stripes[i * max_locality_groups + mnode];
1012 		base = ms->physbase;
1013 		end = ms->physmax;
1014 	} else {
1015 		ASSERT(mnode == 0);
1016 		base = mblock->base_pfn;
1017 		end = mblock->end_pfn;
1018 	}
1019 
1020 	it->mi_mnode = mnode;
1021 	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1022 	it->mi_mblock_base = base;
1023 	it->mi_mblock_end = end;
1024 	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1025 	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1026 	it->mi_mnode_mask = max_locality_groups - 1;
1027 	if (pfn < base)
1028 		pfn = base;
1029 	else if (pfn > end)
1030 		pfn = (pfn_t)-1;
1031 	return (pfn);
1032 }
1033 
1034 /*
1035  * plat_mem_node_intersect_range()
1036  *
1037  * Find the intersection between a memnode and a range of pfn's.
1038  */
1039 void
1040 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1041     int mnode, pgcnt_t *npages_out)
1042 {
1043 	pfn_t offset, len, hole, base, end, test_end, frag;
1044 	pfn_t nearest;
1045 	mem_stripe_t *ms;
1046 	int i, npages;
1047 
1048 	*npages_out = 0;
1049 
1050 	if (!mem_node_config[mnode].exists || test_len == 0)
1051 		return;
1052 
1053 	base = mem_node_config[mnode].physbase;
1054 	end = mem_node_config[mnode].physmax;
1055 
1056 	test_end = test_base + test_len - 1;
1057 	if (end < test_base || base > test_end)
1058 		return;
1059 
1060 	if (n_locality_groups == 1) {
1061 		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1062 		return;
1063 	}
1064 
1065 	hole = mnode_stride - mnode_pages;
1066 	npages = 0;
1067 
1068 	/*
1069 	 * Iterate over all the stripes for this mnode (one per mblock),
1070 	 * find the intersection with each, and accumulate the intersections.
1071 	 *
1072 	 * Determing the intersection with a stripe is tricky.  If base or end
1073 	 * fall outside the mem_node bounds, round them to physbase/physmax of
1074 	 * mem_node.  If base or end fall in a gap, round them to start of
1075 	 * nearest stripe.  If they fall within a stripe, keep base or end,
1076 	 * but calculate the fragment size that should be excluded from the
1077 	 * stripe.  Calculate how many strides fall in the adjusted range,
1078 	 * multiply by stripe width, and add the start and end fragments.
1079 	 */
1080 
1081 	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1082 		ms = &mem_stripes[i];
1083 		if (ms->exists &&
1084 		    test_base <= (end = ms->physmax) &&
1085 		    test_end >= (base = ms->physbase)) {
1086 
1087 			offset = ms->offset;
1088 
1089 			if (test_base > base) {
1090 				/* Round test_base to next multiple of stride */
1091 				len = P2ROUNDUP(test_base - (base - offset),
1092 				    mnode_stride);
1093 				nearest = base - offset + len;
1094 				/*
1095 				 * Compute distance from test_base to the
1096 				 * stride boundary to see if test_base falls
1097 				 * in the stripe or in the hole.
1098 				 */
1099 				if (nearest - test_base > hole) {
1100 					/*
1101 					 * test_base lies in stripe,
1102 					 * and offset should be excluded.
1103 					 */
1104 					offset = test_base -
1105 					    (nearest - mnode_stride);
1106 					base = test_base;
1107 				} else {
1108 					/* round up to next stripe start */
1109 					offset = 0;
1110 					base = nearest;
1111 					if (base > end)
1112 						continue;
1113 				}
1114 
1115 			}
1116 
1117 			if (test_end < end)
1118 				end = test_end;
1119 			end++;		/* adjust to an exclusive bound */
1120 
1121 			/* Round end to next multiple of stride */
1122 			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1123 			nearest = (base - offset) + len;
1124 			if (nearest - end <= hole) {
1125 				/* end falls in hole, use entire last stripe */
1126 				frag = 0;
1127 			} else {
1128 				/* end falls in stripe, compute fragment */
1129 				frag = nearest - hole - end;
1130 			}
1131 
1132 			len = (len >> stripe_shift) - offset - frag;
1133 			npages += len;
1134 		}
1135 	}
1136 
1137 	*npages_out = npages;
1138 }
1139 
1140 /*
1141  * valid_pages()
1142  *
1143  * Return 1 if pages are valid and do not cross mnode boundaries
1144  * (which would break page free list assumptions), and 0 otherwise.
1145  */
1146 
1147 #define	MNODE(pa)	\
1148 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1149 
1150 static int
1151 valid_pages(md_t *md, mde_cookie_t cpu0)
1152 {
1153 	int i, max_szc;
1154 	uint64_t last_page_base, szc_mask;
1155 	uint64_t max_page_len, max_coalesce_len;
1156 	struct mblock_md *mb = mpo_mblock;
1157 
1158 	/*
1159 	 * Find the smaller of the largest page possible and supported.
1160 	 * mmu_exported_pagesize_mask is not yet initialized, so read
1161 	 * it from the MD.  Apply minimal fixups in case of broken MDs
1162 	 * to get a sane mask.
1163 	 */
1164 
1165 	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1166 		szc_mask = 0;
1167 	szc_mask |=  (1 << TTE4M);	/* largest in sun4v default support */
1168 	max_szc = highbit(szc_mask) - 1;
1169 	if (max_szc > TTE256M)
1170 		max_szc = TTE256M;
1171 	max_page_len = TTEBYTES(max_szc);
1172 
1173 	/*
1174 	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1175 	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1176 	 * within one mnode to use MPO.
1177 	 */
1178 	max_coalesce_len = TTEBYTES(TTE256M);
1179 	ASSERT(max_coalesce_len >= max_page_len);
1180 
1181 	if (ptob(mnode_pages) < max_coalesce_len) {
1182 		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1183 		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1184 		return (0);
1185 	}
1186 
1187 	for (i = 0; i < n_mblocks; i++) {
1188 		uint64_t base = mb->base;
1189 		uint64_t end = mb->base + mb->size - 1;
1190 		uint64_t ra_to_pa = mb->ra_to_pa;
1191 
1192 		/*
1193 		 * If mblock is smaller than the max page size, then
1194 		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1195 		 * not span mnodes.
1196 		 */
1197 		if (mb->size < max_page_len) {
1198 			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1199 				MPO_STATUS("Small mblock spans mnodes; "
1200 				    "MPO disabled: base = %lx, end = %lx, "
1201 				    "ra2pa = %lx\n", base, end, ra_to_pa);
1202 				return (0);
1203 			}
1204 		} else {
1205 			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1206 			uint64_t pa_base = base + ra_to_pa;
1207 			if ((base & (max_coalesce_len - 1)) !=
1208 			    (pa_base & (max_coalesce_len - 1))) {
1209 				MPO_STATUS("bad page alignment; MPO disabled: "
1210 				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1211 				    base, pa_base, max_coalesce_len);
1212 				return (0);
1213 			}
1214 		}
1215 
1216 		/*
1217 		 * Find start of last large page in mblock in RA space.
1218 		 * If page extends into the next mblock, verify the
1219 		 * mnode does not change.
1220 		 */
1221 		last_page_base = P2ALIGN(end, max_coalesce_len);
1222 		if (i + 1 < n_mblocks &&
1223 		    last_page_base + max_coalesce_len > mb[1].base &&
1224 		    MNODE(last_page_base + ra_to_pa) !=
1225 		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1226 			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1227 			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1228 			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1229 			    mb[1].ra_to_pa, max_coalesce_len);
1230 			return (0);
1231 		}
1232 
1233 		mb++;
1234 	}
1235 	return (1);
1236 }
1237 
1238 
1239 /*
1240  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1241  * if any, and remove them.  This yields a config where the "coarse
1242  * grained" lgroups cover all of memory, even though part of that memory
1243  * is fine grain interleaved and does not deliver a purely local memory
1244  * latency.
1245  *
1246  * This function reads and modifies the globals:
1247  *	mpo_lgroup[], n_lgrpnodes
1248  *
1249  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1250  */
1251 
1252 static int
1253 fix_interleave(void)
1254 {
1255 	int i, j;
1256 	uint64_t mask = 0;
1257 
1258 	j = 0;
1259 	for (i = 0; i < n_lgrpnodes; i++) {
1260 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1261 			/* remove this lgroup */
1262 			mask = mpo_lgroup[i].addr_mask;
1263 		} else {
1264 			mpo_lgroup[j++] = mpo_lgroup[i];
1265 		}
1266 	}
1267 	n_lgrpnodes = j;
1268 
1269 	if (mask != 0)
1270 		MPO_STATUS("sub-page interleave %lx found; "
1271 		    "removing lgroup.\n", mask);
1272 
1273 	return (mask != 0);
1274 }
1275