xref: /titanic_52/usr/src/uts/sun4v/os/mpo.c (revision d7448364b360ed82582291005bd9831f2a5d18a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/machsystm.h>
32 #include <sys/machparam.h>
33 #include <sys/cmn_err.h>
34 #include <sys/stat.h>
35 #include <sys/mach_descrip.h>
36 #include <sys/memnode.h>
37 #include <sys/mdesc.h>
38 #include <sys/mpo.h>
39 #include <vm/vm_dep.h>
40 #include <vm/hat_sfmmu.h>
41 #include <sys/promif.h>
42 
43 /*
44  * MPO and the sun4v memory representation
45  * ---------------------------------------
46  *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
48  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
49  * tie together cpu nodes and mblock nodes, and contain mask and match
50  * properties that identify the portion of an mblock that belongs to the
51  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
52  * but an mblock defines Real Addresses (RA).  To translate, the mblock
53  * includes the property address-congruence-offset, hereafter referred to as
54  * ra_to_pa.  A real address ra is a member of an lgroup if
55  *
56  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
57  *
58  * The MD is traversed, and information on all mblocks is kept in the array
59  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
60  * to, is kept in the array mpo_cpu[].
61  *
62  * This implementation makes (and verifies) the simplifying assumption that
63  * the mask bits are the same for all defined lgroups, and that all 1 bits in
64  * the mask are contiguous.  Thus the number of lgroups is bounded by the
65  * number of possible mask values, and the lgrp_handle_t is defined as the
66  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
67  * masks and values are also referred to as "home bits" in the code.
68  *
69  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
70  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
71  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
72  * home bits.  This yields the mem_node.
73  *
74  * Interfaces
75  * ----------
76  *
77  * This file exports the following entry points:
78  *
79  * plat_lgrp_init()
80  * plat_build_mem_nodes()
81  * plat_lgrp_cpu_to_hand()
82  * plat_lgrp_latency()
83  * plat_pfn_to_mem_node()
84  *	These implement the usual platform lgroup interfaces.
85  *
86  * plat_rapfn_to_papfn()
87  *	Recover the PA page coloring bits from an RA.
88  *
89  * plat_mem_node_iterator_init()
90  *	Initialize an iterator to efficiently step through pages in a mem_node.
91  *
92  * plat_mem_node_intersect_range()
93  *	Find the intersection with a mem_node.
94  */
95 
/*
 * Tunables and status.  These are not static, so they are visible outside
 * this file.
 *
 * sun4v_mpo_enable: global enable for MPO; lgrp_traverse() reports failure
 *	when it is 0.
 * sun4v_mpo_debug: when non-zero (DEBUG kernels only), MPO_DEBUG() prints.
 * sun4v_mpo_status: last message recorded by MPO_STATUS().
 */
int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/*
 * Save lgroup info from the MD.  n_lgrpnodes counts the valid entries in
 * mpo_lgroup[]; n_locality_groups is the number of distinct home values
 * found, and max_locality_groups the number of values the home mask allows.
 */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;

/*
 * Save mblocks from the MD.  mpo_mblock points at small_mpo_mblocks[] when
 * there are at most SMALL_MBLOCKS_COUNT mblocks, otherwise at a buffer
 * obtained in lgrp_traverse().
 */
#define	SMALL_MBLOCKS_COUNT	8
static 	struct	mblock_md *mpo_mblock;
static	struct 	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static	int	n_mblocks = 0;

/*
 * Save mem_node stripes calculated from mblocks and lgroups.
 * mstripesz is the size in bytes of the mem_stripes[] storage.
 */
static mem_stripe_t *mem_stripes;
static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static	int 	mstripesz = 0;
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

/* Forward declarations for helpers used before their definitions */
static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);
140 
/*
 * Debug support.
 *
 * MPO_DEBUG() prints only on DEBUG kernels and only while sun4v_mpo_debug
 * is non-zero.  It is wrapped in do/while (0) so that it behaves as a
 * single statement, e.g. as the body of an unbraced if/else.
 */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...)						      \
	do {								      \
		if (sun4v_mpo_debug)					      \
			printf(args);					      \
	} while (0)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/*
 * Record status message, viewable from mdb.
 *
 * The formatted message is stored in sun4v_mpo_status and echoed through
 * MPO_DEBUG().  The stored text is passed as a "%s" argument rather than
 * as the format string, so a '%' in the message cannot be misinterpreted.
 */
#define	MPO_STATUS(args...)						      \
	do {								      \
		(void) snprintf(sun4v_mpo_status,			      \
		    sizeof (sun4v_mpo_status), args);			      \
		MPO_DEBUG("%s", sun4v_mpo_status);			      \
	} while (0)
153 
154 /*
155  * Routine to read a uint64_t from a given md
156  */
157 static	int64_t
158 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
159 {
160 	int err = md_get_prop_val(md, node, propname, val);
161 	return (err);
162 }
163 
164 static int
165 mblock_cmp(const void *a, const void *b)
166 {
167 	struct mblock_md *m1 = (struct mblock_md *)a;
168 	struct mblock_md *m2 = (struct mblock_md *)b;
169 
170 	if (m1->base < m2->base)
171 		return (-1);
172 	else if (m1->base == m2->base)
173 		return (0);
174 	else
175 		return (1);
176 }
177 
178 static void
179 mblock_sort(struct mblock_md *mblocks, int n)
180 {
181 	extern void qsort(void *, size_t, size_t,
182 	    int (*)(const void *, const void *));
183 
184 	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
185 }
186 
187 static void
188 mpo_update_tunables(void)
189 {
190 	int i, ncpu_min;
191 
192 	/*
193 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
194 	 * this process is currently running on before considering
195 	 *  expanding threads to another lgroup.
196 	 *
197 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
198 	 *  must be loaded before expanding to it.
199 	 *
200 	 * On sun4v CMT processors, threads share a core pipeline, and
201 	 * at less than 100% utilization, best throughput is obtained by
202 	 * spreading threads across more cores, even if some are in a
203 	 * different lgroup.  Spread threads to a new lgroup if the
204 	 * current group is more than 50% loaded.  Because of virtualization,
205 	 * lgroups may have different numbers of CPUs, but the tunables
206 	 * apply to all lgroups, so find the smallest lgroup and compute
207 	 * 50% loading.
208 	 */
209 
210 	ncpu_min = NCPU;
211 	for (i = 0; i < n_lgrpnodes; i++) {
212 		int ncpu = mpo_lgroup[i].ncpu;
213 		if (ncpu != 0 && ncpu < ncpu_min)
214 			ncpu_min = ncpu;
215 	}
216 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
217 
218 	/* new home may only be half as loaded as the existing home to use it */
219 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
220 
221 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
222 }
223 
224 static mde_cookie_t
225 cpuid_to_cpunode(md_t *md, int cpuid)
226 {
227 	mde_cookie_t    rootnode, foundnode, *cpunodes;
228 	uint64_t	cpuid_prop;
229 	int 	n_cpunodes, i;
230 
231 	if (md == NULL)
232 		return (MDE_INVAL_ELEM_COOKIE);
233 
234 	rootnode = md_root_node(md);
235 	if (rootnode == MDE_INVAL_ELEM_COOKIE)
236 		return (MDE_INVAL_ELEM_COOKIE);
237 
238 	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
239 	    "fwd", &cpunodes);
240 	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
241 		goto cpuid_fail;
242 
243 	for (i = 0; i < n_cpunodes; i++) {
244 		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
245 		    &cpuid_prop))
246 			break;
247 		if (cpuid_prop == (uint64_t)cpuid) {
248 			foundnode = cpunodes[i];
249 			md_free_scan_dag(md, &cpunodes);
250 			return (foundnode);
251 		}
252 	}
253 cpuid_fail:
254 	if (n_cpunodes > 0)
255 		md_free_scan_dag(md, &cpunodes);
256 	return (MDE_INVAL_ELEM_COOKIE);
257 }
258 
259 static int
260 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
261 {
262 	mde_cookie_t *nodes;
263 	uint64_t latency, lowest_latency;
264 	uint64_t address_match, lowest_address_match;
265 	int n_lgroups, j, result = 0;
266 
267 	/* Find lgroup nodes reachable from this cpu */
268 	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
269 	    "fwd", &nodes);
270 
271 	lowest_latency = ~(0UL);
272 
273 	/* Find the lgroup node with the smallest latency */
274 	for (j = 0; j < n_lgroups; j++) {
275 		result = get_int(md, nodes[j], PROP_LG_LATENCY,
276 		    &latency);
277 		result |= get_int(md, nodes[j], PROP_LG_MATCH,
278 		    &address_match);
279 		if (result != 0) {
280 			j = -1;
281 			goto to_lgrp_done;
282 		}
283 		if (latency < lowest_latency) {
284 			lowest_latency = latency;
285 			lowest_address_match = address_match;
286 		}
287 	}
288 	for (j = 0; j < n_lgrpnodes; j++) {
289 		if ((mpo_lgroup[j].latency == lowest_latency) &&
290 		    (mpo_lgroup[j].addr_match == lowest_address_match))
291 			break;
292 	}
293 	if (j == n_lgrpnodes)
294 		j = -1;
295 
296 to_lgrp_done:
297 	if (n_lgroups > 0)
298 		md_free_scan_dag(md, &nodes);
299 	return (j);
300 }
301 
302 /* Called when DR'ing in a CPU */
303 void
304 mpo_cpu_add(int cpuid)
305 {
306 	md_t *md;
307 	mde_cookie_t cpunode;
308 
309 	int i;
310 
311 	if (n_lgrpnodes <= 0)
312 		return;
313 
314 	md = md_get_handle();
315 
316 	if (md == NULL)
317 		goto add_fail;
318 
319 	cpunode = cpuid_to_cpunode(md, cpuid);
320 	if (cpunode == MDE_INVAL_ELEM_COOKIE)
321 		goto add_fail;
322 
323 	i = mpo_cpu_to_lgroup(md, cpunode);
324 	if (i == -1)
325 		goto add_fail;
326 
327 	mpo_cpu[cpuid].lgrp_index = i;
328 	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
329 	mpo_lgroup[i].ncpu++;
330 	mpo_update_tunables();
331 	(void) md_fini_handle(md);
332 	return;
333 add_fail:
334 	panic("mpo_cpu_add: Cannot read MD");
335 }
336 
337 /* Called when DR'ing out a CPU */
338 void
339 mpo_cpu_remove(int cpuid)
340 {
341 	int i;
342 
343 	if (n_lgrpnodes <= 0)
344 		return;
345 
346 	i = mpo_cpu[cpuid].lgrp_index;
347 	mpo_lgroup[i].ncpu--;
348 	mpo_cpu[cpuid].home = 0;
349 	mpo_cpu[cpuid].lgrp_index = -1;
350 	mpo_update_tunables();
351 }
352 
353 /*
354  *
355  * Traverse the MD to determine:
356  *
357  *  Number of CPU nodes, lgrp_nodes, and mblocks
358  *  Then for each lgrp_node, obtain the appropriate data.
359  *  For each CPU, determine its home locality and store it.
360  *  For each mblock, retrieve its data and store it.
361  */
362 static	int
363 lgrp_traverse(md_t *md)
364 {
365 	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
366 	uint64_t i, j, k, o, n_nodes;
367 	uint64_t mem_lg_homeset = 0;
368 	int ret_val = 0;
369 	int result = 0;
370 	int n_cpunodes = 0;
371 	int sub_page_fix;
372 	int mblocksz = 0;
373 	size_t allocsz;
374 
375 	n_nodes = md_node_count(md);
376 
377 	if (n_nodes <= 0) {
378 		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
379 		ret_val = -1;
380 		goto fail;
381 	}
382 
383 	root = md_root_node(md);
384 
385 	if (root == MDE_INVAL_ELEM_COOKIE) {
386 		MPO_STATUS("lgrp_traverse: Root node is missing\n");
387 		ret_val = -1;
388 		goto fail;
389 	}
390 
391 	/*
392 	 * Build the Memory Nodes.  Do this before any possibility of
393 	 * bailing from this routine so we obtain ra_to_pa (needed for page
394 	 * coloring) even when there are no lgroups defined.
395 	 */
396 
397 	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
398 	    "fwd", &mblocknodes);
399 
400 	if (n_mblocks <= 0) {
401 		MPO_STATUS("lgrp_traverse: No mblock "
402 		    "nodes detected in Machine Descriptor\n");
403 		n_mblocks = 0;
404 		ret_val = -1;
405 		goto fail;
406 	}
407 	/*
408 	 * If we have a small number of mblocks we will use the space
409 	 * that we preallocated. Otherwise, we will dynamically
410 	 * allocate the space
411 	 */
412 	mblocksz = n_mblocks * sizeof (struct mblock_md);
413 	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);
414 
415 	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
416 		mpo_mblock = &small_mpo_mblocks[0];
417 		mem_stripes = &small_mem_stripes[0];
418 	} else {
419 		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
420 	/* Ensure that we dont request more space than reserved */
421 		if (allocsz > MPOBUF_SIZE) {
422 			MPO_STATUS("lgrp_traverse: Insufficient space "
423 			    "for mblock structures \n");
424 			ret_val = -1;
425 			n_mblocks = 0;
426 			goto fail;
427 		}
428 		mpo_mblock = (struct mblock_md *)
429 		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
430 		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
431 			MPO_STATUS("lgrp_traverse: Cannot allocate space "
432 			    "for mblocks \n");
433 			ret_val = -1;
434 			n_mblocks = 0;
435 			goto fail;
436 		}
437 		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
438 		mpo_heap32_bufsz = MPOBUF_SIZE;
439 
440 		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
441 	}
442 	for (i = 0, j = 0; j < n_mblocks; j++) {
443 		mpo_mblock[i].node = mblocknodes[j];
444 
445 		/* Without a base or size value we will fail */
446 		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
447 		    &mpo_mblock[i].base);
448 		if (result < 0) {
449 			MPO_STATUS("lgrp_traverse: "
450 			    "PROP_LG_BASE is missing\n");
451 			n_mblocks = 0;
452 			ret_val = -1;
453 			goto fail;
454 		}
455 
456 		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
457 		    &mpo_mblock[i].size);
458 		if (result < 0) {
459 			MPO_STATUS("lgrp_traverse: "
460 			    "PROP_LG_SIZE is missing\n");
461 			n_mblocks = 0;
462 			ret_val = -1;
463 			goto fail;
464 		}
465 
466 		result = get_int(md, mblocknodes[j],
467 		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
468 
469 		/* If we don't have an ra_pa_offset, just set it to 0 */
470 		if (result < 0)
471 			mpo_mblock[i].ra_to_pa = 0;
472 
473 		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
474 		    "ra_to_pa = %lx\n", i,
475 		    mpo_mblock[i].base,
476 		    mpo_mblock[i].size,
477 		    mpo_mblock[i].ra_to_pa);
478 
479 		/* check for unsupportable values of base and size */
480 		if (mpo_mblock[i].base >
481 		    mpo_mblock[i].base + mpo_mblock[i].size) {
482 			MPO_STATUS("lgrp_traverse: "
483 			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
484 			    "base = %lx, size = %lx",
485 			    mpo_mblock[i].base, mpo_mblock[i].size);
486 			n_mblocks = 0;
487 			ret_val = -1;
488 			goto fail;
489 		}
490 
491 		/* eliminate size==0 blocks */
492 		if (mpo_mblock[i].size != 0) {
493 			i++;
494 		}
495 	}
496 
497 	if (i == 0) {
498 		MPO_STATUS("lgrp_traverse: "
499 		    "No non-empty mblock nodes were found "
500 		    "in the Machine Descriptor\n");
501 		n_mblocks = 0;
502 		ret_val = -1;
503 		goto fail;
504 	}
505 	ASSERT(i <= n_mblocks);
506 	n_mblocks = i;
507 
508 	/* Must sort mblocks by address for mem_node_iterator_init() */
509 	mblock_sort(mpo_mblock, n_mblocks);
510 
511 	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
512 
513 	/* Page coloring hook is required so we can iterate through mnodes */
514 	if (&page_next_pfn_for_color_cpu == NULL) {
515 		MPO_STATUS("lgrp_traverse: No page coloring support\n");
516 		ret_val = -1;
517 		goto fail;
518 	}
519 
520 	/* Global enable for mpo */
521 	if (sun4v_mpo_enable == 0) {
522 		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
523 		ret_val = -1;
524 		goto fail;
525 	}
526 
527 	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
528 	    "fwd", &lgrpnodes);
529 
530 	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
531 		MPO_STATUS("lgrp_traverse: No Lgroups\n");
532 		ret_val = -1;
533 		goto fail;
534 	}
535 
536 	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
537 
538 	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
539 		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
540 		    "in MD\n");
541 		ret_val = -1;
542 		goto fail;
543 	}
544 
545 	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
546 	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
547 	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
548 	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
549 	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
550 	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
551 
552 	for (i = 0; i < n_lgrpnodes; i++) {
553 		mpo_lgroup[i].node = lgrpnodes[i];
554 		mpo_lgroup[i].id = i;
555 		mpo_lgroup[i].ncpu = 0;
556 		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
557 		    &mpo_lgroup[i].addr_mask);
558 		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
559 		    &mpo_lgroup[i].addr_match);
560 
561 		/*
562 		 * If either the mask or match properties are missing, set to 0
563 		 */
564 		if (result < 0) {
565 			mpo_lgroup[i].addr_mask = 0;
566 			mpo_lgroup[i].addr_match = 0;
567 		}
568 
569 		/* Set latency to 0 if property not present */
570 
571 		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
572 		    &mpo_lgroup[i].latency);
573 		if (result < 0)
574 			mpo_lgroup[i].latency = 0;
575 	}
576 
577 	/*
578 	 * Sub-page level interleave is not yet supported.  Check for it,
579 	 * and remove sub-page interleaved lgroups from mpo_lgroup and
580 	 * n_lgrpnodes.  If no lgroups are left, return.
581 	 */
582 
583 	sub_page_fix = fix_interleave();
584 	if (n_lgrpnodes == 0) {
585 		ret_val = -1;
586 		goto fail;
587 	}
588 
589 	/* Ensure that all of the addr_mask values are the same */
590 
591 	for (i = 0; i < n_lgrpnodes; i++) {
592 		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
593 			MPO_STATUS("lgrp_traverse: "
594 			    "addr_mask values are not the same\n");
595 			ret_val = -1;
596 			goto fail;
597 		}
598 	}
599 
600 	/*
601 	 * Ensure that all lgrp nodes see all the mblocks. However, if
602 	 * sub-page interleave is being fixed, they do not, so skip
603 	 * the check.
604 	 */
605 
606 	if (sub_page_fix == 0) {
607 		for (i = 0; i < n_lgrpnodes; i++) {
608 			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
609 			    PROP_LG_MBLOCK, "fwd", &nodes);
610 			md_free_scan_dag(md, &nodes);
611 			if (j != n_mblocks) {
612 				MPO_STATUS("lgrp_traverse: "
613 				    "sub-page interleave is being fixed\n");
614 				ret_val = -1;
615 				goto fail;
616 			}
617 		}
618 	}
619 
620 	/*
621 	 * Use the address mask from the first lgroup node
622 	 * to establish our home_mask.
623 	 */
624 	home_mask = mpo_lgroup[0].addr_mask;
625 	home_mask_pfn = btop(home_mask);
626 	home_mask_shift = lowbit(home_mask) - 1;
627 	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
628 	mnode_pages = btop(1ULL << home_mask_shift);
629 
630 	/*
631 	 * How many values are possible in home mask?  Assume the mask
632 	 * bits are contiguous.
633 	 */
634 	max_locality_groups =
635 	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
636 
637 	/* Now verify the home mask bits are contiguous */
638 
639 	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
640 		MPO_STATUS("lgrp_traverse: "
641 		    "home mask bits are not contiguous\n");
642 		ret_val = -1;
643 		goto fail;
644 	}
645 
646 	/* Record all of the home bits */
647 
648 	for (i = 0; i < n_lgrpnodes; i++) {
649 		HOMESET_ADD(mem_lg_homeset,
650 		    mpo_lgroup[i].addr_match >> home_mask_shift);
651 	}
652 
653 	/* Count the number different "home"  mem_lg's we've discovered */
654 
655 	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
656 
657 	/* If we have only 1 locality group then we can exit */
658 	if (n_locality_groups == 1) {
659 		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
660 		ret_val = -1;
661 		goto fail;
662 	}
663 
664 	/*
665 	 * Set the latencies.  A CPU's lgroup is defined by the lowest
666 	 * latency found.  All other memory is considered remote, and the
667 	 * remote latency is represented by the highest latency found.
668 	 * Thus hierarchical lgroups, if any, are approximated by a
669 	 * two level scheme.
670 	 *
671 	 * The Solaris MPO framework by convention wants to see latencies
672 	 * in units of nano-sec/10. In the MD, the units are defined to be
673 	 * pico-seconds.
674 	 */
675 
676 	lower_latency = mpo_lgroup[0].latency;
677 	higher_latency = mpo_lgroup[0].latency;
678 
679 	for (i = 1; i < n_lgrpnodes; i++) {
680 		if (mpo_lgroup[i].latency < lower_latency) {
681 			lower_latency = mpo_lgroup[i].latency;
682 		}
683 		if (mpo_lgroup[i].latency > higher_latency) {
684 			higher_latency = mpo_lgroup[i].latency;
685 		}
686 	}
687 	lower_latency /= 10000;
688 	higher_latency /= 10000;
689 
690 	/* Clear our CPU data */
691 
692 	for (i = 0; i < NCPU; i++) {
693 		mpo_cpu[i].home = 0;
694 		mpo_cpu[i].lgrp_index = -1;
695 	}
696 
697 	/* Build the CPU nodes */
698 	for (i = 0; i < n_cpunodes; i++) {
699 
700 		/* Read in the lgroup nodes */
701 		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
702 		if (result < 0) {
703 			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
704 			ret_val = -1;
705 			goto fail;
706 		}
707 
708 		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
709 		if (o == -1) {
710 			ret_val = -1;
711 			goto fail;
712 		}
713 		mpo_cpu[k].lgrp_index = o;
714 		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
715 		mpo_lgroup[o].ncpu++;
716 	}
717 	/* Validate that no large pages cross mnode boundaries. */
718 	if (valid_pages(md, cpunodes[0]) == 0) {
719 		ret_val = -1;
720 		goto fail;
721 	}
722 
723 fail:
724 	/* MD cookies are no longer valid; ensure they are not used again. */
725 	for (i = 0; i < n_mblocks; i++)
726 		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
727 	for (i = 0; i < n_lgrpnodes; i++)
728 		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
729 
730 	if (n_cpunodes > 0)
731 		md_free_scan_dag(md, &cpunodes);
732 	if (n_lgrpnodes > 0)
733 		md_free_scan_dag(md, &lgrpnodes);
734 	if (n_mblocks > 0)
735 		md_free_scan_dag(md, &mblocknodes);
736 	else
737 		panic("lgrp_traverse: No memory blocks found");
738 
739 	if (ret_val == 0)
740 		MPO_STATUS("MPO feature is enabled.\n");
741 
742 	return (ret_val);
743 }
744 
745 /*
746  *  Determine the number of unique mem_lg's present in our system
747  */
748 static	int
749 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
750 {
751 	int homeid;
752 	int count = 0;
753 
754 	/*
755 	 * Scan the "home" bits of the mem_lgs, count
756 	 * the number that are unique.
757 	 */
758 
759 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
760 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
761 			count++;
762 		}
763 	}
764 
765 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
766 	    mem_lg_homeset);
767 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
768 
769 	/* Default must be at least one */
770 	if (count == 0)
771 		count = 1;
772 
773 	return (count);
774 }
775 
776 /*
777  * Platform specific lgroup initialization
778  */
779 void
780 plat_lgrp_init(void)
781 {
782 	md_t *md;
783 	int rc;
784 
785 	/* Get the Machine Descriptor handle */
786 
787 	md = md_get_handle();
788 
789 	/* If not, we cannot continue */
790 
791 	if (md == NULL) {
792 		panic("cannot access machine descriptor\n");
793 	} else {
794 		rc = lgrp_traverse(md);
795 		(void) md_fini_handle(md);
796 	}
797 
798 	/*
799 	 * If we can't process the MD for lgroups then at least let the
800 	 * system try to boot.  Assume we have one lgroup so that
801 	 * when plat_build_mem_nodes is called, it will attempt to init
802 	 * an mnode based on the supplied memory segment.
803 	 */
804 
805 	if (rc == -1) {
806 		home_mask_pfn = 0;
807 		max_locality_groups = 1;
808 		n_locality_groups = 1;
809 		return;
810 	}
811 
812 	mem_node_pfn_shift = 0;
813 	mem_node_physalign = 0;
814 
815 	/* Use lgroup-aware TSB allocations */
816 	tsb_lgrp_affinity = 1;
817 
818 	/* Require that a home lgroup have some memory to be chosen */
819 	lgrp_mem_free_thresh = 1;
820 
821 	/* Standard home-on-next-touch policy */
822 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
823 
824 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
825 	lgrp_load_thresh = UINT32_MAX;
826 
827 	mpo_update_tunables();
828 }
829 
830 /*
831  *  Helper routine for debugging calls to mem_node_add_slice()
832  */
833 static	void
834 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
835 {
836 #if defined(DEBUG) && !defined(lint)
837 	static int slice_count = 0;
838 
839 	slice_count++;
840 	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
841 	    slice_count, basepfn, endpfn);
842 #endif
843 	mem_node_add_slice(basepfn, endpfn);
844 }
845 
846 /*
847  *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
848  */
849 static	void
850 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
851 {
852 	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
853 	    "mnode index: %d\n", plathand, mnode);
854 	plat_assign_lgrphand_to_mem_node(plathand, mnode);
855 }
856 
857 /*
858  * plat_build_mem_nodes()
859  *
860  * Define the mem_nodes based on the modified boot memory list,
861  * or based on info read from the MD in plat_lgrp_init().
862  *
863  * When the home mask lies in the middle of the address bits (as it does on
864  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
865  * it is striped across an mblock in a repeating pattern of contiguous memory
866  * followed by a gap.  The stripe width is the size of the contiguous piece.
867  * The stride is the distance from the start of one contiguous piece to the
868  * start of the next.  The gap is thus stride - stripe_width.
869  *
870  * The stripe of an mnode that falls within an mblock is described by the type
871  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
872  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
873  * this array is predetermined.  The mem_stripe_t that describes mnode m
874  * within mpo_mblock[i] is stored at
875  *	 mem_stripes[ m + i * max_locality_groups ]
876  *
877  * max_locality_groups is the total number of possible locality groups,
878  * as defined by the size of the home mask, even if the memory assigned
879  * to the domain is small and does not cover all the lgroups.  Thus some
880  * mem_stripe_t's may be empty.
881  *
882  * The members of mem_stripe_t are:
883  *	physbase: First valid page in mem_node in the corresponding mblock
884  *	physmax: Last valid page in mem_node in mblock
885  *	offset:  The full stripe width starts at physbase - offset.
886  *	    Thus if offset is non-zero, this mem_node starts in the middle
887  *	    of a stripe width, and the second full stripe starts at
888  *	    physbase - offset + stride.  (even though physmax may fall in the
889  *	    middle of a stripe width, we do not save the ending fragment size
890  *	    in this data structure.)
891  *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
892  *
893  *	The stripe width is kept in the global mnode_pages.
894  *	The stride is kept in the global mnode_stride.
895  *	All the above use pfn's as the unit.
896  *
897  * As an example, the memory layout for a domain with 2 mblocks and 4
898  * mem_nodes 0,1,2,3 could look like this:
899  *
900  *	123012301230 ...	012301230123 ...
901  *	  mblock 0		  mblock 1
902  */
903 
904 void
905 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
906 {
907 	lgrp_handle_t lgrphand, lgrp_start;
908 	int i, mnode, elem;
909 	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
910 	uint64_t stripe, frag, remove;
911 	mem_stripe_t *ms;
912 
913 	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
914 	max_mem_nodes = max_locality_groups;
915 
916 	/* Check for non-MPO sun4v platforms */
917 	if (n_locality_groups <= 1) {
918 		ASSERT(n_locality_groups == 1);
919 		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
920 		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
921 		for (elem = 0; elem < nelems; list++, elem++) {
922 			base = list->addr;
923 			len = list->size;
924 
925 			mpo_mem_node_add_slice(btop(base),
926 			    btop(base + len - 1));
927 		}
928 		mem_node_pfn_shift = 0;
929 		mem_node_physalign = 0;
930 
931 		if (n_mblocks == 1) {
932 			n_mem_stripes = 0;
933 		} else {
934 			n_mem_stripes = n_mblocks;
935 			bzero(mem_stripes, mstripesz);
936 			for (i = 0; i < n_mblocks; i++) {
937 				base = mpo_mblock[i].base;
938 				end = base + mpo_mblock[i].size;
939 				ASSERT(end > base);
940 				mem_stripes[i].exists = 1;
941 				mpo_mblock[i].base_pfn = btop(base);
942 				mpo_mblock[i].end_pfn = btop(end - 1);
943 				mem_stripes[i].physbase =
944 				    mpo_mblock[i].base_pfn;
945 				mem_stripes[i].physmax = mpo_mblock[i].end_pfn;
946 			}
947 		}
948 		return;
949 	}
950 
951 	bzero(mem_stripes, mstripesz);
952 	stripe = ptob(mnode_pages);
953 	stride = max_locality_groups * stripe;
954 
955 	/* Save commonly used values in globals */
956 	mnode_stride = btop(stride);
957 	n_mem_stripes = max_locality_groups * n_mblocks;
958 	stripe_shift = highbit(max_locality_groups) - 1;
959 
960 	for (i = 0; i < n_mblocks; i++) {
961 		base = mpo_mblock[i].base;
962 		end = mpo_mblock[i].base + mpo_mblock[i].size;
963 		ra_to_pa = mpo_mblock[i].ra_to_pa;
964 		mpo_mblock[i].base_pfn = btop(base);
965 		mpo_mblock[i].end_pfn = btop(end - 1);
966 
967 		/* Find the offset from the prev stripe boundary in PA space. */
968 		offset = (base + ra_to_pa) & (stripe - 1);
969 
970 		/* Set the next stripe boundary. */
971 		stripe_end = base - offset + stripe;
972 
973 		lgrp_start = (((base + ra_to_pa) & home_mask) >>
974 		    home_mask_shift);
975 		lgrphand = lgrp_start;
976 
977 		/*
978 		 * Loop over all lgroups covered by the mblock, creating a
979 		 * stripe for each.  Stop when lgrp_start is visited again.
980 		 */
981 		do {
982 			/* mblock may not span all lgroups */
983 			if (base >= end)
984 				break;
985 
986 			mnode = lgrphand;
987 			ASSERT(mnode < max_mem_nodes);
988 
989 			/*
990 			 * Calculate the size of the fragment that does not
991 			 * belong to the mnode in the last partial stride.
992 			 */
993 			frag = (end - (base - offset)) & (stride - 1);
994 			if (frag == 0) {
995 				/* remove the gap */
996 				remove = stride - stripe;
997 			} else if (frag < stripe) {
998 				/* fragment fits in stripe; keep it all */
999 				remove = 0;
1000 			} else {
1001 				/* fragment is large; trim after whole stripe */
1002 				remove = frag - stripe;
1003 			}
1004 
1005 			ms = &mem_stripes[i * max_locality_groups + mnode];
1006 			ms->physbase = btop(base);
1007 			ms->physmax = btop(end - 1 - remove);
1008 			ms->offset = btop(offset);
1009 			ms->exists = 1;
1010 
1011 			/*
1012 			 * If we have only 1 lgroup and multiple mblocks,
1013 			 * then we have already established our lgrp handle
1014 			 * to mem_node and mem_node_config values above.
1015 			 */
1016 			if (n_locality_groups > 1) {
1017 				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
1018 				    mnode);
1019 				mpo_mem_node_add_slice(ms->physbase,
1020 				    ms->physmax);
1021 			}
1022 			base = stripe_end;
1023 			stripe_end += stripe;
1024 			offset = 0;
1025 			lgrphand = (((base + ra_to_pa) & home_mask) >>
1026 			    home_mask_shift);
1027 		} while (lgrphand != lgrp_start);
1028 	}
1029 
1030 	/*
1031 	 * Indicate to vm_pagelist that the hpm_counters array
1032 	 * should be shared because the ranges overlap.
1033 	 */
1034 	if (max_mem_nodes > 1) {
1035 		interleaved_mnodes = 1;
1036 	}
1037 }
1038 
1039 /*
1040  * Return the locality group value for the supplied processor
1041  */
1042 lgrp_handle_t
1043 plat_lgrp_cpu_to_hand(processorid_t id)
1044 {
1045 	if (n_locality_groups > 1) {
1046 		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
1047 	} else {
1048 		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
1049 	}
1050 }
1051 
1052 int
1053 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
1054 {
1055 	/*
1056 	 * Return min remote latency when there are more than two lgroups
1057 	 * (root and child) and getting latency between two different lgroups
1058 	 * or root is involved.
1059 	 */
1060 	if (lgrp_optimizations() && (from != to ||
1061 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
1062 		return ((int)higher_latency);
1063 	} else {
1064 		return ((int)lower_latency);
1065 	}
1066 }
1067 
1068 int
1069 plat_pfn_to_mem_node(pfn_t pfn)
1070 {
1071 	int i, mnode;
1072 	pfn_t ra_to_pa_pfn;
1073 	struct mblock_md *mb;
1074 
1075 	if (n_locality_groups <= 1)
1076 		return (0);
1077 
1078 	/*
1079 	 * The mnode is defined to be 1:1 with the lgroup handle, which
1080 	 * is taken from from the home bits.  Find the mblock in which
1081 	 * the pfn falls to get the ra_to_pa adjustment, and extract
1082 	 * the home bits.
1083 	 */
1084 	mb = &mpo_mblock[0];
1085 	for (i = 0; i < n_mblocks; i++) {
1086 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1087 			ra_to_pa_pfn = btop(mb->ra_to_pa);
1088 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1089 			    home_mask_pfn_shift);
1090 			ASSERT(mnode < max_mem_nodes);
1091 			return (mnode);
1092 		}
1093 		mb++;
1094 	}
1095 
1096 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1097 	return (pfn);
1098 }
1099 
1100 /*
1101  * plat_rapfn_to_papfn
1102  *
1103  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1104  * and home mask bits are correct.  The upper bits do not necessarily
1105  * match the actual PA, however.
1106  */
1107 pfn_t
1108 plat_rapfn_to_papfn(pfn_t pfn)
1109 {
1110 	int i;
1111 	pfn_t ra_to_pa_pfn;
1112 	struct mblock_md *mb;
1113 
1114 	ASSERT(n_mblocks > 0);
1115 	if (n_mblocks == 1)
1116 		return (pfn + base_ra_to_pa_pfn);
1117 
1118 	/*
1119 	 * Find the mblock in which the pfn falls
1120 	 * in order to get the ra_to_pa adjustment.
1121 	 */
1122 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1123 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1124 			ra_to_pa_pfn = btop(mb->ra_to_pa);
1125 			return (pfn + ra_to_pa_pfn);
1126 		}
1127 	}
1128 
1129 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1130 	return (pfn);
1131 }
1132 
1133 /*
1134  * plat_mem_node_iterator_init()
1135  *      Initialize cookie "it" to iterate over pfn's in an mnode.  There is
1136  *      no additional iterator function.  The caller uses the info from
1137  *      the iterator structure directly.
1138  *
1139  *      pfn: starting pfn.
1140  *      mnode: desired mnode.
1141  *	szc: desired page size.
1142  *      init:
1143  *          if 1, start a new traversal, initialize "it", find first
1144  *              mblock containing pfn, and return its starting pfn
1145  *              within the mnode.
1146  *          if 0, continue the previous traversal using passed-in data
1147  *              from "it", advance to the next mblock, and return its
1148  *              starting pfn within the mnode.
1149  *      it: returns readonly data to the caller; see below.
1150  *
1151  *	The input pfn must be aligned for the page size szc.
1152  *
1153  *      Returns: starting pfn for the iteration for the mnode/mblock,
1154  *	    which is aligned according to the page size,
1155  *          or returns (pfn_t)(-1) if the input pfn lies past the last
1156  *          valid pfn of the mnode.
1157  *      Returns misc values in the "it" struct that allows the caller
1158  *          to advance the pfn within an mblock using address arithmetic;
1159  *          see definition of mem_node_iterator_t in vm_dep.h.
1160  *          When the caller calculates a pfn that is greater than the
1161  *          returned value it->mi_mblock_end, the caller should again
1162  *          call plat_mem_node_iterator_init, passing init=0.
1163  */
1164 pfn_t
1165 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1166     mem_node_iterator_t *it, int init)
1167 {
1168 	int i;
1169 	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1170 	struct mblock_md *mblock;
1171 	pfn_t base, end;
1172 	mem_stripe_t *ms;
1173 	uint64_t szcpagesize;
1174 
1175 	ASSERT(it != NULL);
1176 	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1177 	ASSERT(n_mblocks > 0);
1178 	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1179 
1180 	if (init) {
1181 		it->mi_last_mblock = 0;
1182 		it->mi_init = 1;
1183 	}
1184 
1185 	/* Check if mpo is not enabled and we only have one mblock */
1186 	if (n_locality_groups == 1 && n_mblocks == 1) {
1187 		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt))
1188 			return ((pfn_t)-1);
1189 		it->mi_mnode = mnode;
1190 		it->mi_ra_to_pa = base_ra_to_pa_pfn;
1191 		it->mi_mnode_pfn_mask = 0;
1192 		it->mi_mnode_pfn_shift = 0;
1193 		it->mi_mnode_mask = 0;
1194 		it->mi_mblock_base = mem_node_config[mnode].physbase;
1195 		it->mi_mblock_end = mem_node_config[mnode].physmax;
1196 		if (pfn < it->mi_mblock_base)
1197 			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1198 		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1199 			pfn = (pfn_t)-1;
1200 		return (pfn);
1201 	}
1202 
1203 	/* init=1 means begin iterator, init=0 means continue */
1204 	if (init == 1) {
1205 		i = 0;
1206 	} else {
1207 		ASSERT(it->mi_last_mblock < n_mblocks);
1208 		i = it->mi_last_mblock;
1209 		ASSERT(pfn >
1210 		    mem_stripes[i * max_locality_groups + mnode].physmax);
1211 		if (++i == n_mblocks)
1212 			return ((pfn_t)-1);
1213 	}
1214 
1215 	/*
1216 	 * Find mblock that contains pfn for mnode's stripe, or first such an
1217 	 * mblock after pfn, else pfn is out of bound and we'll return -1.
1218 	 * mblocks and stripes are sorted in ascending address order.
1219 	 */
1220 	szcpagesize = szcpgcnt << PAGESHIFT;
1221 	for (; i < n_mblocks; i++) {
1222 		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1223 			continue;
1224 		ms = &mem_stripes[i * max_locality_groups + mnode];
1225 		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1226 		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1227 		    ms->physmax)
1228 			break;
1229 	}
1230 	if (i == n_mblocks) {
1231 		it->mi_last_mblock = i - 1;
1232 		return ((pfn_t)-1);
1233 	}
1234 
1235 	it->mi_last_mblock = i;
1236 
1237 	mblock = &mpo_mblock[i];
1238 	base = ms->physbase;
1239 	end = ms->physmax;
1240 
1241 	it->mi_mnode = mnode;
1242 	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1243 	it->mi_mblock_base = base;
1244 	it->mi_mblock_end = end;
1245 	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1246 	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1247 	it->mi_mnode_mask = max_locality_groups - 1;
1248 	if (pfn < base) {
1249 		pfn = P2ROUNDUP(base, szcpgcnt);
1250 		ASSERT(pfn + szcpgcnt - 1 <= end);
1251 	}
1252 	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1253 	return (pfn);
1254 }
1255 
1256 /*
1257  * plat_mem_node_intersect_range()
1258  *
1259  * Find the intersection between a memnode and a range of pfn's.
1260  */
1261 void
1262 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1263     int mnode, pgcnt_t *npages_out)
1264 {
1265 	pfn_t offset, len, hole, base, end, test_end, frag;
1266 	pfn_t nearest;
1267 	mem_stripe_t *ms;
1268 	int i, npages;
1269 
1270 	*npages_out = 0;
1271 
1272 	if (!mem_node_config[mnode].exists || test_len == 0)
1273 		return;
1274 
1275 	base = mem_node_config[mnode].physbase;
1276 	end = mem_node_config[mnode].physmax;
1277 
1278 	test_end = test_base + test_len - 1;
1279 	if (end < test_base || base > test_end)
1280 		return;
1281 
1282 	if (n_locality_groups == 1) {
1283 		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1284 		return;
1285 	}
1286 
1287 	hole = mnode_stride - mnode_pages;
1288 	npages = 0;
1289 
1290 	/*
1291 	 * Iterate over all the stripes for this mnode (one per mblock),
1292 	 * find the intersection with each, and accumulate the intersections.
1293 	 *
1294 	 * Determing the intersection with a stripe is tricky.  If base or end
1295 	 * fall outside the mem_node bounds, round them to physbase/physmax of
1296 	 * mem_node.  If base or end fall in a gap, round them to start of
1297 	 * nearest stripe.  If they fall within a stripe, keep base or end,
1298 	 * but calculate the fragment size that should be excluded from the
1299 	 * stripe.  Calculate how many strides fall in the adjusted range,
1300 	 * multiply by stripe width, and add the start and end fragments.
1301 	 */
1302 
1303 	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1304 		ms = &mem_stripes[i];
1305 		if (ms->exists &&
1306 		    test_base <= (end = ms->physmax) &&
1307 		    test_end >= (base = ms->physbase)) {
1308 
1309 			offset = ms->offset;
1310 
1311 			if (test_base > base) {
1312 				/* Round test_base to next multiple of stride */
1313 				len = P2ROUNDUP(test_base - (base - offset),
1314 				    mnode_stride);
1315 				nearest = base - offset + len;
1316 				/*
1317 				 * Compute distance from test_base to the
1318 				 * stride boundary to see if test_base falls
1319 				 * in the stripe or in the hole.
1320 				 */
1321 				if (nearest - test_base > hole) {
1322 					/*
1323 					 * test_base lies in stripe,
1324 					 * and offset should be excluded.
1325 					 */
1326 					offset = test_base -
1327 					    (nearest - mnode_stride);
1328 					base = test_base;
1329 				} else {
1330 					/* round up to next stripe start */
1331 					offset = 0;
1332 					base = nearest;
1333 					if (base > end)
1334 						continue;
1335 				}
1336 
1337 			}
1338 
1339 			if (test_end < end)
1340 				end = test_end;
1341 			end++;		/* adjust to an exclusive bound */
1342 
1343 			/* Round end to next multiple of stride */
1344 			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1345 			nearest = (base - offset) + len;
1346 			if (nearest - end <= hole) {
1347 				/* end falls in hole, use entire last stripe */
1348 				frag = 0;
1349 			} else {
1350 				/* end falls in stripe, compute fragment */
1351 				frag = nearest - hole - end;
1352 			}
1353 
1354 			len = (len >> stripe_shift) - offset - frag;
1355 			npages += len;
1356 		}
1357 	}
1358 
1359 	*npages_out = npages;
1360 }
1361 
1362 /*
1363  * valid_pages()
1364  *
1365  * Return 1 if pages are valid and do not cross mnode boundaries
1366  * (which would break page free list assumptions), and 0 otherwise.
1367  */
1368 
1369 #define	MNODE(pa)	\
1370 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1371 
1372 static int
1373 valid_pages(md_t *md, mde_cookie_t cpu0)
1374 {
1375 	int i, max_szc;
1376 	uint64_t last_page_base, szc_mask;
1377 	uint64_t max_page_len, max_coalesce_len;
1378 	struct mblock_md *mb = mpo_mblock;
1379 
1380 	/*
1381 	 * Find the smaller of the largest page possible and supported.
1382 	 * mmu_exported_pagesize_mask is not yet initialized, so read
1383 	 * it from the MD.  Apply minimal fixups in case of broken MDs
1384 	 * to get a sane mask.
1385 	 */
1386 
1387 	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1388 		szc_mask = 0;
1389 	szc_mask |=  (1 << TTE4M);	/* largest in sun4v default support */
1390 	max_szc = highbit(szc_mask) - 1;
1391 	if (max_szc > TTE256M)
1392 		max_szc = TTE256M;
1393 	max_page_len = TTEBYTES(max_szc);
1394 
1395 	/*
1396 	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1397 	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1398 	 * within one mnode to use MPO.
1399 	 */
1400 	max_coalesce_len = TTEBYTES(TTE256M);
1401 	ASSERT(max_coalesce_len >= max_page_len);
1402 
1403 	if (ptob(mnode_pages) < max_coalesce_len) {
1404 		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1405 		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1406 		return (0);
1407 	}
1408 
1409 	for (i = 0; i < n_mblocks; i++) {
1410 		uint64_t base = mb->base;
1411 		uint64_t end = mb->base + mb->size - 1;
1412 		uint64_t ra_to_pa = mb->ra_to_pa;
1413 
1414 		/*
1415 		 * If mblock is smaller than the max page size, then
1416 		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1417 		 * not span mnodes.
1418 		 */
1419 		if (mb->size < max_page_len) {
1420 			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1421 				MPO_STATUS("Small mblock spans mnodes; "
1422 				    "MPO disabled: base = %lx, end = %lx, "
1423 				    "ra2pa = %lx\n", base, end, ra_to_pa);
1424 				return (0);
1425 			}
1426 		} else {
1427 			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1428 			uint64_t pa_base = base + ra_to_pa;
1429 			if ((base & (max_coalesce_len - 1)) !=
1430 			    (pa_base & (max_coalesce_len - 1))) {
1431 				MPO_STATUS("bad page alignment; MPO disabled: "
1432 				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1433 				    base, pa_base, max_coalesce_len);
1434 				return (0);
1435 			}
1436 		}
1437 
1438 		/*
1439 		 * Find start of last large page in mblock in RA space.
1440 		 * If page extends into the next mblock, verify the
1441 		 * mnode does not change.
1442 		 */
1443 		last_page_base = P2ALIGN(end, max_coalesce_len);
1444 		if (i + 1 < n_mblocks &&
1445 		    last_page_base + max_coalesce_len > mb[1].base &&
1446 		    MNODE(last_page_base + ra_to_pa) !=
1447 		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1448 			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1449 			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1450 			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1451 			    mb[1].ra_to_pa, max_coalesce_len);
1452 			return (0);
1453 		}
1454 
1455 		mb++;
1456 	}
1457 	return (1);
1458 }
1459 
1460 
1461 /*
1462  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1463  * if any, and remove them.  This yields a config where the "coarse
1464  * grained" lgroups cover all of memory, even though part of that memory
1465  * is fine grain interleaved and does not deliver a purely local memory
1466  * latency.
1467  *
1468  * This function reads and modifies the globals:
1469  *	mpo_lgroup[], n_lgrpnodes
1470  *
1471  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1472  */
1473 
1474 static int
1475 fix_interleave(void)
1476 {
1477 	int i, j;
1478 	uint64_t mask = 0;
1479 
1480 	j = 0;
1481 	for (i = 0; i < n_lgrpnodes; i++) {
1482 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1483 			/* remove this lgroup */
1484 			mask = mpo_lgroup[i].addr_mask;
1485 		} else {
1486 			mpo_lgroup[j++] = mpo_lgroup[i];
1487 		}
1488 	}
1489 	n_lgrpnodes = j;
1490 
1491 	if (mask != 0)
1492 		MPO_STATUS("sub-page interleave %lx found; "
1493 		    "removing lgroup.\n", mask);
1494 
1495 	return (mask != 0);
1496 }
1497