xref: /illumos-gate/usr/src/uts/sun4v/os/mpo.c (revision 24f5a37652e188ebdcdd6da454511686935025df)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/sysmacros.h>
29 #include <sys/machsystm.h>
30 #include <sys/machparam.h>
31 #include <sys/cmn_err.h>
32 #include <sys/stat.h>
33 #include <sys/mach_descrip.h>
34 #include <sys/memnode.h>
35 #include <sys/mdesc.h>
36 #include <sys/mpo.h>
37 #include <vm/page.h>
38 #include <vm/vm_dep.h>
39 #include <vm/hat_sfmmu.h>
40 #include <sys/promif.h>
41 
42 /*
43  * MPO and the sun4v memory representation
44  * ---------------------------------------
45  *
46  * Latency groups are defined in the sun4v architecture by memory-latency-group
47  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
48  * tie together cpu nodes and mblock nodes, and contain mask and match
49  * properties that identify the portion of an mblock that belongs to the
50  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
51  * but an mblock defines Real Addresses (RA).  To translate, the mblock
52  * includes the property address-congruence-offset, hereafter referred to as
53  * ra_to_pa.  A real address ra is a member of an lgroup if
54  *
55  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56  *
57  * The MD is traversed, and information on all mblocks is kept in the array
58  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
59  * to, is kept in the array mpo_cpu[].
60  *
61  * This implementation makes (and verifies) the simplifying assumption that
62  * the mask bits are the same for all defined lgroups, and that all 1 bits in
63  * the mask are contiguous.  Thus the number of lgroups is bounded by the
64  * number of possible mask values, and the lgrp_handle_t is defined as the
65  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
66  * masks and values are also referred to as "home bits" in the code.
67  *
68  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71  * home bits.  This yields the mem_node.
72  *
73  * Interfaces
74  * ----------
75  *
76  * This file exports the following entry points:
77  *
78  * plat_lgrp_init()
79  * plat_build_mem_nodes()
80  * plat_lgrp_cpu_to_hand()
81  * plat_lgrp_latency()
82  * plat_pfn_to_mem_node()
83  *	These implement the usual platform lgroup interfaces.
84  *
85  * plat_rapfn_to_papfn()
86  *	Recover the PA page coloring bits from an RA.
87  *
88  * plat_mem_node_iterator_init()
89  *	Initialize an iterator to efficiently step through pages in a mem_node.
90  *
91  * plat_mem_node_intersect_range()
92  *	Find the intersection with a mem_node.
93  *
94  * plat_slice_add()
95  * plat_slice_del()
96  *	Platform hooks to add/delete a pfn range.
97  *
98  * Internal Organization
99  * ---------------------
100  *
101  * A number of routines are used by both the boot and DR code paths to
102  * (re)build the appropriate MPO structures.
103  *
104  * mblock_alloc()
105  *	Allocate memory for mblocks and stripes as
106  *	appropriate for boot or memory DR.
107  *
108  * mblock_free()
109  *	Free memory allocated by mblock_alloc.
110  *
111  * mblock_update()
112  *	Build mblocks based on mblock nodes read from the MD.
113  *
114  * mblock_update_add()
115  *	Rebuild mblocks after a memory DR add operation.
116  *
117  * mblock_update_del()
118  *	Rebuild mblocks after a memory DR delete operation.
119  *
120  * mblock_install()
121  *	Install mblocks as the new configuration.
122  *
123  * mstripe_update()
124  *	Build stripes based on mblocks.
125  *
126  * mnode_update()
127  *	Call memnode layer to add/del a pfn range, based on stripes.
128  *
129  * The platform interfaces allocate all memory required for the
130  * particular update first, block access to the MPO structures
131  * while they are updated, and free old structures after the update.
132  */
133 
/* Nonzero enables MPO; lgrp_traverse() clears it when setup fails. */
int	sun4v_mpo_enable = 1;
/* Nonzero makes MPO_DEBUG() print (DEBUG, non-lint builds only). */
int	sun4v_mpo_debug = 0;
/* Most recent message recorded by MPO_STATUS(). */
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
/* Number of memory-latency-group nodes recorded in mpo_lgroup[]. */
static	int	n_lgrpnodes = 0;
/* Number of distinct home values found among the lgroup nodes. */
static	int	n_locality_groups = 0;
/* Number of home values representable by the home mask. */
static	int	max_locality_groups = 0;
static	int	szc_mask0 = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static 	struct	mblock_md *mpo_mblock;
static	struct 	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static	int	n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/*
 * Save lowest and highest latencies found across all lgroups.
 * lgrp_traverse() divides the MD values by 10000 before storing them.
 */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
static	int	mpo_genid;		/* config gen; updated by mem DR */
static	mpo_config_t mpo_config;	/* current mblocks and stripes */

/* Kind of update requested from mblock_alloc() and mnode_update(). */
typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
178 
/* Forward declarations of file-local helpers used before their definition. */
static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

/* mblock/stripe/mnode (re)build helpers; see "Internal Organization" above. */
static int  mblock_alloc(mpo_config_t *, update_t, int nmblocks);
static void mblock_install(mpo_config_t *);
static void mblock_free(mpo_config_t *);
static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
static void mblock_update_add(mpo_config_t *);
static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
static void mstripe_update(mpo_config_t *);
static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
191 
/*
 * Debug support
 *
 * Both macros expand to nothing unless DEBUG is defined and lint is not.
 */
#if defined(DEBUG) && !defined(lint)
/*
 * Assert that the pfn range [base, end] starts on, and has a length that
 * is a multiple of, the TTE256M page size.
 */
#define	VALIDATE_SLICE(base, end) { 					\
	ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M)));		\
	ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M)));	\
}
/*
 * printf() the arguments when sun4v_mpo_debug is nonzero.  The expansion is
 * a bare "if" statement, so the arguments are not evaluated when debugging
 * is off, and the macro must not be used as the body of an if/else without
 * braces.
 */
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	VALIDATE_SLICE(base, end)
#define	MPO_DEBUG(...)
#endif	/* DEBUG */
203 
/*
 * Record status message, viewable from mdb.
 *
 * The arguments are a printf-style format and its values; the formatted
 * text is stored in sun4v_mpo_status (truncated to its size) and then
 * echoed through MPO_DEBUG.  The stored text is printed with "%s" so that
 * any '%' it contains is never interpreted as a conversion specification.
 */
#define	MPO_STATUS(args...) {						      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
	MPO_DEBUG("%s", sun4v_mpo_status);				      \
}
209 
/*
 * The MPO locks are to protect the MPO metadata while that
 * information is updated as a result of a memory DR operation.
 * The read lock must be acquired to read the metadata and the
 * write locks must be acquired to update it.
 *
 * The read side is implemented by disabling kernel preemption; the write
 * side (mpo_wr_lock/mpo_wr_unlock) pauses and restarts the CPUs.
 * NOTE(review): presumably a CPU running with preemption disabled cannot
 * be paused inside its read section -- confirm against pause_cpus().
 */
#define	mpo_rd_lock	kpreempt_disable
#define	mpo_rd_unlock	kpreempt_enable
218 
219 static void
220 mpo_wr_lock()
221 {
222 	mutex_enter(&cpu_lock);
223 	pause_cpus(NULL, NULL);
224 	mutex_exit(&cpu_lock);
225 }
226 
227 static void
228 mpo_wr_unlock()
229 {
230 	mutex_enter(&cpu_lock);
231 	start_cpus();
232 	mutex_exit(&cpu_lock);
233 }
234 
235 /*
236  * Routine to read a uint64_t from a given md
237  */
238 static	int64_t
239 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
240 {
241 	int err = md_get_prop_val(md, node, propname, val);
242 	return (err);
243 }
244 
245 static int
246 mblock_cmp(const void *a, const void *b)
247 {
248 	struct mblock_md *m1 = (struct mblock_md *)a;
249 	struct mblock_md *m2 = (struct mblock_md *)b;
250 
251 	if (m1->base < m2->base)
252 		return (-1);
253 	else if (m1->base == m2->base)
254 		return (0);
255 	else
256 		return (1);
257 }
258 
/*
 * Sort the n entries of mblocks[] in place using mblock_cmp(), i.e. by
 * their base field.
 */
static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	/* qsort is declared here rather than pulled in from a header */
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}
267 
268 static void
269 mpo_update_tunables(void)
270 {
271 	int i, ncpu_min;
272 
273 	/*
274 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275 	 * this process is currently running on before considering
276 	 *  expanding threads to another lgroup.
277 	 *
278 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
279 	 *  must be loaded before expanding to it.
280 	 *
281 	 * On sun4v CMT processors, threads share a core pipeline, and
282 	 * at less than 100% utilization, best throughput is obtained by
283 	 * spreading threads across more cores, even if some are in a
284 	 * different lgroup.  Spread threads to a new lgroup if the
285 	 * current group is more than 50% loaded.  Because of virtualization,
286 	 * lgroups may have different numbers of CPUs, but the tunables
287 	 * apply to all lgroups, so find the smallest lgroup and compute
288 	 * 50% loading.
289 	 */
290 
291 	ncpu_min = NCPU;
292 	for (i = 0; i < n_lgrpnodes; i++) {
293 		int ncpu = mpo_lgroup[i].ncpu;
294 		if (ncpu != 0 && ncpu < ncpu_min)
295 			ncpu_min = ncpu;
296 	}
297 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
298 
299 	/* new home may only be half as loaded as the existing home to use it */
300 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
301 
302 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
303 }
304 
305 static mde_cookie_t
306 cpuid_to_cpunode(md_t *md, int cpuid)
307 {
308 	mde_cookie_t    rootnode, foundnode, *cpunodes;
309 	uint64_t	cpuid_prop;
310 	int 	n_cpunodes, i;
311 
312 	if (md == NULL)
313 		return (MDE_INVAL_ELEM_COOKIE);
314 
315 	rootnode = md_root_node(md);
316 	if (rootnode == MDE_INVAL_ELEM_COOKIE)
317 		return (MDE_INVAL_ELEM_COOKIE);
318 
319 	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320 	    "fwd", &cpunodes);
321 	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322 		goto cpuid_fail;
323 
324 	for (i = 0; i < n_cpunodes; i++) {
325 		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326 		    &cpuid_prop))
327 			break;
328 		if (cpuid_prop == (uint64_t)cpuid) {
329 			foundnode = cpunodes[i];
330 			md_free_scan_dag(md, &cpunodes);
331 			return (foundnode);
332 		}
333 	}
334 cpuid_fail:
335 	if (n_cpunodes > 0)
336 		md_free_scan_dag(md, &cpunodes);
337 	return (MDE_INVAL_ELEM_COOKIE);
338 }
339 
340 static int
341 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
342 {
343 	mde_cookie_t *nodes;
344 	uint64_t latency, lowest_latency;
345 	uint64_t address_match, lowest_address_match;
346 	int n_lgroups, j, result = 0;
347 
348 	/* Find lgroup nodes reachable from this cpu */
349 	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350 	    "fwd", &nodes);
351 
352 	lowest_latency = ~(0UL);
353 
354 	/* Find the lgroup node with the smallest latency */
355 	for (j = 0; j < n_lgroups; j++) {
356 		result = get_int(md, nodes[j], PROP_LG_LATENCY,
357 		    &latency);
358 		result |= get_int(md, nodes[j], PROP_LG_MATCH,
359 		    &address_match);
360 		if (result != 0) {
361 			j = -1;
362 			goto to_lgrp_done;
363 		}
364 		if (latency < lowest_latency) {
365 			lowest_latency = latency;
366 			lowest_address_match = address_match;
367 		}
368 	}
369 	for (j = 0; j < n_lgrpnodes; j++) {
370 		if ((mpo_lgroup[j].latency == lowest_latency) &&
371 		    (mpo_lgroup[j].addr_match == lowest_address_match))
372 			break;
373 	}
374 	if (j == n_lgrpnodes)
375 		j = -1;
376 
377 to_lgrp_done:
378 	if (n_lgroups > 0)
379 		md_free_scan_dag(md, &nodes);
380 	return (j);
381 }
382 
383 /* Called when DR'ing in a CPU */
384 void
385 mpo_cpu_add(md_t *md, int cpuid)
386 {
387 	mde_cookie_t cpunode;
388 
389 	int i;
390 
391 	if (n_lgrpnodes <= 0)
392 		return;
393 
394 	if (md == NULL)
395 		goto add_fail;
396 
397 	cpunode = cpuid_to_cpunode(md, cpuid);
398 	if (cpunode == MDE_INVAL_ELEM_COOKIE)
399 		goto add_fail;
400 
401 	i = mpo_cpu_to_lgroup(md, cpunode);
402 	if (i == -1)
403 		goto add_fail;
404 
405 	mpo_cpu[cpuid].lgrp_index = i;
406 	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407 	mpo_lgroup[i].ncpu++;
408 	mpo_update_tunables();
409 	return;
410 add_fail:
411 	panic("mpo_cpu_add: Cannot read MD");
412 }
413 
414 /* Called when DR'ing out a CPU */
415 void
416 mpo_cpu_remove(int cpuid)
417 {
418 	int i;
419 
420 	if (n_lgrpnodes <= 0)
421 		return;
422 
423 	i = mpo_cpu[cpuid].lgrp_index;
424 	mpo_lgroup[i].ncpu--;
425 	mpo_cpu[cpuid].home = 0;
426 	mpo_cpu[cpuid].lgrp_index = -1;
427 	mpo_update_tunables();
428 }
429 
430 static mde_cookie_t
431 md_get_root(md_t *md)
432 {
433 	mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
434 	int n_nodes;
435 
436 	n_nodes = md_node_count(md);
437 
438 	if (n_nodes <= 0) {
439 		MPO_STATUS("md_get_root: No nodes in node count\n");
440 		return (root);
441 	}
442 
443 	root = md_root_node(md);
444 
445 	if (root == MDE_INVAL_ELEM_COOKIE) {
446 		MPO_STATUS("md_get_root: Root node is missing\n");
447 		return (root);
448 	}
449 
450 	MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
451 	MPO_DEBUG("md_get_root: md: %p\n", md);
452 	MPO_DEBUG("md_get_root: root: %lx\n", root);
453 done:
454 	return (root);
455 }
456 
457 static int
458 lgrp_update(md_t *md, mde_cookie_t root)
459 {
460 	int i, j, result;
461 	int ret_val = 0;
462 	int sub_page_fix;
463 	mde_cookie_t *nodes, *lgrpnodes;
464 
465 	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
466 	    "fwd", &lgrpnodes);
467 
468 	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
469 		MPO_STATUS("lgrp_update: No Lgroups\n");
470 		ret_val = -1;
471 		goto fail;
472 	}
473 
474 	MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
475 
476 	for (i = 0; i < n_lgrpnodes; i++) {
477 		mpo_lgroup[i].node = lgrpnodes[i];
478 		mpo_lgroup[i].id = i;
479 		mpo_lgroup[i].ncpu = 0;
480 		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
481 		    &mpo_lgroup[i].addr_mask);
482 		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
483 		    &mpo_lgroup[i].addr_match);
484 
485 		/*
486 		 * If either the mask or match properties are missing, set to 0
487 		 */
488 		if (result < 0) {
489 			mpo_lgroup[i].addr_mask = 0;
490 			mpo_lgroup[i].addr_match = 0;
491 		}
492 
493 		/* Set latency to 0 if property not present */
494 
495 		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
496 		    &mpo_lgroup[i].latency);
497 		if (result < 0)
498 			mpo_lgroup[i].latency = 0;
499 	}
500 
501 	/*
502 	 * Sub-page level interleave is not yet supported.  Check for it,
503 	 * and remove sub-page interleaved lgroups from mpo_lgroup and
504 	 * n_lgrpnodes.  If no lgroups are left, return.
505 	 */
506 
507 	sub_page_fix = fix_interleave();
508 	if (n_lgrpnodes == 0) {
509 		ret_val = -1;
510 		goto fail;
511 	}
512 
513 	/* Ensure that all of the addr_mask values are the same */
514 
515 	for (i = 0; i < n_lgrpnodes; i++) {
516 		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
517 			MPO_STATUS("lgrp_update: "
518 			    "addr_mask values are not the same\n");
519 			ret_val = -1;
520 			goto fail;
521 		}
522 	}
523 
524 	/*
525 	 * Ensure that all lgrp nodes see all the mblocks. However, if
526 	 * sub-page interleave is being fixed, they do not, so skip
527 	 * the check.
528 	 */
529 
530 	if (sub_page_fix == 0) {
531 		for (i = 0; i < n_lgrpnodes; i++) {
532 			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
533 			    PROP_LG_MBLOCK, "fwd", &nodes);
534 			md_free_scan_dag(md, &nodes);
535 			if (j != n_mblocks) {
536 				MPO_STATUS("lgrp_update: "
537 				    "sub-page interleave is being fixed\n");
538 				ret_val = -1;
539 				goto fail;
540 			}
541 		}
542 	}
543 fail:
544 	if (n_lgrpnodes > 0) {
545 		md_free_scan_dag(md, &lgrpnodes);
546 		for (i = 0; i < n_lgrpnodes; i++)
547 			mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
548 	}
549 
550 	return (ret_val);
551 }
552 
553 /*
554  *
555  * Traverse the MD to determine:
556  *
557  *  Number of CPU nodes, lgrp_nodes, and mblocks
558  *  Then for each lgrp_node, obtain the appropriate data.
559  *  For each CPU, determine its home locality and store it.
560  *  For each mblock, retrieve its data and store it.
561  */
562 static	int
563 lgrp_traverse(md_t *md)
564 {
565 	mde_cookie_t root, *cpunodes, *mblocknodes;
566 	int o;
567 	uint64_t i, k, stripe, stride;
568 	uint64_t mem_lg_homeset = 0;
569 	int ret_val = 0;
570 	int result = 0;
571 	int n_cpunodes = 0;
572 	mpo_config_t new_config;
573 
574 	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
575 		ret_val = -1;
576 		goto fail;
577 	}
578 
579 	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
580 	    &mblocknodes);
581 	if (n_mblocks <= 0) {
582 		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
583 		    "Descriptor\n");
584 		ret_val = -1;
585 		goto fail;
586 	}
587 
588 	/*
589 	 * Build the Memory Nodes.  Do this before any possibility of
590 	 * bailing from this routine so we obtain ra_to_pa (needed for page
591 	 * coloring) even when there are no lgroups defined.
592 	 */
593 	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
594 		ret_val = -1;
595 		goto fail;
596 	}
597 
598 	mblock_update(&new_config, md, mblocknodes);
599 	mblock_install(&new_config);
600 
601 	/* Page coloring hook is required so we can iterate through mnodes */
602 	if (&page_next_pfn_for_color_cpu == NULL) {
603 		MPO_STATUS("lgrp_traverse: No page coloring support\n");
604 		ret_val = -1;
605 		goto fail;
606 	}
607 
608 	/* Global enable for mpo */
609 	if (sun4v_mpo_enable == 0) {
610 		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
611 		ret_val = -1;
612 		goto fail;
613 	}
614 
615 	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
616 
617 	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
618 		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
619 		    "in MD\n");
620 		ret_val = -1;
621 		goto fail;
622 	}
623 
624 	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
625 
626 	if ((ret_val = lgrp_update(md, root)) == -1)
627 		goto fail;
628 
629 	/*
630 	 * Use the address mask from the first lgroup node
631 	 * to establish our home_mask.
632 	 */
633 	home_mask = mpo_lgroup[0].addr_mask;
634 	home_mask_pfn = btop(home_mask);
635 	home_mask_shift = lowbit(home_mask) - 1;
636 	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
637 	mnode_pages = btop(1ULL << home_mask_shift);
638 
639 	/*
640 	 * How many values are possible in home mask?  Assume the mask
641 	 * bits are contiguous.
642 	 */
643 	max_locality_groups =
644 	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
645 
646 	stripe_shift = highbit(max_locality_groups) - 1;
647 	stripe = ptob(mnode_pages);
648 	stride = max_locality_groups * stripe;
649 	mnode_stride = btop(stride);
650 
651 	/* Now verify the home mask bits are contiguous */
652 
653 	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
654 		MPO_STATUS("lgrp_traverse: "
655 		    "home mask bits are not contiguous\n");
656 		ret_val = -1;
657 		goto fail;
658 	}
659 
660 	/* Record all of the home bits */
661 
662 	for (i = 0; i < n_lgrpnodes; i++) {
663 		HOMESET_ADD(mem_lg_homeset,
664 		    mpo_lgroup[i].addr_match >> home_mask_shift);
665 	}
666 
667 	/* Count the number different "home"  mem_lg's we've discovered */
668 
669 	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
670 
671 	/* If we have only 1 locality group then we can exit */
672 	if (n_locality_groups == 1) {
673 		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
674 		ret_val = -1;
675 		goto fail;
676 	}
677 
678 	/*
679 	 * Set the latencies.  A CPU's lgroup is defined by the lowest
680 	 * latency found.  All other memory is considered remote, and the
681 	 * remote latency is represented by the highest latency found.
682 	 * Thus hierarchical lgroups, if any, are approximated by a
683 	 * two level scheme.
684 	 *
685 	 * The Solaris MPO framework by convention wants to see latencies
686 	 * in units of nano-sec/10. In the MD, the units are defined to be
687 	 * pico-seconds.
688 	 */
689 
690 	lower_latency = mpo_lgroup[0].latency;
691 	higher_latency = mpo_lgroup[0].latency;
692 
693 	for (i = 1; i < n_lgrpnodes; i++) {
694 		if (mpo_lgroup[i].latency < lower_latency) {
695 			lower_latency = mpo_lgroup[i].latency;
696 		}
697 		if (mpo_lgroup[i].latency > higher_latency) {
698 			higher_latency = mpo_lgroup[i].latency;
699 		}
700 	}
701 	lower_latency /= 10000;
702 	higher_latency /= 10000;
703 
704 	/* Clear our CPU data */
705 
706 	for (i = 0; i < NCPU; i++) {
707 		mpo_cpu[i].home = 0;
708 		mpo_cpu[i].lgrp_index = -1;
709 	}
710 
711 	/* Build the CPU nodes */
712 	for (i = 0; i < n_cpunodes; i++) {
713 
714 		/* Read in the lgroup nodes */
715 		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
716 		if (result < 0) {
717 			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
718 			ret_val = -1;
719 			goto fail;
720 		}
721 
722 		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
723 		if (o == -1) {
724 			ret_val = -1;
725 			goto fail;
726 		}
727 		mpo_cpu[k].lgrp_index = o;
728 		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
729 		mpo_lgroup[o].ncpu++;
730 	}
731 	/* Validate that no large pages cross mnode boundaries. */
732 	if (valid_pages(md, cpunodes[0]) == 0) {
733 		ret_val = -1;
734 		goto fail;
735 	}
736 
737 fail:
738 	if (n_cpunodes > 0)
739 		md_free_scan_dag(md, &cpunodes);
740 	if (n_mblocks > 0)
741 		md_free_scan_dag(md, &mblocknodes);
742 	else
743 		panic("lgrp_traverse: No memory blocks found");
744 
745 	if (ret_val == 0) {
746 		MPO_STATUS("MPO feature is enabled.\n");
747 	} else
748 		sun4v_mpo_enable = 0;	/* set this for DR */
749 
750 	return (ret_val);
751 }
752 
753 /*
754  *  Determine the number of unique mem_lg's present in our system
755  */
756 static	int
757 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
758 {
759 	int homeid;
760 	int count = 0;
761 
762 	/*
763 	 * Scan the "home" bits of the mem_lgs, count
764 	 * the number that are unique.
765 	 */
766 
767 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769 			count++;
770 		}
771 	}
772 
773 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774 	    mem_lg_homeset);
775 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
776 
777 	/* Default must be at least one */
778 	if (count == 0)
779 		count = 1;
780 
781 	return (count);
782 }
783 
784 /*
785  * Platform specific lgroup initialization
786  */
787 void
788 plat_lgrp_init(void)
789 {
790 	md_t *md;
791 	int rc;
792 
793 	/* Get the Machine Descriptor handle */
794 
795 	md = md_get_handle();
796 
797 	/* If not, we cannot continue */
798 
799 	if (md == NULL) {
800 		panic("cannot access machine descriptor\n");
801 	} else {
802 		rc = lgrp_traverse(md);
803 		(void) md_fini_handle(md);
804 	}
805 
806 	/*
807 	 * If we can't process the MD for lgroups then at least let the
808 	 * system try to boot.  Assume we have one lgroup so that
809 	 * when plat_build_mem_nodes is called, it will attempt to init
810 	 * an mnode based on the supplied memory segment.
811 	 */
812 
813 	if (rc == -1) {
814 		home_mask_pfn = 0;
815 		max_locality_groups = 1;
816 		n_locality_groups = 1;
817 		return;
818 	}
819 
820 	mem_node_pfn_shift = 0;
821 	mem_node_physalign = 0;
822 
823 	/* Use lgroup-aware TSB allocations */
824 	tsb_lgrp_affinity = 1;
825 
826 	/* Require that a home lgroup have some memory to be chosen */
827 	lgrp_mem_free_thresh = 1;
828 
829 	/* Standard home-on-next-touch policy */
830 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
831 
832 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
833 	lgrp_load_thresh = UINT32_MAX;
834 
835 	mpo_update_tunables();
836 }
837 
/*
 *  Helper routine for debugging calls to mem_node_add_slice()
 *
 *  Forwards [basepfn, endpfn] to mem_node_add_slice().  In DEBUG, non-lint
 *  builds it also counts the calls and reports each one through MPO_DEBUG.
 */
static	void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	/* Running number of calls, used only to label the debug output */
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}
853 
/*
 *  Helper routine for debugging calls to mem_node_del_slice()
 *
 *  Forwards [basepfn, endpfn] to mem_node_del_slice().  In DEBUG, non-lint
 *  builds it also counts the calls and reports each one through MPO_DEBUG.
 */
static	void
mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	/* Running number of calls, used only to label the debug output */
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_del_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_del_slice(basepfn, endpfn);
}
866 
/*
 *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 *
 *  Reports the (plathand, mnode) pair through MPO_DEBUG and then forwards
 *  it unchanged to plat_assign_lgrphand_to_mem_node().
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}
877 
878 /*
879  * plat_build_mem_nodes()
880  *
881  * Define the mem_nodes based on the modified boot memory list,
882  * or based on info read from the MD in plat_lgrp_init().
883  *
884  * When the home mask lies in the middle of the address bits (as it does on
885  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886  * it is striped across an mblock in a repeating pattern of contiguous memory
887  * followed by a gap.  The stripe width is the size of the contiguous piece.
888  * The stride is the distance from the start of one contiguous piece to the
889  * start of the next.  The gap is thus stride - stripe_width.
890  *
891  * The stripe of an mnode that falls within an mblock is described by the type
892  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
893  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
894  * this array is predetermined.  The mem_stripe_t that describes mnode m
895  * within mpo_mblock[i] is stored at
896  *	 mem_stripes[ m + i * max_locality_groups ]
897  *
898  * max_locality_groups is the total number of possible locality groups,
899  * as defined by the size of the home mask, even if the memory assigned
900  * to the domain is small and does not cover all the lgroups.  Thus some
901  * mem_stripe_t's may be empty.
902  *
903  * The members of mem_stripe_t are:
904  *	physbase: First valid page in mem_node in the corresponding mblock
905  *	physmax: Last valid page in mem_node in mblock
906  *	offset:  The full stripe width starts at physbase - offset.
907  *	    Thus if offset is non-zero, this mem_node starts in the middle
908  *	    of a stripe width, and the second full stripe starts at
909  *	    physbase - offset + stride.  (even though physmax may fall in the
910  *	    middle of a stripe width, we do not save the ending fragment size
911  *	    in this data structure.)
912  *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
913  *
914  *	The stripe width is kept in the global mnode_pages.
915  *	The stride is kept in the global mnode_stride.
916  *	All the above use pfn's as the unit.
917  *
918  * As an example, the memory layout for a domain with 2 mblocks and 4
919  * mem_nodes 0,1,2,3 could look like this:
920  *
921  *	123012301230 ...	012301230123 ...
922  *	  mblock 0		  mblock 1
923  */
924 
925 /*ARGSUSED*/
926 void
927 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
928 {
929 	int elem;
930 	uint64_t base, len;
931 
932 	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933 	max_mem_nodes = max_locality_groups;
934 
935 	mstripe_update(&mpo_config);
936 
937 	/* Check for non-MPO sun4v platforms */
938 	if (n_locality_groups <= 1) {
939 		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940 		for (elem = 0; elem < nelems; list++, elem++) {
941 			base = list->addr;
942 			len = list->size;
943 
944 			mpo_mem_node_add_slice(btop(base),
945 			    btop(base + len - 1));
946 		}
947 		mem_node_pfn_shift = 0;
948 		mem_node_physalign = 0;
949 	} else
950 		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
951 
952 	/*
953 	 * Indicate to vm_pagelist that the hpm_counters array
954 	 * should be shared because the ranges overlap.
955 	 */
956 	if (max_mem_nodes > 1) {
957 		interleaved_mnodes = 1;
958 	}
959 }
960 
961 /*
962  * Return the locality group value for the supplied processor
963  */
964 lgrp_handle_t
965 plat_lgrp_cpu_to_hand(processorid_t id)
966 {
967 	lgrp_handle_t lgrphand;
968 
969 	mpo_rd_lock();
970 	if (n_locality_groups > 1) {
971 		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972 	} else {
973 		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
974 	}
975 	mpo_rd_unlock();
976 
977 	return (lgrphand);
978 }
979 
980 int
981 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
982 {
983 	/*
984 	 * Return min remote latency when there are more than two lgroups
985 	 * (root and child) and getting latency between two different lgroups
986 	 * or root is involved.
987 	 */
988 	if (lgrp_optimizations() && (from != to ||
989 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990 		return ((int)higher_latency);
991 	} else {
992 		return ((int)lower_latency);
993 	}
994 }
995 
996 int
997 plat_pfn_to_mem_node(pfn_t pfn)
998 {
999 	int i, mnode;
1000 	pfn_t ra_to_pa_pfn;
1001 	struct mblock_md *mb;
1002 
1003 	if (n_locality_groups <= 1)
1004 		return (0);
1005 
1006 	/*
1007 	 * The mnode is defined to be 1:1 with the lgroup handle, which
1008 	 * is taken from from the home bits.  Find the mblock in which
1009 	 * the pfn falls to get the ra_to_pa adjustment, and extract
1010 	 * the home bits.
1011 	 */
1012 	mpo_rd_lock();
1013 	mb = &mpo_mblock[0];
1014 	for (i = 0; i < n_mblocks; i++) {
1015 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016 			ra_to_pa_pfn = btop(mb->ra_to_pa);
1017 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018 			    home_mask_pfn_shift);
1019 			ASSERT(mnode < max_mem_nodes);
1020 			mpo_rd_unlock();
1021 			return (mnode);
1022 		}
1023 		mb++;
1024 	}
1025 
1026 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027 	return (pfn);
1028 }
1029 
1030 /*
1031  * plat_rapfn_to_papfn
1032  *
1033  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034  * and home mask bits are correct.  The upper bits do not necessarily
1035  * match the actual PA, however.
1036  */
1037 pfn_t
1038 plat_rapfn_to_papfn(pfn_t pfn)
1039 {
1040 	int i;
1041 	pfn_t ra_to_pa_pfn;
1042 	struct mblock_md *mb;
1043 
1044 	ASSERT(n_mblocks > 0);
1045 	if (n_mblocks == 1)
1046 		return (pfn + base_ra_to_pa_pfn);
1047 
1048 	/*
1049 	 * Find the mblock in which the pfn falls
1050 	 * in order to get the ra_to_pa adjustment.
1051 	 */
1052 	mpo_rd_lock();
1053 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055 			ra_to_pa_pfn = btop(mb->ra_to_pa);
1056 			mpo_rd_unlock();
1057 			return (pfn + ra_to_pa_pfn);
1058 		}
1059 	}
1060 
1061 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062 	return (pfn);
1063 }
1064 
1065 /*
1066  * plat_mem_node_iterator_init()
1067  *      Initialize cookie "it" to iterate over pfn's in an mnode.  There is
1068  *      no additional iterator function.  The caller uses the info from
1069  *      the iterator structure directly.
1070  *
1071  *      pfn: starting pfn.
1072  *      mnode: desired mnode.
1073  *	szc: desired page size.
1074  *      init:
1075  *          if 1, start a new traversal, initialize "it", find first
1076  *              mblock containing pfn, and return its starting pfn
1077  *              within the mnode.
1078  *          if 0, continue the previous traversal using passed-in data
1079  *              from "it", advance to the next mblock, and return its
1080  *              starting pfn within the mnode.
1081  *      it: returns readonly data to the caller; see below.
1082  *
1083  *	The input pfn must be aligned for the page size szc.
1084  *
1085  *      Returns: starting pfn for the iteration for the mnode/mblock,
1086  *	    which is aligned according to the page size,
1087  *          or returns (pfn_t)(-1) if the input pfn lies past the last
1088  *          valid pfn of the mnode.
1089  *      Returns misc values in the "it" struct that allows the caller
1090  *          to advance the pfn within an mblock using address arithmetic;
1091  *          see definition of mem_node_iterator_t in vm_dep.h.
1092  *          When the caller calculates a pfn that is greater than the
1093  *          returned value it->mi_mblock_end, the caller should again
1094  *          call plat_mem_node_iterator_init, passing init=0.
1095  *
1096  *          The last mblock in continuation case may be invalid because
1097  *          of memory DR.  To detect this situation mi_genid is checked
1098  *          against mpo_genid which is incremented after a memory DR
1099  *          operation.  See also plat_slice_add()/plat_slice_del().
1100  */
1101 pfn_t
1102 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103     mem_node_iterator_t *it, int init)
1104 {
1105 	int i;
1106 	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107 	struct mblock_md *mblock;
1108 	pfn_t base, end;
1109 	mem_stripe_t *ms;
1110 	uint64_t szcpagesize;
1111 
1112 	ASSERT(it != NULL);
1113 	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114 	ASSERT(n_mblocks > 0);
1115 	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1116 
1117 	mpo_rd_lock();
1118 
1119 	if (init || (it->mi_genid != mpo_genid)) {
1120 		it->mi_genid = mpo_genid;
1121 		it->mi_last_mblock = 0;
1122 		it->mi_init = 1;
1123 	}
1124 
1125 	/* Check if mpo is not enabled and we only have one mblock */
1126 	if (n_locality_groups == 1 && n_mblocks == 1) {
1127 		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128 			pfn = (pfn_t)-1;
1129 			goto done;
1130 		}
1131 		it->mi_mnode = mnode;
1132 		it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133 		it->mi_mnode_pfn_mask = 0;
1134 		it->mi_mnode_pfn_shift = 0;
1135 		it->mi_mnode_mask = 0;
1136 		it->mi_mblock_base = mem_node_config[mnode].physbase;
1137 		it->mi_mblock_end = mem_node_config[mnode].physmax;
1138 		if (pfn < it->mi_mblock_base)
1139 			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140 		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141 			pfn = (pfn_t)-1;
1142 		goto done;
1143 	}
1144 
1145 	/* init=1 means begin iterator, init=0 means continue */
1146 	if (init == 1) {
1147 		i = 0;
1148 	} else {
1149 		ASSERT(it->mi_last_mblock < n_mblocks);
1150 		i = it->mi_last_mblock;
1151 		ASSERT(pfn >
1152 		    mem_stripes[i * max_locality_groups + mnode].physmax);
1153 		if (++i == n_mblocks) {
1154 			pfn = (pfn_t)-1;
1155 			goto done;
1156 		}
1157 	}
1158 
1159 	/*
1160 	 * Find mblock that contains pfn for mnode's stripe, or first such an
1161 	 * mblock after pfn, else pfn is out of bound and we'll return -1.
1162 	 * mblocks and stripes are sorted in ascending address order.
1163 	 */
1164 	szcpagesize = szcpgcnt << PAGESHIFT;
1165 	for (; i < n_mblocks; i++) {
1166 		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167 			continue;
1168 		ms = &mem_stripes[i * max_locality_groups + mnode];
1169 		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170 		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171 		    ms->physmax)
1172 			break;
1173 	}
1174 	if (i == n_mblocks) {
1175 		it->mi_last_mblock = i - 1;
1176 		pfn = (pfn_t)-1;
1177 		goto done;
1178 	}
1179 
1180 	it->mi_last_mblock = i;
1181 
1182 	mblock = &mpo_mblock[i];
1183 	base = ms->physbase;
1184 	end = ms->physmax;
1185 
1186 	it->mi_mnode = mnode;
1187 	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188 	it->mi_mblock_base = base;
1189 	it->mi_mblock_end = end;
1190 	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1191 	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192 	it->mi_mnode_mask = max_locality_groups - 1;
1193 	if (pfn < base) {
1194 		pfn = P2ROUNDUP(base, szcpgcnt);
1195 		ASSERT(pfn + szcpgcnt - 1 <= end);
1196 	}
1197 	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198 done:
1199 	mpo_rd_unlock();
1200 	return (pfn);
1201 }
1202 
1203 /*
1204  * plat_mem_node_intersect_range()
1205  *
1206  * Find the intersection between a memnode and a range of pfn's.
1207  */
1208 void
1209 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210     int mnode, pgcnt_t *npages_out)
1211 {
1212 	pfn_t offset, len, hole, base, end, test_end, frag;
1213 	pfn_t nearest;
1214 	mem_stripe_t *ms;
1215 	int i, npages;
1216 
1217 	*npages_out = 0;
1218 
1219 	if (!mem_node_config[mnode].exists || test_len == 0)
1220 		return;
1221 
1222 	base = mem_node_config[mnode].physbase;
1223 	end = mem_node_config[mnode].physmax;
1224 
1225 	test_end = test_base + test_len - 1;
1226 	if (end < test_base || base > test_end)
1227 		return;
1228 
1229 	if (n_locality_groups == 1) {
1230 		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231 		return;
1232 	}
1233 
1234 	hole = mnode_stride - mnode_pages;
1235 	npages = 0;
1236 
1237 	/*
1238 	 * Iterate over all the stripes for this mnode (one per mblock),
1239 	 * find the intersection with each, and accumulate the intersections.
1240 	 *
1241 	 * Determing the intersection with a stripe is tricky.  If base or end
1242 	 * fall outside the mem_node bounds, round them to physbase/physmax of
1243 	 * mem_node.  If base or end fall in a gap, round them to start of
1244 	 * nearest stripe.  If they fall within a stripe, keep base or end,
1245 	 * but calculate the fragment size that should be excluded from the
1246 	 * stripe.  Calculate how many strides fall in the adjusted range,
1247 	 * multiply by stripe width, and add the start and end fragments.
1248 	 */
1249 
1250 	mpo_rd_lock();
1251 	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252 		ms = &mem_stripes[i];
1253 		if (ms->exists &&
1254 		    test_base <= (end = ms->physmax) &&
1255 		    test_end >= (base = ms->physbase)) {
1256 
1257 			offset = ms->offset;
1258 
1259 			if (test_base > base) {
1260 				/* Round test_base to next multiple of stride */
1261 				len = P2ROUNDUP(test_base - (base - offset),
1262 				    mnode_stride);
1263 				nearest = base - offset + len;
1264 				/*
1265 				 * Compute distance from test_base to the
1266 				 * stride boundary to see if test_base falls
1267 				 * in the stripe or in the hole.
1268 				 */
1269 				if (nearest - test_base > hole) {
1270 					/*
1271 					 * test_base lies in stripe,
1272 					 * and offset should be excluded.
1273 					 */
1274 					offset = test_base -
1275 					    (nearest - mnode_stride);
1276 					base = test_base;
1277 				} else {
1278 					/* round up to next stripe start */
1279 					offset = 0;
1280 					base = nearest;
1281 					if (base > end)
1282 						continue;
1283 				}
1284 
1285 			}
1286 
1287 			if (test_end < end)
1288 				end = test_end;
1289 			end++;		/* adjust to an exclusive bound */
1290 
1291 			/* Round end to next multiple of stride */
1292 			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293 			nearest = (base - offset) + len;
1294 			if (nearest - end <= hole) {
1295 				/* end falls in hole, use entire last stripe */
1296 				frag = 0;
1297 			} else {
1298 				/* end falls in stripe, compute fragment */
1299 				frag = nearest - hole - end;
1300 			}
1301 
1302 			len = (len >> stripe_shift) - offset - frag;
1303 			npages += len;
1304 		}
1305 	}
1306 
1307 	*npages_out = npages;
1308 	mpo_rd_unlock();
1309 }
1310 
1311 /*
1312  * valid_pages()
1313  *
1314  * Return 1 if pages are valid and do not cross mnode boundaries
1315  * (which would break page free list assumptions), and 0 otherwise.
1316  */
1317 
1318 #define	MNODE(pa)	\
1319 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320 
1321 static int
1322 valid_pages(md_t *md, mde_cookie_t cpu0)
1323 {
1324 	int i, max_szc;
1325 	uint64_t last_page_base, szc_mask;
1326 	uint64_t max_page_len, max_coalesce_len;
1327 	struct mblock_md *mb = mpo_mblock;
1328 
1329 	/*
1330 	 * Find the smaller of the largest page possible and supported.
1331 	 * mmu_exported_pagesize_mask is not yet initialized, so read
1332 	 * it from the MD.  Apply minimal fixups in case of broken MDs
1333 	 * to get a sane mask.
1334 	 */
1335 
1336 	if (cpu0 == NULL)
1337 		szc_mask = szc_mask0;
1338 	else {
1339 		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340 			szc_mask = 0;
1341 		/* largest in sun4v default support */
1342 		szc_mask |=  (1 << TTE4M);
1343 		szc_mask0 = szc_mask;
1344 	}
1345 	max_szc = highbit(szc_mask) - 1;
1346 	if (max_szc > TTE256M)
1347 		max_szc = TTE256M;
1348 	max_page_len = TTEBYTES(max_szc);
1349 
1350 	/*
1351 	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352 	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353 	 * within one mnode to use MPO.
1354 	 */
1355 	max_coalesce_len = TTEBYTES(TTE256M);
1356 	ASSERT(max_coalesce_len >= max_page_len);
1357 
1358 	if (ptob(mnode_pages) < max_coalesce_len) {
1359 		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360 		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361 		return (0);
1362 	}
1363 
1364 	for (i = 0; i < n_mblocks; i++) {
1365 		uint64_t base = mb->base;
1366 		uint64_t end = mb->base + mb->size - 1;
1367 		uint64_t ra_to_pa = mb->ra_to_pa;
1368 
1369 		/*
1370 		 * If mblock is smaller than the max page size, then
1371 		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1372 		 * not span mnodes.
1373 		 */
1374 		if (mb->size < max_page_len) {
1375 			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376 				MPO_STATUS("Small mblock spans mnodes; "
1377 				    "MPO disabled: base = %lx, end = %lx, "
1378 				    "ra2pa = %lx\n", base, end, ra_to_pa);
1379 				return (0);
1380 			}
1381 		} else {
1382 			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1383 			uint64_t pa_base = base + ra_to_pa;
1384 			if ((base & (max_coalesce_len - 1)) !=
1385 			    (pa_base & (max_coalesce_len - 1))) {
1386 				MPO_STATUS("bad page alignment; MPO disabled: "
1387 				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1388 				    base, pa_base, max_coalesce_len);
1389 				return (0);
1390 			}
1391 		}
1392 
1393 		/*
1394 		 * Find start of last large page in mblock in RA space.
1395 		 * If page extends into the next mblock, verify the
1396 		 * mnode does not change.
1397 		 */
1398 		last_page_base = P2ALIGN(end, max_coalesce_len);
1399 		if (i + 1 < n_mblocks &&
1400 		    last_page_base + max_coalesce_len > mb[1].base &&
1401 		    MNODE(last_page_base + ra_to_pa) !=
1402 		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403 			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404 			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405 			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406 			    mb[1].ra_to_pa, max_coalesce_len);
1407 			return (0);
1408 		}
1409 
1410 		mb++;
1411 	}
1412 	return (1);
1413 }
1414 
1415 
1416 /*
1417  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418  * if any, and remove them.  This yields a config where the "coarse
1419  * grained" lgroups cover all of memory, even though part of that memory
1420  * is fine grain interleaved and does not deliver a purely local memory
1421  * latency.
1422  *
1423  * This function reads and modifies the globals:
1424  *	mpo_lgroup[], n_lgrpnodes
1425  *
1426  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427  */
1428 
1429 static int
1430 fix_interleave(void)
1431 {
1432 	int i, j;
1433 	uint64_t mask = 0;
1434 
1435 	j = 0;
1436 	for (i = 0; i < n_lgrpnodes; i++) {
1437 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438 			/* remove this lgroup */
1439 			mask = mpo_lgroup[i].addr_mask;
1440 		} else {
1441 			mpo_lgroup[j++] = mpo_lgroup[i];
1442 		}
1443 	}
1444 	n_lgrpnodes = j;
1445 
1446 	if (mask != 0)
1447 		MPO_STATUS("sub-page interleave %lx found; "
1448 		    "removing lgroup.\n", mask);
1449 
1450 	return (mask != 0);
1451 }
1452 
1453 /*
1454  * mblock_alloc
1455  *
1456  * Allocate memory for mblock an stripe arrays from either static or
1457  * dynamic space depending on utype, and return the result in mc.
1458  * Returns 0 on success and -1 on error.
1459  */
1460 
1461 static int
1462 mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1463 {
1464 	mblock_md_t *mb = NULL;
1465 	mem_stripe_t *ms = NULL;
1466 	int nstripes = MAX_MEM_NODES * nmblocks;
1467 	size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468 	size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469 	size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1470 
1471 	/*
1472 	 * Allocate space for mblocks and mstripes.
1473 	 *
1474 	 * For DR allocations, just use kmem_alloc(), and set
1475 	 * mc_alloc_sz to indicate it was used.
1476 	 *
1477 	 * For boot allocation:
1478 	 * If we have a small number of mblocks we will use the space
1479 	 * that we preallocated. Otherwise, we will dynamically
1480 	 * allocate the space from the prom and map it to the
1481 	 * reserved VA at MPOBUF_BASE.
1482 	 */
1483 
1484 	if (utype == U_ADD || utype == U_DEL) {
1485 		mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486 		ms = (mem_stripe_t *)(mb + nmblocks);
1487 		mc->mc_alloc_sz = allocsz;
1488 	} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489 		mb = &small_mpo_mblocks[0];
1490 		ms = &small_mem_stripes[0];
1491 		mc->mc_alloc_sz = 0;
1492 	} else {
1493 		/* Ensure that we dont request more space than reserved */
1494 		if (allocsz > MPOBUF_SIZE) {
1495 			MPO_STATUS("mblock_alloc: Insufficient space "
1496 			    "for mblock structures \n");
1497 			return (-1);
1498 		}
1499 		mb = (struct mblock_md *)
1500 		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501 		if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502 			MPO_STATUS("mblock_alloc: Cannot allocate space "
1503 			    "for mblocks \n");
1504 			return (-1);
1505 		}
1506 		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507 		mpo_heap32_bufsz = MPOBUF_SIZE;
1508 		ms = (mem_stripe_t *)(mb + nmblocks);
1509 		mc->mc_alloc_sz = 0;
1510 	}
1511 	mc->mc_mblocks = mb;
1512 	mc->mc_stripes = ms;
1513 	mc->mc_nmblocks = nmblocks;
1514 	mc->mc_nstripes = nstripes;
1515 	MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516 	return (0);
1517 }
1518 
1519 /*
1520  * mblock_free
1521  *
1522  * Free memory in mc that was allocated by mblock_alloc.
1523  */
1524 
1525 static void
1526 mblock_free(mpo_config_t *mc)
1527 {
1528 	if (mc->mc_alloc_sz > 0) {
1529 		ASSERT(mc->mc_mblocks != mpo_mblock);
1530 		kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1531 	}
1532 	bzero(mc, sizeof (*mc));
1533 }
1534 
1535 /*
1536  * mblock_install
1537  *
1538  * Install mblock config passed in mc as the global configuration.
1539  * May only be called at boot or while holding mpo_wr_lock.
1540  */
1541 
1542 static void
1543 mblock_install(mpo_config_t *mc)
1544 {
1545 	mpo_mblock = mc->mc_mblocks;
1546 	n_mblocks = mc->mc_nmblocks;
1547 	mem_stripes = mc->mc_stripes;
1548 	n_mem_stripes = mc->mc_nstripes;
1549 	base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550 	mpo_config = *mc;
1551 }
1552 
1553 /*
1554  * mblock_update
1555  *
1556  * Traverse mblocknodes, read the mblock properties from the MD, and
1557  * save the mblocks in mc.
1558  */
1559 
1560 static void
1561 mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
1562 {
1563 	uint64_t i, j;
1564 	int result = 0;
1565 	mblock_md_t *mblock = mc->mc_mblocks;
1566 
1567 	for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1568 
1569 		/* Without a base or size value we will fail */
1570 		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571 		    &mblock[i].base);
1572 		if (result < 0) {
1573 			MPO_STATUS("mblock_update: "
1574 			    "PROP_LG_BASE is missing\n");
1575 			mc->mc_nmblocks = 0;
1576 			return;
1577 		}
1578 
1579 		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580 		    &mblock[i].size);
1581 		if (result < 0) {
1582 			MPO_STATUS("mblock_update: "
1583 			    "PROP_LG_SIZE is missing\n");
1584 			mc->mc_nmblocks = 0;
1585 			return;
1586 		}
1587 
1588 		result = get_int(md, mblocknodes[j],
1589 		    PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1590 
1591 		/* If we don't have an ra_pa_offset, just set it to 0 */
1592 		if (result < 0)
1593 			mblock[i].ra_to_pa = 0;
1594 
1595 		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596 		    "ra_to_pa = %lx\n", i,
1597 		    mblock[i].base,
1598 		    mblock[i].size,
1599 		    mblock[i].ra_to_pa);
1600 
1601 		/* check for unsupportable values of base and size */
1602 		if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603 			MPO_STATUS("mblock_update: "
1604 			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605 			    "base = %lx, size = %lx\n",
1606 			    mblock[i].base, mblock[i].size);
1607 			mc->mc_nmblocks = 0;
1608 			return;
1609 		}
1610 
1611 		/* eliminate size==0 blocks */
1612 		if (mblock[i].size != 0) {
1613 			uint64_t base = mblock[i].base;
1614 			uint64_t end = base + mblock[i].size;
1615 			ASSERT(end > base);
1616 			mblock[i].base_pfn = btop(base);
1617 			mblock[i].end_pfn = btop(end - 1);
1618 			i++;
1619 		}
1620 	}
1621 
1622 	if (i == 0) {
1623 		MPO_STATUS("mblock_update: "
1624 		    "No non-empty mblock nodes were found "
1625 		    "in the Machine Descriptor\n");
1626 		mc->mc_nmblocks = 0;
1627 		return;
1628 	}
1629 	ASSERT(i <= mc->mc_nmblocks);
1630 	mc->mc_nmblocks = i;
1631 
1632 	/* Must sort mblocks by address for mem_node_iterator_init() */
1633 	mblock_sort(mblock, mc->mc_nmblocks);
1634 }
1635 
1636 /*
1637  * mblock_update_add
1638  *
1639  * Update mblock config after a memory DR add.  The added range is not
1640  * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
1641  * in mc.
1642  */
1643 
1644 static void
1645 mblock_update_add(mpo_config_t *mc)
1646 {
1647 	md_t *md;
1648 	mde_cookie_t root, *mblocknodes;
1649 	int nmblocks = 0;
1650 
1651 	if ((md = md_get_handle()) == NULL) {
1652 		MPO_STATUS("Cannot access Machine Descriptor\n");
1653 		goto error;
1654 	}
1655 
1656 	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657 		goto error;
1658 
1659 	nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660 	    &mblocknodes);
1661 	if (nmblocks <= 0) {
1662 		MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663 		goto error;
1664 	}
1665 
1666 	if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667 		goto error;
1668 
1669 	mblock_update(mc, md, mblocknodes);
1670 	md_free_scan_dag(md, &mblocknodes);
1671 	(void) md_fini_handle(md);
1672 	return;
1673 error:
1674 	panic("mblock_update_add: cannot process mblocks from MD.\n");
1675 }
1676 
1677 /*
1678  * mblock_update_del
1679  *
1680  * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681  * Allocate a new mblock config, copy old config to the new, modify the new
1682  * mblocks to reflect the deletion.   The new mblocks are returned in
1683  * mc_new and are not yet installed as the active config.
1684  */
1685 
1686 static void
1687 mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688     pfn_t uend)
1689 {
1690 	int i, j;
1691 	pfn_t base, end;
1692 	mblock_md_t *mblock;
1693 	int nmblocks = mc_old->mc_nmblocks;
1694 
1695 	MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1696 
1697 	/*
1698 	 * Allocate mblocks in mc_new and copy the old to the new.
1699 	 * Allocate one extra in case the deletion splits an mblock.
1700 	 */
1701 	if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702 		return;
1703 	mblock = mc_new->mc_mblocks;
1704 	bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1705 
1706 	/*
1707 	 * Find the mblock containing the deleted range and adjust it in
1708 	 * the new config.
1709 	 */
1710 	for (i = 0; i < nmblocks; i++) {
1711 
1712 		base = btop(mblock[i].base);
1713 		end = base + btop(mblock[i].size) - 1;
1714 
1715 		/*
1716 		 * Adjust the mblock based on the subset that was deleted.
1717 		 *
1718 		 * If the entire mblk was deleted, compact the table.
1719 		 *
1720 		 * If the middle of the mblk was deleted, extend
1721 		 * the table.  Space for the new slot was already
1722 		 * allocated.
1723 		 *
1724 		 * The memory to be deleted is a mblock or a subset of
1725 		 * and does not span multiple mblocks.
1726 		 */
1727 		if (base == ubase && end == uend) {
1728 			for (j = i; j < nmblocks - 1; j++)
1729 				mblock[j] = mblock[j + 1];
1730 			nmblocks--;
1731 			bzero(&mblock[nmblocks], sizeof (*mblock));
1732 			break;
1733 		} else if (base < ubase && end > uend) {
1734 			for (j = nmblocks - 1; j >= i; j--)
1735 				mblock[j + 1] = mblock[j];
1736 			mblock[i].size = ptob(ubase - base);
1737 			mblock[i].end_pfn = ubase - 1;
1738 			mblock[i + 1].base = ptob(uend + 1);
1739 			mblock[i + 1].size = ptob(end - uend);
1740 			mblock[i + 1].base_pfn = uend + 1;
1741 			nmblocks++;
1742 			break;
1743 		} else if (base == ubase) {
1744 			MPO_DEBUG("mblock_update_del: shrink>"
1745 			    " i=%d base=0x%lx end=0x%lx", i, base, end);
1746 			mblock[i].base = ptob(uend + 1);
1747 			mblock[i].size -= ptob(uend - ubase + 1);
1748 			base = uend + 1;
1749 			mblock[i].base_pfn = base;
1750 			mblock[i].end_pfn = end;
1751 			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752 			break;
1753 		} else if (end == uend) {
1754 			MPO_DEBUG("mblock_update_del: shrink<"
1755 			    " i=%d base=0x%lx end=0x%lx", i, base, end);
1756 			mblock[i].size -= ptob(uend - ubase + 1);
1757 			end = ubase - 1;
1758 			mblock[i].base_pfn = base;
1759 			mblock[i].end_pfn = end;
1760 			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761 			break;
1762 		}
1763 	}
1764 	mc_new->mc_nmblocks = nmblocks;
1765 	ASSERT(end > base);
1766 }
1767 
1768 /*
1769  * mstripe_update
1770  *
1771  * Read mblocks from mc and update mstripes in mc
1772  */
1773 
1774 static void
1775 mstripe_update(mpo_config_t *mc)
1776 {
1777 	lgrp_handle_t lgrphand, lgrp_start;
1778 	int i, mnode;
1779 	uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
1780 	uint64_t stripe, frag, remove;
1781 	mem_stripe_t *ms;
1782 	mblock_md_t *mblock = mc->mc_mblocks;
1783 	int nmblocks = mc->mc_nmblocks;
1784 	int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);
1785 
1786 	/* Check for non-MPO sun4v platforms or memory DR removal */
1787 	if (n_locality_groups <= 1) {
1788 		ASSERT(n_locality_groups == 1);
1789 		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
1790 
1791 		if (nmblocks == 1) {
1792 			mc->mc_nstripes = 0;
1793 		} else {
1794 			mc->mc_nstripes = nmblocks;
1795 			bzero(mc->mc_stripes, mstripesz);
1796 			for (i = 0; i < nmblocks; i++) {
1797 				mc->mc_stripes[i].exists = 1;
1798 				mc->mc_stripes[i].physbase = mblock[i].base_pfn;
1799 				mc->mc_stripes[i].physmax = mblock[i].end_pfn;
1800 			}
1801 		}
1802 		return;
1803 	}
1804 
1805 	bzero(mc->mc_stripes, mstripesz);
1806 	mc->mc_nstripes = max_locality_groups * nmblocks;
1807 	stripe = ptob(mnode_pages);
1808 	stride = max_locality_groups * stripe;
1809 
1810 	for (i = 0; i < nmblocks; i++) {
1811 		base = mblock[i].base;
1812 		end = base + mblock[i].size;
1813 		ra_to_pa = mblock[i].ra_to_pa;
1814 
1815 		/* Find the offset from the prev stripe boundary in PA space. */
1816 		offset = (base + ra_to_pa) & (stripe - 1);
1817 
1818 		/* Set the next stripe boundary. */
1819 		stripe_end = base - offset + stripe;
1820 
1821 		lgrp_start = (((base + ra_to_pa) & home_mask) >>
1822 		    home_mask_shift);
1823 		lgrphand = lgrp_start;
1824 
1825 		/*
1826 		 * Loop over all lgroups covered by the mblock, creating a
1827 		 * stripe for each.  Stop when lgrp_start is visited again.
1828 		 */
1829 		do {
1830 			/* mblock may not span all lgroups */
1831 			if (base >= end)
1832 				break;
1833 
1834 			mnode = lgrphand;
1835 			ASSERT(mnode < max_mem_nodes);
1836 
1837 			/*
1838 			 * Calculate the size of the fragment that does not
1839 			 * belong to the mnode in the last partial stride.
1840 			 */
1841 			frag = (end - (base - offset)) & (stride - 1);
1842 			if (frag == 0) {
1843 				/* remove the gap */
1844 				remove = stride - stripe;
1845 			} else if (frag < stripe) {
1846 				/* fragment fits in stripe; keep it all */
1847 				remove = 0;
1848 			} else {
1849 				/* fragment is large; trim after whole stripe */
1850 				remove = frag - stripe;
1851 			}
1852 
1853 			ms = &mc->mc_stripes[i * max_locality_groups + mnode];
1854 			ms->physbase = btop(base);
1855 			ms->physmax = btop(end - 1 - remove);
1856 			ms->offset = btop(offset);
1857 			ms->exists = 1;
1858 
1859 			base = stripe_end;
1860 			stripe_end += stripe;
1861 			offset = 0;
1862 			lgrphand = (((base + ra_to_pa) & home_mask) >>
1863 			    home_mask_shift);
1864 		} while (lgrphand != lgrp_start);
1865 	}
1866 }
1867 
/*
 * INTERSECT(a, b, c, d)
 *
 * If the inclusive ranges [a, b] and [c, d] overlap, clip c and d in
 * place to the overlapping part.  Otherwise execute "continue", so the
 * macro may only be used inside a loop; the rest of the current
 * iteration is skipped.  The arguments are evaluated more than once and
 * c and d must be lvalues.
 */
#define	INTERSECT(a, b, c, d)				\
	if (((a) >= (c) && (a) <= (d)) ||		\
	    ((c) >= (a) && (c) <= (b))) {		\
		(c) = MAX((a), (c));			\
		(d) = MIN((b), (d));			\
	} else {					\
		ASSERT((a) >= (d) || (b) <= (c));	\
		continue;				\
	}
1877 
1878 /*
1879  * mnode_update
1880  *
1881  * Read stripes from mc and update mnode extents.  The mnode extents are
1882  * part of the live configuration, so this can only be done at boot time
1883  * or while holding the mpo_wr_lock.
1884  */
1885 
1886 static void
1887 mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1888 {
1889 	int i, j, mnode, found;
1890 	pfn_t base, end;
1891 	mem_stripe_t *ms;
1892 
1893 	MPO_DEBUG("mnode_udpate: basepfn: %lx  endpfn: %lx\n", ubase, uend);
1894 
1895 	if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896 		if (utype == U_ADD)
1897 			mpo_mem_node_add_slice(ubase, uend);
1898 		else if (utype == U_DEL)
1899 			mpo_mem_node_del_slice(ubase, uend);
1900 		else
1901 			panic("mnode update: %d: invalid\n", utype);
1902 		return;
1903 	}
1904 
1905 	found = 0;
1906 	for (i = 0; i < mc->mc_nmblocks; i++) {
1907 		for (mnode = 0; mnode < max_locality_groups; mnode++) {
1908 
1909 			j = i * max_locality_groups + mnode;
1910 			ms = &mc->mc_stripes[j];
1911 			if (!ms->exists)
1912 				continue;
1913 
1914 			base = ms->physbase;
1915 			end = ms->physmax;
1916 
1917 			/*
1918 			 * Look for the mstripes intersecting this slice.
1919 			 *
1920 			 * The mstripe and slice pairs may not be equal
1921 			 * if a subset of a mblock is added/deleted.
1922 			 */
1923 			switch (utype) {
1924 			case U_ADD:
1925 				INTERSECT(ubase, uend, base, end);
1926 				/*FALLTHROUGH*/
1927 			case U_ADD_ALL:
1928 				if (n_locality_groups > 1)
1929 					mpo_plat_assign_lgrphand_to_mem_node(
1930 					    mnode, mnode);
1931 				mpo_mem_node_add_slice(base, end);
1932 				break;
1933 			case U_DEL:
1934 				INTERSECT(ubase, uend, base, end);
1935 				mpo_mem_node_del_slice(base, end);
1936 				break;
1937 			default:
1938 				panic("mnode_update: %d: invalid\n", utype);
1939 				break;
1940 			}
1941 
1942 			found++;
1943 		}
1944 	}
1945 
1946 	if (!found)
1947 		panic("mnode_update: mstripe not found");
1948 
1949 #ifdef	DEBUG
1950 	if (utype == U_ADD_ALL || utype == U_DEL)
1951 		return;
1952 	found = 0;
1953 	for (i = 0; i < max_mem_nodes; i++) {
1954 		if (!mem_node_config[i].exists)
1955 			continue;
1956 		if (ubase >= mem_node_config[i].physbase &&
1957 		    ubase <= mem_node_config[i].physmax)
1958 			found |= 1;
1959 		if (uend >= mem_node_config[i].physbase &&
1960 		    uend <= mem_node_config[i].physmax)
1961 			found |= 2;
1962 	}
1963 	ASSERT(found == 3);
1964 	{
1965 		pfn_t minpfn, maxpfn;
1966 
1967 		mem_node_max_range(&minpfn, &maxpfn);
1968 		ASSERT(minpfn <= ubase);
1969 		ASSERT(maxpfn >= uend);
1970 	}
1971 #endif
1972 }
1973 
1974 /*
1975  * Plat_slice_add()/plat_slice_del() are the platform hooks
1976  * for adding/deleting a pfn range to/from the system.
1977  *
1978  * Platform_slice_add() is used for both boot/DR cases.
1979  *
1980  * - Zeus has already added the mblocks to the MD, so read the updated
1981  *   MD and allocate all data structures required to manage the new memory
1982  *   configuration.
1983  *
1984  * - Recompute the stripes which are derived from the mblocks.
1985  *
1986  * - Update (expand) the mnode extents and install the modified mblocks as
1987  *   the new mpo config.  This must be done while holding the mpo_wr_lock
1988  *   to guarantee that no other threads access the mpo meta-data.
1989  *
1990  * - Unlock MPO data structures; the new config is live.  Free the old config.
1991  *
1992  * Plat_slice_del() is used for DR only.
1993  *
1994  * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995  *   the old mpo mblocks and delete the range from the copy.
1996  *
1997  * - Recompute the stripes which are derived from the mblocks.
1998  *
1999  * - Update (shrink) the mnode extents and install the modified mblocks as
2000  *   the new mpo config.  This must be done while holding the mpo_wr_lock
2001  *   to guarantee that no other threads access the mpo meta-data.
2002  *
2003  * - Unlock MPO data structures; the new config is live.  Free the old config.
2004  */
2005 
2006 void
2007 plat_slice_add(pfn_t base, pfn_t end)
2008 {
2009 	mpo_config_t old_config = mpo_config;
2010 	mpo_config_t new_config;
2011 
2012 	VALIDATE_SLICE(base, end);
2013 	mblock_update_add(&new_config);
2014 	mstripe_update(&new_config);
2015 	mpo_wr_lock();
2016 	mblock_install(&new_config);
2017 	/* Use new config to add all ranges for mnode_update */
2018 	mnode_update(&new_config, base, end, U_ADD);
2019 	mpo_genid++;
2020 	mpo_wr_unlock();
2021 	mblock_free(&old_config);
2022 }
2023 
2024 void
2025 plat_slice_del(pfn_t base, pfn_t end)
2026 {
2027 	mpo_config_t old_config = mpo_config;
2028 	mpo_config_t new_config;
2029 
2030 	VALIDATE_SLICE(base, end);
2031 	mblock_update_del(&new_config, &old_config, base, end);
2032 	mstripe_update(&new_config);
2033 	mpo_wr_lock();
2034 	/* Use old config to find deleted range for mnode_update */
2035 	mnode_update(&old_config, base, end, U_DEL);
2036 	mblock_install(&new_config);
2037 	mpo_genid++;
2038 	mpo_wr_unlock();
2039 	mblock_free(&old_config);
2040 }
2041