xref: /titanic_51/usr/src/uts/sun4u/ngdr/io/dr_mem.c (revision c227543f6890bd6f2054360ec1820bfef8132431)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * DR memory support routines.
29  */
30 
31 #include <sys/note.h>
32 #include <sys/debug.h>
33 #include <sys/types.h>
34 #include <sys/errno.h>
35 #include <sys/param.h>
36 #include <sys/dditypes.h>
37 #include <sys/kmem.h>
38 #include <sys/conf.h>
39 #include <sys/ddi.h>
40 #include <sys/sunddi.h>
41 #include <sys/sunndi.h>
42 #include <sys/ddi_impldefs.h>
43 #include <sys/ndi_impldefs.h>
44 #include <sys/sysmacros.h>
45 #include <sys/machsystm.h>
46 #include <sys/spitregs.h>
47 #include <sys/cpuvar.h>
48 #include <sys/promif.h>
49 #include <vm/seg_kmem.h>
50 #include <sys/lgrp.h>
51 #include <sys/platform_module.h>
52 
53 #include <vm/page.h>
54 
55 #include <sys/dr.h>
56 #include <sys/dr_util.h>
57 
58 extern struct memlist	*phys_install;
59 
60 /* TODO: push this reference below drmach line */
61 extern int		kcage_on;
62 
63 /* for the DR*INTERNAL_ERROR macros.  see sys/dr.h. */
64 static char *dr_ie_fmt = "dr_mem.c %d";
65 
66 static int	dr_post_detach_mem_unit(dr_mem_unit_t *mp);
67 static int	dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *mlist);
68 static int	dr_select_mem_target(dr_handle_t *hp, dr_mem_unit_t *mp,
69     struct memlist *ml);
70 static void	dr_init_mem_unit_data(dr_mem_unit_t *mp);
71 
72 static int 	memlist_canfit(struct memlist *s_mlist,
73     struct memlist *t_mlist);
74 
/*
 * dr_mem_unit_t.sbm_flags
 */
#define	DR_MFLAG_RESERVED	0x01	/* mem unit reserved for delete */
#define	DR_MFLAG_SOURCE		0x02	/* source brd of copy/rename op */
#define	DR_MFLAG_TARGET		0x04	/* target brd of copy/rename op */
#define	DR_MFLAG_MEMUPSIZE	0x08	/* move from big to small board */
#define	DR_MFLAG_MEMDOWNSIZE	0x10	/* move from small to big board */
/* move to different size board: both resize bits (0x18) */
#define	DR_MFLAG_MEMRESIZE	(DR_MFLAG_MEMUPSIZE | DR_MFLAG_MEMDOWNSIZE)
#define	DR_MFLAG_RELOWNER	0x20	/* memory release (delete) owner */
#define	DR_MFLAG_RELDONE	0x40	/* memory release (delete) done */

/*
 * helper macros: convert between page units and 64-bit byte units
 * by shifting with PAGESHIFT.
 */
#define	_ptob64(p) ((uint64_t)(p) << PAGESHIFT)
#define	_b64top(b) ((pgcnt_t)((b) >> PAGESHIFT))
90 
91 static struct memlist *
92 dr_get_memlist(dr_mem_unit_t *mp)
93 {
94 	struct memlist	*mlist = NULL;
95 	sbd_error_t	*err;
96 	static fn_t	f = "dr_get_memlist";
97 
98 	PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path);
99 
100 	/*
101 	 * Return cached memlist, if present.
102 	 * This memlist will be present following an
103 	 * unconfigure (a.k.a: detach) of this memunit.
104 	 * It should only be used in the case were a configure
105 	 * is bringing this memunit back in without going
106 	 * through the disconnect and connect states.
107 	 */
108 	if (mp->sbm_mlist) {
109 		PR_MEM("%s: found cached memlist\n", f);
110 
111 		mlist = memlist_dup(mp->sbm_mlist);
112 	} else {
113 		uint64_t basepa = _ptob64(mp->sbm_basepfn);
114 
115 		/* attempt to construct a memlist using phys_install */
116 
117 		/* round down to slice base address */
118 		basepa &= ~(mp->sbm_slice_size - 1);
119 
120 		/* get a copy of phys_install to edit */
121 		memlist_read_lock();
122 		mlist = memlist_dup(phys_install);
123 		memlist_read_unlock();
124 
125 		/* trim lower irrelevant span */
126 		if (mlist)
127 			mlist = memlist_del_span(mlist, 0ull, basepa);
128 
129 		/* trim upper irrelevant span */
130 		if (mlist) {
131 			uint64_t endpa;
132 
133 			basepa += mp->sbm_slice_size;
134 			endpa = _ptob64(physmax + 1);
135 			if (endpa > basepa)
136 				mlist = memlist_del_span(
137 				    mlist,
138 				    basepa,
139 				    endpa - basepa);
140 		}
141 
142 		if (mlist) {
143 			/* successfully built a memlist */
144 			PR_MEM("%s: derived memlist from phys_install\n", f);
145 		}
146 
147 		/* if no mlist yet, try platform layer */
148 		if (!mlist) {
149 			err = drmach_mem_get_memlist(
150 			    mp->sbm_cm.sbdev_id, &mlist);
151 			if (err) {
152 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
153 				mlist = NULL; /* paranoia */
154 			}
155 		}
156 	}
157 
158 	PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path);
159 	PR_MEMLIST_DUMP(mlist);
160 
161 	return (mlist);
162 }
163 
164 typedef struct {
165 	kcondvar_t cond;
166 	kmutex_t lock;
167 	int error;
168 	int done;
169 } dr_release_mem_sync_t;
170 
171 /*
172  * Memory has been logically removed by the time this routine is called.
173  */
174 static void
175 dr_mem_del_done(void *arg, int error)
176 {
177 	dr_release_mem_sync_t *ds = arg;
178 
179 	mutex_enter(&ds->lock);
180 	ds->error = error;
181 	ds->done = 1;
182 	cv_signal(&ds->cond);
183 	mutex_exit(&ds->lock);
184 }
185 
186 /*
187  * When we reach here the memory being drained should have
188  * already been reserved in dr_pre_release_mem().
189  * Our only task here is to kick off the "drain" and wait
190  * for it to finish.
191  */
192 void
193 dr_release_mem(dr_common_unit_t *cp)
194 {
195 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
196 	int		err;
197 	dr_release_mem_sync_t rms;
198 	static fn_t	f = "dr_release_mem";
199 
200 	/* check that this memory unit has been reserved */
201 	if (!(mp->sbm_flags & DR_MFLAG_RELOWNER)) {
202 		DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
203 		return;
204 	}
205 
206 	bzero((void *) &rms, sizeof (rms));
207 
208 	mutex_init(&rms.lock, NULL, MUTEX_DRIVER, NULL);
209 	cv_init(&rms.cond, NULL, CV_DRIVER, NULL);
210 
211 	mutex_enter(&rms.lock);
212 	err = kphysm_del_start(mp->sbm_memhandle, dr_mem_del_done,
213 	    (void *) &rms);
214 	if (err == KPHYSM_OK) {
215 		/* wait for completion or interrupt */
216 		while (!rms.done) {
217 			if (cv_wait_sig(&rms.cond, &rms.lock) == 0) {
218 				/* then there is a pending UNIX signal */
219 				(void) kphysm_del_cancel(mp->sbm_memhandle);
220 
221 				/* wait for completion */
222 				while (!rms.done)
223 					cv_wait(&rms.cond, &rms.lock);
224 			}
225 		}
226 		/* get the result of the memory delete operation */
227 		err = rms.error;
228 	}
229 	mutex_exit(&rms.lock);
230 
231 	cv_destroy(&rms.cond);
232 	mutex_destroy(&rms.lock);
233 
234 	if (err != KPHYSM_OK) {
235 		int e_code;
236 
237 		switch (err) {
238 			case KPHYSM_ENOWORK:
239 				e_code = ESBD_NOERROR;
240 				break;
241 
242 			case KPHYSM_EHANDLE:
243 			case KPHYSM_ESEQUENCE:
244 				e_code = ESBD_INTERNAL;
245 				break;
246 
247 			case KPHYSM_ENOTVIABLE:
248 				e_code = ESBD_MEM_NOTVIABLE;
249 				break;
250 
251 			case KPHYSM_EREFUSED:
252 				e_code = ESBD_MEM_REFUSED;
253 				break;
254 
255 			case KPHYSM_ENONRELOC:
256 				e_code = ESBD_MEM_NONRELOC;
257 				break;
258 
259 			case KPHYSM_ECANCELLED:
260 				e_code = ESBD_MEM_CANCELLED;
261 				break;
262 
263 			case KPHYSM_ERESOURCE:
264 				e_code = ESBD_MEMFAIL;
265 				break;
266 
267 			default:
268 				cmn_err(CE_WARN,
269 				    "%s: unexpected kphysm error code %d,"
270 				    " id 0x%p",
271 				    f, err, mp->sbm_cm.sbdev_id);
272 
273 				e_code = ESBD_IO;
274 				break;
275 		}
276 
277 		if (e_code != ESBD_NOERROR) {
278 			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
279 		}
280 	}
281 }
282 
283 void
284 dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
285 {
286 	_NOTE(ARGUNUSED(hp))
287 
288 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
289 	struct memlist	*ml, *mc;
290 	sbd_error_t	*err;
291 	static fn_t	f = "dr_attach_mem";
292 
293 	PR_MEM("%s...\n", f);
294 
295 	dr_lock_status(hp->h_bd);
296 	err = drmach_configure(cp->sbdev_id, 0);
297 	dr_unlock_status(hp->h_bd);
298 	if (err) {
299 		DRERR_SET_C(&cp->sbdev_error, &err);
300 		return;
301 	}
302 
303 	ml = dr_get_memlist(mp);
304 	for (mc = ml; mc; mc = mc->ml_next) {
305 		int		 rv;
306 		sbd_error_t	*err;
307 
308 		rv = kphysm_add_memory_dynamic(
309 		    (pfn_t)(mc->ml_address >> PAGESHIFT),
310 		    (pgcnt_t)(mc->ml_size >> PAGESHIFT));
311 		if (rv != KPHYSM_OK) {
312 			/*
313 			 * translate kphysm error and
314 			 * store in devlist error
315 			 */
316 			switch (rv) {
317 			case KPHYSM_ERESOURCE:
318 				rv = ESBD_NOMEM;
319 				break;
320 
321 			case KPHYSM_EFAULT:
322 				rv = ESBD_FAULT;
323 				break;
324 
325 			default:
326 				rv = ESBD_INTERNAL;
327 				break;
328 			}
329 
330 			if (rv == ESBD_INTERNAL) {
331 				DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
332 			} else
333 				dr_dev_err(CE_WARN, &mp->sbm_cm, rv);
334 			break;
335 		}
336 
337 		err = drmach_mem_add_span(
338 		    mp->sbm_cm.sbdev_id, mc->ml_address, mc->ml_size);
339 		if (err) {
340 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
341 			break;
342 		}
343 	}
344 
345 	memlist_delete(ml);
346 
347 	/* back out if configure failed */
348 	if (mp->sbm_cm.sbdev_error != NULL) {
349 		dr_lock_status(hp->h_bd);
350 		err = drmach_unconfigure(cp->sbdev_id,
351 		    DEVI_BRANCH_DESTROY);
352 		if (err)
353 			sbd_err_clear(&err);
354 		dr_unlock_status(hp->h_bd);
355 	}
356 }
357 
358 #define	DR_SCRUB_VALUE	0x0d0e0a0d0b0e0e0fULL
359 
360 static void
361 dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
362 {
363 #ifdef DEBUG
364 	clock_t		stime = ddi_get_lbolt();
365 #endif /* DEBUG */
366 
367 	struct memlist	*ml;
368 	uint64_t	scrub_value = DR_SCRUB_VALUE;
369 	processorid_t	cpuid;
370 	static fn_t	f = "dr_mem_ecache_scrub";
371 
372 	cpuid = drmach_mem_cpu_affinity(mp->sbm_cm.sbdev_id);
373 	affinity_set(cpuid);
374 
375 	PR_MEM("%s: using proc %d, memlist...\n", f,
376 	    (cpuid == CPU_CURRENT) ? CPU->cpu_id : cpuid);
377 	PR_MEMLIST_DUMP(mlist);
378 
379 	for (ml = mlist; ml; ml = ml->ml_next) {
380 		uint64_t	dst_pa;
381 		uint64_t	nbytes;
382 
383 		/* calculate the destination physical address */
384 		dst_pa = ml->ml_address;
385 		if (ml->ml_address & PAGEOFFSET)
386 			cmn_err(CE_WARN,
387 			    "%s: address (0x%lx) not on "
388 			    "page boundary", f, ml->ml_address);
389 
390 		nbytes = ml->ml_size;
391 		if (ml->ml_size & PAGEOFFSET)
392 			cmn_err(CE_WARN,
393 			    "%s: size (0x%lx) not on "
394 			    "page boundary", f, ml->ml_size);
395 
396 		/*LINTED*/
397 		while (nbytes > 0) {
398 			/* write 64 bits to dst_pa */
399 			stdphys(dst_pa, scrub_value);
400 
401 			/* increment/decrement by cacheline sizes */
402 			dst_pa += DRMACH_COHERENCY_UNIT;
403 			nbytes -= DRMACH_COHERENCY_UNIT;
404 		}
405 	}
406 
407 	/*
408 	 * flush this cpu's ecache and take care to ensure
409 	 * that all of it's bus transactions have retired.
410 	 */
411 	drmach_cpu_flush_ecache_sync();
412 
413 	affinity_clear();
414 
415 #ifdef DEBUG
416 	stime = ddi_get_lbolt() - stime;
417 	PR_MEM("%s: scrub ticks = %ld (%ld secs)\n", f, stime, stime / hz);
418 #endif /* DEBUG */
419 }
420 
421 static int
422 dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
423 {
424 	time_t		 copytime;
425 	drmachid_t	 cr_id;
426 	dr_sr_handle_t	*srhp;
427 	struct memlist	*c_ml, *d_ml;
428 	sbd_error_t	*err;
429 	static fn_t	 f = "dr_move_memory";
430 
431 	PR_MEM("%s: (INLINE) moving memory from %s to %s\n",
432 	    f,
433 	    s_mp->sbm_cm.sbdev_path,
434 	    t_mp->sbm_cm.sbdev_path);
435 
436 	ASSERT(s_mp->sbm_flags & DR_MFLAG_SOURCE);
437 	ASSERT(s_mp->sbm_peer == t_mp);
438 	ASSERT(s_mp->sbm_mlist);
439 
440 	ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
441 	ASSERT(t_mp->sbm_peer == s_mp);
442 
443 	/*
444 	 * create a memlist of spans to copy by removing
445 	 * the spans that have been deleted, if any, from
446 	 * the full source board memlist.  s_mp->sbm_del_mlist
447 	 * will be NULL if there were no spans deleted from
448 	 * the source board.
449 	 */
450 	c_ml = memlist_dup(s_mp->sbm_mlist);
451 	d_ml = s_mp->sbm_del_mlist;
452 	while (d_ml != NULL) {
453 		c_ml = memlist_del_span(c_ml, d_ml->ml_address, d_ml->ml_size);
454 		d_ml = d_ml->ml_next;
455 	}
456 
457 	affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));
458 
459 	err = drmach_copy_rename_init(
460 	    t_mp->sbm_cm.sbdev_id, _ptob64(t_mp->sbm_slice_offset),
461 	    s_mp->sbm_cm.sbdev_id, c_ml, &cr_id);
462 	if (err) {
463 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
464 		affinity_clear();
465 		return (-1);
466 	}
467 
468 	srhp = dr_get_sr_handle(hp);
469 	ASSERT(srhp);
470 
471 	copytime = ddi_get_lbolt();
472 
473 	/* Quiesce the OS.  */
474 	if (dr_suspend(srhp)) {
475 		cmn_err(CE_WARN, "%s: failed to quiesce OS"
476 		    " for copy-rename", f);
477 
478 		dr_release_sr_handle(srhp);
479 		err = drmach_copy_rename_fini(cr_id);
480 		if (err) {
481 			/*
482 			 * no error is expected since the program has
483 			 * not yet run.
484 			 */
485 
486 			/* catch this in debug kernels */
487 			ASSERT(0);
488 
489 			sbd_err_clear(&err);
490 		}
491 
492 		/* suspend error reached via hp */
493 		s_mp->sbm_cm.sbdev_error = hp->h_err;
494 		hp->h_err = NULL;
495 
496 		affinity_clear();
497 		return (-1);
498 	}
499 
500 	/*
501 	 * Rename memory for lgroup.
502 	 * Source and target board numbers are packaged in arg.
503 	 */
504 	{
505 		dr_board_t	*t_bp, *s_bp;
506 
507 		s_bp = s_mp->sbm_cm.sbdev_bp;
508 		t_bp = t_mp->sbm_cm.sbdev_bp;
509 
510 		lgrp_plat_config(LGRP_CONFIG_MEM_RENAME,
511 		    (uintptr_t)(s_bp->b_num | (t_bp->b_num << 16)));
512 	}
513 
514 	drmach_copy_rename(cr_id);
515 
516 	/* Resume the OS.  */
517 	dr_resume(srhp);
518 
519 	copytime = ddi_get_lbolt() - copytime;
520 
521 	dr_release_sr_handle(srhp);
522 	err = drmach_copy_rename_fini(cr_id);
523 	if (err)
524 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
525 
526 	affinity_clear();
527 
528 	PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
529 	    f, copytime, copytime / hz);
530 
531 	/* return -1 if dr_suspend or copy/rename recorded an error */
532 	return (err == NULL ? 0 : -1);
533 }
534 
535 /*
536  * If detaching node contains memory that is "non-permanent"
537  * then the memory adr's are simply cleared.  If the memory
538  * is non-relocatable, then do a copy-rename.
539  */
540 void
541 dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
542 {
543 	int			rv = 0;
544 	dr_mem_unit_t		*s_mp = (dr_mem_unit_t *)cp;
545 	dr_mem_unit_t		*t_mp;
546 	dr_state_t		state;
547 	static fn_t		f = "dr_detach_mem";
548 
549 	PR_MEM("%s...\n", f);
550 
551 	/* lookup target mem unit and target board structure, if any */
552 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
553 		t_mp = s_mp->sbm_peer;
554 		ASSERT(t_mp != NULL);
555 		ASSERT(t_mp->sbm_peer == s_mp);
556 	} else {
557 		t_mp = NULL;
558 	}
559 
560 	/* verify mem unit's state is UNREFERENCED */
561 	state = s_mp->sbm_cm.sbdev_state;
562 	if (state != DR_STATE_UNREFERENCED) {
563 		dr_dev_err(CE_IGNORE, &s_mp->sbm_cm, ESBD_STATE);
564 		return;
565 	}
566 
567 	/* verify target mem unit's state is UNREFERENCED, if any */
568 	if (t_mp != NULL) {
569 		state = t_mp->sbm_cm.sbdev_state;
570 		if (state != DR_STATE_UNREFERENCED) {
571 			dr_dev_err(CE_IGNORE, &t_mp->sbm_cm, ESBD_STATE);
572 			return;
573 		}
574 	}
575 
576 	/*
577 	 * Scrub deleted memory.  This will cause all cachelines
578 	 * referencing the memory to only be in the local cpu's
579 	 * ecache.
580 	 */
581 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
582 		/* no del mlist for src<=dst mem size copy/rename */
583 		if (s_mp->sbm_del_mlist)
584 			dr_mem_ecache_scrub(s_mp, s_mp->sbm_del_mlist);
585 	}
586 	if (t_mp != NULL && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
587 		ASSERT(t_mp->sbm_del_mlist);
588 		dr_mem_ecache_scrub(t_mp, t_mp->sbm_del_mlist);
589 	}
590 
591 	/*
592 	 * If there is no target board (no copy/rename was needed), then
593 	 * we're done!
594 	 */
595 	if (t_mp == NULL) {
596 		sbd_error_t *err;
597 		/*
598 		 * Reprogram interconnect hardware and disable
599 		 * memory controllers for memory node that's going away.
600 		 */
601 
602 		err = drmach_mem_disable(s_mp->sbm_cm.sbdev_id);
603 		if (err) {
604 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
605 			rv = -1;
606 		}
607 	} else {
608 		rv = dr_move_memory(hp, s_mp, t_mp);
609 		PR_MEM("%s: %s memory COPY-RENAME (board %d -> %d)\n",
610 		    f,
611 		    rv ? "FAILED" : "COMPLETED",
612 		    s_mp->sbm_cm.sbdev_bp->b_num,
613 		    t_mp->sbm_cm.sbdev_bp->b_num);
614 
615 		if (rv != 0)
616 			(void) dr_cancel_mem(s_mp);
617 	}
618 
619 	if (rv == 0) {
620 		sbd_error_t *err;
621 
622 		dr_lock_status(hp->h_bd);
623 		err = drmach_unconfigure(s_mp->sbm_cm.sbdev_id,
624 		    DEVI_BRANCH_DESTROY);
625 		dr_unlock_status(hp->h_bd);
626 		if (err)
627 			sbd_err_clear(&err);
628 	}
629 }
630 
631 #ifndef _STARFIRE
632 /*
633  * XXX workaround for certain lab configurations (see also starcat drmach.c)
634  * Temporary code to get around observed incorrect results from
635  * kphysm_del_span_query when the queried span contains address spans
636  * not occupied by memory in between spans that do have memory.
637  * This routine acts as a wrapper to kphysm_del_span_query.  It builds
638  * a memlist from phys_install of spans that exist between base and
639  * base + npages, inclusively.  Kphysm_del_span_query is called for each
640  * node in the memlist with the results accumulated in *mp.
641  */
642 static int
643 dr_del_span_query(pfn_t base, pgcnt_t npages, memquery_t *mp)
644 {
645 	uint64_t	 pa = _ptob64(base);
646 	uint64_t	 sm = ~ (137438953472ull - 1);
647 	uint64_t	 sa = pa & sm;
648 	struct memlist	*mlist, *ml;
649 	int		 rv;
650 
651 	npages = npages; /* silence lint */
652 	memlist_read_lock();
653 	mlist = memlist_dup(phys_install);
654 	memlist_read_unlock();
655 
656 again:
657 	for (ml = mlist; ml; ml = ml->ml_next) {
658 		if ((ml->ml_address & sm) != sa) {
659 			mlist = memlist_del_span(mlist,
660 			    ml->ml_address, ml->ml_size);
661 			goto again;
662 		}
663 	}
664 
665 	mp->phys_pages = 0;
666 	mp->managed = 0;
667 	mp->nonrelocatable = 0;
668 	mp->first_nonrelocatable = (pfn_t)-1;	/* XXX */
669 	mp->last_nonrelocatable = 0;
670 
671 	for (ml = mlist; ml; ml = ml->ml_next) {
672 		memquery_t mq;
673 
674 		rv = kphysm_del_span_query(
675 		    _b64top(ml->ml_address), _b64top(ml->ml_size), &mq);
676 		if (rv)
677 			break;
678 
679 		mp->phys_pages += mq.phys_pages;
680 		mp->managed += mq.managed;
681 		mp->nonrelocatable += mq.nonrelocatable;
682 
683 		if (mq.nonrelocatable != 0) {
684 			if (mq.first_nonrelocatable < mp->first_nonrelocatable)
685 				mp->first_nonrelocatable =
686 				    mq.first_nonrelocatable;
687 			if (mq.last_nonrelocatable > mp->last_nonrelocatable)
688 				mp->last_nonrelocatable =
689 				    mq.last_nonrelocatable;
690 		}
691 	}
692 
693 	if (mp->nonrelocatable == 0)
694 		mp->first_nonrelocatable = 0;	/* XXX */
695 
696 	memlist_delete(mlist);
697 	return (rv);
698 }
699 
700 #define	kphysm_del_span_query dr_del_span_query
701 #endif /* _STARFIRE */
702 
703 /*
704  * NOTE: This routine is only partially smart about multiple
705  *	 mem-units.  Need to make mem-status structure smart
706  *	 about them also.
707  */
708 int
709 dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp)
710 {
711 	int		m, mix;
712 	memdelstat_t	mdst;
713 	memquery_t	mq;
714 	dr_board_t	*bp;
715 	dr_mem_unit_t	*mp;
716 	sbd_mem_stat_t	*msp;
717 	static fn_t	f = "dr_mem_status";
718 
719 	bp = hp->h_bd;
720 	devset &= DR_DEVS_PRESENT(bp);
721 
722 	for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) {
723 		int		rv;
724 		sbd_error_t	*err;
725 		drmach_status_t	 pstat;
726 		dr_mem_unit_t	*p_mp;
727 
728 		if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0)
729 			continue;
730 
731 		mp = dr_get_mem_unit(bp, m);
732 
733 		if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) {
734 			/* present, but not fully initialized */
735 			continue;
736 		}
737 
738 		if (mp->sbm_cm.sbdev_id == (drmachid_t)0)
739 			continue;
740 
741 		/* fetch platform status */
742 		err = drmach_status(mp->sbm_cm.sbdev_id, &pstat);
743 		if (err) {
744 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
745 			continue;
746 		}
747 
748 		msp = &dsp->d_mem;
749 		bzero((caddr_t)msp, sizeof (*msp));
750 
751 		(void) strncpy(msp->ms_cm.c_id.c_name, pstat.type,
752 		    sizeof (msp->ms_cm.c_id.c_name));
753 		msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type;
754 		msp->ms_cm.c_id.c_unit = SBD_NULL_UNIT;
755 		msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond;
756 		msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy;
757 		msp->ms_cm.c_time = mp->sbm_cm.sbdev_time;
758 		msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate;
759 
760 		msp->ms_totpages = mp->sbm_npages;
761 		msp->ms_basepfn = mp->sbm_basepfn;
762 		msp->ms_pageslost = mp->sbm_pageslost;
763 		msp->ms_cage_enabled = kcage_on;
764 
765 		if (mp->sbm_flags & DR_MFLAG_RESERVED)
766 			p_mp = mp->sbm_peer;
767 		else
768 			p_mp = NULL;
769 
770 		if (p_mp == NULL) {
771 			msp->ms_peer_is_target = 0;
772 			msp->ms_peer_ap_id[0] = '\0';
773 		} else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) {
774 			char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
775 			char *minor;
776 
777 			/*
778 			 * b_dip doesn't have to be held for ddi_pathname()
779 			 * because the board struct (dr_board_t) will be
780 			 * destroyed before b_dip detaches.
781 			 */
782 			(void) ddi_pathname(bp->b_dip, path);
783 			minor = strchr(p_mp->sbm_cm.sbdev_path, ':');
784 
785 			(void) snprintf(msp->ms_peer_ap_id,
786 			    sizeof (msp->ms_peer_ap_id), "%s%s",
787 			    path, (minor == NULL) ? "" : minor);
788 
789 			kmem_free(path, MAXPATHLEN);
790 
791 			if (p_mp->sbm_flags & DR_MFLAG_TARGET)
792 				msp->ms_peer_is_target = 1;
793 		}
794 
795 		if (mp->sbm_flags & DR_MFLAG_RELOWNER)
796 			rv = kphysm_del_status(mp->sbm_memhandle, &mdst);
797 		else
798 			rv = KPHYSM_EHANDLE;	/* force 'if' to fail */
799 
800 		if (rv == KPHYSM_OK) {
801 			/*
802 			 * Any pages above managed is "free",
803 			 * i.e. it's collected.
804 			 */
805 			msp->ms_detpages += (uint_t)(mdst.collected +
806 			    mdst.phys_pages - mdst.managed);
807 		} else {
808 			/*
809 			 * If we're UNREFERENCED or UNCONFIGURED,
810 			 * then the number of detached pages is
811 			 * however many pages are on the board.
812 			 * I.e. detached = not in use by OS.
813 			 */
814 			switch (msp->ms_cm.c_ostate) {
815 			/*
816 			 * changed to use cfgadm states
817 			 *
818 			 * was:
819 			 *	case DR_STATE_UNREFERENCED:
820 			 *	case DR_STATE_UNCONFIGURED:
821 			 */
822 			case SBD_STAT_UNCONFIGURED:
823 				msp->ms_detpages = msp->ms_totpages;
824 				break;
825 
826 			default:
827 				break;
828 			}
829 		}
830 
831 		/*
832 		 * kphysm_del_span_query can report non-reloc pages = total
833 		 * pages for memory that is not yet configured
834 		 */
835 		if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) {
836 
837 			rv = kphysm_del_span_query(mp->sbm_basepfn,
838 			    mp->sbm_npages, &mq);
839 
840 			if (rv == KPHYSM_OK) {
841 				msp->ms_managed_pages = mq.managed;
842 				msp->ms_noreloc_pages = mq.nonrelocatable;
843 				msp->ms_noreloc_first =
844 				    mq.first_nonrelocatable;
845 				msp->ms_noreloc_last =
846 				    mq.last_nonrelocatable;
847 				msp->ms_cm.c_sflags = 0;
848 				if (mq.nonrelocatable) {
849 					SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE,
850 					    msp->ms_cm.c_sflags);
851 				}
852 			} else {
853 				PR_MEM("%s: kphysm_del_span_query() = %d\n",
854 				    f, rv);
855 			}
856 		}
857 
858 		/*
859 		 * Check source unit state during copy-rename
860 		 */
861 		if ((mp->sbm_flags & DR_MFLAG_SOURCE) &&
862 		    (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED ||
863 		    mp->sbm_cm.sbdev_state == DR_STATE_RELEASE))
864 			msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED;
865 
866 		mix++;
867 		dsp++;
868 	}
869 
870 	return (mix);
871 }
872 
873 int
874 dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
875 {
876 	_NOTE(ARGUNUSED(hp))
877 
878 	int		err_flag = 0;
879 	int		d;
880 	sbd_error_t	*err;
881 	static fn_t	f = "dr_pre_attach_mem";
882 
883 	PR_MEM("%s...\n", f);
884 
885 	for (d = 0; d < devnum; d++) {
886 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
887 		dr_state_t	state;
888 
889 		cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path);
890 
891 		state = mp->sbm_cm.sbdev_state;
892 		switch (state) {
893 		case DR_STATE_UNCONFIGURED:
894 			PR_MEM("%s: recovering from UNCONFIG for %s\n",
895 			    f,
896 			    mp->sbm_cm.sbdev_path);
897 
898 			/* use memlist cached by dr_post_detach_mem_unit */
899 			ASSERT(mp->sbm_mlist != NULL);
900 			PR_MEM("%s: re-configuring cached memlist for %s:\n",
901 			    f, mp->sbm_cm.sbdev_path);
902 			PR_MEMLIST_DUMP(mp->sbm_mlist);
903 
904 			/* kphysm del handle should be have been freed */
905 			ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
906 
907 			/*FALLTHROUGH*/
908 
909 		case DR_STATE_CONNECTED:
910 			PR_MEM("%s: reprogramming mem hardware on %s\n",
911 			    f, mp->sbm_cm.sbdev_bp->b_path);
912 
913 			PR_MEM("%s: enabling %s\n",
914 			    f, mp->sbm_cm.sbdev_path);
915 
916 			err = drmach_mem_enable(mp->sbm_cm.sbdev_id);
917 			if (err) {
918 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
919 				err_flag = 1;
920 			}
921 			break;
922 
923 		default:
924 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE);
925 			err_flag = 1;
926 			break;
927 		}
928 
929 		/* exit for loop if error encountered */
930 		if (err_flag)
931 			break;
932 	}
933 
934 	return (err_flag ? -1 : 0);
935 }
936 
937 int
938 dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
939 {
940 	_NOTE(ARGUNUSED(hp))
941 
942 	int		d;
943 	static fn_t	f = "dr_post_attach_mem";
944 
945 	PR_MEM("%s...\n", f);
946 
947 	for (d = 0; d < devnum; d++) {
948 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
949 		struct memlist	*mlist, *ml;
950 
951 		mlist = dr_get_memlist(mp);
952 		if (mlist == NULL) {
953 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_MEMFAIL);
954 			continue;
955 		}
956 
957 		/*
958 		 * Verify the memory really did successfully attach
959 		 * by checking for its existence in phys_install.
960 		 */
961 		memlist_read_lock();
962 		if (memlist_intersect(phys_install, mlist) == 0) {
963 			memlist_read_unlock();
964 
965 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
966 
967 			PR_MEM("%s: %s memlist not in phys_install",
968 			    f, mp->sbm_cm.sbdev_path);
969 
970 			memlist_delete(mlist);
971 			continue;
972 		}
973 		memlist_read_unlock();
974 
975 		for (ml = mlist; ml != NULL; ml = ml->ml_next) {
976 			sbd_error_t *err;
977 
978 			err = drmach_mem_add_span(
979 			    mp->sbm_cm.sbdev_id,
980 			    ml->ml_address,
981 			    ml->ml_size);
982 			if (err)
983 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
984 		}
985 
986 		memlist_delete(mlist);
987 
988 		/*
989 		 * Destroy cached memlist, if any.
990 		 * There will be a cached memlist in sbm_mlist if
991 		 * this board is being configured directly after
992 		 * an unconfigure.
993 		 * To support this transition, dr_post_detach_mem
994 		 * left a copy of the last known memlist in sbm_mlist.
995 		 * This memlist could differ from any derived from
996 		 * hardware if while this memunit was last configured
997 		 * the system detected and deleted bad pages from
998 		 * phys_install.  The location of those bad pages
999 		 * will be reflected in the cached memlist.
1000 		 */
1001 		if (mp->sbm_mlist) {
1002 			memlist_delete(mp->sbm_mlist);
1003 			mp->sbm_mlist = NULL;
1004 		}
1005 
1006 /*
1007  * TODO: why is this call to dr_init_mem_unit_data here?
1008  * this has been done at discovery or connect time, so this is
1009  * probably redundant and unnecessary.
1010  */
1011 		dr_init_mem_unit_data(mp);
1012 	}
1013 
1014 	return (0);
1015 }
1016 
1017 int
1018 dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1019 {
1020 	_NOTE(ARGUNUSED(hp))
1021 
1022 	int d;
1023 
1024 	for (d = 0; d < devnum; d++) {
1025 		dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d];
1026 
1027 		cmn_err(CE_CONT, "OS unconfigure %s", mp->sbm_cm.sbdev_path);
1028 	}
1029 
1030 	return (0);
1031 }
1032 
1033 
1034 int
1035 dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1036 {
1037 	_NOTE(ARGUNUSED(hp))
1038 
1039 	int		d, rv;
1040 	static fn_t	f = "dr_post_detach_mem";
1041 
1042 	PR_MEM("%s...\n", f);
1043 
1044 	rv = 0;
1045 	for (d = 0; d < devnum; d++) {
1046 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1047 
1048 		ASSERT(mp->sbm_cm.sbdev_bp == hp->h_bd);
1049 
1050 		if (dr_post_detach_mem_unit(mp))
1051 			rv = -1;
1052 	}
1053 
1054 	return (rv);
1055 }
1056 
1057 static void
1058 dr_add_memory_spans(dr_mem_unit_t *mp, struct memlist *ml)
1059 {
1060 	static fn_t	f = "dr_add_memory_spans";
1061 
1062 	PR_MEM("%s...", f);
1063 	PR_MEMLIST_DUMP(ml);
1064 
1065 #ifdef DEBUG
1066 	memlist_read_lock();
1067 	if (memlist_intersect(phys_install, ml)) {
1068 		PR_MEM("%s:WARNING: memlist intersects with phys_install\n", f);
1069 	}
1070 	memlist_read_unlock();
1071 #endif
1072 
1073 	for (; ml; ml = ml->ml_next) {
1074 		pfn_t		 base;
1075 		pgcnt_t		 npgs;
1076 		int		 rv;
1077 		sbd_error_t	*err;
1078 
1079 		base = _b64top(ml->ml_address);
1080 		npgs = _b64top(ml->ml_size);
1081 
1082 		rv = kphysm_add_memory_dynamic(base, npgs);
1083 
1084 		err = drmach_mem_add_span(
1085 		    mp->sbm_cm.sbdev_id,
1086 		    ml->ml_address,
1087 		    ml->ml_size);
1088 
1089 		if (err)
1090 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1091 
1092 		if (rv != KPHYSM_OK) {
1093 			cmn_err(CE_WARN, "%s:"
1094 			    " unexpected kphysm_add_memory_dynamic"
1095 			    " return value %d;"
1096 			    " basepfn=0x%lx, npages=%ld\n",
1097 			    f, rv, base, npgs);
1098 
1099 			continue;
1100 		}
1101 	}
1102 }
1103 
1104 static int
1105 dr_post_detach_mem_unit(dr_mem_unit_t *s_mp)
1106 {
1107 	uint64_t	sz = s_mp->sbm_slice_size;
1108 	uint64_t	sm = sz - 1;
1109 	/* old and new below refer to PAs before and after copy-rename */
1110 	uint64_t	s_old_basepa, s_new_basepa;
1111 	uint64_t	t_old_basepa, t_new_basepa;
1112 	uint64_t	t_new_smallsize = 0;
1113 	dr_mem_unit_t	*t_mp, *x_mp;
1114 	struct memlist	*ml;
1115 	int		rv;
1116 	sbd_error_t	*err;
1117 	static fn_t	f = "dr_post_detach_mem_unit";
1118 
1119 	PR_MEM("%s...\n", f);
1120 
1121 	/* s_mp->sbm_del_mlist could be NULL, meaning no deleted spans */
1122 	PR_MEM("%s: %s: deleted memlist (EMPTY maybe okay):\n",
1123 	    f, s_mp->sbm_cm.sbdev_path);
1124 	PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1125 
1126 	/* sanity check */
1127 	ASSERT(s_mp->sbm_del_mlist == NULL ||
1128 	    (s_mp->sbm_flags & DR_MFLAG_RELDONE) != 0);
1129 
1130 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1131 		t_mp = s_mp->sbm_peer;
1132 		ASSERT(t_mp != NULL);
1133 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1134 		ASSERT(t_mp->sbm_peer == s_mp);
1135 
1136 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RELDONE);
1137 		ASSERT(t_mp->sbm_del_mlist);
1138 
1139 		PR_MEM("%s: target %s: deleted memlist:\n",
1140 		    f, t_mp->sbm_cm.sbdev_path);
1141 		PR_MEMLIST_DUMP(t_mp->sbm_del_mlist);
1142 	} else {
1143 		/* this is no target unit */
1144 		t_mp = NULL;
1145 	}
1146 
1147 	/*
1148 	 * Verify the memory really did successfully detach
1149 	 * by checking for its non-existence in phys_install.
1150 	 */
1151 	rv = 0;
1152 	memlist_read_lock();
1153 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
1154 		x_mp = s_mp;
1155 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1156 	}
1157 	if (rv == 0 && t_mp && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
1158 		x_mp = t_mp;
1159 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1160 	}
1161 	memlist_read_unlock();
1162 
1163 	if (rv) {
1164 		/* error: memlist still in phys_install */
1165 		DR_DEV_INTERNAL_ERROR(&x_mp->sbm_cm);
1166 	}
1167 
1168 	/*
1169 	 * clean mem unit state and bail out if an error has been recorded.
1170 	 */
1171 	rv = 0;
1172 	if (s_mp->sbm_cm.sbdev_error) {
1173 		PR_MEM("%s: %s flags=%x", f,
1174 		    s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1175 		DR_DEV_CLR_UNREFERENCED(&s_mp->sbm_cm);
1176 		DR_DEV_CLR_RELEASED(&s_mp->sbm_cm);
1177 		dr_device_transition(&s_mp->sbm_cm, DR_STATE_CONFIGURED);
1178 		rv = -1;
1179 	}
1180 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error != NULL) {
1181 		PR_MEM("%s: %s flags=%x", f,
1182 		    s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1183 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1184 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1185 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1186 		rv = -1;
1187 	}
1188 	if (rv)
1189 		goto cleanup;
1190 
1191 	s_old_basepa = _ptob64(s_mp->sbm_basepfn);
1192 	err = drmach_mem_get_base_physaddr(s_mp->sbm_cm.sbdev_id,
1193 	    &s_new_basepa);
1194 	ASSERT(err == NULL);
1195 
1196 	PR_MEM("%s:s_old_basepa: 0x%lx\n", f, s_old_basepa);
1197 	PR_MEM("%s:s_new_basepa: 0x%lx\n", f, s_new_basepa);
1198 
1199 	if (t_mp != NULL) {
1200 		struct memlist *s_copy_mlist;
1201 
1202 		t_old_basepa	= _ptob64(t_mp->sbm_basepfn);
1203 		err = drmach_mem_get_base_physaddr(t_mp->sbm_cm.sbdev_id,
1204 		    &t_new_basepa);
1205 		ASSERT(err == NULL);
1206 
1207 		PR_MEM("%s:t_old_basepa: 0x%lx\n", f, t_old_basepa);
1208 		PR_MEM("%s:t_new_basepa: 0x%lx\n", f, t_new_basepa);
1209 
1210 		/*
1211 		 * Construct copy list with original source addresses.
1212 		 * Used to add back excess target mem.
1213 		 */
1214 		s_copy_mlist = memlist_dup(s_mp->sbm_mlist);
1215 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1216 			s_copy_mlist = memlist_del_span(s_copy_mlist,
1217 			    ml->ml_address, ml->ml_size);
1218 		}
1219 
1220 		PR_MEM("%s: source copy list:\n:", f);
1221 		PR_MEMLIST_DUMP(s_copy_mlist);
1222 
1223 		/*
1224 		 * We had to swap mem-units, so update
1225 		 * memlists accordingly with new base
1226 		 * addresses.
1227 		 */
1228 		for (ml = t_mp->sbm_mlist; ml; ml = ml->ml_next) {
1229 			ml->ml_address -= t_old_basepa;
1230 			ml->ml_address += t_new_basepa;
1231 		}
1232 
1233 		/*
1234 		 * There is no need to explicitly rename the target delete
1235 		 * memlist, because sbm_del_mlist and sbm_mlist always
1236 		 * point to the same memlist for a copy/rename operation.
1237 		 */
1238 		ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1239 
1240 		PR_MEM("%s: renamed target memlist and delete memlist:\n", f);
1241 		PR_MEMLIST_DUMP(t_mp->sbm_mlist);
1242 
1243 		for (ml = s_mp->sbm_mlist; ml; ml = ml->ml_next) {
1244 			ml->ml_address -= s_old_basepa;
1245 			ml->ml_address += s_new_basepa;
1246 		}
1247 
1248 		PR_MEM("%s: renamed source memlist:\n", f);
1249 		PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1250 
1251 		/*
1252 		 * Keep track of dynamically added segments
1253 		 * since they cannot be split if we need to delete
1254 		 * excess source memory later for this board.
1255 		 */
1256 		if (t_mp->sbm_dyn_segs)
1257 			memlist_delete(t_mp->sbm_dyn_segs);
1258 		t_mp->sbm_dyn_segs = s_mp->sbm_dyn_segs;
1259 		s_mp->sbm_dyn_segs = NULL;
1260 
1261 		/*
1262 		 * If the target memory range with the new target base PA
1263 		 * extends beyond the usable slice, prevent any "target excess"
1264 		 * from being added back after this copy/rename and
1265 		 * calculate the new smaller size of the target board
1266 		 * to be set as part of target cleanup. The base + npages
1267 		 * must only include the range of memory up to the end of
1268 		 * this slice. This will only be used after a category 4
1269 		 * large-to-small target type copy/rename - see comments
1270 		 * in dr_select_mem_target.
1271 		 */
1272 		if (((t_new_basepa & sm) + _ptob64(t_mp->sbm_npages)) > sz) {
1273 			t_new_smallsize = sz - (t_new_basepa & sm);
1274 		}
1275 
1276 		if (s_mp->sbm_flags & DR_MFLAG_MEMRESIZE &&
1277 		    t_new_smallsize == 0) {
1278 			struct memlist	*t_excess_mlist;
1279 
1280 			/*
1281 			 * Add back excess target memory.
1282 			 * Subtract out the portion of the target memory
1283 			 * node that was taken over by the source memory
1284 			 * node.
1285 			 */
1286 			t_excess_mlist = memlist_dup(t_mp->sbm_mlist);
1287 			for (ml = s_copy_mlist; ml; ml = ml->ml_next) {
1288 				t_excess_mlist =
1289 				    memlist_del_span(t_excess_mlist,
1290 				    ml->ml_address, ml->ml_size);
1291 			}
1292 
1293 			/*
1294 			 * Update dynamically added segs
1295 			 */
1296 			for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1297 				t_mp->sbm_dyn_segs =
1298 				    memlist_del_span(t_mp->sbm_dyn_segs,
1299 				    ml->ml_address, ml->ml_size);
1300 			}
1301 			for (ml = t_excess_mlist; ml; ml = ml->ml_next) {
1302 				t_mp->sbm_dyn_segs =
1303 				    memlist_cat_span(t_mp->sbm_dyn_segs,
1304 				    ml->ml_address, ml->ml_size);
1305 			}
1306 			PR_MEM("%s: %s: updated dynamic seg list:\n",
1307 			    f, t_mp->sbm_cm.sbdev_path);
1308 			PR_MEMLIST_DUMP(t_mp->sbm_dyn_segs);
1309 
1310 			PR_MEM("%s: adding back remaining portion"
1311 			    " of %s, memlist:\n",
1312 			    f, t_mp->sbm_cm.sbdev_path);
1313 			PR_MEMLIST_DUMP(t_excess_mlist);
1314 
1315 			dr_add_memory_spans(s_mp, t_excess_mlist);
1316 			memlist_delete(t_excess_mlist);
1317 		}
1318 		memlist_delete(s_copy_mlist);
1319 
1320 #ifdef DEBUG
1321 		/*
1322 		 * Renaming s_mp->sbm_del_mlist is not necessary.  This
1323 		 * list is not used beyond this point, and in fact, is
1324 		 * disposed of at the end of this function.
1325 		 */
1326 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1327 			ml->ml_address -= s_old_basepa;
1328 			ml->ml_address += s_new_basepa;
1329 		}
1330 
1331 		PR_MEM("%s: renamed source delete memlist", f);
1332 		PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1333 #endif
1334 
1335 	}
1336 
1337 	if (t_mp != NULL) {
1338 		/* delete target's entire address space */
1339 		err = drmach_mem_del_span(t_mp->sbm_cm.sbdev_id,
1340 		    t_old_basepa & ~ sm, sz);
1341 		if (err)
1342 			DRERR_SET_C(&t_mp->sbm_cm.sbdev_error, &err);
1343 		ASSERT(err == NULL);
1344 
1345 		/*
1346 		 * After the copy/rename, the original address space
1347 		 * for the source board (which is now located on the
1348 		 * target board) may now have some excess to be deleted.
1349 		 * The amount is calculated by masking the slice
1350 		 * info and keeping the slice offset from t_new_basepa.
1351 		 */
1352 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1353 		    s_old_basepa & ~ sm, t_new_basepa & sm);
1354 		if (err)
1355 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1356 		ASSERT(err == NULL);
1357 
1358 	} else {
1359 		/* delete board's entire address space */
1360 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1361 		    s_old_basepa & ~ sm, sz);
1362 		if (err)
1363 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1364 		ASSERT(err == NULL);
1365 	}
1366 
1367 cleanup:
1368 	/* clean up target mem unit */
1369 	if (t_mp != NULL) {
1370 		memlist_delete(t_mp->sbm_del_mlist);
1371 		/* no need to delete sbm_mlist, it shares sbm_del_mlist */
1372 
1373 		t_mp->sbm_del_mlist = NULL;
1374 		t_mp->sbm_mlist = NULL;
1375 		t_mp->sbm_peer = NULL;
1376 		t_mp->sbm_flags = 0;
1377 		t_mp->sbm_cm.sbdev_busy = 0;
1378 		dr_init_mem_unit_data(t_mp);
1379 
1380 		/* reduce target size if new PAs go past end of usable slice */
1381 		if (t_new_smallsize > 0) {
1382 			t_mp->sbm_npages = _b64top(t_new_smallsize);
1383 			PR_MEM("%s: target new size 0x%lx bytes\n",
1384 			    f, t_new_smallsize);
1385 		}
1386 	}
1387 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error == NULL) {
1388 		/*
1389 		 * now that copy/rename has completed, undo this
1390 		 * work that was done in dr_release_mem_done.
1391 		 */
1392 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1393 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1394 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1395 	}
1396 
1397 	/*
1398 	 * clean up (source) board's mem unit structure.
1399 	 * NOTE: sbm_mlist is retained if no error has been record (in other
1400 	 * words, when s_mp->sbm_cm.sbdev_error is NULL). This memlist is
1401 	 * referred to elsewhere as the cached memlist.  The cached memlist
1402 	 * is used to re-attach (configure back in) this memunit from the
1403 	 * unconfigured state.  The memlist is retained because it may
1404 	 * represent bad pages that were detected while the memory was
1405 	 * configured into the OS.  The OS deletes bad pages from phys_install.
1406 	 * Those deletes, if any, will be represented in the cached mlist.
1407 	 */
1408 	if (s_mp->sbm_del_mlist && s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1409 		memlist_delete(s_mp->sbm_del_mlist);
1410 
1411 	if (s_mp->sbm_cm.sbdev_error && s_mp->sbm_mlist) {
1412 		memlist_delete(s_mp->sbm_mlist);
1413 		s_mp->sbm_mlist = NULL;
1414 	}
1415 
1416 	if (s_mp->sbm_dyn_segs != NULL && s_mp->sbm_cm.sbdev_error == 0) {
1417 		memlist_delete(s_mp->sbm_dyn_segs);
1418 		s_mp->sbm_dyn_segs = NULL;
1419 	}
1420 
1421 	s_mp->sbm_del_mlist = NULL;
1422 	s_mp->sbm_peer = NULL;
1423 	s_mp->sbm_flags = 0;
1424 	s_mp->sbm_cm.sbdev_busy = 0;
1425 	dr_init_mem_unit_data(s_mp);
1426 
1427 	PR_MEM("%s: cached memlist for %s:", f, s_mp->sbm_cm.sbdev_path);
1428 	PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1429 
1430 	return (0);
1431 }
1432 
1433 /*
1434  * Successful return from this function will have the memory
1435  * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated
1436  * and waiting.  This routine's job is to select the memory that
1437  * actually has to be released (detached) which may not necessarily
1438  * be the same memory node that came in in devlist[],
1439  * i.e. a copy-rename is needed.
1440  */
1441 int
1442 dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1443 {
1444 	int		d;
1445 	int		err_flag = 0;
1446 	static fn_t	f = "dr_pre_release_mem";
1447 
1448 	PR_MEM("%s...\n", f);
1449 
1450 	for (d = 0; d < devnum; d++) {
1451 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1452 		int		rv;
1453 		memquery_t	mq;
1454 		struct memlist	*ml;
1455 
1456 		if (mp->sbm_cm.sbdev_error) {
1457 			err_flag = 1;
1458 			continue;
1459 		} else if (!kcage_on) {
1460 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_KCAGE_OFF);
1461 			err_flag = 1;
1462 			continue;
1463 		}
1464 
1465 		if (mp->sbm_flags & DR_MFLAG_RESERVED) {
1466 			/*
1467 			 * Board is currently involved in a delete
1468 			 * memory operation. Can't detach this guy until
1469 			 * that operation completes.
1470 			 */
1471 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_INVAL);
1472 			err_flag = 1;
1473 			break;
1474 		}
1475 
1476 		/*
1477 		 * Check whether the detaching memory requires a
1478 		 * copy-rename.
1479 		 */
1480 		ASSERT(mp->sbm_npages != 0);
1481 		rv = kphysm_del_span_query(mp->sbm_basepfn, mp->sbm_npages,
1482 		    &mq);
1483 		if (rv != KPHYSM_OK) {
1484 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1485 			err_flag = 1;
1486 			break;
1487 		}
1488 
1489 		if (mq.nonrelocatable != 0) {
1490 			if (!(dr_cmd_flags(hp) &
1491 			    (SBD_FLAG_FORCE | SBD_FLAG_QUIESCE_OKAY))) {
1492 				/* caller wasn't prompted for a suspend */
1493 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1494 				    ESBD_QUIESCE_REQD);
1495 				err_flag = 1;
1496 				break;
1497 			}
1498 		}
1499 
1500 		/* flags should be clean at this time */
1501 		ASSERT(mp->sbm_flags == 0);
1502 
1503 		ASSERT(mp->sbm_mlist == NULL);		/* should be null */
1504 		ASSERT(mp->sbm_del_mlist == NULL);	/* should be null */
1505 		if (mp->sbm_mlist != NULL) {
1506 			memlist_delete(mp->sbm_mlist);
1507 			mp->sbm_mlist = NULL;
1508 		}
1509 
1510 		ml = dr_get_memlist(mp);
1511 		if (ml == NULL) {
1512 			err_flag = 1;
1513 			PR_MEM("%s: no memlist found for %s\n",
1514 			    f, mp->sbm_cm.sbdev_path);
1515 			continue;
1516 		}
1517 
1518 		/* allocate a kphysm handle */
1519 		rv = kphysm_del_gethandle(&mp->sbm_memhandle);
1520 		if (rv != KPHYSM_OK) {
1521 			memlist_delete(ml);
1522 
1523 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1524 			err_flag = 1;
1525 			break;
1526 		}
1527 		mp->sbm_flags |= DR_MFLAG_RELOWNER;
1528 
1529 		if ((mq.nonrelocatable != 0) ||
1530 		    dr_reserve_mem_spans(&mp->sbm_memhandle, ml)) {
1531 			/*
1532 			 * Either the detaching memory node contains
1533 			 * non-reloc memory or we failed to reserve the
1534 			 * detaching memory node (which did _not_ have
1535 			 * any non-reloc memory, i.e. some non-reloc mem
1536 			 * got onboard).
1537 			 */
1538 
1539 			if (dr_select_mem_target(hp, mp, ml)) {
1540 				int rv;
1541 
1542 				/*
1543 				 * We had no luck locating a target
1544 				 * memory node to be the recipient of
1545 				 * the non-reloc memory on the node
1546 				 * we're trying to detach.
1547 				 * Clean up be disposing the mem handle
1548 				 * and the mem list.
1549 				 */
1550 				rv = kphysm_del_release(mp->sbm_memhandle);
1551 				if (rv != KPHYSM_OK) {
1552 					/*
1553 					 * can do nothing but complain
1554 					 * and hope helpful for debug
1555 					 */
1556 					cmn_err(CE_WARN, "%s: unexpected"
1557 					    " kphysm_del_release return"
1558 					    " value %d",
1559 					    f, rv);
1560 				}
1561 				mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1562 
1563 				memlist_delete(ml);
1564 
1565 				/* make sure sbm_flags is clean */
1566 				ASSERT(mp->sbm_flags == 0);
1567 
1568 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1569 				    ESBD_NO_TARGET);
1570 
1571 				err_flag = 1;
1572 				break;
1573 			}
1574 
1575 			/*
1576 			 * ml is not memlist_delete'd here because
1577 			 * it has been assigned to mp->sbm_mlist
1578 			 * by dr_select_mem_target.
1579 			 */
1580 		} else {
1581 			/* no target needed to detach this board */
1582 			mp->sbm_flags |= DR_MFLAG_RESERVED;
1583 			mp->sbm_peer = NULL;
1584 			mp->sbm_del_mlist = ml;
1585 			mp->sbm_mlist = ml;
1586 			mp->sbm_cm.sbdev_busy = 1;
1587 		}
1588 #ifdef DEBUG
1589 		ASSERT(mp->sbm_mlist != NULL);
1590 
1591 		if (mp->sbm_flags & DR_MFLAG_SOURCE) {
1592 			PR_MEM("%s: release of %s requires copy/rename;"
1593 			    " selected target board %s\n",
1594 			    f,
1595 			    mp->sbm_cm.sbdev_path,
1596 			    mp->sbm_peer->sbm_cm.sbdev_path);
1597 		} else {
1598 			PR_MEM("%s: copy/rename not required to release %s\n",
1599 			    f, mp->sbm_cm.sbdev_path);
1600 		}
1601 
1602 		ASSERT(mp->sbm_flags & DR_MFLAG_RELOWNER);
1603 		ASSERT(mp->sbm_flags & DR_MFLAG_RESERVED);
1604 #endif
1605 	}
1606 
1607 	return (err_flag ? -1 : 0);
1608 }
1609 
1610 void
1611 dr_release_mem_done(dr_common_unit_t *cp)
1612 {
1613 	dr_mem_unit_t	*s_mp = (dr_mem_unit_t *)cp;
1614 	dr_mem_unit_t *t_mp, *mp;
1615 	int		rv;
1616 	static fn_t	f = "dr_release_mem_done";
1617 
1618 	/*
1619 	 * This unit will be flagged with DR_MFLAG_SOURCE, if it
1620 	 * has a target unit.
1621 	 */
1622 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1623 		t_mp = s_mp->sbm_peer;
1624 		ASSERT(t_mp != NULL);
1625 		ASSERT(t_mp->sbm_peer == s_mp);
1626 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1627 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RESERVED);
1628 	} else {
1629 		/* this is no target unit */
1630 		t_mp = NULL;
1631 	}
1632 
1633 	/* free delete handle */
1634 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RELOWNER);
1635 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RESERVED);
1636 	rv = kphysm_del_release(s_mp->sbm_memhandle);
1637 	if (rv != KPHYSM_OK) {
1638 		/*
1639 		 * can do nothing but complain
1640 		 * and hope helpful for debug
1641 		 */
1642 		cmn_err(CE_WARN, "%s: unexpected kphysm_del_release"
1643 		    " return value %d", f, rv);
1644 	}
1645 	s_mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1646 
1647 	/*
1648 	 * If an error was encountered during release, clean up
1649 	 * the source (and target, if present) unit data.
1650 	 */
1651 /* XXX Can we know that sbdev_error was encountered during release? */
1652 	if (s_mp->sbm_cm.sbdev_error != NULL) {
1653 		PR_MEM("%s: %s: error %d noted\n",
1654 		    f,
1655 		    s_mp->sbm_cm.sbdev_path,
1656 		    s_mp->sbm_cm.sbdev_error->e_code);
1657 
1658 		if (t_mp != NULL) {
1659 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1660 			t_mp->sbm_del_mlist = NULL;
1661 
1662 			if (t_mp->sbm_mlist != NULL) {
1663 				memlist_delete(t_mp->sbm_mlist);
1664 				t_mp->sbm_mlist = NULL;
1665 			}
1666 
1667 			t_mp->sbm_peer = NULL;
1668 			t_mp->sbm_flags = 0;
1669 			t_mp->sbm_cm.sbdev_busy = 0;
1670 		}
1671 
1672 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1673 			memlist_delete(s_mp->sbm_del_mlist);
1674 		s_mp->sbm_del_mlist = NULL;
1675 
1676 		if (s_mp->sbm_mlist != NULL) {
1677 			memlist_delete(s_mp->sbm_mlist);
1678 			s_mp->sbm_mlist = NULL;
1679 		}
1680 
1681 		s_mp->sbm_peer = NULL;
1682 		s_mp->sbm_flags = 0;
1683 		s_mp->sbm_cm.sbdev_busy = 0;
1684 
1685 		/* bail out */
1686 		return;
1687 	}
1688 
1689 	DR_DEV_SET_RELEASED(&s_mp->sbm_cm);
1690 	dr_device_transition(&s_mp->sbm_cm, DR_STATE_RELEASE);
1691 
1692 	if (t_mp != NULL) {
1693 		/*
1694 		 * the kphysm delete operation that drained the source
1695 		 * board also drained this target board.  Since the source
1696 		 * board drain is now known to have succeeded, we know this
1697 		 * target board is drained too.
1698 		 *
1699 		 * because DR_DEV_SET_RELEASED and dr_device_transition
1700 		 * is done here, the dr_release_dev_done should not
1701 		 * fail.
1702 		 */
1703 		DR_DEV_SET_RELEASED(&t_mp->sbm_cm);
1704 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_RELEASE);
1705 
1706 		/*
1707 		 * NOTE: do not transition target's board state,
1708 		 * even if the mem-unit was the last configure
1709 		 * unit of the board.  When copy/rename completes
1710 		 * this mem-unit will transitioned back to
1711 		 * the configured state.  In the meantime, the
1712 		 * board's must remain as is.
1713 		 */
1714 	}
1715 
1716 	/* if board(s) had deleted memory, verify it is gone */
1717 	rv = 0;
1718 	memlist_read_lock();
1719 	if (s_mp->sbm_del_mlist != NULL) {
1720 		mp = s_mp;
1721 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1722 	}
1723 	if (rv == 0 && t_mp && t_mp->sbm_del_mlist != NULL) {
1724 		mp = t_mp;
1725 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1726 	}
1727 	memlist_read_unlock();
1728 	if (rv) {
1729 		cmn_err(CE_WARN, "%s: %smem-unit (%d.%d): "
1730 		    "deleted memory still found in phys_install",
1731 		    f,
1732 		    (mp == t_mp ? "target " : ""),
1733 		    mp->sbm_cm.sbdev_bp->b_num,
1734 		    mp->sbm_cm.sbdev_unum);
1735 
1736 		DR_DEV_INTERNAL_ERROR(&s_mp->sbm_cm);
1737 		return;
1738 	}
1739 
1740 	s_mp->sbm_flags |= DR_MFLAG_RELDONE;
1741 	if (t_mp != NULL)
1742 		t_mp->sbm_flags |= DR_MFLAG_RELDONE;
1743 
1744 	/* this should not fail */
1745 	if (dr_release_dev_done(&s_mp->sbm_cm) != 0) {
1746 		/* catch this in debug kernels */
1747 		ASSERT(0);
1748 		return;
1749 	}
1750 
1751 	PR_MEM("%s: marking %s release DONE\n",
1752 	    f, s_mp->sbm_cm.sbdev_path);
1753 
1754 	s_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1755 
1756 	if (t_mp != NULL) {
1757 		/* should not fail */
1758 		rv = dr_release_dev_done(&t_mp->sbm_cm);
1759 		if (rv != 0) {
1760 			/* catch this in debug kernels */
1761 			ASSERT(0);
1762 			return;
1763 		}
1764 
1765 		PR_MEM("%s: marking %s release DONE\n",
1766 		    f, t_mp->sbm_cm.sbdev_path);
1767 
1768 		t_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1769 	}
1770 }
1771 
1772 /*ARGSUSED*/
1773 int
1774 dr_disconnect_mem(dr_mem_unit_t *mp)
1775 {
1776 	static fn_t	f = "dr_disconnect_mem";
1777 	update_membounds_t umb;
1778 
1779 #ifdef DEBUG
1780 	int state = mp->sbm_cm.sbdev_state;
1781 	ASSERT(state == DR_STATE_CONNECTED || state == DR_STATE_UNCONFIGURED);
1782 #endif
1783 
1784 	PR_MEM("%s...\n", f);
1785 
1786 	if (mp->sbm_del_mlist && mp->sbm_del_mlist != mp->sbm_mlist)
1787 		memlist_delete(mp->sbm_del_mlist);
1788 	mp->sbm_del_mlist = NULL;
1789 
1790 	if (mp->sbm_mlist) {
1791 		memlist_delete(mp->sbm_mlist);
1792 		mp->sbm_mlist = NULL;
1793 	}
1794 
1795 	/*
1796 	 * Remove memory from lgroup
1797 	 * For now, only board info is required.
1798 	 */
1799 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1800 	umb.u_base = (uint64_t)-1;
1801 	umb.u_len = (uint64_t)-1;
1802 
1803 	lgrp_plat_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)&umb);
1804 
1805 	return (0);
1806 }
1807 
1808 int
1809 dr_cancel_mem(dr_mem_unit_t *s_mp)
1810 {
1811 	dr_mem_unit_t	*t_mp;
1812 	dr_state_t	state;
1813 	static fn_t	f = "dr_cancel_mem";
1814 
1815 	state = s_mp->sbm_cm.sbdev_state;
1816 
1817 	if (s_mp->sbm_flags & DR_MFLAG_TARGET) {
1818 		/* must cancel source board, not target board */
1819 		/* TODO: set error */
1820 		return (-1);
1821 	} else if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1822 		t_mp = s_mp->sbm_peer;
1823 		ASSERT(t_mp != NULL);
1824 		ASSERT(t_mp->sbm_peer == s_mp);
1825 
1826 		/* must always match the source board's state */
1827 /* TODO: is this assertion correct? */
1828 		ASSERT(t_mp->sbm_cm.sbdev_state == state);
1829 	} else {
1830 		/* this is no target unit */
1831 		t_mp = NULL;
1832 	}
1833 
1834 	switch (state) {
1835 	case DR_STATE_UNREFERENCED:	/* state set by dr_release_dev_done */
1836 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1837 
1838 		if (t_mp != NULL && t_mp->sbm_del_mlist != NULL) {
1839 			PR_MEM("%s: undoing target %s memory delete\n",
1840 			    f, t_mp->sbm_cm.sbdev_path);
1841 			dr_add_memory_spans(t_mp, t_mp->sbm_del_mlist);
1842 
1843 			DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1844 		}
1845 
1846 		if (s_mp->sbm_del_mlist != NULL) {
1847 			PR_MEM("%s: undoing %s memory delete\n",
1848 			    f, s_mp->sbm_cm.sbdev_path);
1849 
1850 			dr_add_memory_spans(s_mp, s_mp->sbm_del_mlist);
1851 		}
1852 
1853 		/*FALLTHROUGH*/
1854 
1855 /* TODO: should no longer be possible to see the release state here */
1856 	case DR_STATE_RELEASE:	/* state set by dr_release_mem_done */
1857 
1858 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1859 
1860 		if (t_mp != NULL) {
1861 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1862 			t_mp->sbm_del_mlist = NULL;
1863 
1864 			if (t_mp->sbm_mlist != NULL) {
1865 				memlist_delete(t_mp->sbm_mlist);
1866 				t_mp->sbm_mlist = NULL;
1867 			}
1868 
1869 			t_mp->sbm_peer = NULL;
1870 			t_mp->sbm_flags = 0;
1871 			t_mp->sbm_cm.sbdev_busy = 0;
1872 			dr_init_mem_unit_data(t_mp);
1873 
1874 			DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1875 
1876 			dr_device_transition(&t_mp->sbm_cm,
1877 			    DR_STATE_CONFIGURED);
1878 		}
1879 
1880 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1881 			memlist_delete(s_mp->sbm_del_mlist);
1882 		s_mp->sbm_del_mlist = NULL;
1883 
1884 		if (s_mp->sbm_mlist != NULL) {
1885 			memlist_delete(s_mp->sbm_mlist);
1886 			s_mp->sbm_mlist = NULL;
1887 		}
1888 
1889 		s_mp->sbm_peer = NULL;
1890 		s_mp->sbm_flags = 0;
1891 		s_mp->sbm_cm.sbdev_busy = 0;
1892 		dr_init_mem_unit_data(s_mp);
1893 
1894 		return (0);
1895 
1896 	default:
1897 		PR_MEM("%s: WARNING unexpected state (%d) for %s\n",
1898 		    f, (int)state, s_mp->sbm_cm.sbdev_path);
1899 
1900 		return (-1);
1901 	}
1902 	/*NOTREACHED*/
1903 }
1904 
1905 void
1906 dr_init_mem_unit(dr_mem_unit_t *mp)
1907 {
1908 	dr_state_t	new_state;
1909 
1910 
1911 	if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) {
1912 		new_state = DR_STATE_CONFIGURED;
1913 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1914 	} else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) {
1915 		new_state = DR_STATE_CONNECTED;
1916 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1917 	} else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) {
1918 		new_state = DR_STATE_OCCUPIED;
1919 	} else {
1920 		new_state = DR_STATE_EMPTY;
1921 	}
1922 
1923 	if (DR_DEV_IS_PRESENT(&mp->sbm_cm))
1924 		dr_init_mem_unit_data(mp);
1925 
1926 	/* delay transition until fully initialized */
1927 	dr_device_transition(&mp->sbm_cm, new_state);
1928 }
1929 
1930 static void
1931 dr_init_mem_unit_data(dr_mem_unit_t *mp)
1932 {
1933 	drmachid_t	id = mp->sbm_cm.sbdev_id;
1934 	uint64_t	bytes;
1935 	sbd_error_t	*err;
1936 	static fn_t	f = "dr_init_mem_unit_data";
1937 	update_membounds_t umb;
1938 
1939 	PR_MEM("%s...\n", f);
1940 
1941 	/* a little sanity checking */
1942 	ASSERT(mp->sbm_peer == NULL);
1943 	ASSERT(mp->sbm_flags == 0);
1944 
1945 	/* get basepfn of mem unit */
1946 	err = drmach_mem_get_base_physaddr(id, &bytes);
1947 	if (err) {
1948 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1949 		mp->sbm_basepfn = (pfn_t)-1;
1950 	} else
1951 		mp->sbm_basepfn = _b64top(bytes);
1952 
1953 	/* attempt to get number of pages from PDA */
1954 	err = drmach_mem_get_size(id, &bytes);
1955 	if (err) {
1956 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1957 		mp->sbm_npages = 0;
1958 	} else
1959 		mp->sbm_npages = _b64top(bytes);
1960 
1961 	/* if didn't work, calculate using memlist */
1962 	if (mp->sbm_npages == 0) {
1963 		struct memlist	*ml, *mlist;
1964 		/*
1965 		 * Either we couldn't open the PDA or our
1966 		 * PDA has garbage in it.  We must have the
1967 		 * page count consistent and whatever the
1968 		 * OS states has precedence over the PDA
1969 		 * so let's check the kernel.
1970 		 */
1971 /* TODO: curious comment. it suggests pda query should happen if this fails */
1972 		PR_MEM("%s: PDA query failed for npages."
1973 		    " Checking memlist for %s\n",
1974 		    f, mp->sbm_cm.sbdev_path);
1975 
1976 		mlist = dr_get_memlist(mp);
1977 		for (ml = mlist; ml; ml = ml->ml_next)
1978 			mp->sbm_npages += btop(ml->ml_size);
1979 		memlist_delete(mlist);
1980 	}
1981 
1982 	err = drmach_mem_get_alignment(id, &bytes);
1983 	if (err) {
1984 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1985 		mp->sbm_alignment_mask = 0;
1986 	} else
1987 		mp->sbm_alignment_mask = _b64top(bytes);
1988 
1989 	err = drmach_mem_get_slice_size(id, &bytes);
1990 	if (err) {
1991 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1992 		mp->sbm_slice_size = 0; /* paranoia */
1993 	} else
1994 		mp->sbm_slice_size = bytes;
1995 
1996 	/*
1997 	 * Add memory to lgroup
1998 	 */
1999 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
2000 	umb.u_base = (uint64_t)mp->sbm_basepfn << MMU_PAGESHIFT;
2001 	umb.u_len = (uint64_t)mp->sbm_npages << MMU_PAGESHIFT;
2002 
2003 	lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb);
2004 
2005 	PR_MEM("%s: %s (basepfn = 0x%lx, npgs = %ld)\n",
2006 	    f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages);
2007 }
2008 
2009 static int
2010 dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *ml)
2011 {
2012 	int		err;
2013 	pfn_t		base;
2014 	pgcnt_t		npgs;
2015 	struct memlist	*mc;
2016 	static fn_t	f = "dr_reserve_mem_spans";
2017 
2018 	PR_MEM("%s...\n", f);
2019 
2020 	/*
2021 	 * Walk the supplied memlist scheduling each span for removal
2022 	 * with kphysm_del_span.  It is possible that a span may intersect
2023 	 * an area occupied by the cage.
2024 	 */
2025 	for (mc = ml; mc != NULL; mc = mc->ml_next) {
2026 		base = _b64top(mc->ml_address);
2027 		npgs = _b64top(mc->ml_size);
2028 
2029 		err = kphysm_del_span(*mhp, base, npgs);
2030 		if (err != KPHYSM_OK) {
2031 			cmn_err(CE_WARN, "%s memory reserve failed."
2032 			    " unexpected kphysm_del_span return value %d;"
2033 			    " basepfn=0x%lx npages=%ld",
2034 			    f, err, base, npgs);
2035 
2036 			return (-1);
2037 		}
2038 	}
2039 
2040 	return (0);
2041 }
2042 
/*
 * debug counters
 * NOTE(review): nothing in this part of the file updates them;
 * presumably dr_select_mem_target (below) maintains them - confirm.
 */
int dr_smt_realigned;
/*
 * NOTE(review): the four slots presumably match the four target
 * preference categories of dr_select_mem_target (n_sets == 4) - confirm.
 */
int dr_smt_preference[4];

#ifdef DEBUG
/* DEBUG-only knob; not initialized here */
uint_t dr_ignore_board; /* if bit[bnum-1] set, board won't be candidate */
#endif
2050 
2051 /*
2052  * Find and reserve a copy/rename target board suitable for the
2053  * given source board.
2054  * All boards in the system are examined and categorized in relation to
2055  * their memory size versus the source board's memory size.  Order of
2056  * preference is:
2057  *	1st: board has same memory size
2058  * 	2nd: board has larger memory size
2059  *	3rd: board has smaller memory size
2060  *	4th: board has smaller memory size, available memory will be reduced.
 * Boards in categories 3 and 4 will have their MCs reprogrammed so that
 * the address span to which each MC responds appropriately covers the
 * nonrelocatable span of the source board.
2064  */
2065 static int
2066 dr_select_mem_target(dr_handle_t *hp,
2067 	dr_mem_unit_t *s_mp, struct memlist *s_ml)
2068 {
2069 	pgcnt_t		sz = _b64top(s_mp->sbm_slice_size);
2070 	pgcnt_t		sm = sz - 1; /* mem_slice_mask */
2071 	pfn_t		s_phi, t_phi;
2072 
2073 	int		n_sets = 4; /* same, larger, smaller, clipped */
2074 	int		preference; /* lower value is higher preference */
2075 	int		n_units_per_set;
2076 	int		idx;
2077 	dr_mem_unit_t	**sets;
2078 
2079 	int		t_bd;
2080 	int		t_unit;
2081 	int		rv;
2082 	int		allow_src_memrange_modify;
2083 	int		allow_targ_memrange_modify;
2084 	drmachid_t	t_id;
2085 	dr_board_t	*s_bp, *t_bp;
2086 	dr_mem_unit_t	*t_mp, *c_mp;
2087 	struct memlist	*d_ml, *t_ml, *x_ml;
2088 	memquery_t	s_mq = {0};
2089 	static fn_t	f = "dr_select_mem_target";
2090 
2091 	PR_MEM("%s...\n", f);
2092 
2093 	ASSERT(s_ml != NULL);
2094 
2095 	n_units_per_set = MAX_BOARDS * MAX_MEM_UNITS_PER_BOARD;
2096 	sets = GETSTRUCT(dr_mem_unit_t *, n_units_per_set * n_sets);
2097 
2098 	s_bp = hp->h_bd;
2099 	/* calculate the offset into the slice of the last source board pfn */
2100 	ASSERT(s_mp->sbm_npages != 0);
2101 	s_phi = (s_mp->sbm_basepfn + s_mp->sbm_npages - 1) & sm;
2102 
2103 	allow_src_memrange_modify = drmach_allow_memrange_modify(s_bp->b_id);
2104 
2105 	/*
2106 	 * Make one pass through all memory units on all boards
2107 	 * and categorize them with respect to the source board.
2108 	 */
2109 	for (t_bd = 0; t_bd < MAX_BOARDS; t_bd++) {
2110 		/*
2111 		 * The board structs are a contiguous array
2112 		 * so we take advantage of that to find the
2113 		 * correct board struct pointer for a given
2114 		 * board number.
2115 		 */
2116 		t_bp = dr_lookup_board(t_bd);
2117 
2118 		/* source board can not be its own target */
2119 		if (s_bp->b_num == t_bp->b_num)
2120 			continue;
2121 
2122 		for (t_unit = 0; t_unit < MAX_MEM_UNITS_PER_BOARD; t_unit++) {
2123 
2124 			t_mp = dr_get_mem_unit(t_bp, t_unit);
2125 
2126 			/* this memory node must be attached */
2127 			if (!DR_DEV_IS_ATTACHED(&t_mp->sbm_cm))
2128 				continue;
2129 
2130 			/* source unit can not be its own target */
2131 			if (s_mp == t_mp) {
2132 				/* catch this is debug kernels */
2133 				ASSERT(0);
2134 				continue;
2135 			}
2136 
2137 			/*
2138 			 * this memory node must not already be reserved
2139 			 * by some other memory delete operation.
2140 			 */
2141 			if (t_mp->sbm_flags & DR_MFLAG_RESERVED)
2142 				continue;
2143 
2144 			/*
2145 			 * categorize the memory node
2146 			 * If this is a smaller memory node, create a
2147 			 * temporary, edited copy of the source board's
2148 			 * memlist containing only the span of the non-
2149 			 * relocatable pages.
2150 			 */
2151 			t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2152 			t_id = t_mp->sbm_cm.sbdev_bp->b_id;
2153 			allow_targ_memrange_modify =
2154 			    drmach_allow_memrange_modify(t_id);
2155 			if (t_mp->sbm_npages == s_mp->sbm_npages &&
2156 			    t_phi == s_phi) {
2157 				preference = 0;
2158 				t_mp->sbm_slice_offset = 0;
2159 			} else if (t_mp->sbm_npages > s_mp->sbm_npages &&
2160 			    t_phi > s_phi) {
2161 				/*
2162 				 * Selecting this target will require modifying
2163 				 * the source and/or target physical address
2164 				 * ranges.  Skip if not supported by platform.
2165 				 */
2166 				if (!allow_src_memrange_modify ||
2167 				    !allow_targ_memrange_modify) {
2168 					PR_MEM("%s: skip target %s, memory "
2169 					    "range relocation not supported "
2170 					    "by platform\n", f,
2171 					    t_mp->sbm_cm.sbdev_path);
2172 					continue;
2173 				}
2174 				preference = 1;
2175 				t_mp->sbm_slice_offset = 0;
2176 			} else {
2177 				pfn_t		pfn = 0;
2178 
2179 				/*
2180 				 * Selecting this target will require modifying
2181 				 * the source and/or target physical address
2182 				 * ranges.  Skip if not supported by platform.
2183 				 */
2184 				if (!allow_src_memrange_modify ||
2185 				    !allow_targ_memrange_modify) {
2186 					PR_MEM("%s: skip target %s, memory "
2187 					    "range relocation not supported "
2188 					    "by platform\n", f,
2189 					    t_mp->sbm_cm.sbdev_path);
2190 					continue;
2191 				}
2192 
2193 				/*
2194 				 * Check if its mc can be programmed to relocate
2195 				 * the active address range to match the
2196 				 * nonrelocatable span of the source board.
2197 				 */
2198 				preference = 2;
2199 
2200 				if (s_mq.phys_pages == 0) {
2201 					/*
2202 					 * find non-relocatable span on
2203 					 * source board.
2204 					 */
2205 					rv = kphysm_del_span_query(
2206 					    s_mp->sbm_basepfn,
2207 					    s_mp->sbm_npages, &s_mq);
2208 					if (rv != KPHYSM_OK) {
2209 						PR_MEM("%s: %s: unexpected"
2210 						    " kphysm_del_span_query"
2211 						    " return value %d;"
2212 						    " basepfn 0x%lx,"
2213 						    " npages %ld\n",
2214 						    f,
2215 						    s_mp->sbm_cm.sbdev_path,
2216 						    rv,
2217 						    s_mp->sbm_basepfn,
2218 						    s_mp->sbm_npages);
2219 
2220 						/* paranoia */
2221 						s_mq.phys_pages = 0;
2222 
2223 						continue;
2224 					}
2225 
2226 					/* more paranoia */
2227 					ASSERT(s_mq.phys_pages != 0);
2228 					ASSERT(s_mq.nonrelocatable != 0);
2229 
2230 					/*
2231 					 * this should not happen
2232 					 * if it does, it simply means that
2233 					 * we can not proceed with qualifying
2234 					 * this target candidate.
2235 					 */
2236 					if (s_mq.nonrelocatable == 0)
2237 						continue;
2238 
2239 					PR_MEM("%s: %s: nonrelocatable"
2240 					    " span (0x%lx..0x%lx)\n",
2241 					    f,
2242 					    s_mp->sbm_cm.sbdev_path,
2243 					    s_mq.first_nonrelocatable,
2244 					    s_mq.last_nonrelocatable);
2245 				}
2246 
2247 				/*
2248 				 * Round down the starting pfn of the
2249 				 * nonrelocatable span on the source board
2250 				 * to nearest programmable boundary possible
2251 				 * with this target candidate.
2252 				 */
2253 				pfn = s_mq.first_nonrelocatable &
2254 				    ~t_mp->sbm_alignment_mask;
2255 
2256 				/* skip candidate if memory is too small */
2257 				if (pfn + t_mp->sbm_npages <
2258 				    s_mq.last_nonrelocatable)
2259 					continue;
2260 
2261 				/*
2262 				 * reprogramming an mc to relocate its
2263 				 * active address range means the beginning
2264 				 * address to which the DIMMS respond will
2265 				 * be somewhere above the slice boundary
2266 				 * address.  The larger the size of memory
2267 				 * on this unit, the more likely part of it
2268 				 * will exist beyond the end of the slice.
2269 				 * The portion of the memory that does is
2270 				 * unavailable to the system until the mc
2271 				 * is reprogrammed to a more favorable base
2272 				 * address.
2273 				 * An attempt is made to avoid the loss by
2274 				 * recalculating the mc base address relative
2275 				 * to the end of the slice.  This may produce
2276 				 * a more favorable result.  If not, we lower
2277 				 * the board's preference rating so that it
2278 				 * is one of the last candidate boards to be
2279 				 * considered.
2280 				 */
2281 				if ((pfn + t_mp->sbm_npages) & ~sm) {
2282 					pfn_t p;
2283 
2284 					ASSERT(sz >= t_mp->sbm_npages);
2285 
2286 					/*
2287 					 * calculate an alternative starting
2288 					 * address relative to the end of the
2289 					 * slice's address space.
2290 					 */
2291 					p = pfn & ~sm;
2292 					p = p + (sz - t_mp->sbm_npages);
2293 					p = p & ~t_mp->sbm_alignment_mask;
2294 
2295 					if ((p > s_mq.first_nonrelocatable) ||
2296 					    (p + t_mp->sbm_npages <
2297 					    s_mq.last_nonrelocatable)) {
2298 
2299 						/*
2300 						 * alternative starting addr
2301 						 * won't work. Lower preference
2302 						 * rating of this board, since
2303 						 * some number of pages will
2304 						 * be unavailable for use.
2305 						 */
2306 						preference = 3;
2307 					} else {
2308 						dr_smt_realigned++;
2309 						pfn = p;
2310 					}
2311 				}
2312 
2313 				/*
2314 				 * translate calculated pfn to an offset
2315 				 * relative to the slice boundary.  If the
2316 				 * candidate board is selected, this offset
2317 				 * will be used to calculate the values
2318 				 * programmed into the mc.
2319 				 */
2320 				t_mp->sbm_slice_offset = pfn & sm;
2321 				PR_MEM("%s: %s:"
2322 				    "  proposed mc offset 0x%lx\n",
2323 				    f,
2324 				    t_mp->sbm_cm.sbdev_path,
2325 				    t_mp->sbm_slice_offset);
2326 			}
2327 
2328 			dr_smt_preference[preference]++;
2329 
2330 			/* calculate index to start of preference set */
2331 			idx  = n_units_per_set * preference;
2332 			/* calculate offset to respective element */
2333 			idx += t_bd * MAX_MEM_UNITS_PER_BOARD + t_unit;
2334 
2335 			ASSERT(idx < n_units_per_set * n_sets);
2336 			sets[idx] = t_mp;
2337 		}
2338 	}
2339 
2340 	/*
2341 	 * NOTE: this would be a good place to sort each candidate
2342 	 * set into some desired order, e.g. memory size in ascending
2343 	 * order.  Without an additional sorting step here, the order
2344 	 * within a set is ascending board number order.
2345 	 */
2346 
2347 	c_mp = NULL;
2348 	x_ml = NULL;
2349 	t_ml = NULL;
2350 	for (idx = 0; idx < n_units_per_set * n_sets; idx++) {
2351 		memquery_t mq;
2352 
2353 		/* cleanup t_ml after previous pass */
2354 		if (t_ml != NULL) {
2355 			memlist_delete(t_ml);
2356 			t_ml = NULL;
2357 		}
2358 
2359 		/* get candidate target board mem unit */
2360 		t_mp = sets[idx];
2361 		if (t_mp == NULL)
2362 			continue;
2363 
2364 		/* get target board memlist */
2365 		t_ml = dr_get_memlist(t_mp);
2366 		if (t_ml == NULL) {
2367 			cmn_err(CE_WARN, "%s: no memlist for"
2368 			    " mem-unit %d, board %d",
2369 			    f,
2370 			    t_mp->sbm_cm.sbdev_bp->b_num,
2371 			    t_mp->sbm_cm.sbdev_unum);
2372 
2373 			continue;
2374 		}
2375 
2376 		/* get appropriate source board memlist */
2377 		t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2378 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2379 			spgcnt_t excess;
2380 
2381 			/*
2382 			 * make a copy of the source board memlist
2383 			 * then edit it to remove the spans that
2384 			 * are outside the calculated span of
2385 			 * [pfn..s_mq.last_nonrelocatable].
2386 			 */
2387 			if (x_ml != NULL)
2388 				memlist_delete(x_ml);
2389 
2390 			x_ml = memlist_dup(s_ml);
2391 			if (x_ml == NULL) {
2392 				PR_MEM("%s: memlist_dup failed\n", f);
2393 				/* TODO: should abort */
2394 				continue;
2395 			}
2396 
2397 			/* trim off lower portion */
2398 			excess = t_mp->sbm_slice_offset -
2399 			    (s_mp->sbm_basepfn & sm);
2400 
2401 			if (excess > 0) {
2402 				x_ml = memlist_del_span(
2403 				    x_ml,
2404 				    _ptob64(s_mp->sbm_basepfn),
2405 				    _ptob64(excess));
2406 			}
2407 			ASSERT(x_ml);
2408 
2409 			/*
2410 			 * Since this candidate target board is smaller
2411 			 * than the source board, s_mq must have been
2412 			 * initialized in previous loop while processing
2413 			 * this or some other candidate board.
2414 			 * FIXME: this is weak.
2415 			 */
2416 			ASSERT(s_mq.phys_pages != 0);
2417 
2418 			/* trim off upper portion */
2419 			excess = (s_mp->sbm_basepfn + s_mp->sbm_npages)
2420 			    - (s_mq.last_nonrelocatable + 1);
2421 			if (excess > 0) {
2422 				pfn_t p;
2423 
2424 				p  = s_mq.last_nonrelocatable + 1;
2425 				x_ml = memlist_del_span(
2426 				    x_ml,
2427 				    _ptob64(p),
2428 				    _ptob64(excess));
2429 			}
2430 
2431 			PR_MEM("%s: %s: edited source memlist:\n",
2432 			    f, s_mp->sbm_cm.sbdev_path);
2433 			PR_MEMLIST_DUMP(x_ml);
2434 
2435 #ifdef DEBUG
2436 			/* sanity check memlist */
2437 			d_ml = x_ml;
2438 			while (d_ml->ml_next != NULL)
2439 				d_ml = d_ml->ml_next;
2440 
2441 			ASSERT(d_ml->ml_address + d_ml->ml_size ==
2442 			    _ptob64(s_mq.last_nonrelocatable + 1));
2443 #endif
2444 
2445 			/*
2446 			 * x_ml now describes only the portion of the
2447 			 * source board that will be moved during the
2448 			 * copy/rename operation.
2449 			 */
2450 			d_ml = x_ml;
2451 		} else {
2452 			/* use original memlist; all spans will be moved */
2453 			d_ml = s_ml;
2454 		}
2455 
2456 		/* verify target can support source memory spans. */
2457 		if (memlist_canfit(d_ml, t_ml) == 0) {
2458 			PR_MEM("%s: source memlist won't"
2459 			    " fit in target memlist\n", f);
2460 			PR_MEM("%s: source memlist:\n", f);
2461 			PR_MEMLIST_DUMP(d_ml);
2462 			PR_MEM("%s: target memlist:\n", f);
2463 			PR_MEMLIST_DUMP(t_ml);
2464 
2465 			continue;
2466 		}
2467 
2468 		/* NOTE: the value of d_ml is not used beyond this point */
2469 
2470 		PR_MEM("%s: checking for no-reloc in %s, "
2471 		    " basepfn=0x%lx, npages=%ld\n",
2472 		    f,
2473 		    t_mp->sbm_cm.sbdev_path,
2474 		    t_mp->sbm_basepfn,
2475 		    t_mp->sbm_npages);
2476 
2477 		rv = kphysm_del_span_query(
2478 		    t_mp->sbm_basepfn, t_mp->sbm_npages, &mq);
2479 		if (rv != KPHYSM_OK) {
2480 			PR_MEM("%s: kphysm_del_span_query:"
2481 			    " unexpected return value %d\n", f, rv);
2482 
2483 			continue;
2484 		}
2485 
2486 		if (mq.nonrelocatable != 0) {
2487 			PR_MEM("%s: candidate %s has"
2488 			    " nonrelocatable span [0x%lx..0x%lx]\n",
2489 			    f,
2490 			    t_mp->sbm_cm.sbdev_path,
2491 			    mq.first_nonrelocatable,
2492 			    mq.last_nonrelocatable);
2493 
2494 			continue;
2495 		}
2496 
2497 #ifdef DEBUG
2498 		/*
2499 		 * This is a debug tool for excluding certain boards
2500 		 * from being selected as a target board candidate.
2501 		 * dr_ignore_board is only tested by this driver.
2502 		 * It must be set with adb, obp, /etc/system or your
2503 		 * favorite debugger.
2504 		 */
2505 		if (dr_ignore_board &
2506 		    (1 << (t_mp->sbm_cm.sbdev_bp->b_num - 1))) {
2507 			PR_MEM("%s: dr_ignore_board flag set,"
2508 			    " ignoring %s as candidate\n",
2509 			    f, t_mp->sbm_cm.sbdev_path);
2510 			continue;
2511 		}
2512 #endif
2513 
2514 		/*
2515 		 * Reserve excess source board memory, if any.
2516 		 *
2517 		 * When the number of pages on the candidate target
2518 		 * board is less than the number of pages on the source,
2519 		 * then some spans (clearly) of the source board's address
2520 		 * space will not be covered by physical memory after the
2521 		 * copy/rename completes.  The following code block
2522 		 * schedules those spans to be deleted.
2523 		 */
2524 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2525 			pfn_t pfn;
2526 			uint64_t s_del_pa;
2527 			struct memlist *ml;
2528 
2529 			d_ml = memlist_dup(s_ml);
2530 			if (d_ml == NULL) {
2531 				PR_MEM("%s: cant dup src brd memlist\n", f);
2532 				/* TODO: should abort */
2533 				continue;
2534 			}
2535 
2536 			/* calculate base pfn relative to target board */
2537 			pfn  = s_mp->sbm_basepfn & ~sm;
2538 			pfn += t_mp->sbm_slice_offset;
2539 
2540 			/*
2541 			 * cannot split dynamically added segment
2542 			 */
2543 			s_del_pa = _ptob64(pfn + t_mp->sbm_npages);
2544 			PR_MEM("%s: proposed src delete pa=0x%lx\n", f,
2545 			    s_del_pa);
2546 			PR_MEM("%s: checking for split of dyn seg list:\n", f);
2547 			PR_MEMLIST_DUMP(s_mp->sbm_dyn_segs);
2548 			for (ml = s_mp->sbm_dyn_segs; ml; ml = ml->ml_next) {
2549 				if (s_del_pa > ml->ml_address &&
2550 				    s_del_pa < ml->ml_address + ml->ml_size) {
2551 					s_del_pa = ml->ml_address;
2552 					break;
2553 				}
2554 			}
2555 
2556 			/* remove span that will reside on candidate board */
2557 			d_ml = memlist_del_span(d_ml, _ptob64(pfn),
2558 			    s_del_pa - _ptob64(pfn));
2559 
2560 			PR_MEM("%s: %s: reserving src brd memlist:\n",
2561 			    f, s_mp->sbm_cm.sbdev_path);
2562 			PR_MEMLIST_DUMP(d_ml);
2563 
2564 			/* reserve excess spans */
2565 			if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, d_ml)
2566 			    != 0) {
2567 
2568 				/* likely more non-reloc pages appeared */
2569 				/* TODO: restart from top? */
2570 				continue;
2571 			}
2572 		} else {
2573 			/* no excess source board memory */
2574 			d_ml = NULL;
2575 		}
2576 
2577 		s_mp->sbm_flags |= DR_MFLAG_RESERVED;
2578 
2579 		/*
2580 		 * reserve all memory on target board.
2581 		 * NOTE: source board's memhandle is used.
2582 		 *
2583 		 * If this succeeds (eq 0), then target selection is
2584 		 * complete and all unwanted memory spans, both source and
2585 		 * target, have been reserved.  Loop is terminated.
2586 		 */
2587 		if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, t_ml) == 0) {
2588 			PR_MEM("%s: %s: target board memory reserved\n",
2589 			    f, t_mp->sbm_cm.sbdev_path);
2590 
2591 			/* a candidate target board is now reserved */
2592 			t_mp->sbm_flags |= DR_MFLAG_RESERVED;
2593 			c_mp = t_mp;
2594 
2595 			/* *** EXITING LOOP *** */
2596 			break;
2597 		}
2598 
2599 		/* did not successfully reserve the target board. */
2600 		PR_MEM("%s: could not reserve target %s\n",
2601 		    f, t_mp->sbm_cm.sbdev_path);
2602 
2603 		/*
2604 		 * NOTE: an undo of the dr_reserve_mem_span work
2605 		 * will happen automatically when the memhandle
2606 		 * (s_mp->sbm_memhandle) is kphysm_del_release'd.
2607 		 */
2608 
2609 		s_mp->sbm_flags &= ~DR_MFLAG_RESERVED;
2610 	}
2611 
2612 	/* clean up after memlist editing logic */
2613 	if (x_ml != NULL)
2614 		memlist_delete(x_ml);
2615 
2616 	FREESTRUCT(sets, dr_mem_unit_t *, n_units_per_set * n_sets);
2617 
2618 	/*
2619 	 * c_mp will be NULL when the entire sets[] array
2620 	 * has been searched without reserving a target board.
2621 	 */
2622 	if (c_mp == NULL) {
2623 		PR_MEM("%s: %s: target selection failed.\n",
2624 		    f, s_mp->sbm_cm.sbdev_path);
2625 
2626 		if (t_ml != NULL)
2627 			memlist_delete(t_ml);
2628 
2629 		return (-1);
2630 	}
2631 
2632 	PR_MEM("%s: found target %s for source %s\n",
2633 	    f,
2634 	    c_mp->sbm_cm.sbdev_path,
2635 	    s_mp->sbm_cm.sbdev_path);
2636 
2637 	s_mp->sbm_peer = c_mp;
2638 	s_mp->sbm_flags |= DR_MFLAG_SOURCE;
2639 	s_mp->sbm_del_mlist = d_ml;	/* spans to be deleted, if any */
2640 	s_mp->sbm_mlist = s_ml;
2641 	s_mp->sbm_cm.sbdev_busy = 1;
2642 
2643 	c_mp->sbm_peer = s_mp;
2644 	c_mp->sbm_flags |= DR_MFLAG_TARGET;
2645 	c_mp->sbm_del_mlist = t_ml;	/* spans to be deleted */
2646 	c_mp->sbm_mlist = t_ml;
2647 	c_mp->sbm_cm.sbdev_busy = 1;
2648 
2649 	s_mp->sbm_flags &= ~DR_MFLAG_MEMRESIZE;
2650 	if (c_mp->sbm_npages > s_mp->sbm_npages) {
2651 		s_mp->sbm_flags |= DR_MFLAG_MEMUPSIZE;
2652 		PR_MEM("%s: upsize detected (source=%ld < target=%ld)\n",
2653 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2654 	} else if (c_mp->sbm_npages < s_mp->sbm_npages) {
2655 		s_mp->sbm_flags |= DR_MFLAG_MEMDOWNSIZE;
2656 		PR_MEM("%s: downsize detected (source=%ld > target=%ld)\n",
2657 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2658 	}
2659 
2660 	return (0);
2661 }
2662 
2663 /*
2664  * Memlist support.
2665  */
2666 
2667 /*
2668  * Determine whether the source memlist (s_mlist) will
2669  * fit into the target memlist (t_mlist) in terms of
2670  * size and holes (i.e. based on same relative base address).
2671  */
2672 static int
2673 memlist_canfit(struct memlist *s_mlist, struct memlist *t_mlist)
2674 {
2675 	int		rv = 0;
2676 	uint64_t	s_basepa, t_basepa;
2677 	struct memlist	*s_ml, *t_ml;
2678 
2679 	if ((s_mlist == NULL) || (t_mlist == NULL))
2680 		return (0);
2681 
2682 	/*
2683 	 * Base both memlists on common base address (0).
2684 	 */
2685 	s_basepa = s_mlist->ml_address;
2686 	t_basepa = t_mlist->ml_address;
2687 
2688 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->ml_next)
2689 		s_ml->ml_address -= s_basepa;
2690 
2691 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->ml_next)
2692 		t_ml->ml_address -= t_basepa;
2693 
2694 	s_ml = s_mlist;
2695 	for (t_ml = t_mlist; t_ml && s_ml; t_ml = t_ml->ml_next) {
2696 		uint64_t	s_start, s_end;
2697 		uint64_t	t_start, t_end;
2698 
2699 		t_start = t_ml->ml_address;
2700 		t_end = t_start + t_ml->ml_size;
2701 
2702 		for (; s_ml; s_ml = s_ml->ml_next) {
2703 			s_start = s_ml->ml_address;
2704 			s_end = s_start + s_ml->ml_size;
2705 
2706 			if ((s_start < t_start) || (s_end > t_end))
2707 				break;
2708 		}
2709 	}
2710 	/*
2711 	 * If we ran out of source memlist chunks that mean
2712 	 * we found a home for all of them.
2713 	 */
2714 	if (s_ml == NULL)
2715 		rv = 1;
2716 
2717 	/*
2718 	 * Need to add base addresses back since memlists
2719 	 * are probably in use by caller.
2720 	 */
2721 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->ml_next)
2722 		s_ml->ml_address += s_basepa;
2723 
2724 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->ml_next)
2725 		t_ml->ml_address += t_basepa;
2726 
2727 	return (rv);
2728 }
2729