xref: /illumos-gate/usr/src/uts/sun4u/ngdr/io/dr_mem.c (revision 5801b0f01c3c34499a929ed96164a5a68b470945)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Peter Tribble.
29  */
30 
31 /*
32  * DR memory support routines.
33  */
34 
35 #include <sys/note.h>
36 #include <sys/debug.h>
37 #include <sys/types.h>
38 #include <sys/errno.h>
39 #include <sys/param.h>
40 #include <sys/dditypes.h>
41 #include <sys/kmem.h>
42 #include <sys/conf.h>
43 #include <sys/ddi.h>
44 #include <sys/sunddi.h>
45 #include <sys/sunndi.h>
46 #include <sys/ddi_impldefs.h>
47 #include <sys/ndi_impldefs.h>
48 #include <sys/sysmacros.h>
49 #include <sys/machsystm.h>
50 #include <sys/spitregs.h>
51 #include <sys/cpuvar.h>
52 #include <sys/promif.h>
53 #include <vm/seg_kmem.h>
54 #include <sys/lgrp.h>
55 #include <sys/platform_module.h>
56 
57 #include <vm/page.h>
58 
59 #include <sys/dr.h>
60 #include <sys/dr_util.h>
61 
62 extern struct memlist	*phys_install;
63 
64 /* TODO: push this reference below drmach line */
65 extern int		kcage_on;
66 
67 /* for the DR*INTERNAL_ERROR macros.  see sys/dr.h. */
68 static char *dr_ie_fmt = "dr_mem.c %d";
69 
70 static int	dr_post_detach_mem_unit(dr_mem_unit_t *mp);
71 static int	dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *mlist);
72 static int	dr_select_mem_target(dr_handle_t *hp, dr_mem_unit_t *mp,
73     struct memlist *ml);
74 static void	dr_init_mem_unit_data(dr_mem_unit_t *mp);
75 
76 static int 	memlist_canfit(struct memlist *s_mlist,
77     struct memlist *t_mlist);
78 
/*
 * Bit values stored in dr_mem_unit_t.sbm_flags.
 */
#define	DR_MFLAG_RESERVED	0x01	/* mem unit reserved for delete */
#define	DR_MFLAG_SOURCE		0x02	/* source brd of copy/rename op */
#define	DR_MFLAG_TARGET		0x04	/* target brd of copy/rename op */
#define	DR_MFLAG_MEMUPSIZE	0x08	/* move from big to small board */
#define	DR_MFLAG_MEMDOWNSIZE	0x10	/* move from small to big board */
/* move to different size board: either of the two resize bits (0x18) */
#define	DR_MFLAG_MEMRESIZE	(DR_MFLAG_MEMUPSIZE | DR_MFLAG_MEMDOWNSIZE)
#define	DR_MFLAG_RELOWNER	0x20	/* memory release (delete) owner */
#define	DR_MFLAG_RELDONE	0x40	/* memory release (delete) done */

/*
 * Helper macros: convert a page count/frame number to a 64-bit byte
 * quantity, and a byte quantity back to a page count.
 */
#define	_ptob64(p) ((uint64_t)(p) << PAGESHIFT)
#define	_b64top(b) ((pgcnt_t)((b) >> PAGESHIFT))
94 
95 static struct memlist *
96 dr_get_memlist(dr_mem_unit_t *mp)
97 {
98 	struct memlist	*mlist = NULL;
99 	sbd_error_t	*err;
100 	static fn_t	f = "dr_get_memlist";
101 
102 	PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path);
103 
104 	/*
105 	 * Return cached memlist, if present.
106 	 * This memlist will be present following an
107 	 * unconfigure (a.k.a: detach) of this memunit.
108 	 * It should only be used in the case were a configure
109 	 * is bringing this memunit back in without going
110 	 * through the disconnect and connect states.
111 	 */
112 	if (mp->sbm_mlist) {
113 		PR_MEM("%s: found cached memlist\n", f);
114 
115 		mlist = memlist_dup(mp->sbm_mlist);
116 	} else {
117 		uint64_t basepa = _ptob64(mp->sbm_basepfn);
118 
119 		/* attempt to construct a memlist using phys_install */
120 
121 		/* round down to slice base address */
122 		basepa &= ~(mp->sbm_slice_size - 1);
123 
124 		/* get a copy of phys_install to edit */
125 		memlist_read_lock();
126 		mlist = memlist_dup(phys_install);
127 		memlist_read_unlock();
128 
129 		/* trim lower irrelevant span */
130 		if (mlist)
131 			mlist = memlist_del_span(mlist, 0ull, basepa);
132 
133 		/* trim upper irrelevant span */
134 		if (mlist) {
135 			uint64_t endpa;
136 
137 			basepa += mp->sbm_slice_size;
138 			endpa = _ptob64(physmax + 1);
139 			if (endpa > basepa)
140 				mlist = memlist_del_span(
141 				    mlist,
142 				    basepa,
143 				    endpa - basepa);
144 		}
145 
146 		if (mlist) {
147 			/* successfully built a memlist */
148 			PR_MEM("%s: derived memlist from phys_install\n", f);
149 		}
150 
151 		/* if no mlist yet, try platform layer */
152 		if (!mlist) {
153 			err = drmach_mem_get_memlist(
154 			    mp->sbm_cm.sbdev_id, &mlist);
155 			if (err) {
156 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
157 				mlist = NULL; /* paranoia */
158 			}
159 		}
160 	}
161 
162 	PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path);
163 	PR_MEMLIST_DUMP(mlist);
164 
165 	return (mlist);
166 }
167 
168 typedef struct {
169 	kcondvar_t cond;
170 	kmutex_t lock;
171 	int error;
172 	int done;
173 } dr_release_mem_sync_t;
174 
175 /*
176  * Memory has been logically removed by the time this routine is called.
177  */
178 static void
179 dr_mem_del_done(void *arg, int error)
180 {
181 	dr_release_mem_sync_t *ds = arg;
182 
183 	mutex_enter(&ds->lock);
184 	ds->error = error;
185 	ds->done = 1;
186 	cv_signal(&ds->cond);
187 	mutex_exit(&ds->lock);
188 }
189 
190 /*
191  * When we reach here the memory being drained should have
192  * already been reserved in dr_pre_release_mem().
193  * Our only task here is to kick off the "drain" and wait
194  * for it to finish.
195  */
196 void
197 dr_release_mem(dr_common_unit_t *cp)
198 {
199 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
200 	int		err;
201 	dr_release_mem_sync_t rms;
202 	static fn_t	f = "dr_release_mem";
203 
204 	/* check that this memory unit has been reserved */
205 	if (!(mp->sbm_flags & DR_MFLAG_RELOWNER)) {
206 		DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
207 		return;
208 	}
209 
210 	bzero((void *) &rms, sizeof (rms));
211 
212 	mutex_init(&rms.lock, NULL, MUTEX_DRIVER, NULL);
213 	cv_init(&rms.cond, NULL, CV_DRIVER, NULL);
214 
215 	mutex_enter(&rms.lock);
216 	err = kphysm_del_start(mp->sbm_memhandle, dr_mem_del_done,
217 	    (void *) &rms);
218 	if (err == KPHYSM_OK) {
219 		/* wait for completion or interrupt */
220 		while (!rms.done) {
221 			if (cv_wait_sig(&rms.cond, &rms.lock) == 0) {
222 				/* then there is a pending UNIX signal */
223 				(void) kphysm_del_cancel(mp->sbm_memhandle);
224 
225 				/* wait for completion */
226 				while (!rms.done)
227 					cv_wait(&rms.cond, &rms.lock);
228 			}
229 		}
230 		/* get the result of the memory delete operation */
231 		err = rms.error;
232 	}
233 	mutex_exit(&rms.lock);
234 
235 	cv_destroy(&rms.cond);
236 	mutex_destroy(&rms.lock);
237 
238 	if (err != KPHYSM_OK) {
239 		int e_code;
240 
241 		switch (err) {
242 			case KPHYSM_ENOWORK:
243 				e_code = ESBD_NOERROR;
244 				break;
245 
246 			case KPHYSM_EHANDLE:
247 			case KPHYSM_ESEQUENCE:
248 				e_code = ESBD_INTERNAL;
249 				break;
250 
251 			case KPHYSM_ENOTVIABLE:
252 				e_code = ESBD_MEM_NOTVIABLE;
253 				break;
254 
255 			case KPHYSM_EREFUSED:
256 				e_code = ESBD_MEM_REFUSED;
257 				break;
258 
259 			case KPHYSM_ENONRELOC:
260 				e_code = ESBD_MEM_NONRELOC;
261 				break;
262 
263 			case KPHYSM_ECANCELLED:
264 				e_code = ESBD_MEM_CANCELLED;
265 				break;
266 
267 			case KPHYSM_ERESOURCE:
268 				e_code = ESBD_MEMFAIL;
269 				break;
270 
271 			default:
272 				cmn_err(CE_WARN,
273 				    "%s: unexpected kphysm error code %d,"
274 				    " id 0x%p",
275 				    f, err, mp->sbm_cm.sbdev_id);
276 
277 				e_code = ESBD_IO;
278 				break;
279 		}
280 
281 		if (e_code != ESBD_NOERROR) {
282 			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
283 		}
284 	}
285 }
286 
287 void
288 dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
289 {
290 	_NOTE(ARGUNUSED(hp))
291 
292 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
293 	struct memlist	*ml, *mc;
294 	sbd_error_t	*err;
295 	static fn_t	f = "dr_attach_mem";
296 
297 	PR_MEM("%s...\n", f);
298 
299 	dr_lock_status(hp->h_bd);
300 	err = drmach_configure(cp->sbdev_id, 0);
301 	dr_unlock_status(hp->h_bd);
302 	if (err) {
303 		DRERR_SET_C(&cp->sbdev_error, &err);
304 		return;
305 	}
306 
307 	ml = dr_get_memlist(mp);
308 	for (mc = ml; mc; mc = mc->ml_next) {
309 		int		 rv;
310 		sbd_error_t	*err;
311 
312 		rv = kphysm_add_memory_dynamic(
313 		    (pfn_t)(mc->ml_address >> PAGESHIFT),
314 		    (pgcnt_t)(mc->ml_size >> PAGESHIFT));
315 		if (rv != KPHYSM_OK) {
316 			/*
317 			 * translate kphysm error and
318 			 * store in devlist error
319 			 */
320 			switch (rv) {
321 			case KPHYSM_ERESOURCE:
322 				rv = ESBD_NOMEM;
323 				break;
324 
325 			case KPHYSM_EFAULT:
326 				rv = ESBD_FAULT;
327 				break;
328 
329 			default:
330 				rv = ESBD_INTERNAL;
331 				break;
332 			}
333 
334 			if (rv == ESBD_INTERNAL) {
335 				DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
336 			} else
337 				dr_dev_err(CE_WARN, &mp->sbm_cm, rv);
338 			break;
339 		}
340 
341 		err = drmach_mem_add_span(
342 		    mp->sbm_cm.sbdev_id, mc->ml_address, mc->ml_size);
343 		if (err) {
344 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
345 			break;
346 		}
347 	}
348 
349 	memlist_delete(ml);
350 
351 	/* back out if configure failed */
352 	if (mp->sbm_cm.sbdev_error != NULL) {
353 		dr_lock_status(hp->h_bd);
354 		err = drmach_unconfigure(cp->sbdev_id,
355 		    DEVI_BRANCH_DESTROY);
356 		if (err)
357 			sbd_err_clear(&err);
358 		dr_unlock_status(hp->h_bd);
359 	}
360 }
361 
362 #define	DR_SCRUB_VALUE	0x0d0e0a0d0b0e0e0fULL
363 
364 static void
365 dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
366 {
367 #ifdef DEBUG
368 	clock_t		stime = ddi_get_lbolt();
369 #endif /* DEBUG */
370 
371 	struct memlist	*ml;
372 	uint64_t	scrub_value = DR_SCRUB_VALUE;
373 	processorid_t	cpuid;
374 	static fn_t	f = "dr_mem_ecache_scrub";
375 
376 	cpuid = drmach_mem_cpu_affinity(mp->sbm_cm.sbdev_id);
377 	affinity_set(cpuid);
378 
379 	PR_MEM("%s: using proc %d, memlist...\n", f,
380 	    (cpuid == CPU_CURRENT) ? CPU->cpu_id : cpuid);
381 	PR_MEMLIST_DUMP(mlist);
382 
383 	for (ml = mlist; ml; ml = ml->ml_next) {
384 		uint64_t	dst_pa;
385 		uint64_t	nbytes;
386 
387 		/* calculate the destination physical address */
388 		dst_pa = ml->ml_address;
389 		if (ml->ml_address & PAGEOFFSET)
390 			cmn_err(CE_WARN,
391 			    "%s: address (0x%lx) not on "
392 			    "page boundary", f, ml->ml_address);
393 
394 		nbytes = ml->ml_size;
395 		if (ml->ml_size & PAGEOFFSET)
396 			cmn_err(CE_WARN,
397 			    "%s: size (0x%lx) not on "
398 			    "page boundary", f, ml->ml_size);
399 
400 		/*LINTED*/
401 		while (nbytes > 0) {
402 			/* write 64 bits to dst_pa */
403 			stdphys(dst_pa, scrub_value);
404 
405 			/* increment/decrement by cacheline sizes */
406 			dst_pa += DRMACH_COHERENCY_UNIT;
407 			nbytes -= DRMACH_COHERENCY_UNIT;
408 		}
409 	}
410 
411 	/*
412 	 * flush this cpu's ecache and take care to ensure
413 	 * that all of it's bus transactions have retired.
414 	 */
415 	drmach_cpu_flush_ecache_sync();
416 
417 	affinity_clear();
418 
419 #ifdef DEBUG
420 	stime = ddi_get_lbolt() - stime;
421 	PR_MEM("%s: scrub ticks = %ld (%ld secs)\n", f, stime, stime / hz);
422 #endif /* DEBUG */
423 }
424 
425 static int
426 dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
427 {
428 	time_t		 copytime;
429 	drmachid_t	 cr_id;
430 	dr_sr_handle_t	*srhp;
431 	struct memlist	*c_ml, *d_ml;
432 	sbd_error_t	*err;
433 	static fn_t	 f = "dr_move_memory";
434 
435 	PR_MEM("%s: (INLINE) moving memory from %s to %s\n",
436 	    f,
437 	    s_mp->sbm_cm.sbdev_path,
438 	    t_mp->sbm_cm.sbdev_path);
439 
440 	ASSERT(s_mp->sbm_flags & DR_MFLAG_SOURCE);
441 	ASSERT(s_mp->sbm_peer == t_mp);
442 	ASSERT(s_mp->sbm_mlist);
443 
444 	ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
445 	ASSERT(t_mp->sbm_peer == s_mp);
446 
447 	/*
448 	 * create a memlist of spans to copy by removing
449 	 * the spans that have been deleted, if any, from
450 	 * the full source board memlist.  s_mp->sbm_del_mlist
451 	 * will be NULL if there were no spans deleted from
452 	 * the source board.
453 	 */
454 	c_ml = memlist_dup(s_mp->sbm_mlist);
455 	d_ml = s_mp->sbm_del_mlist;
456 	while (d_ml != NULL) {
457 		c_ml = memlist_del_span(c_ml, d_ml->ml_address, d_ml->ml_size);
458 		d_ml = d_ml->ml_next;
459 	}
460 
461 	affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));
462 
463 	err = drmach_copy_rename_init(
464 	    t_mp->sbm_cm.sbdev_id, _ptob64(t_mp->sbm_slice_offset),
465 	    s_mp->sbm_cm.sbdev_id, c_ml, &cr_id);
466 	if (err) {
467 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
468 		affinity_clear();
469 		return (-1);
470 	}
471 
472 	srhp = dr_get_sr_handle(hp);
473 	ASSERT(srhp);
474 
475 	copytime = ddi_get_lbolt();
476 
477 	/* Quiesce the OS.  */
478 	if (dr_suspend(srhp)) {
479 		cmn_err(CE_WARN, "%s: failed to quiesce OS"
480 		    " for copy-rename", f);
481 
482 		dr_release_sr_handle(srhp);
483 		err = drmach_copy_rename_fini(cr_id);
484 		if (err) {
485 			/*
486 			 * no error is expected since the program has
487 			 * not yet run.
488 			 */
489 
490 			/* catch this in debug kernels */
491 			ASSERT(0);
492 
493 			sbd_err_clear(&err);
494 		}
495 
496 		/* suspend error reached via hp */
497 		s_mp->sbm_cm.sbdev_error = hp->h_err;
498 		hp->h_err = NULL;
499 
500 		affinity_clear();
501 		return (-1);
502 	}
503 
504 	/*
505 	 * Rename memory for lgroup.
506 	 * Source and target board numbers are packaged in arg.
507 	 */
508 	{
509 		dr_board_t	*t_bp, *s_bp;
510 
511 		s_bp = s_mp->sbm_cm.sbdev_bp;
512 		t_bp = t_mp->sbm_cm.sbdev_bp;
513 
514 		lgrp_plat_config(LGRP_CONFIG_MEM_RENAME,
515 		    (uintptr_t)(s_bp->b_num | (t_bp->b_num << 16)));
516 	}
517 
518 	drmach_copy_rename(cr_id);
519 
520 	/* Resume the OS.  */
521 	dr_resume(srhp);
522 
523 	copytime = ddi_get_lbolt() - copytime;
524 
525 	dr_release_sr_handle(srhp);
526 	err = drmach_copy_rename_fini(cr_id);
527 	if (err)
528 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
529 
530 	affinity_clear();
531 
532 	PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
533 	    f, copytime, copytime / hz);
534 
535 	/* return -1 if dr_suspend or copy/rename recorded an error */
536 	return (err == NULL ? 0 : -1);
537 }
538 
539 /*
540  * If detaching node contains memory that is "non-permanent"
541  * then the memory adr's are simply cleared.  If the memory
542  * is non-relocatable, then do a copy-rename.
543  */
544 void
545 dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
546 {
547 	int			rv = 0;
548 	dr_mem_unit_t		*s_mp = (dr_mem_unit_t *)cp;
549 	dr_mem_unit_t		*t_mp;
550 	dr_state_t		state;
551 	static fn_t		f = "dr_detach_mem";
552 
553 	PR_MEM("%s...\n", f);
554 
555 	/* lookup target mem unit and target board structure, if any */
556 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
557 		t_mp = s_mp->sbm_peer;
558 		ASSERT(t_mp != NULL);
559 		ASSERT(t_mp->sbm_peer == s_mp);
560 	} else {
561 		t_mp = NULL;
562 	}
563 
564 	/* verify mem unit's state is UNREFERENCED */
565 	state = s_mp->sbm_cm.sbdev_state;
566 	if (state != DR_STATE_UNREFERENCED) {
567 		dr_dev_err(CE_IGNORE, &s_mp->sbm_cm, ESBD_STATE);
568 		return;
569 	}
570 
571 	/* verify target mem unit's state is UNREFERENCED, if any */
572 	if (t_mp != NULL) {
573 		state = t_mp->sbm_cm.sbdev_state;
574 		if (state != DR_STATE_UNREFERENCED) {
575 			dr_dev_err(CE_IGNORE, &t_mp->sbm_cm, ESBD_STATE);
576 			return;
577 		}
578 	}
579 
580 	/*
581 	 * Scrub deleted memory.  This will cause all cachelines
582 	 * referencing the memory to only be in the local cpu's
583 	 * ecache.
584 	 */
585 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
586 		/* no del mlist for src<=dst mem size copy/rename */
587 		if (s_mp->sbm_del_mlist)
588 			dr_mem_ecache_scrub(s_mp, s_mp->sbm_del_mlist);
589 	}
590 	if (t_mp != NULL && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
591 		ASSERT(t_mp->sbm_del_mlist);
592 		dr_mem_ecache_scrub(t_mp, t_mp->sbm_del_mlist);
593 	}
594 
595 	/*
596 	 * If there is no target board (no copy/rename was needed), then
597 	 * we're done!
598 	 */
599 	if (t_mp == NULL) {
600 		sbd_error_t *err;
601 		/*
602 		 * Reprogram interconnect hardware and disable
603 		 * memory controllers for memory node that's going away.
604 		 */
605 
606 		err = drmach_mem_disable(s_mp->sbm_cm.sbdev_id);
607 		if (err) {
608 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
609 			rv = -1;
610 		}
611 	} else {
612 		rv = dr_move_memory(hp, s_mp, t_mp);
613 		PR_MEM("%s: %s memory COPY-RENAME (board %d -> %d)\n",
614 		    f,
615 		    rv ? "FAILED" : "COMPLETED",
616 		    s_mp->sbm_cm.sbdev_bp->b_num,
617 		    t_mp->sbm_cm.sbdev_bp->b_num);
618 
619 		if (rv != 0)
620 			(void) dr_cancel_mem(s_mp);
621 	}
622 
623 	if (rv == 0) {
624 		sbd_error_t *err;
625 
626 		dr_lock_status(hp->h_bd);
627 		err = drmach_unconfigure(s_mp->sbm_cm.sbdev_id,
628 		    DEVI_BRANCH_DESTROY);
629 		dr_unlock_status(hp->h_bd);
630 		if (err)
631 			sbd_err_clear(&err);
632 	}
633 }
634 
635 /*
636  * XXX workaround for certain lab configurations (see also starcat drmach.c)
637  * Temporary code to get around observed incorrect results from
638  * kphysm_del_span_query when the queried span contains address spans
639  * not occupied by memory in between spans that do have memory.
640  * This routine acts as a wrapper to kphysm_del_span_query.  It builds
641  * a memlist from phys_install of spans that exist between base and
642  * base + npages, inclusively.  Kphysm_del_span_query is called for each
643  * node in the memlist with the results accumulated in *mp.
644  */
645 static int
646 dr_del_span_query(pfn_t base, pgcnt_t npages, memquery_t *mp)
647 {
648 	uint64_t	 pa = _ptob64(base);
649 	uint64_t	 sm = ~ (137438953472ull - 1);
650 	uint64_t	 sa = pa & sm;
651 	struct memlist	*mlist, *ml;
652 	int		 rv;
653 
654 	npages = npages; /* silence lint */
655 	memlist_read_lock();
656 	mlist = memlist_dup(phys_install);
657 	memlist_read_unlock();
658 
659 again:
660 	for (ml = mlist; ml; ml = ml->ml_next) {
661 		if ((ml->ml_address & sm) != sa) {
662 			mlist = memlist_del_span(mlist,
663 			    ml->ml_address, ml->ml_size);
664 			goto again;
665 		}
666 	}
667 
668 	mp->phys_pages = 0;
669 	mp->managed = 0;
670 	mp->nonrelocatable = 0;
671 	mp->first_nonrelocatable = (pfn_t)-1;	/* XXX */
672 	mp->last_nonrelocatable = 0;
673 
674 	for (ml = mlist; ml; ml = ml->ml_next) {
675 		memquery_t mq;
676 
677 		rv = kphysm_del_span_query(
678 		    _b64top(ml->ml_address), _b64top(ml->ml_size), &mq);
679 		if (rv)
680 			break;
681 
682 		mp->phys_pages += mq.phys_pages;
683 		mp->managed += mq.managed;
684 		mp->nonrelocatable += mq.nonrelocatable;
685 
686 		if (mq.nonrelocatable != 0) {
687 			if (mq.first_nonrelocatable < mp->first_nonrelocatable)
688 				mp->first_nonrelocatable =
689 				    mq.first_nonrelocatable;
690 			if (mq.last_nonrelocatable > mp->last_nonrelocatable)
691 				mp->last_nonrelocatable =
692 				    mq.last_nonrelocatable;
693 		}
694 	}
695 
696 	if (mp->nonrelocatable == 0)
697 		mp->first_nonrelocatable = 0;	/* XXX */
698 
699 	memlist_delete(mlist);
700 	return (rv);
701 }
702 
703 #define	kphysm_del_span_query dr_del_span_query
704 
705 /*
706  * NOTE: This routine is only partially smart about multiple
707  *	 mem-units.  Need to make mem-status structure smart
708  *	 about them also.
709  */
710 int
711 dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp)
712 {
713 	int		m, mix;
714 	memdelstat_t	mdst;
715 	memquery_t	mq;
716 	dr_board_t	*bp;
717 	dr_mem_unit_t	*mp;
718 	sbd_mem_stat_t	*msp;
719 	static fn_t	f = "dr_mem_status";
720 
721 	bp = hp->h_bd;
722 	devset &= DR_DEVS_PRESENT(bp);
723 
724 	for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) {
725 		int		rv;
726 		sbd_error_t	*err;
727 		drmach_status_t	 pstat;
728 		dr_mem_unit_t	*p_mp;
729 
730 		if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0)
731 			continue;
732 
733 		mp = dr_get_mem_unit(bp, m);
734 
735 		if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) {
736 			/* present, but not fully initialized */
737 			continue;
738 		}
739 
740 		if (mp->sbm_cm.sbdev_id == (drmachid_t)0)
741 			continue;
742 
743 		/* fetch platform status */
744 		err = drmach_status(mp->sbm_cm.sbdev_id, &pstat);
745 		if (err) {
746 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
747 			continue;
748 		}
749 
750 		msp = &dsp->d_mem;
751 		bzero((caddr_t)msp, sizeof (*msp));
752 
753 		(void) strncpy(msp->ms_cm.c_id.c_name, pstat.type,
754 		    sizeof (msp->ms_cm.c_id.c_name));
755 		msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type;
756 		msp->ms_cm.c_id.c_unit = SBD_NULL_UNIT;
757 		msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond;
758 		msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy;
759 		msp->ms_cm.c_time = mp->sbm_cm.sbdev_time;
760 		msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate;
761 
762 		msp->ms_totpages = mp->sbm_npages;
763 		msp->ms_basepfn = mp->sbm_basepfn;
764 		msp->ms_pageslost = mp->sbm_pageslost;
765 		msp->ms_cage_enabled = kcage_on;
766 
767 		if (mp->sbm_flags & DR_MFLAG_RESERVED)
768 			p_mp = mp->sbm_peer;
769 		else
770 			p_mp = NULL;
771 
772 		if (p_mp == NULL) {
773 			msp->ms_peer_is_target = 0;
774 			msp->ms_peer_ap_id[0] = '\0';
775 		} else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) {
776 			char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
777 			char *minor;
778 
779 			/*
780 			 * b_dip doesn't have to be held for ddi_pathname()
781 			 * because the board struct (dr_board_t) will be
782 			 * destroyed before b_dip detaches.
783 			 */
784 			(void) ddi_pathname(bp->b_dip, path);
785 			minor = strchr(p_mp->sbm_cm.sbdev_path, ':');
786 
787 			(void) snprintf(msp->ms_peer_ap_id,
788 			    sizeof (msp->ms_peer_ap_id), "%s%s",
789 			    path, (minor == NULL) ? "" : minor);
790 
791 			kmem_free(path, MAXPATHLEN);
792 
793 			if (p_mp->sbm_flags & DR_MFLAG_TARGET)
794 				msp->ms_peer_is_target = 1;
795 		}
796 
797 		if (mp->sbm_flags & DR_MFLAG_RELOWNER)
798 			rv = kphysm_del_status(mp->sbm_memhandle, &mdst);
799 		else
800 			rv = KPHYSM_EHANDLE;	/* force 'if' to fail */
801 
802 		if (rv == KPHYSM_OK) {
803 			/*
804 			 * Any pages above managed is "free",
805 			 * i.e. it's collected.
806 			 */
807 			msp->ms_detpages += (uint_t)(mdst.collected +
808 			    mdst.phys_pages - mdst.managed);
809 		} else {
810 			/*
811 			 * If we're UNREFERENCED or UNCONFIGURED,
812 			 * then the number of detached pages is
813 			 * however many pages are on the board.
814 			 * I.e. detached = not in use by OS.
815 			 */
816 			switch (msp->ms_cm.c_ostate) {
817 			/*
818 			 * changed to use cfgadm states
819 			 *
820 			 * was:
821 			 *	case DR_STATE_UNREFERENCED:
822 			 *	case DR_STATE_UNCONFIGURED:
823 			 */
824 			case SBD_STAT_UNCONFIGURED:
825 				msp->ms_detpages = msp->ms_totpages;
826 				break;
827 
828 			default:
829 				break;
830 			}
831 		}
832 
833 		/*
834 		 * kphysm_del_span_query can report non-reloc pages = total
835 		 * pages for memory that is not yet configured
836 		 */
837 		if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) {
838 
839 			rv = kphysm_del_span_query(mp->sbm_basepfn,
840 			    mp->sbm_npages, &mq);
841 
842 			if (rv == KPHYSM_OK) {
843 				msp->ms_managed_pages = mq.managed;
844 				msp->ms_noreloc_pages = mq.nonrelocatable;
845 				msp->ms_noreloc_first =
846 				    mq.first_nonrelocatable;
847 				msp->ms_noreloc_last =
848 				    mq.last_nonrelocatable;
849 				msp->ms_cm.c_sflags = 0;
850 				if (mq.nonrelocatable) {
851 					SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE,
852 					    msp->ms_cm.c_sflags);
853 				}
854 			} else {
855 				PR_MEM("%s: kphysm_del_span_query() = %d\n",
856 				    f, rv);
857 			}
858 		}
859 
860 		/*
861 		 * Check source unit state during copy-rename
862 		 */
863 		if ((mp->sbm_flags & DR_MFLAG_SOURCE) &&
864 		    (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED ||
865 		    mp->sbm_cm.sbdev_state == DR_STATE_RELEASE))
866 			msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED;
867 
868 		mix++;
869 		dsp++;
870 	}
871 
872 	return (mix);
873 }
874 
875 int
876 dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
877 {
878 	_NOTE(ARGUNUSED(hp))
879 
880 	int		err_flag = 0;
881 	int		d;
882 	sbd_error_t	*err;
883 	static fn_t	f = "dr_pre_attach_mem";
884 
885 	PR_MEM("%s...\n", f);
886 
887 	for (d = 0; d < devnum; d++) {
888 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
889 		dr_state_t	state;
890 
891 		cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path);
892 
893 		state = mp->sbm_cm.sbdev_state;
894 		switch (state) {
895 		case DR_STATE_UNCONFIGURED:
896 			PR_MEM("%s: recovering from UNCONFIG for %s\n",
897 			    f,
898 			    mp->sbm_cm.sbdev_path);
899 
900 			/* use memlist cached by dr_post_detach_mem_unit */
901 			ASSERT(mp->sbm_mlist != NULL);
902 			PR_MEM("%s: re-configuring cached memlist for %s:\n",
903 			    f, mp->sbm_cm.sbdev_path);
904 			PR_MEMLIST_DUMP(mp->sbm_mlist);
905 
906 			/* kphysm del handle should be have been freed */
907 			ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
908 
909 			/*FALLTHROUGH*/
910 
911 		case DR_STATE_CONNECTED:
912 			PR_MEM("%s: reprogramming mem hardware on %s\n",
913 			    f, mp->sbm_cm.sbdev_bp->b_path);
914 
915 			PR_MEM("%s: enabling %s\n",
916 			    f, mp->sbm_cm.sbdev_path);
917 
918 			err = drmach_mem_enable(mp->sbm_cm.sbdev_id);
919 			if (err) {
920 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
921 				err_flag = 1;
922 			}
923 			break;
924 
925 		default:
926 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE);
927 			err_flag = 1;
928 			break;
929 		}
930 
931 		/* exit for loop if error encountered */
932 		if (err_flag)
933 			break;
934 	}
935 
936 	return (err_flag ? -1 : 0);
937 }
938 
939 int
940 dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
941 {
942 	_NOTE(ARGUNUSED(hp))
943 
944 	int		d;
945 	static fn_t	f = "dr_post_attach_mem";
946 
947 	PR_MEM("%s...\n", f);
948 
949 	for (d = 0; d < devnum; d++) {
950 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
951 		struct memlist	*mlist, *ml;
952 
953 		mlist = dr_get_memlist(mp);
954 		if (mlist == NULL) {
955 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_MEMFAIL);
956 			continue;
957 		}
958 
959 		/*
960 		 * Verify the memory really did successfully attach
961 		 * by checking for its existence in phys_install.
962 		 */
963 		memlist_read_lock();
964 		if (memlist_intersect(phys_install, mlist) == 0) {
965 			memlist_read_unlock();
966 
967 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
968 
969 			PR_MEM("%s: %s memlist not in phys_install",
970 			    f, mp->sbm_cm.sbdev_path);
971 
972 			memlist_delete(mlist);
973 			continue;
974 		}
975 		memlist_read_unlock();
976 
977 		for (ml = mlist; ml != NULL; ml = ml->ml_next) {
978 			sbd_error_t *err;
979 
980 			err = drmach_mem_add_span(
981 			    mp->sbm_cm.sbdev_id,
982 			    ml->ml_address,
983 			    ml->ml_size);
984 			if (err)
985 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
986 		}
987 
988 		memlist_delete(mlist);
989 
990 		/*
991 		 * Destroy cached memlist, if any.
992 		 * There will be a cached memlist in sbm_mlist if
993 		 * this board is being configured directly after
994 		 * an unconfigure.
995 		 * To support this transition, dr_post_detach_mem
996 		 * left a copy of the last known memlist in sbm_mlist.
997 		 * This memlist could differ from any derived from
998 		 * hardware if while this memunit was last configured
999 		 * the system detected and deleted bad pages from
1000 		 * phys_install.  The location of those bad pages
1001 		 * will be reflected in the cached memlist.
1002 		 */
1003 		if (mp->sbm_mlist) {
1004 			memlist_delete(mp->sbm_mlist);
1005 			mp->sbm_mlist = NULL;
1006 		}
1007 
1008 /*
1009  * TODO: why is this call to dr_init_mem_unit_data here?
1010  * this has been done at discovery or connect time, so this is
1011  * probably redundant and unnecessary.
1012  */
1013 		dr_init_mem_unit_data(mp);
1014 	}
1015 
1016 	return (0);
1017 }
1018 
1019 int
1020 dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1021 {
1022 	_NOTE(ARGUNUSED(hp))
1023 
1024 	int d;
1025 
1026 	for (d = 0; d < devnum; d++) {
1027 		dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d];
1028 
1029 		cmn_err(CE_CONT, "OS unconfigure %s", mp->sbm_cm.sbdev_path);
1030 	}
1031 
1032 	return (0);
1033 }
1034 
1035 
1036 int
1037 dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1038 {
1039 	_NOTE(ARGUNUSED(hp))
1040 
1041 	int		d, rv;
1042 	static fn_t	f = "dr_post_detach_mem";
1043 
1044 	PR_MEM("%s...\n", f);
1045 
1046 	rv = 0;
1047 	for (d = 0; d < devnum; d++) {
1048 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1049 
1050 		ASSERT(mp->sbm_cm.sbdev_bp == hp->h_bd);
1051 
1052 		if (dr_post_detach_mem_unit(mp))
1053 			rv = -1;
1054 	}
1055 
1056 	return (rv);
1057 }
1058 
1059 static void
1060 dr_add_memory_spans(dr_mem_unit_t *mp, struct memlist *ml)
1061 {
1062 	static fn_t	f = "dr_add_memory_spans";
1063 
1064 	PR_MEM("%s...", f);
1065 	PR_MEMLIST_DUMP(ml);
1066 
1067 #ifdef DEBUG
1068 	memlist_read_lock();
1069 	if (memlist_intersect(phys_install, ml)) {
1070 		PR_MEM("%s:WARNING: memlist intersects with phys_install\n", f);
1071 	}
1072 	memlist_read_unlock();
1073 #endif
1074 
1075 	for (; ml; ml = ml->ml_next) {
1076 		pfn_t		 base;
1077 		pgcnt_t		 npgs;
1078 		int		 rv;
1079 		sbd_error_t	*err;
1080 
1081 		base = _b64top(ml->ml_address);
1082 		npgs = _b64top(ml->ml_size);
1083 
1084 		rv = kphysm_add_memory_dynamic(base, npgs);
1085 
1086 		err = drmach_mem_add_span(
1087 		    mp->sbm_cm.sbdev_id,
1088 		    ml->ml_address,
1089 		    ml->ml_size);
1090 
1091 		if (err)
1092 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1093 
1094 		if (rv != KPHYSM_OK) {
1095 			cmn_err(CE_WARN, "%s:"
1096 			    " unexpected kphysm_add_memory_dynamic"
1097 			    " return value %d;"
1098 			    " basepfn=0x%lx, npages=%ld\n",
1099 			    f, rv, base, npgs);
1100 
1101 			continue;
1102 		}
1103 	}
1104 }
1105 
1106 static int
1107 dr_post_detach_mem_unit(dr_mem_unit_t *s_mp)
1108 {
1109 	uint64_t	sz = s_mp->sbm_slice_size;
1110 	uint64_t	sm = sz - 1;
1111 	/* old and new below refer to PAs before and after copy-rename */
1112 	uint64_t	s_old_basepa, s_new_basepa;
1113 	uint64_t	t_old_basepa, t_new_basepa;
1114 	uint64_t	t_new_smallsize = 0;
1115 	dr_mem_unit_t	*t_mp, *x_mp;
1116 	struct memlist	*ml;
1117 	int		rv;
1118 	sbd_error_t	*err;
1119 	static fn_t	f = "dr_post_detach_mem_unit";
1120 
1121 	PR_MEM("%s...\n", f);
1122 
1123 	/* s_mp->sbm_del_mlist could be NULL, meaning no deleted spans */
1124 	PR_MEM("%s: %s: deleted memlist (EMPTY maybe okay):\n",
1125 	    f, s_mp->sbm_cm.sbdev_path);
1126 	PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1127 
1128 	/* sanity check */
1129 	ASSERT(s_mp->sbm_del_mlist == NULL ||
1130 	    (s_mp->sbm_flags & DR_MFLAG_RELDONE) != 0);
1131 
1132 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1133 		t_mp = s_mp->sbm_peer;
1134 		ASSERT(t_mp != NULL);
1135 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1136 		ASSERT(t_mp->sbm_peer == s_mp);
1137 
1138 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RELDONE);
1139 		ASSERT(t_mp->sbm_del_mlist);
1140 
1141 		PR_MEM("%s: target %s: deleted memlist:\n",
1142 		    f, t_mp->sbm_cm.sbdev_path);
1143 		PR_MEMLIST_DUMP(t_mp->sbm_del_mlist);
1144 	} else {
1145 		/* this is no target unit */
1146 		t_mp = NULL;
1147 	}
1148 
1149 	/*
1150 	 * Verify the memory really did successfully detach
1151 	 * by checking for its non-existence in phys_install.
1152 	 */
1153 	rv = 0;
1154 	memlist_read_lock();
1155 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
1156 		x_mp = s_mp;
1157 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1158 	}
1159 	if (rv == 0 && t_mp && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
1160 		x_mp = t_mp;
1161 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1162 	}
1163 	memlist_read_unlock();
1164 
1165 	if (rv) {
1166 		/* error: memlist still in phys_install */
1167 		DR_DEV_INTERNAL_ERROR(&x_mp->sbm_cm);
1168 	}
1169 
1170 	/*
1171 	 * clean mem unit state and bail out if an error has been recorded.
1172 	 */
1173 	rv = 0;
1174 	if (s_mp->sbm_cm.sbdev_error) {
1175 		PR_MEM("%s: %s flags=%x", f,
1176 		    s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1177 		DR_DEV_CLR_UNREFERENCED(&s_mp->sbm_cm);
1178 		DR_DEV_CLR_RELEASED(&s_mp->sbm_cm);
1179 		dr_device_transition(&s_mp->sbm_cm, DR_STATE_CONFIGURED);
1180 		rv = -1;
1181 	}
1182 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error != NULL) {
1183 		PR_MEM("%s: %s flags=%x", f,
1184 		    s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1185 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1186 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1187 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1188 		rv = -1;
1189 	}
1190 	if (rv)
1191 		goto cleanup;
1192 
1193 	s_old_basepa = _ptob64(s_mp->sbm_basepfn);
1194 	err = drmach_mem_get_base_physaddr(s_mp->sbm_cm.sbdev_id,
1195 	    &s_new_basepa);
1196 	ASSERT(err == NULL);
1197 
1198 	PR_MEM("%s:s_old_basepa: 0x%lx\n", f, s_old_basepa);
1199 	PR_MEM("%s:s_new_basepa: 0x%lx\n", f, s_new_basepa);
1200 
1201 	if (t_mp != NULL) {
1202 		struct memlist *s_copy_mlist;
1203 
1204 		t_old_basepa	= _ptob64(t_mp->sbm_basepfn);
1205 		err = drmach_mem_get_base_physaddr(t_mp->sbm_cm.sbdev_id,
1206 		    &t_new_basepa);
1207 		ASSERT(err == NULL);
1208 
1209 		PR_MEM("%s:t_old_basepa: 0x%lx\n", f, t_old_basepa);
1210 		PR_MEM("%s:t_new_basepa: 0x%lx\n", f, t_new_basepa);
1211 
1212 		/*
1213 		 * Construct copy list with original source addresses.
1214 		 * Used to add back excess target mem.
1215 		 */
1216 		s_copy_mlist = memlist_dup(s_mp->sbm_mlist);
1217 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1218 			s_copy_mlist = memlist_del_span(s_copy_mlist,
1219 			    ml->ml_address, ml->ml_size);
1220 		}
1221 
1222 		PR_MEM("%s: source copy list:\n:", f);
1223 		PR_MEMLIST_DUMP(s_copy_mlist);
1224 
1225 		/*
1226 		 * We had to swap mem-units, so update
1227 		 * memlists accordingly with new base
1228 		 * addresses.
1229 		 */
1230 		for (ml = t_mp->sbm_mlist; ml; ml = ml->ml_next) {
1231 			ml->ml_address -= t_old_basepa;
1232 			ml->ml_address += t_new_basepa;
1233 		}
1234 
1235 		/*
1236 		 * There is no need to explicitly rename the target delete
1237 		 * memlist, because sbm_del_mlist and sbm_mlist always
1238 		 * point to the same memlist for a copy/rename operation.
1239 		 */
1240 		ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1241 
1242 		PR_MEM("%s: renamed target memlist and delete memlist:\n", f);
1243 		PR_MEMLIST_DUMP(t_mp->sbm_mlist);
1244 
1245 		for (ml = s_mp->sbm_mlist; ml; ml = ml->ml_next) {
1246 			ml->ml_address -= s_old_basepa;
1247 			ml->ml_address += s_new_basepa;
1248 		}
1249 
1250 		PR_MEM("%s: renamed source memlist:\n", f);
1251 		PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1252 
1253 		/*
1254 		 * Keep track of dynamically added segments
1255 		 * since they cannot be split if we need to delete
1256 		 * excess source memory later for this board.
1257 		 */
1258 		if (t_mp->sbm_dyn_segs)
1259 			memlist_delete(t_mp->sbm_dyn_segs);
1260 		t_mp->sbm_dyn_segs = s_mp->sbm_dyn_segs;
1261 		s_mp->sbm_dyn_segs = NULL;
1262 
1263 		/*
1264 		 * If the target memory range with the new target base PA
1265 		 * extends beyond the usable slice, prevent any "target excess"
1266 		 * from being added back after this copy/rename and
1267 		 * calculate the new smaller size of the target board
1268 		 * to be set as part of target cleanup. The base + npages
1269 		 * must only include the range of memory up to the end of
1270 		 * this slice. This will only be used after a category 4
1271 		 * large-to-small target type copy/rename - see comments
1272 		 * in dr_select_mem_target.
1273 		 */
1274 		if (((t_new_basepa & sm) + _ptob64(t_mp->sbm_npages)) > sz) {
1275 			t_new_smallsize = sz - (t_new_basepa & sm);
1276 		}
1277 
1278 		if (s_mp->sbm_flags & DR_MFLAG_MEMRESIZE &&
1279 		    t_new_smallsize == 0) {
1280 			struct memlist	*t_excess_mlist;
1281 
1282 			/*
1283 			 * Add back excess target memory.
1284 			 * Subtract out the portion of the target memory
1285 			 * node that was taken over by the source memory
1286 			 * node.
1287 			 */
1288 			t_excess_mlist = memlist_dup(t_mp->sbm_mlist);
1289 			for (ml = s_copy_mlist; ml; ml = ml->ml_next) {
1290 				t_excess_mlist =
1291 				    memlist_del_span(t_excess_mlist,
1292 				    ml->ml_address, ml->ml_size);
1293 			}
1294 
1295 			/*
1296 			 * Update dynamically added segs
1297 			 */
1298 			for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1299 				t_mp->sbm_dyn_segs =
1300 				    memlist_del_span(t_mp->sbm_dyn_segs,
1301 				    ml->ml_address, ml->ml_size);
1302 			}
1303 			for (ml = t_excess_mlist; ml; ml = ml->ml_next) {
1304 				t_mp->sbm_dyn_segs =
1305 				    memlist_cat_span(t_mp->sbm_dyn_segs,
1306 				    ml->ml_address, ml->ml_size);
1307 			}
1308 			PR_MEM("%s: %s: updated dynamic seg list:\n",
1309 			    f, t_mp->sbm_cm.sbdev_path);
1310 			PR_MEMLIST_DUMP(t_mp->sbm_dyn_segs);
1311 
1312 			PR_MEM("%s: adding back remaining portion"
1313 			    " of %s, memlist:\n",
1314 			    f, t_mp->sbm_cm.sbdev_path);
1315 			PR_MEMLIST_DUMP(t_excess_mlist);
1316 
1317 			dr_add_memory_spans(s_mp, t_excess_mlist);
1318 			memlist_delete(t_excess_mlist);
1319 		}
1320 		memlist_delete(s_copy_mlist);
1321 
1322 #ifdef DEBUG
1323 		/*
1324 		 * Renaming s_mp->sbm_del_mlist is not necessary.  This
1325 		 * list is not used beyond this point, and in fact, is
1326 		 * disposed of at the end of this function.
1327 		 */
1328 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->ml_next) {
1329 			ml->ml_address -= s_old_basepa;
1330 			ml->ml_address += s_new_basepa;
1331 		}
1332 
1333 		PR_MEM("%s: renamed source delete memlist", f);
1334 		PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1335 #endif
1336 
1337 	}
1338 
1339 	if (t_mp != NULL) {
1340 		/* delete target's entire address space */
1341 		err = drmach_mem_del_span(t_mp->sbm_cm.sbdev_id,
1342 		    t_old_basepa & ~ sm, sz);
1343 		if (err)
1344 			DRERR_SET_C(&t_mp->sbm_cm.sbdev_error, &err);
1345 		ASSERT(err == NULL);
1346 
1347 		/*
1348 		 * After the copy/rename, the original address space
1349 		 * for the source board (which is now located on the
1350 		 * target board) may now have some excess to be deleted.
1351 		 * The amount is calculated by masking the slice
1352 		 * info and keeping the slice offset from t_new_basepa.
1353 		 */
1354 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1355 		    s_old_basepa & ~ sm, t_new_basepa & sm);
1356 		if (err)
1357 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1358 		ASSERT(err == NULL);
1359 
1360 	} else {
1361 		/* delete board's entire address space */
1362 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1363 		    s_old_basepa & ~ sm, sz);
1364 		if (err)
1365 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1366 		ASSERT(err == NULL);
1367 	}
1368 
1369 cleanup:
1370 	/* clean up target mem unit */
1371 	if (t_mp != NULL) {
1372 		memlist_delete(t_mp->sbm_del_mlist);
1373 		/* no need to delete sbm_mlist, it shares sbm_del_mlist */
1374 
1375 		t_mp->sbm_del_mlist = NULL;
1376 		t_mp->sbm_mlist = NULL;
1377 		t_mp->sbm_peer = NULL;
1378 		t_mp->sbm_flags = 0;
1379 		t_mp->sbm_cm.sbdev_busy = 0;
1380 		dr_init_mem_unit_data(t_mp);
1381 
1382 		/* reduce target size if new PAs go past end of usable slice */
1383 		if (t_new_smallsize > 0) {
1384 			t_mp->sbm_npages = _b64top(t_new_smallsize);
1385 			PR_MEM("%s: target new size 0x%lx bytes\n",
1386 			    f, t_new_smallsize);
1387 		}
1388 	}
1389 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error == NULL) {
1390 		/*
1391 		 * now that copy/rename has completed, undo this
1392 		 * work that was done in dr_release_mem_done.
1393 		 */
1394 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1395 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1396 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1397 	}
1398 
1399 	/*
1400 	 * clean up (source) board's mem unit structure.
1401 	 * NOTE: sbm_mlist is retained if no error has been record (in other
1402 	 * words, when s_mp->sbm_cm.sbdev_error is NULL). This memlist is
1403 	 * referred to elsewhere as the cached memlist.  The cached memlist
1404 	 * is used to re-attach (configure back in) this memunit from the
1405 	 * unconfigured state.  The memlist is retained because it may
1406 	 * represent bad pages that were detected while the memory was
1407 	 * configured into the OS.  The OS deletes bad pages from phys_install.
1408 	 * Those deletes, if any, will be represented in the cached mlist.
1409 	 */
1410 	if (s_mp->sbm_del_mlist && s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1411 		memlist_delete(s_mp->sbm_del_mlist);
1412 
1413 	if (s_mp->sbm_cm.sbdev_error && s_mp->sbm_mlist) {
1414 		memlist_delete(s_mp->sbm_mlist);
1415 		s_mp->sbm_mlist = NULL;
1416 	}
1417 
1418 	if (s_mp->sbm_dyn_segs != NULL && s_mp->sbm_cm.sbdev_error == 0) {
1419 		memlist_delete(s_mp->sbm_dyn_segs);
1420 		s_mp->sbm_dyn_segs = NULL;
1421 	}
1422 
1423 	s_mp->sbm_del_mlist = NULL;
1424 	s_mp->sbm_peer = NULL;
1425 	s_mp->sbm_flags = 0;
1426 	s_mp->sbm_cm.sbdev_busy = 0;
1427 	dr_init_mem_unit_data(s_mp);
1428 
1429 	PR_MEM("%s: cached memlist for %s:", f, s_mp->sbm_cm.sbdev_path);
1430 	PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1431 
1432 	return (0);
1433 }
1434 
1435 /*
1436  * Successful return from this function will have the memory
1437  * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated
1438  * and waiting.  This routine's job is to select the memory that
1439  * actually has to be released (detached) which may not necessarily
1440  * be the same memory node that came in in devlist[],
1441  * i.e. a copy-rename is needed.
1442  */
1443 int
1444 dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1445 {
1446 	int		d;
1447 	int		err_flag = 0;
1448 	static fn_t	f = "dr_pre_release_mem";
1449 
1450 	PR_MEM("%s...\n", f);
1451 
1452 	for (d = 0; d < devnum; d++) {
1453 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1454 		int		rv;
1455 		memquery_t	mq;
1456 		struct memlist	*ml;
1457 
1458 		if (mp->sbm_cm.sbdev_error) {
1459 			err_flag = 1;
1460 			continue;
1461 		} else if (!kcage_on) {
1462 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_KCAGE_OFF);
1463 			err_flag = 1;
1464 			continue;
1465 		}
1466 
1467 		if (mp->sbm_flags & DR_MFLAG_RESERVED) {
1468 			/*
1469 			 * Board is currently involved in a delete
1470 			 * memory operation. Can't detach this guy until
1471 			 * that operation completes.
1472 			 */
1473 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_INVAL);
1474 			err_flag = 1;
1475 			break;
1476 		}
1477 
1478 		/*
1479 		 * Check whether the detaching memory requires a
1480 		 * copy-rename.
1481 		 */
1482 		ASSERT(mp->sbm_npages != 0);
1483 		rv = kphysm_del_span_query(mp->sbm_basepfn, mp->sbm_npages,
1484 		    &mq);
1485 		if (rv != KPHYSM_OK) {
1486 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1487 			err_flag = 1;
1488 			break;
1489 		}
1490 
1491 		if (mq.nonrelocatable != 0) {
1492 			if (!(dr_cmd_flags(hp) &
1493 			    (SBD_FLAG_FORCE | SBD_FLAG_QUIESCE_OKAY))) {
1494 				/* caller wasn't prompted for a suspend */
1495 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1496 				    ESBD_QUIESCE_REQD);
1497 				err_flag = 1;
1498 				break;
1499 			}
1500 		}
1501 
1502 		/* flags should be clean at this time */
1503 		ASSERT(mp->sbm_flags == 0);
1504 
1505 		ASSERT(mp->sbm_mlist == NULL);		/* should be null */
1506 		ASSERT(mp->sbm_del_mlist == NULL);	/* should be null */
1507 		if (mp->sbm_mlist != NULL) {
1508 			memlist_delete(mp->sbm_mlist);
1509 			mp->sbm_mlist = NULL;
1510 		}
1511 
1512 		ml = dr_get_memlist(mp);
1513 		if (ml == NULL) {
1514 			err_flag = 1;
1515 			PR_MEM("%s: no memlist found for %s\n",
1516 			    f, mp->sbm_cm.sbdev_path);
1517 			continue;
1518 		}
1519 
1520 		/* allocate a kphysm handle */
1521 		rv = kphysm_del_gethandle(&mp->sbm_memhandle);
1522 		if (rv != KPHYSM_OK) {
1523 			memlist_delete(ml);
1524 
1525 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1526 			err_flag = 1;
1527 			break;
1528 		}
1529 		mp->sbm_flags |= DR_MFLAG_RELOWNER;
1530 
1531 		if ((mq.nonrelocatable != 0) ||
1532 		    dr_reserve_mem_spans(&mp->sbm_memhandle, ml)) {
1533 			/*
1534 			 * Either the detaching memory node contains
1535 			 * non-reloc memory or we failed to reserve the
1536 			 * detaching memory node (which did _not_ have
1537 			 * any non-reloc memory, i.e. some non-reloc mem
1538 			 * got onboard).
1539 			 */
1540 
1541 			if (dr_select_mem_target(hp, mp, ml)) {
1542 				int rv;
1543 
1544 				/*
1545 				 * We had no luck locating a target
1546 				 * memory node to be the recipient of
1547 				 * the non-reloc memory on the node
1548 				 * we're trying to detach.
1549 				 * Clean up be disposing the mem handle
1550 				 * and the mem list.
1551 				 */
1552 				rv = kphysm_del_release(mp->sbm_memhandle);
1553 				if (rv != KPHYSM_OK) {
1554 					/*
1555 					 * can do nothing but complain
1556 					 * and hope helpful for debug
1557 					 */
1558 					cmn_err(CE_WARN, "%s: unexpected"
1559 					    " kphysm_del_release return"
1560 					    " value %d",
1561 					    f, rv);
1562 				}
1563 				mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1564 
1565 				memlist_delete(ml);
1566 
1567 				/* make sure sbm_flags is clean */
1568 				ASSERT(mp->sbm_flags == 0);
1569 
1570 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1571 				    ESBD_NO_TARGET);
1572 
1573 				err_flag = 1;
1574 				break;
1575 			}
1576 
1577 			/*
1578 			 * ml is not memlist_delete'd here because
1579 			 * it has been assigned to mp->sbm_mlist
1580 			 * by dr_select_mem_target.
1581 			 */
1582 		} else {
1583 			/* no target needed to detach this board */
1584 			mp->sbm_flags |= DR_MFLAG_RESERVED;
1585 			mp->sbm_peer = NULL;
1586 			mp->sbm_del_mlist = ml;
1587 			mp->sbm_mlist = ml;
1588 			mp->sbm_cm.sbdev_busy = 1;
1589 		}
1590 #ifdef DEBUG
1591 		ASSERT(mp->sbm_mlist != NULL);
1592 
1593 		if (mp->sbm_flags & DR_MFLAG_SOURCE) {
1594 			PR_MEM("%s: release of %s requires copy/rename;"
1595 			    " selected target board %s\n",
1596 			    f,
1597 			    mp->sbm_cm.sbdev_path,
1598 			    mp->sbm_peer->sbm_cm.sbdev_path);
1599 		} else {
1600 			PR_MEM("%s: copy/rename not required to release %s\n",
1601 			    f, mp->sbm_cm.sbdev_path);
1602 		}
1603 
1604 		ASSERT(mp->sbm_flags & DR_MFLAG_RELOWNER);
1605 		ASSERT(mp->sbm_flags & DR_MFLAG_RESERVED);
1606 #endif
1607 	}
1608 
1609 	return (err_flag ? -1 : 0);
1610 }
1611 
1612 void
1613 dr_release_mem_done(dr_common_unit_t *cp)
1614 {
1615 	dr_mem_unit_t	*s_mp = (dr_mem_unit_t *)cp;
1616 	dr_mem_unit_t *t_mp, *mp;
1617 	int		rv;
1618 	static fn_t	f = "dr_release_mem_done";
1619 
1620 	/*
1621 	 * This unit will be flagged with DR_MFLAG_SOURCE, if it
1622 	 * has a target unit.
1623 	 */
1624 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1625 		t_mp = s_mp->sbm_peer;
1626 		ASSERT(t_mp != NULL);
1627 		ASSERT(t_mp->sbm_peer == s_mp);
1628 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1629 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RESERVED);
1630 	} else {
1631 		/* this is no target unit */
1632 		t_mp = NULL;
1633 	}
1634 
1635 	/* free delete handle */
1636 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RELOWNER);
1637 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RESERVED);
1638 	rv = kphysm_del_release(s_mp->sbm_memhandle);
1639 	if (rv != KPHYSM_OK) {
1640 		/*
1641 		 * can do nothing but complain
1642 		 * and hope helpful for debug
1643 		 */
1644 		cmn_err(CE_WARN, "%s: unexpected kphysm_del_release"
1645 		    " return value %d", f, rv);
1646 	}
1647 	s_mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1648 
1649 	/*
1650 	 * If an error was encountered during release, clean up
1651 	 * the source (and target, if present) unit data.
1652 	 */
1653 /* XXX Can we know that sbdev_error was encountered during release? */
1654 	if (s_mp->sbm_cm.sbdev_error != NULL) {
1655 		PR_MEM("%s: %s: error %d noted\n",
1656 		    f,
1657 		    s_mp->sbm_cm.sbdev_path,
1658 		    s_mp->sbm_cm.sbdev_error->e_code);
1659 
1660 		if (t_mp != NULL) {
1661 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1662 			t_mp->sbm_del_mlist = NULL;
1663 
1664 			if (t_mp->sbm_mlist != NULL) {
1665 				memlist_delete(t_mp->sbm_mlist);
1666 				t_mp->sbm_mlist = NULL;
1667 			}
1668 
1669 			t_mp->sbm_peer = NULL;
1670 			t_mp->sbm_flags = 0;
1671 			t_mp->sbm_cm.sbdev_busy = 0;
1672 		}
1673 
1674 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1675 			memlist_delete(s_mp->sbm_del_mlist);
1676 		s_mp->sbm_del_mlist = NULL;
1677 
1678 		if (s_mp->sbm_mlist != NULL) {
1679 			memlist_delete(s_mp->sbm_mlist);
1680 			s_mp->sbm_mlist = NULL;
1681 		}
1682 
1683 		s_mp->sbm_peer = NULL;
1684 		s_mp->sbm_flags = 0;
1685 		s_mp->sbm_cm.sbdev_busy = 0;
1686 
1687 		/* bail out */
1688 		return;
1689 	}
1690 
1691 	DR_DEV_SET_RELEASED(&s_mp->sbm_cm);
1692 	dr_device_transition(&s_mp->sbm_cm, DR_STATE_RELEASE);
1693 
1694 	if (t_mp != NULL) {
1695 		/*
1696 		 * the kphysm delete operation that drained the source
1697 		 * board also drained this target board.  Since the source
1698 		 * board drain is now known to have succeeded, we know this
1699 		 * target board is drained too.
1700 		 *
1701 		 * because DR_DEV_SET_RELEASED and dr_device_transition
1702 		 * is done here, the dr_release_dev_done should not
1703 		 * fail.
1704 		 */
1705 		DR_DEV_SET_RELEASED(&t_mp->sbm_cm);
1706 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_RELEASE);
1707 
1708 		/*
1709 		 * NOTE: do not transition target's board state,
1710 		 * even if the mem-unit was the last configure
1711 		 * unit of the board.  When copy/rename completes
1712 		 * this mem-unit will transitioned back to
1713 		 * the configured state.  In the meantime, the
1714 		 * board's must remain as is.
1715 		 */
1716 	}
1717 
1718 	/* if board(s) had deleted memory, verify it is gone */
1719 	rv = 0;
1720 	memlist_read_lock();
1721 	if (s_mp->sbm_del_mlist != NULL) {
1722 		mp = s_mp;
1723 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1724 	}
1725 	if (rv == 0 && t_mp && t_mp->sbm_del_mlist != NULL) {
1726 		mp = t_mp;
1727 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1728 	}
1729 	memlist_read_unlock();
1730 	if (rv) {
1731 		cmn_err(CE_WARN, "%s: %smem-unit (%d.%d): "
1732 		    "deleted memory still found in phys_install",
1733 		    f,
1734 		    (mp == t_mp ? "target " : ""),
1735 		    mp->sbm_cm.sbdev_bp->b_num,
1736 		    mp->sbm_cm.sbdev_unum);
1737 
1738 		DR_DEV_INTERNAL_ERROR(&s_mp->sbm_cm);
1739 		return;
1740 	}
1741 
1742 	s_mp->sbm_flags |= DR_MFLAG_RELDONE;
1743 	if (t_mp != NULL)
1744 		t_mp->sbm_flags |= DR_MFLAG_RELDONE;
1745 
1746 	/* this should not fail */
1747 	if (dr_release_dev_done(&s_mp->sbm_cm) != 0) {
1748 		/* catch this in debug kernels */
1749 		ASSERT(0);
1750 		return;
1751 	}
1752 
1753 	PR_MEM("%s: marking %s release DONE\n",
1754 	    f, s_mp->sbm_cm.sbdev_path);
1755 
1756 	s_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1757 
1758 	if (t_mp != NULL) {
1759 		/* should not fail */
1760 		rv = dr_release_dev_done(&t_mp->sbm_cm);
1761 		if (rv != 0) {
1762 			/* catch this in debug kernels */
1763 			ASSERT(0);
1764 			return;
1765 		}
1766 
1767 		PR_MEM("%s: marking %s release DONE\n",
1768 		    f, t_mp->sbm_cm.sbdev_path);
1769 
1770 		t_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1771 	}
1772 }
1773 
1774 /*ARGSUSED*/
1775 int
1776 dr_disconnect_mem(dr_mem_unit_t *mp)
1777 {
1778 	static fn_t	f = "dr_disconnect_mem";
1779 	update_membounds_t umb;
1780 
1781 #ifdef DEBUG
1782 	int state = mp->sbm_cm.sbdev_state;
1783 	ASSERT(state == DR_STATE_CONNECTED || state == DR_STATE_UNCONFIGURED);
1784 #endif
1785 
1786 	PR_MEM("%s...\n", f);
1787 
1788 	if (mp->sbm_del_mlist && mp->sbm_del_mlist != mp->sbm_mlist)
1789 		memlist_delete(mp->sbm_del_mlist);
1790 	mp->sbm_del_mlist = NULL;
1791 
1792 	if (mp->sbm_mlist) {
1793 		memlist_delete(mp->sbm_mlist);
1794 		mp->sbm_mlist = NULL;
1795 	}
1796 
1797 	/*
1798 	 * Remove memory from lgroup
1799 	 * For now, only board info is required.
1800 	 */
1801 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1802 	umb.u_base = (uint64_t)-1;
1803 	umb.u_len = (uint64_t)-1;
1804 
1805 	lgrp_plat_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)&umb);
1806 
1807 	return (0);
1808 }
1809 
1810 int
1811 dr_cancel_mem(dr_mem_unit_t *s_mp)
1812 {
1813 	dr_mem_unit_t	*t_mp;
1814 	dr_state_t	state;
1815 	static fn_t	f = "dr_cancel_mem";
1816 
1817 	state = s_mp->sbm_cm.sbdev_state;
1818 
1819 	if (s_mp->sbm_flags & DR_MFLAG_TARGET) {
1820 		/* must cancel source board, not target board */
1821 		/* TODO: set error */
1822 		return (-1);
1823 	} else if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1824 		t_mp = s_mp->sbm_peer;
1825 		ASSERT(t_mp != NULL);
1826 		ASSERT(t_mp->sbm_peer == s_mp);
1827 
1828 		/* must always match the source board's state */
1829 /* TODO: is this assertion correct? */
1830 		ASSERT(t_mp->sbm_cm.sbdev_state == state);
1831 	} else {
1832 		/* this is no target unit */
1833 		t_mp = NULL;
1834 	}
1835 
1836 	switch (state) {
1837 	case DR_STATE_UNREFERENCED:	/* state set by dr_release_dev_done */
1838 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1839 
1840 		if (t_mp != NULL && t_mp->sbm_del_mlist != NULL) {
1841 			PR_MEM("%s: undoing target %s memory delete\n",
1842 			    f, t_mp->sbm_cm.sbdev_path);
1843 			dr_add_memory_spans(t_mp, t_mp->sbm_del_mlist);
1844 
1845 			DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1846 		}
1847 
1848 		if (s_mp->sbm_del_mlist != NULL) {
1849 			PR_MEM("%s: undoing %s memory delete\n",
1850 			    f, s_mp->sbm_cm.sbdev_path);
1851 
1852 			dr_add_memory_spans(s_mp, s_mp->sbm_del_mlist);
1853 		}
1854 
1855 		/*FALLTHROUGH*/
1856 
1857 /* TODO: should no longer be possible to see the release state here */
1858 	case DR_STATE_RELEASE:	/* state set by dr_release_mem_done */
1859 
1860 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1861 
1862 		if (t_mp != NULL) {
1863 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1864 			t_mp->sbm_del_mlist = NULL;
1865 
1866 			if (t_mp->sbm_mlist != NULL) {
1867 				memlist_delete(t_mp->sbm_mlist);
1868 				t_mp->sbm_mlist = NULL;
1869 			}
1870 
1871 			t_mp->sbm_peer = NULL;
1872 			t_mp->sbm_flags = 0;
1873 			t_mp->sbm_cm.sbdev_busy = 0;
1874 			dr_init_mem_unit_data(t_mp);
1875 
1876 			DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1877 
1878 			dr_device_transition(&t_mp->sbm_cm,
1879 			    DR_STATE_CONFIGURED);
1880 		}
1881 
1882 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1883 			memlist_delete(s_mp->sbm_del_mlist);
1884 		s_mp->sbm_del_mlist = NULL;
1885 
1886 		if (s_mp->sbm_mlist != NULL) {
1887 			memlist_delete(s_mp->sbm_mlist);
1888 			s_mp->sbm_mlist = NULL;
1889 		}
1890 
1891 		s_mp->sbm_peer = NULL;
1892 		s_mp->sbm_flags = 0;
1893 		s_mp->sbm_cm.sbdev_busy = 0;
1894 		dr_init_mem_unit_data(s_mp);
1895 
1896 		return (0);
1897 
1898 	default:
1899 		PR_MEM("%s: WARNING unexpected state (%d) for %s\n",
1900 		    f, (int)state, s_mp->sbm_cm.sbdev_path);
1901 
1902 		return (-1);
1903 	}
1904 	/*NOTREACHED*/
1905 }
1906 
1907 void
1908 dr_init_mem_unit(dr_mem_unit_t *mp)
1909 {
1910 	dr_state_t	new_state;
1911 
1912 
1913 	if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) {
1914 		new_state = DR_STATE_CONFIGURED;
1915 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1916 	} else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) {
1917 		new_state = DR_STATE_CONNECTED;
1918 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1919 	} else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) {
1920 		new_state = DR_STATE_OCCUPIED;
1921 	} else {
1922 		new_state = DR_STATE_EMPTY;
1923 	}
1924 
1925 	if (DR_DEV_IS_PRESENT(&mp->sbm_cm))
1926 		dr_init_mem_unit_data(mp);
1927 
1928 	/* delay transition until fully initialized */
1929 	dr_device_transition(&mp->sbm_cm, new_state);
1930 }
1931 
1932 static void
1933 dr_init_mem_unit_data(dr_mem_unit_t *mp)
1934 {
1935 	drmachid_t	id = mp->sbm_cm.sbdev_id;
1936 	uint64_t	bytes;
1937 	sbd_error_t	*err;
1938 	static fn_t	f = "dr_init_mem_unit_data";
1939 	update_membounds_t umb;
1940 
1941 	PR_MEM("%s...\n", f);
1942 
1943 	/* a little sanity checking */
1944 	ASSERT(mp->sbm_peer == NULL);
1945 	ASSERT(mp->sbm_flags == 0);
1946 
1947 	/* get basepfn of mem unit */
1948 	err = drmach_mem_get_base_physaddr(id, &bytes);
1949 	if (err) {
1950 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1951 		mp->sbm_basepfn = (pfn_t)-1;
1952 	} else
1953 		mp->sbm_basepfn = _b64top(bytes);
1954 
1955 	/* attempt to get number of pages from PDA */
1956 	err = drmach_mem_get_size(id, &bytes);
1957 	if (err) {
1958 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1959 		mp->sbm_npages = 0;
1960 	} else
1961 		mp->sbm_npages = _b64top(bytes);
1962 
1963 	/* if didn't work, calculate using memlist */
1964 	if (mp->sbm_npages == 0) {
1965 		struct memlist	*ml, *mlist;
1966 		/*
1967 		 * Either we couldn't open the PDA or our
1968 		 * PDA has garbage in it.  We must have the
1969 		 * page count consistent and whatever the
1970 		 * OS states has precedence over the PDA
1971 		 * so let's check the kernel.
1972 		 */
1973 /* TODO: curious comment. it suggests pda query should happen if this fails */
1974 		PR_MEM("%s: PDA query failed for npages."
1975 		    " Checking memlist for %s\n",
1976 		    f, mp->sbm_cm.sbdev_path);
1977 
1978 		mlist = dr_get_memlist(mp);
1979 		for (ml = mlist; ml; ml = ml->ml_next)
1980 			mp->sbm_npages += btop(ml->ml_size);
1981 		memlist_delete(mlist);
1982 	}
1983 
1984 	err = drmach_mem_get_alignment(id, &bytes);
1985 	if (err) {
1986 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1987 		mp->sbm_alignment_mask = 0;
1988 	} else
1989 		mp->sbm_alignment_mask = _b64top(bytes);
1990 
1991 	err = drmach_mem_get_slice_size(id, &bytes);
1992 	if (err) {
1993 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1994 		mp->sbm_slice_size = 0; /* paranoia */
1995 	} else
1996 		mp->sbm_slice_size = bytes;
1997 
1998 	/*
1999 	 * Add memory to lgroup
2000 	 */
2001 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
2002 	umb.u_base = (uint64_t)mp->sbm_basepfn << MMU_PAGESHIFT;
2003 	umb.u_len = (uint64_t)mp->sbm_npages << MMU_PAGESHIFT;
2004 
2005 	lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb);
2006 
2007 	PR_MEM("%s: %s (basepfn = 0x%lx, npgs = %ld)\n",
2008 	    f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages);
2009 }
2010 
2011 static int
2012 dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *ml)
2013 {
2014 	int		err;
2015 	pfn_t		base;
2016 	pgcnt_t		npgs;
2017 	struct memlist	*mc;
2018 	static fn_t	f = "dr_reserve_mem_spans";
2019 
2020 	PR_MEM("%s...\n", f);
2021 
2022 	/*
2023 	 * Walk the supplied memlist scheduling each span for removal
2024 	 * with kphysm_del_span.  It is possible that a span may intersect
2025 	 * an area occupied by the cage.
2026 	 */
2027 	for (mc = ml; mc != NULL; mc = mc->ml_next) {
2028 		base = _b64top(mc->ml_address);
2029 		npgs = _b64top(mc->ml_size);
2030 
2031 		err = kphysm_del_span(*mhp, base, npgs);
2032 		if (err != KPHYSM_OK) {
2033 			cmn_err(CE_WARN, "%s memory reserve failed."
2034 			    " unexpected kphysm_del_span return value %d;"
2035 			    " basepfn=0x%lx npages=%ld",
2036 			    f, err, base, npgs);
2037 
2038 			return (-1);
2039 		}
2040 	}
2041 
2042 	return (0);
2043 }
2044 
/*
 * debug counters
 *
 * NOTE(review): presumably updated by dr_select_mem_target ("smt");
 * dr_smt_preference has four entries, which matches the four preference
 * categories described for that function -- confirm against its body.
 */
int dr_smt_realigned;
int dr_smt_preference[4];

#ifdef DEBUG
/* DEBUG kernels only: bitmask of boards to exclude as candidates */
uint_t dr_ignore_board; /* if bit[bnum-1] set, board won't be candidate */
#endif
2052 
2053 /*
2054  * Find and reserve a copy/rename target board suitable for the
2055  * given source board.
2056  * All boards in the system are examined and categorized in relation to
2057  * their memory size versus the source board's memory size.  Order of
2058  * preference is:
2059  *	1st: board has same memory size
2060  * 	2nd: board has larger memory size
2061  *	3rd: board has smaller memory size
2062  *	4th: board has smaller memory size, available memory will be reduced.
 * Boards in categories 3 and 4 will have their MCs reprogrammed so that the
 * address span to which each MC responds appropriately covers the
 * nonrelocatable span of the source board.
2066  */
2067 static int
2068 dr_select_mem_target(dr_handle_t *hp,
2069 	dr_mem_unit_t *s_mp, struct memlist *s_ml)
2070 {
2071 	pgcnt_t		sz = _b64top(s_mp->sbm_slice_size);
2072 	pgcnt_t		sm = sz - 1; /* mem_slice_mask */
2073 	pfn_t		s_phi, t_phi;
2074 
2075 	int		n_sets = 4; /* same, larger, smaller, clipped */
2076 	int		preference; /* lower value is higher preference */
2077 	int		n_units_per_set;
2078 	int		idx;
2079 	dr_mem_unit_t	**sets;
2080 
2081 	int		t_bd;
2082 	int		t_unit;
2083 	int		rv;
2084 	int		allow_src_memrange_modify;
2085 	int		allow_targ_memrange_modify;
2086 	drmachid_t	t_id;
2087 	dr_board_t	*s_bp, *t_bp;
2088 	dr_mem_unit_t	*t_mp, *c_mp;
2089 	struct memlist	*d_ml, *t_ml, *x_ml;
2090 	memquery_t	s_mq = {0};
2091 	static fn_t	f = "dr_select_mem_target";
2092 
2093 	PR_MEM("%s...\n", f);
2094 
2095 	ASSERT(s_ml != NULL);
2096 
2097 	n_units_per_set = MAX_BOARDS * MAX_MEM_UNITS_PER_BOARD;
2098 	sets = GETSTRUCT(dr_mem_unit_t *, n_units_per_set * n_sets);
2099 
2100 	s_bp = hp->h_bd;
2101 	/* calculate the offset into the slice of the last source board pfn */
2102 	ASSERT(s_mp->sbm_npages != 0);
2103 	s_phi = (s_mp->sbm_basepfn + s_mp->sbm_npages - 1) & sm;
2104 
2105 	allow_src_memrange_modify = drmach_allow_memrange_modify(s_bp->b_id);
2106 
2107 	/*
2108 	 * Make one pass through all memory units on all boards
2109 	 * and categorize them with respect to the source board.
2110 	 */
2111 	for (t_bd = 0; t_bd < MAX_BOARDS; t_bd++) {
2112 		/*
2113 		 * The board structs are a contiguous array
2114 		 * so we take advantage of that to find the
2115 		 * correct board struct pointer for a given
2116 		 * board number.
2117 		 */
2118 		t_bp = dr_lookup_board(t_bd);
2119 
2120 		/* source board can not be its own target */
2121 		if (s_bp->b_num == t_bp->b_num)
2122 			continue;
2123 
2124 		for (t_unit = 0; t_unit < MAX_MEM_UNITS_PER_BOARD; t_unit++) {
2125 
2126 			t_mp = dr_get_mem_unit(t_bp, t_unit);
2127 
2128 			/* this memory node must be attached */
2129 			if (!DR_DEV_IS_ATTACHED(&t_mp->sbm_cm))
2130 				continue;
2131 
2132 			/* source unit can not be its own target */
2133 			if (s_mp == t_mp) {
2134 				/* catch this is debug kernels */
2135 				ASSERT(0);
2136 				continue;
2137 			}
2138 
2139 			/*
2140 			 * this memory node must not already be reserved
2141 			 * by some other memory delete operation.
2142 			 */
2143 			if (t_mp->sbm_flags & DR_MFLAG_RESERVED)
2144 				continue;
2145 
2146 			/*
2147 			 * categorize the memory node
2148 			 * If this is a smaller memory node, create a
2149 			 * temporary, edited copy of the source board's
2150 			 * memlist containing only the span of the non-
2151 			 * relocatable pages.
2152 			 */
2153 			t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2154 			t_id = t_mp->sbm_cm.sbdev_bp->b_id;
2155 			allow_targ_memrange_modify =
2156 			    drmach_allow_memrange_modify(t_id);
2157 			if (t_mp->sbm_npages == s_mp->sbm_npages &&
2158 			    t_phi == s_phi) {
2159 				preference = 0;
2160 				t_mp->sbm_slice_offset = 0;
2161 			} else if (t_mp->sbm_npages > s_mp->sbm_npages &&
2162 			    t_phi > s_phi) {
2163 				/*
2164 				 * Selecting this target will require modifying
2165 				 * the source and/or target physical address
2166 				 * ranges.  Skip if not supported by platform.
2167 				 */
2168 				if (!allow_src_memrange_modify ||
2169 				    !allow_targ_memrange_modify) {
2170 					PR_MEM("%s: skip target %s, memory "
2171 					    "range relocation not supported "
2172 					    "by platform\n", f,
2173 					    t_mp->sbm_cm.sbdev_path);
2174 					continue;
2175 				}
2176 				preference = 1;
2177 				t_mp->sbm_slice_offset = 0;
2178 			} else {
2179 				pfn_t		pfn = 0;
2180 
2181 				/*
2182 				 * Selecting this target will require modifying
2183 				 * the source and/or target physical address
2184 				 * ranges.  Skip if not supported by platform.
2185 				 */
2186 				if (!allow_src_memrange_modify ||
2187 				    !allow_targ_memrange_modify) {
2188 					PR_MEM("%s: skip target %s, memory "
2189 					    "range relocation not supported "
2190 					    "by platform\n", f,
2191 					    t_mp->sbm_cm.sbdev_path);
2192 					continue;
2193 				}
2194 
2195 				/*
2196 				 * Check if its mc can be programmed to relocate
2197 				 * the active address range to match the
2198 				 * nonrelocatable span of the source board.
2199 				 */
2200 				preference = 2;
2201 
2202 				if (s_mq.phys_pages == 0) {
2203 					/*
2204 					 * find non-relocatable span on
2205 					 * source board.
2206 					 */
2207 					rv = kphysm_del_span_query(
2208 					    s_mp->sbm_basepfn,
2209 					    s_mp->sbm_npages, &s_mq);
2210 					if (rv != KPHYSM_OK) {
2211 						PR_MEM("%s: %s: unexpected"
2212 						    " kphysm_del_span_query"
2213 						    " return value %d;"
2214 						    " basepfn 0x%lx,"
2215 						    " npages %ld\n",
2216 						    f,
2217 						    s_mp->sbm_cm.sbdev_path,
2218 						    rv,
2219 						    s_mp->sbm_basepfn,
2220 						    s_mp->sbm_npages);
2221 
2222 						/* paranoia */
2223 						s_mq.phys_pages = 0;
2224 
2225 						continue;
2226 					}
2227 
2228 					/* more paranoia */
2229 					ASSERT(s_mq.phys_pages != 0);
2230 					ASSERT(s_mq.nonrelocatable != 0);
2231 
2232 					/*
2233 					 * this should not happen
2234 					 * if it does, it simply means that
2235 					 * we can not proceed with qualifying
2236 					 * this target candidate.
2237 					 */
2238 					if (s_mq.nonrelocatable == 0)
2239 						continue;
2240 
2241 					PR_MEM("%s: %s: nonrelocatable"
2242 					    " span (0x%lx..0x%lx)\n",
2243 					    f,
2244 					    s_mp->sbm_cm.sbdev_path,
2245 					    s_mq.first_nonrelocatable,
2246 					    s_mq.last_nonrelocatable);
2247 				}
2248 
2249 				/*
2250 				 * Round down the starting pfn of the
2251 				 * nonrelocatable span on the source board
2252 				 * to nearest programmable boundary possible
2253 				 * with this target candidate.
2254 				 */
2255 				pfn = s_mq.first_nonrelocatable &
2256 				    ~t_mp->sbm_alignment_mask;
2257 
2258 				/* skip candidate if memory is too small */
2259 				if (pfn + t_mp->sbm_npages <
2260 				    s_mq.last_nonrelocatable)
2261 					continue;
2262 
2263 				/*
2264 				 * reprogramming an mc to relocate its
2265 				 * active address range means the beginning
2266 				 * address to which the DIMMS respond will
2267 				 * be somewhere above the slice boundary
2268 				 * address.  The larger the size of memory
2269 				 * on this unit, the more likely part of it
2270 				 * will exist beyond the end of the slice.
2271 				 * The portion of the memory that does is
2272 				 * unavailable to the system until the mc
2273 				 * is reprogrammed to a more favorable base
2274 				 * address.
2275 				 * An attempt is made to avoid the loss by
2276 				 * recalculating the mc base address relative
2277 				 * to the end of the slice.  This may produce
2278 				 * a more favorable result.  If not, we lower
2279 				 * the board's preference rating so that it
2280 				 * is one the last candidate boards to be
2281 				 * is one of the last candidate boards to be
2282 				 */
2283 				if ((pfn + t_mp->sbm_npages) & ~sm) {
2284 					pfn_t p;
2285 
2286 					ASSERT(sz >= t_mp->sbm_npages);
2287 
2288 					/*
2289 					 * calculate an alternative starting
2290 					 * address relative to the end of the
2291 					 * slice's address space.
2292 					 */
2293 					p = pfn & ~sm;
2294 					p = p + (sz - t_mp->sbm_npages);
2295 					p = p & ~t_mp->sbm_alignment_mask;
2296 
2297 					if ((p > s_mq.first_nonrelocatable) ||
2298 					    (p + t_mp->sbm_npages <
2299 					    s_mq.last_nonrelocatable)) {
2300 
2301 						/*
2302 						 * alternative starting addr
2303 						 * won't work. Lower preference
2304 						 * rating of this board, since
2305 						 * some number of pages will
2306 						 * be unavailable for use.
2307 						 */
2308 						preference = 3;
2309 					} else {
2310 						dr_smt_realigned++;
2311 						pfn = p;
2312 					}
2313 				}
2314 
2315 				/*
2316 				 * translate calculated pfn to an offset
2317 				 * relative to the slice boundary.  If the
2318 				 * candidate board is selected, this offset
2319 				 * will be used to calculate the values
2320 				 * programmed into the mc.
2321 				 */
2322 				t_mp->sbm_slice_offset = pfn & sm;
2323 				PR_MEM("%s: %s:"
2324 				    "  proposed mc offset 0x%lx\n",
2325 				    f,
2326 				    t_mp->sbm_cm.sbdev_path,
2327 				    t_mp->sbm_slice_offset);
2328 			}
2329 
2330 			dr_smt_preference[preference]++;
2331 
2332 			/* calculate index to start of preference set */
2333 			idx  = n_units_per_set * preference;
2334 			/* calculate offset to respective element */
2335 			idx += t_bd * MAX_MEM_UNITS_PER_BOARD + t_unit;
2336 
2337 			ASSERT(idx < n_units_per_set * n_sets);
2338 			sets[idx] = t_mp;
2339 		}
2340 	}
2341 
2342 	/*
2343 	 * NOTE: this would be a good place to sort each candidate
2344 	 * set in to some desired order, e.g. memory size in ascending
2345 	 * order.  Without an additional sorting step here, the order
2346 	 * within a set is ascending board number order.
2347 	 */
2348 
2349 	c_mp = NULL;
2350 	x_ml = NULL;
2351 	t_ml = NULL;
2352 	for (idx = 0; idx < n_units_per_set * n_sets; idx++) {
2353 		memquery_t mq;
2354 
2355 		/* cleanup t_ml after previous pass */
2356 		if (t_ml != NULL) {
2357 			memlist_delete(t_ml);
2358 			t_ml = NULL;
2359 		}
2360 
2361 		/* get candidate target board mem unit */
2362 		t_mp = sets[idx];
2363 		if (t_mp == NULL)
2364 			continue;
2365 
2366 		/* get target board memlist */
2367 		t_ml = dr_get_memlist(t_mp);
2368 		if (t_ml == NULL) {
2369 			cmn_err(CE_WARN, "%s: no memlist for"
2370 			    " mem-unit %d, board %d",
2371 			    f,
2372 			    t_mp->sbm_cm.sbdev_bp->b_num,
2373 			    t_mp->sbm_cm.sbdev_unum);
2374 
2375 			continue;
2376 		}
2377 
2378 		/* get appropriate source board memlist */
2379 		t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2380 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2381 			spgcnt_t excess;
2382 
2383 			/*
2384 			 * make a copy of the source board memlist
2385 			 * then edit it to remove the spans that
2386 			 * are outside the calculated span of
2387 			 * [pfn..s_mq.last_nonrelocatable].
2388 			 */
2389 			if (x_ml != NULL)
2390 				memlist_delete(x_ml);
2391 
2392 			x_ml = memlist_dup(s_ml);
2393 			if (x_ml == NULL) {
2394 				PR_MEM("%s: memlist_dup failed\n", f);
2395 				/* TODO: should abort */
2396 				continue;
2397 			}
2398 
2399 			/* trim off lower portion */
2400 			excess = t_mp->sbm_slice_offset -
2401 			    (s_mp->sbm_basepfn & sm);
2402 
2403 			if (excess > 0) {
2404 				x_ml = memlist_del_span(
2405 				    x_ml,
2406 				    _ptob64(s_mp->sbm_basepfn),
2407 				    _ptob64(excess));
2408 			}
2409 			ASSERT(x_ml);
2410 
2411 			/*
2412 			 * Since this candidate target board is smaller
2413 			 * than the source board, s_mq must have been
2414 			 * initialized in previous loop while processing
2415 			 * this or some other candidate board.
2416 			 * FIXME: this is weak.
2417 			 */
2418 			ASSERT(s_mq.phys_pages != 0);
2419 
2420 			/* trim off upper portion */
2421 			excess = (s_mp->sbm_basepfn + s_mp->sbm_npages)
2422 			    - (s_mq.last_nonrelocatable + 1);
2423 			if (excess > 0) {
2424 				pfn_t p;
2425 
2426 				p  = s_mq.last_nonrelocatable + 1;
2427 				x_ml = memlist_del_span(
2428 				    x_ml,
2429 				    _ptob64(p),
2430 				    _ptob64(excess));
2431 			}
2432 
2433 			PR_MEM("%s: %s: edited source memlist:\n",
2434 			    f, s_mp->sbm_cm.sbdev_path);
2435 			PR_MEMLIST_DUMP(x_ml);
2436 
2437 #ifdef DEBUG
2438 			/* sanity check memlist */
2439 			d_ml = x_ml;
2440 			while (d_ml->ml_next != NULL)
2441 				d_ml = d_ml->ml_next;
2442 
2443 			ASSERT(d_ml->ml_address + d_ml->ml_size ==
2444 			    _ptob64(s_mq.last_nonrelocatable + 1));
2445 #endif
2446 
2447 			/*
2448 			 * x_ml now describes only the portion of the
2449 			 * source board that will be moved during the
2450 			 * copy/rename operation.
2451 			 */
2452 			d_ml = x_ml;
2453 		} else {
2454 			/* use original memlist; all spans will be moved */
2455 			d_ml = s_ml;
2456 		}
2457 
2458 		/* verify target can support source memory spans. */
2459 		if (memlist_canfit(d_ml, t_ml) == 0) {
2460 			PR_MEM("%s: source memlist won't"
2461 			    " fit in target memlist\n", f);
2462 			PR_MEM("%s: source memlist:\n", f);
2463 			PR_MEMLIST_DUMP(d_ml);
2464 			PR_MEM("%s: target memlist:\n", f);
2465 			PR_MEMLIST_DUMP(t_ml);
2466 
2467 			continue;
2468 		}
2469 
2470 		/* NOTE: the value of d_ml is not used beyond this point */
2471 
2472 		PR_MEM("%s: checking for no-reloc in %s, "
2473 		    " basepfn=0x%lx, npages=%ld\n",
2474 		    f,
2475 		    t_mp->sbm_cm.sbdev_path,
2476 		    t_mp->sbm_basepfn,
2477 		    t_mp->sbm_npages);
2478 
2479 		rv = kphysm_del_span_query(
2480 		    t_mp->sbm_basepfn, t_mp->sbm_npages, &mq);
2481 		if (rv != KPHYSM_OK) {
2482 			PR_MEM("%s: kphysm_del_span_query:"
2483 			    " unexpected return value %d\n", f, rv);
2484 
2485 			continue;
2486 		}
2487 
2488 		if (mq.nonrelocatable != 0) {
2489 			PR_MEM("%s: candidate %s has"
2490 			    " nonrelocatable span [0x%lx..0x%lx]\n",
2491 			    f,
2492 			    t_mp->sbm_cm.sbdev_path,
2493 			    mq.first_nonrelocatable,
2494 			    mq.last_nonrelocatable);
2495 
2496 			continue;
2497 		}
2498 
2499 #ifdef DEBUG
2500 		/*
2501 		 * This is a debug tool for excluding certain boards
2502 		 * from being selected as a target board candidate.
2503 		 * dr_ignore_board is only tested by this driver.
2504 		 * It must be set with adb, obp, /etc/system or your
2505 		 * favorite debugger.
2506 		 */
2507 		if (dr_ignore_board &
2508 		    (1 << (t_mp->sbm_cm.sbdev_bp->b_num - 1))) {
2509 			PR_MEM("%s: dr_ignore_board flag set,"
2510 			    " ignoring %s as candidate\n",
2511 			    f, t_mp->sbm_cm.sbdev_path);
2512 			continue;
2513 		}
2514 #endif
2515 
2516 		/*
2517 		 * Reserve excess source board memory, if any.
2518 		 *
2519 		 * When the number of pages on the candidate target
2520 		 * board is less than the number of pages on the source,
2521 		 * then some spans (clearly) of the source board's address
2522 		 * space will not be covered by physical memory after the
2523 		 * copy/rename completes.  The following code block
2524 		 * schedules those spans to be deleted.
2525 		 */
2526 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2527 			pfn_t pfn;
2528 			uint64_t s_del_pa;
2529 			struct memlist *ml;
2530 
2531 			d_ml = memlist_dup(s_ml);
2532 			if (d_ml == NULL) {
2533 				PR_MEM("%s: cant dup src brd memlist\n", f);
2534 				/* TODO: should abort */
2535 				continue;
2536 			}
2537 
2538 			/* calculate base pfn relative to target board */
2539 			pfn  = s_mp->sbm_basepfn & ~sm;
2540 			pfn += t_mp->sbm_slice_offset;
2541 
2542 			/*
2543 			 * cannot split dynamically added segment
2544 			 */
2545 			s_del_pa = _ptob64(pfn + t_mp->sbm_npages);
2546 			PR_MEM("%s: proposed src delete pa=0x%lx\n", f,
2547 			    s_del_pa);
2548 			PR_MEM("%s: checking for split of dyn seg list:\n", f);
2549 			PR_MEMLIST_DUMP(s_mp->sbm_dyn_segs);
2550 			for (ml = s_mp->sbm_dyn_segs; ml; ml = ml->ml_next) {
2551 				if (s_del_pa > ml->ml_address &&
2552 				    s_del_pa < ml->ml_address + ml->ml_size) {
2553 					s_del_pa = ml->ml_address;
2554 					break;
2555 				}
2556 			}
2557 
2558 			/* remove span that will reside on candidate board */
2559 			d_ml = memlist_del_span(d_ml, _ptob64(pfn),
2560 			    s_del_pa - _ptob64(pfn));
2561 
2562 			PR_MEM("%s: %s: reserving src brd memlist:\n",
2563 			    f, s_mp->sbm_cm.sbdev_path);
2564 			PR_MEMLIST_DUMP(d_ml);
2565 
2566 			/* reserve excess spans */
2567 			if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, d_ml)
2568 			    != 0) {
2569 
2570 				/* likely more non-reloc pages appeared */
2571 				/* TODO: restart from top? */
2572 				continue;
2573 			}
2574 		} else {
2575 			/* no excess source board memory */
2576 			d_ml = NULL;
2577 		}
2578 
2579 		s_mp->sbm_flags |= DR_MFLAG_RESERVED;
2580 
2581 		/*
2582 		 * reserve all memory on target board.
2583 		 * NOTE: source board's memhandle is used.
2584 		 *
2585 		 * If this succeeds (eq 0), then target selection is
2586 		 * complete and all unwanted memory spans, both source and
2587 		 * target, have been reserved.  Loop is terminated.
2588 		 */
2589 		if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, t_ml) == 0) {
2590 			PR_MEM("%s: %s: target board memory reserved\n",
2591 			    f, t_mp->sbm_cm.sbdev_path);
2592 
2593 			/* a candidate target board is now reserved */
2594 			t_mp->sbm_flags |= DR_MFLAG_RESERVED;
2595 			c_mp = t_mp;
2596 
2597 			/* *** EXITING LOOP *** */
2598 			break;
2599 		}
2600 
2601 		/* did not successfully reserve the target board. */
2602 		PR_MEM("%s: could not reserve target %s\n",
2603 		    f, t_mp->sbm_cm.sbdev_path);
2604 
2605 		/*
2606 		 * NOTE: an undo of the dr_reserve_mem_span work
2607 		 * will happen automatically when the memhandle
2608 		 * (s_mp->sbm_memhandle) is kphysm_del_release'd.
2609 		 */
2610 
2611 		s_mp->sbm_flags &= ~DR_MFLAG_RESERVED;
2612 	}
2613 
2614 	/* clean up after memlist editing logic */
2615 	if (x_ml != NULL)
2616 		memlist_delete(x_ml);
2617 
2618 	FREESTRUCT(sets, dr_mem_unit_t *, n_units_per_set * n_sets);
2619 
2620 	/*
2621 	 * c_mp will be NULL when the entire sets[] array
2622 	 * has been searched without reserving a target board.
2623 	 */
2624 	if (c_mp == NULL) {
2625 		PR_MEM("%s: %s: target selection failed.\n",
2626 		    f, s_mp->sbm_cm.sbdev_path);
2627 
2628 		if (t_ml != NULL)
2629 			memlist_delete(t_ml);
2630 
2631 		return (-1);
2632 	}
2633 
2634 	PR_MEM("%s: found target %s for source %s\n",
2635 	    f,
2636 	    c_mp->sbm_cm.sbdev_path,
2637 	    s_mp->sbm_cm.sbdev_path);
2638 
2639 	s_mp->sbm_peer = c_mp;
2640 	s_mp->sbm_flags |= DR_MFLAG_SOURCE;
2641 	s_mp->sbm_del_mlist = d_ml;	/* spans to be deleted, if any */
2642 	s_mp->sbm_mlist = s_ml;
2643 	s_mp->sbm_cm.sbdev_busy = 1;
2644 
2645 	c_mp->sbm_peer = s_mp;
2646 	c_mp->sbm_flags |= DR_MFLAG_TARGET;
2647 	c_mp->sbm_del_mlist = t_ml;	/* spans to be deleted */
2648 	c_mp->sbm_mlist = t_ml;
2649 	c_mp->sbm_cm.sbdev_busy = 1;
2650 
2651 	s_mp->sbm_flags &= ~DR_MFLAG_MEMRESIZE;
2652 	if (c_mp->sbm_npages > s_mp->sbm_npages) {
2653 		s_mp->sbm_flags |= DR_MFLAG_MEMUPSIZE;
2654 		PR_MEM("%s: upsize detected (source=%ld < target=%ld)\n",
2655 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2656 	} else if (c_mp->sbm_npages < s_mp->sbm_npages) {
2657 		s_mp->sbm_flags |= DR_MFLAG_MEMDOWNSIZE;
2658 		PR_MEM("%s: downsize detected (source=%ld > target=%ld)\n",
2659 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2660 	}
2661 
2662 	return (0);
2663 }
2664 
2665 /*
2666  * Memlist support.
2667  */
2668 
2669 /*
2670  * Determine whether the source memlist (s_mlist) will
2671  * fit into the target memlist (t_mlist) in terms of
2672  * size and holes (i.e. based on same relative base address).
2673  */
2674 static int
2675 memlist_canfit(struct memlist *s_mlist, struct memlist *t_mlist)
2676 {
2677 	int		rv = 0;
2678 	uint64_t	s_basepa, t_basepa;
2679 	struct memlist	*s_ml, *t_ml;
2680 
2681 	if ((s_mlist == NULL) || (t_mlist == NULL))
2682 		return (0);
2683 
2684 	/*
2685 	 * Base both memlists on common base address (0).
2686 	 */
2687 	s_basepa = s_mlist->ml_address;
2688 	t_basepa = t_mlist->ml_address;
2689 
2690 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->ml_next)
2691 		s_ml->ml_address -= s_basepa;
2692 
2693 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->ml_next)
2694 		t_ml->ml_address -= t_basepa;
2695 
2696 	s_ml = s_mlist;
2697 	for (t_ml = t_mlist; t_ml && s_ml; t_ml = t_ml->ml_next) {
2698 		uint64_t	s_start, s_end;
2699 		uint64_t	t_start, t_end;
2700 
2701 		t_start = t_ml->ml_address;
2702 		t_end = t_start + t_ml->ml_size;
2703 
2704 		for (; s_ml; s_ml = s_ml->ml_next) {
2705 			s_start = s_ml->ml_address;
2706 			s_end = s_start + s_ml->ml_size;
2707 
2708 			if ((s_start < t_start) || (s_end > t_end))
2709 				break;
2710 		}
2711 	}
2712 	/*
2713 	 * If we ran out of source memlist chunks that mean
2714 	 * we found a home for all of them.
2715 	 */
2716 	if (s_ml == NULL)
2717 		rv = 1;
2718 
2719 	/*
2720 	 * Need to add base addresses back since memlists
2721 	 * are probably in use by caller.
2722 	 */
2723 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->ml_next)
2724 		s_ml->ml_address += s_basepa;
2725 
2726 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->ml_next)
2727 		t_ml->ml_address += t_basepa;
2728 
2729 	return (rv);
2730 }
2731