xref: /titanic_50/usr/src/uts/sun4u/ngdr/io/dr_mem.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * DR memory support routines.
31  */
32 
33 #include <sys/note.h>
34 #include <sys/debug.h>
35 #include <sys/types.h>
36 #include <sys/errno.h>
37 #include <sys/param.h>
38 #include <sys/dditypes.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/sunndi.h>
44 #include <sys/ddi_impldefs.h>
45 #include <sys/ndi_impldefs.h>
46 #include <sys/sysmacros.h>
47 #include <sys/machsystm.h>
48 #include <sys/spitregs.h>
49 #include <sys/cpuvar.h>
50 #include <sys/promif.h>
51 #include <vm/seg_kmem.h>
52 #include <sys/lgrp.h>
53 #include <sys/platform_module.h>
54 
55 #include <vm/page.h>
56 
57 #include <sys/dr.h>
58 #include <sys/dr_util.h>
59 
60 extern struct memlist	*phys_install;
61 
62 /* TODO: push this reference below drmach line */
63 extern int		kcage_on;
64 
65 /* for the DR*INTERNAL_ERROR macros.  see sys/dr.h. */
66 static char *dr_ie_fmt = "%M% %d";
67 
68 static int		dr_post_detach_mem_unit(dr_mem_unit_t *mp);
69 static int		dr_reserve_mem_spans(memhandle_t *mhp,
70 					struct memlist *mlist);
71 static int		dr_select_mem_target(dr_handle_t *hp,
72 				dr_mem_unit_t *mp, struct memlist *ml);
73 static void		dr_init_mem_unit_data(dr_mem_unit_t *mp);
74 
75 static struct memlist	*memlist_dup(struct memlist *);
76 static int		memlist_canfit(struct memlist *s_mlist,
77 					struct memlist *t_mlist);
78 static struct memlist	*memlist_del_span(struct memlist *mlist,
79 					uint64_t base, uint64_t len);
80 static struct memlist	*memlist_cat_span(struct memlist *mlist,
81 					uint64_t base, uint64_t len);
82 
83 extern void		page_unretire_pages(void);
84 
/*
 * Bits kept in dr_mem_unit_t.sbm_flags.
 */
#define	DR_MFLAG_RESERVED	0x01	/* mem unit reserved for delete */
#define	DR_MFLAG_SOURCE		0x02	/* source brd of copy/rename op */
#define	DR_MFLAG_TARGET		0x04	/* target brd of copy/rename op */
#define	DR_MFLAG_MEMUPSIZE	0x08	/* move from big to small board */
#define	DR_MFLAG_MEMDOWNSIZE	0x10	/* move from small to big board */
/* move to different size board: mask covering both resize bits (0x18) */
#define	DR_MFLAG_MEMRESIZE	\
	(DR_MFLAG_MEMUPSIZE | DR_MFLAG_MEMDOWNSIZE)
#define	DR_MFLAG_RELOWNER	0x20	/* memory release (delete) owner */
#define	DR_MFLAG_RELDONE	0x40	/* memory release (delete) done */

/*
 * Conversion helpers between a page count/frame number and a
 * 64-bit byte count/physical address.
 */
#define	_ptob64(pages) ((uint64_t)(pages) << PAGESHIFT)
#define	_b64top(bytes) ((pgcnt_t)((bytes) >> PAGESHIFT))
100 
101 static struct memlist *
102 dr_get_memlist(dr_mem_unit_t *mp)
103 {
104 	struct memlist	*mlist = NULL;
105 	sbd_error_t	*err;
106 	static fn_t	f = "dr_get_memlist";
107 
108 	PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path);
109 
110 	/*
111 	 * Return cached memlist, if present.
112 	 * This memlist will be present following an
113 	 * unconfigure (a.k.a: detach) of this memunit.
114 	 * It should only be used in the case were a configure
115 	 * is bringing this memunit back in without going
116 	 * through the disconnect and connect states.
117 	 */
118 	if (mp->sbm_mlist) {
119 		PR_MEM("%s: found cached memlist\n", f);
120 
121 		mlist = memlist_dup(mp->sbm_mlist);
122 	} else {
123 		uint64_t basepa = _ptob64(mp->sbm_basepfn);
124 
125 		/* attempt to construct a memlist using phys_install */
126 
127 		/* round down to slice base address */
128 		basepa &= ~(mp->sbm_slice_size - 1);
129 
130 		/* get a copy of phys_install to edit */
131 		memlist_read_lock();
132 		mlist = memlist_dup(phys_install);
133 		memlist_read_unlock();
134 
135 		/* trim lower irrelevant span */
136 		if (mlist)
137 			mlist = memlist_del_span(mlist, 0ull, basepa);
138 
139 		/* trim upper irrelevant span */
140 		if (mlist) {
141 			uint64_t endpa;
142 
143 			basepa += mp->sbm_slice_size;
144 			endpa = _ptob64(physmax + 1);
145 			if (endpa > basepa)
146 				mlist = memlist_del_span(
147 						mlist,
148 						basepa,
149 						endpa - basepa);
150 		}
151 
152 		if (mlist) {
153 			/* successfully built a memlist */
154 			PR_MEM("%s: derived memlist from phys_install\n", f);
155 		}
156 
157 		/* if no mlist yet, try platform layer */
158 		if (!mlist) {
159 			err = drmach_mem_get_memlist(
160 				mp->sbm_cm.sbdev_id, &mlist);
161 			if (err) {
162 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
163 				mlist = NULL; /* paranoia */
164 			}
165 		}
166 	}
167 
168 	PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path);
169 	PR_MEMLIST_DUMP(mlist);
170 
171 	return (mlist);
172 }
173 
174 typedef struct {
175 	kcondvar_t cond;
176 	kmutex_t lock;
177 	int error;
178 	int done;
179 } dr_release_mem_sync_t;
180 
181 /*
182  * Memory has been logically removed by the time this routine is called.
183  */
184 static void
185 dr_mem_del_done(void *arg, int error)
186 {
187 	dr_release_mem_sync_t *ds = arg;
188 
189 	mutex_enter(&ds->lock);
190 	ds->error = error;
191 	ds->done = 1;
192 	cv_signal(&ds->cond);
193 	mutex_exit(&ds->lock);
194 }
195 
196 /*
197  * When we reach here the memory being drained should have
198  * already been reserved in dr_pre_release_mem().
199  * Our only task here is to kick off the "drain" and wait
200  * for it to finish.
201  */
202 void
203 dr_release_mem(dr_common_unit_t *cp)
204 {
205 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
206 	int		err;
207 	dr_release_mem_sync_t rms;
208 	static fn_t	f = "dr_release_mem";
209 
210 	/* check that this memory unit has been reserved */
211 	if (!(mp->sbm_flags & DR_MFLAG_RELOWNER)) {
212 		DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
213 		return;
214 	}
215 
216 	bzero((void *) &rms, sizeof (rms));
217 
218 	mutex_init(&rms.lock, NULL, MUTEX_DRIVER, NULL);
219 	cv_init(&rms.cond, NULL, CV_DRIVER, NULL);
220 
221 	mutex_enter(&rms.lock);
222 	err = kphysm_del_start(mp->sbm_memhandle,
223 			dr_mem_del_done, (void *) &rms);
224 	if (err == KPHYSM_OK) {
225 		/* wait for completion or interrupt */
226 		while (!rms.done) {
227 			if (cv_wait_sig(&rms.cond, &rms.lock) == 0) {
228 				/* then there is a pending UNIX signal */
229 				(void) kphysm_del_cancel(mp->sbm_memhandle);
230 
231 				/* wait for completion */
232 				while (!rms.done)
233 					cv_wait(&rms.cond, &rms.lock);
234 			}
235 		}
236 		/* get the result of the memory delete operation */
237 		err = rms.error;
238 	}
239 	mutex_exit(&rms.lock);
240 
241 	cv_destroy(&rms.cond);
242 	mutex_destroy(&rms.lock);
243 
244 	if (err != KPHYSM_OK) {
245 		int e_code;
246 
247 		switch (err) {
248 			case KPHYSM_ENOWORK:
249 				e_code = ESBD_NOERROR;
250 				break;
251 
252 			case KPHYSM_EHANDLE:
253 			case KPHYSM_ESEQUENCE:
254 				e_code = ESBD_INTERNAL;
255 				break;
256 
257 			case KPHYSM_ENOTVIABLE:
258 				e_code = ESBD_MEM_NOTVIABLE;
259 				break;
260 
261 			case KPHYSM_EREFUSED:
262 				e_code = ESBD_MEM_REFUSED;
263 				break;
264 
265 			case KPHYSM_ENONRELOC:
266 				e_code = ESBD_MEM_NONRELOC;
267 				break;
268 
269 			case KPHYSM_ECANCELLED:
270 				e_code = ESBD_MEM_CANCELLED;
271 				break;
272 
273 			case KPHYSM_ERESOURCE:
274 				e_code = ESBD_MEMFAIL;
275 				break;
276 
277 			default:
278 				cmn_err(CE_WARN,
279 					"%s: unexpected kphysm error code %d,"
280 					" id 0x%p",
281 					f, err, mp->sbm_cm.sbdev_id);
282 
283 				e_code = ESBD_IO;
284 				break;
285 		}
286 
287 		if (e_code != ESBD_NOERROR) {
288 			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
289 		}
290 	}
291 }
292 
293 void
294 dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
295 {
296 	_NOTE(ARGUNUSED(hp))
297 
298 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
299 	struct memlist	*ml, *mc;
300 	sbd_error_t	*err;
301 	static fn_t	f = "dr_attach_mem";
302 
303 	PR_MEM("%s...\n", f);
304 
305 	dr_lock_status(hp->h_bd);
306 	err = drmach_configure(cp->sbdev_id, 0);
307 	dr_unlock_status(hp->h_bd);
308 	if (err) {
309 		DRERR_SET_C(&cp->sbdev_error, &err);
310 		return;
311 	}
312 
313 	ml = dr_get_memlist(mp);
314 	for (mc = ml; mc; mc = mc->next) {
315 		int		 rv;
316 		sbd_error_t	*err;
317 
318 		rv = kphysm_add_memory_dynamic(
319 				(pfn_t)(mc->address >> PAGESHIFT),
320 				(pgcnt_t)(mc->size >> PAGESHIFT));
321 		if (rv != KPHYSM_OK) {
322 			/*
323 			 * translate kphysm error and
324 			 * store in devlist error
325 			 */
326 			switch (rv) {
327 			case KPHYSM_ERESOURCE:
328 				rv = ESBD_NOMEM;
329 				break;
330 
331 			case KPHYSM_EFAULT:
332 				rv = ESBD_FAULT;
333 				break;
334 
335 			default:
336 				rv = ESBD_INTERNAL;
337 				break;
338 			}
339 
340 			if (rv == ESBD_INTERNAL) {
341 				DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
342 			} else
343 				dr_dev_err(CE_WARN, &mp->sbm_cm, rv);
344 			break;
345 		}
346 
347 		err = drmach_mem_add_span(
348 			mp->sbm_cm.sbdev_id, mc->address, mc->size);
349 		if (err) {
350 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
351 			break;
352 		}
353 	}
354 
355 	memlist_delete(ml);
356 
357 	/* back out if configure failed */
358 	if (mp->sbm_cm.sbdev_error != NULL) {
359 		dr_lock_status(hp->h_bd);
360 		err = drmach_unconfigure(cp->sbdev_id, DRMACH_DEVI_REMOVE);
361 		if (err)
362 			sbd_err_clear(&err);
363 		dr_unlock_status(hp->h_bd);
364 	}
365 }
366 
367 #define	DR_SCRUB_VALUE	0x0d0e0a0d0b0e0e0fULL
368 
369 static void
370 dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
371 {
372 #ifdef DEBUG
373 	clock_t		stime = lbolt;
374 #endif /* DEBUG */
375 
376 	struct memlist	*ml;
377 	uint64_t	scrub_value = DR_SCRUB_VALUE;
378 	processorid_t	cpuid;
379 	static fn_t	f = "dr_mem_ecache_scrub";
380 
381 	cpuid = drmach_mem_cpu_affinity(mp->sbm_cm.sbdev_id);
382 	affinity_set(cpuid);
383 
384 	PR_MEM("%s: using proc %d, memlist...\n", f,
385 	    (cpuid == CPU_CURRENT) ? CPU->cpu_id : cpuid);
386 	PR_MEMLIST_DUMP(mlist);
387 
388 	for (ml = mlist; ml; ml = ml->next) {
389 		uint64_t	dst_pa;
390 		uint64_t	nbytes;
391 
392 		/* calculate the destination physical address */
393 		dst_pa = ml->address;
394 		if (ml->address & PAGEOFFSET)
395 			cmn_err(CE_WARN,
396 				"%s: address (0x%llx) not on "
397 				"page boundary", f, ml->address);
398 
399 		nbytes = ml->size;
400 		if (ml->size & PAGEOFFSET)
401 			cmn_err(CE_WARN,
402 				"%s: size (0x%llx) not on "
403 				"page boundary", f, ml->size);
404 
405 		/*LINTED*/
406 		while (nbytes > 0) {
407 			/* write 64 bits to dst_pa */
408 			stdphys(dst_pa, scrub_value);
409 
410 			/* increment/decrement by cacheline sizes */
411 			dst_pa += DRMACH_COHERENCY_UNIT;
412 			nbytes -= DRMACH_COHERENCY_UNIT;
413 		}
414 	}
415 
416 	/*
417 	 * flush this cpu's ecache and take care to ensure
418 	 * that all of it's bus transactions have retired.
419 	 */
420 	drmach_cpu_flush_ecache_sync();
421 
422 	affinity_clear();
423 
424 #ifdef DEBUG
425 	stime = lbolt - stime;
426 	PR_MEM("%s: scrub ticks = %ld (%ld secs)\n", f, stime, stime / hz);
427 #endif /* DEBUG */
428 }
429 
430 /*
431  * This function marks as clean, all the faulty pages that belong to the
432  * board that is copy-renamed since they are not likely to be bad pages
433  * after the rename. This includes the retired pages on the board.
434  */
435 
436 static void
437 dr_memlist_clrpages(struct memlist *r_ml)
438 {
439 	struct memlist	*t_ml;
440 	page_t		*pp, *epp;
441 	pfn_t		pfn, epfn;
442 	struct memseg	*seg;
443 
444 	if (r_ml == NULL)
445 		return;
446 
447 	for (t_ml = r_ml; (t_ml != NULL); t_ml = t_ml->next) {
448 		pfn = _b64top(t_ml->address);
449 		epfn = _b64top(t_ml->address + t_ml->size);
450 
451 		for (seg = memsegs; seg != NULL; seg = seg->next) {
452 			if (pfn >= seg->pages_end || epfn < seg->pages_base)
453 				continue;
454 
455 			pp = seg->pages;
456 			if (pfn > seg->pages_base)
457 				pp += pfn - seg->pages_base;
458 
459 			epp = seg->epages;
460 			if (epfn < seg->pages_end)
461 				epp -= seg->pages_end - epfn;
462 
463 			ASSERT(pp < epp);
464 			while (pp < epp) {
465 				if (page_isfaulty((page_t *)pp))
466 					page_clrtoxic_flag((page_t *)pp,
467 					    PAGE_IS_FAULTY);
468 				pp++;
469 			}
470 		}
471 	}
472 }
473 
474 static int
475 dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
476 {
477 	time_t		 copytime;
478 	drmachid_t	 cr_id;
479 	dr_sr_handle_t	*srhp;
480 	struct memlist	*c_ml, *d_ml, *r_ml;
481 	sbd_error_t	*err;
482 	static fn_t	 f = "dr_move_memory";
483 
484 	PR_MEM("%s: (INLINE) moving memory from %s to %s\n",
485 		f,
486 		s_mp->sbm_cm.sbdev_path,
487 		t_mp->sbm_cm.sbdev_path);
488 
489 	ASSERT(s_mp->sbm_flags & DR_MFLAG_SOURCE);
490 	ASSERT(s_mp->sbm_peer == t_mp);
491 	ASSERT(s_mp->sbm_mlist);
492 
493 	ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
494 	ASSERT(t_mp->sbm_peer == s_mp);
495 
496 	/*
497 	 * create a memlist of spans to copy by removing
498 	 * the spans that have been deleted, if any, from
499 	 * the full source board memlist.  s_mp->sbm_del_mlist
500 	 * will be NULL if there were no spans deleted from
501 	 * the source board.
502 	 */
503 	c_ml = memlist_dup(s_mp->sbm_mlist);
504 	d_ml = s_mp->sbm_del_mlist;
505 	while (d_ml != NULL) {
506 		c_ml = memlist_del_span(c_ml, d_ml->address, d_ml->size);
507 		d_ml = d_ml->next;
508 	}
509 
510 	/*
511 	 * create a copy of the memlist to be used for retiring pages.
512 	 */
513 	r_ml = memlist_dup(c_ml);
514 
515 	affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));
516 
517 	err = drmach_copy_rename_init(
518 		t_mp->sbm_cm.sbdev_id, _ptob64(t_mp->sbm_slice_offset),
519 		s_mp->sbm_cm.sbdev_id, c_ml, &cr_id);
520 	if (err) {
521 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
522 		affinity_clear();
523 		memlist_delete(r_ml);
524 		return (-1);
525 	}
526 
527 	srhp = dr_get_sr_handle(hp);
528 	ASSERT(srhp);
529 
530 	copytime = lbolt;
531 
532 	/* Quiesce the OS.  */
533 	if (dr_suspend(srhp)) {
534 		cmn_err(CE_WARN, "%s: failed to quiesce OS"
535 			" for copy-rename", f);
536 
537 		dr_release_sr_handle(srhp);
538 		err = drmach_copy_rename_fini(cr_id);
539 		if (err) {
540 			/*
541 			 * no error is expected since the program has
542 			 * not yet run.
543 			 */
544 
545 			/* catch this in debug kernels */
546 			ASSERT(0);
547 
548 			sbd_err_clear(&err);
549 		}
550 
551 		/* suspend error reached via hp */
552 		s_mp->sbm_cm.sbdev_error = hp->h_err;
553 		hp->h_err = NULL;
554 
555 		affinity_clear();
556 		memlist_delete(r_ml);
557 		return (-1);
558 	}
559 
560 	/*
561 	 * Rename memory for lgroup.
562 	 * Source and target board numbers are packaged in arg.
563 	 */
564 	{
565 		dr_board_t	*t_bp, *s_bp;
566 
567 		s_bp = s_mp->sbm_cm.sbdev_bp;
568 		t_bp = t_mp->sbm_cm.sbdev_bp;
569 
570 		lgrp_plat_config(LGRP_CONFIG_MEM_RENAME,
571 			(uintptr_t)(s_bp->b_num | (t_bp->b_num << 16)));
572 	}
573 
574 	drmach_copy_rename(cr_id);
575 
576 	/*
577 	 * Clear pages that have been marked as faulty since we are
578 	 * changing the physical memory for the pages.
579 	 */
580 	dr_memlist_clrpages(r_ml);
581 
582 	/* Resume the OS.  */
583 	dr_resume(srhp);
584 
585 	copytime = lbolt - copytime;
586 
587 	dr_release_sr_handle(srhp);
588 	err = drmach_copy_rename_fini(cr_id);
589 	if (err)
590 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
591 
592 	affinity_clear();
593 
594 	PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
595 		f, copytime, copytime / hz);
596 
597 	memlist_delete(r_ml);
598 
599 	/* Unretire any pages cleared after copy-rename */
600 	page_unretire_pages();
601 
602 	/* return -1 if dr_suspend or copy/rename recorded an error */
603 	return (err == NULL ? 0 : -1);
604 }
605 
606 /*
607  * If detaching node contains memory that is "non-permanent"
608  * then the memory adr's are simply cleared.  If the memory
609  * is non-relocatable, then do a copy-rename.
610  */
611 void
612 dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
613 {
614 	int			rv = 0;
615 	dr_mem_unit_t		*s_mp = (dr_mem_unit_t *)cp;
616 	dr_mem_unit_t		*t_mp;
617 	dr_state_t		state;
618 	static fn_t		f = "dr_detach_mem";
619 
620 	PR_MEM("%s...\n", f);
621 
622 	/* lookup target mem unit and target board structure, if any */
623 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
624 		t_mp = s_mp->sbm_peer;
625 		ASSERT(t_mp != NULL);
626 		ASSERT(t_mp->sbm_peer == s_mp);
627 	} else {
628 		t_mp = NULL;
629 	}
630 
631 	/* verify mem unit's state is UNREFERENCED */
632 	state = s_mp->sbm_cm.sbdev_state;
633 	if (state != DR_STATE_UNREFERENCED) {
634 		dr_dev_err(CE_IGNORE, &s_mp->sbm_cm, ESBD_STATE);
635 		return;
636 	}
637 
638 	/* verify target mem unit's state is UNREFERENCED, if any */
639 	if (t_mp != NULL) {
640 		state = t_mp->sbm_cm.sbdev_state;
641 		if (state != DR_STATE_UNREFERENCED) {
642 			dr_dev_err(CE_IGNORE, &t_mp->sbm_cm, ESBD_STATE);
643 			return;
644 		}
645 	}
646 
647 	/*
648 	 * Scrub deleted memory.  This will cause all cachelines
649 	 * referencing the memory to only be in the local cpu's
650 	 * ecache.
651 	 */
652 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
653 		/* no del mlist for src<=dst mem size copy/rename */
654 		if (s_mp->sbm_del_mlist)
655 			dr_mem_ecache_scrub(s_mp, s_mp->sbm_del_mlist);
656 	}
657 	if (t_mp != NULL && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
658 		ASSERT(t_mp->sbm_del_mlist);
659 		dr_mem_ecache_scrub(t_mp, t_mp->sbm_del_mlist);
660 	}
661 
662 	/*
663 	 * If there is no target board (no copy/rename was needed), then
664 	 * we're done!
665 	 */
666 	if (t_mp == NULL) {
667 		sbd_error_t *err;
668 		/*
669 		 * Reprogram interconnect hardware and disable
670 		 * memory controllers for memory node that's going away.
671 		 */
672 
673 		err = drmach_mem_disable(s_mp->sbm_cm.sbdev_id);
674 		if (err) {
675 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
676 			rv = -1;
677 		}
678 	} else {
679 		rv = dr_move_memory(hp, s_mp, t_mp);
680 		PR_MEM("%s: %s memory COPY-RENAME (board %d -> %d)\n",
681 			f,
682 			rv ? "FAILED" : "COMPLETED",
683 			s_mp->sbm_cm.sbdev_bp->b_num,
684 			t_mp->sbm_cm.sbdev_bp->b_num);
685 
686 		if (rv != 0)
687 			(void) dr_cancel_mem(s_mp);
688 	}
689 
690 	if (rv == 0) {
691 		sbd_error_t *err;
692 
693 		dr_lock_status(hp->h_bd);
694 		err = drmach_unconfigure(s_mp->sbm_cm.sbdev_id,
695 		    DRMACH_DEVI_REMOVE);
696 		dr_unlock_status(hp->h_bd);
697 		if (err)
698 			sbd_err_clear(&err);
699 	}
700 }
701 
702 #ifndef _STARFIRE
703 /*
704  * XXX workaround for certain lab configurations (see also starcat drmach.c)
705  * Temporary code to get around observed incorrect results from
706  * kphysm_del_span_query when the queried span contains address spans
707  * not occupied by memory in between spans that do have memory.
708  * This routine acts as a wrapper to kphysm_del_span_query.  It builds
709  * a memlist from phys_install of spans that exist between base and
710  * base + npages, inclusively.  Kphysm_del_span_query is called for each
711  * node in the memlist with the results accumulated in *mp.
712  */
713 static int
714 dr_del_span_query(pfn_t base, pgcnt_t npages, memquery_t *mp)
715 {
716 	uint64_t	 pa = _ptob64(base);
717 	uint64_t	 sm = ~ (137438953472ull - 1);
718 	uint64_t	 sa = pa & sm;
719 	struct memlist	*mlist, *ml;
720 	int		 rv;
721 
722 	npages = npages; /* silence lint */
723 	memlist_read_lock();
724 	mlist = memlist_dup(phys_install);
725 	memlist_read_unlock();
726 
727 again:
728 	for (ml = mlist; ml; ml = ml->next) {
729 		if ((ml->address & sm) != sa) {
730 			mlist = memlist_del_span(mlist, ml->address, ml->size);
731 			goto again;
732 		}
733 	}
734 
735 	mp->phys_pages = 0;
736 	mp->managed = 0;
737 	mp->nonrelocatable = 0;
738 	mp->first_nonrelocatable = (pfn_t)-1;	/* XXX */
739 	mp->last_nonrelocatable = 0;
740 
741 	for (ml = mlist; ml; ml = ml->next) {
742 		memquery_t mq;
743 
744 		rv = kphysm_del_span_query(
745 			_b64top(ml->address), _b64top(ml->size), &mq);
746 		if (rv)
747 			break;
748 
749 		mp->phys_pages += mq.phys_pages;
750 		mp->managed += mq.managed;
751 		mp->nonrelocatable += mq.nonrelocatable;
752 
753 		if (mq.nonrelocatable != 0) {
754 			if (mq.first_nonrelocatable < mp->first_nonrelocatable)
755 				mp->first_nonrelocatable =
756 					mq.first_nonrelocatable;
757 			if (mq.last_nonrelocatable > mp->last_nonrelocatable)
758 				mp->last_nonrelocatable =
759 					mq.last_nonrelocatable;
760 		}
761 	}
762 
763 	if (mp->nonrelocatable == 0)
764 		mp->first_nonrelocatable = 0;	/* XXX */
765 
766 	memlist_delete(mlist);
767 	return (rv);
768 }
769 
770 #define	kphysm_del_span_query dr_del_span_query
771 #endif /* _STARFIRE */
772 
773 /*
774  * NOTE: This routine is only partially smart about multiple
775  *	 mem-units.  Need to make mem-status structure smart
776  *	 about them also.
777  */
778 int
779 dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp)
780 {
781 	int		m, mix;
782 	memdelstat_t	mdst;
783 	memquery_t	mq;
784 	dr_board_t	*bp;
785 	dr_mem_unit_t	*mp;
786 	sbd_mem_stat_t	*msp;
787 	static fn_t	f = "dr_mem_status";
788 
789 	bp = hp->h_bd;
790 	devset &= DR_DEVS_PRESENT(bp);
791 
792 	for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) {
793 		int		rv;
794 		sbd_error_t	*err;
795 		drmach_status_t	 pstat;
796 		dr_mem_unit_t	*p_mp;
797 
798 		if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0)
799 			continue;
800 
801 		mp = dr_get_mem_unit(bp, m);
802 
803 		if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) {
804 			/* present, but not fully initialized */
805 			continue;
806 		}
807 
808 		if (mp->sbm_cm.sbdev_id == (drmachid_t)0)
809 			continue;
810 
811 		/* fetch platform status */
812 		err = drmach_status(mp->sbm_cm.sbdev_id, &pstat);
813 		if (err) {
814 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
815 			continue;
816 		}
817 
818 		msp = &dsp->d_mem;
819 		bzero((caddr_t)msp, sizeof (*msp));
820 
821 		strncpy(msp->ms_cm.c_id.c_name, pstat.type,
822 			sizeof (msp->ms_cm.c_id.c_name));
823 		msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type;
824 		msp->ms_cm.c_id.c_unit = SBD_NULL_UNIT;
825 		msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond;
826 		msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy;
827 		msp->ms_cm.c_time = mp->sbm_cm.sbdev_time;
828 		msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate;
829 
830 		msp->ms_totpages = mp->sbm_npages;
831 		msp->ms_basepfn = mp->sbm_basepfn;
832 		msp->ms_pageslost = mp->sbm_pageslost;
833 		msp->ms_cage_enabled = kcage_on;
834 
835 		if (mp->sbm_flags & DR_MFLAG_RESERVED)
836 			p_mp = mp->sbm_peer;
837 		else
838 			p_mp = NULL;
839 
840 		if (p_mp == NULL) {
841 			msp->ms_peer_is_target = 0;
842 			msp->ms_peer_ap_id[0] = '\0';
843 		} else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) {
844 			char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
845 			char *minor;
846 
847 			/*
848 			 * b_dip doesn't have to be held for ddi_pathname()
849 			 * because the board struct (dr_board_t) will be
850 			 * destroyed before b_dip detaches.
851 			 */
852 			(void) ddi_pathname(bp->b_dip, path);
853 			minor = strchr(p_mp->sbm_cm.sbdev_path, ':');
854 
855 			snprintf(msp->ms_peer_ap_id,
856 			    sizeof (msp->ms_peer_ap_id), "%s%s",
857 			    path, (minor == NULL) ? "" : minor);
858 
859 			kmem_free(path, MAXPATHLEN);
860 
861 			if (p_mp->sbm_flags & DR_MFLAG_TARGET)
862 				msp->ms_peer_is_target = 1;
863 		}
864 
865 		if (mp->sbm_flags & DR_MFLAG_RELOWNER)
866 			rv = kphysm_del_status(mp->sbm_memhandle, &mdst);
867 		else
868 			rv = KPHYSM_EHANDLE;	/* force 'if' to fail */
869 
870 		if (rv == KPHYSM_OK) {
871 			/*
872 			 * Any pages above managed is "free",
873 			 * i.e. it's collected.
874 			 */
875 			msp->ms_detpages += (uint_t)(mdst.collected +
876 			    mdst.phys_pages - mdst.managed);
877 		} else {
878 			/*
879 			 * If we're UNREFERENCED or UNCONFIGURED,
880 			 * then the number of detached pages is
881 			 * however many pages are on the board.
882 			 * I.e. detached = not in use by OS.
883 			 */
884 			switch (msp->ms_cm.c_ostate) {
885 			/*
886 			 * changed to use cfgadm states
887 			 *
888 			 * was:
889 			 *	case DR_STATE_UNREFERENCED:
890 			 *	case DR_STATE_UNCONFIGURED:
891 			 */
892 			case SBD_STAT_UNCONFIGURED:
893 				msp->ms_detpages = msp->ms_totpages;
894 				break;
895 
896 			default:
897 				break;
898 			}
899 		}
900 
901 		/*
902 		 * kphysm_del_span_query can report non-reloc pages = total
903 		 * pages for memory that is not yet configured
904 		 */
905 		if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) {
906 
907 			rv = kphysm_del_span_query(mp->sbm_basepfn,
908 			    mp->sbm_npages, &mq);
909 
910 			if (rv == KPHYSM_OK) {
911 				msp->ms_managed_pages = mq.managed;
912 				msp->ms_noreloc_pages = mq.nonrelocatable;
913 				msp->ms_noreloc_first =
914 				    mq.first_nonrelocatable;
915 				msp->ms_noreloc_last =
916 				    mq.last_nonrelocatable;
917 				msp->ms_cm.c_sflags = 0;
918 				if (mq.nonrelocatable) {
919 					SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE,
920 					    msp->ms_cm.c_sflags);
921 				}
922 			} else {
923 				PR_MEM("%s: kphysm_del_span_query() = %d\n",
924 				    f, rv);
925 			}
926 		}
927 
928 		/*
929 		 * Check source unit state during copy-rename
930 		 */
931 		if ((mp->sbm_flags & DR_MFLAG_SOURCE) &&
932 		    (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED ||
933 		    mp->sbm_cm.sbdev_state == DR_STATE_RELEASE))
934 			msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED;
935 
936 		mix++;
937 		dsp++;
938 	}
939 
940 	return (mix);
941 }
942 
943 int
944 dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
945 {
946 	_NOTE(ARGUNUSED(hp))
947 
948 	int		err_flag = 0;
949 	int		d;
950 	sbd_error_t	*err;
951 	static fn_t	f = "dr_pre_attach_mem";
952 
953 	PR_MEM("%s...\n", f);
954 
955 	for (d = 0; d < devnum; d++) {
956 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
957 		dr_state_t	state;
958 
959 		cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path);
960 
961 		state = mp->sbm_cm.sbdev_state;
962 		switch (state) {
963 		case DR_STATE_UNCONFIGURED:
964 			PR_MEM("%s: recovering from UNCONFIG for %s\n",
965 				f,
966 				mp->sbm_cm.sbdev_path);
967 
968 			/* use memlist cached by dr_post_detach_mem_unit */
969 			ASSERT(mp->sbm_mlist != NULL);
970 			PR_MEM("%s: re-configuring cached memlist for %s:\n",
971 				f, mp->sbm_cm.sbdev_path);
972 			PR_MEMLIST_DUMP(mp->sbm_mlist);
973 
974 			/* kphysm del handle should be have been freed */
975 			ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
976 
977 			/*FALLTHROUGH*/
978 
979 		case DR_STATE_CONNECTED:
980 			PR_MEM("%s: reprogramming mem hardware on %s\n",
981 				f, mp->sbm_cm.sbdev_bp->b_path);
982 
983 			PR_MEM("%s: enabling %s\n",
984 				f, mp->sbm_cm.sbdev_path);
985 
986 			err = drmach_mem_enable(mp->sbm_cm.sbdev_id);
987 			if (err) {
988 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
989 				err_flag = 1;
990 			}
991 			break;
992 
993 		default:
994 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE);
995 			err_flag = 1;
996 			break;
997 		}
998 
999 		/* exit for loop if error encountered */
1000 		if (err_flag)
1001 			break;
1002 	}
1003 
1004 	return (err_flag ? -1 : 0);
1005 }
1006 
1007 int
1008 dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1009 {
1010 	_NOTE(ARGUNUSED(hp))
1011 
1012 	int		d;
1013 	static fn_t	f = "dr_post_attach_mem";
1014 
1015 	PR_MEM("%s...\n", f);
1016 
1017 	for (d = 0; d < devnum; d++) {
1018 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1019 		struct memlist	*mlist, *ml;
1020 
1021 		mlist = dr_get_memlist(mp);
1022 		if (mlist == NULL) {
1023 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_MEMFAIL);
1024 			continue;
1025 		}
1026 
1027 		/*
1028 		 * Verify the memory really did successfully attach
1029 		 * by checking for its existence in phys_install.
1030 		 */
1031 		memlist_read_lock();
1032 		if (memlist_intersect(phys_install, mlist) == 0) {
1033 			memlist_read_unlock();
1034 
1035 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1036 
1037 			PR_MEM("%s: %s memlist not in phys_install",
1038 				f, mp->sbm_cm.sbdev_path);
1039 
1040 			memlist_delete(mlist);
1041 			continue;
1042 		}
1043 		memlist_read_unlock();
1044 
1045 		for (ml = mlist; ml != NULL; ml = ml->next) {
1046 			sbd_error_t *err;
1047 
1048 			err = drmach_mem_add_span(
1049 				mp->sbm_cm.sbdev_id,
1050 				ml->address,
1051 				ml->size);
1052 			if (err)
1053 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1054 		}
1055 
1056 		memlist_delete(mlist);
1057 
1058 		/*
1059 		 * Destroy cached memlist, if any.
1060 		 * There will be a cached memlist in sbm_mlist if
1061 		 * this board is being configured directly after
1062 		 * an unconfigure.
1063 		 * To support this transition, dr_post_detach_mem
1064 		 * left a copy of the last known memlist in sbm_mlist.
1065 		 * This memlist could differ from any derived from
1066 		 * hardware if while this memunit was last configured
1067 		 * the system detected and deleted bad pages from
1068 		 * phys_install.  The location of those bad pages
1069 		 * will be reflected in the cached memlist.
1070 		 */
1071 		if (mp->sbm_mlist) {
1072 			memlist_delete(mp->sbm_mlist);
1073 			mp->sbm_mlist = NULL;
1074 		}
1075 
1076 /*
1077  * TODO: why is this call to dr_init_mem_unit_data here?
1078  * this has been done at discovery or connect time, so this is
1079  * probably redundant and unnecessary.
1080  */
1081 		dr_init_mem_unit_data(mp);
1082 	}
1083 
1084 	return (0);
1085 }
1086 
1087 int
1088 dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1089 {
1090 	_NOTE(ARGUNUSED(hp))
1091 
1092 	int d;
1093 
1094 	for (d = 0; d < devnum; d++) {
1095 		dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d];
1096 
1097 		cmn_err(CE_CONT, "OS unconfigure %s", mp->sbm_cm.sbdev_path);
1098 	}
1099 
1100 	return (0);
1101 }
1102 
1103 
1104 int
1105 dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1106 {
1107 	_NOTE(ARGUNUSED(hp))
1108 
1109 	int		d, rv;
1110 	static fn_t	f = "dr_post_detach_mem";
1111 
1112 	PR_MEM("%s...\n", f);
1113 
1114 	rv = 0;
1115 	for (d = 0; d < devnum; d++) {
1116 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1117 
1118 		ASSERT(mp->sbm_cm.sbdev_bp == hp->h_bd);
1119 
1120 		if (dr_post_detach_mem_unit(mp))
1121 			rv = -1;
1122 	}
1123 
1124 	return (rv);
1125 }
1126 
1127 static void
1128 dr_add_memory_spans(dr_mem_unit_t *mp, struct memlist *ml)
1129 {
1130 	static fn_t	f = "dr_add_memory_spans";
1131 
1132 	PR_MEM("%s...", f);
1133 	PR_MEMLIST_DUMP(ml);
1134 
1135 #ifdef DEBUG
1136 	memlist_read_lock();
1137 	if (memlist_intersect(phys_install, ml)) {
1138 		PR_MEM("%s:WARNING: memlist intersects with phys_install\n", f);
1139 	}
1140 	memlist_read_unlock();
1141 #endif
1142 
1143 	for (; ml; ml = ml->next) {
1144 		pfn_t		 base;
1145 		pgcnt_t		 npgs;
1146 		int		 rv;
1147 		sbd_error_t	*err;
1148 
1149 		base = _b64top(ml->address);
1150 		npgs = _b64top(ml->size);
1151 
1152 		rv = kphysm_add_memory_dynamic(base, npgs);
1153 
1154 		err = drmach_mem_add_span(
1155 			mp->sbm_cm.sbdev_id,
1156 			ml->address,
1157 			ml->size);
1158 
1159 		if (err)
1160 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1161 
1162 		if (rv != KPHYSM_OK) {
1163 			cmn_err(CE_WARN, "%s:"
1164 				" unexpected kphysm_add_memory_dynamic"
1165 				" return value %d;"
1166 				" basepfn=0x%lx, npages=%ld\n",
1167 				f, rv, base, npgs);
1168 
1169 			continue;
1170 		}
1171 	}
1172 }
1173 
1174 static int
1175 dr_post_detach_mem_unit(dr_mem_unit_t *s_mp)
1176 {
1177 	uint64_t	sz = s_mp->sbm_slice_size;
1178 	uint64_t	sm = sz - 1;
1179 	/* old and new below refer to PAs before and after copy-rename */
1180 	uint64_t	s_old_basepa, s_new_basepa;
1181 	uint64_t	t_old_basepa, t_new_basepa;
1182 	uint64_t	t_new_smallsize = 0;
1183 	dr_mem_unit_t	*t_mp, *x_mp;
1184 	struct memlist	*ml;
1185 	int		rv;
1186 	sbd_error_t	*err;
1187 	static fn_t	f = "dr_post_detach_mem_unit";
1188 
1189 	PR_MEM("%s...\n", f);
1190 
1191 	/* s_mp->sbm_del_mlist could be NULL, meaning no deleted spans */
1192 	PR_MEM("%s: %s: deleted memlist (EMPTY maybe okay):\n",
1193 		f, s_mp->sbm_cm.sbdev_path);
1194 	PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1195 
1196 	/* sanity check */
1197 	ASSERT(s_mp->sbm_del_mlist == NULL ||
1198 		(s_mp->sbm_flags & DR_MFLAG_RELDONE) != 0);
1199 
1200 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1201 		t_mp = s_mp->sbm_peer;
1202 		ASSERT(t_mp != NULL);
1203 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1204 		ASSERT(t_mp->sbm_peer == s_mp);
1205 
1206 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RELDONE);
1207 		ASSERT(t_mp->sbm_del_mlist);
1208 
1209 		PR_MEM("%s: target %s: deleted memlist:\n",
1210 			f, t_mp->sbm_cm.sbdev_path);
1211 		PR_MEMLIST_DUMP(t_mp->sbm_del_mlist);
1212 	} else {
1213 		/* this is no target unit */
1214 		t_mp = NULL;
1215 	}
1216 
1217 	/*
1218 	 * Verify the memory really did successfully detach
1219 	 * by checking for its non-existence in phys_install.
1220 	 */
1221 	rv = 0;
1222 	memlist_read_lock();
1223 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
1224 		x_mp = s_mp;
1225 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1226 	}
1227 	if (rv == 0 && t_mp && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
1228 		x_mp = t_mp;
1229 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1230 	}
1231 	memlist_read_unlock();
1232 
1233 	if (rv) {
1234 		/* error: memlist still in phys_install */
1235 		DR_DEV_INTERNAL_ERROR(&x_mp->sbm_cm);
1236 	}
1237 
1238 	/*
1239 	 * clean mem unit state and bail out if an error has been recorded.
1240 	 */
1241 	rv = 0;
1242 	if (s_mp->sbm_cm.sbdev_error) {
1243 		PR_MEM("%s: %s flags=%x", f,
1244 			s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1245 		DR_DEV_CLR_UNREFERENCED(&s_mp->sbm_cm);
1246 		DR_DEV_CLR_RELEASED(&s_mp->sbm_cm);
1247 		dr_device_transition(&s_mp->sbm_cm, DR_STATE_CONFIGURED);
1248 		rv = -1;
1249 	}
1250 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error != NULL) {
1251 		PR_MEM("%s: %s flags=%x", f,
1252 			s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1253 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1254 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1255 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1256 		rv = -1;
1257 	}
1258 	if (rv)
1259 		goto cleanup;
1260 
1261 	s_old_basepa = _ptob64(s_mp->sbm_basepfn);
1262 	err = drmach_mem_get_base_physaddr(s_mp->sbm_cm.sbdev_id,
1263 	    &s_new_basepa);
1264 	ASSERT(err == NULL);
1265 
1266 	PR_MEM("%s:s_old_basepa: 0x%llx\n", f, s_old_basepa);
1267 	PR_MEM("%s:s_new_basepa: 0x%llx\n", f, s_new_basepa);
1268 
1269 	if (t_mp != NULL) {
1270 		struct memlist *s_copy_mlist;
1271 
1272 		t_old_basepa	= _ptob64(t_mp->sbm_basepfn);
1273 		err = drmach_mem_get_base_physaddr(t_mp->sbm_cm.sbdev_id,
1274 		    &t_new_basepa);
1275 		ASSERT(err == NULL);
1276 
1277 		PR_MEM("%s:t_old_basepa: 0x%llx\n", f, t_old_basepa);
1278 		PR_MEM("%s:t_new_basepa: 0x%llx\n", f, t_new_basepa);
1279 
1280 		/*
1281 		 * Construct copy list with original source addresses.
1282 		 * Used to add back excess target mem.
1283 		 */
1284 		s_copy_mlist = memlist_dup(s_mp->sbm_mlist);
1285 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1286 			s_copy_mlist = memlist_del_span(s_copy_mlist,
1287 			    ml->address, ml->size);
1288 		}
1289 
1290 		PR_MEM("%s: source copy list:\n:", f);
1291 		PR_MEMLIST_DUMP(s_copy_mlist);
1292 
1293 		/*
1294 		 * We had to swap mem-units, so update
1295 		 * memlists accordingly with new base
1296 		 * addresses.
1297 		 */
1298 		for (ml = t_mp->sbm_mlist; ml; ml = ml->next) {
1299 			ml->address -= t_old_basepa;
1300 			ml->address += t_new_basepa;
1301 		}
1302 
1303 		/*
1304 		 * There is no need to explicitly rename the target delete
1305 		 * memlist, because sbm_del_mlist and sbm_mlist always
1306 		 * point to the same memlist for a copy/rename operation.
1307 		 */
1308 		ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1309 
1310 		PR_MEM("%s: renamed target memlist and delete memlist:\n", f);
1311 		PR_MEMLIST_DUMP(t_mp->sbm_mlist);
1312 
1313 		for (ml = s_mp->sbm_mlist; ml; ml = ml->next) {
1314 			ml->address -= s_old_basepa;
1315 			ml->address += s_new_basepa;
1316 		}
1317 
1318 		PR_MEM("%s: renamed source memlist:\n", f);
1319 		PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1320 
1321 		/*
1322 		 * Keep track of dynamically added segments
1323 		 * since they cannot be split if we need to delete
1324 		 * excess source memory later for this board.
1325 		 */
1326 		if (t_mp->sbm_dyn_segs)
1327 			memlist_delete(t_mp->sbm_dyn_segs);
1328 		t_mp->sbm_dyn_segs = s_mp->sbm_dyn_segs;
1329 		s_mp->sbm_dyn_segs = NULL;
1330 
1331 		/*
1332 		 * If the target memory range with the new target base PA
1333 		 * extends beyond the usable slice, prevent any "target excess"
1334 		 * from being added back after this copy/rename and
1335 		 * calculate the new smaller size of the target board
1336 		 * to be set as part of target cleanup. The base + npages
1337 		 * must only include the range of memory up to the end of
1338 		 * this slice. This will only be used after a category 4
1339 		 * large-to-small target type copy/rename - see comments
1340 		 * in dr_select_mem_target.
1341 		 */
1342 		if (((t_new_basepa & sm) + _ptob64(t_mp->sbm_npages)) > sz) {
1343 			t_new_smallsize = sz - (t_new_basepa & sm);
1344 		}
1345 
1346 		if (s_mp->sbm_flags & DR_MFLAG_MEMRESIZE &&
1347 		    t_new_smallsize == 0) {
1348 			struct memlist	*t_excess_mlist;
1349 
1350 			/*
1351 			 * Add back excess target memory.
1352 			 * Subtract out the portion of the target memory
1353 			 * node that was taken over by the source memory
1354 			 * node.
1355 			 */
1356 			t_excess_mlist = memlist_dup(t_mp->sbm_mlist);
1357 			for (ml = s_copy_mlist; ml; ml = ml->next) {
1358 				t_excess_mlist =
1359 				    memlist_del_span(t_excess_mlist,
1360 				    ml->address, ml->size);
1361 			}
1362 
1363 			/*
1364 			 * Update dynamically added segs
1365 			 */
1366 			for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1367 				t_mp->sbm_dyn_segs =
1368 				    memlist_del_span(t_mp->sbm_dyn_segs,
1369 				    ml->address, ml->size);
1370 			}
1371 			for (ml = t_excess_mlist; ml; ml = ml->next) {
1372 				t_mp->sbm_dyn_segs =
1373 				    memlist_cat_span(t_mp->sbm_dyn_segs,
1374 				    ml->address, ml->size);
1375 			}
1376 			PR_MEM("%s: %s: updated dynamic seg list:\n",
1377 			    f, t_mp->sbm_cm.sbdev_path);
1378 			PR_MEMLIST_DUMP(t_mp->sbm_dyn_segs);
1379 
1380 			PR_MEM("%s: adding back remaining portion"
1381 				" of %s, memlist:\n",
1382 				f, t_mp->sbm_cm.sbdev_path);
1383 			PR_MEMLIST_DUMP(t_excess_mlist);
1384 
1385 			dr_add_memory_spans(s_mp, t_excess_mlist);
1386 			memlist_delete(t_excess_mlist);
1387 		}
1388 		memlist_delete(s_copy_mlist);
1389 
1390 #ifdef DEBUG
1391 		/*
1392 		 * Renaming s_mp->sbm_del_mlist is not necessary.  This
1393 		 * list is not used beyond this point, and in fact, is
1394 		 * disposed of at the end of this function.
1395 		 */
1396 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1397 			ml->address -= s_old_basepa;
1398 			ml->address += s_new_basepa;
1399 		}
1400 
1401 		PR_MEM("%s: renamed source delete memlist", f);
1402 		PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1403 #endif
1404 
1405 	}
1406 
1407 	if (t_mp != NULL) {
1408 		/* delete target's entire address space */
1409 		err = drmach_mem_del_span(
1410 			t_mp->sbm_cm.sbdev_id, t_old_basepa & ~ sm, sz);
1411 		if (err)
1412 			DRERR_SET_C(&t_mp->sbm_cm.sbdev_error, &err);
1413 		ASSERT(err == NULL);
1414 
1415 		/*
1416 		 * After the copy/rename, the original address space
1417 		 * for the source board (which is now located on the
1418 		 * target board) may now have some excess to be deleted.
1419 		 * The amount is calculated by masking the slice
1420 		 * info and keeping the slice offset from t_new_basepa.
1421 		 */
1422 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1423 				s_old_basepa & ~ sm, t_new_basepa & sm);
1424 		if (err)
1425 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1426 		ASSERT(err == NULL);
1427 
1428 	} else {
1429 		/* delete board's entire address space */
1430 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1431 						s_old_basepa & ~ sm, sz);
1432 		if (err)
1433 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1434 		ASSERT(err == NULL);
1435 	}
1436 
1437 cleanup:
1438 	/* clean up target mem unit */
1439 	if (t_mp != NULL) {
1440 		memlist_delete(t_mp->sbm_del_mlist);
1441 		/* no need to delete sbm_mlist, it shares sbm_del_mlist */
1442 
1443 		t_mp->sbm_del_mlist = NULL;
1444 		t_mp->sbm_mlist = NULL;
1445 		t_mp->sbm_peer = NULL;
1446 		t_mp->sbm_flags = 0;
1447 		t_mp->sbm_cm.sbdev_busy = 0;
1448 		dr_init_mem_unit_data(t_mp);
1449 
1450 		/* reduce target size if new PAs go past end of usable slice */
1451 		if (t_new_smallsize > 0) {
1452 			t_mp->sbm_npages = _b64top(t_new_smallsize);
1453 			PR_MEM("%s: target new size 0x%llx bytes\n",
1454 				f, t_new_smallsize);
1455 		}
1456 	}
1457 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error == NULL) {
1458 		/*
1459 		 * now that copy/rename has completed, undo this
1460 		 * work that was done in dr_release_mem_done.
1461 		 */
1462 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1463 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1464 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1465 	}
1466 
1467 	/*
1468 	 * clean up (source) board's mem unit structure.
1469 	 * NOTE: sbm_mlist is retained if no error has been record (in other
1470 	 * words, when s_mp->sbm_cm.sbdev_error is NULL). This memlist is
1471 	 * referred to elsewhere as the cached memlist.  The cached memlist
1472 	 * is used to re-attach (configure back in) this memunit from the
1473 	 * unconfigured state.  The memlist is retained because it may
1474 	 * represent bad pages that were detected while the memory was
1475 	 * configured into the OS.  The OS deletes bad pages from phys_install.
1476 	 * Those deletes, if any, will be represented in the cached mlist.
1477 	 */
1478 	if (s_mp->sbm_del_mlist && s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1479 		memlist_delete(s_mp->sbm_del_mlist);
1480 
1481 	if (s_mp->sbm_cm.sbdev_error && s_mp->sbm_mlist) {
1482 		memlist_delete(s_mp->sbm_mlist);
1483 		s_mp->sbm_mlist = NULL;
1484 	}
1485 
1486 	if (s_mp->sbm_dyn_segs != NULL && s_mp->sbm_cm.sbdev_error == 0) {
1487 		memlist_delete(s_mp->sbm_dyn_segs);
1488 		s_mp->sbm_dyn_segs = NULL;
1489 	}
1490 
1491 	s_mp->sbm_del_mlist = NULL;
1492 	s_mp->sbm_peer = NULL;
1493 	s_mp->sbm_flags = 0;
1494 	s_mp->sbm_cm.sbdev_busy = 0;
1495 	dr_init_mem_unit_data(s_mp);
1496 
1497 	PR_MEM("%s: cached memlist for %s:", f, s_mp->sbm_cm.sbdev_path);
1498 	PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1499 
1500 	return (0);
1501 }
1502 
1503 /*
1504  * Successful return from this function will have the memory
1505  * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated
1506  * and waiting.  This routine's job is to select the memory that
1507  * actually has to be released (detached) which may not necessarily
1508  * be the same memory node that came in in devlist[],
1509  * i.e. a copy-rename is needed.
1510  */
1511 int
1512 dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1513 {
1514 	int		d;
1515 	int		err_flag = 0;
1516 	static fn_t	f = "dr_pre_release_mem";
1517 
1518 	PR_MEM("%s...\n", f);
1519 
1520 	for (d = 0; d < devnum; d++) {
1521 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1522 		int		rv;
1523 		memquery_t	mq;
1524 		struct memlist	*ml;
1525 
1526 		if (mp->sbm_cm.sbdev_error) {
1527 			err_flag = 1;
1528 			continue;
1529 		} else if (!kcage_on) {
1530 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_KCAGE_OFF);
1531 			err_flag = 1;
1532 			continue;
1533 		}
1534 
1535 		if (mp->sbm_flags & DR_MFLAG_RESERVED) {
1536 			/*
1537 			 * Board is currently involved in a delete
1538 			 * memory operation. Can't detach this guy until
1539 			 * that operation completes.
1540 			 */
1541 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_INVAL);
1542 			err_flag = 1;
1543 			break;
1544 		}
1545 
1546 		/*
1547 		 * Check whether the detaching memory requires a
1548 		 * copy-rename.
1549 		 */
1550 		ASSERT(mp->sbm_npages != 0);
1551 		rv = kphysm_del_span_query(
1552 			mp->sbm_basepfn, mp->sbm_npages, &mq);
1553 		if (rv != KPHYSM_OK) {
1554 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1555 			err_flag = 1;
1556 			break;
1557 		}
1558 
1559 		if (mq.nonrelocatable != 0) {
1560 			if (!(dr_cmd_flags(hp) &
1561 				(SBD_FLAG_FORCE | SBD_FLAG_QUIESCE_OKAY))) {
1562 				/* caller wasn't prompted for a suspend */
1563 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1564 					ESBD_QUIESCE_REQD);
1565 				err_flag = 1;
1566 				break;
1567 			}
1568 		}
1569 
1570 		/* flags should be clean at this time */
1571 		ASSERT(mp->sbm_flags == 0);
1572 
1573 		ASSERT(mp->sbm_mlist == NULL);		/* should be null */
1574 		ASSERT(mp->sbm_del_mlist == NULL);	/* should be null */
1575 		if (mp->sbm_mlist != NULL) {
1576 			memlist_delete(mp->sbm_mlist);
1577 			mp->sbm_mlist = NULL;
1578 		}
1579 
1580 		ml = dr_get_memlist(mp);
1581 		if (ml == NULL) {
1582 			err_flag = 1;
1583 			PR_MEM("%s: no memlist found for %s\n",
1584 				f, mp->sbm_cm.sbdev_path);
1585 			continue;
1586 		}
1587 
1588 		/* allocate a kphysm handle */
1589 		rv = kphysm_del_gethandle(&mp->sbm_memhandle);
1590 		if (rv != KPHYSM_OK) {
1591 			memlist_delete(ml);
1592 
1593 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1594 			err_flag = 1;
1595 			break;
1596 		}
1597 		mp->sbm_flags |= DR_MFLAG_RELOWNER;
1598 
1599 		if ((mq.nonrelocatable != 0) ||
1600 			dr_reserve_mem_spans(&mp->sbm_memhandle, ml)) {
1601 			/*
1602 			 * Either the detaching memory node contains
1603 			 * non-reloc memory or we failed to reserve the
1604 			 * detaching memory node (which did _not_ have
1605 			 * any non-reloc memory, i.e. some non-reloc mem
1606 			 * got onboard).
1607 			 */
1608 
1609 			if (dr_select_mem_target(hp, mp, ml)) {
1610 				int rv;
1611 
1612 				/*
1613 				 * We had no luck locating a target
1614 				 * memory node to be the recipient of
1615 				 * the non-reloc memory on the node
1616 				 * we're trying to detach.
1617 				 * Clean up be disposing the mem handle
1618 				 * and the mem list.
1619 				 */
1620 				rv = kphysm_del_release(mp->sbm_memhandle);
1621 				if (rv != KPHYSM_OK) {
1622 					/*
1623 					 * can do nothing but complain
1624 					 * and hope helpful for debug
1625 					 */
1626 					cmn_err(CE_WARN, "%s: unexpected"
1627 						" kphysm_del_release return"
1628 						" value %d",
1629 						f, rv);
1630 				}
1631 				mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1632 
1633 				memlist_delete(ml);
1634 
1635 				/* make sure sbm_flags is clean */
1636 				ASSERT(mp->sbm_flags == 0);
1637 
1638 				dr_dev_err(CE_WARN,
1639 					&mp->sbm_cm, ESBD_NO_TARGET);
1640 
1641 				err_flag = 1;
1642 				break;
1643 			}
1644 
1645 			/*
1646 			 * ml is not memlist_delete'd here because
1647 			 * it has been assigned to mp->sbm_mlist
1648 			 * by dr_select_mem_target.
1649 			 */
1650 		} else {
1651 			/* no target needed to detach this board */
1652 			mp->sbm_flags |= DR_MFLAG_RESERVED;
1653 			mp->sbm_peer = NULL;
1654 			mp->sbm_del_mlist = ml;
1655 			mp->sbm_mlist = ml;
1656 			mp->sbm_cm.sbdev_busy = 1;
1657 		}
1658 #ifdef DEBUG
1659 		ASSERT(mp->sbm_mlist != NULL);
1660 
1661 		if (mp->sbm_flags & DR_MFLAG_SOURCE) {
1662 			PR_MEM("%s: release of %s requires copy/rename;"
1663 				" selected target board %s\n",
1664 				f,
1665 				mp->sbm_cm.sbdev_path,
1666 				mp->sbm_peer->sbm_cm.sbdev_path);
1667 		} else {
1668 			PR_MEM("%s: copy/rename not required to release %s\n",
1669 				f, mp->sbm_cm.sbdev_path);
1670 		}
1671 
1672 		ASSERT(mp->sbm_flags & DR_MFLAG_RELOWNER);
1673 		ASSERT(mp->sbm_flags & DR_MFLAG_RESERVED);
1674 #endif
1675 	}
1676 
1677 	return (err_flag ? -1 : 0);
1678 }
1679 
1680 void
1681 dr_release_mem_done(dr_common_unit_t *cp)
1682 {
1683 	dr_mem_unit_t	*s_mp = (dr_mem_unit_t *)cp;
1684 	dr_mem_unit_t *t_mp, *mp;
1685 	int		rv;
1686 	static fn_t	f = "dr_release_mem_done";
1687 
1688 	/*
1689 	 * This unit will be flagged with DR_MFLAG_SOURCE, if it
1690 	 * has a target unit.
1691 	 */
1692 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1693 		t_mp = s_mp->sbm_peer;
1694 		ASSERT(t_mp != NULL);
1695 		ASSERT(t_mp->sbm_peer == s_mp);
1696 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1697 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RESERVED);
1698 	} else {
1699 		/* this is no target unit */
1700 		t_mp = NULL;
1701 	}
1702 
1703 	/* free delete handle */
1704 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RELOWNER);
1705 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RESERVED);
1706 	rv = kphysm_del_release(s_mp->sbm_memhandle);
1707 	if (rv != KPHYSM_OK) {
1708 		/*
1709 		 * can do nothing but complain
1710 		 * and hope helpful for debug
1711 		 */
1712 		cmn_err(CE_WARN, "%s: unexpected kphysm_del_release"
1713 			" return value %d", f, rv);
1714 	}
1715 	s_mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1716 
1717 	/*
1718 	 * If an error was encountered during release, clean up
1719 	 * the source (and target, if present) unit data.
1720 	 */
1721 /* XXX Can we know that sbdev_error was encountered during release? */
1722 	if (s_mp->sbm_cm.sbdev_error != NULL) {
1723 		PR_MEM("%s: %s: error %d noted\n",
1724 			f,
1725 			s_mp->sbm_cm.sbdev_path,
1726 			s_mp->sbm_cm.sbdev_error->e_code);
1727 
1728 		if (t_mp != NULL) {
1729 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1730 			t_mp->sbm_del_mlist = NULL;
1731 
1732 			if (t_mp->sbm_mlist != NULL) {
1733 				memlist_delete(t_mp->sbm_mlist);
1734 				t_mp->sbm_mlist = NULL;
1735 			}
1736 
1737 			t_mp->sbm_peer = NULL;
1738 			t_mp->sbm_flags = 0;
1739 			t_mp->sbm_cm.sbdev_busy = 0;
1740 		}
1741 
1742 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1743 			memlist_delete(s_mp->sbm_del_mlist);
1744 		s_mp->sbm_del_mlist = NULL;
1745 
1746 		if (s_mp->sbm_mlist != NULL) {
1747 			memlist_delete(s_mp->sbm_mlist);
1748 			s_mp->sbm_mlist = NULL;
1749 		}
1750 
1751 		s_mp->sbm_peer = NULL;
1752 		s_mp->sbm_flags = 0;
1753 		s_mp->sbm_cm.sbdev_busy = 0;
1754 
1755 		/* bail out */
1756 		return;
1757 	}
1758 
1759 	DR_DEV_SET_RELEASED(&s_mp->sbm_cm);
1760 	dr_device_transition(&s_mp->sbm_cm, DR_STATE_RELEASE);
1761 
1762 	if (t_mp != NULL) {
1763 		/*
1764 		 * the kphysm delete operation that drained the source
1765 		 * board also drained this target board.  Since the source
1766 		 * board drain is now known to have succeeded, we know this
1767 		 * target board is drained too.
1768 		 *
1769 		 * because DR_DEV_SET_RELEASED and dr_device_transition
1770 		 * is done here, the dr_release_dev_done should not
1771 		 * fail.
1772 		 */
1773 		DR_DEV_SET_RELEASED(&t_mp->sbm_cm);
1774 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_RELEASE);
1775 
1776 		/*
1777 		 * NOTE: do not transition target's board state,
1778 		 * even if the mem-unit was the last configure
1779 		 * unit of the board.  When copy/rename completes
1780 		 * this mem-unit will transitioned back to
1781 		 * the configured state.  In the meantime, the
1782 		 * board's must remain as is.
1783 		 */
1784 	}
1785 
1786 	/* if board(s) had deleted memory, verify it is gone */
1787 	rv = 0;
1788 	memlist_read_lock();
1789 	if (s_mp->sbm_del_mlist != NULL) {
1790 		mp = s_mp;
1791 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1792 	}
1793 	if (rv == 0 && t_mp && t_mp->sbm_del_mlist != NULL) {
1794 		mp = t_mp;
1795 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1796 	}
1797 	memlist_read_unlock();
1798 	if (rv) {
1799 		cmn_err(CE_WARN, "%s: %smem-unit (%d.%d): "
1800 			"deleted memory still found in phys_install",
1801 			f,
1802 			(mp == t_mp ? "target " : ""),
1803 			mp->sbm_cm.sbdev_bp->b_num,
1804 			mp->sbm_cm.sbdev_unum);
1805 
1806 		DR_DEV_INTERNAL_ERROR(&s_mp->sbm_cm);
1807 		return;
1808 	}
1809 
1810 	s_mp->sbm_flags |= DR_MFLAG_RELDONE;
1811 	if (t_mp != NULL)
1812 		t_mp->sbm_flags |= DR_MFLAG_RELDONE;
1813 
1814 	/* this should not fail */
1815 	if (dr_release_dev_done(&s_mp->sbm_cm) != 0) {
1816 		/* catch this in debug kernels */
1817 		ASSERT(0);
1818 		return;
1819 	}
1820 
1821 	PR_MEM("%s: marking %s release DONE\n",
1822 		f, s_mp->sbm_cm.sbdev_path);
1823 
1824 	s_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1825 
1826 	if (t_mp != NULL) {
1827 		/* should not fail */
1828 		rv = dr_release_dev_done(&t_mp->sbm_cm);
1829 		if (rv != 0) {
1830 			/* catch this in debug kernels */
1831 			ASSERT(0);
1832 			return;
1833 		}
1834 
1835 		PR_MEM("%s: marking %s release DONE\n",
1836 			f, t_mp->sbm_cm.sbdev_path);
1837 
1838 		t_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1839 	}
1840 }
1841 
1842 /*ARGSUSED*/
1843 int
1844 dr_disconnect_mem(dr_mem_unit_t *mp)
1845 {
1846 	static fn_t	f = "dr_disconnect_mem";
1847 	update_membounds_t umb;
1848 
1849 #ifdef DEBUG
1850 	int state = mp->sbm_cm.sbdev_state;
1851 	ASSERT(state == DR_STATE_CONNECTED ||
1852 		state == DR_STATE_UNCONFIGURED);
1853 #endif
1854 
1855 	PR_MEM("%s...\n", f);
1856 
1857 	if (mp->sbm_del_mlist && mp->sbm_del_mlist != mp->sbm_mlist)
1858 		memlist_delete(mp->sbm_del_mlist);
1859 	mp->sbm_del_mlist = NULL;
1860 
1861 	if (mp->sbm_mlist) {
1862 		memlist_delete(mp->sbm_mlist);
1863 		mp->sbm_mlist = NULL;
1864 	}
1865 
1866 	/*
1867 	 * Remove memory from lgroup
1868 	 * For now, only board info is required.
1869 	 */
1870 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1871 	umb.u_base = (uint64_t)-1;
1872 	umb.u_len = (uint64_t)-1;
1873 
1874 	lgrp_plat_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)&umb);
1875 
1876 	return (0);
1877 }
1878 
1879 int
1880 dr_cancel_mem(dr_mem_unit_t *s_mp)
1881 {
1882 	dr_mem_unit_t	*t_mp;
1883 	dr_state_t	state;
1884 	static fn_t	f = "dr_cancel_mem";
1885 
1886 	state = s_mp->sbm_cm.sbdev_state;
1887 
1888 	if (s_mp->sbm_flags & DR_MFLAG_TARGET) {
1889 		/* must cancel source board, not target board */
1890 		/* TODO: set error */
1891 		return (-1);
1892 	} else if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1893 		t_mp = s_mp->sbm_peer;
1894 		ASSERT(t_mp != NULL);
1895 		ASSERT(t_mp->sbm_peer == s_mp);
1896 
1897 		/* must always match the source board's state */
1898 /* TODO: is this assertion correct? */
1899 		ASSERT(t_mp->sbm_cm.sbdev_state == state);
1900 	} else {
1901 		/* this is no target unit */
1902 		t_mp = NULL;
1903 	}
1904 
1905 	switch (state) {
1906 	case DR_STATE_UNREFERENCED:	/* state set by dr_release_dev_done */
1907 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1908 
1909 		if (t_mp != NULL && t_mp->sbm_del_mlist != NULL) {
1910 			PR_MEM("%s: undoing target %s memory delete\n",
1911 				f, t_mp->sbm_cm.sbdev_path);
1912 			dr_add_memory_spans(t_mp, t_mp->sbm_del_mlist);
1913 
1914 			DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1915 		}
1916 
1917 		if (s_mp->sbm_del_mlist != NULL) {
1918 			PR_MEM("%s: undoing %s memory delete\n",
1919 				f, s_mp->sbm_cm.sbdev_path);
1920 
1921 			dr_add_memory_spans(s_mp, s_mp->sbm_del_mlist);
1922 		}
1923 
1924 		/*FALLTHROUGH*/
1925 
1926 /* TODO: should no longer be possible to see the release state here */
1927 	case DR_STATE_RELEASE:	/* state set by dr_release_mem_done */
1928 
1929 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1930 
1931 		if (t_mp != NULL) {
1932 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1933 			t_mp->sbm_del_mlist = NULL;
1934 
1935 			if (t_mp->sbm_mlist != NULL) {
1936 				memlist_delete(t_mp->sbm_mlist);
1937 				t_mp->sbm_mlist = NULL;
1938 			}
1939 
1940 			t_mp->sbm_peer = NULL;
1941 			t_mp->sbm_flags = 0;
1942 			t_mp->sbm_cm.sbdev_busy = 0;
1943 			dr_init_mem_unit_data(t_mp);
1944 
1945 			DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1946 
1947 			dr_device_transition(
1948 				&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1949 		}
1950 
1951 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1952 			memlist_delete(s_mp->sbm_del_mlist);
1953 		s_mp->sbm_del_mlist = NULL;
1954 
1955 		if (s_mp->sbm_mlist != NULL) {
1956 			memlist_delete(s_mp->sbm_mlist);
1957 			s_mp->sbm_mlist = NULL;
1958 		}
1959 
1960 		s_mp->sbm_peer = NULL;
1961 		s_mp->sbm_flags = 0;
1962 		s_mp->sbm_cm.sbdev_busy = 0;
1963 		dr_init_mem_unit_data(s_mp);
1964 
1965 		return (0);
1966 
1967 	default:
1968 		PR_MEM("%s: WARNING unexpected state (%d) for %s\n",
1969 			f, (int)state, s_mp->sbm_cm.sbdev_path);
1970 
1971 		return (-1);
1972 	}
1973 	/*NOTREACHED*/
1974 }
1975 
1976 void
1977 dr_init_mem_unit(dr_mem_unit_t *mp)
1978 {
1979 	dr_state_t	new_state;
1980 
1981 
1982 	if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) {
1983 		new_state = DR_STATE_CONFIGURED;
1984 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1985 	} else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) {
1986 		new_state = DR_STATE_CONNECTED;
1987 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1988 	} else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) {
1989 		new_state = DR_STATE_OCCUPIED;
1990 	} else {
1991 		new_state = DR_STATE_EMPTY;
1992 	}
1993 
1994 	if (DR_DEV_IS_PRESENT(&mp->sbm_cm))
1995 		dr_init_mem_unit_data(mp);
1996 
1997 	/* delay transition until fully initialized */
1998 	dr_device_transition(&mp->sbm_cm, new_state);
1999 }
2000 
2001 static void
2002 dr_init_mem_unit_data(dr_mem_unit_t *mp)
2003 {
2004 	drmachid_t	id = mp->sbm_cm.sbdev_id;
2005 	uint64_t	bytes;
2006 	sbd_error_t	*err;
2007 	static fn_t	f = "dr_init_mem_unit_data";
2008 	update_membounds_t umb;
2009 
2010 	PR_MEM("%s...\n", f);
2011 
2012 	/* a little sanity checking */
2013 	ASSERT(mp->sbm_peer == NULL);
2014 	ASSERT(mp->sbm_flags == 0);
2015 
2016 	/* get basepfn of mem unit */
2017 	err = drmach_mem_get_base_physaddr(id, &bytes);
2018 	if (err) {
2019 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
2020 		mp->sbm_basepfn = (pfn_t)-1;
2021 	} else
2022 		mp->sbm_basepfn = _b64top(bytes);
2023 
2024 	/* attempt to get number of pages from PDA */
2025 	err = drmach_mem_get_size(id, &bytes);
2026 	if (err) {
2027 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
2028 		mp->sbm_npages = 0;
2029 	} else
2030 		mp->sbm_npages = _b64top(bytes);
2031 
2032 	/* if didn't work, calculate using memlist */
2033 	if (mp->sbm_npages == 0) {
2034 		struct memlist	*ml, *mlist;
2035 		/*
2036 		 * Either we couldn't open the PDA or our
2037 		 * PDA has garbage in it.  We must have the
2038 		 * page count consistent and whatever the
2039 		 * OS states has precedence over the PDA
2040 		 * so let's check the kernel.
2041 		 */
2042 /* TODO: curious comment. it suggests pda query should happen if this fails */
2043 		PR_MEM("%s: PDA query failed for npages."
2044 			" Checking memlist for %s\n",
2045 			f, mp->sbm_cm.sbdev_path);
2046 
2047 		mlist = dr_get_memlist(mp);
2048 		for (ml = mlist; ml; ml = ml->next)
2049 			mp->sbm_npages += btop(ml->size);
2050 		memlist_delete(mlist);
2051 	}
2052 
2053 	err = drmach_mem_get_alignment(id, &bytes);
2054 	if (err) {
2055 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
2056 		mp->sbm_alignment_mask = 0;
2057 	} else
2058 		mp->sbm_alignment_mask = _b64top(bytes);
2059 
2060 	err = drmach_mem_get_slice_size(id, &bytes);
2061 	if (err) {
2062 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
2063 		mp->sbm_slice_size = 0; /* paranoia */
2064 	} else
2065 		mp->sbm_slice_size = bytes;
2066 
2067 	/*
2068 	 * Add memory to lgroup
2069 	 */
2070 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
2071 	umb.u_base = (uint64_t)mp->sbm_basepfn << MMU_PAGESHIFT;
2072 	umb.u_len = (uint64_t)mp->sbm_npages << MMU_PAGESHIFT;
2073 
2074 	lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb);
2075 
2076 	PR_MEM("%s: %s (basepfn = 0x%x, npgs = %d)\n",
2077 		f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages);
2078 }
2079 
2080 static int
2081 dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *ml)
2082 {
2083 	int		err;
2084 	pfn_t		base;
2085 	pgcnt_t		npgs;
2086 	struct memlist	*mc;
2087 	static fn_t	f = "dr_reserve_mem_spans";
2088 
2089 	PR_MEM("%s...\n", f);
2090 
2091 	/*
2092 	 * Walk the supplied memlist scheduling each span for removal
2093 	 * with kphysm_del_span.  It is possible that a span may intersect
2094 	 * an area occupied by the cage.
2095 	 */
2096 	for (mc = ml; mc != NULL; mc = mc->next) {
2097 		base = _b64top(mc->address);
2098 		npgs = _b64top(mc->size);
2099 
2100 		err = kphysm_del_span(*mhp, base, npgs);
2101 		if (err != KPHYSM_OK) {
2102 			cmn_err(CE_WARN, "%s memory reserve failed."
2103 				" unexpected kphysm_del_span return value %d;"
2104 				" basepfn=0x%lx npages=%ld",
2105 				f, err, base, npgs);
2106 
2107 			return (-1);
2108 		}
2109 	}
2110 
2111 	return (0);
2112 }
2113 
2114 /* debug counters; presumably maintained by dr_select_mem_target -- TODO confirm */
2115 int dr_smt_realigned;
2116 int dr_smt_preference[4];	/* NOTE(review): 4 looks like one slot per target preference set -- confirm */
2117 
2118 #ifdef DEBUG
2119 uint_t dr_ignore_board; /* if bit[bnum-1] set, board won't be candidate */
2120 #endif
2121 
2122 /*
2123  * Find and reserve a copy/rename target board suitable for the
2124  * given source board.
2125  * All boards in the system are examined and categorized in relation to
2126  * their memory size versus the source board's memory size.  Order of
2127  * preference is:
2128  *	1st: board has same memory size
2129  * 	2nd: board has larger memory size
2130  *	3rd: board has smaller memory size
2131  *	4th: board has smaller memory size, available memory will be reduced.
2132  * Boards in category 3 and 4 will have their MC's reprogrammed so that the
2133  * address span to which the MC responds appropriately covers the
2134  * nonrelocatable span of the source board.
2135  */
2136 static int
2137 dr_select_mem_target(dr_handle_t *hp,
2138 	dr_mem_unit_t *s_mp, struct memlist *s_ml)
2139 {
2140 	pgcnt_t		sz = _b64top(s_mp->sbm_slice_size);
2141 	pgcnt_t		sm = sz - 1; /* mem_slice_mask */
2142 	pfn_t		s_phi, t_phi;
2143 
2144 	int		n_sets = 4; /* same, larger, smaller, clipped */
2145 	int		preference; /* lower value is higher preference */
2146 	int		n_units_per_set;
2147 	int		idx;
2148 	dr_mem_unit_t	**sets;
2149 
2150 	int		t_bd;
2151 	int		t_unit;
2152 	int		rv;
2153 	int		allow_src_memrange_modify;
2154 	int		allow_targ_memrange_modify;
2155 	drmachid_t	t_id;
2156 	dr_board_t	*s_bp, *t_bp;
2157 	dr_mem_unit_t	*t_mp, *c_mp;
2158 	struct memlist	*d_ml, *t_ml, *x_ml;
2159 	memquery_t	s_mq = {0};
2160 	static fn_t	f = "dr_select_mem_target";
2161 
2162 	PR_MEM("%s...\n", f);
2163 
2164 	ASSERT(s_ml != NULL);
2165 
2166 	n_units_per_set = MAX_BOARDS * MAX_MEM_UNITS_PER_BOARD;
2167 	sets = GETSTRUCT(dr_mem_unit_t *, n_units_per_set * n_sets);
2168 
2169 	s_bp = hp->h_bd;
2170 	/* calculate the offset into the slice of the last source board pfn */
2171 	ASSERT(s_mp->sbm_npages != 0);
2172 	s_phi = (s_mp->sbm_basepfn + s_mp->sbm_npages - 1) & sm;
2173 
2174 	allow_src_memrange_modify = drmach_allow_memrange_modify(s_bp->b_id);
2175 
2176 	/*
2177 	 * Make one pass through all memory units on all boards
2178 	 * and categorize them with respect to the source board.
2179 	 */
2180 	for (t_bd = 0; t_bd < MAX_BOARDS; t_bd++) {
2181 		/*
2182 		 * The board structs are a contiguous array
2183 		 * so we take advantage of that to find the
2184 		 * correct board struct pointer for a given
2185 		 * board number.
2186 		 */
2187 		t_bp = dr_lookup_board(t_bd);
2188 
2189 		/* source board can not be its own target */
2190 		if (s_bp->b_num == t_bp->b_num)
2191 			continue;
2192 
2193 		for (t_unit = 0; t_unit < MAX_MEM_UNITS_PER_BOARD; t_unit++) {
2194 
2195 			t_mp = dr_get_mem_unit(t_bp, t_unit);
2196 
2197 			/* this memory node must be attached */
2198 			if (!DR_DEV_IS_ATTACHED(&t_mp->sbm_cm))
2199 				continue;
2200 
2201 			/* source unit can not be its own target */
2202 			if (s_mp == t_mp) {
2203 				/* catch this is debug kernels */
2204 				ASSERT(0);
2205 				continue;
2206 			}
2207 
2208 			/*
2209 			 * this memory node must not already be reserved
2210 			 * by some other memory delete operation.
2211 			 */
2212 			if (t_mp->sbm_flags & DR_MFLAG_RESERVED)
2213 				continue;
2214 
2215 			/*
2216 			 * categorize the memory node
2217 			 * If this is a smaller memory node, create a
2218 			 * temporary, edited copy of the source board's
2219 			 * memlist containing only the span of the non-
2220 			 * relocatable pages.
2221 			 */
2222 			t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2223 			t_id = t_mp->sbm_cm.sbdev_bp->b_id;
2224 			allow_targ_memrange_modify =
2225 			    drmach_allow_memrange_modify(t_id);
2226 			if (t_mp->sbm_npages == s_mp->sbm_npages &&
2227 			    t_phi == s_phi) {
2228 				preference = 0;
2229 				t_mp->sbm_slice_offset = 0;
2230 			} else if (t_mp->sbm_npages > s_mp->sbm_npages &&
2231 			    t_phi > s_phi) {
2232 				/*
2233 				 * Selecting this target will require modifying
2234 				 * the source and/or target physical address
2235 				 * ranges.  Skip if not supported by platform.
2236 				 */
2237 				if (!allow_src_memrange_modify ||
2238 				    !allow_targ_memrange_modify) {
2239 					PR_MEM("%s: skip target %s, memory "
2240 					    "range relocation not supported "
2241 					    "by platform\n", f,
2242 					    t_mp->sbm_cm.sbdev_path);
2243 					continue;
2244 				}
2245 				preference = 1;
2246 				t_mp->sbm_slice_offset = 0;
2247 			} else {
2248 				pfn_t		pfn = 0;
2249 
2250 				/*
2251 				 * Selecting this target will require modifying
2252 				 * the source and/or target physical address
2253 				 * ranges.  Skip if not supported by platform.
2254 				 */
2255 				if (!allow_src_memrange_modify ||
2256 				    !allow_targ_memrange_modify) {
2257 					PR_MEM("%s: skip target %s, memory "
2258 					    "range relocation not supported "
2259 					    "by platform\n", f,
2260 					    t_mp->sbm_cm.sbdev_path);
2261 					continue;
2262 				}
2263 
2264 				/*
2265 				 * Check if its mc can be programmed to relocate
2266 				 * the active address range to match the
2267 				 * nonrelocatable span of the source board.
2268 				 */
2269 				preference = 2;
2270 
2271 				if (s_mq.phys_pages == 0) {
2272 					/*
2273 					 * find non-relocatable span on
2274 					 * source board.
2275 					 */
2276 					rv = kphysm_del_span_query(
2277 						s_mp->sbm_basepfn,
2278 						s_mp->sbm_npages, &s_mq);
2279 					if (rv != KPHYSM_OK) {
2280 						PR_MEM("%s: %s: unexpected"
2281 						" kphysm_del_span_query"
2282 						" return value %d;"
2283 						" basepfn 0x%lx, npages %ld\n",
2284 						f,
2285 						s_mp->sbm_cm.sbdev_path,
2286 						rv,
2287 						s_mp->sbm_basepfn,
2288 						s_mp->sbm_npages);
2289 
2290 						/* paranoia */
2291 						s_mq.phys_pages = 0;
2292 
2293 						continue;
2294 					}
2295 
2296 					/* more paranoia */
2297 					ASSERT(s_mq.phys_pages != 0);
2298 					ASSERT(s_mq.nonrelocatable != 0);
2299 
2300 					/*
2301 					 * this should not happen
2302 					 * if it does, it simply means that
2303 					 * we can not proceed with qualifying
2304 					 * this target candidate.
2305 					 */
2306 					if (s_mq.nonrelocatable == 0)
2307 						continue;
2308 
2309 					PR_MEM("%s: %s: nonrelocatable"
2310 						" span (0x%lx..0x%lx)\n",
2311 						f,
2312 						s_mp->sbm_cm.sbdev_path,
2313 						s_mq.first_nonrelocatable,
2314 						s_mq.last_nonrelocatable);
2315 				}
2316 
2317 				/*
2318 				 * Round down the starting pfn of the
2319 				 * nonrelocatable span on the source board
2320 				 * to nearest programmable boundary possible
2321 				 * with this target candidate.
2322 				 */
2323 				pfn = s_mq.first_nonrelocatable &
2324 					~t_mp->sbm_alignment_mask;
2325 
2326 				/* skip candidate if memory is too small */
2327 				if (pfn + t_mp->sbm_npages <
2328 					s_mq.last_nonrelocatable)
2329 					continue;
2330 
2331 				/*
2332 				 * reprogramming an mc to relocate its
2333 				 * active address range means the beginning
2334 				 * address to which the DIMMS respond will
2335 				 * be somewhere above the slice boundary
2336 				 * address.  The larger the size of memory
2337 				 * on this unit, the more likely part of it
2338 				 * will exist beyond the end of the slice.
2339 				 * The portion of the memory that does is
2340 				 * unavailable to the system until the mc
2341 				 * reprogrammed to a more favorable base
2342 				 * address.
2343 				 * An attempt is made to avoid the loss by
2344 				 * recalculating the mc base address relative
2345 				 * to the end of the slice.  This may produce
2346 				 * a more favorable result.  If not, we lower
2347 				 * the board's preference rating so that it
2348 				 * is one the last candidate boards to be
2349 				 * considered.
2350 				 */
2351 				if ((pfn + t_mp->sbm_npages) & ~sm) {
2352 					pfn_t p;
2353 
2354 					ASSERT(sz >= t_mp->sbm_npages);
2355 
2356 					/*
2357 					 * calculate an alternative starting
2358 					 * address relative to the end of the
2359 					 * slice's address space.
2360 					 */
2361 					p = pfn & ~sm;
2362 					p = p + (sz - t_mp->sbm_npages);
2363 					p = p & ~t_mp->sbm_alignment_mask;
2364 
2365 					if ((p > s_mq.first_nonrelocatable) ||
2366 						(p + t_mp->sbm_npages <
2367 						s_mq.last_nonrelocatable)) {
2368 
2369 						/*
2370 						 * alternative starting addr
2371 						 * won't work. Lower preference
2372 						 * rating of this board, since
2373 						 * some number of pages will
2374 						 * unavailable for use.
2375 						 */
2376 						preference = 3;
2377 					} else {
2378 						dr_smt_realigned++;
2379 						pfn = p;
2380 					}
2381 				}
2382 
2383 				/*
2384 				 * translate calculated pfn to an offset
2385 				 * relative to the slice boundary.  If the
2386 				 * candidate board is selected, this offset
2387 				 * will be used to calculate the values
2388 				 * programmed into the mc.
2389 				 */
2390 				t_mp->sbm_slice_offset = pfn & sm;
2391 				PR_MEM("%s: %s:"
2392 					"  proposed mc offset 0x%lx\n",
2393 					f,
2394 					t_mp->sbm_cm.sbdev_path,
2395 					t_mp->sbm_slice_offset);
2396 			}
2397 
2398 			dr_smt_preference[preference]++;
2399 
2400 			/* calculate index to start of preference set */
2401 			idx  = n_units_per_set * preference;
2402 			/* calculate offset to respective element */
2403 			idx += t_bd * MAX_MEM_UNITS_PER_BOARD + t_unit;
2404 
2405 			ASSERT(idx < n_units_per_set * n_sets);
2406 			sets[idx] = t_mp;
2407 		}
2408 	}
2409 
2410 	/*
2411 	 * NOTE: this would be a good place to sort each candidate
2412 	 * set in to some desired order, e.g. memory size in ascending
2413 	 * order.  Without an additional sorting step here, the order
2414 	 * within a set is ascending board number order.
2415 	 */
2416 
2417 	c_mp = NULL;
2418 	x_ml = NULL;
2419 	t_ml = NULL;
2420 	for (idx = 0; idx < n_units_per_set * n_sets; idx++) {
2421 		memquery_t mq;
2422 
2423 		/* cleanup t_ml after previous pass */
2424 		if (t_ml != NULL) {
2425 			memlist_delete(t_ml);
2426 			t_ml = NULL;
2427 		}
2428 
2429 		/* get candidate target board mem unit */
2430 		t_mp = sets[idx];
2431 		if (t_mp == NULL)
2432 			continue;
2433 
2434 		/* get target board memlist */
2435 		t_ml = dr_get_memlist(t_mp);
2436 		if (t_ml == NULL) {
2437 			cmn_err(CE_WARN, "%s: no memlist for"
2438 				" mem-unit %d, board %d",
2439 				f,
2440 				t_mp->sbm_cm.sbdev_bp->b_num,
2441 				t_mp->sbm_cm.sbdev_unum);
2442 
2443 			continue;
2444 		}
2445 
2446 		/* get appropriate source board memlist */
2447 		t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2448 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2449 			spgcnt_t excess;
2450 
2451 			/*
2452 			 * make a copy of the source board memlist
2453 			 * then edit it to remove the spans that
2454 			 * are outside the calculated span of
2455 			 * [pfn..s_mq.last_nonrelocatable].
2456 			 */
2457 			if (x_ml != NULL)
2458 				memlist_delete(x_ml);
2459 
2460 			x_ml = memlist_dup(s_ml);
2461 			if (x_ml == NULL) {
2462 				PR_MEM("%s: memlist_dup failed\n", f);
2463 				/* TODO: should abort */
2464 				continue;
2465 			}
2466 
2467 			/* trim off lower portion */
2468 			excess = t_mp->sbm_slice_offset -
2469 			    (s_mp->sbm_basepfn & sm);
2470 
2471 			if (excess > 0) {
2472 				x_ml = memlist_del_span(
2473 					x_ml,
2474 					_ptob64(s_mp->sbm_basepfn),
2475 					_ptob64(excess));
2476 			}
2477 			ASSERT(x_ml);
2478 
2479 			/*
2480 			 * Since this candidate target board is smaller
2481 			 * than the source board, s_mq must have been
2482 			 * initialized in previous loop while processing
2483 			 * this or some other candidate board.
2484 			 * FIXME: this is weak.
2485 			 */
2486 			ASSERT(s_mq.phys_pages != 0);
2487 
2488 			/* trim off upper portion */
2489 			excess = (s_mp->sbm_basepfn + s_mp->sbm_npages)
2490 				- (s_mq.last_nonrelocatable + 1);
2491 			if (excess > 0) {
2492 				pfn_t p;
2493 
2494 				p  = s_mq.last_nonrelocatable + 1;
2495 				x_ml = memlist_del_span(
2496 					x_ml,
2497 					_ptob64(p),
2498 					_ptob64(excess));
2499 			}
2500 
2501 			PR_MEM("%s: %s: edited source memlist:\n",
2502 				f, s_mp->sbm_cm.sbdev_path);
2503 			PR_MEMLIST_DUMP(x_ml);
2504 
2505 #ifdef DEBUG
2506 			/* sanity check memlist */
2507 			d_ml = x_ml;
2508 			while (d_ml->next != NULL)
2509 				d_ml = d_ml->next;
2510 
2511 			ASSERT(d_ml->address + d_ml->size ==
2512 				_ptob64(s_mq.last_nonrelocatable + 1));
2513 #endif
2514 
2515 			/*
2516 			 * x_ml now describes only the portion of the
2517 			 * source board that will be moved during the
2518 			 * copy/rename operation.
2519 			 */
2520 			d_ml = x_ml;
2521 		} else {
2522 			/* use original memlist; all spans will be moved */
2523 			d_ml = s_ml;
2524 		}
2525 
2526 		/* verify target can support source memory spans. */
2527 		if (memlist_canfit(d_ml, t_ml) == 0) {
2528 			PR_MEM("%s: source memlist won't"
2529 				" fit in target memlist\n", f);
2530 			PR_MEM("%s: source memlist:\n", f);
2531 			PR_MEMLIST_DUMP(d_ml);
2532 			PR_MEM("%s: target memlist:\n", f);
2533 			PR_MEMLIST_DUMP(t_ml);
2534 
2535 			continue;
2536 		}
2537 
2538 		/* NOTE: the value of d_ml is not used beyond this point */
2539 
2540 		PR_MEM("%s: checking for no-reloc in %s, "
2541 			" basepfn=0x%lx, npages=%ld\n",
2542 			f,
2543 			t_mp->sbm_cm.sbdev_path,
2544 			t_mp->sbm_basepfn,
2545 			t_mp->sbm_npages);
2546 
2547 		rv = kphysm_del_span_query(
2548 			t_mp->sbm_basepfn, t_mp->sbm_npages, &mq);
2549 		if (rv != KPHYSM_OK) {
2550 			PR_MEM("%s: kphysm_del_span_query:"
2551 				" unexpected return value %d\n", f, rv);
2552 
2553 			continue;
2554 		}
2555 
2556 		if (mq.nonrelocatable != 0) {
2557 			PR_MEM("%s: candidate %s has"
2558 				" nonrelocatable span [0x%lx..0x%lx]\n",
2559 				f,
2560 				t_mp->sbm_cm.sbdev_path,
2561 				mq.first_nonrelocatable,
2562 				mq.last_nonrelocatable);
2563 
2564 			continue;
2565 		}
2566 
2567 #ifdef DEBUG
2568 		/*
2569 		 * This is a debug tool for excluding certain boards
2570 		 * from being selected as a target board candidate.
2571 		 * dr_ignore_board is only tested by this driver.
2572 		 * It must be set with adb, obp, /etc/system or your
2573 		 * favorite debugger.
2574 		 */
2575 		if (dr_ignore_board &
2576 			(1 << (t_mp->sbm_cm.sbdev_bp->b_num - 1))) {
2577 			PR_MEM("%s: dr_ignore_board flag set,"
2578 				" ignoring %s as candidate\n",
2579 				f, t_mp->sbm_cm.sbdev_path);
2580 			continue;
2581 		}
2582 #endif
2583 
2584 		/*
2585 		 * Reserve excess source board memory, if any.
2586 		 *
2587 		 * When the number of pages on the candidate target
2588 		 * board is less than the number of pages on the source,
2589 		 * then some spans (clearly) of the source board's address
2590 		 * space will not be covered by physical memory after the
2591 		 * copy/rename completes.  The following code block
2592 		 * schedules those spans to be deleted.
2593 		 */
2594 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2595 			pfn_t pfn;
2596 			uint64_t s_del_pa;
2597 			struct memlist *ml;
2598 
2599 			d_ml = memlist_dup(s_ml);
2600 			if (d_ml == NULL) {
2601 				PR_MEM("%s: cant dup src brd memlist\n", f);
2602 				/* TODO: should abort */
2603 				continue;
2604 			}
2605 
2606 			/* calculate base pfn relative to target board */
2607 			pfn  = s_mp->sbm_basepfn & ~sm;
2608 			pfn += t_mp->sbm_slice_offset;
2609 
2610 			/*
2611 			 * cannot split dynamically added segment
2612 			 */
2613 			s_del_pa = _ptob64(pfn + t_mp->sbm_npages);
2614 			PR_MEM("%s: proposed src delete pa=0x%lx\n", f,
2615 			    s_del_pa);
2616 			PR_MEM("%s: checking for split of dyn seg list:\n", f);
2617 			PR_MEMLIST_DUMP(s_mp->sbm_dyn_segs);
2618 			for (ml = s_mp->sbm_dyn_segs; ml; ml = ml->next) {
2619 				if (s_del_pa > ml->address &&
2620 				    s_del_pa < ml->address + ml->size) {
2621 					s_del_pa = ml->address;
2622 					break;
2623 				}
2624 			}
2625 
2626 			/* remove span that will reside on candidate board */
2627 			d_ml = memlist_del_span(d_ml, _ptob64(pfn),
2628 			    s_del_pa - _ptob64(pfn));
2629 
2630 			PR_MEM("%s: %s: reserving src brd memlist:\n",
2631 				f, s_mp->sbm_cm.sbdev_path);
2632 			PR_MEMLIST_DUMP(d_ml);
2633 
2634 			/* reserve excess spans */
2635 			if (dr_reserve_mem_spans(
2636 				&s_mp->sbm_memhandle, d_ml) != 0) {
2637 
2638 				/* likely more non-reloc pages appeared */
2639 				/* TODO: restart from top? */
2640 				continue;
2641 			}
2642 		} else {
2643 			/* no excess source board memory */
2644 			d_ml = NULL;
2645 		}
2646 
2647 		s_mp->sbm_flags |= DR_MFLAG_RESERVED;
2648 
2649 		/*
2650 		 * reserve all memory on target board.
2651 		 * NOTE: source board's memhandle is used.
2652 		 *
2653 		 * If this succeeds (eq 0), then target selection is
2654 		 * complete and all unwanted memory spans, both source and
2655 		 * target, have been reserved.  Loop is terminated.
2656 		 */
2657 		if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, t_ml) == 0) {
2658 			PR_MEM("%s: %s: target board memory reserved\n",
2659 				f, t_mp->sbm_cm.sbdev_path);
2660 
2661 			/* a candidate target board is now reserved */
2662 			t_mp->sbm_flags |= DR_MFLAG_RESERVED;
2663 			c_mp = t_mp;
2664 
2665 			/* *** EXITING LOOP *** */
2666 			break;
2667 		}
2668 
2669 		/* did not successfully reserve the target board. */
2670 		PR_MEM("%s: could not reserve target %s\n",
2671 			f, t_mp->sbm_cm.sbdev_path);
2672 
2673 		/*
2674 		 * NOTE: an undo of the dr_reserve_mem_span work
2675 		 * will happen automatically when the memhandle
2676 		 * (s_mp->sbm_memhandle) is kphysm_del_release'd.
2677 		 */
2678 
2679 		s_mp->sbm_flags &= ~DR_MFLAG_RESERVED;
2680 	}
2681 
2682 	/* clean up after memlist editing logic */
2683 	if (x_ml != NULL)
2684 		memlist_delete(x_ml);
2685 
2686 	FREESTRUCT(sets, dr_mem_unit_t *, n_units_per_set * n_sets);
2687 
2688 	/*
2689 	 * c_mp will be NULL when the entire sets[] array
2690 	 * has been searched without reserving a target board.
2691 	 */
2692 	if (c_mp == NULL) {
2693 		PR_MEM("%s: %s: target selection failed.\n",
2694 			f, s_mp->sbm_cm.sbdev_path);
2695 
2696 		if (t_ml != NULL)
2697 			memlist_delete(t_ml);
2698 
2699 		return (-1);
2700 	}
2701 
2702 	PR_MEM("%s: found target %s for source %s\n",
2703 		f,
2704 		c_mp->sbm_cm.sbdev_path,
2705 		s_mp->sbm_cm.sbdev_path);
2706 
2707 	s_mp->sbm_peer = c_mp;
2708 	s_mp->sbm_flags |= DR_MFLAG_SOURCE;
2709 	s_mp->sbm_del_mlist = d_ml;	/* spans to be deleted, if any */
2710 	s_mp->sbm_mlist = s_ml;
2711 	s_mp->sbm_cm.sbdev_busy = 1;
2712 
2713 	c_mp->sbm_peer = s_mp;
2714 	c_mp->sbm_flags |= DR_MFLAG_TARGET;
2715 	c_mp->sbm_del_mlist = t_ml;	/* spans to be deleted */
2716 	c_mp->sbm_mlist = t_ml;
2717 	c_mp->sbm_cm.sbdev_busy = 1;
2718 
2719 	s_mp->sbm_flags &= ~DR_MFLAG_MEMRESIZE;
2720 	if (c_mp->sbm_npages > s_mp->sbm_npages) {
2721 		s_mp->sbm_flags |= DR_MFLAG_MEMUPSIZE;
2722 		PR_MEM("%s: upsize detected (source=%d < target=%d)\n",
2723 			f, s_mp->sbm_npages, c_mp->sbm_npages);
2724 	} else if (c_mp->sbm_npages < s_mp->sbm_npages) {
2725 		s_mp->sbm_flags |= DR_MFLAG_MEMDOWNSIZE;
2726 		PR_MEM("%s: downsize detected (source=%d > target=%d)\n",
2727 			f, s_mp->sbm_npages, c_mp->sbm_npages);
2728 	}
2729 
2730 	return (0);
2731 }
2732 
2733 /*
2734  * Memlist support.
2735  */
2736 static struct memlist *
2737 memlist_dup(struct memlist *mlist)
2738 {
2739 	struct memlist *hl = NULL, *tl, **mlp;
2740 
2741 	if (mlist == NULL)
2742 		return (NULL);
2743 
2744 	mlp = &hl;
2745 	tl = *mlp;
2746 	for (; mlist; mlist = mlist->next) {
2747 		*mlp = GETSTRUCT(struct memlist, 1);
2748 		(*mlp)->address = mlist->address;
2749 		(*mlp)->size = mlist->size;
2750 		(*mlp)->prev = tl;
2751 		tl = *mlp;
2752 		mlp = &((*mlp)->next);
2753 	}
2754 	*mlp = NULL;
2755 
2756 	return (hl);
2757 }
2758 
2759 /*
2760  * Determine whether the source memlist (s_mlist) will
2761  * fit into the target memlist (t_mlist) in terms of
2762  * size and holes (i.e. based on same relative base address).
2763  */
2764 static int
2765 memlist_canfit(struct memlist *s_mlist, struct memlist *t_mlist)
2766 {
2767 	int		rv = 0;
2768 	uint64_t	s_basepa, t_basepa;
2769 	struct memlist	*s_ml, *t_ml;
2770 
2771 	if ((s_mlist == NULL) || (t_mlist == NULL))
2772 		return (0);
2773 
2774 	/*
2775 	 * Base both memlists on common base address (0).
2776 	 */
2777 	s_basepa = s_mlist->address;
2778 	t_basepa = t_mlist->address;
2779 
2780 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2781 		s_ml->address -= s_basepa;
2782 
2783 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2784 		t_ml->address -= t_basepa;
2785 
2786 	s_ml = s_mlist;
2787 	for (t_ml = t_mlist; t_ml && s_ml; t_ml = t_ml->next) {
2788 		uint64_t	s_start, s_end;
2789 		uint64_t	t_start, t_end;
2790 
2791 		t_start = t_ml->address;
2792 		t_end = t_start + t_ml->size;
2793 
2794 		for (; s_ml; s_ml = s_ml->next) {
2795 			s_start = s_ml->address;
2796 			s_end = s_start + s_ml->size;
2797 
2798 			if ((s_start < t_start) || (s_end > t_end))
2799 				break;
2800 		}
2801 	}
2802 	/*
2803 	 * If we ran out of source memlist chunks that mean
2804 	 * we found a home for all of them.
2805 	 */
2806 	if (s_ml == NULL)
2807 		rv = 1;
2808 
2809 	/*
2810 	 * Need to add base addresses back since memlists
2811 	 * are probably in use by caller.
2812 	 */
2813 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2814 		s_ml->address += s_basepa;
2815 
2816 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2817 		t_ml->address += t_basepa;
2818 
2819 	return (rv);
2820 }
2821 
2822 static struct memlist *
2823 memlist_del_span(struct memlist *mlist, uint64_t base, uint64_t len)
2824 {
2825 	uint64_t	end;
2826 	struct memlist	*ml, *tl, *nlp;
2827 
2828 	if (mlist == NULL)
2829 		return (NULL);
2830 
2831 	end = base + len;
2832 	if ((end <= mlist->address) || (base == end))
2833 		return (mlist);
2834 
2835 	for (tl = ml = mlist; ml; tl = ml, ml = nlp) {
2836 		uint64_t	mend;
2837 
2838 		nlp = ml->next;
2839 
2840 		if (end <= ml->address)
2841 			break;
2842 
2843 		mend = ml->address + ml->size;
2844 		if (base < mend) {
2845 			if (base <= ml->address) {
2846 				ml->address = end;
2847 				if (end >= mend)
2848 					ml->size = 0ull;
2849 				else
2850 					ml->size = mend - ml->address;
2851 			} else {
2852 				ml->size = base - ml->address;
2853 				if (end < mend) {
2854 					struct memlist	*nl;
2855 					/*
2856 					 * splitting an memlist entry.
2857 					 */
2858 					nl = GETSTRUCT(struct memlist, 1);
2859 					nl->address = end;
2860 					nl->size = mend - nl->address;
2861 					if ((nl->next = nlp) != NULL)
2862 						nlp->prev = nl;
2863 					nl->prev = ml;
2864 					ml->next = nl;
2865 					nlp = nl;
2866 				}
2867 			}
2868 			if (ml->size == 0ull) {
2869 				if (ml == mlist) {
2870 					if ((mlist = nlp) != NULL)
2871 						nlp->prev = NULL;
2872 					FREESTRUCT(ml, struct memlist, 1);
2873 					if (mlist == NULL)
2874 						break;
2875 					ml = nlp;
2876 				} else {
2877 					if ((tl->next = nlp) != NULL)
2878 						nlp->prev = tl;
2879 					FREESTRUCT(ml, struct memlist, 1);
2880 					ml = tl;
2881 				}
2882 			}
2883 		}
2884 	}
2885 
2886 	return (mlist);
2887 }
2888 
2889 /*
2890  * add span without merging
2891  */
2892 static struct memlist *
2893 memlist_cat_span(struct memlist *mlist, uint64_t base, uint64_t len)
2894 {
2895 	struct memlist	*ml, *tl, *nl;
2896 
2897 	if (len == 0ull)
2898 		return (NULL);
2899 
2900 	if (mlist == NULL) {
2901 		mlist = GETSTRUCT(struct memlist, 1);
2902 		mlist->address = base;
2903 		mlist->size = len;
2904 		mlist->next = mlist->prev = NULL;
2905 
2906 		return (mlist);
2907 	}
2908 
2909 	for (tl = ml = mlist; ml; tl = ml, ml = ml->next) {
2910 		if (base < ml->address) {
2911 			nl = GETSTRUCT(struct memlist, 1);
2912 			nl->address = base;
2913 			nl->size = len;
2914 			nl->next = ml;
2915 			if ((nl->prev = ml->prev) != NULL)
2916 				nl->prev->next = nl;
2917 			ml->prev = nl;
2918 			if (mlist == ml)
2919 				mlist = nl;
2920 			break;
2921 		}
2922 	}
2923 
2924 	if (ml == NULL) {
2925 		nl = GETSTRUCT(struct memlist, 1);
2926 		nl->address = base;
2927 		nl->size = len;
2928 		nl->next = NULL;
2929 		nl->prev = tl;
2930 		tl->next = nl;
2931 	}
2932 
2933 	return (mlist);
2934 }
2935