xref: /titanic_52/usr/src/uts/sun4u/ngdr/io/dr_mem.c (revision 2eaee53e5b3d4cd48a35cd651c0a8ae149d772c5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * DR memory support routines.
31  */
32 
33 #include <sys/note.h>
34 #include <sys/debug.h>
35 #include <sys/types.h>
36 #include <sys/errno.h>
37 #include <sys/param.h>
38 #include <sys/dditypes.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/sunndi.h>
44 #include <sys/ddi_impldefs.h>
45 #include <sys/ndi_impldefs.h>
46 #include <sys/sysmacros.h>
47 #include <sys/machsystm.h>
48 #include <sys/spitregs.h>
49 #include <sys/cpuvar.h>
50 #include <sys/promif.h>
51 #include <vm/seg_kmem.h>
52 #include <sys/lgrp.h>
53 #include <sys/platform_module.h>
54 
55 #include <vm/page.h>
56 
57 #include <sys/dr.h>
58 #include <sys/dr_util.h>
59 
/* installed physical memory list; read in this file under memlist_read_lock() */
extern struct memlist	*phys_install;

/* TODO: push this reference below drmach line */
/* reported to callers of dr_mem_status() as ms_cage_enabled */
extern int		kcage_on;

/* for the DR*INTERNAL_ERROR macros.  see sys/dr.h. */
static char *dr_ie_fmt = "%M% %d";

/*
 * Prototypes for file-local (static) routines that are referenced
 * before their definitions.
 */
static int		dr_post_detach_mem_unit(dr_mem_unit_t *mp);
static int		dr_reserve_mem_spans(memhandle_t *mhp,
					struct memlist *mlist);
static int		dr_select_mem_target(dr_handle_t *hp,
				dr_mem_unit_t *mp, struct memlist *ml);
static void		dr_init_mem_unit_data(dr_mem_unit_t *mp);

/* file-local memlist helpers */
static struct memlist	*memlist_dup(struct memlist *);
static int		memlist_canfit(struct memlist *s_mlist,
					struct memlist *t_mlist);
static struct memlist	*memlist_del_span(struct memlist *mlist,
					uint64_t base, uint64_t len);
static struct memlist	*memlist_cat_span(struct memlist *mlist,
					uint64_t base, uint64_t len);
82 
/*
 * dr_mem_unit_t.sbm_flags
 *
 * Bit flags; several may be set on a mem unit at the same time.
 */
#define	DR_MFLAG_RESERVED	0x01	/* mem unit reserved for delete */
#define	DR_MFLAG_SOURCE		0x02	/* source brd of copy/rename op */
#define	DR_MFLAG_TARGET		0x04	/* target brd of copy/rename op */
#define	DR_MFLAG_MEMUPSIZE	0x08	/* move from big to small board */
#define	DR_MFLAG_MEMDOWNSIZE	0x10	/* move from small to big board */
/* 0x18 is the bitwise OR of DR_MFLAG_MEMUPSIZE and DR_MFLAG_MEMDOWNSIZE */
#define	DR_MFLAG_MEMRESIZE	0x18	/* move to different size board */
#define	DR_MFLAG_RELOWNER	0x20	/* memory release (delete) owner */
#define	DR_MFLAG_RELDONE	0x40	/* memory release (delete) done */

/* helper macros */
/* pages (or a pfn) to bytes; the operand is widened to 64 bits before shifting */
#define	_ptob64(p) ((uint64_t)(p) << PAGESHIFT)
/* bytes to pages; any sub-page remainder is discarded by the shift */
#define	_b64top(b) ((pgcnt_t)((b) >> PAGESHIFT))
98 
99 static struct memlist *
100 dr_get_memlist(dr_mem_unit_t *mp)
101 {
102 	struct memlist	*mlist = NULL;
103 	sbd_error_t	*err;
104 	static fn_t	f = "dr_get_memlist";
105 
106 	PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path);
107 
108 	/*
109 	 * Return cached memlist, if present.
110 	 * This memlist will be present following an
111 	 * unconfigure (a.k.a: detach) of this memunit.
112 	 * It should only be used in the case were a configure
113 	 * is bringing this memunit back in without going
114 	 * through the disconnect and connect states.
115 	 */
116 	if (mp->sbm_mlist) {
117 		PR_MEM("%s: found cached memlist\n", f);
118 
119 		mlist = memlist_dup(mp->sbm_mlist);
120 	} else {
121 		uint64_t basepa = _ptob64(mp->sbm_basepfn);
122 
123 		/* attempt to construct a memlist using phys_install */
124 
125 		/* round down to slice base address */
126 		basepa &= ~(mp->sbm_slice_size - 1);
127 
128 		/* get a copy of phys_install to edit */
129 		memlist_read_lock();
130 		mlist = memlist_dup(phys_install);
131 		memlist_read_unlock();
132 
133 		/* trim lower irrelevant span */
134 		if (mlist)
135 			mlist = memlist_del_span(mlist, 0ull, basepa);
136 
137 		/* trim upper irrelevant span */
138 		if (mlist) {
139 			uint64_t endpa;
140 
141 			basepa += mp->sbm_slice_size;
142 			endpa = _ptob64(physmax + 1);
143 			if (endpa > basepa)
144 				mlist = memlist_del_span(
145 						mlist,
146 						basepa,
147 						endpa - basepa);
148 		}
149 
150 		if (mlist) {
151 			/* successfully built a memlist */
152 			PR_MEM("%s: derived memlist from phys_install\n", f);
153 		}
154 
155 		/* if no mlist yet, try platform layer */
156 		if (!mlist) {
157 			err = drmach_mem_get_memlist(
158 				mp->sbm_cm.sbdev_id, &mlist);
159 			if (err) {
160 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
161 				mlist = NULL; /* paranoia */
162 			}
163 		}
164 	}
165 
166 	PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path);
167 	PR_MEMLIST_DUMP(mlist);
168 
169 	return (mlist);
170 }
171 
/*
 * Rendezvous state shared between dr_release_mem() and its completion
 * callback, dr_mem_del_done().
 */
typedef struct {
	kcondvar_t cond;	/* signalled by dr_mem_del_done() */
	kmutex_t lock;		/* held while error/done are read or written */
	int error;		/* result code handed to dr_mem_del_done() */
	int done;		/* set to 1 once dr_mem_del_done() has run */
} dr_release_mem_sync_t;
178 
179 /*
180  * Memory has been logically removed by the time this routine is called.
181  */
182 static void
183 dr_mem_del_done(void *arg, int error)
184 {
185 	dr_release_mem_sync_t *ds = arg;
186 
187 	mutex_enter(&ds->lock);
188 	ds->error = error;
189 	ds->done = 1;
190 	cv_signal(&ds->cond);
191 	mutex_exit(&ds->lock);
192 }
193 
194 /*
195  * When we reach here the memory being drained should have
196  * already been reserved in dr_pre_release_mem().
197  * Our only task here is to kick off the "drain" and wait
198  * for it to finish.
199  */
200 void
201 dr_release_mem(dr_common_unit_t *cp)
202 {
203 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
204 	int		err;
205 	dr_release_mem_sync_t rms;
206 	static fn_t	f = "dr_release_mem";
207 
208 	/* check that this memory unit has been reserved */
209 	if (!(mp->sbm_flags & DR_MFLAG_RELOWNER)) {
210 		DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
211 		return;
212 	}
213 
214 	bzero((void *) &rms, sizeof (rms));
215 
216 	mutex_init(&rms.lock, NULL, MUTEX_DRIVER, NULL);
217 	cv_init(&rms.cond, NULL, CV_DRIVER, NULL);
218 
219 	mutex_enter(&rms.lock);
220 	err = kphysm_del_start(mp->sbm_memhandle,
221 			dr_mem_del_done, (void *) &rms);
222 	if (err == KPHYSM_OK) {
223 		/* wait for completion or interrupt */
224 		while (!rms.done) {
225 			if (cv_wait_sig(&rms.cond, &rms.lock) == 0) {
226 				/* then there is a pending UNIX signal */
227 				(void) kphysm_del_cancel(mp->sbm_memhandle);
228 
229 				/* wait for completion */
230 				while (!rms.done)
231 					cv_wait(&rms.cond, &rms.lock);
232 			}
233 		}
234 		/* get the result of the memory delete operation */
235 		err = rms.error;
236 	}
237 	mutex_exit(&rms.lock);
238 
239 	cv_destroy(&rms.cond);
240 	mutex_destroy(&rms.lock);
241 
242 	if (err != KPHYSM_OK) {
243 		int e_code;
244 
245 		switch (err) {
246 			case KPHYSM_ENOWORK:
247 				e_code = ESBD_NOERROR;
248 				break;
249 
250 			case KPHYSM_EHANDLE:
251 			case KPHYSM_ESEQUENCE:
252 				e_code = ESBD_INTERNAL;
253 				break;
254 
255 			case KPHYSM_ENOTVIABLE:
256 				e_code = ESBD_MEM_NOTVIABLE;
257 				break;
258 
259 			case KPHYSM_EREFUSED:
260 				e_code = ESBD_MEM_REFUSED;
261 				break;
262 
263 			case KPHYSM_ENONRELOC:
264 				e_code = ESBD_MEM_NONRELOC;
265 				break;
266 
267 			case KPHYSM_ECANCELLED:
268 				e_code = ESBD_MEM_CANCELLED;
269 				break;
270 
271 			case KPHYSM_ERESOURCE:
272 				e_code = ESBD_MEMFAIL;
273 				break;
274 
275 			default:
276 				cmn_err(CE_WARN,
277 					"%s: unexpected kphysm error code %d,"
278 					" id 0x%p",
279 					f, err, mp->sbm_cm.sbdev_id);
280 
281 				e_code = ESBD_IO;
282 				break;
283 		}
284 
285 		if (e_code != ESBD_NOERROR) {
286 			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
287 		}
288 	}
289 }
290 
291 void
292 dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
293 {
294 	_NOTE(ARGUNUSED(hp))
295 
296 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
297 	struct memlist	*ml, *mc;
298 	sbd_error_t	*err;
299 	static fn_t	f = "dr_attach_mem";
300 
301 	PR_MEM("%s...\n", f);
302 
303 	dr_lock_status(hp->h_bd);
304 	err = drmach_configure(cp->sbdev_id, 0);
305 	dr_unlock_status(hp->h_bd);
306 	if (err) {
307 		DRERR_SET_C(&cp->sbdev_error, &err);
308 		return;
309 	}
310 
311 	ml = dr_get_memlist(mp);
312 	for (mc = ml; mc; mc = mc->next) {
313 		int		 rv;
314 		sbd_error_t	*err;
315 
316 		rv = kphysm_add_memory_dynamic(
317 				(pfn_t)(mc->address >> PAGESHIFT),
318 				(pgcnt_t)(mc->size >> PAGESHIFT));
319 		if (rv != KPHYSM_OK) {
320 			/*
321 			 * translate kphysm error and
322 			 * store in devlist error
323 			 */
324 			switch (rv) {
325 			case KPHYSM_ERESOURCE:
326 				rv = ESBD_NOMEM;
327 				break;
328 
329 			case KPHYSM_EFAULT:
330 				rv = ESBD_FAULT;
331 				break;
332 
333 			default:
334 				rv = ESBD_INTERNAL;
335 				break;
336 			}
337 
338 			if (rv == ESBD_INTERNAL) {
339 				DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
340 			} else
341 				dr_dev_err(CE_WARN, &mp->sbm_cm, rv);
342 			break;
343 		}
344 
345 		err = drmach_mem_add_span(
346 			mp->sbm_cm.sbdev_id, mc->address, mc->size);
347 		if (err) {
348 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
349 			break;
350 		}
351 	}
352 
353 	memlist_delete(ml);
354 
355 	/* back out if configure failed */
356 	if (mp->sbm_cm.sbdev_error != NULL) {
357 		dr_lock_status(hp->h_bd);
358 		err = drmach_unconfigure(cp->sbdev_id, DRMACH_DEVI_REMOVE);
359 		if (err)
360 			sbd_err_clear(&err);
361 		dr_unlock_status(hp->h_bd);
362 	}
363 }
364 
365 #define	DR_SCRUB_VALUE	0x0d0e0a0d0b0e0e0fULL
366 
367 static void
368 dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
369 {
370 #ifdef DEBUG
371 	clock_t		stime = lbolt;
372 #endif /* DEBUG */
373 
374 	struct memlist	*ml;
375 	uint64_t	scrub_value = DR_SCRUB_VALUE;
376 	processorid_t	cpuid;
377 	static fn_t	f = "dr_mem_ecache_scrub";
378 
379 	cpuid = drmach_mem_cpu_affinity(mp->sbm_cm.sbdev_id);
380 	affinity_set(cpuid);
381 
382 	PR_MEM("%s: using proc %d, memlist...\n", f,
383 	    (cpuid == CPU_CURRENT) ? CPU->cpu_id : cpuid);
384 	PR_MEMLIST_DUMP(mlist);
385 
386 	for (ml = mlist; ml; ml = ml->next) {
387 		uint64_t	dst_pa;
388 		uint64_t	nbytes;
389 
390 		/* calculate the destination physical address */
391 		dst_pa = ml->address;
392 		if (ml->address & PAGEOFFSET)
393 			cmn_err(CE_WARN,
394 				"%s: address (0x%lx) not on "
395 				"page boundary", f, ml->address);
396 
397 		nbytes = ml->size;
398 		if (ml->size & PAGEOFFSET)
399 			cmn_err(CE_WARN,
400 				"%s: size (0x%lx) not on "
401 				"page boundary", f, ml->size);
402 
403 		/*LINTED*/
404 		while (nbytes > 0) {
405 			/* write 64 bits to dst_pa */
406 			stdphys(dst_pa, scrub_value);
407 
408 			/* increment/decrement by cacheline sizes */
409 			dst_pa += DRMACH_COHERENCY_UNIT;
410 			nbytes -= DRMACH_COHERENCY_UNIT;
411 		}
412 	}
413 
414 	/*
415 	 * flush this cpu's ecache and take care to ensure
416 	 * that all of it's bus transactions have retired.
417 	 */
418 	drmach_cpu_flush_ecache_sync();
419 
420 	affinity_clear();
421 
422 #ifdef DEBUG
423 	stime = lbolt - stime;
424 	PR_MEM("%s: scrub ticks = %ld (%ld secs)\n", f, stime, stime / hz);
425 #endif /* DEBUG */
426 }
427 
/*
 * Copy-rename the memory of source mem unit s_mp onto target mem unit
 * t_mp.
 *
 * Sequence: build the list of spans to copy, bind to the cpu selected
 * by drmach_mem_cpu_affinity() for the target, set up the platform
 * copy-rename (drmach_copy_rename_init), suspend the OS (dr_suspend),
 * notify the lgroup code of the rename, run drmach_copy_rename(),
 * resume the OS (dr_resume) and finish with drmach_copy_rename_fini().
 *
 * Returns 0 on success and -1 on failure; errors are recorded in
 * s_mp->sbm_cm.sbdev_error.
 */
static int
dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
{
	time_t		 copytime;
	drmachid_t	 cr_id;
	dr_sr_handle_t	*srhp;
	struct memlist	*c_ml, *d_ml;
	sbd_error_t	*err;
	static fn_t	 f = "dr_move_memory";

	PR_MEM("%s: (INLINE) moving memory from %s to %s\n",
		f,
		s_mp->sbm_cm.sbdev_path,
		t_mp->sbm_cm.sbdev_path);

	/* the two units must have been paired as source and target */
	ASSERT(s_mp->sbm_flags & DR_MFLAG_SOURCE);
	ASSERT(s_mp->sbm_peer == t_mp);
	ASSERT(s_mp->sbm_mlist);

	ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
	ASSERT(t_mp->sbm_peer == s_mp);

	/*
	 * create a memlist of spans to copy by removing
	 * the spans that have been deleted, if any, from
	 * the full source board memlist.  s_mp->sbm_del_mlist
	 * will be NULL if there were no spans deleted from
	 * the source board.
	 *
	 * NOTE(review): c_ml is never freed in this function;
	 * presumably drmach_copy_rename_init() takes ownership of it
	 * -- confirm against the platform drmach implementation.
	 */
	c_ml = memlist_dup(s_mp->sbm_mlist);
	d_ml = s_mp->sbm_del_mlist;
	while (d_ml != NULL) {
		c_ml = memlist_del_span(c_ml, d_ml->address, d_ml->size);
		d_ml = d_ml->next;
	}

	/* stay bound to one cpu until affinity_clear() on every exit path */
	affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));

	err = drmach_copy_rename_init(
		t_mp->sbm_cm.sbdev_id, _ptob64(t_mp->sbm_slice_offset),
		s_mp->sbm_cm.sbdev_id, c_ml, &cr_id);
	if (err) {
		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
		affinity_clear();
		return (-1);
	}

	srhp = dr_get_sr_handle(hp);
	ASSERT(srhp);

	/* start of the interval reported by the PR_MEM at the end */
	copytime = lbolt;

	/* Quiesce the OS.  */
	if (dr_suspend(srhp)) {
		cmn_err(CE_WARN, "%s: failed to quiesce OS"
			" for copy-rename", f);

		/* undo the setup done so far: sr handle, then copy-rename */
		dr_release_sr_handle(srhp);
		err = drmach_copy_rename_fini(cr_id);
		if (err) {
			/*
			 * no error is expected since the program has
			 * not yet run.
			 */

			/* catch this in debug kernels */
			ASSERT(0);

			sbd_err_clear(&err);
		}

		/* suspend error reached via hp */
		s_mp->sbm_cm.sbdev_error = hp->h_err;
		hp->h_err = NULL;

		affinity_clear();
		return (-1);
	}

	/*
	 * Rename memory for lgroup.
	 * Source and target board numbers are packaged in arg.
	 */
	{
		dr_board_t	*t_bp, *s_bp;

		s_bp = s_mp->sbm_cm.sbdev_bp;
		t_bp = t_mp->sbm_cm.sbdev_bp;

		/* source board number in the low bits, target shifted up by 16 */
		lgrp_plat_config(LGRP_CONFIG_MEM_RENAME,
			(uintptr_t)(s_bp->b_num | (t_bp->b_num << 16)));
	}

	drmach_copy_rename(cr_id);

	/* Resume the OS.  */
	dr_resume(srhp);

	/* elapsed ticks from just before dr_suspend() to just after dr_resume() */
	copytime = lbolt - copytime;

	dr_release_sr_handle(srhp);
	err = drmach_copy_rename_fini(cr_id);
	if (err)
		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);

	affinity_clear();

	PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
		f, copytime, copytime / hz);

	/*
	 * return -1 if dr_suspend or copy/rename recorded an error
	 *
	 * NOTE(review): this tests err after DRERR_SET_C(); whether err
	 * is still non-NULL at this point depends on that macro (see
	 * sys/dr.h) -- confirm.
	 */
	return (err == NULL ? 0 : -1);
}
541 
542 /*
543  * If detaching node contains memory that is "non-permanent"
544  * then the memory adr's are simply cleared.  If the memory
545  * is non-relocatable, then do a copy-rename.
546  */
547 void
548 dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
549 {
550 	int			rv = 0;
551 	dr_mem_unit_t		*s_mp = (dr_mem_unit_t *)cp;
552 	dr_mem_unit_t		*t_mp;
553 	dr_state_t		state;
554 	static fn_t		f = "dr_detach_mem";
555 
556 	PR_MEM("%s...\n", f);
557 
558 	/* lookup target mem unit and target board structure, if any */
559 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
560 		t_mp = s_mp->sbm_peer;
561 		ASSERT(t_mp != NULL);
562 		ASSERT(t_mp->sbm_peer == s_mp);
563 	} else {
564 		t_mp = NULL;
565 	}
566 
567 	/* verify mem unit's state is UNREFERENCED */
568 	state = s_mp->sbm_cm.sbdev_state;
569 	if (state != DR_STATE_UNREFERENCED) {
570 		dr_dev_err(CE_IGNORE, &s_mp->sbm_cm, ESBD_STATE);
571 		return;
572 	}
573 
574 	/* verify target mem unit's state is UNREFERENCED, if any */
575 	if (t_mp != NULL) {
576 		state = t_mp->sbm_cm.sbdev_state;
577 		if (state != DR_STATE_UNREFERENCED) {
578 			dr_dev_err(CE_IGNORE, &t_mp->sbm_cm, ESBD_STATE);
579 			return;
580 		}
581 	}
582 
583 	/*
584 	 * Scrub deleted memory.  This will cause all cachelines
585 	 * referencing the memory to only be in the local cpu's
586 	 * ecache.
587 	 */
588 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
589 		/* no del mlist for src<=dst mem size copy/rename */
590 		if (s_mp->sbm_del_mlist)
591 			dr_mem_ecache_scrub(s_mp, s_mp->sbm_del_mlist);
592 	}
593 	if (t_mp != NULL && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
594 		ASSERT(t_mp->sbm_del_mlist);
595 		dr_mem_ecache_scrub(t_mp, t_mp->sbm_del_mlist);
596 	}
597 
598 	/*
599 	 * If there is no target board (no copy/rename was needed), then
600 	 * we're done!
601 	 */
602 	if (t_mp == NULL) {
603 		sbd_error_t *err;
604 		/*
605 		 * Reprogram interconnect hardware and disable
606 		 * memory controllers for memory node that's going away.
607 		 */
608 
609 		err = drmach_mem_disable(s_mp->sbm_cm.sbdev_id);
610 		if (err) {
611 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
612 			rv = -1;
613 		}
614 	} else {
615 		rv = dr_move_memory(hp, s_mp, t_mp);
616 		PR_MEM("%s: %s memory COPY-RENAME (board %d -> %d)\n",
617 			f,
618 			rv ? "FAILED" : "COMPLETED",
619 			s_mp->sbm_cm.sbdev_bp->b_num,
620 			t_mp->sbm_cm.sbdev_bp->b_num);
621 
622 		if (rv != 0)
623 			(void) dr_cancel_mem(s_mp);
624 	}
625 
626 	if (rv == 0) {
627 		sbd_error_t *err;
628 
629 		dr_lock_status(hp->h_bd);
630 		err = drmach_unconfigure(s_mp->sbm_cm.sbdev_id,
631 		    DRMACH_DEVI_REMOVE);
632 		dr_unlock_status(hp->h_bd);
633 		if (err)
634 			sbd_err_clear(&err);
635 	}
636 }
637 
638 #ifndef _STARFIRE
639 /*
640  * XXX workaround for certain lab configurations (see also starcat drmach.c)
641  * Temporary code to get around observed incorrect results from
642  * kphysm_del_span_query when the queried span contains address spans
643  * not occupied by memory in between spans that do have memory.
644  * This routine acts as a wrapper to kphysm_del_span_query.  It builds
645  * a memlist from phys_install of spans that exist between base and
646  * base + npages, inclusively.  Kphysm_del_span_query is called for each
647  * node in the memlist with the results accumulated in *mp.
648  */
649 static int
650 dr_del_span_query(pfn_t base, pgcnt_t npages, memquery_t *mp)
651 {
652 	uint64_t	 pa = _ptob64(base);
653 	uint64_t	 sm = ~ (137438953472ull - 1);
654 	uint64_t	 sa = pa & sm;
655 	struct memlist	*mlist, *ml;
656 	int		 rv;
657 
658 	npages = npages; /* silence lint */
659 	memlist_read_lock();
660 	mlist = memlist_dup(phys_install);
661 	memlist_read_unlock();
662 
663 again:
664 	for (ml = mlist; ml; ml = ml->next) {
665 		if ((ml->address & sm) != sa) {
666 			mlist = memlist_del_span(mlist, ml->address, ml->size);
667 			goto again;
668 		}
669 	}
670 
671 	mp->phys_pages = 0;
672 	mp->managed = 0;
673 	mp->nonrelocatable = 0;
674 	mp->first_nonrelocatable = (pfn_t)-1;	/* XXX */
675 	mp->last_nonrelocatable = 0;
676 
677 	for (ml = mlist; ml; ml = ml->next) {
678 		memquery_t mq;
679 
680 		rv = kphysm_del_span_query(
681 			_b64top(ml->address), _b64top(ml->size), &mq);
682 		if (rv)
683 			break;
684 
685 		mp->phys_pages += mq.phys_pages;
686 		mp->managed += mq.managed;
687 		mp->nonrelocatable += mq.nonrelocatable;
688 
689 		if (mq.nonrelocatable != 0) {
690 			if (mq.first_nonrelocatable < mp->first_nonrelocatable)
691 				mp->first_nonrelocatable =
692 					mq.first_nonrelocatable;
693 			if (mq.last_nonrelocatable > mp->last_nonrelocatable)
694 				mp->last_nonrelocatable =
695 					mq.last_nonrelocatable;
696 		}
697 	}
698 
699 	if (mp->nonrelocatable == 0)
700 		mp->first_nonrelocatable = 0;	/* XXX */
701 
702 	memlist_delete(mlist);
703 	return (rv);
704 }
705 
706 #define	kphysm_del_span_query dr_del_span_query
707 #endif /* _STARFIRE */
708 
709 /*
710  * NOTE: This routine is only partially smart about multiple
711  *	 mem-units.  Need to make mem-status structure smart
712  *	 about them also.
713  */
714 int
715 dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp)
716 {
717 	int		m, mix;
718 	memdelstat_t	mdst;
719 	memquery_t	mq;
720 	dr_board_t	*bp;
721 	dr_mem_unit_t	*mp;
722 	sbd_mem_stat_t	*msp;
723 	static fn_t	f = "dr_mem_status";
724 
725 	bp = hp->h_bd;
726 	devset &= DR_DEVS_PRESENT(bp);
727 
728 	for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) {
729 		int		rv;
730 		sbd_error_t	*err;
731 		drmach_status_t	 pstat;
732 		dr_mem_unit_t	*p_mp;
733 
734 		if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0)
735 			continue;
736 
737 		mp = dr_get_mem_unit(bp, m);
738 
739 		if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) {
740 			/* present, but not fully initialized */
741 			continue;
742 		}
743 
744 		if (mp->sbm_cm.sbdev_id == (drmachid_t)0)
745 			continue;
746 
747 		/* fetch platform status */
748 		err = drmach_status(mp->sbm_cm.sbdev_id, &pstat);
749 		if (err) {
750 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
751 			continue;
752 		}
753 
754 		msp = &dsp->d_mem;
755 		bzero((caddr_t)msp, sizeof (*msp));
756 
757 		strncpy(msp->ms_cm.c_id.c_name, pstat.type,
758 			sizeof (msp->ms_cm.c_id.c_name));
759 		msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type;
760 		msp->ms_cm.c_id.c_unit = SBD_NULL_UNIT;
761 		msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond;
762 		msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy;
763 		msp->ms_cm.c_time = mp->sbm_cm.sbdev_time;
764 		msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate;
765 
766 		msp->ms_totpages = mp->sbm_npages;
767 		msp->ms_basepfn = mp->sbm_basepfn;
768 		msp->ms_pageslost = mp->sbm_pageslost;
769 		msp->ms_cage_enabled = kcage_on;
770 
771 		if (mp->sbm_flags & DR_MFLAG_RESERVED)
772 			p_mp = mp->sbm_peer;
773 		else
774 			p_mp = NULL;
775 
776 		if (p_mp == NULL) {
777 			msp->ms_peer_is_target = 0;
778 			msp->ms_peer_ap_id[0] = '\0';
779 		} else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) {
780 			char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
781 			char *minor;
782 
783 			/*
784 			 * b_dip doesn't have to be held for ddi_pathname()
785 			 * because the board struct (dr_board_t) will be
786 			 * destroyed before b_dip detaches.
787 			 */
788 			(void) ddi_pathname(bp->b_dip, path);
789 			minor = strchr(p_mp->sbm_cm.sbdev_path, ':');
790 
791 			snprintf(msp->ms_peer_ap_id,
792 			    sizeof (msp->ms_peer_ap_id), "%s%s",
793 			    path, (minor == NULL) ? "" : minor);
794 
795 			kmem_free(path, MAXPATHLEN);
796 
797 			if (p_mp->sbm_flags & DR_MFLAG_TARGET)
798 				msp->ms_peer_is_target = 1;
799 		}
800 
801 		if (mp->sbm_flags & DR_MFLAG_RELOWNER)
802 			rv = kphysm_del_status(mp->sbm_memhandle, &mdst);
803 		else
804 			rv = KPHYSM_EHANDLE;	/* force 'if' to fail */
805 
806 		if (rv == KPHYSM_OK) {
807 			/*
808 			 * Any pages above managed is "free",
809 			 * i.e. it's collected.
810 			 */
811 			msp->ms_detpages += (uint_t)(mdst.collected +
812 			    mdst.phys_pages - mdst.managed);
813 		} else {
814 			/*
815 			 * If we're UNREFERENCED or UNCONFIGURED,
816 			 * then the number of detached pages is
817 			 * however many pages are on the board.
818 			 * I.e. detached = not in use by OS.
819 			 */
820 			switch (msp->ms_cm.c_ostate) {
821 			/*
822 			 * changed to use cfgadm states
823 			 *
824 			 * was:
825 			 *	case DR_STATE_UNREFERENCED:
826 			 *	case DR_STATE_UNCONFIGURED:
827 			 */
828 			case SBD_STAT_UNCONFIGURED:
829 				msp->ms_detpages = msp->ms_totpages;
830 				break;
831 
832 			default:
833 				break;
834 			}
835 		}
836 
837 		/*
838 		 * kphysm_del_span_query can report non-reloc pages = total
839 		 * pages for memory that is not yet configured
840 		 */
841 		if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) {
842 
843 			rv = kphysm_del_span_query(mp->sbm_basepfn,
844 			    mp->sbm_npages, &mq);
845 
846 			if (rv == KPHYSM_OK) {
847 				msp->ms_managed_pages = mq.managed;
848 				msp->ms_noreloc_pages = mq.nonrelocatable;
849 				msp->ms_noreloc_first =
850 				    mq.first_nonrelocatable;
851 				msp->ms_noreloc_last =
852 				    mq.last_nonrelocatable;
853 				msp->ms_cm.c_sflags = 0;
854 				if (mq.nonrelocatable) {
855 					SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE,
856 					    msp->ms_cm.c_sflags);
857 				}
858 			} else {
859 				PR_MEM("%s: kphysm_del_span_query() = %d\n",
860 				    f, rv);
861 			}
862 		}
863 
864 		/*
865 		 * Check source unit state during copy-rename
866 		 */
867 		if ((mp->sbm_flags & DR_MFLAG_SOURCE) &&
868 		    (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED ||
869 		    mp->sbm_cm.sbdev_state == DR_STATE_RELEASE))
870 			msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED;
871 
872 		mix++;
873 		dsp++;
874 	}
875 
876 	return (mix);
877 }
878 
879 int
880 dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
881 {
882 	_NOTE(ARGUNUSED(hp))
883 
884 	int		err_flag = 0;
885 	int		d;
886 	sbd_error_t	*err;
887 	static fn_t	f = "dr_pre_attach_mem";
888 
889 	PR_MEM("%s...\n", f);
890 
891 	for (d = 0; d < devnum; d++) {
892 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
893 		dr_state_t	state;
894 
895 		cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path);
896 
897 		state = mp->sbm_cm.sbdev_state;
898 		switch (state) {
899 		case DR_STATE_UNCONFIGURED:
900 			PR_MEM("%s: recovering from UNCONFIG for %s\n",
901 				f,
902 				mp->sbm_cm.sbdev_path);
903 
904 			/* use memlist cached by dr_post_detach_mem_unit */
905 			ASSERT(mp->sbm_mlist != NULL);
906 			PR_MEM("%s: re-configuring cached memlist for %s:\n",
907 				f, mp->sbm_cm.sbdev_path);
908 			PR_MEMLIST_DUMP(mp->sbm_mlist);
909 
910 			/* kphysm del handle should be have been freed */
911 			ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
912 
913 			/*FALLTHROUGH*/
914 
915 		case DR_STATE_CONNECTED:
916 			PR_MEM("%s: reprogramming mem hardware on %s\n",
917 				f, mp->sbm_cm.sbdev_bp->b_path);
918 
919 			PR_MEM("%s: enabling %s\n",
920 				f, mp->sbm_cm.sbdev_path);
921 
922 			err = drmach_mem_enable(mp->sbm_cm.sbdev_id);
923 			if (err) {
924 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
925 				err_flag = 1;
926 			}
927 			break;
928 
929 		default:
930 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE);
931 			err_flag = 1;
932 			break;
933 		}
934 
935 		/* exit for loop if error encountered */
936 		if (err_flag)
937 			break;
938 	}
939 
940 	return (err_flag ? -1 : 0);
941 }
942 
943 int
944 dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
945 {
946 	_NOTE(ARGUNUSED(hp))
947 
948 	int		d;
949 	static fn_t	f = "dr_post_attach_mem";
950 
951 	PR_MEM("%s...\n", f);
952 
953 	for (d = 0; d < devnum; d++) {
954 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
955 		struct memlist	*mlist, *ml;
956 
957 		mlist = dr_get_memlist(mp);
958 		if (mlist == NULL) {
959 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_MEMFAIL);
960 			continue;
961 		}
962 
963 		/*
964 		 * Verify the memory really did successfully attach
965 		 * by checking for its existence in phys_install.
966 		 */
967 		memlist_read_lock();
968 		if (memlist_intersect(phys_install, mlist) == 0) {
969 			memlist_read_unlock();
970 
971 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
972 
973 			PR_MEM("%s: %s memlist not in phys_install",
974 				f, mp->sbm_cm.sbdev_path);
975 
976 			memlist_delete(mlist);
977 			continue;
978 		}
979 		memlist_read_unlock();
980 
981 		for (ml = mlist; ml != NULL; ml = ml->next) {
982 			sbd_error_t *err;
983 
984 			err = drmach_mem_add_span(
985 				mp->sbm_cm.sbdev_id,
986 				ml->address,
987 				ml->size);
988 			if (err)
989 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
990 		}
991 
992 		memlist_delete(mlist);
993 
994 		/*
995 		 * Destroy cached memlist, if any.
996 		 * There will be a cached memlist in sbm_mlist if
997 		 * this board is being configured directly after
998 		 * an unconfigure.
999 		 * To support this transition, dr_post_detach_mem
1000 		 * left a copy of the last known memlist in sbm_mlist.
1001 		 * This memlist could differ from any derived from
1002 		 * hardware if while this memunit was last configured
1003 		 * the system detected and deleted bad pages from
1004 		 * phys_install.  The location of those bad pages
1005 		 * will be reflected in the cached memlist.
1006 		 */
1007 		if (mp->sbm_mlist) {
1008 			memlist_delete(mp->sbm_mlist);
1009 			mp->sbm_mlist = NULL;
1010 		}
1011 
1012 /*
1013  * TODO: why is this call to dr_init_mem_unit_data here?
1014  * this has been done at discovery or connect time, so this is
1015  * probably redundant and unnecessary.
1016  */
1017 		dr_init_mem_unit_data(mp);
1018 	}
1019 
1020 	return (0);
1021 }
1022 
1023 int
1024 dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1025 {
1026 	_NOTE(ARGUNUSED(hp))
1027 
1028 	int d;
1029 
1030 	for (d = 0; d < devnum; d++) {
1031 		dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d];
1032 
1033 		cmn_err(CE_CONT, "OS unconfigure %s", mp->sbm_cm.sbdev_path);
1034 	}
1035 
1036 	return (0);
1037 }
1038 
1039 
1040 int
1041 dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1042 {
1043 	_NOTE(ARGUNUSED(hp))
1044 
1045 	int		d, rv;
1046 	static fn_t	f = "dr_post_detach_mem";
1047 
1048 	PR_MEM("%s...\n", f);
1049 
1050 	rv = 0;
1051 	for (d = 0; d < devnum; d++) {
1052 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1053 
1054 		ASSERT(mp->sbm_cm.sbdev_bp == hp->h_bd);
1055 
1056 		if (dr_post_detach_mem_unit(mp))
1057 			rv = -1;
1058 	}
1059 
1060 	return (rv);
1061 }
1062 
1063 static void
1064 dr_add_memory_spans(dr_mem_unit_t *mp, struct memlist *ml)
1065 {
1066 	static fn_t	f = "dr_add_memory_spans";
1067 
1068 	PR_MEM("%s...", f);
1069 	PR_MEMLIST_DUMP(ml);
1070 
1071 #ifdef DEBUG
1072 	memlist_read_lock();
1073 	if (memlist_intersect(phys_install, ml)) {
1074 		PR_MEM("%s:WARNING: memlist intersects with phys_install\n", f);
1075 	}
1076 	memlist_read_unlock();
1077 #endif
1078 
1079 	for (; ml; ml = ml->next) {
1080 		pfn_t		 base;
1081 		pgcnt_t		 npgs;
1082 		int		 rv;
1083 		sbd_error_t	*err;
1084 
1085 		base = _b64top(ml->address);
1086 		npgs = _b64top(ml->size);
1087 
1088 		rv = kphysm_add_memory_dynamic(base, npgs);
1089 
1090 		err = drmach_mem_add_span(
1091 			mp->sbm_cm.sbdev_id,
1092 			ml->address,
1093 			ml->size);
1094 
1095 		if (err)
1096 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1097 
1098 		if (rv != KPHYSM_OK) {
1099 			cmn_err(CE_WARN, "%s:"
1100 				" unexpected kphysm_add_memory_dynamic"
1101 				" return value %d;"
1102 				" basepfn=0x%lx, npages=%ld\n",
1103 				f, rv, base, npgs);
1104 
1105 			continue;
1106 		}
1107 	}
1108 }
1109 
1110 static int
1111 dr_post_detach_mem_unit(dr_mem_unit_t *s_mp)
1112 {
1113 	uint64_t	sz = s_mp->sbm_slice_size;
1114 	uint64_t	sm = sz - 1;
1115 	/* old and new below refer to PAs before and after copy-rename */
1116 	uint64_t	s_old_basepa, s_new_basepa;
1117 	uint64_t	t_old_basepa, t_new_basepa;
1118 	uint64_t	t_new_smallsize = 0;
1119 	dr_mem_unit_t	*t_mp, *x_mp;
1120 	struct memlist	*ml;
1121 	int		rv;
1122 	sbd_error_t	*err;
1123 	static fn_t	f = "dr_post_detach_mem_unit";
1124 
1125 	PR_MEM("%s...\n", f);
1126 
1127 	/* s_mp->sbm_del_mlist could be NULL, meaning no deleted spans */
1128 	PR_MEM("%s: %s: deleted memlist (EMPTY maybe okay):\n",
1129 		f, s_mp->sbm_cm.sbdev_path);
1130 	PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1131 
1132 	/* sanity check */
1133 	ASSERT(s_mp->sbm_del_mlist == NULL ||
1134 		(s_mp->sbm_flags & DR_MFLAG_RELDONE) != 0);
1135 
1136 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1137 		t_mp = s_mp->sbm_peer;
1138 		ASSERT(t_mp != NULL);
1139 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1140 		ASSERT(t_mp->sbm_peer == s_mp);
1141 
1142 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RELDONE);
1143 		ASSERT(t_mp->sbm_del_mlist);
1144 
1145 		PR_MEM("%s: target %s: deleted memlist:\n",
1146 			f, t_mp->sbm_cm.sbdev_path);
1147 		PR_MEMLIST_DUMP(t_mp->sbm_del_mlist);
1148 	} else {
1149 		/* this is no target unit */
1150 		t_mp = NULL;
1151 	}
1152 
1153 	/*
1154 	 * Verify the memory really did successfully detach
1155 	 * by checking for its non-existence in phys_install.
1156 	 */
1157 	rv = 0;
1158 	memlist_read_lock();
1159 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
1160 		x_mp = s_mp;
1161 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1162 	}
1163 	if (rv == 0 && t_mp && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
1164 		x_mp = t_mp;
1165 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1166 	}
1167 	memlist_read_unlock();
1168 
1169 	if (rv) {
1170 		/* error: memlist still in phys_install */
1171 		DR_DEV_INTERNAL_ERROR(&x_mp->sbm_cm);
1172 	}
1173 
1174 	/*
1175 	 * clean mem unit state and bail out if an error has been recorded.
1176 	 */
1177 	rv = 0;
1178 	if (s_mp->sbm_cm.sbdev_error) {
1179 		PR_MEM("%s: %s flags=%x", f,
1180 			s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1181 		DR_DEV_CLR_UNREFERENCED(&s_mp->sbm_cm);
1182 		DR_DEV_CLR_RELEASED(&s_mp->sbm_cm);
1183 		dr_device_transition(&s_mp->sbm_cm, DR_STATE_CONFIGURED);
1184 		rv = -1;
1185 	}
1186 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error != NULL) {
1187 		PR_MEM("%s: %s flags=%x", f,
1188 			s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1189 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1190 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1191 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1192 		rv = -1;
1193 	}
1194 	if (rv)
1195 		goto cleanup;
1196 
1197 	s_old_basepa = _ptob64(s_mp->sbm_basepfn);
1198 	err = drmach_mem_get_base_physaddr(s_mp->sbm_cm.sbdev_id,
1199 	    &s_new_basepa);
1200 	ASSERT(err == NULL);
1201 
1202 	PR_MEM("%s:s_old_basepa: 0x%lx\n", f, s_old_basepa);
1203 	PR_MEM("%s:s_new_basepa: 0x%lx\n", f, s_new_basepa);
1204 
1205 	if (t_mp != NULL) {
1206 		struct memlist *s_copy_mlist;
1207 
1208 		t_old_basepa	= _ptob64(t_mp->sbm_basepfn);
1209 		err = drmach_mem_get_base_physaddr(t_mp->sbm_cm.sbdev_id,
1210 		    &t_new_basepa);
1211 		ASSERT(err == NULL);
1212 
1213 		PR_MEM("%s:t_old_basepa: 0x%lx\n", f, t_old_basepa);
1214 		PR_MEM("%s:t_new_basepa: 0x%lx\n", f, t_new_basepa);
1215 
1216 		/*
1217 		 * Construct copy list with original source addresses.
1218 		 * Used to add back excess target mem.
1219 		 */
1220 		s_copy_mlist = memlist_dup(s_mp->sbm_mlist);
1221 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1222 			s_copy_mlist = memlist_del_span(s_copy_mlist,
1223 			    ml->address, ml->size);
1224 		}
1225 
1226 		PR_MEM("%s: source copy list:\n:", f);
1227 		PR_MEMLIST_DUMP(s_copy_mlist);
1228 
1229 		/*
1230 		 * We had to swap mem-units, so update
1231 		 * memlists accordingly with new base
1232 		 * addresses.
1233 		 */
1234 		for (ml = t_mp->sbm_mlist; ml; ml = ml->next) {
1235 			ml->address -= t_old_basepa;
1236 			ml->address += t_new_basepa;
1237 		}
1238 
1239 		/*
1240 		 * There is no need to explicitly rename the target delete
1241 		 * memlist, because sbm_del_mlist and sbm_mlist always
1242 		 * point to the same memlist for a copy/rename operation.
1243 		 */
1244 		ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1245 
1246 		PR_MEM("%s: renamed target memlist and delete memlist:\n", f);
1247 		PR_MEMLIST_DUMP(t_mp->sbm_mlist);
1248 
1249 		for (ml = s_mp->sbm_mlist; ml; ml = ml->next) {
1250 			ml->address -= s_old_basepa;
1251 			ml->address += s_new_basepa;
1252 		}
1253 
1254 		PR_MEM("%s: renamed source memlist:\n", f);
1255 		PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1256 
1257 		/*
1258 		 * Keep track of dynamically added segments
1259 		 * since they cannot be split if we need to delete
1260 		 * excess source memory later for this board.
1261 		 */
1262 		if (t_mp->sbm_dyn_segs)
1263 			memlist_delete(t_mp->sbm_dyn_segs);
1264 		t_mp->sbm_dyn_segs = s_mp->sbm_dyn_segs;
1265 		s_mp->sbm_dyn_segs = NULL;
1266 
1267 		/*
1268 		 * If the target memory range with the new target base PA
1269 		 * extends beyond the usable slice, prevent any "target excess"
1270 		 * from being added back after this copy/rename and
1271 		 * calculate the new smaller size of the target board
1272 		 * to be set as part of target cleanup. The base + npages
1273 		 * must only include the range of memory up to the end of
1274 		 * this slice. This will only be used after a category 4
1275 		 * large-to-small target type copy/rename - see comments
1276 		 * in dr_select_mem_target.
1277 		 */
1278 		if (((t_new_basepa & sm) + _ptob64(t_mp->sbm_npages)) > sz) {
1279 			t_new_smallsize = sz - (t_new_basepa & sm);
1280 		}
1281 
1282 		if (s_mp->sbm_flags & DR_MFLAG_MEMRESIZE &&
1283 		    t_new_smallsize == 0) {
1284 			struct memlist	*t_excess_mlist;
1285 
1286 			/*
1287 			 * Add back excess target memory.
1288 			 * Subtract out the portion of the target memory
1289 			 * node that was taken over by the source memory
1290 			 * node.
1291 			 */
1292 			t_excess_mlist = memlist_dup(t_mp->sbm_mlist);
1293 			for (ml = s_copy_mlist; ml; ml = ml->next) {
1294 				t_excess_mlist =
1295 				    memlist_del_span(t_excess_mlist,
1296 				    ml->address, ml->size);
1297 			}
1298 
1299 			/*
1300 			 * Update dynamically added segs
1301 			 */
1302 			for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1303 				t_mp->sbm_dyn_segs =
1304 				    memlist_del_span(t_mp->sbm_dyn_segs,
1305 				    ml->address, ml->size);
1306 			}
1307 			for (ml = t_excess_mlist; ml; ml = ml->next) {
1308 				t_mp->sbm_dyn_segs =
1309 				    memlist_cat_span(t_mp->sbm_dyn_segs,
1310 				    ml->address, ml->size);
1311 			}
1312 			PR_MEM("%s: %s: updated dynamic seg list:\n",
1313 			    f, t_mp->sbm_cm.sbdev_path);
1314 			PR_MEMLIST_DUMP(t_mp->sbm_dyn_segs);
1315 
1316 			PR_MEM("%s: adding back remaining portion"
1317 				" of %s, memlist:\n",
1318 				f, t_mp->sbm_cm.sbdev_path);
1319 			PR_MEMLIST_DUMP(t_excess_mlist);
1320 
1321 			dr_add_memory_spans(s_mp, t_excess_mlist);
1322 			memlist_delete(t_excess_mlist);
1323 		}
1324 		memlist_delete(s_copy_mlist);
1325 
1326 #ifdef DEBUG
1327 		/*
1328 		 * Renaming s_mp->sbm_del_mlist is not necessary.  This
1329 		 * list is not used beyond this point, and in fact, is
1330 		 * disposed of at the end of this function.
1331 		 */
1332 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1333 			ml->address -= s_old_basepa;
1334 			ml->address += s_new_basepa;
1335 		}
1336 
1337 		PR_MEM("%s: renamed source delete memlist", f);
1338 		PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1339 #endif
1340 
1341 	}
1342 
1343 	if (t_mp != NULL) {
1344 		/* delete target's entire address space */
1345 		err = drmach_mem_del_span(
1346 			t_mp->sbm_cm.sbdev_id, t_old_basepa & ~ sm, sz);
1347 		if (err)
1348 			DRERR_SET_C(&t_mp->sbm_cm.sbdev_error, &err);
1349 		ASSERT(err == NULL);
1350 
1351 		/*
1352 		 * After the copy/rename, the original address space
1353 		 * for the source board (which is now located on the
1354 		 * target board) may now have some excess to be deleted.
1355 		 * The amount is calculated by masking the slice
1356 		 * info and keeping the slice offset from t_new_basepa.
1357 		 */
1358 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1359 				s_old_basepa & ~ sm, t_new_basepa & sm);
1360 		if (err)
1361 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1362 		ASSERT(err == NULL);
1363 
1364 	} else {
1365 		/* delete board's entire address space */
1366 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1367 						s_old_basepa & ~ sm, sz);
1368 		if (err)
1369 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1370 		ASSERT(err == NULL);
1371 	}
1372 
1373 cleanup:
1374 	/* clean up target mem unit */
1375 	if (t_mp != NULL) {
1376 		memlist_delete(t_mp->sbm_del_mlist);
1377 		/* no need to delete sbm_mlist, it shares sbm_del_mlist */
1378 
1379 		t_mp->sbm_del_mlist = NULL;
1380 		t_mp->sbm_mlist = NULL;
1381 		t_mp->sbm_peer = NULL;
1382 		t_mp->sbm_flags = 0;
1383 		t_mp->sbm_cm.sbdev_busy = 0;
1384 		dr_init_mem_unit_data(t_mp);
1385 
1386 		/* reduce target size if new PAs go past end of usable slice */
1387 		if (t_new_smallsize > 0) {
1388 			t_mp->sbm_npages = _b64top(t_new_smallsize);
1389 			PR_MEM("%s: target new size 0x%lx bytes\n",
1390 				f, t_new_smallsize);
1391 		}
1392 	}
1393 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error == NULL) {
1394 		/*
1395 		 * now that copy/rename has completed, undo this
1396 		 * work that was done in dr_release_mem_done.
1397 		 */
1398 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1399 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1400 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1401 	}
1402 
1403 	/*
1404 	 * clean up (source) board's mem unit structure.
1405 	 * NOTE: sbm_mlist is retained if no error has been record (in other
1406 	 * words, when s_mp->sbm_cm.sbdev_error is NULL). This memlist is
1407 	 * referred to elsewhere as the cached memlist.  The cached memlist
1408 	 * is used to re-attach (configure back in) this memunit from the
1409 	 * unconfigured state.  The memlist is retained because it may
1410 	 * represent bad pages that were detected while the memory was
1411 	 * configured into the OS.  The OS deletes bad pages from phys_install.
1412 	 * Those deletes, if any, will be represented in the cached mlist.
1413 	 */
1414 	if (s_mp->sbm_del_mlist && s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1415 		memlist_delete(s_mp->sbm_del_mlist);
1416 
1417 	if (s_mp->sbm_cm.sbdev_error && s_mp->sbm_mlist) {
1418 		memlist_delete(s_mp->sbm_mlist);
1419 		s_mp->sbm_mlist = NULL;
1420 	}
1421 
1422 	if (s_mp->sbm_dyn_segs != NULL && s_mp->sbm_cm.sbdev_error == 0) {
1423 		memlist_delete(s_mp->sbm_dyn_segs);
1424 		s_mp->sbm_dyn_segs = NULL;
1425 	}
1426 
1427 	s_mp->sbm_del_mlist = NULL;
1428 	s_mp->sbm_peer = NULL;
1429 	s_mp->sbm_flags = 0;
1430 	s_mp->sbm_cm.sbdev_busy = 0;
1431 	dr_init_mem_unit_data(s_mp);
1432 
1433 	PR_MEM("%s: cached memlist for %s:", f, s_mp->sbm_cm.sbdev_path);
1434 	PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1435 
1436 	return (0);
1437 }
1438 
1439 /*
1440  * Successful return from this function will have the memory
1441  * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated
1442  * and waiting.  This routine's job is to select the memory that
1443  * actually has to be released (detached) which may not necessarily
1444  * be the same memory node that came in in devlist[],
1445  * i.e. a copy-rename is needed.
1446  */
1447 int
1448 dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1449 {
1450 	int		d;
1451 	int		err_flag = 0;
1452 	static fn_t	f = "dr_pre_release_mem";
1453 
1454 	PR_MEM("%s...\n", f);
1455 
1456 	for (d = 0; d < devnum; d++) {
1457 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1458 		int		rv;
1459 		memquery_t	mq;
1460 		struct memlist	*ml;
1461 
1462 		if (mp->sbm_cm.sbdev_error) {
1463 			err_flag = 1;
1464 			continue;
1465 		} else if (!kcage_on) {
1466 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_KCAGE_OFF);
1467 			err_flag = 1;
1468 			continue;
1469 		}
1470 
1471 		if (mp->sbm_flags & DR_MFLAG_RESERVED) {
1472 			/*
1473 			 * Board is currently involved in a delete
1474 			 * memory operation. Can't detach this guy until
1475 			 * that operation completes.
1476 			 */
1477 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_INVAL);
1478 			err_flag = 1;
1479 			break;
1480 		}
1481 
1482 		/*
1483 		 * Check whether the detaching memory requires a
1484 		 * copy-rename.
1485 		 */
1486 		ASSERT(mp->sbm_npages != 0);
1487 		rv = kphysm_del_span_query(
1488 			mp->sbm_basepfn, mp->sbm_npages, &mq);
1489 		if (rv != KPHYSM_OK) {
1490 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1491 			err_flag = 1;
1492 			break;
1493 		}
1494 
1495 		if (mq.nonrelocatable != 0) {
1496 			if (!(dr_cmd_flags(hp) &
1497 				(SBD_FLAG_FORCE | SBD_FLAG_QUIESCE_OKAY))) {
1498 				/* caller wasn't prompted for a suspend */
1499 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1500 					ESBD_QUIESCE_REQD);
1501 				err_flag = 1;
1502 				break;
1503 			}
1504 		}
1505 
1506 		/* flags should be clean at this time */
1507 		ASSERT(mp->sbm_flags == 0);
1508 
1509 		ASSERT(mp->sbm_mlist == NULL);		/* should be null */
1510 		ASSERT(mp->sbm_del_mlist == NULL);	/* should be null */
1511 		if (mp->sbm_mlist != NULL) {
1512 			memlist_delete(mp->sbm_mlist);
1513 			mp->sbm_mlist = NULL;
1514 		}
1515 
1516 		ml = dr_get_memlist(mp);
1517 		if (ml == NULL) {
1518 			err_flag = 1;
1519 			PR_MEM("%s: no memlist found for %s\n",
1520 				f, mp->sbm_cm.sbdev_path);
1521 			continue;
1522 		}
1523 
1524 		/* allocate a kphysm handle */
1525 		rv = kphysm_del_gethandle(&mp->sbm_memhandle);
1526 		if (rv != KPHYSM_OK) {
1527 			memlist_delete(ml);
1528 
1529 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1530 			err_flag = 1;
1531 			break;
1532 		}
1533 		mp->sbm_flags |= DR_MFLAG_RELOWNER;
1534 
1535 		if ((mq.nonrelocatable != 0) ||
1536 			dr_reserve_mem_spans(&mp->sbm_memhandle, ml)) {
1537 			/*
1538 			 * Either the detaching memory node contains
1539 			 * non-reloc memory or we failed to reserve the
1540 			 * detaching memory node (which did _not_ have
1541 			 * any non-reloc memory, i.e. some non-reloc mem
1542 			 * got onboard).
1543 			 */
1544 
1545 			if (dr_select_mem_target(hp, mp, ml)) {
1546 				int rv;
1547 
1548 				/*
1549 				 * We had no luck locating a target
1550 				 * memory node to be the recipient of
1551 				 * the non-reloc memory on the node
1552 				 * we're trying to detach.
1553 				 * Clean up be disposing the mem handle
1554 				 * and the mem list.
1555 				 */
1556 				rv = kphysm_del_release(mp->sbm_memhandle);
1557 				if (rv != KPHYSM_OK) {
1558 					/*
1559 					 * can do nothing but complain
1560 					 * and hope helpful for debug
1561 					 */
1562 					cmn_err(CE_WARN, "%s: unexpected"
1563 						" kphysm_del_release return"
1564 						" value %d",
1565 						f, rv);
1566 				}
1567 				mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1568 
1569 				memlist_delete(ml);
1570 
1571 				/* make sure sbm_flags is clean */
1572 				ASSERT(mp->sbm_flags == 0);
1573 
1574 				dr_dev_err(CE_WARN,
1575 					&mp->sbm_cm, ESBD_NO_TARGET);
1576 
1577 				err_flag = 1;
1578 				break;
1579 			}
1580 
1581 			/*
1582 			 * ml is not memlist_delete'd here because
1583 			 * it has been assigned to mp->sbm_mlist
1584 			 * by dr_select_mem_target.
1585 			 */
1586 		} else {
1587 			/* no target needed to detach this board */
1588 			mp->sbm_flags |= DR_MFLAG_RESERVED;
1589 			mp->sbm_peer = NULL;
1590 			mp->sbm_del_mlist = ml;
1591 			mp->sbm_mlist = ml;
1592 			mp->sbm_cm.sbdev_busy = 1;
1593 		}
1594 #ifdef DEBUG
1595 		ASSERT(mp->sbm_mlist != NULL);
1596 
1597 		if (mp->sbm_flags & DR_MFLAG_SOURCE) {
1598 			PR_MEM("%s: release of %s requires copy/rename;"
1599 				" selected target board %s\n",
1600 				f,
1601 				mp->sbm_cm.sbdev_path,
1602 				mp->sbm_peer->sbm_cm.sbdev_path);
1603 		} else {
1604 			PR_MEM("%s: copy/rename not required to release %s\n",
1605 				f, mp->sbm_cm.sbdev_path);
1606 		}
1607 
1608 		ASSERT(mp->sbm_flags & DR_MFLAG_RELOWNER);
1609 		ASSERT(mp->sbm_flags & DR_MFLAG_RESERVED);
1610 #endif
1611 	}
1612 
1613 	return (err_flag ? -1 : 0);
1614 }
1615 
1616 void
1617 dr_release_mem_done(dr_common_unit_t *cp)
1618 {
1619 	dr_mem_unit_t	*s_mp = (dr_mem_unit_t *)cp;
1620 	dr_mem_unit_t *t_mp, *mp;
1621 	int		rv;
1622 	static fn_t	f = "dr_release_mem_done";
1623 
1624 	/*
1625 	 * This unit will be flagged with DR_MFLAG_SOURCE, if it
1626 	 * has a target unit.
1627 	 */
1628 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1629 		t_mp = s_mp->sbm_peer;
1630 		ASSERT(t_mp != NULL);
1631 		ASSERT(t_mp->sbm_peer == s_mp);
1632 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1633 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RESERVED);
1634 	} else {
1635 		/* this is no target unit */
1636 		t_mp = NULL;
1637 	}
1638 
1639 	/* free delete handle */
1640 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RELOWNER);
1641 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RESERVED);
1642 	rv = kphysm_del_release(s_mp->sbm_memhandle);
1643 	if (rv != KPHYSM_OK) {
1644 		/*
1645 		 * can do nothing but complain
1646 		 * and hope helpful for debug
1647 		 */
1648 		cmn_err(CE_WARN, "%s: unexpected kphysm_del_release"
1649 			" return value %d", f, rv);
1650 	}
1651 	s_mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1652 
1653 	/*
1654 	 * If an error was encountered during release, clean up
1655 	 * the source (and target, if present) unit data.
1656 	 */
1657 /* XXX Can we know that sbdev_error was encountered during release? */
1658 	if (s_mp->sbm_cm.sbdev_error != NULL) {
1659 		PR_MEM("%s: %s: error %d noted\n",
1660 			f,
1661 			s_mp->sbm_cm.sbdev_path,
1662 			s_mp->sbm_cm.sbdev_error->e_code);
1663 
1664 		if (t_mp != NULL) {
1665 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1666 			t_mp->sbm_del_mlist = NULL;
1667 
1668 			if (t_mp->sbm_mlist != NULL) {
1669 				memlist_delete(t_mp->sbm_mlist);
1670 				t_mp->sbm_mlist = NULL;
1671 			}
1672 
1673 			t_mp->sbm_peer = NULL;
1674 			t_mp->sbm_flags = 0;
1675 			t_mp->sbm_cm.sbdev_busy = 0;
1676 		}
1677 
1678 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1679 			memlist_delete(s_mp->sbm_del_mlist);
1680 		s_mp->sbm_del_mlist = NULL;
1681 
1682 		if (s_mp->sbm_mlist != NULL) {
1683 			memlist_delete(s_mp->sbm_mlist);
1684 			s_mp->sbm_mlist = NULL;
1685 		}
1686 
1687 		s_mp->sbm_peer = NULL;
1688 		s_mp->sbm_flags = 0;
1689 		s_mp->sbm_cm.sbdev_busy = 0;
1690 
1691 		/* bail out */
1692 		return;
1693 	}
1694 
1695 	DR_DEV_SET_RELEASED(&s_mp->sbm_cm);
1696 	dr_device_transition(&s_mp->sbm_cm, DR_STATE_RELEASE);
1697 
1698 	if (t_mp != NULL) {
1699 		/*
1700 		 * the kphysm delete operation that drained the source
1701 		 * board also drained this target board.  Since the source
1702 		 * board drain is now known to have succeeded, we know this
1703 		 * target board is drained too.
1704 		 *
1705 		 * because DR_DEV_SET_RELEASED and dr_device_transition
1706 		 * is done here, the dr_release_dev_done should not
1707 		 * fail.
1708 		 */
1709 		DR_DEV_SET_RELEASED(&t_mp->sbm_cm);
1710 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_RELEASE);
1711 
1712 		/*
1713 		 * NOTE: do not transition target's board state,
1714 		 * even if the mem-unit was the last configure
1715 		 * unit of the board.  When copy/rename completes
1716 		 * this mem-unit will transitioned back to
1717 		 * the configured state.  In the meantime, the
1718 		 * board's must remain as is.
1719 		 */
1720 	}
1721 
1722 	/* if board(s) had deleted memory, verify it is gone */
1723 	rv = 0;
1724 	memlist_read_lock();
1725 	if (s_mp->sbm_del_mlist != NULL) {
1726 		mp = s_mp;
1727 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1728 	}
1729 	if (rv == 0 && t_mp && t_mp->sbm_del_mlist != NULL) {
1730 		mp = t_mp;
1731 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1732 	}
1733 	memlist_read_unlock();
1734 	if (rv) {
1735 		cmn_err(CE_WARN, "%s: %smem-unit (%d.%d): "
1736 			"deleted memory still found in phys_install",
1737 			f,
1738 			(mp == t_mp ? "target " : ""),
1739 			mp->sbm_cm.sbdev_bp->b_num,
1740 			mp->sbm_cm.sbdev_unum);
1741 
1742 		DR_DEV_INTERNAL_ERROR(&s_mp->sbm_cm);
1743 		return;
1744 	}
1745 
1746 	s_mp->sbm_flags |= DR_MFLAG_RELDONE;
1747 	if (t_mp != NULL)
1748 		t_mp->sbm_flags |= DR_MFLAG_RELDONE;
1749 
1750 	/* this should not fail */
1751 	if (dr_release_dev_done(&s_mp->sbm_cm) != 0) {
1752 		/* catch this in debug kernels */
1753 		ASSERT(0);
1754 		return;
1755 	}
1756 
1757 	PR_MEM("%s: marking %s release DONE\n",
1758 		f, s_mp->sbm_cm.sbdev_path);
1759 
1760 	s_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1761 
1762 	if (t_mp != NULL) {
1763 		/* should not fail */
1764 		rv = dr_release_dev_done(&t_mp->sbm_cm);
1765 		if (rv != 0) {
1766 			/* catch this in debug kernels */
1767 			ASSERT(0);
1768 			return;
1769 		}
1770 
1771 		PR_MEM("%s: marking %s release DONE\n",
1772 			f, t_mp->sbm_cm.sbdev_path);
1773 
1774 		t_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1775 	}
1776 }
1777 
1778 /*ARGSUSED*/
1779 int
1780 dr_disconnect_mem(dr_mem_unit_t *mp)
1781 {
1782 	static fn_t	f = "dr_disconnect_mem";
1783 	update_membounds_t umb;
1784 
1785 #ifdef DEBUG
1786 	int state = mp->sbm_cm.sbdev_state;
1787 	ASSERT(state == DR_STATE_CONNECTED ||
1788 		state == DR_STATE_UNCONFIGURED);
1789 #endif
1790 
1791 	PR_MEM("%s...\n", f);
1792 
1793 	if (mp->sbm_del_mlist && mp->sbm_del_mlist != mp->sbm_mlist)
1794 		memlist_delete(mp->sbm_del_mlist);
1795 	mp->sbm_del_mlist = NULL;
1796 
1797 	if (mp->sbm_mlist) {
1798 		memlist_delete(mp->sbm_mlist);
1799 		mp->sbm_mlist = NULL;
1800 	}
1801 
1802 	/*
1803 	 * Remove memory from lgroup
1804 	 * For now, only board info is required.
1805 	 */
1806 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1807 	umb.u_base = (uint64_t)-1;
1808 	umb.u_len = (uint64_t)-1;
1809 
1810 	lgrp_plat_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)&umb);
1811 
1812 	return (0);
1813 }
1814 
1815 int
1816 dr_cancel_mem(dr_mem_unit_t *s_mp)
1817 {
1818 	dr_mem_unit_t	*t_mp;
1819 	dr_state_t	state;
1820 	static fn_t	f = "dr_cancel_mem";
1821 
1822 	state = s_mp->sbm_cm.sbdev_state;
1823 
1824 	if (s_mp->sbm_flags & DR_MFLAG_TARGET) {
1825 		/* must cancel source board, not target board */
1826 		/* TODO: set error */
1827 		return (-1);
1828 	} else if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1829 		t_mp = s_mp->sbm_peer;
1830 		ASSERT(t_mp != NULL);
1831 		ASSERT(t_mp->sbm_peer == s_mp);
1832 
1833 		/* must always match the source board's state */
1834 /* TODO: is this assertion correct? */
1835 		ASSERT(t_mp->sbm_cm.sbdev_state == state);
1836 	} else {
1837 		/* this is no target unit */
1838 		t_mp = NULL;
1839 	}
1840 
1841 	switch (state) {
1842 	case DR_STATE_UNREFERENCED:	/* state set by dr_release_dev_done */
1843 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1844 
1845 		if (t_mp != NULL && t_mp->sbm_del_mlist != NULL) {
1846 			PR_MEM("%s: undoing target %s memory delete\n",
1847 				f, t_mp->sbm_cm.sbdev_path);
1848 			dr_add_memory_spans(t_mp, t_mp->sbm_del_mlist);
1849 
1850 			DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1851 		}
1852 
1853 		if (s_mp->sbm_del_mlist != NULL) {
1854 			PR_MEM("%s: undoing %s memory delete\n",
1855 				f, s_mp->sbm_cm.sbdev_path);
1856 
1857 			dr_add_memory_spans(s_mp, s_mp->sbm_del_mlist);
1858 		}
1859 
1860 		/*FALLTHROUGH*/
1861 
1862 /* TODO: should no longer be possible to see the release state here */
1863 	case DR_STATE_RELEASE:	/* state set by dr_release_mem_done */
1864 
1865 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1866 
1867 		if (t_mp != NULL) {
1868 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1869 			t_mp->sbm_del_mlist = NULL;
1870 
1871 			if (t_mp->sbm_mlist != NULL) {
1872 				memlist_delete(t_mp->sbm_mlist);
1873 				t_mp->sbm_mlist = NULL;
1874 			}
1875 
1876 			t_mp->sbm_peer = NULL;
1877 			t_mp->sbm_flags = 0;
1878 			t_mp->sbm_cm.sbdev_busy = 0;
1879 			dr_init_mem_unit_data(t_mp);
1880 
1881 			DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1882 
1883 			dr_device_transition(
1884 				&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1885 		}
1886 
1887 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1888 			memlist_delete(s_mp->sbm_del_mlist);
1889 		s_mp->sbm_del_mlist = NULL;
1890 
1891 		if (s_mp->sbm_mlist != NULL) {
1892 			memlist_delete(s_mp->sbm_mlist);
1893 			s_mp->sbm_mlist = NULL;
1894 		}
1895 
1896 		s_mp->sbm_peer = NULL;
1897 		s_mp->sbm_flags = 0;
1898 		s_mp->sbm_cm.sbdev_busy = 0;
1899 		dr_init_mem_unit_data(s_mp);
1900 
1901 		return (0);
1902 
1903 	default:
1904 		PR_MEM("%s: WARNING unexpected state (%d) for %s\n",
1905 			f, (int)state, s_mp->sbm_cm.sbdev_path);
1906 
1907 		return (-1);
1908 	}
1909 	/*NOTREACHED*/
1910 }
1911 
1912 void
1913 dr_init_mem_unit(dr_mem_unit_t *mp)
1914 {
1915 	dr_state_t	new_state;
1916 
1917 
1918 	if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) {
1919 		new_state = DR_STATE_CONFIGURED;
1920 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1921 	} else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) {
1922 		new_state = DR_STATE_CONNECTED;
1923 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1924 	} else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) {
1925 		new_state = DR_STATE_OCCUPIED;
1926 	} else {
1927 		new_state = DR_STATE_EMPTY;
1928 	}
1929 
1930 	if (DR_DEV_IS_PRESENT(&mp->sbm_cm))
1931 		dr_init_mem_unit_data(mp);
1932 
1933 	/* delay transition until fully initialized */
1934 	dr_device_transition(&mp->sbm_cm, new_state);
1935 }
1936 
1937 static void
1938 dr_init_mem_unit_data(dr_mem_unit_t *mp)
1939 {
1940 	drmachid_t	id = mp->sbm_cm.sbdev_id;
1941 	uint64_t	bytes;
1942 	sbd_error_t	*err;
1943 	static fn_t	f = "dr_init_mem_unit_data";
1944 	update_membounds_t umb;
1945 
1946 	PR_MEM("%s...\n", f);
1947 
1948 	/* a little sanity checking */
1949 	ASSERT(mp->sbm_peer == NULL);
1950 	ASSERT(mp->sbm_flags == 0);
1951 
1952 	/* get basepfn of mem unit */
1953 	err = drmach_mem_get_base_physaddr(id, &bytes);
1954 	if (err) {
1955 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1956 		mp->sbm_basepfn = (pfn_t)-1;
1957 	} else
1958 		mp->sbm_basepfn = _b64top(bytes);
1959 
1960 	/* attempt to get number of pages from PDA */
1961 	err = drmach_mem_get_size(id, &bytes);
1962 	if (err) {
1963 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1964 		mp->sbm_npages = 0;
1965 	} else
1966 		mp->sbm_npages = _b64top(bytes);
1967 
1968 	/* if didn't work, calculate using memlist */
1969 	if (mp->sbm_npages == 0) {
1970 		struct memlist	*ml, *mlist;
1971 		/*
1972 		 * Either we couldn't open the PDA or our
1973 		 * PDA has garbage in it.  We must have the
1974 		 * page count consistent and whatever the
1975 		 * OS states has precedence over the PDA
1976 		 * so let's check the kernel.
1977 		 */
1978 /* TODO: curious comment. it suggests pda query should happen if this fails */
1979 		PR_MEM("%s: PDA query failed for npages."
1980 			" Checking memlist for %s\n",
1981 			f, mp->sbm_cm.sbdev_path);
1982 
1983 		mlist = dr_get_memlist(mp);
1984 		for (ml = mlist; ml; ml = ml->next)
1985 			mp->sbm_npages += btop(ml->size);
1986 		memlist_delete(mlist);
1987 	}
1988 
1989 	err = drmach_mem_get_alignment(id, &bytes);
1990 	if (err) {
1991 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1992 		mp->sbm_alignment_mask = 0;
1993 	} else
1994 		mp->sbm_alignment_mask = _b64top(bytes);
1995 
1996 	err = drmach_mem_get_slice_size(id, &bytes);
1997 	if (err) {
1998 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1999 		mp->sbm_slice_size = 0; /* paranoia */
2000 	} else
2001 		mp->sbm_slice_size = bytes;
2002 
2003 	/*
2004 	 * Add memory to lgroup
2005 	 */
2006 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
2007 	umb.u_base = (uint64_t)mp->sbm_basepfn << MMU_PAGESHIFT;
2008 	umb.u_len = (uint64_t)mp->sbm_npages << MMU_PAGESHIFT;
2009 
2010 	lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb);
2011 
2012 	PR_MEM("%s: %s (basepfn = 0x%lx, npgs = %ld)\n",
2013 		f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages);
2014 }
2015 
2016 static int
2017 dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *ml)
2018 {
2019 	int		err;
2020 	pfn_t		base;
2021 	pgcnt_t		npgs;
2022 	struct memlist	*mc;
2023 	static fn_t	f = "dr_reserve_mem_spans";
2024 
2025 	PR_MEM("%s...\n", f);
2026 
2027 	/*
2028 	 * Walk the supplied memlist scheduling each span for removal
2029 	 * with kphysm_del_span.  It is possible that a span may intersect
2030 	 * an area occupied by the cage.
2031 	 */
2032 	for (mc = ml; mc != NULL; mc = mc->next) {
2033 		base = _b64top(mc->address);
2034 		npgs = _b64top(mc->size);
2035 
2036 		err = kphysm_del_span(*mhp, base, npgs);
2037 		if (err != KPHYSM_OK) {
2038 			cmn_err(CE_WARN, "%s memory reserve failed."
2039 				" unexpected kphysm_del_span return value %d;"
2040 				" basepfn=0x%lx npages=%ld",
2041 				f, err, base, npgs);
2042 
2043 			return (-1);
2044 		}
2045 	}
2046 
2047 	return (0);
2048 }
2049 
2050 /*
 * debug counters
 * Both are file-scope, non-static and carry no initializer.
 * NOTE(review): the code that updates them lies outside this excerpt;
 * dr_smt_preference[] presumably has one slot per target category
 * (n_sets is 4 in dr_select_mem_target) -- confirm against its use.
 */
2051 int dr_smt_realigned;
2052 int dr_smt_preference[4];
2053 
2054 #ifdef DEBUG
2055 uint_t dr_ignore_board; /* if bit[bnum-1] set, board won't be candidate */
2056 #endif
2057 
2058 /*
2059  * Find and reserve a copy/rename target board suitable for the
2060  * given source board.
2061  * All boards in the system are examined and categorized in relation to
2062  * their memory size versus the source board's memory size.  Order of
2063  * preference is:
2064  *	1st: board has same memory size
2065  * 	2nd: board has larger memory size
2066  *	3rd: board has smaller memory size
2067  *	4th: board has smaller memory size, available memory will be reduced.
2068  * Boards in category 3 and 4 will have their MCs reprogrammed to relocate
2069  * the span to which the MC responds to an address span that appropriately
2070  * covers the nonrelocatable span of the source board.
2071  */
2072 static int
2073 dr_select_mem_target(dr_handle_t *hp,
2074 	dr_mem_unit_t *s_mp, struct memlist *s_ml)
2075 {
2076 	pgcnt_t		sz = _b64top(s_mp->sbm_slice_size);
2077 	pgcnt_t		sm = sz - 1; /* mem_slice_mask */
2078 	pfn_t		s_phi, t_phi;
2079 
2080 	int		n_sets = 4; /* same, larger, smaller, clipped */
2081 	int		preference; /* lower value is higher preference */
2082 	int		n_units_per_set;
2083 	int		idx;
2084 	dr_mem_unit_t	**sets;
2085 
2086 	int		t_bd;
2087 	int		t_unit;
2088 	int		rv;
2089 	int		allow_src_memrange_modify;
2090 	int		allow_targ_memrange_modify;
2091 	drmachid_t	t_id;
2092 	dr_board_t	*s_bp, *t_bp;
2093 	dr_mem_unit_t	*t_mp, *c_mp;
2094 	struct memlist	*d_ml, *t_ml, *x_ml;
2095 	memquery_t	s_mq = {0};
2096 	static fn_t	f = "dr_select_mem_target";
2097 
2098 	PR_MEM("%s...\n", f);
2099 
2100 	ASSERT(s_ml != NULL);
2101 
2102 	n_units_per_set = MAX_BOARDS * MAX_MEM_UNITS_PER_BOARD;
2103 	sets = GETSTRUCT(dr_mem_unit_t *, n_units_per_set * n_sets);
2104 
2105 	s_bp = hp->h_bd;
2106 	/* calculate the offset into the slice of the last source board pfn */
2107 	ASSERT(s_mp->sbm_npages != 0);
2108 	s_phi = (s_mp->sbm_basepfn + s_mp->sbm_npages - 1) & sm;
2109 
2110 	allow_src_memrange_modify = drmach_allow_memrange_modify(s_bp->b_id);
2111 
2112 	/*
2113 	 * Make one pass through all memory units on all boards
2114 	 * and categorize them with respect to the source board.
2115 	 */
2116 	for (t_bd = 0; t_bd < MAX_BOARDS; t_bd++) {
2117 		/*
2118 		 * The board structs are a contiguous array
2119 		 * so we take advantage of that to find the
2120 		 * correct board struct pointer for a given
2121 		 * board number.
2122 		 */
2123 		t_bp = dr_lookup_board(t_bd);
2124 
2125 		/* source board can not be its own target */
2126 		if (s_bp->b_num == t_bp->b_num)
2127 			continue;
2128 
2129 		for (t_unit = 0; t_unit < MAX_MEM_UNITS_PER_BOARD; t_unit++) {
2130 
2131 			t_mp = dr_get_mem_unit(t_bp, t_unit);
2132 
2133 			/* this memory node must be attached */
2134 			if (!DR_DEV_IS_ATTACHED(&t_mp->sbm_cm))
2135 				continue;
2136 
2137 			/* source unit can not be its own target */
2138 			if (s_mp == t_mp) {
2139 				/* catch this is debug kernels */
2140 				ASSERT(0);
2141 				continue;
2142 			}
2143 
2144 			/*
2145 			 * this memory node must not already be reserved
2146 			 * by some other memory delete operation.
2147 			 */
2148 			if (t_mp->sbm_flags & DR_MFLAG_RESERVED)
2149 				continue;
2150 
2151 			/*
2152 			 * categorize the memory node
2153 			 * If this is a smaller memory node, create a
2154 			 * temporary, edited copy of the source board's
2155 			 * memlist containing only the span of the non-
2156 			 * relocatable pages.
2157 			 */
2158 			t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2159 			t_id = t_mp->sbm_cm.sbdev_bp->b_id;
2160 			allow_targ_memrange_modify =
2161 			    drmach_allow_memrange_modify(t_id);
2162 			if (t_mp->sbm_npages == s_mp->sbm_npages &&
2163 			    t_phi == s_phi) {
2164 				preference = 0;
2165 				t_mp->sbm_slice_offset = 0;
2166 			} else if (t_mp->sbm_npages > s_mp->sbm_npages &&
2167 			    t_phi > s_phi) {
2168 				/*
2169 				 * Selecting this target will require modifying
2170 				 * the source and/or target physical address
2171 				 * ranges.  Skip if not supported by platform.
2172 				 */
2173 				if (!allow_src_memrange_modify ||
2174 				    !allow_targ_memrange_modify) {
2175 					PR_MEM("%s: skip target %s, memory "
2176 					    "range relocation not supported "
2177 					    "by platform\n", f,
2178 					    t_mp->sbm_cm.sbdev_path);
2179 					continue;
2180 				}
2181 				preference = 1;
2182 				t_mp->sbm_slice_offset = 0;
2183 			} else {
2184 				pfn_t		pfn = 0;
2185 
2186 				/*
2187 				 * Selecting this target will require modifying
2188 				 * the source and/or target physical address
2189 				 * ranges.  Skip if not supported by platform.
2190 				 */
2191 				if (!allow_src_memrange_modify ||
2192 				    !allow_targ_memrange_modify) {
2193 					PR_MEM("%s: skip target %s, memory "
2194 					    "range relocation not supported "
2195 					    "by platform\n", f,
2196 					    t_mp->sbm_cm.sbdev_path);
2197 					continue;
2198 				}
2199 
2200 				/*
2201 				 * Check if its mc can be programmed to relocate
2202 				 * the active address range to match the
2203 				 * nonrelocatable span of the source board.
2204 				 */
2205 				preference = 2;
2206 
2207 				if (s_mq.phys_pages == 0) {
2208 					/*
2209 					 * find non-relocatable span on
2210 					 * source board.
2211 					 */
2212 					rv = kphysm_del_span_query(
2213 						s_mp->sbm_basepfn,
2214 						s_mp->sbm_npages, &s_mq);
2215 					if (rv != KPHYSM_OK) {
2216 						PR_MEM("%s: %s: unexpected"
2217 						" kphysm_del_span_query"
2218 						" return value %d;"
2219 						" basepfn 0x%lx, npages %ld\n",
2220 						f,
2221 						s_mp->sbm_cm.sbdev_path,
2222 						rv,
2223 						s_mp->sbm_basepfn,
2224 						s_mp->sbm_npages);
2225 
2226 						/* paranoia */
2227 						s_mq.phys_pages = 0;
2228 
2229 						continue;
2230 					}
2231 
2232 					/* more paranoia */
2233 					ASSERT(s_mq.phys_pages != 0);
2234 					ASSERT(s_mq.nonrelocatable != 0);
2235 
2236 					/*
2237 					 * this should not happen
2238 					 * if it does, it simply means that
2239 					 * we can not proceed with qualifying
2240 					 * this target candidate.
2241 					 */
2242 					if (s_mq.nonrelocatable == 0)
2243 						continue;
2244 
2245 					PR_MEM("%s: %s: nonrelocatable"
2246 						" span (0x%lx..0x%lx)\n",
2247 						f,
2248 						s_mp->sbm_cm.sbdev_path,
2249 						s_mq.first_nonrelocatable,
2250 						s_mq.last_nonrelocatable);
2251 				}
2252 
2253 				/*
2254 				 * Round down the starting pfn of the
2255 				 * nonrelocatable span on the source board
2256 				 * to nearest programmable boundary possible
2257 				 * with this target candidate.
2258 				 */
2259 				pfn = s_mq.first_nonrelocatable &
2260 					~t_mp->sbm_alignment_mask;
2261 
2262 				/* skip candidate if memory is too small */
2263 				if (pfn + t_mp->sbm_npages <
2264 					s_mq.last_nonrelocatable)
2265 					continue;
2266 
2267 				/*
2268 				 * reprogramming an mc to relocate its
2269 				 * active address range means the beginning
2270 				 * address to which the DIMMS respond will
2271 				 * be somewhere above the slice boundary
2272 				 * address.  The larger the size of memory
2273 				 * on this unit, the more likely part of it
2274 				 * will exist beyond the end of the slice.
2275 				 * The portion of the memory that does is
2276 				 * unavailable to the system until the mc
2277 				 * reprogrammed to a more favorable base
2278 				 * address.
2279 				 * An attempt is made to avoid the loss by
2280 				 * recalculating the mc base address relative
2281 				 * to the end of the slice.  This may produce
2282 				 * a more favorable result.  If not, we lower
2283 				 * the board's preference rating so that it
2284 				 * is one the last candidate boards to be
2285 				 * considered.
2286 				 */
2287 				if ((pfn + t_mp->sbm_npages) & ~sm) {
2288 					pfn_t p;
2289 
2290 					ASSERT(sz >= t_mp->sbm_npages);
2291 
2292 					/*
2293 					 * calculate an alternative starting
2294 					 * address relative to the end of the
2295 					 * slice's address space.
2296 					 */
2297 					p = pfn & ~sm;
2298 					p = p + (sz - t_mp->sbm_npages);
2299 					p = p & ~t_mp->sbm_alignment_mask;
2300 
2301 					if ((p > s_mq.first_nonrelocatable) ||
2302 						(p + t_mp->sbm_npages <
2303 						s_mq.last_nonrelocatable)) {
2304 
2305 						/*
2306 						 * alternative starting addr
2307 						 * won't work. Lower preference
2308 						 * rating of this board, since
2309 						 * some number of pages will
2310 						 * unavailable for use.
2311 						 */
2312 						preference = 3;
2313 					} else {
2314 						dr_smt_realigned++;
2315 						pfn = p;
2316 					}
2317 				}
2318 
2319 				/*
2320 				 * translate calculated pfn to an offset
2321 				 * relative to the slice boundary.  If the
2322 				 * candidate board is selected, this offset
2323 				 * will be used to calculate the values
2324 				 * programmed into the mc.
2325 				 */
2326 				t_mp->sbm_slice_offset = pfn & sm;
2327 				PR_MEM("%s: %s:"
2328 					"  proposed mc offset 0x%lx\n",
2329 					f,
2330 					t_mp->sbm_cm.sbdev_path,
2331 					t_mp->sbm_slice_offset);
2332 			}
2333 
2334 			dr_smt_preference[preference]++;
2335 
2336 			/* calculate index to start of preference set */
2337 			idx  = n_units_per_set * preference;
2338 			/* calculate offset to respective element */
2339 			idx += t_bd * MAX_MEM_UNITS_PER_BOARD + t_unit;
2340 
2341 			ASSERT(idx < n_units_per_set * n_sets);
2342 			sets[idx] = t_mp;
2343 		}
2344 	}
2345 
2346 	/*
2347 	 * NOTE: this would be a good place to sort each candidate
2348 	 * set in to some desired order, e.g. memory size in ascending
2349 	 * order.  Without an additional sorting step here, the order
2350 	 * within a set is ascending board number order.
2351 	 */
2352 
2353 	c_mp = NULL;
2354 	x_ml = NULL;
2355 	t_ml = NULL;
2356 	for (idx = 0; idx < n_units_per_set * n_sets; idx++) {
2357 		memquery_t mq;
2358 
2359 		/* cleanup t_ml after previous pass */
2360 		if (t_ml != NULL) {
2361 			memlist_delete(t_ml);
2362 			t_ml = NULL;
2363 		}
2364 
2365 		/* get candidate target board mem unit */
2366 		t_mp = sets[idx];
2367 		if (t_mp == NULL)
2368 			continue;
2369 
2370 		/* get target board memlist */
2371 		t_ml = dr_get_memlist(t_mp);
2372 		if (t_ml == NULL) {
2373 			cmn_err(CE_WARN, "%s: no memlist for"
2374 				" mem-unit %d, board %d",
2375 				f,
2376 				t_mp->sbm_cm.sbdev_bp->b_num,
2377 				t_mp->sbm_cm.sbdev_unum);
2378 
2379 			continue;
2380 		}
2381 
2382 		/* get appropriate source board memlist */
2383 		t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2384 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2385 			spgcnt_t excess;
2386 
2387 			/*
2388 			 * make a copy of the source board memlist
2389 			 * then edit it to remove the spans that
2390 			 * are outside the calculated span of
2391 			 * [pfn..s_mq.last_nonrelocatable].
2392 			 */
2393 			if (x_ml != NULL)
2394 				memlist_delete(x_ml);
2395 
2396 			x_ml = memlist_dup(s_ml);
2397 			if (x_ml == NULL) {
2398 				PR_MEM("%s: memlist_dup failed\n", f);
2399 				/* TODO: should abort */
2400 				continue;
2401 			}
2402 
2403 			/* trim off lower portion */
2404 			excess = t_mp->sbm_slice_offset -
2405 			    (s_mp->sbm_basepfn & sm);
2406 
2407 			if (excess > 0) {
2408 				x_ml = memlist_del_span(
2409 					x_ml,
2410 					_ptob64(s_mp->sbm_basepfn),
2411 					_ptob64(excess));
2412 			}
2413 			ASSERT(x_ml);
2414 
2415 			/*
2416 			 * Since this candidate target board is smaller
2417 			 * than the source board, s_mq must have been
2418 			 * initialized in previous loop while processing
2419 			 * this or some other candidate board.
2420 			 * FIXME: this is weak.
2421 			 */
2422 			ASSERT(s_mq.phys_pages != 0);
2423 
2424 			/* trim off upper portion */
2425 			excess = (s_mp->sbm_basepfn + s_mp->sbm_npages)
2426 				- (s_mq.last_nonrelocatable + 1);
2427 			if (excess > 0) {
2428 				pfn_t p;
2429 
2430 				p  = s_mq.last_nonrelocatable + 1;
2431 				x_ml = memlist_del_span(
2432 					x_ml,
2433 					_ptob64(p),
2434 					_ptob64(excess));
2435 			}
2436 
2437 			PR_MEM("%s: %s: edited source memlist:\n",
2438 				f, s_mp->sbm_cm.sbdev_path);
2439 			PR_MEMLIST_DUMP(x_ml);
2440 
2441 #ifdef DEBUG
2442 			/* sanity check memlist */
2443 			d_ml = x_ml;
2444 			while (d_ml->next != NULL)
2445 				d_ml = d_ml->next;
2446 
2447 			ASSERT(d_ml->address + d_ml->size ==
2448 				_ptob64(s_mq.last_nonrelocatable + 1));
2449 #endif
2450 
2451 			/*
2452 			 * x_ml now describes only the portion of the
2453 			 * source board that will be moved during the
2454 			 * copy/rename operation.
2455 			 */
2456 			d_ml = x_ml;
2457 		} else {
2458 			/* use original memlist; all spans will be moved */
2459 			d_ml = s_ml;
2460 		}
2461 
2462 		/* verify target can support source memory spans. */
2463 		if (memlist_canfit(d_ml, t_ml) == 0) {
2464 			PR_MEM("%s: source memlist won't"
2465 				" fit in target memlist\n", f);
2466 			PR_MEM("%s: source memlist:\n", f);
2467 			PR_MEMLIST_DUMP(d_ml);
2468 			PR_MEM("%s: target memlist:\n", f);
2469 			PR_MEMLIST_DUMP(t_ml);
2470 
2471 			continue;
2472 		}
2473 
2474 		/* NOTE: the value of d_ml is not used beyond this point */
2475 
2476 		PR_MEM("%s: checking for no-reloc in %s, "
2477 			" basepfn=0x%lx, npages=%ld\n",
2478 			f,
2479 			t_mp->sbm_cm.sbdev_path,
2480 			t_mp->sbm_basepfn,
2481 			t_mp->sbm_npages);
2482 
2483 		rv = kphysm_del_span_query(
2484 			t_mp->sbm_basepfn, t_mp->sbm_npages, &mq);
2485 		if (rv != KPHYSM_OK) {
2486 			PR_MEM("%s: kphysm_del_span_query:"
2487 				" unexpected return value %d\n", f, rv);
2488 
2489 			continue;
2490 		}
2491 
2492 		if (mq.nonrelocatable != 0) {
2493 			PR_MEM("%s: candidate %s has"
2494 				" nonrelocatable span [0x%lx..0x%lx]\n",
2495 				f,
2496 				t_mp->sbm_cm.sbdev_path,
2497 				mq.first_nonrelocatable,
2498 				mq.last_nonrelocatable);
2499 
2500 			continue;
2501 		}
2502 
2503 #ifdef DEBUG
2504 		/*
2505 		 * This is a debug tool for excluding certain boards
2506 		 * from being selected as a target board candidate.
2507 		 * dr_ignore_board is only tested by this driver.
2508 		 * It must be set with adb, obp, /etc/system or your
2509 		 * favorite debugger.
2510 		 */
2511 		if (dr_ignore_board &
2512 			(1 << (t_mp->sbm_cm.sbdev_bp->b_num - 1))) {
2513 			PR_MEM("%s: dr_ignore_board flag set,"
2514 				" ignoring %s as candidate\n",
2515 				f, t_mp->sbm_cm.sbdev_path);
2516 			continue;
2517 		}
2518 #endif
2519 
2520 		/*
2521 		 * Reserve excess source board memory, if any.
2522 		 *
2523 		 * When the number of pages on the candidate target
2524 		 * board is less than the number of pages on the source,
2525 		 * then some spans (clearly) of the source board's address
2526 		 * space will not be covered by physical memory after the
2527 		 * copy/rename completes.  The following code block
2528 		 * schedules those spans to be deleted.
2529 		 */
2530 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2531 			pfn_t pfn;
2532 			uint64_t s_del_pa;
2533 			struct memlist *ml;
2534 
2535 			d_ml = memlist_dup(s_ml);
2536 			if (d_ml == NULL) {
2537 				PR_MEM("%s: cant dup src brd memlist\n", f);
2538 				/* TODO: should abort */
2539 				continue;
2540 			}
2541 
2542 			/* calculate base pfn relative to target board */
2543 			pfn  = s_mp->sbm_basepfn & ~sm;
2544 			pfn += t_mp->sbm_slice_offset;
2545 
2546 			/*
2547 			 * cannot split dynamically added segment
2548 			 */
2549 			s_del_pa = _ptob64(pfn + t_mp->sbm_npages);
2550 			PR_MEM("%s: proposed src delete pa=0x%lx\n", f,
2551 			    s_del_pa);
2552 			PR_MEM("%s: checking for split of dyn seg list:\n", f);
2553 			PR_MEMLIST_DUMP(s_mp->sbm_dyn_segs);
2554 			for (ml = s_mp->sbm_dyn_segs; ml; ml = ml->next) {
2555 				if (s_del_pa > ml->address &&
2556 				    s_del_pa < ml->address + ml->size) {
2557 					s_del_pa = ml->address;
2558 					break;
2559 				}
2560 			}
2561 
2562 			/* remove span that will reside on candidate board */
2563 			d_ml = memlist_del_span(d_ml, _ptob64(pfn),
2564 			    s_del_pa - _ptob64(pfn));
2565 
2566 			PR_MEM("%s: %s: reserving src brd memlist:\n",
2567 				f, s_mp->sbm_cm.sbdev_path);
2568 			PR_MEMLIST_DUMP(d_ml);
2569 
2570 			/* reserve excess spans */
2571 			if (dr_reserve_mem_spans(
2572 				&s_mp->sbm_memhandle, d_ml) != 0) {
2573 
2574 				/* likely more non-reloc pages appeared */
2575 				/* TODO: restart from top? */
2576 				continue;
2577 			}
2578 		} else {
2579 			/* no excess source board memory */
2580 			d_ml = NULL;
2581 		}
2582 
2583 		s_mp->sbm_flags |= DR_MFLAG_RESERVED;
2584 
2585 		/*
2586 		 * reserve all memory on target board.
2587 		 * NOTE: source board's memhandle is used.
2588 		 *
2589 		 * If this succeeds (eq 0), then target selection is
2590 		 * complete and all unwanted memory spans, both source and
2591 		 * target, have been reserved.  Loop is terminated.
2592 		 */
2593 		if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, t_ml) == 0) {
2594 			PR_MEM("%s: %s: target board memory reserved\n",
2595 				f, t_mp->sbm_cm.sbdev_path);
2596 
2597 			/* a candidate target board is now reserved */
2598 			t_mp->sbm_flags |= DR_MFLAG_RESERVED;
2599 			c_mp = t_mp;
2600 
2601 			/* *** EXITING LOOP *** */
2602 			break;
2603 		}
2604 
2605 		/* did not successfully reserve the target board. */
2606 		PR_MEM("%s: could not reserve target %s\n",
2607 			f, t_mp->sbm_cm.sbdev_path);
2608 
2609 		/*
2610 		 * NOTE: an undo of the dr_reserve_mem_span work
2611 		 * will happen automatically when the memhandle
2612 		 * (s_mp->sbm_memhandle) is kphysm_del_release'd.
2613 		 */
2614 
2615 		s_mp->sbm_flags &= ~DR_MFLAG_RESERVED;
2616 	}
2617 
2618 	/* clean up after memlist editing logic */
2619 	if (x_ml != NULL)
2620 		memlist_delete(x_ml);
2621 
2622 	FREESTRUCT(sets, dr_mem_unit_t *, n_units_per_set * n_sets);
2623 
2624 	/*
2625 	 * c_mp will be NULL when the entire sets[] array
2626 	 * has been searched without reserving a target board.
2627 	 */
2628 	if (c_mp == NULL) {
2629 		PR_MEM("%s: %s: target selection failed.\n",
2630 			f, s_mp->sbm_cm.sbdev_path);
2631 
2632 		if (t_ml != NULL)
2633 			memlist_delete(t_ml);
2634 
2635 		return (-1);
2636 	}
2637 
2638 	PR_MEM("%s: found target %s for source %s\n",
2639 		f,
2640 		c_mp->sbm_cm.sbdev_path,
2641 		s_mp->sbm_cm.sbdev_path);
2642 
2643 	s_mp->sbm_peer = c_mp;
2644 	s_mp->sbm_flags |= DR_MFLAG_SOURCE;
2645 	s_mp->sbm_del_mlist = d_ml;	/* spans to be deleted, if any */
2646 	s_mp->sbm_mlist = s_ml;
2647 	s_mp->sbm_cm.sbdev_busy = 1;
2648 
2649 	c_mp->sbm_peer = s_mp;
2650 	c_mp->sbm_flags |= DR_MFLAG_TARGET;
2651 	c_mp->sbm_del_mlist = t_ml;	/* spans to be deleted */
2652 	c_mp->sbm_mlist = t_ml;
2653 	c_mp->sbm_cm.sbdev_busy = 1;
2654 
2655 	s_mp->sbm_flags &= ~DR_MFLAG_MEMRESIZE;
2656 	if (c_mp->sbm_npages > s_mp->sbm_npages) {
2657 		s_mp->sbm_flags |= DR_MFLAG_MEMUPSIZE;
2658 		PR_MEM("%s: upsize detected (source=%ld < target=%ld)\n",
2659 			f, s_mp->sbm_npages, c_mp->sbm_npages);
2660 	} else if (c_mp->sbm_npages < s_mp->sbm_npages) {
2661 		s_mp->sbm_flags |= DR_MFLAG_MEMDOWNSIZE;
2662 		PR_MEM("%s: downsize detected (source=%ld > target=%ld)\n",
2663 			f, s_mp->sbm_npages, c_mp->sbm_npages);
2664 	}
2665 
2666 	return (0);
2667 }
2668 
2669 /*
2670  * Memlist support.
2671  */
2672 static struct memlist *
2673 memlist_dup(struct memlist *mlist)
2674 {
2675 	struct memlist *hl = NULL, *tl, **mlp;
2676 
2677 	if (mlist == NULL)
2678 		return (NULL);
2679 
2680 	mlp = &hl;
2681 	tl = *mlp;
2682 	for (; mlist; mlist = mlist->next) {
2683 		*mlp = GETSTRUCT(struct memlist, 1);
2684 		(*mlp)->address = mlist->address;
2685 		(*mlp)->size = mlist->size;
2686 		(*mlp)->prev = tl;
2687 		tl = *mlp;
2688 		mlp = &((*mlp)->next);
2689 	}
2690 	*mlp = NULL;
2691 
2692 	return (hl);
2693 }
2694 
2695 /*
2696  * Determine whether the source memlist (s_mlist) will
2697  * fit into the target memlist (t_mlist) in terms of
2698  * size and holes (i.e. based on same relative base address).
2699  */
2700 static int
2701 memlist_canfit(struct memlist *s_mlist, struct memlist *t_mlist)
2702 {
2703 	int		rv = 0;
2704 	uint64_t	s_basepa, t_basepa;
2705 	struct memlist	*s_ml, *t_ml;
2706 
2707 	if ((s_mlist == NULL) || (t_mlist == NULL))
2708 		return (0);
2709 
2710 	/*
2711 	 * Base both memlists on common base address (0).
2712 	 */
2713 	s_basepa = s_mlist->address;
2714 	t_basepa = t_mlist->address;
2715 
2716 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2717 		s_ml->address -= s_basepa;
2718 
2719 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2720 		t_ml->address -= t_basepa;
2721 
2722 	s_ml = s_mlist;
2723 	for (t_ml = t_mlist; t_ml && s_ml; t_ml = t_ml->next) {
2724 		uint64_t	s_start, s_end;
2725 		uint64_t	t_start, t_end;
2726 
2727 		t_start = t_ml->address;
2728 		t_end = t_start + t_ml->size;
2729 
2730 		for (; s_ml; s_ml = s_ml->next) {
2731 			s_start = s_ml->address;
2732 			s_end = s_start + s_ml->size;
2733 
2734 			if ((s_start < t_start) || (s_end > t_end))
2735 				break;
2736 		}
2737 	}
2738 	/*
2739 	 * If we ran out of source memlist chunks that mean
2740 	 * we found a home for all of them.
2741 	 */
2742 	if (s_ml == NULL)
2743 		rv = 1;
2744 
2745 	/*
2746 	 * Need to add base addresses back since memlists
2747 	 * are probably in use by caller.
2748 	 */
2749 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2750 		s_ml->address += s_basepa;
2751 
2752 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2753 		t_ml->address += t_basepa;
2754 
2755 	return (rv);
2756 }
2757 
2758 static struct memlist *
2759 memlist_del_span(struct memlist *mlist, uint64_t base, uint64_t len)
2760 {
2761 	uint64_t	end;
2762 	struct memlist	*ml, *tl, *nlp;
2763 
2764 	if (mlist == NULL)
2765 		return (NULL);
2766 
2767 	end = base + len;
2768 	if ((end <= mlist->address) || (base == end))
2769 		return (mlist);
2770 
2771 	for (tl = ml = mlist; ml; tl = ml, ml = nlp) {
2772 		uint64_t	mend;
2773 
2774 		nlp = ml->next;
2775 
2776 		if (end <= ml->address)
2777 			break;
2778 
2779 		mend = ml->address + ml->size;
2780 		if (base < mend) {
2781 			if (base <= ml->address) {
2782 				ml->address = end;
2783 				if (end >= mend)
2784 					ml->size = 0ull;
2785 				else
2786 					ml->size = mend - ml->address;
2787 			} else {
2788 				ml->size = base - ml->address;
2789 				if (end < mend) {
2790 					struct memlist	*nl;
2791 					/*
2792 					 * splitting an memlist entry.
2793 					 */
2794 					nl = GETSTRUCT(struct memlist, 1);
2795 					nl->address = end;
2796 					nl->size = mend - nl->address;
2797 					if ((nl->next = nlp) != NULL)
2798 						nlp->prev = nl;
2799 					nl->prev = ml;
2800 					ml->next = nl;
2801 					nlp = nl;
2802 				}
2803 			}
2804 			if (ml->size == 0ull) {
2805 				if (ml == mlist) {
2806 					if ((mlist = nlp) != NULL)
2807 						nlp->prev = NULL;
2808 					FREESTRUCT(ml, struct memlist, 1);
2809 					if (mlist == NULL)
2810 						break;
2811 					ml = nlp;
2812 				} else {
2813 					if ((tl->next = nlp) != NULL)
2814 						nlp->prev = tl;
2815 					FREESTRUCT(ml, struct memlist, 1);
2816 					ml = tl;
2817 				}
2818 			}
2819 		}
2820 	}
2821 
2822 	return (mlist);
2823 }
2824 
2825 /*
2826  * add span without merging
2827  */
2828 static struct memlist *
2829 memlist_cat_span(struct memlist *mlist, uint64_t base, uint64_t len)
2830 {
2831 	struct memlist	*ml, *tl, *nl;
2832 
2833 	if (len == 0ull)
2834 		return (NULL);
2835 
2836 	if (mlist == NULL) {
2837 		mlist = GETSTRUCT(struct memlist, 1);
2838 		mlist->address = base;
2839 		mlist->size = len;
2840 		mlist->next = mlist->prev = NULL;
2841 
2842 		return (mlist);
2843 	}
2844 
2845 	for (tl = ml = mlist; ml; tl = ml, ml = ml->next) {
2846 		if (base < ml->address) {
2847 			nl = GETSTRUCT(struct memlist, 1);
2848 			nl->address = base;
2849 			nl->size = len;
2850 			nl->next = ml;
2851 			if ((nl->prev = ml->prev) != NULL)
2852 				nl->prev->next = nl;
2853 			ml->prev = nl;
2854 			if (mlist == ml)
2855 				mlist = nl;
2856 			break;
2857 		}
2858 	}
2859 
2860 	if (ml == NULL) {
2861 		nl = GETSTRUCT(struct memlist, 1);
2862 		nl->address = base;
2863 		nl->size = len;
2864 		nl->next = NULL;
2865 		nl->prev = tl;
2866 		tl->next = nl;
2867 	}
2868 
2869 	return (mlist);
2870 }
2871