xref: /illumos-gate/usr/src/uts/sun4u/ngdr/io/dr_mem.c (revision 89b2a9fbeabf42fa54594df0e5927bcc50a07cc9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * DR memory support routines.
28  */
29 
30 #include <sys/note.h>
31 #include <sys/debug.h>
32 #include <sys/types.h>
33 #include <sys/errno.h>
34 #include <sys/param.h>
35 #include <sys/dditypes.h>
36 #include <sys/kmem.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/sunndi.h>
41 #include <sys/ddi_impldefs.h>
42 #include <sys/ndi_impldefs.h>
43 #include <sys/sysmacros.h>
44 #include <sys/machsystm.h>
45 #include <sys/spitregs.h>
46 #include <sys/cpuvar.h>
47 #include <sys/promif.h>
48 #include <vm/seg_kmem.h>
49 #include <sys/lgrp.h>
50 #include <sys/platform_module.h>
51 
52 #include <vm/page.h>
53 
54 #include <sys/dr.h>
55 #include <sys/dr_util.h>
56 
57 extern struct memlist	*phys_install;
58 
59 /* TODO: push this reference below drmach line */
60 extern int		kcage_on;
61 
62 /* for the DR*INTERNAL_ERROR macros.  see sys/dr.h. */
63 static char *dr_ie_fmt = "dr_mem.c %d";
64 
65 static int	dr_post_detach_mem_unit(dr_mem_unit_t *mp);
66 static int	dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *mlist);
67 static int	dr_select_mem_target(dr_handle_t *hp, dr_mem_unit_t *mp,
68     struct memlist *ml);
69 static void	dr_init_mem_unit_data(dr_mem_unit_t *mp);
70 
71 static int 	memlist_canfit(struct memlist *s_mlist,
72     struct memlist *t_mlist);
73 
74 /*
75  * dr_mem_unit_t.sbm_flags
76  */
77 #define	DR_MFLAG_RESERVED	0x01	/* mem unit reserved for delete */
78 #define	DR_MFLAG_SOURCE		0x02	/* source brd of copy/rename op */
79 #define	DR_MFLAG_TARGET		0x04	/* target brd of copy/rename op */
80 #define	DR_MFLAG_MEMUPSIZE	0x08	/* move from big to small board */
81 #define	DR_MFLAG_MEMDOWNSIZE	0x10	/* move from small to big board */
82 #define	DR_MFLAG_MEMRESIZE	0x18	/* move to different size board */
83 #define	DR_MFLAG_RELOWNER	0x20	/* memory release (delete) owner */
84 #define	DR_MFLAG_RELDONE	0x40	/* memory release (delete) done */
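/*
 * Note that DR_MFLAG_MEMRESIZE (0x18) is simply the bitwise OR of
 * DR_MFLAG_MEMUPSIZE (0x08) and DR_MFLAG_MEMDOWNSIZE (0x10), so a test
 * such as (sbm_flags & DR_MFLAG_MEMRESIZE) matches a move to a
 * differently sized board in either direction.
 */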
85 
86 /* helper macros */
87 #define	_ptob64(p) ((uint64_t)(p) << PAGESHIFT)
88 #define	_b64top(b) ((pgcnt_t)((b) >> PAGESHIFT))
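/*
 * For example, with the 8KB base page used on sun4u (PAGESHIFT == 13),
 * _ptob64(0x10000) yields 0x20000000 (512MB) and _b64top(0x20000000)
 * yields 0x10000 again.
 */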
89 
90 static struct memlist *
91 dr_get_memlist(dr_mem_unit_t *mp)
92 {
93 	struct memlist	*mlist = NULL;
94 	sbd_error_t	*err;
95 	static fn_t	f = "dr_get_memlist";
96 
97 	PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path);
98 
99 	/*
100 	 * Return cached memlist, if present.
101 	 * This memlist will be present following an
102 	 * unconfigure (a.k.a. detach) of this memunit.
103 	 * It should only be used in the case where a configure
104 	 * is bringing this memunit back in without going
105 	 * through the disconnect and connect states.
106 	 */
107 	if (mp->sbm_mlist) {
108 		PR_MEM("%s: found cached memlist\n", f);
109 
110 		mlist = memlist_dup(mp->sbm_mlist);
111 	} else {
112 		uint64_t basepa = _ptob64(mp->sbm_basepfn);
113 
114 		/* attempt to construct a memlist using phys_install */
115 
116 		/* round down to slice base address */
117 		basepa &= ~(mp->sbm_slice_size - 1);
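		/*
		 * sbm_slice_size is assumed to be a power of two, so
		 * the mask above clears the offset-within-slice bits.
		 * As a hypothetical example, with a 32GB slice
		 * (0x800000000) a basepa of 0x850000000 rounds down to
		 * 0x800000000.
		 */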
118 
119 		/* get a copy of phys_install to edit */
120 		memlist_read_lock();
121 		mlist = memlist_dup(phys_install);
122 		memlist_read_unlock();
123 
124 		/* trim lower irrelevant span */
125 		if (mlist)
126 			mlist = memlist_del_span(mlist, 0ull, basepa);
127 
128 		/* trim upper irrelevant span */
129 		if (mlist) {
130 			uint64_t endpa;
131 
132 			basepa += mp->sbm_slice_size;
133 			endpa = _ptob64(physmax + 1);
134 			if (endpa > basepa)
135 				mlist = memlist_del_span(
136 				    mlist,
137 				    basepa,
138 				    endpa - basepa);
139 		}
140 
141 		if (mlist) {
142 			/* successfully built a memlist */
143 			PR_MEM("%s: derived memlist from phys_install\n", f);
144 		}
145 
146 		/* if no mlist yet, try platform layer */
147 		if (!mlist) {
148 			err = drmach_mem_get_memlist(
149 			    mp->sbm_cm.sbdev_id, &mlist);
150 			if (err) {
151 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
152 				mlist = NULL; /* paranoia */
153 			}
154 		}
155 	}
156 
157 	PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path);
158 	PR_MEMLIST_DUMP(mlist);
159 
160 	return (mlist);
161 }
162 
163 typedef struct {
164 	kcondvar_t cond;
165 	kmutex_t lock;
166 	int error;
167 	int done;
168 } dr_release_mem_sync_t;
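/*
 * This structure implements a simple completion handshake:
 * dr_release_mem() initializes the mutex/condvar pair, passes the
 * structure to kphysm_del_start() as the callback argument, and then
 * waits on the condvar until dr_mem_del_done() (the completion
 * callback) sets `done' and signals it.
 */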
169 
170 /*
171  * Memory has been logically removed by the time this routine is called.
172  */
173 static void
174 dr_mem_del_done(void *arg, int error)
175 {
176 	dr_release_mem_sync_t *ds = arg;
177 
178 	mutex_enter(&ds->lock);
179 	ds->error = error;
180 	ds->done = 1;
181 	cv_signal(&ds->cond);
182 	mutex_exit(&ds->lock);
183 }
184 
185 /*
186  * When we reach here the memory being drained should have
187  * already been reserved in dr_pre_release_mem().
188  * Our only task here is to kick off the "drain" and wait
189  * for it to finish.
190  */
191 void
192 dr_release_mem(dr_common_unit_t *cp)
193 {
194 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
195 	int		err;
196 	dr_release_mem_sync_t rms;
197 	static fn_t	f = "dr_release_mem";
198 
199 	/* check that this memory unit has been reserved */
200 	if (!(mp->sbm_flags & DR_MFLAG_RELOWNER)) {
201 		DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
202 		return;
203 	}
204 
205 	bzero((void *) &rms, sizeof (rms));
206 
207 	mutex_init(&rms.lock, NULL, MUTEX_DRIVER, NULL);
208 	cv_init(&rms.cond, NULL, CV_DRIVER, NULL);
209 
210 	mutex_enter(&rms.lock);
211 	err = kphysm_del_start(mp->sbm_memhandle, dr_mem_del_done,
212 	    (void *) &rms);
213 	if (err == KPHYSM_OK) {
214 		/* wait for completion or interrupt */
215 		while (!rms.done) {
216 			if (cv_wait_sig(&rms.cond, &rms.lock) == 0) {
217 				/* then there is a pending UNIX signal */
218 				(void) kphysm_del_cancel(mp->sbm_memhandle);
219 
220 				/* wait for completion */
221 				while (!rms.done)
222 					cv_wait(&rms.cond, &rms.lock);
223 			}
224 		}
225 		/* get the result of the memory delete operation */
226 		err = rms.error;
227 	}
228 	mutex_exit(&rms.lock);
229 
230 	cv_destroy(&rms.cond);
231 	mutex_destroy(&rms.lock);
232 
233 	if (err != KPHYSM_OK) {
234 		int e_code;
235 
236 		switch (err) {
237 			case KPHYSM_ENOWORK:
238 				e_code = ESBD_NOERROR;
239 				break;
240 
241 			case KPHYSM_EHANDLE:
242 			case KPHYSM_ESEQUENCE:
243 				e_code = ESBD_INTERNAL;
244 				break;
245 
246 			case KPHYSM_ENOTVIABLE:
247 				e_code = ESBD_MEM_NOTVIABLE;
248 				break;
249 
250 			case KPHYSM_EREFUSED:
251 				e_code = ESBD_MEM_REFUSED;
252 				break;
253 
254 			case KPHYSM_ENONRELOC:
255 				e_code = ESBD_MEM_NONRELOC;
256 				break;
257 
258 			case KPHYSM_ECANCELLED:
259 				e_code = ESBD_MEM_CANCELLED;
260 				break;
261 
262 			case KPHYSM_ERESOURCE:
263 				e_code = ESBD_MEMFAIL;
264 				break;
265 
266 			default:
267 				cmn_err(CE_WARN,
268 				    "%s: unexpected kphysm error code %d,"
269 				    " id 0x%p",
270 				    f, err, mp->sbm_cm.sbdev_id);
271 
272 				e_code = ESBD_IO;
273 				break;
274 		}
275 
276 		if (e_code != ESBD_NOERROR) {
277 			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
278 		}
279 	}
280 }
281 
282 void
283 dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
284 {
286 
287 	dr_mem_unit_t	*mp = (dr_mem_unit_t *)cp;
288 	struct memlist	*ml, *mc;
289 	sbd_error_t	*err;
290 	static fn_t	f = "dr_attach_mem";
291 
292 	PR_MEM("%s...\n", f);
293 
294 	dr_lock_status(hp->h_bd);
295 	err = drmach_configure(cp->sbdev_id, 0);
296 	dr_unlock_status(hp->h_bd);
297 	if (err) {
298 		DRERR_SET_C(&cp->sbdev_error, &err);
299 		return;
300 	}
301 
302 	ml = dr_get_memlist(mp);
303 	for (mc = ml; mc; mc = mc->next) {
304 		int		 rv;
305 		sbd_error_t	*err;
306 
307 		rv = kphysm_add_memory_dynamic(
308 		    (pfn_t)(mc->address >> PAGESHIFT),
309 		    (pgcnt_t)(mc->size >> PAGESHIFT));
310 		if (rv != KPHYSM_OK) {
311 			/*
312 			 * translate kphysm error and
313 			 * store in devlist error
314 			 */
315 			switch (rv) {
316 			case KPHYSM_ERESOURCE:
317 				rv = ESBD_NOMEM;
318 				break;
319 
320 			case KPHYSM_EFAULT:
321 				rv = ESBD_FAULT;
322 				break;
323 
324 			default:
325 				rv = ESBD_INTERNAL;
326 				break;
327 			}
328 
329 			if (rv == ESBD_INTERNAL) {
330 				DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
331 			} else
332 				dr_dev_err(CE_WARN, &mp->sbm_cm, rv);
333 			break;
334 		}
335 
336 		err = drmach_mem_add_span(
337 		    mp->sbm_cm.sbdev_id, mc->address, mc->size);
338 		if (err) {
339 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
340 			break;
341 		}
342 	}
343 
344 	memlist_delete(ml);
345 
346 	/* back out if configure failed */
347 	if (mp->sbm_cm.sbdev_error != NULL) {
348 		dr_lock_status(hp->h_bd);
349 		err = drmach_unconfigure(cp->sbdev_id,
350 		    DEVI_BRANCH_DESTROY);
351 		if (err)
352 			sbd_err_clear(&err);
353 		dr_unlock_status(hp->h_bd);
354 	}
355 }
356 
357 #define	DR_SCRUB_VALUE	0x0d0e0a0d0b0e0e0fULL
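/*
 * The scrub pattern itself is arbitrary; the low nibble of each byte
 * spells the familiar "deadbeef" (d-e-a-d-b-e-e-f).  Any value would
 * do, since the scrub is about cacheline ownership, not the data
 * written.
 */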
358 
359 static void
360 dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
361 {
362 #ifdef DEBUG
363 	clock_t		stime = ddi_get_lbolt();
364 #endif /* DEBUG */
365 
366 	struct memlist	*ml;
367 	uint64_t	scrub_value = DR_SCRUB_VALUE;
368 	processorid_t	cpuid;
369 	static fn_t	f = "dr_mem_ecache_scrub";
370 
371 	cpuid = drmach_mem_cpu_affinity(mp->sbm_cm.sbdev_id);
372 	affinity_set(cpuid);
373 
374 	PR_MEM("%s: using proc %d, memlist...\n", f,
375 	    (cpuid == CPU_CURRENT) ? CPU->cpu_id : cpuid);
376 	PR_MEMLIST_DUMP(mlist);
377 
378 	for (ml = mlist; ml; ml = ml->next) {
379 		uint64_t	dst_pa;
380 		uint64_t	nbytes;
381 
382 		/* calculate the destination physical address */
383 		dst_pa = ml->address;
384 		if (ml->address & PAGEOFFSET)
385 			cmn_err(CE_WARN,
386 			    "%s: address (0x%lx) not on "
387 			    "page boundary", f, ml->address);
388 
389 		nbytes = ml->size;
390 		if (ml->size & PAGEOFFSET)
391 			cmn_err(CE_WARN,
392 			    "%s: size (0x%lx) not on "
393 			    "page boundary", f, ml->size);
394 
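		/*
		 * One 64-bit store per coherency unit is sufficient
		 * here: each store gains exclusive ownership of that
		 * cacheline in this CPU's ecache, which is what the
		 * scrub is after; the stored value is never read back.
		 */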
395 		/*LINTED*/
396 		while (nbytes > 0) {
397 			/* write 64 bits to dst_pa */
398 			stdphys(dst_pa, scrub_value);
399 
400 			/* increment/decrement by cacheline sizes */
401 			dst_pa += DRMACH_COHERENCY_UNIT;
402 			nbytes -= DRMACH_COHERENCY_UNIT;
403 		}
404 	}
405 
406 	/*
407 	 * flush this cpu's ecache and take care to ensure
408 	 * that all of its bus transactions have retired.
409 	 */
410 	drmach_cpu_flush_ecache_sync();
411 
412 	affinity_clear();
413 
414 #ifdef DEBUG
415 	stime = ddi_get_lbolt() - stime;
416 	PR_MEM("%s: scrub ticks = %ld (%ld secs)\n", f, stime, stime / hz);
417 #endif /* DEBUG */
418 }
419 
420 static int
421 dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
422 {
423 	time_t		 copytime;
424 	drmachid_t	 cr_id;
425 	dr_sr_handle_t	*srhp;
426 	struct memlist	*c_ml, *d_ml;
427 	sbd_error_t	*err;
428 	static fn_t	 f = "dr_move_memory";
429 
430 	PR_MEM("%s: (INLINE) moving memory from %s to %s\n",
431 	    f,
432 	    s_mp->sbm_cm.sbdev_path,
433 	    t_mp->sbm_cm.sbdev_path);
434 
435 	ASSERT(s_mp->sbm_flags & DR_MFLAG_SOURCE);
436 	ASSERT(s_mp->sbm_peer == t_mp);
437 	ASSERT(s_mp->sbm_mlist);
438 
439 	ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
440 	ASSERT(t_mp->sbm_peer == s_mp);
441 
442 	/*
443 	 * create a memlist of spans to copy by removing
444 	 * the spans that have been deleted, if any, from
445 	 * the full source board memlist.  s_mp->sbm_del_mlist
446 	 * will be NULL if there were no spans deleted from
447 	 * the source board.
448 	 */
449 	c_ml = memlist_dup(s_mp->sbm_mlist);
450 	d_ml = s_mp->sbm_del_mlist;
451 	while (d_ml != NULL) {
452 		c_ml = memlist_del_span(c_ml, d_ml->address, d_ml->size);
453 		d_ml = d_ml->next;
454 	}
455 
456 	affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));
457 
458 	err = drmach_copy_rename_init(
459 	    t_mp->sbm_cm.sbdev_id, _ptob64(t_mp->sbm_slice_offset),
460 	    s_mp->sbm_cm.sbdev_id, c_ml, &cr_id);
461 	if (err) {
462 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
463 		affinity_clear();
464 		return (-1);
465 	}
466 
467 	srhp = dr_get_sr_handle(hp);
468 	ASSERT(srhp);
469 
470 	copytime = ddi_get_lbolt();
471 
472 	/* Quiesce the OS.  */
473 	if (dr_suspend(srhp)) {
474 		cmn_err(CE_WARN, "%s: failed to quiesce OS"
475 		    " for copy-rename", f);
476 
477 		dr_release_sr_handle(srhp);
478 		err = drmach_copy_rename_fini(cr_id);
479 		if (err) {
480 			/*
481 			 * no error is expected since the program has
482 			 * not yet run.
483 			 */
484 
485 			/* catch this in debug kernels */
486 			ASSERT(0);
487 
488 			sbd_err_clear(&err);
489 		}
490 
491 		/* suspend error reached via hp */
492 		s_mp->sbm_cm.sbdev_error = hp->h_err;
493 		hp->h_err = NULL;
494 
495 		affinity_clear();
496 		return (-1);
497 	}
498 
499 	/*
500 	 * Rename memory for lgroup.
501 	 * Source and target board numbers are packaged in arg.
502 	 */
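	/*
	 * The packing is positional: the source board number occupies
	 * the low 16 bits and the target the next 16.  For example,
	 * renaming board 2 onto board 5 would pass 0x00050002.
	 */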
503 	{
504 		dr_board_t	*t_bp, *s_bp;
505 
506 		s_bp = s_mp->sbm_cm.sbdev_bp;
507 		t_bp = t_mp->sbm_cm.sbdev_bp;
508 
509 		lgrp_plat_config(LGRP_CONFIG_MEM_RENAME,
510 		    (uintptr_t)(s_bp->b_num | (t_bp->b_num << 16)));
511 	}
512 
513 	drmach_copy_rename(cr_id);
514 
515 	/* Resume the OS.  */
516 	dr_resume(srhp);
517 
518 	copytime = ddi_get_lbolt() - copytime;
519 
520 	dr_release_sr_handle(srhp);
521 	err = drmach_copy_rename_fini(cr_id);
522 	if (err)
523 		DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
524 
525 	affinity_clear();
526 
527 	PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
528 	    f, copytime, copytime / hz);
529 
530 	/* return -1 if dr_suspend or copy/rename recorded an error */
531 	return (err == NULL ? 0 : -1);
532 }
533 
534 /*
535  * If the detaching node contains memory that is "non-permanent",
536  * then the memory addresses are simply cleared.  If the memory
537  * is non-relocatable, then do a copy-rename.
538  */
539 void
540 dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp)
541 {
542 	int			rv = 0;
543 	dr_mem_unit_t		*s_mp = (dr_mem_unit_t *)cp;
544 	dr_mem_unit_t		*t_mp;
545 	dr_state_t		state;
546 	static fn_t		f = "dr_detach_mem";
547 
548 	PR_MEM("%s...\n", f);
549 
550 	/* lookup target mem unit and target board structure, if any */
551 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
552 		t_mp = s_mp->sbm_peer;
553 		ASSERT(t_mp != NULL);
554 		ASSERT(t_mp->sbm_peer == s_mp);
555 	} else {
556 		t_mp = NULL;
557 	}
558 
559 	/* verify mem unit's state is UNREFERENCED */
560 	state = s_mp->sbm_cm.sbdev_state;
561 	if (state != DR_STATE_UNREFERENCED) {
562 		dr_dev_err(CE_IGNORE, &s_mp->sbm_cm, ESBD_STATE);
563 		return;
564 	}
565 
566 	/* verify target mem unit's state is UNREFERENCED, if any */
567 	if (t_mp != NULL) {
568 		state = t_mp->sbm_cm.sbdev_state;
569 		if (state != DR_STATE_UNREFERENCED) {
570 			dr_dev_err(CE_IGNORE, &t_mp->sbm_cm, ESBD_STATE);
571 			return;
572 		}
573 	}
574 
575 	/*
576 	 * Scrub deleted memory.  This will cause all cachelines
577 	 * referencing the memory to only be in the local cpu's
578 	 * ecache.
579 	 */
580 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
581 		/* no del mlist for src<=dst mem size copy/rename */
582 		if (s_mp->sbm_del_mlist)
583 			dr_mem_ecache_scrub(s_mp, s_mp->sbm_del_mlist);
584 	}
585 	if (t_mp != NULL && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
586 		ASSERT(t_mp->sbm_del_mlist);
587 		dr_mem_ecache_scrub(t_mp, t_mp->sbm_del_mlist);
588 	}
589 
590 	/*
591 	 * If there is no target board (no copy/rename was needed), then
592 	 * we're done!
593 	 */
594 	if (t_mp == NULL) {
595 		sbd_error_t *err;
596 		/*
597 		 * Reprogram interconnect hardware and disable
598 		 * memory controllers for memory node that's going away.
599 		 */
600 
601 		err = drmach_mem_disable(s_mp->sbm_cm.sbdev_id);
602 		if (err) {
603 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
604 			rv = -1;
605 		}
606 	} else {
607 		rv = dr_move_memory(hp, s_mp, t_mp);
608 		PR_MEM("%s: %s memory COPY-RENAME (board %d -> %d)\n",
609 		    f,
610 		    rv ? "FAILED" : "COMPLETED",
611 		    s_mp->sbm_cm.sbdev_bp->b_num,
612 		    t_mp->sbm_cm.sbdev_bp->b_num);
613 
614 		if (rv != 0)
615 			(void) dr_cancel_mem(s_mp);
616 	}
617 
618 	if (rv == 0) {
619 		sbd_error_t *err;
620 
621 		dr_lock_status(hp->h_bd);
622 		err = drmach_unconfigure(s_mp->sbm_cm.sbdev_id,
623 		    DEVI_BRANCH_DESTROY);
624 		dr_unlock_status(hp->h_bd);
625 		if (err)
626 			sbd_err_clear(&err);
627 	}
628 }
629 
630 #ifndef _STARFIRE
631 /*
632  * XXX workaround for certain lab configurations (see also starcat drmach.c)
633  * Temporary code to get around observed incorrect results from
634  * kphysm_del_span_query when the queried span contains address spans
635  * not occupied by memory in between spans that do have memory.
636  * This routine acts as a wrapper to kphysm_del_span_query.  It builds
637  * a memlist from phys_install of spans that exist between base and
638  * base + npages, inclusively.  Kphysm_del_span_query is called for each
639  * node in the memlist with the results accumulated in *mp.
640  */
641 static int
642 dr_del_span_query(pfn_t base, pgcnt_t npages, memquery_t *mp)
643 {
644 	uint64_t	 pa = _ptob64(base);
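	/*
	 * 137438953472 is 1ULL << 37 (128GB), so sm below masks off the
	 * offset within a 128GB address slice.
	 */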
645 	uint64_t	 sm = ~ (137438953472ull - 1);
646 	uint64_t	 sa = pa & sm;
647 	struct memlist	*mlist, *ml;
648 	int		 rv;
649 
650 	npages = npages; /* silence lint */
651 	memlist_read_lock();
652 	mlist = memlist_dup(phys_install);
653 	memlist_read_unlock();
654 
655 again:
656 	for (ml = mlist; ml; ml = ml->next) {
657 		if ((ml->address & sm) != sa) {
658 			mlist = memlist_del_span(mlist, ml->address, ml->size);
659 			goto again;
660 		}
661 	}
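	/*
	 * The goto above restarts the scan from the head because
	 * memlist_del_span() may free or relink nodes, invalidating the
	 * current iteration pointer.
	 */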
662 
663 	mp->phys_pages = 0;
664 	mp->managed = 0;
665 	mp->nonrelocatable = 0;
666 	mp->first_nonrelocatable = (pfn_t)-1;	/* XXX */
667 	mp->last_nonrelocatable = 0;
668 
669 	for (ml = mlist; ml; ml = ml->next) {
670 		memquery_t mq;
671 
672 		rv = kphysm_del_span_query(
673 		    _b64top(ml->address), _b64top(ml->size), &mq);
674 		if (rv)
675 			break;
676 
677 		mp->phys_pages += mq.phys_pages;
678 		mp->managed += mq.managed;
679 		mp->nonrelocatable += mq.nonrelocatable;
680 
681 		if (mq.nonrelocatable != 0) {
682 			if (mq.first_nonrelocatable < mp->first_nonrelocatable)
683 				mp->first_nonrelocatable =
684 				    mq.first_nonrelocatable;
685 			if (mq.last_nonrelocatable > mp->last_nonrelocatable)
686 				mp->last_nonrelocatable =
687 				    mq.last_nonrelocatable;
688 		}
689 	}
690 
691 	if (mp->nonrelocatable == 0)
692 		mp->first_nonrelocatable = 0;	/* XXX */
693 
694 	memlist_delete(mlist);
695 	return (rv);
696 }
697 
698 #define	kphysm_del_span_query dr_del_span_query
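/*
 * From here on, calls to kphysm_del_span_query() in this file resolve
 * to the dr_del_span_query() wrapper above.
 */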
699 #endif /* !_STARFIRE */
700 
701 /*
702  * NOTE: This routine is only partially smart about multiple
703  *	 mem-units.  Need to make mem-status structure smart
704  *	 about them also.
705  */
706 int
707 dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp)
708 {
709 	int		m, mix;
710 	memdelstat_t	mdst;
711 	memquery_t	mq;
712 	dr_board_t	*bp;
713 	dr_mem_unit_t	*mp;
714 	sbd_mem_stat_t	*msp;
715 	static fn_t	f = "dr_mem_status";
716 
717 	bp = hp->h_bd;
718 	devset &= DR_DEVS_PRESENT(bp);
719 
720 	for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) {
721 		int		rv;
722 		sbd_error_t	*err;
723 		drmach_status_t	 pstat;
724 		dr_mem_unit_t	*p_mp;
725 
726 		if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0)
727 			continue;
728 
729 		mp = dr_get_mem_unit(bp, m);
730 
731 		if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) {
732 			/* present, but not fully initialized */
733 			continue;
734 		}
735 
736 		if (mp->sbm_cm.sbdev_id == (drmachid_t)0)
737 			continue;
738 
739 		/* fetch platform status */
740 		err = drmach_status(mp->sbm_cm.sbdev_id, &pstat);
741 		if (err) {
742 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
743 			continue;
744 		}
745 
746 		msp = &dsp->d_mem;
747 		bzero((caddr_t)msp, sizeof (*msp));
748 
749 		(void) strncpy(msp->ms_cm.c_id.c_name, pstat.type,
750 		    sizeof (msp->ms_cm.c_id.c_name));
751 		msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type;
752 		msp->ms_cm.c_id.c_unit = SBD_NULL_UNIT;
753 		msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond;
754 		msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy;
755 		msp->ms_cm.c_time = mp->sbm_cm.sbdev_time;
756 		msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate;
757 
758 		msp->ms_totpages = mp->sbm_npages;
759 		msp->ms_basepfn = mp->sbm_basepfn;
760 		msp->ms_pageslost = mp->sbm_pageslost;
761 		msp->ms_cage_enabled = kcage_on;
762 
763 		if (mp->sbm_flags & DR_MFLAG_RESERVED)
764 			p_mp = mp->sbm_peer;
765 		else
766 			p_mp = NULL;
767 
768 		if (p_mp == NULL) {
769 			msp->ms_peer_is_target = 0;
770 			msp->ms_peer_ap_id[0] = '\0';
771 		} else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) {
772 			char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
773 			char *minor;
774 
775 			/*
776 			 * b_dip doesn't have to be held for ddi_pathname()
777 			 * because the board struct (dr_board_t) will be
778 			 * destroyed before b_dip detaches.
779 			 */
780 			(void) ddi_pathname(bp->b_dip, path);
781 			minor = strchr(p_mp->sbm_cm.sbdev_path, ':');
782 
783 			(void) snprintf(msp->ms_peer_ap_id,
784 			    sizeof (msp->ms_peer_ap_id), "%s%s",
785 			    path, (minor == NULL) ? "" : minor);
786 
787 			kmem_free(path, MAXPATHLEN);
788 
789 			if (p_mp->sbm_flags & DR_MFLAG_TARGET)
790 				msp->ms_peer_is_target = 1;
791 		}
792 
793 		if (mp->sbm_flags & DR_MFLAG_RELOWNER)
794 			rv = kphysm_del_status(mp->sbm_memhandle, &mdst);
795 		else
796 			rv = KPHYSM_EHANDLE;	/* force 'if' to fail */
797 
798 		if (rv == KPHYSM_OK) {
799 			/*
800 			 * Any pages above managed are "free",
801 			 * i.e. collected.
802 			 */
803 			msp->ms_detpages += (uint_t)(mdst.collected +
804 			    mdst.phys_pages - mdst.managed);
805 		} else {
806 			/*
807 			 * If we're UNREFERENCED or UNCONFIGURED,
808 			 * then the number of detached pages is
809 			 * however many pages are on the board.
810 			 * I.e. detached = not in use by OS.
811 			 */
812 			switch (msp->ms_cm.c_ostate) {
813 			/*
814 			 * changed to use cfgadm states
815 			 *
816 			 * was:
817 			 *	case DR_STATE_UNREFERENCED:
818 			 *	case DR_STATE_UNCONFIGURED:
819 			 */
820 			case SBD_STAT_UNCONFIGURED:
821 				msp->ms_detpages = msp->ms_totpages;
822 				break;
823 
824 			default:
825 				break;
826 			}
827 		}
828 
829 		/*
830 		 * kphysm_del_span_query can report non-reloc pages = total
831 		 * pages for memory that is not yet configured
832 		 */
833 		if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) {
834 
835 			rv = kphysm_del_span_query(mp->sbm_basepfn,
836 			    mp->sbm_npages, &mq);
837 
838 			if (rv == KPHYSM_OK) {
839 				msp->ms_managed_pages = mq.managed;
840 				msp->ms_noreloc_pages = mq.nonrelocatable;
841 				msp->ms_noreloc_first =
842 				    mq.first_nonrelocatable;
843 				msp->ms_noreloc_last =
844 				    mq.last_nonrelocatable;
845 				msp->ms_cm.c_sflags = 0;
846 				if (mq.nonrelocatable) {
847 					SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE,
848 					    msp->ms_cm.c_sflags);
849 				}
850 			} else {
851 				PR_MEM("%s: kphysm_del_span_query() = %d\n",
852 				    f, rv);
853 			}
854 		}
855 
856 		/*
857 		 * Check source unit state during copy-rename
858 		 */
859 		if ((mp->sbm_flags & DR_MFLAG_SOURCE) &&
860 		    (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED ||
861 		    mp->sbm_cm.sbdev_state == DR_STATE_RELEASE))
862 			msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED;
863 
864 		mix++;
865 		dsp++;
866 	}
867 
868 	return (mix);
869 }
870 
871 int
872 dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
873 {
874 	_NOTE(ARGUNUSED(hp))
875 
876 	int		err_flag = 0;
877 	int		d;
878 	sbd_error_t	*err;
879 	static fn_t	f = "dr_pre_attach_mem";
880 
881 	PR_MEM("%s...\n", f);
882 
883 	for (d = 0; d < devnum; d++) {
884 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
885 		dr_state_t	state;
886 
887 		cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path);
888 
889 		state = mp->sbm_cm.sbdev_state;
890 		switch (state) {
891 		case DR_STATE_UNCONFIGURED:
892 			PR_MEM("%s: recovering from UNCONFIG for %s\n",
893 			    f,
894 			    mp->sbm_cm.sbdev_path);
895 
896 			/* use memlist cached by dr_post_detach_mem_unit */
897 			ASSERT(mp->sbm_mlist != NULL);
898 			PR_MEM("%s: re-configuring cached memlist for %s:\n",
899 			    f, mp->sbm_cm.sbdev_path);
900 			PR_MEMLIST_DUMP(mp->sbm_mlist);
901 
902 			/* kphysm del handle should have been freed */
903 			ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
904 
905 			/*FALLTHROUGH*/
906 
907 		case DR_STATE_CONNECTED:
908 			PR_MEM("%s: reprogramming mem hardware on %s\n",
909 			    f, mp->sbm_cm.sbdev_bp->b_path);
910 
911 			PR_MEM("%s: enabling %s\n",
912 			    f, mp->sbm_cm.sbdev_path);
913 
914 			err = drmach_mem_enable(mp->sbm_cm.sbdev_id);
915 			if (err) {
916 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
917 				err_flag = 1;
918 			}
919 			break;
920 
921 		default:
922 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE);
923 			err_flag = 1;
924 			break;
925 		}
926 
927 		/* exit for loop if error encountered */
928 		if (err_flag)
929 			break;
930 	}
931 
932 	return (err_flag ? -1 : 0);
933 }
934 
935 int
936 dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
937 {
938 	_NOTE(ARGUNUSED(hp))
939 
940 	int		d;
941 	static fn_t	f = "dr_post_attach_mem";
942 
943 	PR_MEM("%s...\n", f);
944 
945 	for (d = 0; d < devnum; d++) {
946 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
947 		struct memlist	*mlist, *ml;
948 
949 		mlist = dr_get_memlist(mp);
950 		if (mlist == NULL) {
951 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_MEMFAIL);
952 			continue;
953 		}
954 
955 		/*
956 		 * Verify the memory really did successfully attach
957 		 * by checking for its existence in phys_install.
958 		 */
959 		memlist_read_lock();
960 		if (memlist_intersect(phys_install, mlist) == 0) {
961 			memlist_read_unlock();
962 
963 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
964 
965 			PR_MEM("%s: %s memlist not in phys_install\n",
966 			    f, mp->sbm_cm.sbdev_path);
967 
968 			memlist_delete(mlist);
969 			continue;
970 		}
971 		memlist_read_unlock();
972 
973 		for (ml = mlist; ml != NULL; ml = ml->next) {
974 			sbd_error_t *err;
975 
976 			err = drmach_mem_add_span(
977 			    mp->sbm_cm.sbdev_id,
978 			    ml->address,
979 			    ml->size);
980 			if (err)
981 				DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
982 		}
983 
984 		memlist_delete(mlist);
985 
986 		/*
987 		 * Destroy cached memlist, if any.
988 		 * There will be a cached memlist in sbm_mlist if
989 		 * this board is being configured directly after
990 		 * an unconfigure.
991 		 * To support this transition, dr_post_detach_mem
992 		 * left a copy of the last known memlist in sbm_mlist.
993 		 * This memlist could differ from any derived from
994 		 * hardware if while this memunit was last configured
995 		 * the system detected and deleted bad pages from
996 		 * phys_install.  The location of those bad pages
997 		 * will be reflected in the cached memlist.
998 		 */
999 		if (mp->sbm_mlist) {
1000 			memlist_delete(mp->sbm_mlist);
1001 			mp->sbm_mlist = NULL;
1002 		}
1003 
1004 /*
1005  * TODO: why is this call to dr_init_mem_unit_data here?
1006  * this has been done at discovery or connect time, so this is
1007  * probably redundant and unnecessary.
1008  */
1009 		dr_init_mem_unit_data(mp);
1010 	}
1011 
1012 	return (0);
1013 }
1014 
1015 int
1016 dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1017 {
1018 	_NOTE(ARGUNUSED(hp))
1019 
1020 	int d;
1021 
1022 	for (d = 0; d < devnum; d++) {
1023 		dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d];
1024 
1025 		cmn_err(CE_CONT, "OS unconfigure %s", mp->sbm_cm.sbdev_path);
1026 	}
1027 
1028 	return (0);
1029 }
1030 
1031 
1032 int
1033 dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1034 {
1035 	_NOTE(ARGUNUSED(hp))
1036 
1037 	int		d, rv;
1038 	static fn_t	f = "dr_post_detach_mem";
1039 
1040 	PR_MEM("%s...\n", f);
1041 
1042 	rv = 0;
1043 	for (d = 0; d < devnum; d++) {
1044 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1045 
1046 		ASSERT(mp->sbm_cm.sbdev_bp == hp->h_bd);
1047 
1048 		if (dr_post_detach_mem_unit(mp))
1049 			rv = -1;
1050 	}
1051 
1052 	return (rv);
1053 }
1054 
1055 static void
1056 dr_add_memory_spans(dr_mem_unit_t *mp, struct memlist *ml)
1057 {
1058 	static fn_t	f = "dr_add_memory_spans";
1059 
1060 	PR_MEM("%s...\n", f);
1061 	PR_MEMLIST_DUMP(ml);
1062 
1063 #ifdef DEBUG
1064 	memlist_read_lock();
1065 	if (memlist_intersect(phys_install, ml)) {
1066 		PR_MEM("%s:WARNING: memlist intersects with phys_install\n", f);
1067 	}
1068 	memlist_read_unlock();
1069 #endif
1070 
1071 	for (; ml; ml = ml->next) {
1072 		pfn_t		 base;
1073 		pgcnt_t		 npgs;
1074 		int		 rv;
1075 		sbd_error_t	*err;
1076 
1077 		base = _b64top(ml->address);
1078 		npgs = _b64top(ml->size);
1079 
1080 		rv = kphysm_add_memory_dynamic(base, npgs);
1081 
1082 		err = drmach_mem_add_span(
1083 		    mp->sbm_cm.sbdev_id,
1084 		    ml->address,
1085 		    ml->size);
1086 
1087 		if (err)
1088 			DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1089 
1090 		if (rv != KPHYSM_OK) {
1091 			cmn_err(CE_WARN, "%s:"
1092 			    " unexpected kphysm_add_memory_dynamic"
1093 			    " return value %d;"
1094 			    " basepfn=0x%lx, npages=%ld\n",
1095 			    f, rv, base, npgs);
1096 
1097 			continue;
1098 		}
1099 	}
1100 }
1101 
1102 static int
1103 dr_post_detach_mem_unit(dr_mem_unit_t *s_mp)
1104 {
1105 	uint64_t	sz = s_mp->sbm_slice_size;
1106 	uint64_t	sm = sz - 1;
1107 	/* old and new below refer to PAs before and after copy-rename */
1108 	uint64_t	s_old_basepa, s_new_basepa;
1109 	uint64_t	t_old_basepa, t_new_basepa;
1110 	uint64_t	t_new_smallsize = 0;
1111 	dr_mem_unit_t	*t_mp, *x_mp;
1112 	struct memlist	*ml;
1113 	int		rv;
1114 	sbd_error_t	*err;
1115 	static fn_t	f = "dr_post_detach_mem_unit";
1116 
1117 	PR_MEM("%s...\n", f);
1118 
1119 	/* s_mp->sbm_del_mlist could be NULL, meaning no deleted spans */
1120 	PR_MEM("%s: %s: deleted memlist (EMPTY maybe okay):\n",
1121 	    f, s_mp->sbm_cm.sbdev_path);
1122 	PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1123 
1124 	/* sanity check */
1125 	ASSERT(s_mp->sbm_del_mlist == NULL ||
1126 	    (s_mp->sbm_flags & DR_MFLAG_RELDONE) != 0);
1127 
1128 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1129 		t_mp = s_mp->sbm_peer;
1130 		ASSERT(t_mp != NULL);
1131 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1132 		ASSERT(t_mp->sbm_peer == s_mp);
1133 
1134 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RELDONE);
1135 		ASSERT(t_mp->sbm_del_mlist);
1136 
1137 		PR_MEM("%s: target %s: deleted memlist:\n",
1138 		    f, t_mp->sbm_cm.sbdev_path);
1139 		PR_MEMLIST_DUMP(t_mp->sbm_del_mlist);
1140 	} else {
1141 		/* there is no target unit */
1142 		t_mp = NULL;
1143 	}
1144 
1145 	/*
1146 	 * Verify the memory really did successfully detach
1147 	 * by checking for its non-existence in phys_install.
1148 	 */
1149 	rv = 0;
1150 	memlist_read_lock();
1151 	if (s_mp->sbm_flags & DR_MFLAG_RELDONE) {
1152 		x_mp = s_mp;
1153 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1154 	}
1155 	if (rv == 0 && t_mp && (t_mp->sbm_flags & DR_MFLAG_RELDONE)) {
1156 		x_mp = t_mp;
1157 		rv = memlist_intersect(phys_install, x_mp->sbm_del_mlist);
1158 	}
1159 	memlist_read_unlock();
1160 
1161 	if (rv) {
1162 		/* error: memlist still in phys_install */
1163 		DR_DEV_INTERNAL_ERROR(&x_mp->sbm_cm);
1164 	}
1165 
1166 	/*
1167 	 * clean mem unit state and bail out if an error has been recorded.
1168 	 */
1169 	rv = 0;
1170 	if (s_mp->sbm_cm.sbdev_error) {
1171 		PR_MEM("%s: %s flags=%x\n", f,
1172 		    s_mp->sbm_cm.sbdev_path, s_mp->sbm_flags);
1173 		DR_DEV_CLR_UNREFERENCED(&s_mp->sbm_cm);
1174 		DR_DEV_CLR_RELEASED(&s_mp->sbm_cm);
1175 		dr_device_transition(&s_mp->sbm_cm, DR_STATE_CONFIGURED);
1176 		rv = -1;
1177 	}
1178 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error != NULL) {
1179 		PR_MEM("%s: %s flags=%x\n", f,
1180 		    t_mp->sbm_cm.sbdev_path, t_mp->sbm_flags);
1181 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1182 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1183 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1184 		rv = -1;
1185 	}
1186 	if (rv)
1187 		goto cleanup;
1188 
1189 	s_old_basepa = _ptob64(s_mp->sbm_basepfn);
1190 	err = drmach_mem_get_base_physaddr(s_mp->sbm_cm.sbdev_id,
1191 	    &s_new_basepa);
1192 	ASSERT(err == NULL);
1193 
1194 	PR_MEM("%s:s_old_basepa: 0x%lx\n", f, s_old_basepa);
1195 	PR_MEM("%s:s_new_basepa: 0x%lx\n", f, s_new_basepa);
1196 
1197 	if (t_mp != NULL) {
1198 		struct memlist *s_copy_mlist;
1199 
1200 		t_old_basepa	= _ptob64(t_mp->sbm_basepfn);
1201 		err = drmach_mem_get_base_physaddr(t_mp->sbm_cm.sbdev_id,
1202 		    &t_new_basepa);
1203 		ASSERT(err == NULL);
1204 
1205 		PR_MEM("%s:t_old_basepa: 0x%lx\n", f, t_old_basepa);
1206 		PR_MEM("%s:t_new_basepa: 0x%lx\n", f, t_new_basepa);
1207 
1208 		/*
1209 		 * Construct copy list with original source addresses.
1210 		 * Used to add back excess target mem.
1211 		 */
1212 		s_copy_mlist = memlist_dup(s_mp->sbm_mlist);
1213 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1214 			s_copy_mlist = memlist_del_span(s_copy_mlist,
1215 			    ml->address, ml->size);
1216 		}
1217 
1218 		PR_MEM("%s: source copy list:\n", f);
1219 		PR_MEMLIST_DUMP(s_copy_mlist);
1220 
1221 		/*
1222 		 * We had to swap mem-units, so update
1223 		 * memlists accordingly with new base
1224 		 * addresses.
1225 		 */
1226 		for (ml = t_mp->sbm_mlist; ml; ml = ml->next) {
1227 			ml->address -= t_old_basepa;
1228 			ml->address += t_new_basepa;
1229 		}
1230 
1231 		/*
1232 		 * There is no need to explicitly rename the target delete
1233 		 * memlist, because sbm_del_mlist and sbm_mlist always
1234 		 * point to the same memlist for a copy/rename operation.
1235 		 */
1236 		ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1237 
1238 		PR_MEM("%s: renamed target memlist and delete memlist:\n", f);
1239 		PR_MEMLIST_DUMP(t_mp->sbm_mlist);
1240 
1241 		for (ml = s_mp->sbm_mlist; ml; ml = ml->next) {
1242 			ml->address -= s_old_basepa;
1243 			ml->address += s_new_basepa;
1244 		}
1245 
1246 		PR_MEM("%s: renamed source memlist:\n", f);
1247 		PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1248 
1249 		/*
1250 		 * Keep track of dynamically added segments
1251 		 * since they cannot be split if we need to delete
1252 		 * excess source memory later for this board.
1253 		 */
1254 		if (t_mp->sbm_dyn_segs)
1255 			memlist_delete(t_mp->sbm_dyn_segs);
1256 		t_mp->sbm_dyn_segs = s_mp->sbm_dyn_segs;
1257 		s_mp->sbm_dyn_segs = NULL;
1258 
1259 		/*
1260 		 * If the target memory range with the new target base PA
1261 		 * extends beyond the usable slice, prevent any "target excess"
1262 		 * from being added back after this copy/rename and
1263 		 * calculate the new smaller size of the target board
1264 		 * to be set as part of target cleanup. The base + npages
1265 		 * must only include the range of memory up to the end of
1266 		 * this slice. This will only be used after a category 4
1267 		 * large-to-small target type copy/rename - see comments
1268 		 * in dr_select_mem_target.
1269 		 */
1270 		if (((t_new_basepa & sm) + _ptob64(t_mp->sbm_npages)) > sz) {
1271 			t_new_smallsize = sz - (t_new_basepa & sm);
1272 		}
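		/*
		 * As a hypothetical example with a 128GB (0x2000000000)
		 * slice: if t_new_basepa lands 96GB (0x1800000000) into
		 * the slice and the target spans 64GB, the last 32GB
		 * would fall past the end of the slice, so
		 * t_new_smallsize becomes 128GB - 96GB = 32GB
		 * (0x800000000).
		 */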
1273 
1274 		if (s_mp->sbm_flags & DR_MFLAG_MEMRESIZE &&
1275 		    t_new_smallsize == 0) {
1276 			struct memlist	*t_excess_mlist;
1277 
1278 			/*
1279 			 * Add back excess target memory.
1280 			 * Subtract out the portion of the target memory
1281 			 * node that was taken over by the source memory
1282 			 * node.
1283 			 */
1284 			t_excess_mlist = memlist_dup(t_mp->sbm_mlist);
1285 			for (ml = s_copy_mlist; ml; ml = ml->next) {
1286 				t_excess_mlist =
1287 				    memlist_del_span(t_excess_mlist,
1288 				    ml->address, ml->size);
1289 			}
1290 
1291 			/*
1292 			 * Update dynamically added segs
1293 			 */
1294 			for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1295 				t_mp->sbm_dyn_segs =
1296 				    memlist_del_span(t_mp->sbm_dyn_segs,
1297 				    ml->address, ml->size);
1298 			}
1299 			for (ml = t_excess_mlist; ml; ml = ml->next) {
1300 				t_mp->sbm_dyn_segs =
1301 				    memlist_cat_span(t_mp->sbm_dyn_segs,
1302 				    ml->address, ml->size);
1303 			}
1304 			PR_MEM("%s: %s: updated dynamic seg list:\n",
1305 			    f, t_mp->sbm_cm.sbdev_path);
1306 			PR_MEMLIST_DUMP(t_mp->sbm_dyn_segs);
1307 
1308 			PR_MEM("%s: adding back remaining portion"
1309 			    " of %s, memlist:\n",
1310 			    f, t_mp->sbm_cm.sbdev_path);
1311 			PR_MEMLIST_DUMP(t_excess_mlist);
1312 
1313 			dr_add_memory_spans(s_mp, t_excess_mlist);
1314 			memlist_delete(t_excess_mlist);
1315 		}
1316 		memlist_delete(s_copy_mlist);
1317 
1318 #ifdef DEBUG
1319 		/*
1320 		 * Renaming s_mp->sbm_del_mlist is not necessary.  This
1321 		 * list is not used beyond this point, and in fact, is
1322 		 * disposed of at the end of this function.
1323 		 */
1324 		for (ml = s_mp->sbm_del_mlist; ml; ml = ml->next) {
1325 			ml->address -= s_old_basepa;
1326 			ml->address += s_new_basepa;
1327 		}
1328 
1329 		PR_MEM("%s: renamed source delete memlist\n", f);
1330 		PR_MEMLIST_DUMP(s_mp->sbm_del_mlist);
1331 #endif
1332 
1333 	}
1334 
1335 	if (t_mp != NULL) {
1336 		/* delete target's entire address space */
1337 		err = drmach_mem_del_span(t_mp->sbm_cm.sbdev_id,
1338 		    t_old_basepa & ~ sm, sz);
1339 		if (err)
1340 			DRERR_SET_C(&t_mp->sbm_cm.sbdev_error, &err);
1341 		ASSERT(err == NULL);
1342 
1343 		/*
1344 		 * After the copy/rename, the original address space
1345 		 * for the source board (which is now located on the
1346 		 * target board) may now have some excess to be deleted.
1347 		 * The amount is calculated by masking the slice
1348 		 * info and keeping the slice offset from t_new_basepa.
1349 		 */
1350 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1351 		    s_old_basepa & ~ sm, t_new_basepa & sm);
1352 		if (err)
1353 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1354 		ASSERT(err == NULL);
1355 
1356 	} else {
1357 		/* delete board's entire address space */
1358 		err = drmach_mem_del_span(s_mp->sbm_cm.sbdev_id,
1359 		    s_old_basepa & ~ sm, sz);
1360 		if (err)
1361 			DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
1362 		ASSERT(err == NULL);
1363 	}
1364 
1365 cleanup:
1366 	/* clean up target mem unit */
1367 	if (t_mp != NULL) {
1368 		memlist_delete(t_mp->sbm_del_mlist);
1369 		/* no need to delete sbm_mlist, it shares sbm_del_mlist */
1370 
1371 		t_mp->sbm_del_mlist = NULL;
1372 		t_mp->sbm_mlist = NULL;
1373 		t_mp->sbm_peer = NULL;
1374 		t_mp->sbm_flags = 0;
1375 		t_mp->sbm_cm.sbdev_busy = 0;
1376 		dr_init_mem_unit_data(t_mp);
1377 
1378 		/* reduce target size if new PAs go past end of usable slice */
1379 		if (t_new_smallsize > 0) {
1380 			t_mp->sbm_npages = _b64top(t_new_smallsize);
1381 			PR_MEM("%s: target new size 0x%lx bytes\n",
1382 			    f, t_new_smallsize);
1383 		}
1384 	}
1385 	if (t_mp != NULL && t_mp->sbm_cm.sbdev_error == NULL) {
1386 		/*
1387 		 * now that copy/rename has completed, undo this
1388 		 * work that was done in dr_release_mem_done.
1389 		 */
1390 		DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1391 		DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1392 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_CONFIGURED);
1393 	}
1394 
1395 	/*
1396 	 * clean up (source) board's mem unit structure.
1397 	 * NOTE: sbm_mlist is retained if no error has been recorded (in other
1398 	 * words, when s_mp->sbm_cm.sbdev_error is NULL). This memlist is
1399 	 * referred to elsewhere as the cached memlist.  The cached memlist
1400 	 * is used to re-attach (configure back in) this memunit from the
1401 	 * unconfigured state.  The memlist is retained because it may
1402 	 * represent bad pages that were detected while the memory was
1403 	 * configured into the OS.  The OS deletes bad pages from phys_install.
1404 	 * Those deletes, if any, will be represented in the cached mlist.
1405 	 */
1406 	if (s_mp->sbm_del_mlist && s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1407 		memlist_delete(s_mp->sbm_del_mlist);
1408 
1409 	if (s_mp->sbm_cm.sbdev_error && s_mp->sbm_mlist) {
1410 		memlist_delete(s_mp->sbm_mlist);
1411 		s_mp->sbm_mlist = NULL;
1412 	}
1413 
1414 	if (s_mp->sbm_dyn_segs != NULL && s_mp->sbm_cm.sbdev_error == 0) {
1415 		memlist_delete(s_mp->sbm_dyn_segs);
1416 		s_mp->sbm_dyn_segs = NULL;
1417 	}
1418 
1419 	s_mp->sbm_del_mlist = NULL;
1420 	s_mp->sbm_peer = NULL;
1421 	s_mp->sbm_flags = 0;
1422 	s_mp->sbm_cm.sbdev_busy = 0;
1423 	dr_init_mem_unit_data(s_mp);
1424 
1425 	PR_MEM("%s: cached memlist for %s:\n", f, s_mp->sbm_cm.sbdev_path);
1426 	PR_MEMLIST_DUMP(s_mp->sbm_mlist);
1427 
1428 	return (0);
1429 }
1430 
1431 /*
1432  * Successful return from this function will have the memory
1433  * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated
1434  * and waiting.  This routine's job is to select the memory that
1435  * actually has to be released (detached) which may not necessarily
1436  * be the same memory node that came in via devlist[],
1437  * i.e. a copy-rename is needed.
1438  */
1439 int
1440 dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum)
1441 {
1442 	int		d;
1443 	int		err_flag = 0;
1444 	static fn_t	f = "dr_pre_release_mem";
1445 
1446 	PR_MEM("%s...\n", f);
1447 
1448 	for (d = 0; d < devnum; d++) {
1449 		dr_mem_unit_t	*mp = (dr_mem_unit_t *)devlist[d];
1450 		int		rv;
1451 		memquery_t	mq;
1452 		struct memlist	*ml;
1453 
1454 		if (mp->sbm_cm.sbdev_error) {
1455 			err_flag = 1;
1456 			continue;
1457 		} else if (!kcage_on) {
1458 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_KCAGE_OFF);
1459 			err_flag = 1;
1460 			continue;
1461 		}
1462 
1463 		if (mp->sbm_flags & DR_MFLAG_RESERVED) {
1464 			/*
1465 			 * Board is currently involved in a delete
1466 			 * memory operation. Can't detach this guy until
1467 			 * that operation completes.
1468 			 */
1469 			dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_INVAL);
1470 			err_flag = 1;
1471 			break;
1472 		}
1473 
1474 		/*
1475 		 * Check whether the detaching memory requires a
1476 		 * copy-rename.
1477 		 */
1478 		ASSERT(mp->sbm_npages != 0);
1479 		rv = kphysm_del_span_query(mp->sbm_basepfn, mp->sbm_npages,
1480 		    &mq);
1481 		if (rv != KPHYSM_OK) {
1482 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1483 			err_flag = 1;
1484 			break;
1485 		}
1486 
1487 		if (mq.nonrelocatable != 0) {
1488 			if (!(dr_cmd_flags(hp) &
1489 			    (SBD_FLAG_FORCE | SBD_FLAG_QUIESCE_OKAY))) {
1490 				/* caller wasn't prompted for a suspend */
1491 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1492 				    ESBD_QUIESCE_REQD);
1493 				err_flag = 1;
1494 				break;
1495 			}
1496 		}
1497 
1498 		/* flags should be clean at this time */
1499 		ASSERT(mp->sbm_flags == 0);
1500 
1501 		ASSERT(mp->sbm_mlist == NULL);		/* should be null */
1502 		ASSERT(mp->sbm_del_mlist == NULL);	/* should be null */
1503 		if (mp->sbm_mlist != NULL) {
1504 			memlist_delete(mp->sbm_mlist);
1505 			mp->sbm_mlist = NULL;
1506 		}
1507 
1508 		ml = dr_get_memlist(mp);
1509 		if (ml == NULL) {
1510 			err_flag = 1;
1511 			PR_MEM("%s: no memlist found for %s\n",
1512 			    f, mp->sbm_cm.sbdev_path);
1513 			continue;
1514 		}
1515 
1516 		/* allocate a kphysm handle */
1517 		rv = kphysm_del_gethandle(&mp->sbm_memhandle);
1518 		if (rv != KPHYSM_OK) {
1519 			memlist_delete(ml);
1520 
1521 			DR_DEV_INTERNAL_ERROR(&mp->sbm_cm);
1522 			err_flag = 1;
1523 			break;
1524 		}
1525 		mp->sbm_flags |= DR_MFLAG_RELOWNER;
1526 
1527 		if ((mq.nonrelocatable != 0) ||
1528 		    dr_reserve_mem_spans(&mp->sbm_memhandle, ml)) {
1529 			/*
1530 			 * Either the detaching memory node contains
1531 			 * non-reloc memory or we failed to reserve the
1532 			 * detaching memory node (which did _not_ have
1533 			 * any non-reloc memory, i.e. some non-reloc mem
1534 			 * appeared on the board in the meantime).
1535 			 */
1536 
1537 			if (dr_select_mem_target(hp, mp, ml)) {
1538 				int rv;
1539 
1540 				/*
1541 				 * We had no luck locating a target
1542 				 * memory node to be the recipient of
1543 				 * the non-reloc memory on the node
1544 				 * we're trying to detach.
1545 				 * Clean up by disposing of the mem handle
1546 				 * and the mem list.
1547 				 */
1548 				rv = kphysm_del_release(mp->sbm_memhandle);
1549 				if (rv != KPHYSM_OK) {
1550 					/*
1551 					 * can do nothing but complain
1552 					 * and hope the message helps debugging
1553 					 */
1554 					cmn_err(CE_WARN, "%s: unexpected"
1555 					    " kphysm_del_release return"
1556 					    " value %d",
1557 					    f, rv);
1558 				}
1559 				mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1560 
1561 				memlist_delete(ml);
1562 
1563 				/* make sure sbm_flags is clean */
1564 				ASSERT(mp->sbm_flags == 0);
1565 
1566 				dr_dev_err(CE_WARN, &mp->sbm_cm,
1567 				    ESBD_NO_TARGET);
1568 
1569 				err_flag = 1;
1570 				break;
1571 			}
1572 
1573 			/*
1574 			 * ml is not memlist_delete'd here because
1575 			 * it has been assigned to mp->sbm_mlist
1576 			 * by dr_select_mem_target.
1577 			 */
1578 		} else {
1579 			/* no target needed to detach this board */
1580 			mp->sbm_flags |= DR_MFLAG_RESERVED;
1581 			mp->sbm_peer = NULL;
1582 			mp->sbm_del_mlist = ml;
1583 			mp->sbm_mlist = ml;
1584 			mp->sbm_cm.sbdev_busy = 1;
1585 		}
1586 #ifdef DEBUG
1587 		ASSERT(mp->sbm_mlist != NULL);
1588 
1589 		if (mp->sbm_flags & DR_MFLAG_SOURCE) {
1590 			PR_MEM("%s: release of %s requires copy/rename;"
1591 			    " selected target board %s\n",
1592 			    f,
1593 			    mp->sbm_cm.sbdev_path,
1594 			    mp->sbm_peer->sbm_cm.sbdev_path);
1595 		} else {
1596 			PR_MEM("%s: copy/rename not required to release %s\n",
1597 			    f, mp->sbm_cm.sbdev_path);
1598 		}
1599 
1600 		ASSERT(mp->sbm_flags & DR_MFLAG_RELOWNER);
1601 		ASSERT(mp->sbm_flags & DR_MFLAG_RESERVED);
1602 #endif
1603 	}
1604 
1605 	return (err_flag ? -1 : 0);
1606 }
1607 
1608 void
1609 dr_release_mem_done(dr_common_unit_t *cp)
1610 {
1611 	dr_mem_unit_t	*s_mp = (dr_mem_unit_t *)cp;
1612 	dr_mem_unit_t *t_mp, *mp;
1613 	int		rv;
1614 	static fn_t	f = "dr_release_mem_done";
1615 
1616 	/*
1617 	 * This unit will be flagged with DR_MFLAG_SOURCE, if it
1618 	 * has a target unit.
1619 	 */
1620 	if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1621 		t_mp = s_mp->sbm_peer;
1622 		ASSERT(t_mp != NULL);
1623 		ASSERT(t_mp->sbm_peer == s_mp);
1624 		ASSERT(t_mp->sbm_flags & DR_MFLAG_TARGET);
1625 		ASSERT(t_mp->sbm_flags & DR_MFLAG_RESERVED);
1626 	} else {
1627 		/* there is no target unit */
1628 		t_mp = NULL;
1629 	}
1630 
1631 	/* free delete handle */
1632 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RELOWNER);
1633 	ASSERT(s_mp->sbm_flags & DR_MFLAG_RESERVED);
1634 	rv = kphysm_del_release(s_mp->sbm_memhandle);
1635 	if (rv != KPHYSM_OK) {
1636 		/*
1637 		 * can do nothing but complain
1638 		 * and hope the message helps debugging
1639 		 */
1640 		cmn_err(CE_WARN, "%s: unexpected kphysm_del_release"
1641 		    " return value %d", f, rv);
1642 	}
1643 	s_mp->sbm_flags &= ~DR_MFLAG_RELOWNER;
1644 
1645 	/*
1646 	 * If an error was encountered during release, clean up
1647 	 * the source (and target, if present) unit data.
1648 	 */
1649 /* XXX Can we know that sbdev_error was encountered during release? */
1650 	if (s_mp->sbm_cm.sbdev_error != NULL) {
1651 		PR_MEM("%s: %s: error %d noted\n",
1652 		    f,
1653 		    s_mp->sbm_cm.sbdev_path,
1654 		    s_mp->sbm_cm.sbdev_error->e_code);
1655 
1656 		if (t_mp != NULL) {
1657 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1658 			t_mp->sbm_del_mlist = NULL;
1659 
1660 			if (t_mp->sbm_mlist != NULL) {
1661 				memlist_delete(t_mp->sbm_mlist);
1662 				t_mp->sbm_mlist = NULL;
1663 			}
1664 
1665 			t_mp->sbm_peer = NULL;
1666 			t_mp->sbm_flags = 0;
1667 			t_mp->sbm_cm.sbdev_busy = 0;
1668 		}
1669 
1670 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1671 			memlist_delete(s_mp->sbm_del_mlist);
1672 		s_mp->sbm_del_mlist = NULL;
1673 
1674 		if (s_mp->sbm_mlist != NULL) {
1675 			memlist_delete(s_mp->sbm_mlist);
1676 			s_mp->sbm_mlist = NULL;
1677 		}
1678 
1679 		s_mp->sbm_peer = NULL;
1680 		s_mp->sbm_flags = 0;
1681 		s_mp->sbm_cm.sbdev_busy = 0;
1682 
1683 		/* bail out */
1684 		return;
1685 	}
1686 
1687 	DR_DEV_SET_RELEASED(&s_mp->sbm_cm);
1688 	dr_device_transition(&s_mp->sbm_cm, DR_STATE_RELEASE);
1689 
1690 	if (t_mp != NULL) {
1691 		/*
1692 		 * the kphysm delete operation that drained the source
1693 		 * board also drained this target board.  Since the source
1694 		 * board drain is now known to have succeeded, we know this
1695 		 * target board is drained too.
1696 		 *
1697 		 * Because DR_DEV_SET_RELEASED and dr_device_transition
1698 		 * are done here, dr_release_dev_done should not
1699 		 * fail.
1700 		 */
1701 		DR_DEV_SET_RELEASED(&t_mp->sbm_cm);
1702 		dr_device_transition(&t_mp->sbm_cm, DR_STATE_RELEASE);
1703 
1704 		/*
1705 		 * NOTE: do not transition target's board state,
1706 		 * even if the mem-unit was the last configured
1707 		 * unit of the board.  When copy/rename completes,
1708 		 * this mem-unit will be transitioned back to
1709 		 * the configured state.  In the meantime, the
1710 		 * board's state must remain as is.
1711 		 */
1712 	}
1713 
1714 	/* if board(s) had deleted memory, verify it is gone */
1715 	rv = 0;
1716 	memlist_read_lock();
1717 	if (s_mp->sbm_del_mlist != NULL) {
1718 		mp = s_mp;
1719 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1720 	}
1721 	if (rv == 0 && t_mp && t_mp->sbm_del_mlist != NULL) {
1722 		mp = t_mp;
1723 		rv = memlist_intersect(phys_install, mp->sbm_del_mlist);
1724 	}
1725 	memlist_read_unlock();
1726 	if (rv) {
1727 		cmn_err(CE_WARN, "%s: %smem-unit (%d.%d): "
1728 		    "deleted memory still found in phys_install",
1729 		    f,
1730 		    (mp == t_mp ? "target " : ""),
1731 		    mp->sbm_cm.sbdev_bp->b_num,
1732 		    mp->sbm_cm.sbdev_unum);
1733 
1734 		DR_DEV_INTERNAL_ERROR(&s_mp->sbm_cm);
1735 		return;
1736 	}
1737 
1738 	s_mp->sbm_flags |= DR_MFLAG_RELDONE;
1739 	if (t_mp != NULL)
1740 		t_mp->sbm_flags |= DR_MFLAG_RELDONE;
1741 
1742 	/* this should not fail */
1743 	if (dr_release_dev_done(&s_mp->sbm_cm) != 0) {
1744 		/* catch this in debug kernels */
1745 		ASSERT(0);
1746 		return;
1747 	}
1748 
1749 	PR_MEM("%s: marking %s release DONE\n",
1750 	    f, s_mp->sbm_cm.sbdev_path);
1751 
1752 	s_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1753 
1754 	if (t_mp != NULL) {
1755 		/* should not fail */
1756 		rv = dr_release_dev_done(&t_mp->sbm_cm);
1757 		if (rv != 0) {
1758 			/* catch this in debug kernels */
1759 			ASSERT(0);
1760 			return;
1761 		}
1762 
1763 		PR_MEM("%s: marking %s release DONE\n",
1764 		    f, t_mp->sbm_cm.sbdev_path);
1765 
1766 		t_mp->sbm_cm.sbdev_ostate = SBD_STAT_UNCONFIGURED;
1767 	}
1768 }
1769 
1770 /*ARGSUSED*/
1771 int
1772 dr_disconnect_mem(dr_mem_unit_t *mp)
1773 {
1774 	static fn_t	f = "dr_disconnect_mem";
1775 	update_membounds_t umb;
1776 
1777 #ifdef DEBUG
1778 	int state = mp->sbm_cm.sbdev_state;
1779 	ASSERT(state == DR_STATE_CONNECTED || state == DR_STATE_UNCONFIGURED);
1780 #endif
1781 
1782 	PR_MEM("%s...\n", f);
1783 
1784 	if (mp->sbm_del_mlist && mp->sbm_del_mlist != mp->sbm_mlist)
1785 		memlist_delete(mp->sbm_del_mlist);
1786 	mp->sbm_del_mlist = NULL;
1787 
1788 	if (mp->sbm_mlist) {
1789 		memlist_delete(mp->sbm_mlist);
1790 		mp->sbm_mlist = NULL;
1791 	}
1792 
1793 	/*
1794 	 * Remove memory from lgroup
1795 	 * For now, only board info is required.
1796 	 */
1797 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1798 	umb.u_base = (uint64_t)-1;
1799 	umb.u_len = (uint64_t)-1;
1800 
1801 	lgrp_plat_config(LGRP_CONFIG_MEM_DEL, (uintptr_t)&umb);
1802 
1803 	return (0);
1804 }
1805 
1806 int
1807 dr_cancel_mem(dr_mem_unit_t *s_mp)
1808 {
1809 	dr_mem_unit_t	*t_mp;
1810 	dr_state_t	state;
1811 	static fn_t	f = "dr_cancel_mem";
1812 
1813 	state = s_mp->sbm_cm.sbdev_state;
1814 
1815 	if (s_mp->sbm_flags & DR_MFLAG_TARGET) {
1816 		/* must cancel source board, not target board */
1817 		/* TODO: set error */
1818 		return (-1);
1819 	} else if (s_mp->sbm_flags & DR_MFLAG_SOURCE) {
1820 		t_mp = s_mp->sbm_peer;
1821 		ASSERT(t_mp != NULL);
1822 		ASSERT(t_mp->sbm_peer == s_mp);
1823 
1824 		/* must always match the source board's state */
1825 /* TODO: is this assertion correct? */
1826 		ASSERT(t_mp->sbm_cm.sbdev_state == state);
1827 	} else {
1828 		/* there is no target unit */
1829 		t_mp = NULL;
1830 	}
1831 
1832 	switch (state) {
1833 	case DR_STATE_UNREFERENCED:	/* state set by dr_release_dev_done */
1834 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1835 
1836 		if (t_mp != NULL && t_mp->sbm_del_mlist != NULL) {
1837 			PR_MEM("%s: undoing target %s memory delete\n",
1838 			    f, t_mp->sbm_cm.sbdev_path);
1839 			dr_add_memory_spans(t_mp, t_mp->sbm_del_mlist);
1840 
1841 			DR_DEV_CLR_UNREFERENCED(&t_mp->sbm_cm);
1842 		}
1843 
1844 		if (s_mp->sbm_del_mlist != NULL) {
1845 			PR_MEM("%s: undoing %s memory delete\n",
1846 			    f, s_mp->sbm_cm.sbdev_path);
1847 
1848 			dr_add_memory_spans(s_mp, s_mp->sbm_del_mlist);
1849 		}
1850 
1851 		/*FALLTHROUGH*/
1852 
1853 /* TODO: should no longer be possible to see the release state here */
1854 	case DR_STATE_RELEASE:	/* state set by dr_release_mem_done */
1855 
1856 		ASSERT((s_mp->sbm_flags & DR_MFLAG_RELOWNER) == 0);
1857 
1858 		if (t_mp != NULL) {
1859 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
1860 			t_mp->sbm_del_mlist = NULL;
1861 
1862 			if (t_mp->sbm_mlist != NULL) {
1863 				memlist_delete(t_mp->sbm_mlist);
1864 				t_mp->sbm_mlist = NULL;
1865 			}
1866 
1867 			t_mp->sbm_peer = NULL;
1868 			t_mp->sbm_flags = 0;
1869 			t_mp->sbm_cm.sbdev_busy = 0;
1870 			dr_init_mem_unit_data(t_mp);
1871 
1872 			DR_DEV_CLR_RELEASED(&t_mp->sbm_cm);
1873 
1874 			dr_device_transition(&t_mp->sbm_cm,
1875 			    DR_STATE_CONFIGURED);
1876 		}
1877 
1878 		if (s_mp->sbm_del_mlist != s_mp->sbm_mlist)
1879 			memlist_delete(s_mp->sbm_del_mlist);
1880 		s_mp->sbm_del_mlist = NULL;
1881 
1882 		if (s_mp->sbm_mlist != NULL) {
1883 			memlist_delete(s_mp->sbm_mlist);
1884 			s_mp->sbm_mlist = NULL;
1885 		}
1886 
1887 		s_mp->sbm_peer = NULL;
1888 		s_mp->sbm_flags = 0;
1889 		s_mp->sbm_cm.sbdev_busy = 0;
1890 		dr_init_mem_unit_data(s_mp);
1891 
1892 		return (0);
1893 
1894 	default:
1895 		PR_MEM("%s: WARNING unexpected state (%d) for %s\n",
1896 		    f, (int)state, s_mp->sbm_cm.sbdev_path);
1897 
1898 		return (-1);
1899 	}
1900 	/*NOTREACHED*/
1901 }
1902 
1903 void
1904 dr_init_mem_unit(dr_mem_unit_t *mp)
1905 {
1906 	dr_state_t	new_state;
1907 
1908 
1909 	if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) {
1910 		new_state = DR_STATE_CONFIGURED;
1911 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1912 	} else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) {
1913 		new_state = DR_STATE_CONNECTED;
1914 		mp->sbm_cm.sbdev_cond = SBD_COND_OK;
1915 	} else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) {
1916 		new_state = DR_STATE_OCCUPIED;
1917 	} else {
1918 		new_state = DR_STATE_EMPTY;
1919 	}
1920 
1921 	if (DR_DEV_IS_PRESENT(&mp->sbm_cm))
1922 		dr_init_mem_unit_data(mp);
1923 
1924 	/* delay transition until fully initialized */
1925 	dr_device_transition(&mp->sbm_cm, new_state);
1926 }
1927 
1928 static void
1929 dr_init_mem_unit_data(dr_mem_unit_t *mp)
1930 {
1931 	drmachid_t	id = mp->sbm_cm.sbdev_id;
1932 	uint64_t	bytes;
1933 	sbd_error_t	*err;
1934 	static fn_t	f = "dr_init_mem_unit_data";
1935 	update_membounds_t umb;
1936 
1937 	PR_MEM("%s...\n", f);
1938 
1939 	/* a little sanity checking */
1940 	ASSERT(mp->sbm_peer == NULL);
1941 	ASSERT(mp->sbm_flags == 0);
1942 
1943 	/* get basepfn of mem unit */
1944 	err = drmach_mem_get_base_physaddr(id, &bytes);
1945 	if (err) {
1946 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1947 		mp->sbm_basepfn = (pfn_t)-1;
1948 	} else
1949 		mp->sbm_basepfn = _b64top(bytes);
1950 
1951 	/* attempt to get number of pages from PDA */
1952 	err = drmach_mem_get_size(id, &bytes);
1953 	if (err) {
1954 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1955 		mp->sbm_npages = 0;
1956 	} else
1957 		mp->sbm_npages = _b64top(bytes);
1958 
1959 	/* if didn't work, calculate using memlist */
1960 	if (mp->sbm_npages == 0) {
1961 		struct memlist	*ml, *mlist;
1962 		/*
1963 		 * Either we couldn't open the PDA or our
1964 		 * PDA has garbage in it.  We must keep the
1965 		 * page count consistent, and whatever the
1966 		 * OS reports takes precedence over the
1967 		 * PDA, so let's check the kernel.
1968 		 */
1969 /* TODO: curious comment. it suggests pda query should happen if this fails */
1970 		PR_MEM("%s: PDA query failed for npages."
1971 		    " Checking memlist for %s\n",
1972 		    f, mp->sbm_cm.sbdev_path);
1973 
1974 		mlist = dr_get_memlist(mp);
1975 		for (ml = mlist; ml; ml = ml->next)
1976 			mp->sbm_npages += btop(ml->size);
1977 		memlist_delete(mlist);
1978 	}
1979 
1980 	err = drmach_mem_get_alignment(id, &bytes);
1981 	if (err) {
1982 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1983 		mp->sbm_alignment_mask = 0;
1984 	} else
1985 		mp->sbm_alignment_mask = _b64top(bytes);
1986 
1987 	err = drmach_mem_get_slice_size(id, &bytes);
1988 	if (err) {
1989 		DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err);
1990 		mp->sbm_slice_size = 0; /* paranoia */
1991 	} else
1992 		mp->sbm_slice_size = bytes;
1993 
1994 	/*
1995 	 * Add memory to lgroup
1996 	 */
1997 	umb.u_board = mp->sbm_cm.sbdev_bp->b_num;
1998 	umb.u_base = (uint64_t)mp->sbm_basepfn << MMU_PAGESHIFT;
1999 	umb.u_len = (uint64_t)mp->sbm_npages << MMU_PAGESHIFT;
2000 
2001 	lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb);
2002 
2003 	PR_MEM("%s: %s (basepfn = 0x%lx, npgs = %ld)\n",
2004 	    f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages);
2005 }
2006 
2007 static int
2008 dr_reserve_mem_spans(memhandle_t *mhp, struct memlist *ml)
2009 {
2010 	int		err;
2011 	pfn_t		base;
2012 	pgcnt_t		npgs;
2013 	struct memlist	*mc;
2014 	static fn_t	f = "dr_reserve_mem_spans";
2015 
2016 	PR_MEM("%s...\n", f);
2017 
2018 	/*
2019 	 * Walk the supplied memlist scheduling each span for removal
2020 	 * with kphysm_del_span.  It is possible that a span may intersect
2021 	 * an area occupied by the cage.
2022 	 */
2023 	for (mc = ml; mc != NULL; mc = mc->next) {
2024 		base = _b64top(mc->address);
2025 		npgs = _b64top(mc->size);
2026 
2027 		err = kphysm_del_span(*mhp, base, npgs);
2028 		if (err != KPHYSM_OK) {
2029 			cmn_err(CE_WARN, "%s memory reserve failed."
2030 			    " unexpected kphysm_del_span return value %d;"
2031 			    " basepfn=0x%lx npages=%ld",
2032 			    f, err, base, npgs);
2033 
2034 			return (-1);
2035 		}
2036 	}
2037 
2038 	return (0);
2039 }
2040 
2041 /* debug counters */
2042 int dr_smt_realigned;
2043 int dr_smt_preference[4];
2044 
2045 #ifdef DEBUG
2046 uint_t dr_ignore_board; /* if bit[bnum-1] set, board won't be candidate */
2047 #endif
2048 
2049 /*
2050  * Find and reserve a copy/rename target board suitable for the
2051  * given source board.
2052  * All boards in the system are examined and categorized in relation to
2053  * their memory size versus the source board's memory size.  Order of
2054  * preference is:
2055  *	1st: board has same memory size
2056  * 	2nd: board has larger memory size
2057  *	3rd: board has smaller memory size
2058  *	4th: board has smaller memory size, available memory will be reduced.
2059  * Boards in categories 3 and 4 will have their MCs reprogrammed so that the
2060  * address span to which each MC responds appropriately covers
2061  * the nonrelocatable span of the source board.
2062  */
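/*
 * Illustrative example (hypothetical sizes): for an 8GB source board,
 * an 8GB candidate with a matching slice offset is preference 0, a
 * 16GB candidate is preference 1, and a 4GB candidate whose MC can be
 * reprogrammed to cover the source's nonrelocatable span is
 * preference 2, or preference 3 if part of its memory would end up
 * beyond the slice boundary.
 */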
2063 static int
2064 dr_select_mem_target(dr_handle_t *hp,
2065 	dr_mem_unit_t *s_mp, struct memlist *s_ml)
2066 {
2067 	pgcnt_t		sz = _b64top(s_mp->sbm_slice_size);
2068 	pgcnt_t		sm = sz - 1; /* mem_slice_mask */
2069 	pfn_t		s_phi, t_phi;
2070 
2071 	int		n_sets = 4; /* same, larger, smaller, clipped */
2072 	int		preference; /* lower value is higher preference */
2073 	int		n_units_per_set;
2074 	int		idx;
2075 	dr_mem_unit_t	**sets;
2076 
2077 	int		t_bd;
2078 	int		t_unit;
2079 	int		rv;
2080 	int		allow_src_memrange_modify;
2081 	int		allow_targ_memrange_modify;
2082 	drmachid_t	t_id;
2083 	dr_board_t	*s_bp, *t_bp;
2084 	dr_mem_unit_t	*t_mp, *c_mp;
2085 	struct memlist	*d_ml, *t_ml, *x_ml;
2086 	memquery_t	s_mq = {0};
2087 	static fn_t	f = "dr_select_mem_target";
2088 
2089 	PR_MEM("%s...\n", f);
2090 
2091 	ASSERT(s_ml != NULL);
2092 
2093 	n_units_per_set = MAX_BOARDS * MAX_MEM_UNITS_PER_BOARD;
2094 	sets = GETSTRUCT(dr_mem_unit_t *, n_units_per_set * n_sets);
2095 
2096 	s_bp = hp->h_bd;
2097 	/* calculate the offset into the slice of the last source board pfn */
2098 	ASSERT(s_mp->sbm_npages != 0);
2099 	s_phi = (s_mp->sbm_basepfn + s_mp->sbm_npages - 1) & sm;
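	/*
	 * Illustrative example (hypothetical values): with a 4GB slice
	 * of 8KB pages, sz = 0x80000 and sm = 0x7ffff; a source unit
	 * with sbm_basepfn = 0x100000 and sbm_npages = 0x40000 yields
	 * s_phi = (0x100000 + 0x40000 - 1) & 0x7ffff = 0x3ffff.
	 */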
2100 
2101 	allow_src_memrange_modify = drmach_allow_memrange_modify(s_bp->b_id);
2102 
2103 	/*
2104 	 * Make one pass through all memory units on all boards
2105 	 * and categorize them with respect to the source board.
2106 	 */
2107 	for (t_bd = 0; t_bd < MAX_BOARDS; t_bd++) {
2108 		/*
2109 		 * The board structs are a contiguous array
2110 		 * so we take advantage of that to find the
2111 		 * correct board struct pointer for a given
2112 		 * board number.
2113 		 */
2114 		t_bp = dr_lookup_board(t_bd);
2115 
2116 		/* source board can not be its own target */
2117 		if (s_bp->b_num == t_bp->b_num)
2118 			continue;
2119 
2120 		for (t_unit = 0; t_unit < MAX_MEM_UNITS_PER_BOARD; t_unit++) {
2121 
2122 			t_mp = dr_get_mem_unit(t_bp, t_unit);
2123 
2124 			/* this memory node must be attached */
2125 			if (!DR_DEV_IS_ATTACHED(&t_mp->sbm_cm))
2126 				continue;
2127 
2128 			/* source unit can not be its own target */
2129 			if (s_mp == t_mp) {
2130 				/* catch this in debug kernels */
2131 				ASSERT(0);
2132 				continue;
2133 			}
2134 
2135 			/*
2136 			 * this memory node must not already be reserved
2137 			 * by some other memory delete operation.
2138 			 */
2139 			if (t_mp->sbm_flags & DR_MFLAG_RESERVED)
2140 				continue;
2141 
2142 			/*
2143 			 * categorize the memory node
2144 			 * If this is a smaller memory node, create a
2145 			 * temporary, edited copy of the source board's
2146 			 * memlist containing only the span of the non-
2147 			 * relocatable pages.
2148 			 */
2149 			t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2150 			t_id = t_mp->sbm_cm.sbdev_bp->b_id;
2151 			allow_targ_memrange_modify =
2152 			    drmach_allow_memrange_modify(t_id);
2153 			if (t_mp->sbm_npages == s_mp->sbm_npages &&
2154 			    t_phi == s_phi) {
2155 				preference = 0;
2156 				t_mp->sbm_slice_offset = 0;
2157 			} else if (t_mp->sbm_npages > s_mp->sbm_npages &&
2158 			    t_phi > s_phi) {
2159 				/*
2160 				 * Selecting this target will require modifying
2161 				 * the source and/or target physical address
2162 				 * ranges.  Skip if not supported by platform.
2163 				 */
2164 				if (!allow_src_memrange_modify ||
2165 				    !allow_targ_memrange_modify) {
2166 					PR_MEM("%s: skip target %s, memory "
2167 					    "range relocation not supported "
2168 					    "by platform\n", f,
2169 					    t_mp->sbm_cm.sbdev_path);
2170 					continue;
2171 				}
2172 				preference = 1;
2173 				t_mp->sbm_slice_offset = 0;
2174 			} else {
2175 				pfn_t		pfn = 0;
2176 
2177 				/*
2178 				 * Selecting this target will require modifying
2179 				 * the source and/or target physical address
2180 				 * ranges.  Skip if not supported by platform.
2181 				 */
2182 				if (!allow_src_memrange_modify ||
2183 				    !allow_targ_memrange_modify) {
2184 					PR_MEM("%s: skip target %s, memory "
2185 					    "range relocation not supported "
2186 					    "by platform\n", f,
2187 					    t_mp->sbm_cm.sbdev_path);
2188 					continue;
2189 				}
2190 
2191 				/*
2192 				 * Check if its mc can be programmed to relocate
2193 				 * the active address range to match the
2194 				 * nonrelocatable span of the source board.
2195 				 */
2196 				preference = 2;
2197 
2198 				if (s_mq.phys_pages == 0) {
2199 					/*
2200 					 * find non-relocatable span on
2201 					 * source board.
2202 					 */
2203 					rv = kphysm_del_span_query(
2204 					    s_mp->sbm_basepfn,
2205 					    s_mp->sbm_npages, &s_mq);
2206 					if (rv != KPHYSM_OK) {
2207 						PR_MEM("%s: %s: unexpected"
2208 						    " kphysm_del_span_query"
2209 						    " return value %d;"
2210 						    " basepfn 0x%lx,"
2211 						    " npages %ld\n",
2212 						    f,
2213 						    s_mp->sbm_cm.sbdev_path,
2214 						    rv,
2215 						    s_mp->sbm_basepfn,
2216 						    s_mp->sbm_npages);
2217 
2218 						/* paranoia */
2219 						s_mq.phys_pages = 0;
2220 
2221 						continue;
2222 					}
2223 
2224 					/* more paranoia */
2225 					ASSERT(s_mq.phys_pages != 0);
2226 					ASSERT(s_mq.nonrelocatable != 0);
2227 
2228 					/*
2229 					 * this should not happen
2230 					 * if it does, it simply means that
2231 					 * we can not proceed with qualifying
2232 					 * this target candidate.
2233 					 */
2234 					if (s_mq.nonrelocatable == 0)
2235 						continue;
2236 
2237 					PR_MEM("%s: %s: nonrelocatable"
2238 					    " span (0x%lx..0x%lx)\n",
2239 					    f,
2240 					    s_mp->sbm_cm.sbdev_path,
2241 					    s_mq.first_nonrelocatable,
2242 					    s_mq.last_nonrelocatable);
2243 				}
2244 
2245 				/*
2246 				 * Round down the starting pfn of the
2247 				 * nonrelocatable span on the source board
2248 				 * to nearest programmable boundary possible
2249 				 * with this target candidate.
2250 				 */
2251 				pfn = s_mq.first_nonrelocatable &
2252 				    ~t_mp->sbm_alignment_mask;
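				/*
				 * Illustrative example (hypothetical
				 * values): first_nonrelocatable =
				 * 0x12345 with an alignment mask of
				 * 0xfff rounds down to pfn = 0x12000.
				 */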
2253 
2254 				/* skip candidate if memory is too small */
2255 				if (pfn + t_mp->sbm_npages <
2256 				    s_mq.last_nonrelocatable)
2257 					continue;
2258 
2259 				/*
2260 				 * reprogramming an mc to relocate its
2261 				 * active address range means the beginning
2262 				 * address to which the DIMMs respond will
2263 				 * be somewhere above the slice boundary
2264 				 * address.  The larger the size of memory
2265 				 * on this unit, the more likely part of it
2266 				 * will exist beyond the end of the slice.
2267 				 * The portion of the memory that does is
2268 				 * unavailable to the system until the mc
2269 				 * is reprogrammed to a more favorable base
2270 				 * address.
2271 				 * An attempt is made to avoid the loss by
2272 				 * recalculating the mc base address relative
2273 				 * to the end of the slice.  This may produce
2274 				 * a more favorable result.  If not, we lower
2275 				 * the board's preference rating so that it
2276 				 * is one of the last candidate boards to be
2277 				 * considered.
2278 				 */
2279 				if ((pfn + t_mp->sbm_npages) & ~sm) {
2280 					pfn_t p;
2281 
2282 					ASSERT(sz >= t_mp->sbm_npages);
2283 
2284 					/*
2285 					 * calculate an alternative starting
2286 					 * address relative to the end of the
2287 					 * slice's address space.
2288 					 */
2289 					p = pfn & ~sm;
2290 					p = p + (sz - t_mp->sbm_npages);
2291 					p = p & ~t_mp->sbm_alignment_mask;
2292 
2293 					if ((p > s_mq.first_nonrelocatable) ||
2294 					    (p + t_mp->sbm_npages <
2295 					    s_mq.last_nonrelocatable)) {
2296 
2297 						/*
2298 						 * alternative starting addr
2299 						 * won't work. Lower preference
2300 						 * rating of this board, since
2301 						 * some number of pages will be
2302 						 * unavailable for use.
2303 						 */
2304 						preference = 3;
2305 					} else {
2306 						dr_smt_realigned++;
2307 						pfn = p;
2308 					}
2309 				}
2310 
2311 				/*
2312 				 * translate calculated pfn to an offset
2313 				 * relative to the slice boundary.  If the
2314 				 * candidate board is selected, this offset
2315 				 * will be used to calculate the values
2316 				 * programmed into the mc.
2317 				 */
2318 				t_mp->sbm_slice_offset = pfn & sm;
2319 				PR_MEM("%s: %s:"
2320 				    "  proposed mc offset 0x%lx\n",
2321 				    f,
2322 				    t_mp->sbm_cm.sbdev_path,
2323 				    t_mp->sbm_slice_offset);
2324 			}
2325 
2326 			dr_smt_preference[preference]++;
2327 
2328 			/* calculate index to start of preference set */
2329 			idx  = n_units_per_set * preference;
2330 			/* calculate offset to respective element */
2331 			idx += t_bd * MAX_MEM_UNITS_PER_BOARD + t_unit;
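			/*
			 * Illustrative example (hypothetical values):
			 * with MAX_BOARDS = 16 and
			 * MAX_MEM_UNITS_PER_BOARD = 1, n_units_per_set
			 * is 16, so a preference-2 unit on board 5
			 * lands at idx = (2 * 16) + 5 = 37.
			 */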
2332 
2333 			ASSERT(idx < n_units_per_set * n_sets);
2334 			sets[idx] = t_mp;
2335 		}
2336 	}
2337 
2338 	/*
2339 	 * NOTE: this would be a good place to sort each candidate
2340 	 * set in to some desired order, e.g. memory size in ascending
2341 	 * order.  Without an additional sorting step here, the order
2342 	 * within a set is ascending board number order.
2343 	 */
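	/*
	 * A minimal sketch of such a sort (illustrative only, not
	 * implemented here): for each set i, order the slots
	 * sets[i * n_units_per_set .. (i + 1) * n_units_per_set - 1]
	 * by sbm_npages, treating NULL slots as largest so they sink
	 * to the end of the set.
	 */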
2344 
2345 	c_mp = NULL;
2346 	x_ml = NULL;
2347 	t_ml = NULL;
2348 	for (idx = 0; idx < n_units_per_set * n_sets; idx++) {
2349 		memquery_t mq;
2350 
2351 		/* cleanup t_ml after previous pass */
2352 		if (t_ml != NULL) {
2353 			memlist_delete(t_ml);
2354 			t_ml = NULL;
2355 		}
2356 
2357 		/* get candidate target board mem unit */
2358 		t_mp = sets[idx];
2359 		if (t_mp == NULL)
2360 			continue;
2361 
2362 		/* get target board memlist */
2363 		t_ml = dr_get_memlist(t_mp);
2364 		if (t_ml == NULL) {
2365 			cmn_err(CE_WARN, "%s: no memlist for"
2366 			    " mem-unit %d, board %d",
2367 			    f,
2368 			    t_mp->sbm_cm.sbdev_bp->b_num,
2369 			    t_mp->sbm_cm.sbdev_unum);
2370 
2371 			continue;
2372 		}
2373 
2374 		/* get appropriate source board memlist */
2375 		t_phi = (t_mp->sbm_basepfn + t_mp->sbm_npages - 1) & sm;
2376 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2377 			spgcnt_t excess;
2378 
2379 			/*
2380 			 * make a copy of the source board memlist
2381 			 * then edit it to remove the spans that
2382 			 * are outside the calculated span of
2383 			 * [pfn..s_mq.last_nonrelocatable].
2384 			 */
2385 			if (x_ml != NULL)
2386 				memlist_delete(x_ml);
2387 
2388 			x_ml = memlist_dup(s_ml);
2389 			if (x_ml == NULL) {
2390 				PR_MEM("%s: memlist_dup failed\n", f);
2391 				/* TODO: should abort */
2392 				continue;
2393 			}
2394 
2395 			/* trim off lower portion */
2396 			excess = t_mp->sbm_slice_offset -
2397 			    (s_mp->sbm_basepfn & sm);
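			/*
			 * Illustrative example (hypothetical values):
			 * if the target's sbm_slice_offset is 0x2000
			 * and the source's basepfn lies at slice
			 * offset 0x1000, excess = 0x1000 and the
			 * lowest 0x1000 pages are trimmed from the
			 * copy below.
			 */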
2398 
2399 			if (excess > 0) {
2400 				x_ml = memlist_del_span(
2401 				    x_ml,
2402 				    _ptob64(s_mp->sbm_basepfn),
2403 				    _ptob64(excess));
2404 			}
2405 			ASSERT(x_ml);
2406 
2407 			/*
2408 			 * Since this candidate target board is smaller
2409 			 * than the source board, s_mq must have been
2410 			 * initialized in previous loop while processing
2411 			 * this or some other candidate board.
2412 			 * FIXME: this is weak.
2413 			 */
2414 			ASSERT(s_mq.phys_pages != 0);
2415 
2416 			/* trim off upper portion */
2417 			excess = (s_mp->sbm_basepfn + s_mp->sbm_npages)
2418 			    - (s_mq.last_nonrelocatable + 1);
2419 			if (excess > 0) {
2420 				pfn_t p;
2421 
2422 				p  = s_mq.last_nonrelocatable + 1;
2423 				x_ml = memlist_del_span(
2424 				    x_ml,
2425 				    _ptob64(p),
2426 				    _ptob64(excess));
2427 			}
2428 
2429 			PR_MEM("%s: %s: edited source memlist:\n",
2430 			    f, s_mp->sbm_cm.sbdev_path);
2431 			PR_MEMLIST_DUMP(x_ml);
2432 
2433 #ifdef DEBUG
2434 			/* sanity check memlist */
2435 			d_ml = x_ml;
2436 			while (d_ml->next != NULL)
2437 				d_ml = d_ml->next;
2438 
2439 			ASSERT(d_ml->address + d_ml->size ==
2440 			    _ptob64(s_mq.last_nonrelocatable + 1));
2441 #endif
2442 
2443 			/*
2444 			 * x_ml now describes only the portion of the
2445 			 * source board that will be moved during the
2446 			 * copy/rename operation.
2447 			 */
2448 			d_ml = x_ml;
2449 		} else {
2450 			/* use original memlist; all spans will be moved */
2451 			d_ml = s_ml;
2452 		}
2453 
2454 		/* verify target can support source memory spans. */
2455 		if (memlist_canfit(d_ml, t_ml) == 0) {
2456 			PR_MEM("%s: source memlist won't"
2457 			    " fit in target memlist\n", f);
2458 			PR_MEM("%s: source memlist:\n", f);
2459 			PR_MEMLIST_DUMP(d_ml);
2460 			PR_MEM("%s: target memlist:\n", f);
2461 			PR_MEMLIST_DUMP(t_ml);
2462 
2463 			continue;
2464 		}
2465 
2466 		/* NOTE: the value of d_ml is not used beyond this point */
2467 
2468 		PR_MEM("%s: checking for no-reloc in %s, "
2469 		    " basepfn=0x%lx, npages=%ld\n",
2470 		    f,
2471 		    t_mp->sbm_cm.sbdev_path,
2472 		    t_mp->sbm_basepfn,
2473 		    t_mp->sbm_npages);
2474 
2475 		rv = kphysm_del_span_query(
2476 		    t_mp->sbm_basepfn, t_mp->sbm_npages, &mq);
2477 		if (rv != KPHYSM_OK) {
2478 			PR_MEM("%s: kphysm_del_span_query:"
2479 			    " unexpected return value %d\n", f, rv);
2480 
2481 			continue;
2482 		}
2483 
2484 		if (mq.nonrelocatable != 0) {
2485 			PR_MEM("%s: candidate %s has"
2486 			    " nonrelocatable span [0x%lx..0x%lx]\n",
2487 			    f,
2488 			    t_mp->sbm_cm.sbdev_path,
2489 			    mq.first_nonrelocatable,
2490 			    mq.last_nonrelocatable);
2491 
2492 			continue;
2493 		}
2494 
2495 #ifdef DEBUG
2496 		/*
2497 		 * This is a debug tool for excluding certain boards
2498 		 * from being selected as a target board candidate.
2499 		 * dr_ignore_board is only tested by this driver.
2500 		 * It must be set with adb, obp, /etc/system or your
2501 		 * favorite debugger.
2502 		 */
2503 		if (dr_ignore_board &
2504 		    (1 << (t_mp->sbm_cm.sbdev_bp->b_num - 1))) {
2505 			PR_MEM("%s: dr_ignore_board flag set,"
2506 			    " ignoring %s as candidate\n",
2507 			    f, t_mp->sbm_cm.sbdev_path);
2508 			continue;
2509 		}
2510 #endif
2511 
2512 		/*
2513 		 * Reserve excess source board memory, if any.
2514 		 *
2515 		 * When the number of pages on the candidate target
2516 		 * board is less than the number of pages on the source,
2517 		 * then some spans (clearly) of the source board's address
2518 		 * space will not be covered by physical memory after the
2519 		 * copy/rename completes.  The following code block
2520 		 * schedules those spans to be deleted.
2521 		 */
2522 		if (t_mp->sbm_npages < s_mp->sbm_npages || t_phi < s_phi) {
2523 			pfn_t pfn;
2524 			uint64_t s_del_pa;
2525 			struct memlist *ml;
2526 
2527 			d_ml = memlist_dup(s_ml);
2528 			if (d_ml == NULL) {
2529 				PR_MEM("%s: cant dup src brd memlist\n", f);
2530 				/* TODO: should abort */
2531 				continue;
2532 			}
2533 
2534 			/* calculate base pfn relative to target board */
2535 			pfn  = s_mp->sbm_basepfn & ~sm;
2536 			pfn += t_mp->sbm_slice_offset;
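			/*
			 * Illustrative example (hypothetical values):
			 * with sbm_basepfn = 0x101000, sm = 0x7ffff
			 * and a target slice offset of 0x2000,
			 * pfn = 0x100000 + 0x2000 = 0x102000.
			 */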
2537 
2538 			/*
2539 			 * cannot split dynamically added segment
2540 			 */
2541 			s_del_pa = _ptob64(pfn + t_mp->sbm_npages);
2542 			PR_MEM("%s: proposed src delete pa=0x%lx\n", f,
2543 			    s_del_pa);
2544 			PR_MEM("%s: checking for split of dyn seg list:\n", f);
2545 			PR_MEMLIST_DUMP(s_mp->sbm_dyn_segs);
2546 			for (ml = s_mp->sbm_dyn_segs; ml; ml = ml->next) {
2547 				if (s_del_pa > ml->address &&
2548 				    s_del_pa < ml->address + ml->size) {
2549 					s_del_pa = ml->address;
2550 					break;
2551 				}
2552 			}
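			/*
			 * Illustrative example (hypothetical values):
			 * if a dynamically added segment spans
			 * [0x40000000, 0x50000000) and the proposed
			 * boundary falls at 0x48000000, s_del_pa is
			 * pulled back to 0x40000000 so the segment is
			 * not split.
			 */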
2553 
2554 			/* remove span that will reside on candidate board */
2555 			d_ml = memlist_del_span(d_ml, _ptob64(pfn),
2556 			    s_del_pa - _ptob64(pfn));
2557 
2558 			PR_MEM("%s: %s: reserving src brd memlist:\n",
2559 			    f, s_mp->sbm_cm.sbdev_path);
2560 			PR_MEMLIST_DUMP(d_ml);
2561 
2562 			/* reserve excess spans */
2563 			if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, d_ml)
2564 			    != 0) {
2565 
2566 				/* likely more non-reloc pages appeared */
2567 				/* TODO: restart from top? */
2568 				continue;
2569 			}
2570 		} else {
2571 			/* no excess source board memory */
2572 			d_ml = NULL;
2573 		}
2574 
2575 		s_mp->sbm_flags |= DR_MFLAG_RESERVED;
2576 
2577 		/*
2578 		 * reserve all memory on target board.
2579 		 * NOTE: source board's memhandle is used.
2580 		 *
2581 		 * If this succeeds (eq 0), then target selection is
2582 		 * complete and all unwanted memory spans, both source and
2583 		 * target, have been reserved.  Loop is terminated.
2584 		 */
2585 		if (dr_reserve_mem_spans(&s_mp->sbm_memhandle, t_ml) == 0) {
2586 			PR_MEM("%s: %s: target board memory reserved\n",
2587 			    f, t_mp->sbm_cm.sbdev_path);
2588 
2589 			/* a candidate target board is now reserved */
2590 			t_mp->sbm_flags |= DR_MFLAG_RESERVED;
2591 			c_mp = t_mp;
2592 
2593 			/* *** EXITING LOOP *** */
2594 			break;
2595 		}
2596 
2597 		/* did not successfully reserve the target board. */
2598 		PR_MEM("%s: could not reserve target %s\n",
2599 		    f, t_mp->sbm_cm.sbdev_path);
2600 
2601 		/*
2602 		 * NOTE: an undo of the dr_reserve_mem_spans work
2603 		 * will happen automatically when the memhandle
2604 		 * (s_mp->sbm_memhandle) is kphysm_del_release'd.
2605 		 */
2606 
2607 		s_mp->sbm_flags &= ~DR_MFLAG_RESERVED;
2608 	}
2609 
2610 	/* clean up after memlist editing logic */
2611 	if (x_ml != NULL)
2612 		memlist_delete(x_ml);
2613 
2614 	FREESTRUCT(sets, dr_mem_unit_t *, n_units_per_set * n_sets);
2615 
2616 	/*
2617 	 * c_mp will be NULL when the entire sets[] array
2618 	 * has been searched without reserving a target board.
2619 	 */
2620 	if (c_mp == NULL) {
2621 		PR_MEM("%s: %s: target selection failed.\n",
2622 		    f, s_mp->sbm_cm.sbdev_path);
2623 
2624 		if (t_ml != NULL)
2625 			memlist_delete(t_ml);
2626 
2627 		return (-1);
2628 	}
2629 
2630 	PR_MEM("%s: found target %s for source %s\n",
2631 	    f,
2632 	    c_mp->sbm_cm.sbdev_path,
2633 	    s_mp->sbm_cm.sbdev_path);
2634 
2635 	s_mp->sbm_peer = c_mp;
2636 	s_mp->sbm_flags |= DR_MFLAG_SOURCE;
2637 	s_mp->sbm_del_mlist = d_ml;	/* spans to be deleted, if any */
2638 	s_mp->sbm_mlist = s_ml;
2639 	s_mp->sbm_cm.sbdev_busy = 1;
2640 
2641 	c_mp->sbm_peer = s_mp;
2642 	c_mp->sbm_flags |= DR_MFLAG_TARGET;
2643 	c_mp->sbm_del_mlist = t_ml;	/* spans to be deleted */
2644 	c_mp->sbm_mlist = t_ml;
2645 	c_mp->sbm_cm.sbdev_busy = 1;
2646 
2647 	s_mp->sbm_flags &= ~DR_MFLAG_MEMRESIZE;
2648 	if (c_mp->sbm_npages > s_mp->sbm_npages) {
2649 		s_mp->sbm_flags |= DR_MFLAG_MEMUPSIZE;
2650 		PR_MEM("%s: upsize detected (source=%ld < target=%ld)\n",
2651 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2652 	} else if (c_mp->sbm_npages < s_mp->sbm_npages) {
2653 		s_mp->sbm_flags |= DR_MFLAG_MEMDOWNSIZE;
2654 		PR_MEM("%s: downsize detected (source=%ld > target=%ld)\n",
2655 		    f, s_mp->sbm_npages, c_mp->sbm_npages);
2656 	}
2657 
2658 	return (0);
2659 }
2660 
2661 /*
2662  * Memlist support.
2663  */
2664 
2665 /*
2666  * Determine whether the source memlist (s_mlist) will
2667  * fit into the target memlist (t_mlist) in terms of
2668  * size and holes (i.e. based on same relative base address).
2669  */
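/*
 * For example (illustrative): a source memlist
 * { [0x1000000, len 0x800000), [0x2000000, len 0x400000) } fits a
 * target memlist { [0x5000000, len 0x800000), [0x6000000, len
 * 0x800000) } because, once both lists are rebased to 0, every source
 * span lies within some target span.
 */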
2670 static int
2671 memlist_canfit(struct memlist *s_mlist, struct memlist *t_mlist)
2672 {
2673 	int		rv = 0;
2674 	uint64_t	s_basepa, t_basepa;
2675 	struct memlist	*s_ml, *t_ml;
2676 
2677 	if ((s_mlist == NULL) || (t_mlist == NULL))
2678 		return (0);
2679 
2680 	/*
2681 	 * Base both memlists on common base address (0).
2682 	 */
2683 	s_basepa = s_mlist->address;
2684 	t_basepa = t_mlist->address;
2685 
2686 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2687 		s_ml->address -= s_basepa;
2688 
2689 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2690 		t_ml->address -= t_basepa;
2691 
2692 	s_ml = s_mlist;
2693 	for (t_ml = t_mlist; t_ml && s_ml; t_ml = t_ml->next) {
2694 		uint64_t	s_start, s_end;
2695 		uint64_t	t_start, t_end;
2696 
2697 		t_start = t_ml->address;
2698 		t_end = t_start + t_ml->size;
2699 
2700 		for (; s_ml; s_ml = s_ml->next) {
2701 			s_start = s_ml->address;
2702 			s_end = s_start + s_ml->size;
2703 
2704 			if ((s_start < t_start) || (s_end > t_end))
2705 				break;
2706 		}
2707 	}
2708 	/*
2709 	 * If we ran out of source memlist chunks, that means
2710 	 * we found a home for all of them.
2711 	 */
2712 	if (s_ml == NULL)
2713 		rv = 1;
2714 
2715 	/*
2716 	 * Need to add base addresses back since memlists
2717 	 * are probably in use by caller.
2718 	 */
2719 	for (s_ml = s_mlist; s_ml; s_ml = s_ml->next)
2720 		s_ml->address += s_basepa;
2721 
2722 	for (t_ml = t_mlist; t_ml; t_ml = t_ml->next)
2723 		t_ml->address += t_basepa;
2724 
2725 	return (rv);
2726 }
2727