xref: /illumos-gate/usr/src/uts/common/os/mem_config.c (revision 67ce1dada345581246cd990d73516418f321a793)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
60 extern struct memlist *phys_avail;
61 
62 extern void mem_node_add(pfn_t, pfn_t);
63 extern void mem_node_del(pfn_t, pfn_t);
64 
65 extern uint_t page_ctrs_adjust(int);
66 static void kphysm_setup_post_add(pgcnt_t);
67 static int kphysm_setup_pre_del(pgcnt_t);
68 static void kphysm_setup_post_del(pgcnt_t, int);
69 
70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
71 
72 static int delspan_reserve(pfn_t, pgcnt_t);
73 static void delspan_unreserve(pfn_t, pgcnt_t);
74 
75 static kmutex_t memseg_lists_lock;
76 static struct memseg *memseg_va_avail;
77 static struct memseg *memseg_delete_junk;
78 static struct memseg *memseg_edit_junk;
79 void memseg_remap_init(void);
80 static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
82 static struct memseg *memseg_reuse(pgcnt_t);
83 
84 static struct kmem_cache *memseg_cache;
85 
86 /*
87  * Add a chunk of memory to the system.  page_t's for this memory
88  * are allocated in the first few pages of the chunk.
89  * base: starting PAGESIZE page of new memory.
90  * npgs: length in PAGESIZE pages.
91  *
92  * Adding mem this way doesn't increase the size of the hash tables;
93  * growing them would be too hard.  This should be OK, but adding memory
94  * dynamically most likely means more hash misses, since the tables will
95  * be smaller than they otherwise would be.
96  */
97 int
98 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
99 {
100 	page_t		*pp;
101 	page_t		*opp, *oepp;
102 	struct memseg	*seg;
103 	uint64_t	avmem;
104 	pfn_t		pfn;
105 	pfn_t		pt_base = base;
106 	pgcnt_t		tpgs = npgs;
107 	pgcnt_t		metapgs;
108 	int		exhausted;
109 	pfn_t		pnum;
110 	int		mnode;
111 	caddr_t		vaddr;
112 	int		reuse;
113 	int		mlret;
114 	void		*mapva;
115 	pgcnt_t		nkpmpgs = 0;
116 	offset_t	kpm_pages_off;
117 
118 	cmn_err(CE_CONT,
119 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
120 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
121 
122 	/*
123 	 * Add this span in the delete list to prevent interactions.
124 	 */
125 	if (!delspan_reserve(base, npgs)) {
126 		return (KPHYSM_ESPAN);
127 	}
128 	/*
129 	 * Check to see if any of the memory span has been added
130 	 * by trying an add to the installed memory list. This
131 	 * forms the interlocking process for add.
132 	 */
133 
134 	memlist_write_lock();
135 
136 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
137 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
138 
139 	if (mlret == MEML_SPANOP_OK)
140 		installed_top_size(phys_install, &physmax, &physinstalled);
141 
142 	memlist_write_unlock();
143 
144 	if (mlret != MEML_SPANOP_OK) {
145 		if (mlret == MEML_SPANOP_EALLOC) {
146 			delspan_unreserve(pt_base, tpgs);
147 			return (KPHYSM_ERESOURCE);
148 		} else
149 		if (mlret == MEML_SPANOP_ESPAN) {
150 			delspan_unreserve(pt_base, tpgs);
151 			return (KPHYSM_ESPAN);
152 		} else {
153 			delspan_unreserve(pt_base, tpgs);
154 			return (KPHYSM_ERESOURCE);
155 		}
156 	}
157 
158 	/*
159 	 * We store the page_t's for this new memory in the first
160 	 * few pages of the chunk. Here, we go and get'em ...
161 	 */
162 
163 	/*
164 	 * The expression after the '-' gives the number of pages
165 	 * that will fit in the new memory based on a requirement
166 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
167 	 */
168 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
169 	    (PAGESIZE + sizeof (page_t)));
170 
171 	npgs -= metapgs;
172 	base += metapgs;
173 
174 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
175 
176 	exhausted = (metapgs == 0 || npgs == 0);
177 
178 	if (kpm_enable && !exhausted) {
179 		pgcnt_t start, end, nkpmpgs_prelim;
180 		size_t	ptsz;
181 
182 		/*
183 		 * A viable kpm large page mapping must not overlap two
184 		 * dynamic memsegs. Therefore the total size is checked
185 		 * to be at least kpm_pgsz and also whether start and end
186 		 * points are at least kpm_pgsz aligned.
187 		 */
188 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
189 		    pmodkpmp(base + npgs)) {
190 
191 			kphysm_addmem_error_undospan(pt_base, tpgs);
192 
193 			/*
194 			 * There is no specific error code for violating
195 			 * kpm granularity constraints.
196 			 */
197 			return (KPHYSM_ENOTVIABLE);
198 		}
199 
200 		start = kpmptop(ptokpmp(base));
201 		end = kpmptop(ptokpmp(base + npgs));
202 		nkpmpgs_prelim = ptokpmp(end - start);
203 		ptsz = npgs * sizeof (page_t);
204 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
205 		exhausted = (tpgs <= metapgs);
206 		if (!exhausted) {
207 			npgs = tpgs - metapgs;
208 			base = pt_base + metapgs;
209 
210 			/* final nkpmpgs */
211 			start = kpmptop(ptokpmp(base));
212 			nkpmpgs = ptokpmp(end - start);
213 			kpm_pages_off = ptsz +
214 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
215 		}
216 	}
217 
218 	/*
219 	 * Is memory area supplied too small?
220 	 */
221 	if (exhausted) {
222 		kphysm_addmem_error_undospan(pt_base, tpgs);
223 
224 		/*
225 		 * There is no specific error code for 'too small'.
226 		 */
227 		return (KPHYSM_ERESOURCE);
228 	}
229 
230 	/*
231 	 * We may re-use a previously allocated VA space for the page_ts
232 	 * eventually, but we need to initialize and lock the pages first.
233 	 */
234 
235 	/*
236 	 * Get an address in the kernel address map, map
237 	 * the page_t pages and see if we can touch them.
238 	 */
239 
240 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
241 	if (mapva == NULL) {
242 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
243 		    " Can't allocate VA for page_ts");
244 
245 		kphysm_addmem_error_undospan(pt_base, tpgs);
246 
247 		return (KPHYSM_ERESOURCE);
248 	}
249 	pp = mapva;
250 
251 	if (physmax < (pt_base + tpgs))
252 		physmax = (pt_base + tpgs);
253 
254 	/*
255 	 * In the remapping code we map one page at a time so we must do
256 	 * the same here to match mapping sizes.
257 	 */
258 	pfn = pt_base;
259 	vaddr = (caddr_t)pp;
260 	for (pnum = 0; pnum < metapgs; pnum++) {
261 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
262 		    PROT_READ | PROT_WRITE,
263 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
264 		pfn++;
265 		vaddr += ptob(1);
266 	}
267 
268 	if (ddi_peek32((dev_info_t *)NULL,
269 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
270 
271 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
272 		    " Can't access pp array at 0x%p [phys 0x%lx]",
273 		    (void *)pp, pt_base);
274 
275 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
276 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
277 
278 		vmem_free(heap_arena, mapva, ptob(metapgs));
279 
280 		kphysm_addmem_error_undospan(pt_base, tpgs);
281 
282 		return (KPHYSM_EFAULT);
283 	}
284 
285 	/*
286 	 * Add this memory slice to its memory node translation.
287 	 *
288 	 * Note that right now, each node may have only one slice;
289 	 * this may change with COD or in larger SSM systems with
290 	 * nested latency groups, so we must not assume that the
291 	 * node does not yet exist.
292 	 */
293 	pnum = base + npgs - 1;
294 	mem_node_add_slice(base, pnum);
295 
296 	/*
297 	 * Allocate or resize page counters as necessary to accommodate
298 	 * the increase in memory pages.
299 	 */
300 	mnode = PFN_2_MEM_NODE(pnum);
301 	if (page_ctrs_adjust(mnode) != 0) {
302 
303 		mem_node_pre_del_slice(base, pnum);
304 		mem_node_post_del_slice(base, pnum, 0);
305 
306 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
307 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
308 
309 		vmem_free(heap_arena, mapva, ptob(metapgs));
310 
311 		kphysm_addmem_error_undospan(pt_base, tpgs);
312 
313 		return (KPHYSM_ERESOURCE);
314 	}
315 
316 	/*
317 	 * Update the phys_avail memory list.
318 	 * The phys_install list was done at the start.
319 	 */
320 
321 	memlist_write_lock();
322 
323 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
324 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
325 	ASSERT(mlret == MEML_SPANOP_OK);
326 
327 	memlist_write_unlock();
328 
329 	/* See if we can find a memseg to re-use. */
330 	seg = memseg_reuse(metapgs);
331 
332 	reuse = (seg != NULL);
333 
334 	/*
335 	 * Initialize the memseg structure representing this memory
336 	 * and add it to the existing list of memsegs. Do some basic
337 	 * initialization and add the memory to the system.
338 	 * In order to prevent lock deadlocks, the add_physmem()
339 	 * code is repeated here, but split into several stages.
340 	 */
341 	if (seg == NULL) {
342 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
343 		bzero(seg, sizeof (struct memseg));
344 		seg->msegflags = MEMSEG_DYNAMIC;
345 		seg->pages = pp;
346 	} else {
347 		/*EMPTY*/
348 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
349 	}
350 
351 	seg->epages = seg->pages + npgs;
352 	seg->pages_base = base;
353 	seg->pages_end = base + npgs;
354 
355 	/*
356 	 * Initialize metadata. The page_ts are set to locked state
357 	 * ready to be freed.
358 	 */
359 	bzero((caddr_t)pp, ptob(metapgs));
360 
361 	pfn = seg->pages_base;
362 	/* Save the original pp base in case we reuse a memseg. */
363 	opp = pp;
364 	oepp = opp + npgs;
365 	for (pp = opp; pp < oepp; pp++) {
366 		pp->p_pagenum = pfn;
367 		pfn++;
368 		page_iolock_init(pp);
369 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
370 			continue;
371 		pp->p_offset = (u_offset_t)-1;
372 	}
373 
374 	if (reuse) {
375 		/* Remap our page_ts to the re-used memseg VA space. */
376 		pfn = pt_base;
377 		vaddr = (caddr_t)seg->pages;
378 		for (pnum = 0; pnum < metapgs; pnum++) {
379 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
380 			    PROT_READ | PROT_WRITE,
381 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
382 			pfn++;
383 			vaddr += ptob(1);
384 		}
385 
386 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
387 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
388 
389 		vmem_free(heap_arena, mapva, ptob(metapgs));
390 	}
391 
392 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
393 
394 	memsegs_lock(1);
395 
396 	/*
397 	 * The new memseg is inserted at the beginning of the list.
398 	 * Not only does this save searching for the tail, but in the
399 	 * case of a re-used memseg, it solves the problem of what
400 	 * happens if some process has still got a pointer to the
401 	 * memseg and follows the next pointer to continue traversing
402 	 * the memsegs list.
403 	 */
404 
405 	hat_kpm_addmem_mseg_insert(seg);
406 
407 	seg->next = memsegs;
408 	membar_producer();
409 
410 	hat_kpm_addmem_memsegs_update(seg);
411 
412 	memsegs = seg;
413 
414 	build_pfn_hash();
415 
416 	total_pages += npgs;
417 
418 	/*
419 	 * Recalculate the paging parameters now total_pages has changed.
420 	 * This will also cause the clock hands to be reset before next use.
421 	 */
422 	setupclock(1);
423 
424 	memsegs_unlock(1);
425 
426 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
427 
428 	/*
429 	 * Free the pages outside the lock to avoid locking loops.
430 	 */
431 	for (pp = seg->pages; pp < seg->epages; pp++) {
432 		page_free(pp, 1);
433 	}
434 
435 	/*
436 	 * Now that we've updated the appropriate memory lists we
437 	 * need to reset a number of globals, since we've increased memory.
438 	 * Several have already been updated for us as noted above. The
439 	 * globals we're interested in at this point are:
440 	 *   physmax - highest page frame number.
441 	 *   physinstalled - number of pages currently installed (done earlier)
442 	 *   maxmem - max free pages in the system
443 	 *   physmem - physical memory pages available
444 	 *   availrmem - real memory available
445 	 */
446 
447 	mutex_enter(&freemem_lock);
448 	maxmem += npgs;
449 	physmem += npgs;
450 	availrmem += npgs;
451 	availrmem_initial += npgs;
452 
453 	mutex_exit(&freemem_lock);
454 
455 	dump_resize();
456 
457 	page_freelist_coalesce_all(mnode);
458 
459 	kphysm_setup_post_add(npgs);
460 
461 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
462 	    "(0x%" PRIx64 ")\n",
463 	    physinstalled << (PAGESHIFT - 10),
464 	    (uint64_t)physinstalled << PAGESHIFT);
465 
466 	avmem = (uint64_t)freemem << PAGESHIFT;
467 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
468 	    "avail mem = %" PRId64 "\n", avmem);
469 
470 	/*
471 	 * Update lgroup generation number on single lgroup systems
472 	 */
473 	if (nlgrps == 1)
474 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
475 
476 	delspan_unreserve(pt_base, tpgs);
477 	return (KPHYSM_OK);		/* Successfully added system memory */
478 
479 }
480 
481 /*
482  * There are various error conditions in kphysm_add_memory_dynamic()
483  * which require a rollback of already changed global state.
484  */
485 static void
486 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
487 {
488 	int mlret;
489 
490 	/* Unreserve memory span. */
491 	memlist_write_lock();
492 
493 	mlret = memlist_delete_span(
494 	    (uint64_t)(pt_base) << PAGESHIFT,
495 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
496 
497 	ASSERT(mlret == MEML_SPANOP_OK);
498 	phys_install_has_changed();
499 	installed_top_size(phys_install, &physmax, &physinstalled);
500 
501 	memlist_write_unlock();
502 	delspan_unreserve(pt_base, tpgs);
503 }
504 
505 /*
506  * Only return an available memseg of exactly the right size.
507  * When the meta data area has it's own virtual address space
508  * we will need to manage this more carefully and do best fit
509  * allocations, possibly splitting an available area.
510  */
511 static struct memseg *
512 memseg_reuse(pgcnt_t metapgs)
513 {
514 	struct memseg **segpp, *seg;
515 
516 	mutex_enter(&memseg_lists_lock);
517 
518 	segpp = &memseg_va_avail;
519 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
520 		caddr_t end;
521 
522 		if (kpm_enable)
523 			end = hat_kpm_mseg_reuse(seg);
524 		else
525 			end = (caddr_t)seg->epages;
526 
527 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
528 			*segpp = seg->lnext;
529 			seg->lnext = NULL;
530 			break;
531 		}
532 	}
533 	mutex_exit(&memseg_lists_lock);
534 
535 	return (seg);
536 }
537 
/* Generation counter used to mint unique external memhandle_t values. */
static uint_t handle_gen;

/*
 * One contiguous span of physical pages taking part in an add or
 * delete operation; spans are chained per transit_list.
 */
struct memdelspan {
	struct memdelspan *mds_next;	/* next span on the list */
	pfn_t		mds_base;	/* first pfn of the span */
	pgcnt_t		mds_npgs;	/* span length in pages */
	uint_t		*mds_bitmap;	/* per-page bitmap (delete path) */
	uint_t		*mds_bitmap_retired;	/* per-page retired bitmap */
};

/* Bits per bitmap word. */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/* Bytes needed for a span's per-page bitmap, rounded to whole words. */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
551 
/*
 * A transit list groups the delspans belonging to one in-flight memory
 * operation; all lists hang off the single global transit_list_head.
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next active transit list */
	struct memdelspan	*trl_spans;	/* this operation's spans */
	int			trl_collect;	/* set via transit_list_collect() */
};

struct transit_list_head {
	kmutex_t		trh_lock;	/* protects the whole chain */
	struct transit_list	*trh_head;	/* head of active lists */
};

/* Global anchor for all in-flight add/delete span operations. */
static struct transit_list_head transit_list_head;
564 
/* Forward declarations for the transit list helpers defined later. */
struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

/* Delete statistics are compiled in on DEBUG kernels only. */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */
573 
#ifdef MEM_DEL_STATS
/* Set non-zero to have the delete thread print its stats on completion. */
static int mem_del_stat_print = 0;
/*
 * Event counters maintained by the memory delete thread; field usage
 * lives in the delete code further down this file.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;	/* total delete time, in ticks */
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
/* Non-DEBUG kernels: the stat macros compile away to nothing. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
624 
/* Life-cycle states of a memory delete handle. */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;	/* next handle on the global list */
	memhandle_t	mh_exthandle;	/* handle value given to callers */
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;	/* spans for this delete */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;	/* the delete thread */
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

/* Global list of all handles, protected by mem_handle_list_mutex. */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
656 
657 static struct mem_handle *
658 kphysm_allocate_mem_handle()
659 {
660 	struct mem_handle *mhp;
661 
662 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
663 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
664 	mutex_enter(&mem_handle_list_mutex);
665 	mutex_enter(&mhp->mh_mutex);
666 	/* handle_gen is protected by list mutex. */
667 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
668 	mhp->mh_next = mem_handle_head;
669 	mem_handle_head = mhp;
670 	mutex_exit(&mem_handle_list_mutex);
671 
672 	return (mhp);
673 }
674 
/*
 * Unlink a handle (which must be in the MHND_FREE state, with mh_mutex
 * held on entry) from the global list and free it.
 */
static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	/* Walk the list to the link that points at mhp and unsplice it. */
	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}
705 
706 /*
707  * This function finds the internal mem_handle corresponding to an
708  * external handle and returns it with the mh_mutex held.
709  */
710 static struct mem_handle *
711 kphysm_lookup_mem_handle(memhandle_t handle)
712 {
713 	struct mem_handle *mhp;
714 
715 	mutex_enter(&mem_handle_list_mutex);
716 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
717 		if (mhp->mh_exthandle == handle) {
718 			mutex_enter(&mhp->mh_mutex);
719 			/*
720 			 * The state of the handle could have been changed
721 			 * by kphysm_del_release() while waiting for mh_mutex.
722 			 */
723 			if (mhp->mh_state == MHND_FREE) {
724 				mutex_exit(&mhp->mh_mutex);
725 				continue;
726 			}
727 			break;
728 		}
729 	}
730 	mutex_exit(&mem_handle_list_mutex);
731 	return (mhp);
732 }
733 
734 int
735 kphysm_del_gethandle(memhandle_t *xmhp)
736 {
737 	struct mem_handle *mhp;
738 
739 	mhp = kphysm_allocate_mem_handle();
740 	/*
741 	 * The handle is allocated using KM_SLEEP, so cannot fail.
742 	 * If the implementation is changed, the correct error to return
743 	 * here would be KPHYSM_ENOHANDLES.
744 	 */
745 	ASSERT(mhp->mh_state == MHND_FREE);
746 	mhp->mh_state = MHND_INIT;
747 	*xmhp = mhp->mh_exthandle;
748 	mutex_exit(&mhp->mh_mutex);
749 	return (KPHYSM_OK);
750 }
751 
752 static int
753 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
754 {
755 	pfn_t e1, e2;
756 
757 	e1 = b1 + l1;
758 	e2 = b2 + l2;
759 
760 	return (!(b2 >= e1 || b1 >= e2));
761 }
762 
static int can_remove_pgs(pgcnt_t);

/*
 * Build and return a list of memdelspans covering the intersection of
 * [base, base + npgs) with the installed memory list (phys_install).
 * Returns NULL when the span contains no installed memory.  The caller
 * owns the returned list (free with free_delspans()).
 */
static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	uint64_t address, size, thislen;
	struct memlist *mlp;

	mdsp_new = NULL;

	address = (uint64_t)base << PAGESHIFT;
	size = (uint64_t)npgs << PAGESHIFT;
	while (size != 0) {
		memlist_read_lock();
		/* Find the first phys_install entry overlapping the span. */
		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
			if (address >= (mlp->address + mlp->size))
				continue;
			if ((address + size) > mlp->address)
				break;
		}
		if (mlp == NULL) {
			/* Nothing installed in the remainder; finish up. */
			address += size;
			size = 0;
			thislen = 0;
		} else {
			/* Clip the span to this memlist entry. */
			if (address < mlp->address) {
				size -= (mlp->address - address);
				address = mlp->address;
			}
			ASSERT(address >= mlp->address);
			if ((address + size) > (mlp->address + mlp->size)) {
				thislen = mlp->size - (address - mlp->address);
			} else {
				thislen = size;
			}
		}
		memlist_read_unlock();
		/* TODO: phys_install could change now */
		if (thislen == 0)
			continue;
		/* Record this piece at the head of the result list. */
		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
		mdsp->mds_base = btop(address);
		mdsp->mds_npgs = btop(thislen);
		mdsp->mds_next = mdsp_new;
		mdsp_new = mdsp;
		address += thislen;
		size -= thislen;
	}
	return (mdsp_new);
}
815 
816 static void
817 free_delspans(struct memdelspan *mdsp)
818 {
819 	struct memdelspan *amdsp;
820 
821 	while ((amdsp = mdsp) != NULL) {
822 		mdsp = amdsp->mds_next;
823 		kmem_free(amdsp, sizeof (struct memdelspan));
824 	}
825 }
826 
827 /*
828  * Concatenate lists. No list ordering is required.
829  */
830 
831 static void
832 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
833 {
834 	while (*mdspp != NULL)
835 		mdspp = &(*mdspp)->mds_next;
836 
837 	*mdspp = mdsp;
838 }
839 
840 /*
841  * Given a new list of delspans, check there is no overlap with
842  * all existing span activity (add or delete) and then concatenate
843  * the new spans to the given list.
844  * Return 1 for OK, 0 if overlapping.
845  */
846 static int
847 delspan_insert(
848 	struct transit_list *my_tlp,
849 	struct memdelspan *mdsp_new)
850 {
851 	struct transit_list_head *trh;
852 	struct transit_list *tlp;
853 	int ret;
854 
855 	trh = &transit_list_head;
856 
857 	ASSERT(my_tlp != NULL);
858 	ASSERT(mdsp_new != NULL);
859 
860 	ret = 1;
861 	mutex_enter(&trh->trh_lock);
862 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
863 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
864 		struct memdelspan *mdsp;
865 
866 		for (mdsp = tlp->trl_spans; mdsp != NULL;
867 		    mdsp = mdsp->mds_next) {
868 			struct memdelspan *nmdsp;
869 
870 			for (nmdsp = mdsp_new; nmdsp != NULL;
871 			    nmdsp = nmdsp->mds_next) {
872 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
873 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
874 					ret = 0;
875 					goto done;
876 				}
877 			}
878 		}
879 	}
880 done:
881 	if (ret != 0) {
882 		if (my_tlp->trl_spans == NULL)
883 			transit_list_insert(my_tlp);
884 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
885 	}
886 	mutex_exit(&trh->trh_lock);
887 	return (ret);
888 }
889 
/*
 * Remove spans from my_tlp.  With npgs == 0 the whole list is dropped;
 * otherwise only spans lying completely inside [base, base + npgs) are
 * freed.  The transit list is withdrawn from the global chain once it
 * has no spans left.
 */
static void
delspan_remove(
	struct transit_list *my_tlp,
	pfn_t base,
	pgcnt_t npgs)
{
	struct transit_list_head *trh;
	struct memdelspan *mdsp;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);

	mutex_enter(&trh->trh_lock);
	if ((mdsp = my_tlp->trl_spans) != NULL) {
		if (npgs == 0) {
			/* Drop every span on the list. */
			my_tlp->trl_spans = NULL;
			free_delspans(mdsp);
			transit_list_remove(my_tlp);
		} else {
			struct memdelspan **prv;

			/* Unlink only spans wholly contained in the range. */
			prv = &my_tlp->trl_spans;
			while (mdsp != NULL) {
				pfn_t p_end;

				p_end = mdsp->mds_base + mdsp->mds_npgs;
				if (mdsp->mds_base >= base &&
				    p_end <= (base + npgs)) {
					*prv = mdsp->mds_next;
					mdsp->mds_next = NULL;
					free_delspans(mdsp);
				} else {
					prv = &mdsp->mds_next;
				}
				mdsp = *prv;
			}
			if (my_tlp->trl_spans == NULL)
				transit_list_remove(my_tlp);
		}
	}
	mutex_exit(&trh->trh_lock);
}
933 
934 /*
935  * Reserve interface for add to stop delete before add finished.
936  * This list is only accessed through the delspan_insert/remove
937  * functions and so is fully protected by the mutex in struct transit_list.
938  */
939 
940 static struct transit_list reserve_transit;
941 
942 static int
943 delspan_reserve(pfn_t base, pgcnt_t npgs)
944 {
945 	struct memdelspan *mdsp;
946 	int ret;
947 
948 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
949 	mdsp->mds_base = base;
950 	mdsp->mds_npgs = npgs;
951 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
952 		free_delspans(mdsp);
953 	}
954 	return (ret);
955 }
956 
/*
 * Release a reservation previously taken with delspan_reserve().
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}
962 
963 /*
964  * Return whether memseg was created by kphysm_add_memory_dynamic().
965  * If this is the case and startp non zero, return also the start pfn
966  * of the meta data via startp.
967  */
968 static int
969 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
970 {
971 	pfn_t		pt_start;
972 
973 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
974 		return (0);
975 
976 	/* Meta data is required to be at the beginning */
977 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
978 
979 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
980 	if (startp != NULL)
981 		*startp = pt_start;
982 
983 	return (1);
984 }
985 
986 int
987 kphysm_del_span(
988 	memhandle_t handle,
989 	pfn_t base,
990 	pgcnt_t npgs)
991 {
992 	struct mem_handle *mhp;
993 	struct memseg *seg;
994 	struct memdelspan *mdsp;
995 	struct memdelspan *mdsp_new;
996 	pgcnt_t phys_pages, vm_pages;
997 	pfn_t p_end;
998 	page_t *pp;
999 	int ret;
1000 
1001 	mhp = kphysm_lookup_mem_handle(handle);
1002 	if (mhp == NULL) {
1003 		return (KPHYSM_EHANDLE);
1004 	}
1005 	if (mhp->mh_state != MHND_INIT) {
1006 		mutex_exit(&mhp->mh_mutex);
1007 		return (KPHYSM_ESEQUENCE);
1008 	}
1009 
1010 	/*
1011 	 * Intersect the span with the installed memory list (phys_install).
1012 	 */
1013 	mdsp_new = span_to_install(base, npgs);
1014 	if (mdsp_new == NULL) {
1015 		/*
1016 		 * No physical memory in this range. Is this an
1017 		 * error? If an attempt to start the delete is made
1018 		 * for OK returns from del_span such as this, start will
1019 		 * return an error.
1020 		 * Could return KPHYSM_ENOWORK.
1021 		 */
1022 		/*
1023 		 * It is assumed that there are no error returns
1024 		 * from span_to_install() due to kmem_alloc failure.
1025 		 */
1026 		mutex_exit(&mhp->mh_mutex);
1027 		return (KPHYSM_OK);
1028 	}
1029 	/*
1030 	 * Does this span overlap an existing span?
1031 	 */
1032 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1033 		/*
1034 		 * Differentiate between already on list for this handle
1035 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1036 		 */
1037 		ret = KPHYSM_EBUSY;
1038 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1039 		    mdsp = mdsp->mds_next) {
1040 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1041 			    base, npgs)) {
1042 				ret = KPHYSM_EDUP;
1043 				break;
1044 			}
1045 		}
1046 		mutex_exit(&mhp->mh_mutex);
1047 		free_delspans(mdsp_new);
1048 		return (ret);
1049 	}
1050 	/*
1051 	 * At this point the spans in mdsp_new have been inserted into the
1052 	 * list of spans for this handle and thereby to the global list of
1053 	 * spans being processed. Each of these spans must now be checked
1054 	 * for relocatability. As a side-effect segments in the memseg list
1055 	 * may be split.
1056 	 *
1057 	 * Note that mdsp_new can no longer be used as it is now part of
1058 	 * a larger list. Select elements of this larger list based
1059 	 * on base and npgs.
1060 	 */
1061 restart:
1062 	phys_pages = 0;
1063 	vm_pages = 0;
1064 	ret = KPHYSM_OK;
1065 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1066 	    mdsp = mdsp->mds_next) {
1067 		pgcnt_t pages_checked;
1068 
1069 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1070 			continue;
1071 		}
1072 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1073 		/*
1074 		 * The pages_checked count is a hack. All pages should be
1075 		 * checked for relocatability. Those not covered by memsegs
1076 		 * should be tested with arch_kphysm_del_span_ok().
1077 		 */
1078 		pages_checked = 0;
1079 		for (seg = memsegs; seg; seg = seg->next) {
1080 			pfn_t mseg_start;
1081 
1082 			if (seg->pages_base >= p_end ||
1083 			    seg->pages_end <= mdsp->mds_base) {
1084 				/* Span and memseg don't overlap. */
1085 				continue;
1086 			}
1087 			/* Check that segment is suitable for delete. */
1088 			if (memseg_is_dynamic(seg, &mseg_start)) {
1089 				/*
1090 				 * Can only delete whole added segments
1091 				 * for the moment.
1092 				 * Check that this is completely within the
1093 				 * span.
1094 				 */
1095 				if (mseg_start < mdsp->mds_base ||
1096 				    seg->pages_end > p_end) {
1097 					ret = KPHYSM_EBUSY;
1098 					break;
1099 				}
1100 				pages_checked += seg->pages_end - mseg_start;
1101 			} else {
1102 				/*
1103 				 * Set mseg_start for accounting below.
1104 				 */
1105 				mseg_start = seg->pages_base;
1106 				/*
1107 				 * If this segment is larger than the span,
1108 				 * try to split it. After the split, it
1109 				 * is necessary to restart.
1110 				 */
1111 				if (seg->pages_base < mdsp->mds_base ||
1112 				    seg->pages_end > p_end) {
1113 					pfn_t abase;
1114 					pgcnt_t anpgs;
1115 					int s_ret;
1116 
1117 					/* Split required.  */
1118 					if (mdsp->mds_base < seg->pages_base)
1119 						abase = seg->pages_base;
1120 					else
1121 						abase = mdsp->mds_base;
1122 					if (p_end > seg->pages_end)
1123 						anpgs = seg->pages_end - abase;
1124 					else
1125 						anpgs = p_end - abase;
1126 					s_ret = kphysm_split_memseg(abase,
1127 					    anpgs);
1128 					if (s_ret == 0) {
1129 						/* Split failed. */
1130 						ret = KPHYSM_ERESOURCE;
1131 						break;
1132 					}
1133 					goto restart;
1134 				}
1135 				pages_checked +=
1136 				    seg->pages_end - seg->pages_base;
1137 			}
1138 			/*
1139 			 * The memseg is wholly within the delete span.
1140 			 * The individual pages can now be checked.
1141 			 */
1142 			/* Cage test. */
1143 			for (pp = seg->pages; pp < seg->epages; pp++) {
1144 				if (PP_ISNORELOC(pp)) {
1145 					ret = KPHYSM_ENONRELOC;
1146 					break;
1147 				}
1148 			}
1149 			if (ret != KPHYSM_OK) {
1150 				break;
1151 			}
1152 			phys_pages += (seg->pages_end - mseg_start);
1153 			vm_pages += MSEG_NPAGES(seg);
1154 		}
1155 		if (ret != KPHYSM_OK)
1156 			break;
1157 		if (pages_checked != mdsp->mds_npgs) {
1158 			ret = KPHYSM_ENONRELOC;
1159 			break;
1160 		}
1161 	}
1162 
1163 	if (ret == KPHYSM_OK) {
1164 		mhp->mh_phys_pages += phys_pages;
1165 		mhp->mh_vm_pages += vm_pages;
1166 	} else {
1167 		/*
1168 		 * Keep holding the mh_mutex to prevent it going away.
1169 		 */
1170 		delspan_remove(&mhp->mh_transit, base, npgs);
1171 	}
1172 	mutex_exit(&mhp->mh_mutex);
1173 	return (ret);
1174 }
1175 
/*
 * Non-destructive query of the physical span [base, base + npgs):
 * fills in *mqp with the number of installed physical pages, how many
 * of those are managed (covered by page_t structures), how many are
 * non-relocatable, and the pfn bounds of the non-relocatable pages.
 * Always returns KPHYSM_OK; a span with no installed memory simply
 * reports zero counts.
 *
 * NOTE(review): memsegs is walked here without any visible lock —
 * presumably safe because memsegs are only appended/split under
 * higher-level serialization; confirm against the callers.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;	/* first_nonrelocatable recorded yet? */

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	/* Clip the request to memory that is actually installed. */
	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		/*
		 * Consume [sbase, sbase + snpgs) one piece at a time:
		 * either a gap with no memseg coverage, or the overlap
		 * with the lowest-addressed intersecting memseg.
		 */
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				/*
				 * For a dynamic memseg the true start
				 * (including the hidden page_t area) is
				 * reported via mseg_start.
				 */
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/*
				 * Per-page test; this loop is a no-op if
				 * the bulk check above succeeded (sbase
				 * was advanced to a_end).
				 */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
1329 
1330 /*
1331  * This release function can be called at any stage as follows:
1332  *	_gethandle only called
1333  *	_span(s) only called
1334  *	_start called but failed
1335  *	delete thread exited
1336  */
1337 int
1338 kphysm_del_release(memhandle_t handle)
1339 {
1340 	struct mem_handle *mhp;
1341 
1342 	mhp = kphysm_lookup_mem_handle(handle);
1343 	if (mhp == NULL) {
1344 		return (KPHYSM_EHANDLE);
1345 	}
1346 	switch (mhp->mh_state) {
1347 	case MHND_STARTING:
1348 	case MHND_RUNNING:
1349 		mutex_exit(&mhp->mh_mutex);
1350 		return (KPHYSM_ENOTFINISHED);
1351 	case MHND_FREE:
1352 		ASSERT(mhp->mh_state != MHND_FREE);
1353 		mutex_exit(&mhp->mh_mutex);
1354 		return (KPHYSM_EHANDLE);
1355 	case MHND_INIT:
1356 		break;
1357 	case MHND_DONE:
1358 		break;
1359 	case MHND_RELEASE:
1360 		mutex_exit(&mhp->mh_mutex);
1361 		return (KPHYSM_ESEQUENCE);
1362 	default:
1363 #ifdef DEBUG
1364 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1365 		    (void *)mhp, mhp->mh_state);
1366 #endif /* DEBUG */
1367 		mutex_exit(&mhp->mh_mutex);
1368 		return (KPHYSM_EHANDLE);
1369 	}
1370 	/*
1371 	 * Set state so that we can wait if necessary.
1372 	 * Also this means that we have read/write access to all
1373 	 * fields except mh_exthandle and mh_state.
1374 	 */
1375 	mhp->mh_state = MHND_RELEASE;
1376 	/*
1377 	 * The mem_handle cannot be de-allocated by any other operation
1378 	 * now, so no need to hold mh_mutex.
1379 	 */
1380 	mutex_exit(&mhp->mh_mutex);
1381 
1382 	delspan_remove(&mhp->mh_transit, 0, 0);
1383 	mhp->mh_phys_pages = 0;
1384 	mhp->mh_vm_pages = 0;
1385 	mhp->mh_hold_todo = 0;
1386 	mhp->mh_delete_complete = NULL;
1387 	mhp->mh_delete_complete_arg = NULL;
1388 	mhp->mh_cancel = 0;
1389 
1390 	mutex_enter(&mhp->mh_mutex);
1391 	ASSERT(mhp->mh_state == MHND_RELEASE);
1392 	mhp->mh_state = MHND_FREE;
1393 
1394 	kphysm_free_mem_handle(mhp);
1395 
1396 	return (KPHYSM_OK);
1397 }
1398 
1399 /*
1400  * This cancel function can only be called with the thread running.
1401  */
1402 int
1403 kphysm_del_cancel(memhandle_t handle)
1404 {
1405 	struct mem_handle *mhp;
1406 
1407 	mhp = kphysm_lookup_mem_handle(handle);
1408 	if (mhp == NULL) {
1409 		return (KPHYSM_EHANDLE);
1410 	}
1411 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1412 		mutex_exit(&mhp->mh_mutex);
1413 		return (KPHYSM_ENOTRUNNING);
1414 	}
1415 	/*
1416 	 * Set the cancel flag and wake the delete thread up.
1417 	 * The thread may be waiting on I/O, so the effect of the cancel
1418 	 * may be delayed.
1419 	 */
1420 	if (mhp->mh_cancel == 0) {
1421 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1422 		cv_signal(&mhp->mh_cv);
1423 	}
1424 	mutex_exit(&mhp->mh_mutex);
1425 	return (KPHYSM_OK);
1426 }
1427 
1428 int
1429 kphysm_del_status(
1430 	memhandle_t handle,
1431 	memdelstat_t *mdstp)
1432 {
1433 	struct mem_handle *mhp;
1434 
1435 	mhp = kphysm_lookup_mem_handle(handle);
1436 	if (mhp == NULL) {
1437 		return (KPHYSM_EHANDLE);
1438 	}
1439 	/*
1440 	 * Calling kphysm_del_status() is allowed before the delete
1441 	 * is started to allow for status display.
1442 	 */
1443 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1444 	    mhp->mh_state != MHND_RUNNING) {
1445 		mutex_exit(&mhp->mh_mutex);
1446 		return (KPHYSM_ENOTRUNNING);
1447 	}
1448 	mdstp->phys_pages = mhp->mh_phys_pages;
1449 	mdstp->managed = mhp->mh_vm_pages;
1450 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1451 	mutex_exit(&mhp->mh_mutex);
1452 	return (KPHYSM_OK);
1453 }
1454 
1455 static int mem_delete_additional_pages = 100;
1456 
1457 static int
1458 can_remove_pgs(pgcnt_t npgs)
1459 {
1460 	/*
1461 	 * If all pageable pages were paged out, freemem would
1462 	 * equal availrmem.  There is a minimum requirement for
1463 	 * availrmem.
1464 	 */
1465 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1466 	    < npgs)
1467 		return (0);
1468 	/* TODO: check swap space, etc. */
1469 	return (1);
1470 }
1471 
1472 static int
1473 get_availrmem(pgcnt_t npgs)
1474 {
1475 	int ret;
1476 
1477 	mutex_enter(&freemem_lock);
1478 	ret = can_remove_pgs(npgs);
1479 	if (ret != 0)
1480 		availrmem -= npgs;
1481 	mutex_exit(&freemem_lock);
1482 	return (ret);
1483 }
1484 
/*
 * Return npgs pages to availrmem, undoing a successful get_availrmem().
 */
static void
put_availrmem(pgcnt_t npgs)
{
	mutex_enter(&freemem_lock);
	availrmem += npgs;
	mutex_exit(&freemem_lock);
}
1492 
1493 #define	FREEMEM_INCR	100
1494 static pgcnt_t freemem_incr = FREEMEM_INCR;
1495 #define	DEL_FREE_WAIT_FRAC	4
1496 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1497 
1498 #define	DEL_BUSY_WAIT_FRAC	20
1499 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1500 
1501 static void kphysm_del_cleanup(struct mem_handle *);
1502 
1503 static void page_delete_collect(page_t *, struct mem_handle *);
1504 
/*
 * Obtain up to freemem_incr pages (bounded by the pages the delete
 * still has to collect) from freemem for the delete thread, blocking
 * and applying pageout pressure until they are available or the
 * delete is cancelled.
 *
 * Called and returns with mhp->mh_mutex held; the mutex is dropped
 * internally around the freemem manipulation.
 *
 * Returns the number of pages reserved, or 0 if the delete was
 * cancelled first.
 */
static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
	pgcnt_t free_get;
	int ret;

	ASSERT(MUTEX_HELD(&mhp->mh_mutex));

	MDSTAT_INCR(mhp, need_free);
	/*
	 * Get up to freemem_incr pages.
	 */
	free_get = freemem_incr;
	if (free_get > mhp->mh_hold_todo)
		free_get = mhp->mh_hold_todo;
	/*
	 * Take free_get pages away from freemem,
	 * waiting if necessary.
	 */

	while (!mhp->mh_cancel) {
		mutex_exit(&mhp->mh_mutex);
		MDSTAT_INCR(mhp, free_loop);
		/*
		 * Duplicate test from page_create_throttle()
		 * but don't override with !PG_WAIT.
		 */
		if (freemem < (free_get + throttlefree)) {
			MDSTAT_INCR(mhp, free_low);
			ret = 0;
		} else {
			ret = page_create_wait(free_get, 0);
			if (ret == 0) {
				/* EMPTY */
				MDSTAT_INCR(mhp, free_failed);
			}
		}
		if (ret != 0) {
			/* Pages reserved; re-take the lock and return. */
			mutex_enter(&mhp->mh_mutex);
			return (free_get);
		}

		/*
		 * Put pressure on pageout.
		 */
		page_needfree(free_get);
		cv_signal(&proc_pageout->p_cv);

		/*
		 * Wait a short while (or until signalled, e.g. by a
		 * cancel) before withdrawing the needfree request and
		 * retrying.  The mutex must be dropped again because
		 * page_needfree() must not be called with it held.
		 */
		mutex_enter(&mhp->mh_mutex);
		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
		    (lbolt + DEL_FREE_WAIT_TICKS));
		mutex_exit(&mhp->mh_mutex);
		page_needfree(-(spgcnt_t)free_get);

		mutex_enter(&mhp->mh_mutex);
	}
	/* Delete was cancelled: no pages obtained. */
	return (0);
}
1563 
1564 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1565 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1566 /*
1567  * This function is run as a helper thread for delete_memory_thread.
1568  * It is needed in order to force kaio cleanup, so that pages used in kaio
1569  * will be unlocked and subsequently relocated by delete_memory_thread.
1570  * The address of the delete_memory_threads's mem_handle is passed in to
1571  * this thread function, and is used to set the mh_aio_cleanup_done member
1572  * prior to calling thread_exit().
1573  */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;	/* pages/requests cleaned this sweep */
	int n = 0;	/* consecutive productive sweeps without delay */
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	/* Cancel flag is raised by delete_memory_thread when it finishes. */
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	/* Resolve the cleanup entry point exported by the kaio module. */
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
	    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		/* Sweep every active process, bailing out if cancelled. */
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}
1624 
1625 static void
1626 delete_memory_thread(caddr_t amhp)
1627 {
1628 	struct mem_handle *mhp;
1629 	struct memdelspan *mdsp;
1630 	callb_cpr_t cprinfo;
1631 	page_t *pp_targ;
1632 	spgcnt_t freemem_left;
1633 	void (*del_complete_funcp)(void *, int error);
1634 	void *del_complete_arg;
1635 	int comp_code;
1636 	int ret;
1637 	int first_scan;
1638 	uint_t szc;
1639 #ifdef MEM_DEL_STATS
1640 	uint64_t start_total, ntick_total;
1641 	uint64_t start_pgrp, ntick_pgrp;
1642 #endif /* MEM_DEL_STATS */
1643 
1644 	mhp = (struct mem_handle *)amhp;
1645 
1646 #ifdef MEM_DEL_STATS
1647 	start_total = ddi_get_lbolt();
1648 #endif /* MEM_DEL_STATS */
1649 
1650 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1651 	    callb_generic_cpr, "memdel");
1652 
1653 	mutex_enter(&mhp->mh_mutex);
1654 	ASSERT(mhp->mh_state == MHND_STARTING);
1655 
1656 	mhp->mh_state = MHND_RUNNING;
1657 	mhp->mh_thread_id = curthread;
1658 
1659 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1660 	mutex_exit(&mhp->mh_mutex);
1661 
1662 	/* Allocate the remap pages now, if necessary. */
1663 	memseg_remap_init();
1664 
1665 	/*
1666 	 * Subtract from availrmem now if possible as availrmem
1667 	 * may not be available by the end of the delete.
1668 	 */
1669 	if (!get_availrmem(mhp->mh_vm_pages)) {
1670 		comp_code = KPHYSM_ENOTVIABLE;
1671 		mutex_enter(&mhp->mh_mutex);
1672 		goto early_exit;
1673 	}
1674 
1675 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1676 
1677 	mutex_enter(&mhp->mh_mutex);
1678 
1679 	if (ret != 0) {
1680 		mhp->mh_cancel = KPHYSM_EREFUSED;
1681 		goto refused;
1682 	}
1683 
1684 	transit_list_collect(mhp, 1);
1685 
1686 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1687 	    mdsp = mdsp->mds_next) {
1688 		ASSERT(mdsp->mds_bitmap == NULL);
1689 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1690 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1691 		    KM_SLEEP);
1692 	}
1693 
1694 	first_scan = 1;
1695 	freemem_left = 0;
1696 	/*
1697 	 * Start dr_aio_cleanup_thread, which periodically iterates
1698 	 * through the process list and invokes aio cleanup.  This
1699 	 * is needed in order to avoid a deadly embrace between the
1700 	 * delete_memory_thread (waiting on writer lock for page, with the
1701 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1702 	 * reader lock on the same page that is wanted by the
1703 	 * delete_memory_thread), and threads waiting for kaio completion
1704 	 * (blocked on spt_amp->lock).
1705 	 */
1706 	mhp->mh_dr_aio_cleanup_cancel = 0;
1707 	mhp->mh_aio_cleanup_done = 0;
1708 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1709 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1710 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1711 		pgcnt_t collected;
1712 
1713 		MDSTAT_INCR(mhp, nloop);
1714 		collected = 0;
1715 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1716 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1717 			pfn_t pfn, p_end;
1718 
1719 			if (first_scan) {
1720 				mem_node_pre_del_slice(mdsp->mds_base,
1721 				    mdsp->mds_base + mdsp->mds_npgs - 1);
1722 			}
1723 
1724 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1725 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1726 			    (mhp->mh_cancel == 0); pfn++) {
1727 				page_t *pp, *tpp, *tpp_targ;
1728 				pgcnt_t bit;
1729 				struct vnode *vp;
1730 				u_offset_t offset;
1731 				int mod, result;
1732 				spgcnt_t pgcnt;
1733 
1734 				bit = pfn - mdsp->mds_base;
1735 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1736 				    (1 << (bit % NBPBMW))) != 0) {
1737 					MDSTAT_INCR(mhp, already_done);
1738 					continue;
1739 				}
1740 				if (freemem_left == 0) {
1741 					freemem_left += delthr_get_freemem(mhp);
1742 					if (freemem_left == 0)
1743 						break;
1744 				}
1745 
1746 				/*
1747 				 * Release mh_mutex - some of this
1748 				 * stuff takes some time (eg PUTPAGE).
1749 				 */
1750 
1751 				mutex_exit(&mhp->mh_mutex);
1752 				MDSTAT_INCR(mhp, ncheck);
1753 
1754 				pp = page_numtopp_nolock(pfn);
1755 				if (pp == NULL) {
1756 					/*
1757 					 * Not covered by a page_t - will
1758 					 * be dealt with elsewhere.
1759 					 */
1760 					MDSTAT_INCR(mhp, nopaget);
1761 					mutex_enter(&mhp->mh_mutex);
1762 					mdsp->mds_bitmap[bit / NBPBMW] |=
1763 					    (1 << (bit % NBPBMW));
1764 					continue;
1765 				}
1766 
1767 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1768 				    SE_EXCL_WANTED | SE_RETIRED)) {
1769 					/*
1770 					 * Page in use elsewhere.  Skip it.
1771 					 */
1772 					MDSTAT_INCR(mhp, lockfail);
1773 					mutex_enter(&mhp->mh_mutex);
1774 					continue;
1775 				}
1776 				/*
1777 				 * See if the cage expanded into the delete.
1778 				 * This can happen as we have to allow the
1779 				 * cage to expand.
1780 				 */
1781 				if (PP_ISNORELOC(pp)) {
1782 					page_unlock(pp);
1783 					mutex_enter(&mhp->mh_mutex);
1784 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1785 					break;
1786 				}
1787 				if (PP_RETIRED(pp)) {
1788 					/*
1789 					 * Page has been retired and is
1790 					 * not part of the cage so we
1791 					 * can now do the accounting for
1792 					 * it.
1793 					 */
1794 					MDSTAT_INCR(mhp, retired);
1795 					mutex_enter(&mhp->mh_mutex);
1796 					mdsp->mds_bitmap[bit / NBPBMW]
1797 					    |= (1 << (bit % NBPBMW));
1798 					mdsp->mds_bitmap_retired[bit /
1799 					    NBPBMW] |=
1800 					    (1 << (bit % NBPBMW));
1801 					mhp->mh_hold_todo--;
1802 					continue;
1803 				}
1804 				ASSERT(freemem_left != 0);
1805 				if (PP_ISFREE(pp)) {
1806 					/*
1807 					 * Like page_reclaim() only 'freemem'
1808 					 * processing is already done.
1809 					 */
1810 					MDSTAT_INCR(mhp, nfree);
1811 				free_page_collect:
1812 					if (PP_ISAGED(pp)) {
1813 						page_list_sub(pp,
1814 						    PG_FREE_LIST);
1815 					} else {
1816 						page_list_sub(pp,
1817 						    PG_CACHE_LIST);
1818 					}
1819 					PP_CLRFREE(pp);
1820 					PP_CLRAGED(pp);
1821 					collected++;
1822 					mutex_enter(&mhp->mh_mutex);
1823 					page_delete_collect(pp, mhp);
1824 					mdsp->mds_bitmap[bit / NBPBMW] |=
1825 					    (1 << (bit % NBPBMW));
1826 					freemem_left--;
1827 					continue;
1828 				}
1829 				ASSERT(pp->p_vnode != NULL);
1830 				if (first_scan) {
1831 					MDSTAT_INCR(mhp, first_notfree);
1832 					page_unlock(pp);
1833 					mutex_enter(&mhp->mh_mutex);
1834 					continue;
1835 				}
1836 				/*
1837 				 * Keep stats on pages encountered that
1838 				 * are marked for retirement.
1839 				 */
1840 				if (PP_TOXIC(pp)) {
1841 					MDSTAT_INCR(mhp, toxic);
1842 				} else if (PP_PR_REQ(pp)) {
1843 					MDSTAT_INCR(mhp, failing);
1844 				}
1845 				/*
1846 				 * In certain cases below, special exceptions
1847 				 * are made for pages that are toxic.  This
1848 				 * is because the current meaning of toxic
1849 				 * is that an uncorrectable error has been
1850 				 * previously associated with the page.
1851 				 */
1852 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1853 					if (!PP_TOXIC(pp)) {
1854 						/*
1855 						 * Must relocate locked in
1856 						 * memory pages.
1857 						 */
1858 #ifdef MEM_DEL_STATS
1859 						start_pgrp = ddi_get_lbolt();
1860 #endif /* MEM_DEL_STATS */
1861 						/*
1862 						 * Lock all constituent pages
1863 						 * of a large page to ensure
1864 						 * that p_szc won't change.
1865 						 */
1866 						if (!group_page_trylock(pp,
1867 						    SE_EXCL)) {
1868 							MDSTAT_INCR(mhp,
1869 							    gptllckfail);
1870 							page_unlock(pp);
1871 							mutex_enter(
1872 							    &mhp->mh_mutex);
1873 							continue;
1874 						}
1875 						MDSTAT_INCR(mhp, npplocked);
1876 						pp_targ =
1877 						    page_get_replacement_page(
1878 						    pp, NULL, 0);
1879 						if (pp_targ != NULL) {
1880 #ifdef MEM_DEL_STATS
1881 							ntick_pgrp =
1882 							    (uint64_t)
1883 							    ddi_get_lbolt() -
1884 							    start_pgrp;
1885 #endif /* MEM_DEL_STATS */
1886 							MDSTAT_PGRP(mhp,
1887 							    ntick_pgrp);
1888 							MDSTAT_INCR(mhp,
1889 							    nlockreloc);
1890 							goto reloc;
1891 						}
1892 						group_page_unlock(pp);
1893 						page_unlock(pp);
1894 #ifdef MEM_DEL_STATS
1895 						ntick_pgrp =
1896 						    (uint64_t)ddi_get_lbolt() -
1897 						    start_pgrp;
1898 #endif /* MEM_DEL_STATS */
1899 						MDSTAT_PGRP(mhp, ntick_pgrp);
1900 						MDSTAT_INCR(mhp, nnorepl);
1901 						mutex_enter(&mhp->mh_mutex);
1902 						continue;
1903 					} else {
1904 						/*
1905 						 * Cannot do anything about
1906 						 * this page because it is
1907 						 * toxic.
1908 						 */
1909 						MDSTAT_INCR(mhp, npplkdtoxic);
1910 						page_unlock(pp);
1911 						mutex_enter(&mhp->mh_mutex);
1912 						continue;
1913 					}
1914 				}
1915 				/*
1916 				 * Unload the mappings and check if mod bit
1917 				 * is set.
1918 				 */
1919 				ASSERT(!PP_ISKAS(pp));
1920 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1921 				mod = hat_ismod(pp);
1922 
1923 #ifdef MEM_DEL_STATS
1924 				start_pgrp = ddi_get_lbolt();
1925 #endif /* MEM_DEL_STATS */
1926 				if (mod && !PP_TOXIC(pp)) {
1927 					/*
1928 					 * Lock all constituent pages
1929 					 * of a large page to ensure
1930 					 * that p_szc won't change.
1931 					 */
1932 					if (!group_page_trylock(pp, SE_EXCL)) {
1933 						MDSTAT_INCR(mhp, gptlmodfail);
1934 						page_unlock(pp);
1935 						mutex_enter(&mhp->mh_mutex);
1936 						continue;
1937 					}
1938 					pp_targ = page_get_replacement_page(pp,
1939 					    NULL, 0);
1940 					if (pp_targ != NULL) {
1941 						MDSTAT_INCR(mhp, nmodreloc);
1942 #ifdef MEM_DEL_STATS
1943 						ntick_pgrp =
1944 						    (uint64_t)ddi_get_lbolt() -
1945 						    start_pgrp;
1946 #endif /* MEM_DEL_STATS */
1947 						MDSTAT_PGRP(mhp, ntick_pgrp);
1948 						goto reloc;
1949 					}
1950 					group_page_unlock(pp);
1951 				}
1952 
1953 				if (!page_try_demote_pages(pp)) {
1954 					MDSTAT_INCR(mhp, demotefail);
1955 					page_unlock(pp);
1956 #ifdef MEM_DEL_STATS
1957 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1958 					    start_pgrp;
1959 #endif /* MEM_DEL_STATS */
1960 					MDSTAT_PGRP(mhp, ntick_pgrp);
1961 					mutex_enter(&mhp->mh_mutex);
1962 					continue;
1963 				}
1964 
1965 				/*
1966 				 * Regular 'page-out'.
1967 				 */
1968 				if (!mod) {
1969 					MDSTAT_INCR(mhp, ndestroy);
1970 					page_destroy(pp, 1);
1971 					/*
1972 					 * page_destroy was called with
1973 					 * dontfree. As long as p_lckcnt
1974 					 * and p_cowcnt are both zero, the
1975 					 * only additional action of
1976 					 * page_destroy with !dontfree is to
1977 					 * call page_free, so we can collect
1978 					 * the page here.
1979 					 */
1980 					collected++;
1981 #ifdef MEM_DEL_STATS
1982 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1983 					    start_pgrp;
1984 #endif /* MEM_DEL_STATS */
1985 					MDSTAT_PGRP(mhp, ntick_pgrp);
1986 					mutex_enter(&mhp->mh_mutex);
1987 					page_delete_collect(pp, mhp);
1988 					mdsp->mds_bitmap[bit / NBPBMW] |=
1989 					    (1 << (bit % NBPBMW));
1990 					continue;
1991 				}
1992 				/*
1993 				 * The page is toxic and the mod bit is
1994 				 * set, we cannot do anything here to deal
1995 				 * with it.
1996 				 */
1997 				if (PP_TOXIC(pp)) {
1998 					page_unlock(pp);
1999 #ifdef MEM_DEL_STATS
2000 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2001 					    start_pgrp;
2002 #endif /* MEM_DEL_STATS */
2003 					MDSTAT_PGRP(mhp, ntick_pgrp);
2004 					MDSTAT_INCR(mhp, modtoxic);
2005 					mutex_enter(&mhp->mh_mutex);
2006 					continue;
2007 				}
2008 				MDSTAT_INCR(mhp, nputpage);
2009 				vp = pp->p_vnode;
2010 				offset = pp->p_offset;
2011 				VN_HOLD(vp);
2012 				page_unlock(pp);
2013 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2014 				    B_INVAL|B_FORCE, kcred, NULL);
2015 				VN_RELE(vp);
2016 #ifdef MEM_DEL_STATS
2017 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2018 				    start_pgrp;
2019 #endif /* MEM_DEL_STATS */
2020 				MDSTAT_PGRP(mhp, ntick_pgrp);
2021 				/*
2022 				 * Try to get the page back immediately
2023 				 * so that it can be collected.
2024 				 */
2025 				pp = page_numtopp_nolock(pfn);
2026 				if (pp == NULL) {
2027 					MDSTAT_INCR(mhp, nnoreclaim);
2028 					/*
2029 					 * This should not happen as this
2030 					 * thread is deleting the page.
2031 					 * If this code is generalized, this
2032 					 * becomes a reality.
2033 					 */
2034 #ifdef DEBUG
2035 					cmn_err(CE_WARN,
2036 					    "delete_memory_thread(0x%p) "
2037 					    "pfn 0x%lx has no page_t",
2038 					    (void *)mhp, pfn);
2039 #endif /* DEBUG */
2040 					mutex_enter(&mhp->mh_mutex);
2041 					continue;
2042 				}
2043 				if (page_try_reclaim_lock(pp, SE_EXCL,
2044 				    SE_EXCL_WANTED | SE_RETIRED)) {
2045 					if (PP_ISFREE(pp)) {
2046 						goto free_page_collect;
2047 					}
2048 					page_unlock(pp);
2049 				}
2050 				MDSTAT_INCR(mhp, nnoreclaim);
2051 				mutex_enter(&mhp->mh_mutex);
2052 				continue;
2053 
2054 			reloc:
2055 				/*
2056 				 * Got some freemem and a target
2057 				 * page, so move the data to avoid
2058 				 * I/O and lock problems.
2059 				 */
2060 				ASSERT(!page_iolock_assert(pp));
2061 				MDSTAT_INCR(mhp, nreloc);
2062 				/*
2063 				 * page_relocate() will return pgcnt: the
2064 				 * number of consecutive pages relocated.
2065 				 * If it is successful, pp will be a
2066 				 * linked list of the page structs that
2067 				 * were relocated. If page_relocate() is
2068 				 * unsuccessful, pp will be unmodified.
2069 				 */
2070 #ifdef MEM_DEL_STATS
2071 				start_pgrp = ddi_get_lbolt();
2072 #endif /* MEM_DEL_STATS */
2073 				result = page_relocate(&pp, &pp_targ, 0, 0,
2074 				    &pgcnt, NULL);
2075 #ifdef MEM_DEL_STATS
2076 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2077 				    start_pgrp;
2078 #endif /* MEM_DEL_STATS */
2079 				MDSTAT_PGRP(mhp, ntick_pgrp);
2080 				if (result != 0) {
2081 					MDSTAT_INCR(mhp, nrelocfail);
2082 					/*
2083 					 * We did not succeed. We need
2084 					 * to give the pp_targ pages back.
2085 					 * page_free(pp_targ, 1) without
2086 					 * the freemem accounting.
2087 					 */
2088 					group_page_unlock(pp);
2089 					page_free_replacement_page(pp_targ);
2090 					page_unlock(pp);
2091 					mutex_enter(&mhp->mh_mutex);
2092 					continue;
2093 				}
2094 
2095 				/*
2096 				 * We will then collect pgcnt pages.
2097 				 */
2098 				ASSERT(pgcnt > 0);
2099 				mutex_enter(&mhp->mh_mutex);
2100 				/*
2101 				 * We need to make sure freemem_left is
2102 				 * large enough.
2103 				 */
2104 				while ((freemem_left < pgcnt) &&
2105 				    (!mhp->mh_cancel)) {
2106 					freemem_left +=
2107 					    delthr_get_freemem(mhp);
2108 				}
2109 
2110 				/*
2111 				 * Do not proceed if mh_cancel is set.
2112 				 */
2113 				if (mhp->mh_cancel) {
2114 					while (pp_targ != NULL) {
2115 						/*
2116 						 * Unlink and unlock each page.
2117 						 */
2118 						tpp_targ = pp_targ;
2119 						page_sub(&pp_targ, tpp_targ);
2120 						page_unlock(tpp_targ);
2121 					}
2122 					/*
2123 					 * We need to give the pp pages back.
2124 					 * page_free(pp, 1) without the
2125 					 * freemem accounting.
2126 					 */
2127 					page_free_replacement_page(pp);
2128 					break;
2129 				}
2130 
2131 				/* Now remove pgcnt from freemem_left */
2132 				freemem_left -= pgcnt;
2133 				ASSERT(freemem_left >= 0);
2134 				szc = pp->p_szc;
2135 				while (pp != NULL) {
2136 					/*
2137 					 * pp and pp_targ were passed back as
2138 					 * a linked list of pages.
2139 					 * Unlink and unlock each page.
2140 					 */
2141 					tpp_targ = pp_targ;
2142 					page_sub(&pp_targ, tpp_targ);
2143 					page_unlock(tpp_targ);
2144 					/*
2145 					 * The original page is now free
2146 					 * so remove it from the linked
2147 					 * list and collect it.
2148 					 */
2149 					tpp = pp;
2150 					page_sub(&pp, tpp);
2151 					pfn = page_pptonum(tpp);
2152 					collected++;
2153 					ASSERT(PAGE_EXCL(tpp));
2154 					ASSERT(tpp->p_vnode == NULL);
2155 					ASSERT(!hat_page_is_mapped(tpp));
2156 					ASSERT(tpp->p_szc == szc);
2157 					tpp->p_szc = 0;
2158 					page_delete_collect(tpp, mhp);
2159 					bit = pfn - mdsp->mds_base;
2160 					mdsp->mds_bitmap[bit / NBPBMW] |=
2161 					    (1 << (bit % NBPBMW));
2162 				}
2163 				ASSERT(pp_targ == NULL);
2164 			}
2165 		}
2166 		first_scan = 0;
2167 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2168 		    (collected == 0)) {
2169 			/*
2170 			 * This code is needed as we cannot wait
2171 			 * for a page to be locked OR the delete to
2172 			 * be cancelled.  Also, we must delay so
2173 			 * that other threads get a chance to run
2174 			 * on our cpu, otherwise page locks may be
2175 			 * held indefinitely by those threads.
2176 			 */
2177 			MDSTAT_INCR(mhp, ndelay);
2178 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2179 			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
2180 			    (lbolt + DEL_BUSY_WAIT_TICKS));
2181 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2182 		}
2183 	}
2184 	/* stop the dr aio cleanup thread */
2185 	mhp->mh_dr_aio_cleanup_cancel = 1;
2186 	transit_list_collect(mhp, 0);
2187 	if (freemem_left != 0) {
2188 		/* Return any surplus. */
2189 		page_create_putback(freemem_left);
2190 		freemem_left = 0;
2191 	}
2192 #ifdef MEM_DEL_STATS
2193 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2194 #endif /* MEM_DEL_STATS */
2195 	MDSTAT_TOTAL(mhp, ntick_total);
2196 	MDSTAT_PRINT(mhp);
2197 
2198 	/*
2199 	 * If the memory delete was cancelled, exclusive-wanted bits must
2200 	 * be cleared. If there are retired pages being deleted, they need
2201 	 * to be unretired.
2202 	 */
2203 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2204 	    mdsp = mdsp->mds_next) {
2205 		pfn_t pfn, p_end;
2206 
2207 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2208 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2209 			page_t *pp;
2210 			pgcnt_t bit;
2211 
2212 			bit = pfn - mdsp->mds_base;
2213 			if (mhp->mh_cancel) {
2214 				pp = page_numtopp_nolock(pfn);
2215 				if (pp != NULL) {
2216 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2217 					    (1 << (bit % NBPBMW))) == 0) {
2218 						page_lock_clr_exclwanted(pp);
2219 					}
2220 				}
2221 			} else {
2222 				pp = NULL;
2223 			}
2224 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2225 			    (1 << (bit % NBPBMW))) != 0) {
2226 				/* do we already have pp? */
2227 				if (pp == NULL) {
2228 					pp = page_numtopp_nolock(pfn);
2229 				}
2230 				ASSERT(pp != NULL);
2231 				ASSERT(PP_RETIRED(pp));
2232 				if (mhp->mh_cancel != 0) {
2233 					page_unlock(pp);
2234 					/*
2235 					 * To satisfy ASSERT below in
2236 					 * cancel code.
2237 					 */
2238 					mhp->mh_hold_todo++;
2239 				} else {
2240 					(void) page_unretire_pp(pp,
2241 					    PR_UNR_CLEAN);
2242 				}
2243 			}
2244 		}
2245 	}
2246 	/*
2247 	 * Free retired page bitmap and collected page bitmap
2248 	 */
2249 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2250 	    mdsp = mdsp->mds_next) {
2251 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2252 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2253 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2254 		ASSERT(mdsp->mds_bitmap != NULL);
2255 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2256 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2257 	}
2258 
2259 	/* wait for our dr aio cancel thread to exit */
2260 	while (!(mhp->mh_aio_cleanup_done)) {
2261 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2262 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2263 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2264 	}
2265 refused:
2266 	if (mhp->mh_cancel != 0) {
2267 		page_t *pp;
2268 
2269 		comp_code = mhp->mh_cancel;
2270 		/*
2271 		 * Go through list of deleted pages (mh_deleted) freeing
2272 		 * them.
2273 		 */
2274 		while ((pp = mhp->mh_deleted) != NULL) {
2275 			mhp->mh_deleted = pp->p_next;
2276 			mhp->mh_hold_todo++;
2277 			mutex_exit(&mhp->mh_mutex);
2278 			/* Restore p_next. */
2279 			pp->p_next = pp->p_prev;
2280 			if (PP_ISFREE(pp)) {
2281 				cmn_err(CE_PANIC,
2282 				    "page %p is free",
2283 				    (void *)pp);
2284 			}
2285 			page_free(pp, 1);
2286 			mutex_enter(&mhp->mh_mutex);
2287 		}
2288 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2289 
2290 		mutex_exit(&mhp->mh_mutex);
2291 		put_availrmem(mhp->mh_vm_pages);
2292 		mutex_enter(&mhp->mh_mutex);
2293 
2294 		goto t_exit;
2295 	}
2296 
2297 	/*
2298 	 * All the pages are no longer in use and are exclusively locked.
2299 	 */
2300 
2301 	mhp->mh_deleted = NULL;
2302 
2303 	kphysm_del_cleanup(mhp);
2304 
2305 	/*
2306 	 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so
2307 	 * that the mem_node_config[] will remain intact for the cleanup.
2308 	 */
2309 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2310 	    mdsp = mdsp->mds_next) {
2311 		mem_node_post_del_slice(mdsp->mds_base,
2312 		    mdsp->mds_base + mdsp->mds_npgs - 1, 0);
2313 	}
2314 
2315 	comp_code = KPHYSM_OK;
2316 
2317 t_exit:
2318 	mutex_exit(&mhp->mh_mutex);
2319 	kphysm_setup_post_del(mhp->mh_vm_pages,
2320 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2321 	mutex_enter(&mhp->mh_mutex);
2322 
2323 early_exit:
2324 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2325 	mhp->mh_state = MHND_DONE;
2326 	del_complete_funcp = mhp->mh_delete_complete;
2327 	del_complete_arg = mhp->mh_delete_complete_arg;
2328 	CALLB_CPR_EXIT(&cprinfo);
2329 	(*del_complete_funcp)(del_complete_arg, comp_code);
2330 	thread_exit();
2331 	/*NOTREACHED*/
2332 }
2333 
2334 /*
2335  * Start the delete of the memory from the system.
2336  */
2337 int
2338 kphysm_del_start(
2339 	memhandle_t handle,
2340 	void (*complete)(void *, int),
2341 	void *complete_arg)
2342 {
2343 	struct mem_handle *mhp;
2344 
2345 	mhp = kphysm_lookup_mem_handle(handle);
2346 	if (mhp == NULL) {
2347 		return (KPHYSM_EHANDLE);
2348 	}
2349 	switch (mhp->mh_state) {
2350 	case MHND_FREE:
2351 		ASSERT(mhp->mh_state != MHND_FREE);
2352 		mutex_exit(&mhp->mh_mutex);
2353 		return (KPHYSM_EHANDLE);
2354 	case MHND_INIT:
2355 		break;
2356 	case MHND_STARTING:
2357 	case MHND_RUNNING:
2358 		mutex_exit(&mhp->mh_mutex);
2359 		return (KPHYSM_ESEQUENCE);
2360 	case MHND_DONE:
2361 		mutex_exit(&mhp->mh_mutex);
2362 		return (KPHYSM_ESEQUENCE);
2363 	case MHND_RELEASE:
2364 		mutex_exit(&mhp->mh_mutex);
2365 		return (KPHYSM_ESEQUENCE);
2366 	default:
2367 #ifdef DEBUG
2368 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2369 		    (void *)mhp, mhp->mh_state);
2370 #endif /* DEBUG */
2371 		mutex_exit(&mhp->mh_mutex);
2372 		return (KPHYSM_EHANDLE);
2373 	}
2374 
2375 	if (mhp->mh_transit.trl_spans == NULL) {
2376 		mutex_exit(&mhp->mh_mutex);
2377 		return (KPHYSM_ENOWORK);
2378 	}
2379 
2380 	ASSERT(complete != NULL);
2381 	mhp->mh_delete_complete = complete;
2382 	mhp->mh_delete_complete_arg = complete_arg;
2383 	mhp->mh_state = MHND_STARTING;
2384 	/*
2385 	 * Release the mutex in case thread_create sleeps.
2386 	 */
2387 	mutex_exit(&mhp->mh_mutex);
2388 
2389 	/*
2390 	 * The "obvious" process for this thread is pageout (proc_pageout)
2391 	 * but this gives the thread too much power over freemem
2392 	 * which results in freemem starvation.
2393 	 */
2394 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2395 	    TS_RUN, maxclsyspri - 1);
2396 
2397 	return (KPHYSM_OK);
2398 }
2399 
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
static caddr_t pp_dummy;	/* Dummy page_t area; non-NULL once inited */
static pgcnt_t pp_dummy_npages;	/* Number of pages backing pp_dummy */
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2404 
2405 static void
2406 memseg_remap_init_pages(page_t *pages, page_t *epages)
2407 {
2408 	page_t *pp;
2409 
2410 	for (pp = pages; pp < epages; pp++) {
2411 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2412 		pp->p_offset = (u_offset_t)-1;
2413 		page_iolock_init(pp);
2414 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2415 			continue;
2416 		page_lock_delete(pp);
2417 	}
2418 }
2419 
2420 void
2421 memseg_remap_init()
2422 {
2423 	mutex_enter(&pp_dummy_lock);
2424 	if (pp_dummy == NULL) {
2425 		uint_t dpages;
2426 		int i;
2427 
2428 		/*
2429 		 * dpages starts off as the size of the structure and
2430 		 * ends up as the minimum number of pages that will
2431 		 * hold a whole number of page_t structures.
2432 		 */
2433 		dpages = sizeof (page_t);
2434 		ASSERT(dpages != 0);
2435 		ASSERT(dpages <= MMU_PAGESIZE);
2436 
2437 		while ((dpages & 1) == 0)
2438 			dpages >>= 1;
2439 
2440 		pp_dummy_npages = dpages;
2441 		/*
2442 		 * Allocate pp_dummy pages directly from static_arena,
2443 		 * since these are whole page allocations and are
2444 		 * referenced by physical address.  This also has the
2445 		 * nice fringe benefit of hiding the memory from
2446 		 * ::findleaks since it doesn't deal well with allocated
2447 		 * kernel heap memory that doesn't have any mappings.
2448 		 */
2449 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2450 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2451 		bzero(pp_dummy, ptob(pp_dummy_npages));
2452 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2453 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2454 		    pp_dummy_npages, KM_SLEEP);
2455 		for (i = 0; i < pp_dummy_npages; i++) {
2456 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2457 			    &pp_dummy[MMU_PAGESIZE * i]);
2458 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2459 		}
2460 		/*
2461 		 * Initialize the page_t's to a known 'deleted' state
2462 		 * that matches the state of deleted pages.
2463 		 */
2464 		memseg_remap_init_pages((page_t *)pp_dummy,
2465 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2466 		/* Remove kmem mappings for the pages for safety. */
2467 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2468 		    HAT_UNLOAD_UNLOCK);
2469 		/* Leave pp_dummy pointer set as flag that init is done. */
2470 	}
2471 	mutex_exit(&pp_dummy_lock);
2472 }
2473 
2474 static void
2475 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
2476 {
2477 	ASSERT(pp_dummy != NULL);
2478 
2479 	while (metapgs != 0) {
2480 		pgcnt_t n;
2481 		int i;
2482 
2483 		n = pp_dummy_npages;
2484 		if (n > metapgs)
2485 			n = metapgs;
2486 		for (i = 0; i < n; i++) {
2487 			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
2488 			    PROT_READ,
2489 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2490 			    HAT_LOAD_REMAP);
2491 			pp += ptob(1);
2492 		}
2493 		metapgs -= n;
2494 	}
2495 }
2496 
2497 /*
2498  * Transition all the deleted pages to the deleted state so that
2499  * page_lock will not wait. The page_lock_delete call will
2500  * also wake up any waiters.
2501  */
2502 static void
2503 memseg_lock_delete_all(struct memseg *seg)
2504 {
2505 	page_t *pp;
2506 
2507 	for (pp = seg->pages; pp < seg->epages; pp++) {
2508 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2509 		page_lock_delete(pp);
2510 	}
2511 }
2512 
/*
 * Final stage of a successful memory delete: unlink the affected
 * memsegs from the global memsegs list, retire (or remap) their
 * page_t metadata, remove the spans from phys_avail/phys_install
 * and update the global memory accounting.
 *
 * Called from delete_memory_thread with mhp->mh_mutex held; the
 * mutex is dropped around the heavyweight work and re-acquired
 * before returning.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan	*mdsp;
	struct memseg		*seg;
	struct memseg   	**segpp;
	struct memseg		*seglist;
	pfn_t			p_end;
	uint64_t		avmem;
	pgcnt_t			avpgs;
	pgcnt_t			npgs;

	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * remove from main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			/* Any overlapping memseg lies wholly in the span. */
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	mutex_exit(&mhp->mh_mutex);

	/* Process each unlinked memseg (collected via lnext above). */
	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		page_t *pp;
		pgcnt_t metapgs;
		int dynamic;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect up information based on pages_base and pages_end
		 * early so that we can flag early that the memseg has been
		 * deleted by setting pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		dynamic = memseg_is_dynamic(seg, &mseg_start);

		seg->pages_end = seg->pages_base;

		if (dynamic) {
			/*
			 * Dynamically-added memseg: its page_t metadata
			 * lives in the deleted span itself, so point the
			 * metadata mappings at the dummy pages and recycle
			 * the memseg's VA range.
			 */
			pp = seg->pages;
			metapgs = mseg_base - mseg_start;
			ASSERT(metapgs != 0);

			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy((caddr_t)pp, metapgs);

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * Set for clean-up below.
			 */
			mseg_start = seg->pages_base;
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		/* Remove the usable pages from phys_avail ... */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		/* ... and the whole span (incl. metadata) from phys_install. */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	/* Recompute physmax/physinstalled from the trimmed phys_install. */
	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	mutex_enter(&mhp->mh_mutex);
}
2694 
2695 static uint_t mdel_nullvp_waiter;
2696 
2697 static void
2698 page_delete_collect(
2699 	page_t *pp,
2700 	struct mem_handle *mhp)
2701 {
2702 	if (pp->p_vnode) {
2703 		page_hashout(pp, (kmutex_t *)NULL);
2704 		/* do not do PP_SETAGED(pp); */
2705 	} else {
2706 		kmutex_t *sep;
2707 
2708 		sep = page_se_mutex(pp);
2709 		mutex_enter(sep);
2710 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2711 			mdel_nullvp_waiter++;
2712 			cv_broadcast(&pp->p_cv);
2713 		}
2714 		mutex_exit(sep);
2715 	}
2716 	ASSERT(pp->p_next == pp->p_prev);
2717 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2718 	pp->p_next = mhp->mh_deleted;
2719 	mhp->mh_deleted = pp;
2720 	ASSERT(mhp->mh_hold_todo != 0);
2721 	mhp->mh_hold_todo--;
2722 }
2723 
2724 static void
2725 transit_list_collect(struct mem_handle *mhp, int v)
2726 {
2727 	struct transit_list_head *trh;
2728 
2729 	trh = &transit_list_head;
2730 	mutex_enter(&trh->trh_lock);
2731 	mhp->mh_transit.trl_collect = v;
2732 	mutex_exit(&trh->trh_lock);
2733 }
2734 
2735 static void
2736 transit_list_insert(struct transit_list *tlp)
2737 {
2738 	struct transit_list_head *trh;
2739 
2740 	trh = &transit_list_head;
2741 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2742 	tlp->trl_next = trh->trh_head;
2743 	trh->trh_head = tlp;
2744 }
2745 
2746 static void
2747 transit_list_remove(struct transit_list *tlp)
2748 {
2749 	struct transit_list_head *trh;
2750 	struct transit_list **tlpp;
2751 
2752 	trh = &transit_list_head;
2753 	tlpp = &trh->trh_head;
2754 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2755 	while (*tlpp != NULL && *tlpp != tlp)
2756 		tlpp = &(*tlpp)->trl_next;
2757 	ASSERT(*tlpp != NULL);
2758 	if (*tlpp == tlp)
2759 		*tlpp = tlp->trl_next;
2760 	tlp->trl_next = NULL;
2761 }
2762 
2763 static struct transit_list *
2764 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2765 {
2766 	struct transit_list *tlp;
2767 
2768 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2769 		struct memdelspan *mdsp;
2770 
2771 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2772 		    mdsp = mdsp->mds_next) {
2773 			if (pfnum >= mdsp->mds_base &&
2774 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2775 				return (tlp);
2776 			}
2777 		}
2778 	}
2779 	return (NULL);
2780 }
2781 
2782 int
2783 pfn_is_being_deleted(pfn_t pfnum)
2784 {
2785 	struct transit_list_head *trh;
2786 	struct transit_list *tlp;
2787 	int ret;
2788 
2789 	trh = &transit_list_head;
2790 	if (trh->trh_head == NULL)
2791 		return (0);
2792 
2793 	mutex_enter(&trh->trh_lock);
2794 	tlp = pfnum_to_transit_list(trh, pfnum);
2795 	ret = (tlp != NULL && tlp->trl_collect);
2796 	mutex_exit(&trh->trh_lock);
2797 
2798 	return (ret);
2799 }
2800 
2801 #ifdef MEM_DEL_STATS
2802 extern int hz;
/*
 * Dump the per-delete statistics accumulated in mhp->mh_delstat to
 * the console.  Gated on the mem_del_stat_print tunable; only built
 * under MEM_DEL_STATS.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t tmp;

	if (mem_del_stat_print) {
		/* Identify the delete by its first span's base/size. */
		printf("memory delete loop %x/%x, statistics%s\n",
		    (uint_t)mhp->mh_transit.trl_spans->mds_base,
		    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
		    (mhp->mh_cancel ? " (cancelled)" : ""));
		printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
		printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
		printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
		printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
		printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
		printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
		printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
		printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
		printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
		printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
		printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
		printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
		printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
		printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
		printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
		printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
		printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
		printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
		printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
		printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
		printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
		printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
		printf("\t%8u retired\n", mhp->mh_delstat.retired);
		printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
		printf("\t%8u failing\n", mhp->mh_delstat.failing);
		printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
		printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
		printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
		printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
		/* Convert tick counts to whole seconds for display. */
		tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

		tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
	}
}
2853 #endif /* MEM_DEL_STATS */
2854 
/*
 * Registry of memory-configuration callback clients.  Each entry
 * pairs a client's callback vector with its private argument.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};

/* Maximum number of simultaneously registered callback clients. */
#define	NMEMCALLBACKS		100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;	/* Slots in use (highest index + 1) */
static krwlock_t mem_callback_rwlock;	/* Protects the two above */
2865 
2866 int
2867 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2868 {
2869 	uint_t i, found;
2870 
2871 	/*
2872 	 * This test will become more complicated when the version must
2873 	 * change.
2874 	 */
2875 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2876 		return (EINVAL);
2877 
2878 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2879 	    vec->post_del == NULL)
2880 		return (EINVAL);
2881 
2882 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2883 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2884 		if (mem_callbacks[i].vec == NULL && found == 0)
2885 			found = i + 1;
2886 		if (mem_callbacks[i].vec == vec &&
2887 		    mem_callbacks[i].arg == arg) {
2888 #ifdef DEBUG
2889 			/* Catch this in DEBUG kernels. */
2890 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2891 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2892 			    (void *)vec, arg, (void *)caller());
2893 #endif /* DEBUG */
2894 			rw_exit(&mem_callback_rwlock);
2895 			return (EEXIST);
2896 		}
2897 	}
2898 	if (found != 0) {
2899 		i = found - 1;
2900 	} else {
2901 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
2902 		if (nmemcallbacks == NMEMCALLBACKS) {
2903 			rw_exit(&mem_callback_rwlock);
2904 			return (ENOMEM);
2905 		}
2906 		i = nmemcallbacks++;
2907 	}
2908 	mem_callbacks[i].vec = vec;
2909 	mem_callbacks[i].arg = arg;
2910 	rw_exit(&mem_callback_rwlock);
2911 	return (0);
2912 }
2913 
2914 void
2915 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
2916 {
2917 	uint_t i;
2918 
2919 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2920 	for (i = 0; i < nmemcallbacks; i++) {
2921 		if (mem_callbacks[i].vec == vec &&
2922 		    mem_callbacks[i].arg == arg) {
2923 			mem_callbacks[i].vec = NULL;
2924 			mem_callbacks[i].arg = NULL;
2925 			if (i == (nmemcallbacks - 1))
2926 				nmemcallbacks--;
2927 			break;
2928 		}
2929 	}
2930 	rw_exit(&mem_callback_rwlock);
2931 }
2932 
2933 static void
2934 kphysm_setup_post_add(pgcnt_t delta_pages)
2935 {
2936 	uint_t i;
2937 
2938 	rw_enter(&mem_callback_rwlock, RW_READER);
2939 	for (i = 0; i < nmemcallbacks; i++) {
2940 		if (mem_callbacks[i].vec != NULL) {
2941 			(*mem_callbacks[i].vec->post_add)
2942 			    (mem_callbacks[i].arg, delta_pages);
2943 		}
2944 	}
2945 	rw_exit(&mem_callback_rwlock);
2946 }
2947 
2948 /*
2949  * Note the locking between pre_del and post_del: The reader lock is held
2950  * between the two calls to stop the set of functions from changing.
2951  */
2952 
/*
 * Invoke every registered pre_del callback; the result is the
 * bitwise OR of all callback returns, so any non-zero client
 * response is reported to the caller.
 *
 * NOTE: this function deliberately returns with mem_callback_rwlock
 * still held as reader; the matching rw_exit() is performed by
 * kphysm_setup_post_del().  See the locking comment above.
 */
static int
kphysm_setup_pre_del(pgcnt_t delta_pages)
{
	uint_t i;
	int ret;
	int aret;

	ret = 0;
	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			aret = (*mem_callbacks[i].vec->pre_del)
			    (mem_callbacks[i].arg, delta_pages);
			ret |= aret;
		}
	}

	/* Lock intentionally NOT dropped here (see note above). */
	return (ret);
}
2972 
/*
 * Invoke every registered post_del callback with the outcome of
 * the delete (cancelled != 0 means it did not complete).
 *
 * Entered with mem_callback_rwlock held as reader — it was acquired
 * by kphysm_setup_pre_del() and is released here, which is why no
 * rw_enter() appears in this function.
 */
static void
kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
{
	uint_t i;

	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_del)
			    (mem_callbacks[i].arg, delta_pages, cancelled);
		}
	}
	rw_exit(&mem_callback_rwlock);
}
2986 
/*
 * Split the boot-time memseg containing [base, base + npgs) into up
 * to three memsegs (low / middle / high) so that the middle one
 * exactly covers the given range.  Returns 1 if the split was done,
 * 0 if it could not be (no covering memseg, the memseg is dynamic,
 * the range extends past the memseg, or no split is needed).
 */
static int
kphysm_split_memseg(
	pfn_t base,
	pgcnt_t npgs)
{
	struct memseg *seg;
	struct memseg **segpp;
	pgcnt_t size_low, size_high;
	struct memseg *seg_low, *seg_mid, *seg_high;

	/*
	 * Lock the memsegs list against other updates now
	 */
	memsegs_lock(1);

	/*
	 * Find boot time memseg that wholly covers this area.
	 */

	/* First find the memseg with page 'base' in it. */
	for (segpp = &memsegs; (seg = *segpp) != NULL;
	    segpp = &((*segpp)->next)) {
		if (base >= seg->pages_base && base < seg->pages_end)
			break;
	}
	if (seg == NULL) {
		memsegs_unlock(1);
		return (0);
	}
	/* Only boot-time (non-dynamic) memsegs can be split. */
	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
		memsegs_unlock(1);
		return (0);
	}
	if ((base + npgs) > seg->pages_end) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Work out the size of the two segments that will
	 * surround the new segment, one for low address
	 * and one for high.
	 */
	ASSERT(base >= seg->pages_base);
	size_low = base - seg->pages_base;
	ASSERT(seg->pages_end >= (base + npgs));
	size_high = seg->pages_end - (base + npgs);

	/*
	 * Sanity check.
	 */
	if ((size_low + size_high) == 0) {
		/* Range equals the whole memseg: nothing to split. */
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Allocate the new structures. The old memseg will not be freed
	 * as there may be a reference to it.
	 */
	seg_low = NULL;
	seg_high = NULL;

	if (size_low != 0) {
		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_low, sizeof (struct memseg));
	}

	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
	bzero(seg_mid, sizeof (struct memseg));

	if (size_high != 0) {
		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_high, sizeof (struct memseg));
	}

	/*
	 * All allocation done now.
	 */
	if (size_low != 0) {
		seg_low->pages = seg->pages;
		seg_low->epages = seg_low->pages + size_low;
		seg_low->pages_base = seg->pages_base;
		seg_low->pages_end = seg_low->pages_base + size_low;
		seg_low->next = seg_mid;
	}
	if (size_high != 0) {
		seg_high->pages = seg->epages - size_high;
		seg_high->epages = seg_high->pages + size_high;
		seg_high->pages_base = seg->pages_end - size_high;
		seg_high->pages_end = seg_high->pages_base + size_high;
		seg_high->next = seg->next;
	}

	seg_mid->pages = seg->pages + size_low;
	seg_mid->pages_base = seg->pages_base + size_low;
	seg_mid->epages = seg->epages - size_high;
	seg_mid->pages_end = seg->pages_end - size_high;
	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;

	/*
	 * Update hat_kpm specific info of all involved memsegs and
	 * allow hat_kpm specific global chain updates.
	 */
	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

	/*
	 * At this point we have two equivalent memseg sub-chains,
	 * seg and seg_low/seg_mid/seg_high, which both chain on to
	 * the same place in the global chain. By re-writing the pointer
	 * in the previous element we switch atomically from using the old
	 * (seg) to the new.
	 */
	*segpp = (seg_low != NULL) ? seg_low : seg_mid;

	membar_enter();

	build_pfn_hash();
	memsegs_unlock(1);

	/*
	 * We leave the old segment, 'seg', intact as there may be
	 * references to it. Also, as the value of total_pages has not
	 * changed and the memsegs list is effectively the same when
	 * accessed via the old or the new pointer, we do not have to
	 * cause pageout_scanner() to re-evaluate its hand pointers.
	 *
	 * We currently do not re-use or reclaim the page_t memory.
	 * If we do, then this may have to change.
	 */

	mutex_enter(&memseg_lists_lock);
	seg->lnext = memseg_edit_junk;
	memseg_edit_junk = seg;
	mutex_exit(&memseg_lists_lock);

	return (1);
}
3125 
3126 /*
3127  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3128  * structure using physical addresses. Therefore a kmem_cache is
3129  * used with KMC_NOHASH to avoid page crossings within a memseg
3130  * structure. KMC_NOHASH requires that no external (outside of
3131  * slab) information is allowed. This, in turn, implies that the
3132  * cache's slabsize must be exactly a single page, since per-slab
3133  * information (e.g. the freelist for the slab) is kept at the
3134  * end of the slab, where it is easy to locate. Should be changed
3135  * when a more obvious kmem_cache interface/flag will become
3136  * available.
3137  */
3138 void
3139 mem_config_init()
3140 {
3141 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3142 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3143 }
3144