xref: /illumos-gate/usr/src/uts/common/os/mem_config.c (revision f334afcfaebea1b7dc3430015651d8d748fa8a3e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2017 Joyent, Inc.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/cmn_err.h>
29 #include <sys/vmem.h>
30 #include <sys/kmem.h>
31 #include <sys/systm.h>
32 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
33 #include <sys/errno.h>
34 #include <sys/memnode.h>
35 #include <sys/memlist.h>
36 #include <sys/memlist_impl.h>
37 #include <sys/tuneable.h>
38 #include <sys/proc.h>
39 #include <sys/disp.h>
40 #include <sys/debug.h>
41 #include <sys/vm.h>
42 #include <sys/callb.h>
43 #include <sys/memlist_plat.h>	/* for installed_top_size() */
44 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
45 #include <sys/dumphdr.h>	/* for dump_resize() */
46 #include <sys/atomic.h>		/* for use in stats collection */
47 #include <sys/rwlock.h>
48 #include <sys/cpuvar.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kpm.h>
51 #include <vm/page.h>
52 #include <vm/vm_dep.h>
53 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
54 #include <sys/sunddi.h>
55 #include <sys/mem_config.h>
56 #include <sys/mem_cage.h>
57 #include <sys/lgrp.h>
58 #include <sys/ddi.h>
59 #include <sys/modctl.h>
60 
/*
 * Memory list that kphysm_add_memory_dynamic() extends with the usable
 * (non-metadata) part of a newly added span.
 */
extern struct memlist *phys_avail;

extern uint_t page_ctrs_adjust(int);
/* Called by kphysm_add_memory_dynamic() when PAGE_CTRS_ADJUST() fails. */
void page_ctrs_cleanup(void);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

/*
 * An add reserves its span on the transit lists for the duration of the
 * operation; see delspan_reserve() and delspan_unreserve() below.
 */
static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

/* Held by memseg_reuse() while it walks and edits memseg_va_avail. */
kmutex_t memseg_lists_lock;
/* Memsegs available for reuse, linked through their lnext field. */
struct memseg *memseg_va_avail;
struct memseg *memseg_alloc(void);
/* NOTE(review): the two junk lists are not referenced in this part of the file. */
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(struct memseg *);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Interfaces to manage externally allocated
 * page_t memory (metadata) for a memseg.
 *
 * The symbols are declared weak.  In kphysm_add_memory_dynamic() they are
 * only called when meta_alloc_enable is non-zero (memseg_alloc_meta())
 * or after memseg_alloc_meta() has succeeded (the others).
 */
#pragma weak	memseg_alloc_meta
#pragma weak	memseg_free_meta
#pragma weak	memseg_get_metapfn
#pragma weak	memseg_remap_meta

extern int ppvm_enable;
/* Base of the page_t table that is indexed by pfn when metadata is external. */
extern page_t *ppvm_base;
extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
extern void memseg_free_meta(void *, pgcnt_t);
extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
extern void memseg_remap_meta(struct memseg *);
static int memseg_is_dynamic(struct memseg *);
static int memseg_includes_meta(struct memseg *);
pfn_t memseg_get_start(struct memseg *);
static void memseg_cpu_vm_flush(void);

/*
 * When non-zero, kphysm_add_memory_dynamic() first tries to obtain the
 * page_t storage through memseg_alloc_meta() before falling back to
 * carving it out of the incoming memory.
 */
int meta_alloc_enable;
107 
/*
 * MEMSEG_DEBUG(fmt, ...) prints a trace message on DEBUG kernels when
 * memseg_debug is non-zero and does nothing otherwise.
 *
 * Both forms expand to a single complete statement so that the macro can
 * be used as the body of an if/else without capturing the else branch.
 */
#ifdef	DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(...)			\
	do {					\
		if (memseg_debug)		\
			printf(__VA_ARGS__);	\
	} while (0)
#else
#define	MEMSEG_DEBUG(...)	((void)0)
#endif
114 
/*
 * Add a chunk of memory to the system.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.  This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 *
 * The span is reserved with delspan_reserve() for the duration of the
 * call and added to phys_install first; every failure path after that
 * point undoes both before returning.
 *
 * Returns:
 *   KPHYSM_OK		the memory was added.
 *   KPHYSM_ESPAN	the span overlaps a reserved span or the add to
 *			phys_install reported MEML_SPANOP_ESPAN.
 *   KPHYSM_ERESOURCE	a memlist, VA or page counter allocation failed,
 *			or the span is too small to hold its own metadata.
 *   KPHYSM_ENOTVIABLE	kpm is enabled and the span violates the
 *			kpm page size granularity checks.
 *   KPHYSM_EFAULT	the freshly mapped page_t area could not be read.
 */
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t		*opp, *oepp, *segpp;
	struct memseg	*seg;
	uint64_t	avmem;
	pfn_t		pfn;
	/*
	 * base/npgs are adjusted below to exclude the metadata pages;
	 * pt_base/tpgs keep the span exactly as the caller passed it.
	 */
	pfn_t		pt_base = base;
	pgcnt_t		tpgs = npgs;
	pgcnt_t		metapgs = 0;
	int		exhausted;
	pfn_t		pnum;
	int		mnode;
	caddr_t		vaddr;
	int		reuse;
	int		mlret;
	int		rv;
	int		flags;
	int		meta_alloc = 0;
	void		*mapva;
	void		*metabase = (void *)base;
	pgcnt_t		nkpmpgs = 0;
	offset_t	kpm_pages_off = 0;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}
	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	/*
	 * Every failure branch drops the reservation taken above; only
	 * the error code reported to the caller differs.
	 */
	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}

	if (meta_alloc_enable) {
		/*
		 * Allocate the page_t's from existing memory;
		 * if that fails, allocate from the incoming memory.
		 */
		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
		if (rv == KPHYSM_OK) {
			ASSERT(metapgs);
			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
			meta_alloc = 1;
			goto mapalloc;
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	/* Nothing left for either the metadata or the usable pages. */
	exhausted = (metapgs == 0 || npgs == 0);

	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t	ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		/*
		 * Recompute the metadata size so that it also covers the
		 * kpm page structures, then re-derive base and npgs from
		 * the original span.
		 */
		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);
		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

mapalloc:
	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 *
	 * Without meta_alloc the metadata pages are the first metapgs
	 * pages of the span, so pfn simply counts up from pt_base; with
	 * meta_alloc each pfn is looked up through memseg_get_metapfn().
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 *
	 * Note that there may be multiple memory nodes associated with
	 * a single lgrp node on x86 systems.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	PAGE_CTRS_ADJUST(base, npgs, rv);
	if (rv) {

		mem_node_del_range(pt_base, pnum);

		/* Clean up the page counters. */
		page_ctrs_cleanup();

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA.  The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches.  We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
	 */
	if (seg == NULL) {
		seg = memseg_alloc();
		ASSERT(seg != NULL);
		seg->msegflags = flags;
		/*
		 * NOTE(review): seg->pages is printed here before it is
		 * assigned on the next statement.
		 */
		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		seg->pages = segpp;
	} else {
		ASSERT(seg->msegflags == flags);
		ASSERT(seg->pages_base == seg->pages_end);
		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		if (meta_alloc) {
			memseg_cpu_vm_flush();
			seg->pages = segpp;
		}
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.  This is done through the temporary
	 * mapping at pp, not through seg->pages.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			if (meta_alloc)
				pfn = memseg_get_metapfn(metabase,
				    (pgcnt_t)pnum);
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		/* The temporary mapping is no longer needed. */
		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	/* Set seg->next before the new memseg is published via memsegs. */
	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock();

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *   physmax - highest page frame number.
	 *   physinstalled - number of pages currently installed (done earlier)
	 *   maxmem - max free pages in the system
	 *   physmem - physical memory pages available
	 *   availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/*
	 * Inform DDI of update
	 */
	ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT);

	/* The add is complete; release the reservation taken on entry. */
	delspan_unreserve(pt_base, tpgs);

	return (KPHYSM_OK);		/* Successfully added system memory */
}
576 
577 /*
578  * There are various error conditions in kphysm_add_memory_dynamic()
579  * which require a rollback of already changed global state.
580  */
581 static void
582 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
583 {
584 	int mlret;
585 
586 	/* Unreserve memory span. */
587 	memlist_write_lock();
588 
589 	mlret = memlist_delete_span(
590 	    (uint64_t)(pt_base) << PAGESHIFT,
591 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
592 
593 	ASSERT(mlret == MEML_SPANOP_OK);
594 	phys_install_has_changed();
595 	installed_top_size(phys_install, &physmax, &physinstalled);
596 
597 	memlist_write_unlock();
598 	delspan_unreserve(pt_base, tpgs);
599 }
600 
601 /*
602  * Only return an available memseg of exactly the right size
603  * if size is required.
604  * When the meta data area has it's own virtual address space
605  * we will need to manage this more carefully and do best fit
606  * allocations, possibly splitting an available area.
607  */
608 struct memseg *
609 memseg_reuse(pgcnt_t metapgs)
610 {
611 	int type;
612 	struct memseg **segpp, *seg;
613 
614 	mutex_enter(&memseg_lists_lock);
615 
616 	segpp = &memseg_va_avail;
617 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
618 		caddr_t end;
619 
620 		/*
621 		 * Make sure we are reusing the right segment type.
622 		 */
623 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
624 
625 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
626 		    != type)
627 			continue;
628 
629 		if (kpm_enable)
630 			end = hat_kpm_mseg_reuse(seg);
631 		else
632 			end = (caddr_t)seg->epages;
633 
634 		/*
635 		 * Check for the right size if it is provided.
636 		 */
637 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
638 			*segpp = seg->lnext;
639 			seg->lnext = NULL;
640 			break;
641 		}
642 	}
643 	mutex_exit(&memseg_lists_lock);
644 
645 	return (seg);
646 }
647 
/*
 * Counter used to generate external handle values; it is incremented
 * in kphysm_allocate_mem_handle() under mem_handle_list_mutex.
 */
static uint_t handle_gen;

/*
 * A contiguous range of page frames that is either part of a delete
 * request or reserved by an in-progress add (see delspan_reserve()).
 * Spans are chained into singly linked lists through mds_next.
 */
struct memdelspan {
	struct memdelspan *mds_next;	/* next span on the same list */
	pfn_t		mds_base;	/* first page frame of the span */
	pgcnt_t		mds_npgs;	/* length of the span in pages */
	/*
	 * NOTE(review): presumably per-page bitmaps sized with
	 * MDS_BITMAPBYTES(); they are not allocated or freed by the code
	 * in this part of the file -- confirm against the delete code.
	 */
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one bitmap word (a uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Bytes needed for a bitmap with one bit per page of the span,
 * rounded up to a whole number of uint_t words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
661 
/*
 * A list of spans belonging to one operation.  While it has spans it is
 * linked onto transit_list_head (see delspan_insert()/delspan_remove()).
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next list on trh_head */
	struct memdelspan	*trl_spans;	/* spans owned by this list */
	/* NOTE(review): used by transit_list_collect(), not shown here. */
	int			trl_collect;
};

/*
 * Head of all active transit lists.  trh_lock is held by
 * delspan_insert() and delspan_remove() while they examine or change
 * the lists and the spans hanging off them.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
/* Both are called from this file with transit_list_head.trh_lock held. */
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);
679 
/*
 * Delete statistics are compiled in only when MEM_DEL_STATS is defined,
 * which happens automatically for DEBUG builds.  Without it the
 * MDSTAT_*() macros below expand to nothing.
 */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/* Per-handle counters; embedded in struct mem_handle as mh_delstat. */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Increment counter FLD of the handle's statistics. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Add ntck to the handle's nticks_total / nticks_pgrp accumulators. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
734 
/*
 * Life-cycle states of a mem_handle.  MHND_FREE is zero, so a handle
 * fresh from kmem_zalloc() starts out FREE; kphysm_del_gethandle()
 * moves it to MHND_INIT, the only state in which kphysm_del_span()
 * accepts new spans.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;	/* link on mem_handle_head */
	memhandle_t	mh_exthandle;	/* value handed out to callers */
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;	/* spans added by kphysm_del_span() */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

/*
 * All allocated handles, linked through mh_next.  The list (and
 * handle_gen) is protected by mem_handle_list_mutex, which is taken
 * before a handle's mh_mutex when both are needed.
 */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
766 
767 static struct mem_handle *
768 kphysm_allocate_mem_handle()
769 {
770 	struct mem_handle *mhp;
771 
772 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
773 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
774 	mutex_enter(&mem_handle_list_mutex);
775 	mutex_enter(&mhp->mh_mutex);
776 	/* handle_gen is protected by list mutex. */
777 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
778 	mhp->mh_next = mem_handle_head;
779 	mem_handle_head = mhp;
780 	mutex_exit(&mem_handle_list_mutex);
781 
782 	return (mhp);
783 }
784 
785 static void
786 kphysm_free_mem_handle(struct mem_handle *mhp)
787 {
788 	struct mem_handle **mhpp;
789 
790 	ASSERT(mutex_owned(&mhp->mh_mutex));
791 	ASSERT(mhp->mh_state == MHND_FREE);
792 	/*
793 	 * Exit the mutex to preserve locking order. This is OK
794 	 * here as once in the FREE state, the handle cannot
795 	 * be found by a lookup.
796 	 */
797 	mutex_exit(&mhp->mh_mutex);
798 
799 	mutex_enter(&mem_handle_list_mutex);
800 	mhpp = &mem_handle_head;
801 	while (*mhpp != NULL && *mhpp != mhp)
802 		mhpp = &(*mhpp)->mh_next;
803 	ASSERT(*mhpp == mhp);
804 	/*
805 	 * No need to lock the handle (mh_mutex) as only
806 	 * mh_next changing and this is the only thread that
807 	 * can be referncing mhp.
808 	 */
809 	*mhpp = mhp->mh_next;
810 	mutex_exit(&mem_handle_list_mutex);
811 
812 	mutex_destroy(&mhp->mh_mutex);
813 	kmem_free(mhp, sizeof (struct mem_handle));
814 }
815 
816 /*
817  * This function finds the internal mem_handle corresponding to an
818  * external handle and returns it with the mh_mutex held.
819  */
820 static struct mem_handle *
821 kphysm_lookup_mem_handle(memhandle_t handle)
822 {
823 	struct mem_handle *mhp;
824 
825 	mutex_enter(&mem_handle_list_mutex);
826 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
827 		if (mhp->mh_exthandle == handle) {
828 			mutex_enter(&mhp->mh_mutex);
829 			/*
830 			 * The state of the handle could have been changed
831 			 * by kphysm_del_release() while waiting for mh_mutex.
832 			 */
833 			if (mhp->mh_state == MHND_FREE) {
834 				mutex_exit(&mhp->mh_mutex);
835 				continue;
836 			}
837 			break;
838 		}
839 	}
840 	mutex_exit(&mem_handle_list_mutex);
841 	return (mhp);
842 }
843 
844 int
845 kphysm_del_gethandle(memhandle_t *xmhp)
846 {
847 	struct mem_handle *mhp;
848 
849 	mhp = kphysm_allocate_mem_handle();
850 	/*
851 	 * The handle is allocated using KM_SLEEP, so cannot fail.
852 	 * If the implementation is changed, the correct error to return
853 	 * here would be KPHYSM_ENOHANDLES.
854 	 */
855 	ASSERT(mhp->mh_state == MHND_FREE);
856 	mhp->mh_state = MHND_INIT;
857 	*xmhp = mhp->mh_exthandle;
858 	mutex_exit(&mhp->mh_mutex);
859 	return (KPHYSM_OK);
860 }
861 
862 static int
863 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
864 {
865 	pfn_t e1, e2;
866 
867 	e1 = b1 + l1;
868 	e2 = b2 + l2;
869 
870 	return (!(b2 >= e1 || b1 >= e2));
871 }
872 
873 static int can_remove_pgs(pgcnt_t);
874 
875 static struct memdelspan *
876 span_to_install(pfn_t base, pgcnt_t npgs)
877 {
878 	struct memdelspan *mdsp;
879 	struct memdelspan *mdsp_new;
880 	uint64_t address, size, thislen;
881 	struct memlist *mlp;
882 
883 	mdsp_new = NULL;
884 
885 	address = (uint64_t)base << PAGESHIFT;
886 	size = (uint64_t)npgs << PAGESHIFT;
887 	while (size != 0) {
888 		memlist_read_lock();
889 		for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
890 			if (address >= (mlp->ml_address + mlp->ml_size))
891 				continue;
892 			if ((address + size) > mlp->ml_address)
893 				break;
894 		}
895 		if (mlp == NULL) {
896 			address += size;
897 			size = 0;
898 			thislen = 0;
899 		} else {
900 			if (address < mlp->ml_address) {
901 				size -= (mlp->ml_address - address);
902 				address = mlp->ml_address;
903 			}
904 			ASSERT(address >= mlp->ml_address);
905 			if ((address + size) >
906 			    (mlp->ml_address + mlp->ml_size)) {
907 				thislen =
908 				    mlp->ml_size - (address - mlp->ml_address);
909 			} else {
910 				thislen = size;
911 			}
912 		}
913 		memlist_read_unlock();
914 		/* TODO: phys_install could change now */
915 		if (thislen == 0)
916 			continue;
917 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
918 		mdsp->mds_base = btop(address);
919 		mdsp->mds_npgs = btop(thislen);
920 		mdsp->mds_next = mdsp_new;
921 		mdsp_new = mdsp;
922 		address += thislen;
923 		size -= thislen;
924 	}
925 	return (mdsp_new);
926 }
927 
928 static void
929 free_delspans(struct memdelspan *mdsp)
930 {
931 	struct memdelspan *amdsp;
932 
933 	while ((amdsp = mdsp) != NULL) {
934 		mdsp = amdsp->mds_next;
935 		kmem_free(amdsp, sizeof (struct memdelspan));
936 	}
937 }
938 
939 /*
940  * Concatenate lists. No list ordering is required.
941  */
942 
943 static void
944 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
945 {
946 	while (*mdspp != NULL)
947 		mdspp = &(*mdspp)->mds_next;
948 
949 	*mdspp = mdsp;
950 }
951 
952 /*
953  * Given a new list of delspans, check there is no overlap with
954  * all existing span activity (add or delete) and then concatenate
955  * the new spans to the given list.
956  * Return 1 for OK, 0 if overlapping.
957  */
958 static int
959 delspan_insert(
960 	struct transit_list *my_tlp,
961 	struct memdelspan *mdsp_new)
962 {
963 	struct transit_list_head *trh;
964 	struct transit_list *tlp;
965 	int ret;
966 
967 	trh = &transit_list_head;
968 
969 	ASSERT(my_tlp != NULL);
970 	ASSERT(mdsp_new != NULL);
971 
972 	ret = 1;
973 	mutex_enter(&trh->trh_lock);
974 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
975 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
976 		struct memdelspan *mdsp;
977 
978 		for (mdsp = tlp->trl_spans; mdsp != NULL;
979 		    mdsp = mdsp->mds_next) {
980 			struct memdelspan *nmdsp;
981 
982 			for (nmdsp = mdsp_new; nmdsp != NULL;
983 			    nmdsp = nmdsp->mds_next) {
984 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
985 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
986 					ret = 0;
987 					goto done;
988 				}
989 			}
990 		}
991 	}
992 done:
993 	if (ret != 0) {
994 		if (my_tlp->trl_spans == NULL)
995 			transit_list_insert(my_tlp);
996 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
997 	}
998 	mutex_exit(&trh->trh_lock);
999 	return (ret);
1000 }
1001 
1002 static void
1003 delspan_remove(
1004 	struct transit_list *my_tlp,
1005 	pfn_t base,
1006 	pgcnt_t npgs)
1007 {
1008 	struct transit_list_head *trh;
1009 	struct memdelspan *mdsp;
1010 
1011 	trh = &transit_list_head;
1012 
1013 	ASSERT(my_tlp != NULL);
1014 
1015 	mutex_enter(&trh->trh_lock);
1016 	if ((mdsp = my_tlp->trl_spans) != NULL) {
1017 		if (npgs == 0) {
1018 			my_tlp->trl_spans = NULL;
1019 			free_delspans(mdsp);
1020 			transit_list_remove(my_tlp);
1021 		} else {
1022 			struct memdelspan **prv;
1023 
1024 			prv = &my_tlp->trl_spans;
1025 			while (mdsp != NULL) {
1026 				pfn_t p_end;
1027 
1028 				p_end = mdsp->mds_base + mdsp->mds_npgs;
1029 				if (mdsp->mds_base >= base &&
1030 				    p_end <= (base + npgs)) {
1031 					*prv = mdsp->mds_next;
1032 					mdsp->mds_next = NULL;
1033 					free_delspans(mdsp);
1034 				} else {
1035 					prv = &mdsp->mds_next;
1036 				}
1037 				mdsp = *prv;
1038 			}
1039 			if (my_tlp->trl_spans == NULL)
1040 				transit_list_remove(my_tlp);
1041 		}
1042 	}
1043 	mutex_exit(&trh->trh_lock);
1044 }
1045 
1046 /*
1047  * Reserve interface for add to stop delete before add finished.
1048  * This list is only accessed through the delspan_insert/remove
1049  * functions and so is fully protected by the mutex in struct transit_list.
1050  */
1051 
1052 static struct transit_list reserve_transit;
1053 
1054 static int
1055 delspan_reserve(pfn_t base, pgcnt_t npgs)
1056 {
1057 	struct memdelspan *mdsp;
1058 	int ret;
1059 
1060 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1061 	mdsp->mds_base = base;
1062 	mdsp->mds_npgs = npgs;
1063 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1064 		free_delspans(mdsp);
1065 	}
1066 	return (ret);
1067 }
1068 
/*
 * Release a reservation made by delspan_reserve().
 *
 * The work is done by delspan_remove() on the reserve list: spans lying
 * entirely within [base, base + npgs) are removed, and an npgs of zero
 * removes every reserved span.
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}
1074 
/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 *
 * The result is the MEMSEG_DYNAMIC bit of seg->msegflags itself, so a
 * "true" result is that flag's value rather than 1.
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_DYNAMIC);
}
1083 
/*
 * Add the span [base, base + npgs) to the set of spans to be deleted
 * under the given delete handle.
 *
 * The handle must be in the MHND_INIT state.  The span is intersected
 * with installed memory, inserted on the handle's transit list and then
 * every memseg it touches is checked: memsegs that include their own
 * metadata must lie wholly inside the span, other memsegs are split to
 * fit, and no page_t in a covered memseg may be marked non-relocatable.
 * On success the handle's mh_phys_pages and mh_vm_pages totals are
 * increased; on failure the span is removed from the handle again.
 *
 * Returns:
 *	KPHYSM_OK		span accepted, or no installed memory in range
 *	KPHYSM_EHANDLE		handle lookup failed
 *	KPHYSM_ESEQUENCE	handle is not in the MHND_INIT state
 *	KPHYSM_EDUP		span overlaps one already on this handle
 *	KPHYSM_EBUSY		span is busy elsewhere, or a memseg carrying
 *				its own metadata is only partly covered
 *	KPHYSM_ERESOURCE	a required memseg split failed
 *	KPHYSM_ENONRELOC	a non-relocatable page was found, or the
 *				memsegs do not account for every page
 *
 * NOTE(review): every path after a successful lookup drops mh_mutex
 * without taking it here, so kphysm_lookup_mem_handle() presumably
 * returns with mh_mutex held -- confirm against its definition.
 */
int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		/* The insert was refused, so the new spans are still ours. */
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	/*
	 * The totals are recomputed from scratch on every pass because a
	 * memseg split sends control back here.
	 */
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		/* Only look at the spans that came from this request. */
		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			mseg_start = memseg_get_start(seg);
			/* Check that segment is suitable for delete. */
			if (memseg_includes_meta(seg)) {
				/*
				 * Check that this segment is completely
				 * within the span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required.  */
					/*
					 * [abase, abase + anpgs) is the part
					 * of the memseg that lies inside the
					 * span.
					 */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			/* Cage test. */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		/*
		 * Any page of the span that no memseg accounted for is
		 * treated as non-relocatable.
		 */
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
		 */
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}
1268 
/*
 * Report on the relocatability of the span [base, base + npgs) without
 * reserving or changing anything.
 *
 * The span is intersected with installed memory and the resulting
 * sub-spans are walked page by page.  Results are stored in *mqp:
 *	phys_pages		total pages in the intersected sub-spans
 *	managed			pages examined through a memseg page_t
 *	nonrelocatable		pages found to be non-relocatable, either by
 *				arch_kphysm_del_span_ok() (pages outside any
 *				memseg) or by PP_ISNORELOC() (page_t pages)
 *	first_nonrelocatable	pfn of the first such page (0 if none)
 *	last_nonrelocatable	pfn of the last such page (0 if none)
 *
 * Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		/*
		 * sbase/snpgs describe the part of this sub-span still to
		 * be examined; each pass of the loop below consumes some
		 * of it from the front.
		 */
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				mseg_start = memseg_get_start(seg);
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			/*
			 * mseg_start is only assigned when a memseg was
			 * found; the seg == NULL test short-circuits the
			 * read otherwise.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/*
				 * Fine-grained test: a no-op if the whole
				 * gap was accepted above.
				 */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						/* Span ends inside that area. */
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	/* The intersected list was only needed for the walk above. */
	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
1420 
1421 /*
1422  * This release function can be called at any stage as follows:
1423  *	_gethandle only called
1424  *	_span(s) only called
1425  *	_start called but failed
1426  *	delete thread exited
1427  */
1428 int
1429 kphysm_del_release(memhandle_t handle)
1430 {
1431 	struct mem_handle *mhp;
1432 
1433 	mhp = kphysm_lookup_mem_handle(handle);
1434 	if (mhp == NULL) {
1435 		return (KPHYSM_EHANDLE);
1436 	}
1437 	switch (mhp->mh_state) {
1438 	case MHND_STARTING:
1439 	case MHND_RUNNING:
1440 		mutex_exit(&mhp->mh_mutex);
1441 		return (KPHYSM_ENOTFINISHED);
1442 	case MHND_FREE:
1443 		ASSERT(mhp->mh_state != MHND_FREE);
1444 		mutex_exit(&mhp->mh_mutex);
1445 		return (KPHYSM_EHANDLE);
1446 	case MHND_INIT:
1447 		break;
1448 	case MHND_DONE:
1449 		break;
1450 	case MHND_RELEASE:
1451 		mutex_exit(&mhp->mh_mutex);
1452 		return (KPHYSM_ESEQUENCE);
1453 	default:
1454 #ifdef DEBUG
1455 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1456 		    (void *)mhp, mhp->mh_state);
1457 #endif /* DEBUG */
1458 		mutex_exit(&mhp->mh_mutex);
1459 		return (KPHYSM_EHANDLE);
1460 	}
1461 	/*
1462 	 * Set state so that we can wait if necessary.
1463 	 * Also this means that we have read/write access to all
1464 	 * fields except mh_exthandle and mh_state.
1465 	 */
1466 	mhp->mh_state = MHND_RELEASE;
1467 	/*
1468 	 * The mem_handle cannot be de-allocated by any other operation
1469 	 * now, so no need to hold mh_mutex.
1470 	 */
1471 	mutex_exit(&mhp->mh_mutex);
1472 
1473 	delspan_remove(&mhp->mh_transit, 0, 0);
1474 	mhp->mh_phys_pages = 0;
1475 	mhp->mh_vm_pages = 0;
1476 	mhp->mh_hold_todo = 0;
1477 	mhp->mh_delete_complete = NULL;
1478 	mhp->mh_delete_complete_arg = NULL;
1479 	mhp->mh_cancel = 0;
1480 
1481 	mutex_enter(&mhp->mh_mutex);
1482 	ASSERT(mhp->mh_state == MHND_RELEASE);
1483 	mhp->mh_state = MHND_FREE;
1484 
1485 	kphysm_free_mem_handle(mhp);
1486 
1487 	return (KPHYSM_OK);
1488 }
1489 
1490 /*
1491  * This cancel function can only be called with the thread running.
1492  */
1493 int
1494 kphysm_del_cancel(memhandle_t handle)
1495 {
1496 	struct mem_handle *mhp;
1497 
1498 	mhp = kphysm_lookup_mem_handle(handle);
1499 	if (mhp == NULL) {
1500 		return (KPHYSM_EHANDLE);
1501 	}
1502 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1503 		mutex_exit(&mhp->mh_mutex);
1504 		return (KPHYSM_ENOTRUNNING);
1505 	}
1506 	/*
1507 	 * Set the cancel flag and wake the delete thread up.
1508 	 * The thread may be waiting on I/O, so the effect of the cancel
1509 	 * may be delayed.
1510 	 */
1511 	if (mhp->mh_cancel == 0) {
1512 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1513 		cv_signal(&mhp->mh_cv);
1514 	}
1515 	mutex_exit(&mhp->mh_mutex);
1516 	return (KPHYSM_OK);
1517 }
1518 
1519 int
1520 kphysm_del_status(
1521 	memhandle_t handle,
1522 	memdelstat_t *mdstp)
1523 {
1524 	struct mem_handle *mhp;
1525 
1526 	mhp = kphysm_lookup_mem_handle(handle);
1527 	if (mhp == NULL) {
1528 		return (KPHYSM_EHANDLE);
1529 	}
1530 	/*
1531 	 * Calling kphysm_del_status() is allowed before the delete
1532 	 * is started to allow for status display.
1533 	 */
1534 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1535 	    mhp->mh_state != MHND_RUNNING) {
1536 		mutex_exit(&mhp->mh_mutex);
1537 		return (KPHYSM_ENOTRUNNING);
1538 	}
1539 	mdstp->phys_pages = mhp->mh_phys_pages;
1540 	mdstp->managed = mhp->mh_vm_pages;
1541 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1542 	mutex_exit(&mhp->mh_mutex);
1543 	return (KPHYSM_OK);
1544 }
1545 
1546 static int mem_delete_additional_pages = 100;
1547 
1548 static int
1549 can_remove_pgs(pgcnt_t npgs)
1550 {
1551 	/*
1552 	 * If all pageable pages were paged out, freemem would
1553 	 * equal availrmem.  There is a minimum requirement for
1554 	 * availrmem.
1555 	 */
1556 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1557 	    < npgs)
1558 		return (0);
1559 	/* TODO: check swap space, etc. */
1560 	return (1);
1561 }
1562 
1563 static int
1564 get_availrmem(pgcnt_t npgs)
1565 {
1566 	int ret;
1567 
1568 	mutex_enter(&freemem_lock);
1569 	ret = can_remove_pgs(npgs);
1570 	if (ret != 0)
1571 		availrmem -= npgs;
1572 	mutex_exit(&freemem_lock);
1573 	return (ret);
1574 }
1575 
/*
 * Give npgs pages back to availrmem, under freemem_lock.  This is the
 * inverse of a successful get_availrmem().
 */
static void
put_availrmem(pgcnt_t npgs)
{
	mutex_enter(&freemem_lock);
	availrmem += npgs;
	mutex_exit(&freemem_lock);
}
1583 
/*
 * freemem_incr is the largest number of pages delthr_get_freemem() tries
 * to obtain in one call; FREEMEM_INCR is its initial value.
 */
#define	FREEMEM_INCR	100
static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * Timeout, in clock ticks, for the wait in delthr_get_freemem() between
 * attempts to obtain free memory: hz / DEL_FREE_WAIT_FRAC, rounded up.
 */
#define	DEL_FREE_WAIT_FRAC	4
#define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

/*
 * hz / DEL_BUSY_WAIT_FRAC clock ticks, rounded up.
 */
#define	DEL_BUSY_WAIT_FRAC	20
#define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

/* Forward declarations for helpers defined later in this file. */
static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
1595 
/*
 * Obtain a batch of free pages on behalf of the delete thread.
 *
 * The batch size is freemem_incr, capped at the number of pages the
 * delete still has to hold (mh_hold_todo).  The function keeps trying,
 * putting pressure on pageout and sleeping for up to
 * DEL_FREE_WAIT_TICKS between attempts, until either the pages are
 * obtained or the delete is cancelled.
 *
 * Must be called with mh_mutex held.  The mutex is dropped and re-taken
 * internally and is held again on return.
 *
 * Returns the number of pages obtained, or 0 if mh_cancel was set
 * before any could be obtained.
 */
static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
	pgcnt_t free_get;
	int ret;

	ASSERT(MUTEX_HELD(&mhp->mh_mutex));

	MDSTAT_INCR(mhp, need_free);
	/*
	 * Get up to freemem_incr pages.
	 */
	free_get = freemem_incr;
	if (free_get > mhp->mh_hold_todo)
		free_get = mhp->mh_hold_todo;
	/*
	 * Take free_get pages away from freemem,
	 * waiting if necessary.
	 */

	while (!mhp->mh_cancel) {
		/* mh_mutex is not held across the freemem calls below. */
		mutex_exit(&mhp->mh_mutex);
		MDSTAT_INCR(mhp, free_loop);
		/*
		 * Duplicate test from page_create_throttle()
		 * but don't override with !PG_WAIT.
		 */
		if (freemem < (free_get + throttlefree)) {
			MDSTAT_INCR(mhp, free_low);
			ret = 0;
		} else {
			ret = page_create_wait(free_get, 0);
			if (ret == 0) {
				/* EMPTY */
				MDSTAT_INCR(mhp, free_failed);
			}
		}
		if (ret != 0) {
			/* Got the pages; return with mh_mutex held. */
			mutex_enter(&mhp->mh_mutex);
			return (free_get);
		}

		/*
		 * Put pressure on pageout.
		 */
		page_needfree(free_get);
		WAKE_PAGEOUT_SCANNER(delthr);

		/*
		 * Wait on mh_cv for up to DEL_FREE_WAIT_TICKS; a cancel
		 * request signals mh_cv and so ends the wait early.
		 */
		mutex_enter(&mhp->mh_mutex);
		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
		mutex_exit(&mhp->mh_mutex);
		/* Withdraw the request made to page_needfree() above. */
		page_needfree(-(spgcnt_t)free_get);

		mutex_enter(&mhp->mh_mutex);
	}
	/* Cancelled: nothing was obtained. */
	return (0);
}
1654 
#define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
#define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
/*
 * This function is run as a helper thread for delete_memory_thread.
 * It is needed in order to force kaio cleanup, so that pages used in kaio
 * will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
 * this thread function, and is used to set the mh_aio_cleanup_done member
 * prior to calling thread_exit().
 *
 * The thread repeatedly walks the active process list, invoking
 * aio_cleanup_dr_delete_memory() (looked up in the kaio module) on every
 * process that has kaio state, until mh_dr_aio_cleanup_cancel becomes
 * non-zero.  It sleeps for DR_AIO_CLEANUP_DELAY whenever a pass cleans
 * nothing, and also after DR_AIO_CLEANUP_MAXLOOPS_NODELAY passes in a
 * row that did clean something.
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;
	int n = 0;
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	/* Read the cancel flag through a volatile pointer on every test. */
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
	    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		/* One pass over the active process list, under pidlock. */
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		/*
		 * n is only incremented when this pass cleaned something,
		 * so it counts consecutive productive passes made without
		 * a delay.
		 */
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}
1715 
1716 static void
1717 delete_memory_thread(caddr_t amhp)
1718 {
1719 	struct mem_handle *mhp;
1720 	struct memdelspan *mdsp;
1721 	callb_cpr_t cprinfo;
1722 	page_t *pp_targ;
1723 	spgcnt_t freemem_left;
1724 	void (*del_complete_funcp)(void *, int error);
1725 	void *del_complete_arg;
1726 	int comp_code;
1727 	int ret;
1728 	int first_scan;
1729 	uint_t szc;
1730 #ifdef MEM_DEL_STATS
1731 	uint64_t start_total, ntick_total;
1732 	uint64_t start_pgrp, ntick_pgrp;
1733 #endif /* MEM_DEL_STATS */
1734 
1735 	mhp = (struct mem_handle *)amhp;
1736 
1737 #ifdef MEM_DEL_STATS
1738 	start_total = ddi_get_lbolt();
1739 #endif /* MEM_DEL_STATS */
1740 
1741 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1742 	    callb_generic_cpr, "memdel");
1743 
1744 	mutex_enter(&mhp->mh_mutex);
1745 	ASSERT(mhp->mh_state == MHND_STARTING);
1746 
1747 	mhp->mh_state = MHND_RUNNING;
1748 	mhp->mh_thread_id = curthread;
1749 
1750 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1751 	mutex_exit(&mhp->mh_mutex);
1752 
1753 	/* Allocate the remap pages now, if necessary. */
1754 	memseg_remap_init();
1755 
1756 	/*
1757 	 * Subtract from availrmem now if possible as availrmem
1758 	 * may not be available by the end of the delete.
1759 	 */
1760 	if (!get_availrmem(mhp->mh_vm_pages)) {
1761 		comp_code = KPHYSM_ENOTVIABLE;
1762 		mutex_enter(&mhp->mh_mutex);
1763 		goto early_exit;
1764 	}
1765 
1766 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1767 
1768 	mutex_enter(&mhp->mh_mutex);
1769 
1770 	if (ret != 0) {
1771 		mhp->mh_cancel = KPHYSM_EREFUSED;
1772 		goto refused;
1773 	}
1774 
1775 	transit_list_collect(mhp, 1);
1776 
1777 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1778 	    mdsp = mdsp->mds_next) {
1779 		ASSERT(mdsp->mds_bitmap == NULL);
1780 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1781 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1782 		    KM_SLEEP);
1783 	}
1784 
1785 	first_scan = 1;
1786 	freemem_left = 0;
1787 	/*
1788 	 * Start dr_aio_cleanup_thread, which periodically iterates
1789 	 * through the process list and invokes aio cleanup.  This
1790 	 * is needed in order to avoid a deadly embrace between the
1791 	 * delete_memory_thread (waiting on writer lock for page, with the
1792 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1793 	 * reader lock on the same page that is wanted by the
1794 	 * delete_memory_thread), and threads waiting for kaio completion
1795 	 * (blocked on spt_amp->lock).
1796 	 */
1797 	mhp->mh_dr_aio_cleanup_cancel = 0;
1798 	mhp->mh_aio_cleanup_done = 0;
1799 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1800 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1801 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1802 		pgcnt_t collected;
1803 
1804 		MDSTAT_INCR(mhp, nloop);
1805 		collected = 0;
1806 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1807 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1808 			pfn_t pfn, p_end;
1809 
1810 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1811 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1812 			    (mhp->mh_cancel == 0); pfn++) {
1813 				page_t *pp, *tpp, *tpp_targ;
1814 				pgcnt_t bit;
1815 				struct vnode *vp;
1816 				u_offset_t offset;
1817 				int mod, result;
1818 				spgcnt_t pgcnt;
1819 
1820 				bit = pfn - mdsp->mds_base;
1821 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1822 				    (1 << (bit % NBPBMW))) != 0) {
1823 					MDSTAT_INCR(mhp, already_done);
1824 					continue;
1825 				}
1826 				if (freemem_left == 0) {
1827 					freemem_left += delthr_get_freemem(mhp);
1828 					if (freemem_left == 0)
1829 						break;
1830 				}
1831 
1832 				/*
1833 				 * Release mh_mutex - some of this
1834 				 * stuff takes some time (eg PUTPAGE).
1835 				 */
1836 
1837 				mutex_exit(&mhp->mh_mutex);
1838 				MDSTAT_INCR(mhp, ncheck);
1839 
1840 				pp = page_numtopp_nolock(pfn);
1841 				if (pp == NULL) {
1842 					/*
1843 					 * Not covered by a page_t - will
1844 					 * be dealt with elsewhere.
1845 					 */
1846 					MDSTAT_INCR(mhp, nopaget);
1847 					mutex_enter(&mhp->mh_mutex);
1848 					mdsp->mds_bitmap[bit / NBPBMW] |=
1849 					    (1 << (bit % NBPBMW));
1850 					continue;
1851 				}
1852 
1853 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1854 				    SE_EXCL_WANTED | SE_RETIRED)) {
1855 					/*
1856 					 * Page in use elsewhere.  Skip it.
1857 					 */
1858 					MDSTAT_INCR(mhp, lockfail);
1859 					mutex_enter(&mhp->mh_mutex);
1860 					continue;
1861 				}
1862 				/*
1863 				 * See if the cage expanded into the delete.
1864 				 * This can happen as we have to allow the
1865 				 * cage to expand.
1866 				 */
1867 				if (PP_ISNORELOC(pp)) {
1868 					page_unlock(pp);
1869 					mutex_enter(&mhp->mh_mutex);
1870 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1871 					break;
1872 				}
1873 				if (PP_RETIRED(pp)) {
1874 					/*
1875 					 * Page has been retired and is
1876 					 * not part of the cage so we
1877 					 * can now do the accounting for
1878 					 * it.
1879 					 */
1880 					MDSTAT_INCR(mhp, retired);
1881 					mutex_enter(&mhp->mh_mutex);
1882 					mdsp->mds_bitmap[bit / NBPBMW]
1883 					    |= (1 << (bit % NBPBMW));
1884 					mdsp->mds_bitmap_retired[bit /
1885 					    NBPBMW] |=
1886 					    (1 << (bit % NBPBMW));
1887 					mhp->mh_hold_todo--;
1888 					continue;
1889 				}
1890 				ASSERT(freemem_left != 0);
1891 				if (PP_ISFREE(pp)) {
1892 					/*
1893 					 * Like page_reclaim() only 'freemem'
1894 					 * processing is already done.
1895 					 */
1896 					MDSTAT_INCR(mhp, nfree);
1897 				free_page_collect:
1898 					if (PP_ISAGED(pp)) {
1899 						page_list_sub(pp,
1900 						    PG_FREE_LIST);
1901 					} else {
1902 						page_list_sub(pp,
1903 						    PG_CACHE_LIST);
1904 					}
1905 					PP_CLRFREE(pp);
1906 					PP_CLRAGED(pp);
1907 					collected++;
1908 					mutex_enter(&mhp->mh_mutex);
1909 					page_delete_collect(pp, mhp);
1910 					mdsp->mds_bitmap[bit / NBPBMW] |=
1911 					    (1 << (bit % NBPBMW));
1912 					freemem_left--;
1913 					continue;
1914 				}
1915 				ASSERT(pp->p_vnode != NULL);
1916 				if (first_scan) {
1917 					MDSTAT_INCR(mhp, first_notfree);
1918 					page_unlock(pp);
1919 					mutex_enter(&mhp->mh_mutex);
1920 					continue;
1921 				}
1922 				/*
1923 				 * Keep stats on pages encountered that
1924 				 * are marked for retirement.
1925 				 */
1926 				if (PP_TOXIC(pp)) {
1927 					MDSTAT_INCR(mhp, toxic);
1928 				} else if (PP_PR_REQ(pp)) {
1929 					MDSTAT_INCR(mhp, failing);
1930 				}
1931 				/*
1932 				 * In certain cases below, special exceptions
1933 				 * are made for pages that are toxic.  This
1934 				 * is because the current meaning of toxic
1935 				 * is that an uncorrectable error has been
1936 				 * previously associated with the page.
1937 				 */
1938 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1939 					if (!PP_TOXIC(pp)) {
1940 						/*
1941 						 * Must relocate locked in
1942 						 * memory pages.
1943 						 */
1944 #ifdef MEM_DEL_STATS
1945 						start_pgrp = ddi_get_lbolt();
1946 #endif /* MEM_DEL_STATS */
1947 						/*
1948 						 * Lock all constituent pages
1949 						 * of a large page to ensure
1950 						 * that p_szc won't change.
1951 						 */
1952 						if (!group_page_trylock(pp,
1953 						    SE_EXCL)) {
1954 							MDSTAT_INCR(mhp,
1955 							    gptllckfail);
1956 							page_unlock(pp);
1957 							mutex_enter(
1958 							    &mhp->mh_mutex);
1959 							continue;
1960 						}
1961 						MDSTAT_INCR(mhp, npplocked);
1962 						pp_targ =
1963 						    page_get_replacement_page(
1964 						    pp, NULL, 0);
1965 						if (pp_targ != NULL) {
1966 #ifdef MEM_DEL_STATS
1967 							ntick_pgrp =
1968 							    (uint64_t)
1969 							    ddi_get_lbolt() -
1970 							    start_pgrp;
1971 #endif /* MEM_DEL_STATS */
1972 							MDSTAT_PGRP(mhp,
1973 							    ntick_pgrp);
1974 							MDSTAT_INCR(mhp,
1975 							    nlockreloc);
1976 							goto reloc;
1977 						}
1978 						group_page_unlock(pp);
1979 						page_unlock(pp);
1980 #ifdef MEM_DEL_STATS
1981 						ntick_pgrp =
1982 						    (uint64_t)ddi_get_lbolt() -
1983 						    start_pgrp;
1984 #endif /* MEM_DEL_STATS */
1985 						MDSTAT_PGRP(mhp, ntick_pgrp);
1986 						MDSTAT_INCR(mhp, nnorepl);
1987 						mutex_enter(&mhp->mh_mutex);
1988 						continue;
1989 					} else {
1990 						/*
1991 						 * Cannot do anything about
1992 						 * this page because it is
1993 						 * toxic.
1994 						 */
1995 						MDSTAT_INCR(mhp, npplkdtoxic);
1996 						page_unlock(pp);
1997 						mutex_enter(&mhp->mh_mutex);
1998 						continue;
1999 					}
2000 				}
2001 				/*
2002 				 * Unload the mappings and check if mod bit
2003 				 * is set.
2004 				 */
2005 				ASSERT(!PP_ISKAS(pp));
2006 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2007 				mod = hat_ismod(pp);
2008 
2009 #ifdef MEM_DEL_STATS
2010 				start_pgrp = ddi_get_lbolt();
2011 #endif /* MEM_DEL_STATS */
2012 				if (mod && !PP_TOXIC(pp)) {
2013 					/*
2014 					 * Lock all constituent pages
2015 					 * of a large page to ensure
2016 					 * that p_szc won't change.
2017 					 */
2018 					if (!group_page_trylock(pp, SE_EXCL)) {
2019 						MDSTAT_INCR(mhp, gptlmodfail);
2020 						page_unlock(pp);
2021 						mutex_enter(&mhp->mh_mutex);
2022 						continue;
2023 					}
2024 					pp_targ = page_get_replacement_page(pp,
2025 					    NULL, 0);
2026 					if (pp_targ != NULL) {
2027 						MDSTAT_INCR(mhp, nmodreloc);
2028 #ifdef MEM_DEL_STATS
2029 						ntick_pgrp =
2030 						    (uint64_t)ddi_get_lbolt() -
2031 						    start_pgrp;
2032 #endif /* MEM_DEL_STATS */
2033 						MDSTAT_PGRP(mhp, ntick_pgrp);
2034 						goto reloc;
2035 					}
2036 					group_page_unlock(pp);
2037 				}
2038 
2039 				if (!page_try_demote_pages(pp)) {
2040 					MDSTAT_INCR(mhp, demotefail);
2041 					page_unlock(pp);
2042 #ifdef MEM_DEL_STATS
2043 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2044 					    start_pgrp;
2045 #endif /* MEM_DEL_STATS */
2046 					MDSTAT_PGRP(mhp, ntick_pgrp);
2047 					mutex_enter(&mhp->mh_mutex);
2048 					continue;
2049 				}
2050 
2051 				/*
2052 				 * Regular 'page-out'.
2053 				 */
2054 				if (!mod) {
2055 					MDSTAT_INCR(mhp, ndestroy);
2056 					page_destroy(pp, 1);
2057 					/*
2058 					 * page_destroy was called with
2059 					 * dontfree. As long as p_lckcnt
2060 					 * and p_cowcnt are both zero, the
2061 					 * only additional action of
2062 					 * page_destroy with !dontfree is to
2063 					 * call page_free, so we can collect
2064 					 * the page here.
2065 					 */
2066 					collected++;
2067 #ifdef MEM_DEL_STATS
2068 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2069 					    start_pgrp;
2070 #endif /* MEM_DEL_STATS */
2071 					MDSTAT_PGRP(mhp, ntick_pgrp);
2072 					mutex_enter(&mhp->mh_mutex);
2073 					page_delete_collect(pp, mhp);
2074 					mdsp->mds_bitmap[bit / NBPBMW] |=
2075 					    (1 << (bit % NBPBMW));
2076 					continue;
2077 				}
2078 				/*
2079 				 * The page is toxic and the mod bit is
2080 				 * set, we cannot do anything here to deal
2081 				 * with it.
2082 				 */
2083 				if (PP_TOXIC(pp)) {
2084 					page_unlock(pp);
2085 #ifdef MEM_DEL_STATS
2086 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2087 					    start_pgrp;
2088 #endif /* MEM_DEL_STATS */
2089 					MDSTAT_PGRP(mhp, ntick_pgrp);
2090 					MDSTAT_INCR(mhp, modtoxic);
2091 					mutex_enter(&mhp->mh_mutex);
2092 					continue;
2093 				}
2094 				MDSTAT_INCR(mhp, nputpage);
2095 				vp = pp->p_vnode;
2096 				offset = pp->p_offset;
2097 				VN_HOLD(vp);
2098 				page_unlock(pp);
2099 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2100 				    B_INVAL|B_FORCE, kcred, NULL);
2101 				VN_RELE(vp);
2102 #ifdef MEM_DEL_STATS
2103 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2104 				    start_pgrp;
2105 #endif /* MEM_DEL_STATS */
2106 				MDSTAT_PGRP(mhp, ntick_pgrp);
2107 				/*
2108 				 * Try to get the page back immediately
2109 				 * so that it can be collected.
2110 				 */
2111 				pp = page_numtopp_nolock(pfn);
2112 				if (pp == NULL) {
2113 					MDSTAT_INCR(mhp, nnoreclaim);
2114 					/*
2115 					 * This should not happen as this
2116 					 * thread is deleting the page.
2117 					 * If this code is generalized, this
2118 					 * becomes a reality.
2119 					 */
2120 #ifdef DEBUG
2121 					cmn_err(CE_WARN,
2122 					    "delete_memory_thread(0x%p) "
2123 					    "pfn 0x%lx has no page_t",
2124 					    (void *)mhp, pfn);
2125 #endif /* DEBUG */
2126 					mutex_enter(&mhp->mh_mutex);
2127 					continue;
2128 				}
2129 				if (page_try_reclaim_lock(pp, SE_EXCL,
2130 				    SE_EXCL_WANTED | SE_RETIRED)) {
2131 					if (PP_ISFREE(pp)) {
2132 						goto free_page_collect;
2133 					}
2134 					page_unlock(pp);
2135 				}
2136 				MDSTAT_INCR(mhp, nnoreclaim);
2137 				mutex_enter(&mhp->mh_mutex);
2138 				continue;
2139 
2140 			reloc:
2141 				/*
2142 				 * Got some freemem and a target
2143 				 * page, so move the data to avoid
2144 				 * I/O and lock problems.
2145 				 */
2146 				ASSERT(!page_iolock_assert(pp));
2147 				MDSTAT_INCR(mhp, nreloc);
2148 				/*
2149 				 * page_relocate() will return pgcnt: the
2150 				 * number of consecutive pages relocated.
2151 				 * If it is successful, pp will be a
2152 				 * linked list of the page structs that
2153 				 * were relocated. If page_relocate() is
2154 				 * unsuccessful, pp will be unmodified.
2155 				 */
2156 #ifdef MEM_DEL_STATS
2157 				start_pgrp = ddi_get_lbolt();
2158 #endif /* MEM_DEL_STATS */
2159 				result = page_relocate(&pp, &pp_targ, 0, 0,
2160 				    &pgcnt, NULL);
2161 #ifdef MEM_DEL_STATS
2162 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2163 				    start_pgrp;
2164 #endif /* MEM_DEL_STATS */
2165 				MDSTAT_PGRP(mhp, ntick_pgrp);
2166 				if (result != 0) {
2167 					MDSTAT_INCR(mhp, nrelocfail);
2168 					/*
2169 					 * We did not succeed. We need
2170 					 * to give the pp_targ pages back.
2171 					 * page_free(pp_targ, 1) without
2172 					 * the freemem accounting.
2173 					 */
2174 					group_page_unlock(pp);
2175 					page_free_replacement_page(pp_targ);
2176 					page_unlock(pp);
2177 					mutex_enter(&mhp->mh_mutex);
2178 					continue;
2179 				}
2180 
2181 				/*
2182 				 * We will then collect pgcnt pages.
2183 				 */
2184 				ASSERT(pgcnt > 0);
2185 				mutex_enter(&mhp->mh_mutex);
2186 				/*
2187 				 * We need to make sure freemem_left is
2188 				 * large enough.
2189 				 */
2190 				while ((freemem_left < pgcnt) &&
2191 				    (!mhp->mh_cancel)) {
2192 					freemem_left +=
2193 					    delthr_get_freemem(mhp);
2194 				}
2195 
2196 				/*
2197 				 * Do not proceed if mh_cancel is set.
2198 				 */
2199 				if (mhp->mh_cancel) {
2200 					while (pp_targ != NULL) {
2201 						/*
2202 						 * Unlink and unlock each page.
2203 						 */
2204 						tpp_targ = pp_targ;
2205 						page_sub(&pp_targ, tpp_targ);
2206 						page_unlock(tpp_targ);
2207 					}
2208 					/*
2209 					 * We need to give the pp pages back.
2210 					 * page_free(pp, 1) without the
2211 					 * freemem accounting.
2212 					 */
2213 					page_free_replacement_page(pp);
2214 					break;
2215 				}
2216 
2217 				/* Now remove pgcnt from freemem_left */
2218 				freemem_left -= pgcnt;
2219 				ASSERT(freemem_left >= 0);
2220 				szc = pp->p_szc;
2221 				while (pp != NULL) {
2222 					/*
2223 					 * pp and pp_targ were passed back as
2224 					 * a linked list of pages.
2225 					 * Unlink and unlock each page.
2226 					 */
2227 					tpp_targ = pp_targ;
2228 					page_sub(&pp_targ, tpp_targ);
2229 					page_unlock(tpp_targ);
2230 					/*
2231 					 * The original page is now free
2232 					 * so remove it from the linked
2233 					 * list and collect it.
2234 					 */
2235 					tpp = pp;
2236 					page_sub(&pp, tpp);
2237 					pfn = page_pptonum(tpp);
2238 					collected++;
2239 					ASSERT(PAGE_EXCL(tpp));
2240 					ASSERT(tpp->p_vnode == NULL);
2241 					ASSERT(!hat_page_is_mapped(tpp));
2242 					ASSERT(tpp->p_szc == szc);
2243 					tpp->p_szc = 0;
2244 					page_delete_collect(tpp, mhp);
2245 					bit = pfn - mdsp->mds_base;
2246 					mdsp->mds_bitmap[bit / NBPBMW] |=
2247 					    (1 << (bit % NBPBMW));
2248 				}
2249 				ASSERT(pp_targ == NULL);
2250 			}
2251 		}
2252 		first_scan = 0;
2253 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2254 		    (collected == 0)) {
2255 			/*
2256 			 * This code is needed as we cannot wait
2257 			 * for a page to be locked OR the delete to
2258 			 * be cancelled.  Also, we must delay so
2259 			 * that other threads get a chance to run
2260 			 * on our cpu, otherwise page locks may be
2261 			 * held indefinitely by those threads.
2262 			 */
2263 			MDSTAT_INCR(mhp, ndelay);
2264 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2265 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2266 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2267 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2268 		}
2269 	}
2270 	/* stop the dr aio cleanup thread */
2271 	mhp->mh_dr_aio_cleanup_cancel = 1;
2272 	transit_list_collect(mhp, 0);
2273 	if (freemem_left != 0) {
2274 		/* Return any surplus. */
2275 		page_create_putback(freemem_left);
2276 		freemem_left = 0;
2277 	}
2278 #ifdef MEM_DEL_STATS
2279 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2280 #endif /* MEM_DEL_STATS */
2281 	MDSTAT_TOTAL(mhp, ntick_total);
2282 	MDSTAT_PRINT(mhp);
2283 
2284 	/*
2285 	 * If the memory delete was cancelled, exclusive-wanted bits must
2286 	 * be cleared. If there are retired pages being deleted, they need
2287 	 * to be unretired.
2288 	 */
2289 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2290 	    mdsp = mdsp->mds_next) {
2291 		pfn_t pfn, p_end;
2292 
2293 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2294 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2295 			page_t *pp;
2296 			pgcnt_t bit;
2297 
2298 			bit = pfn - mdsp->mds_base;
2299 			if (mhp->mh_cancel) {
2300 				pp = page_numtopp_nolock(pfn);
2301 				if (pp != NULL) {
2302 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2303 					    (1 << (bit % NBPBMW))) == 0) {
2304 						page_lock_clr_exclwanted(pp);
2305 					}
2306 				}
2307 			} else {
2308 				pp = NULL;
2309 			}
2310 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2311 			    (1 << (bit % NBPBMW))) != 0) {
2312 				/* do we already have pp? */
2313 				if (pp == NULL) {
2314 					pp = page_numtopp_nolock(pfn);
2315 				}
2316 				ASSERT(pp != NULL);
2317 				ASSERT(PP_RETIRED(pp));
2318 				if (mhp->mh_cancel != 0) {
2319 					page_unlock(pp);
2320 					/*
2321 					 * To satisfy ASSERT below in
2322 					 * cancel code.
2323 					 */
2324 					mhp->mh_hold_todo++;
2325 				} else {
2326 					(void) page_unretire_pp(pp,
2327 					    PR_UNR_CLEAN);
2328 				}
2329 			}
2330 		}
2331 	}
2332 	/*
2333 	 * Free retired page bitmap and collected page bitmap
2334 	 */
2335 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2336 	    mdsp = mdsp->mds_next) {
2337 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2338 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2339 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2340 		ASSERT(mdsp->mds_bitmap != NULL);
2341 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2342 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2343 	}
2344 
2345 	/* wait for our dr aio cancel thread to exit */
2346 	while (!(mhp->mh_aio_cleanup_done)) {
2347 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2348 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2349 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2350 	}
2351 refused:
2352 	if (mhp->mh_cancel != 0) {
2353 		page_t *pp;
2354 
2355 		comp_code = mhp->mh_cancel;
2356 		/*
2357 		 * Go through list of deleted pages (mh_deleted) freeing
2358 		 * them.
2359 		 */
2360 		while ((pp = mhp->mh_deleted) != NULL) {
2361 			mhp->mh_deleted = pp->p_next;
2362 			mhp->mh_hold_todo++;
2363 			mutex_exit(&mhp->mh_mutex);
2364 			/* Restore p_next. */
2365 			pp->p_next = pp->p_prev;
2366 			if (PP_ISFREE(pp)) {
2367 				cmn_err(CE_PANIC,
2368 				    "page %p is free",
2369 				    (void *)pp);
2370 			}
2371 			page_free(pp, 1);
2372 			mutex_enter(&mhp->mh_mutex);
2373 		}
2374 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2375 
2376 		mutex_exit(&mhp->mh_mutex);
2377 		put_availrmem(mhp->mh_vm_pages);
2378 		mutex_enter(&mhp->mh_mutex);
2379 
2380 		goto t_exit;
2381 	}
2382 
2383 	/*
2384 	 * All the pages are no longer in use and are exclusively locked.
2385 	 */
2386 
2387 	mhp->mh_deleted = NULL;
2388 
2389 	kphysm_del_cleanup(mhp);
2390 
2391 	/*
2392 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
2393 	 * that the mem_node_config[] will remain intact for the cleanup.
2394 	 */
2395 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2396 	    mdsp = mdsp->mds_next) {
2397 		mem_node_del_range(mdsp->mds_base,
2398 		    mdsp->mds_base + mdsp->mds_npgs - 1);
2399 	}
2400 	/* cleanup the page counters */
2401 	page_ctrs_cleanup();
2402 
2403 	comp_code = KPHYSM_OK;
2404 
2405 t_exit:
2406 	mutex_exit(&mhp->mh_mutex);
2407 	kphysm_setup_post_del(mhp->mh_vm_pages,
2408 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2409 	mutex_enter(&mhp->mh_mutex);
2410 
2411 early_exit:
2412 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2413 	mhp->mh_state = MHND_DONE;
2414 	del_complete_funcp = mhp->mh_delete_complete;
2415 	del_complete_arg = mhp->mh_delete_complete_arg;
2416 	CALLB_CPR_EXIT(&cprinfo);
2417 	(*del_complete_funcp)(del_complete_arg, comp_code);
2418 	thread_exit();
2419 	/*NOTREACHED*/
2420 }
2421 
2422 /*
2423  * Start the delete of the memory from the system.
2424  */
2425 int
2426 kphysm_del_start(
2427 	memhandle_t handle,
2428 	void (*complete)(void *, int),
2429 	void *complete_arg)
2430 {
2431 	struct mem_handle *mhp;
2432 
2433 	mhp = kphysm_lookup_mem_handle(handle);
2434 	if (mhp == NULL) {
2435 		return (KPHYSM_EHANDLE);
2436 	}
2437 	switch (mhp->mh_state) {
2438 	case MHND_FREE:
2439 		ASSERT(mhp->mh_state != MHND_FREE);
2440 		mutex_exit(&mhp->mh_mutex);
2441 		return (KPHYSM_EHANDLE);
2442 	case MHND_INIT:
2443 		break;
2444 	case MHND_STARTING:
2445 	case MHND_RUNNING:
2446 		mutex_exit(&mhp->mh_mutex);
2447 		return (KPHYSM_ESEQUENCE);
2448 	case MHND_DONE:
2449 		mutex_exit(&mhp->mh_mutex);
2450 		return (KPHYSM_ESEQUENCE);
2451 	case MHND_RELEASE:
2452 		mutex_exit(&mhp->mh_mutex);
2453 		return (KPHYSM_ESEQUENCE);
2454 	default:
2455 #ifdef DEBUG
2456 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2457 		    (void *)mhp, mhp->mh_state);
2458 #endif /* DEBUG */
2459 		mutex_exit(&mhp->mh_mutex);
2460 		return (KPHYSM_EHANDLE);
2461 	}
2462 
2463 	if (mhp->mh_transit.trl_spans == NULL) {
2464 		mutex_exit(&mhp->mh_mutex);
2465 		return (KPHYSM_ENOWORK);
2466 	}
2467 
2468 	ASSERT(complete != NULL);
2469 	mhp->mh_delete_complete = complete;
2470 	mhp->mh_delete_complete_arg = complete_arg;
2471 	mhp->mh_state = MHND_STARTING;
2472 	/*
2473 	 * Release the mutex in case thread_create sleeps.
2474 	 */
2475 	mutex_exit(&mhp->mh_mutex);
2476 
2477 	/*
2478 	 * The "obvious" process for this thread is pageout (proc_pageout)
2479 	 * but this gives the thread too much power over freemem
2480 	 * which results in freemem starvation.
2481 	 */
2482 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2483 	    TS_RUN, maxclsyspri - 1);
2484 
2485 	return (KPHYSM_OK);
2486 }
2487 
2488 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2489 static caddr_t pp_dummy;
2490 static pgcnt_t pp_dummy_npages;
2491 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2492 
2493 static void
2494 memseg_remap_init_pages(page_t *pages, page_t *epages)
2495 {
2496 	page_t *pp;
2497 
2498 	for (pp = pages; pp < epages; pp++) {
2499 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2500 		pp->p_offset = (u_offset_t)-1;
2501 		page_iolock_init(pp);
2502 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2503 			continue;
2504 		page_lock_delete(pp);
2505 	}
2506 }
2507 
2508 void
2509 memseg_remap_init()
2510 {
2511 	mutex_enter(&pp_dummy_lock);
2512 	if (pp_dummy == NULL) {
2513 		uint_t dpages;
2514 		int i;
2515 
2516 		/*
2517 		 * dpages starts off as the size of the structure and
2518 		 * ends up as the minimum number of pages that will
2519 		 * hold a whole number of page_t structures.
2520 		 */
2521 		dpages = sizeof (page_t);
2522 		ASSERT(dpages != 0);
2523 		ASSERT(dpages <= MMU_PAGESIZE);
2524 
2525 		while ((dpages & 1) == 0)
2526 			dpages >>= 1;
2527 
2528 		pp_dummy_npages = dpages;
2529 		/*
2530 		 * Allocate pp_dummy pages directly from static_arena,
2531 		 * since these are whole page allocations and are
2532 		 * referenced by physical address.  This also has the
2533 		 * nice fringe benefit of hiding the memory from
2534 		 * ::findleaks since it doesn't deal well with allocated
2535 		 * kernel heap memory that doesn't have any mappings.
2536 		 */
2537 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2538 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2539 		bzero(pp_dummy, ptob(pp_dummy_npages));
2540 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2541 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2542 		    pp_dummy_npages, KM_SLEEP);
2543 		for (i = 0; i < pp_dummy_npages; i++) {
2544 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2545 			    &pp_dummy[MMU_PAGESIZE * i]);
2546 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2547 		}
2548 		/*
2549 		 * Initialize the page_t's to a known 'deleted' state
2550 		 * that matches the state of deleted pages.
2551 		 */
2552 		memseg_remap_init_pages((page_t *)pp_dummy,
2553 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2554 		/* Remove kmem mappings for the pages for safety. */
2555 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2556 		    HAT_UNLOAD_UNLOCK);
2557 		/* Leave pp_dummy pointer set as flag that init is done. */
2558 	}
2559 	mutex_exit(&pp_dummy_lock);
2560 }
2561 
2562 /*
2563  * Remap a page-aglined range of page_t's to dummy pages.
2564  */
2565 void
2566 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2567 {
2568 	int phase;
2569 
2570 	ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2571 
2572 	/*
2573 	 * We may start remapping at a non-zero page offset
2574 	 * within the dummy pages since the low/high ends
2575 	 * of the outgoing pp's could be shared by other
2576 	 * memsegs (see memseg_remap_meta).
2577 	 */
2578 	phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2579 	/*CONSTCOND*/
2580 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2581 
2582 	while (metapgs != 0) {
2583 		pgcnt_t n;
2584 		int i, j;
2585 
2586 		n = pp_dummy_npages;
2587 		if (n > metapgs)
2588 			n = metapgs;
2589 		for (i = 0; i < n; i++) {
2590 			j = (i + phase) % pp_dummy_npages;
2591 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2592 			    PROT_READ,
2593 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2594 			    HAT_LOAD_REMAP);
2595 			va += ptob(1);
2596 		}
2597 		metapgs -= n;
2598 	}
2599 }
2600 
2601 static void
2602 memseg_remap_to_dummy(struct memseg *seg)
2603 {
2604 	caddr_t pp;
2605 	pgcnt_t metapgs;
2606 
2607 	ASSERT(memseg_is_dynamic(seg));
2608 	ASSERT(pp_dummy != NULL);
2609 
2610 
2611 	if (!memseg_includes_meta(seg)) {
2612 		memseg_remap_meta(seg);
2613 		return;
2614 	}
2615 
2616 	pp = (caddr_t)seg->pages;
2617 	metapgs = seg->pages_base - memseg_get_start(seg);
2618 	ASSERT(metapgs != 0);
2619 
2620 	seg->pages_end = seg->pages_base;
2621 
2622 	remap_to_dummy(pp, metapgs);
2623 }
2624 
2625 /*
2626  * Transition all the deleted pages to the deleted state so that
2627  * page_lock will not wait. The page_lock_delete call will
2628  * also wake up any waiters.
2629  */
2630 static void
2631 memseg_lock_delete_all(struct memseg *seg)
2632 {
2633 	page_t *pp;
2634 
2635 	for (pp = seg->pages; pp < seg->epages; pp++) {
2636 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2637 		page_lock_delete(pp);
2638 	}
2639 }
2640 
2641 static void
2642 kphysm_del_cleanup(struct mem_handle *mhp)
2643 {
2644 	struct memdelspan	*mdsp;
2645 	struct memseg		*seg;
2646 	struct memseg		**segpp;
2647 	struct memseg		*seglist;
2648 	pfn_t			p_end;
2649 	uint64_t		avmem;
2650 	pgcnt_t			avpgs;
2651 	pgcnt_t			npgs;
2652 
2653 	avpgs = mhp->mh_vm_pages;
2654 
2655 	memsegs_lock(1);
2656 
2657 	/*
2658 	 * remove from main segment list.
2659 	 */
2660 	npgs = 0;
2661 	seglist = NULL;
2662 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2663 	    mdsp = mdsp->mds_next) {
2664 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2665 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2666 			if (seg->pages_base >= p_end ||
2667 			    seg->pages_end <= mdsp->mds_base) {
2668 				/* Span and memseg don't overlap. */
2669 				segpp = &((*segpp)->next);
2670 				continue;
2671 			}
2672 			ASSERT(seg->pages_base >= mdsp->mds_base);
2673 			ASSERT(seg->pages_end <= p_end);
2674 
2675 			PLCNT_MODIFY_MAX(seg->pages_base,
2676 			    seg->pages_base - seg->pages_end);
2677 
2678 			/* Hide the memseg from future scans. */
2679 			hat_kpm_delmem_mseg_update(seg, segpp);
2680 			*segpp = seg->next;
2681 			membar_producer();	/* TODO: Needed? */
2682 			npgs += MSEG_NPAGES(seg);
2683 
2684 			/*
2685 			 * Leave the deleted segment's next pointer intact
2686 			 * in case a memsegs scanning loop is walking this
2687 			 * segment concurrently.
2688 			 */
2689 			seg->lnext = seglist;
2690 			seglist = seg;
2691 		}
2692 	}
2693 
2694 	build_pfn_hash();
2695 
2696 	ASSERT(npgs < total_pages);
2697 	total_pages -= npgs;
2698 
2699 	/*
2700 	 * Recalculate the paging parameters now total_pages has changed.
2701 	 * This will also cause the clock hands to be reset before next use.
2702 	 */
2703 	setupclock();
2704 
2705 	memsegs_unlock(1);
2706 
2707 	mutex_exit(&mhp->mh_mutex);
2708 
2709 	while ((seg = seglist) != NULL) {
2710 		pfn_t mseg_start;
2711 		pfn_t mseg_base, mseg_end;
2712 		pgcnt_t mseg_npgs;
2713 		int mlret;
2714 
2715 		seglist = seg->lnext;
2716 
2717 		/*
2718 		 * Put the page_t's into the deleted state to stop
2719 		 * cv_wait()s on the pages. When we remap, the dummy
2720 		 * page_t's will be in the same state.
2721 		 */
2722 		memseg_lock_delete_all(seg);
2723 		/*
2724 		 * Collect up information based on pages_base and pages_end
2725 		 * early so that we can flag early that the memseg has been
2726 		 * deleted by setting pages_end == pages_base.
2727 		 */
2728 		mseg_base = seg->pages_base;
2729 		mseg_end = seg->pages_end;
2730 		mseg_npgs = MSEG_NPAGES(seg);
2731 		mseg_start = memseg_get_start(seg);
2732 
2733 		if (memseg_is_dynamic(seg)) {
2734 			/* Remap the meta data to our special dummy area. */
2735 			memseg_remap_to_dummy(seg);
2736 
2737 			mutex_enter(&memseg_lists_lock);
2738 			seg->lnext = memseg_va_avail;
2739 			memseg_va_avail = seg;
2740 			mutex_exit(&memseg_lists_lock);
2741 		} else {
2742 			/*
2743 			 * For memory whose page_ts were allocated
2744 			 * at boot, we need to find a new use for
2745 			 * the page_t memory.
2746 			 * For the moment, just leak it.
2747 			 * (It is held in the memseg_delete_junk list.)
2748 			 */
2749 			seg->pages_end = seg->pages_base;
2750 
2751 			mutex_enter(&memseg_lists_lock);
2752 			seg->lnext = memseg_delete_junk;
2753 			memseg_delete_junk = seg;
2754 			mutex_exit(&memseg_lists_lock);
2755 		}
2756 
2757 		/* Must not use seg now as it could be re-used. */
2758 
2759 		memlist_write_lock();
2760 
2761 		mlret = memlist_delete_span(
2762 		    (uint64_t)(mseg_base) << PAGESHIFT,
2763 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2764 		    &phys_avail);
2765 		ASSERT(mlret == MEML_SPANOP_OK);
2766 
2767 		mlret = memlist_delete_span(
2768 		    (uint64_t)(mseg_start) << PAGESHIFT,
2769 		    (uint64_t)(mseg_end - mseg_start) <<
2770 		    PAGESHIFT,
2771 		    &phys_install);
2772 		ASSERT(mlret == MEML_SPANOP_OK);
2773 		phys_install_has_changed();
2774 
2775 		memlist_write_unlock();
2776 	}
2777 
2778 	memlist_read_lock();
2779 	installed_top_size(phys_install, &physmax, &physinstalled);
2780 	memlist_read_unlock();
2781 
2782 	mutex_enter(&freemem_lock);
2783 	maxmem -= avpgs;
2784 	physmem -= avpgs;
2785 	/* availrmem is adjusted during the delete. */
2786 	availrmem_initial -= avpgs;
2787 
2788 	mutex_exit(&freemem_lock);
2789 
2790 	dump_resize();
2791 
2792 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2793 	    "(0x%" PRIx64 ")\n",
2794 	    physinstalled << (PAGESHIFT - 10),
2795 	    (uint64_t)physinstalled << PAGESHIFT);
2796 
2797 	avmem = (uint64_t)freemem << PAGESHIFT;
2798 	cmn_err(CE_CONT, "?kphysm_delete: "
2799 	    "avail mem = %" PRId64 "\n", avmem);
2800 
2801 	/*
2802 	 * Update lgroup generation number on single lgroup systems
2803 	 */
2804 	if (nlgrps == 1)
2805 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2806 
2807 	/* Successfully deleted system memory */
2808 	mutex_enter(&mhp->mh_mutex);
2809 }
2810 
2811 static uint_t mdel_nullvp_waiter;
2812 
2813 static void
2814 page_delete_collect(
2815 	page_t *pp,
2816 	struct mem_handle *mhp)
2817 {
2818 	if (pp->p_vnode) {
2819 		page_hashout(pp, (kmutex_t *)NULL);
2820 		/* do not do PP_SETAGED(pp); */
2821 	} else {
2822 		kmutex_t *sep;
2823 
2824 		sep = page_se_mutex(pp);
2825 		mutex_enter(sep);
2826 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2827 			mdel_nullvp_waiter++;
2828 			cv_broadcast(&pp->p_cv);
2829 		}
2830 		mutex_exit(sep);
2831 	}
2832 	ASSERT(pp->p_next == pp->p_prev);
2833 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2834 	pp->p_next = mhp->mh_deleted;
2835 	mhp->mh_deleted = pp;
2836 	ASSERT(mhp->mh_hold_todo != 0);
2837 	mhp->mh_hold_todo--;
2838 }
2839 
2840 static void
2841 transit_list_collect(struct mem_handle *mhp, int v)
2842 {
2843 	struct transit_list_head *trh;
2844 
2845 	trh = &transit_list_head;
2846 	mutex_enter(&trh->trh_lock);
2847 	mhp->mh_transit.trl_collect = v;
2848 	mutex_exit(&trh->trh_lock);
2849 }
2850 
2851 static void
2852 transit_list_insert(struct transit_list *tlp)
2853 {
2854 	struct transit_list_head *trh;
2855 
2856 	trh = &transit_list_head;
2857 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2858 	tlp->trl_next = trh->trh_head;
2859 	trh->trh_head = tlp;
2860 }
2861 
2862 static void
2863 transit_list_remove(struct transit_list *tlp)
2864 {
2865 	struct transit_list_head *trh;
2866 	struct transit_list **tlpp;
2867 
2868 	trh = &transit_list_head;
2869 	tlpp = &trh->trh_head;
2870 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2871 	while (*tlpp != NULL && *tlpp != tlp)
2872 		tlpp = &(*tlpp)->trl_next;
2873 	ASSERT(*tlpp != NULL);
2874 	if (*tlpp == tlp)
2875 		*tlpp = tlp->trl_next;
2876 	tlp->trl_next = NULL;
2877 }
2878 
2879 static struct transit_list *
2880 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2881 {
2882 	struct transit_list *tlp;
2883 
2884 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2885 		struct memdelspan *mdsp;
2886 
2887 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2888 		    mdsp = mdsp->mds_next) {
2889 			if (pfnum >= mdsp->mds_base &&
2890 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2891 				return (tlp);
2892 			}
2893 		}
2894 	}
2895 	return (NULL);
2896 }
2897 
2898 int
2899 pfn_is_being_deleted(pfn_t pfnum)
2900 {
2901 	struct transit_list_head *trh;
2902 	struct transit_list *tlp;
2903 	int ret;
2904 
2905 	trh = &transit_list_head;
2906 	if (trh->trh_head == NULL)
2907 		return (0);
2908 
2909 	mutex_enter(&trh->trh_lock);
2910 	tlp = pfnum_to_transit_list(trh, pfnum);
2911 	ret = (tlp != NULL && tlp->trl_collect);
2912 	mutex_exit(&trh->trh_lock);
2913 
2914 	return (ret);
2915 }
2916 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the statistics gathered in mhp->mh_delstat for one memory
 * delete.  Nothing is printed unless mem_del_stat_print is non-zero.
 * The heading shows the base and page count of the first span only,
 * both truncated to uint_t.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	/* Event counters. */
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick totals, also shown as minutes and seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2970 
2971 struct mem_callback {
2972 	kphysm_setup_vector_t	*vec;
2973 	void			*arg;
2974 };
2975 
2976 #define	NMEMCALLBACKS		100
2977 
2978 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2979 static uint_t nmemcallbacks;
2980 static krwlock_t mem_callback_rwlock;
2981 
2982 int
2983 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2984 {
2985 	uint_t i, found;
2986 
2987 	/*
2988 	 * This test will become more complicated when the version must
2989 	 * change.
2990 	 */
2991 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2992 		return (EINVAL);
2993 
2994 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2995 	    vec->post_del == NULL)
2996 		return (EINVAL);
2997 
2998 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2999 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
3000 		if (mem_callbacks[i].vec == NULL && found == 0)
3001 			found = i + 1;
3002 		if (mem_callbacks[i].vec == vec &&
3003 		    mem_callbacks[i].arg == arg) {
3004 #ifdef DEBUG
3005 			/* Catch this in DEBUG kernels. */
3006 			cmn_err(CE_WARN, "kphysm_setup_func_register"
3007 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
3008 			    (void *)vec, arg, (void *)caller());
3009 #endif /* DEBUG */
3010 			rw_exit(&mem_callback_rwlock);
3011 			return (EEXIST);
3012 		}
3013 	}
3014 	if (found != 0) {
3015 		i = found - 1;
3016 	} else {
3017 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
3018 		if (nmemcallbacks == NMEMCALLBACKS) {
3019 			rw_exit(&mem_callback_rwlock);
3020 			return (ENOMEM);
3021 		}
3022 		i = nmemcallbacks++;
3023 	}
3024 	mem_callbacks[i].vec = vec;
3025 	mem_callbacks[i].arg = arg;
3026 	rw_exit(&mem_callback_rwlock);
3027 	return (0);
3028 }
3029 
3030 void
3031 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3032 {
3033 	uint_t i;
3034 
3035 	rw_enter(&mem_callback_rwlock, RW_WRITER);
3036 	for (i = 0; i < nmemcallbacks; i++) {
3037 		if (mem_callbacks[i].vec == vec &&
3038 		    mem_callbacks[i].arg == arg) {
3039 			mem_callbacks[i].vec = NULL;
3040 			mem_callbacks[i].arg = NULL;
3041 			if (i == (nmemcallbacks - 1))
3042 				nmemcallbacks--;
3043 			break;
3044 		}
3045 	}
3046 	rw_exit(&mem_callback_rwlock);
3047 }
3048 
3049 static void
3050 kphysm_setup_post_add(pgcnt_t delta_pages)
3051 {
3052 	uint_t i;
3053 
3054 	rw_enter(&mem_callback_rwlock, RW_READER);
3055 	for (i = 0; i < nmemcallbacks; i++) {
3056 		if (mem_callbacks[i].vec != NULL) {
3057 			(*mem_callbacks[i].vec->post_add)
3058 			    (mem_callbacks[i].arg, delta_pages);
3059 		}
3060 	}
3061 	rw_exit(&mem_callback_rwlock);
3062 }
3063 
3064 /*
3065  * Note the locking between pre_del and post_del: The reader lock is held
3066  * between the two calls to stop the set of functions from changing.
3067  */
3068 
3069 static int
3070 kphysm_setup_pre_del(pgcnt_t delta_pages)
3071 {
3072 	uint_t i;
3073 	int ret;
3074 	int aret;
3075 
3076 	ret = 0;
3077 	rw_enter(&mem_callback_rwlock, RW_READER);
3078 	for (i = 0; i < nmemcallbacks; i++) {
3079 		if (mem_callbacks[i].vec != NULL) {
3080 			aret = (*mem_callbacks[i].vec->pre_del)
3081 			    (mem_callbacks[i].arg, delta_pages);
3082 			ret |= aret;
3083 		}
3084 	}
3085 
3086 	return (ret);
3087 }
3088 
3089 static void
3090 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3091 {
3092 	uint_t i;
3093 
3094 	for (i = 0; i < nmemcallbacks; i++) {
3095 		if (mem_callbacks[i].vec != NULL) {
3096 			(*mem_callbacks[i].vec->post_del)
3097 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3098 		}
3099 	}
3100 	rw_exit(&mem_callback_rwlock);
3101 }
3102 
3103 static int
3104 kphysm_split_memseg(
3105 	pfn_t base,
3106 	pgcnt_t npgs)
3107 {
3108 	struct memseg *seg;
3109 	struct memseg **segpp;
3110 	pgcnt_t size_low, size_high;
3111 	struct memseg *seg_low, *seg_mid, *seg_high;
3112 
3113 	/*
3114 	 * Lock the memsegs list against other updates now
3115 	 */
3116 	memsegs_lock(1);
3117 
3118 	/*
3119 	 * Find boot time memseg that wholly covers this area.
3120 	 */
3121 
3122 	/* First find the memseg with page 'base' in it. */
3123 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3124 	    segpp = &((*segpp)->next)) {
3125 		if (base >= seg->pages_base && base < seg->pages_end)
3126 			break;
3127 	}
3128 	if (seg == NULL) {
3129 		memsegs_unlock(1);
3130 		return (0);
3131 	}
3132 	if (memseg_includes_meta(seg)) {
3133 		memsegs_unlock(1);
3134 		return (0);
3135 	}
3136 	if ((base + npgs) > seg->pages_end) {
3137 		memsegs_unlock(1);
3138 		return (0);
3139 	}
3140 
3141 	/*
3142 	 * Work out the size of the two segments that will
3143 	 * surround the new segment, one for low address
3144 	 * and one for high.
3145 	 */
3146 	ASSERT(base >= seg->pages_base);
3147 	size_low = base - seg->pages_base;
3148 	ASSERT(seg->pages_end >= (base + npgs));
3149 	size_high = seg->pages_end - (base + npgs);
3150 
3151 	/*
3152 	 * Sanity check.
3153 	 */
3154 	if ((size_low + size_high) == 0) {
3155 		memsegs_unlock(1);
3156 		return (0);
3157 	}
3158 
3159 	/*
3160 	 * Allocate the new structures. The old memseg will not be freed
3161 	 * as there may be a reference to it.
3162 	 */
3163 	seg_low = NULL;
3164 	seg_high = NULL;
3165 
3166 	if (size_low != 0)
3167 		seg_low = memseg_alloc();
3168 
3169 	seg_mid = memseg_alloc();
3170 
3171 	if (size_high != 0)
3172 		seg_high = memseg_alloc();
3173 
3174 	/*
3175 	 * All allocation done now.
3176 	 */
3177 	if (size_low != 0) {
3178 		seg_low->pages = seg->pages;
3179 		seg_low->epages = seg_low->pages + size_low;
3180 		seg_low->pages_base = seg->pages_base;
3181 		seg_low->pages_end = seg_low->pages_base + size_low;
3182 		seg_low->next = seg_mid;
3183 		seg_low->msegflags = seg->msegflags;
3184 	}
3185 	if (size_high != 0) {
3186 		seg_high->pages = seg->epages - size_high;
3187 		seg_high->epages = seg_high->pages + size_high;
3188 		seg_high->pages_base = seg->pages_end - size_high;
3189 		seg_high->pages_end = seg_high->pages_base + size_high;
3190 		seg_high->next = seg->next;
3191 		seg_high->msegflags = seg->msegflags;
3192 	}
3193 
3194 	seg_mid->pages = seg->pages + size_low;
3195 	seg_mid->pages_base = seg->pages_base + size_low;
3196 	seg_mid->epages = seg->epages - size_high;
3197 	seg_mid->pages_end = seg->pages_end - size_high;
3198 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3199 	seg_mid->msegflags = seg->msegflags;
3200 
3201 	/*
3202 	 * Update hat_kpm specific info of all involved memsegs and
3203 	 * allow hat_kpm specific global chain updates.
3204 	 */
3205 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3206 
3207 	/*
3208 	 * At this point we have two equivalent memseg sub-chains,
3209 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3210 	 * the same place in the global chain. By re-writing the pointer
3211 	 * in the previous element we switch atomically from using the old
3212 	 * (seg) to the new.
3213 	 */
3214 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3215 
3216 	membar_enter();
3217 
3218 	build_pfn_hash();
3219 	memsegs_unlock(1);
3220 
3221 	/*
3222 	 * We leave the old segment, 'seg', intact as there may be
3223 	 * references to it. Also, as the value of total_pages has not
3224 	 * changed and the memsegs list is effectively the same when
3225 	 * accessed via the old or the new pointer, we do not have to
3226 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3227 	 *
3228 	 * We currently do not re-use or reclaim the page_t memory.
3229 	 * If we do, then this may have to change.
3230 	 */
3231 
3232 	mutex_enter(&memseg_lists_lock);
3233 	seg->lnext = memseg_edit_junk;
3234 	memseg_edit_junk = seg;
3235 	mutex_exit(&memseg_lists_lock);
3236 
3237 	return (1);
3238 }
3239 
3240 /*
3241  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3242  * structure using physical addresses. Therefore a kmem_cache is
3243  * used with KMC_NOHASH to avoid page crossings within a memseg
3244  * structure. KMC_NOHASH requires that no external (outside of
3245  * slab) information is allowed. This, in turn, implies that the
3246  * cache's slabsize must be exactly a single page, since per-slab
3247  * information (e.g. the freelist for the slab) is kept at the
3248  * end of the slab, where it is easy to locate. Should be changed
3249  * when a more obvious kmem_cache interface/flag will become
3250  * available.
3251  */
3252 void
3253 mem_config_init()
3254 {
3255 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3256 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3257 }
3258 
3259 struct memseg *
3260 memseg_alloc()
3261 {
3262 	struct memseg *seg;
3263 
3264 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3265 	bzero(seg, sizeof (struct memseg));
3266 
3267 	return (seg);
3268 }
3269 
3270 /*
3271  * Return whether the page_t memory for this memseg
3272  * is included in the memseg itself.
3273  */
3274 static int
3275 memseg_includes_meta(struct memseg *seg)
3276 {
3277 	return (seg->msegflags & MEMSEG_META_INCL);
3278 }
3279 
3280 pfn_t
3281 memseg_get_start(struct memseg *seg)
3282 {
3283 	pfn_t		pt_start;
3284 
3285 	if (memseg_includes_meta(seg)) {
3286 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3287 
3288 		/* Meta data is required to be at the beginning */
3289 		ASSERT(pt_start < seg->pages_base);
3290 	} else
3291 		pt_start = seg->pages_base;
3292 
3293 	return (pt_start);
3294 }
3295 
3296 /*
3297  * Invalidate memseg pointers in cpu private vm data caches.
3298  */
3299 static void
3300 memseg_cpu_vm_flush()
3301 {
3302 	cpu_t *cp;
3303 	vm_cpu_data_t *vc;
3304 
3305 	mutex_enter(&cpu_lock);
3306 	pause_cpus(NULL, NULL);
3307 
3308 	cp = cpu_list;
3309 	do {
3310 		vc = cp->cpu_vm_data;
3311 		vc->vc_pnum_memseg = NULL;
3312 		vc->vc_pnext_memseg = NULL;
3313 
3314 	} while ((cp = cp->cpu_next) != cpu_list);
3315 
3316 	start_cpus();
3317 	mutex_exit(&cpu_lock);
3318 }
3319