xref: /titanic_50/usr/src/uts/common/os/mem_config.c (revision 263f549e5da8b32c4922f586afb365b8ae388a6c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
/*
 * Memory list that kphysm_add_memory_dynamic() extends with the usable
 * pages of every span it adds.
 */
60 extern struct memlist *phys_avail;
61 
/* Page counter management and memory setup callbacks. */
62 extern uint_t page_ctrs_adjust(int);
63 void page_ctrs_cleanup(void);
64 static void kphysm_setup_post_add(pgcnt_t);
65 static int kphysm_setup_pre_del(pgcnt_t);
66 static void kphysm_setup_post_del(pgcnt_t, int);
67 
68 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
69 
/*
 * Add/delete interlock: a span being added is entered in the list of
 * spans in transit so that no overlapping add or delete can start.
 */
70 static int delspan_reserve(pfn_t, pgcnt_t);
71 static void delspan_unreserve(pfn_t, pgcnt_t);
72 
/*
 * memseg_lists_lock is held by memseg_reuse() while it searches
 * memseg_va_avail, a list (linked through lnext) of memsegs that are
 * available for re-use.
 * NOTE(review): memseg_delete_junk and memseg_edit_junk are presumably
 * protected by the same lock - confirm against their users.
 */
73 kmutex_t memseg_lists_lock;
74 struct memseg *memseg_va_avail;
75 struct memseg *memseg_alloc(void);
76 static struct memseg *memseg_delete_junk;
77 static struct memseg *memseg_edit_junk;
78 void memseg_remap_init(void);
79 static void memseg_remap_to_dummy(struct memseg *);
80 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
81 static struct memseg *memseg_reuse(pgcnt_t);
82 
83 static struct kmem_cache *memseg_cache;
84 
85 /*
86  * Interfaces to manage externally allocated
87  * page_t memory (metadata) for a memseg.
 *
 * kphysm_add_memory_dynamic() only calls memseg_alloc_meta() when
 * meta_alloc_enable is non-zero, and only calls memseg_free_meta() and
 * memseg_get_metapfn() after memseg_alloc_meta() has returned KPHYSM_OK.
 * NOTE(review): the symbols are weak, presumably so that the kernel
 * still links when no implementation is supplied - confirm.
88  */
89 #pragma weak	memseg_alloc_meta
90 #pragma weak	memseg_free_meta
91 #pragma weak	memseg_get_metapfn
92 #pragma weak	memseg_remap_meta
93 
94 extern int ppvm_enable;
/*
 * Base of the page_t table indexed by pfn; used to derive the page_t
 * address of a memseg whose metadata was allocated externally.
 */
95 extern page_t *ppvm_base;
96 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
97 extern void memseg_free_meta(void *, pgcnt_t);
98 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
99 extern void memseg_remap_meta(struct memseg *);
100 static int memseg_is_dynamic(struct memseg *);
101 static int memseg_includes_meta(struct memseg *);
102 pfn_t memseg_get_start(struct memseg *);
103 static void memseg_cpu_vm_flush(void);
104 
/*
 * Non-zero: try to place the page_t's of added memory in existing
 * memory (memseg_alloc_meta()) before falling back to the new span.
 */
105 int meta_alloc_enable;
106 
/*
 * MEMSEG_DEBUG() - printf-style tracing of memseg allocation and re-use.
 *
 * On DEBUG kernels the message is printed only while memseg_debug is
 * non-zero; on non-DEBUG kernels the macro expands to nothing.
 *
 * The DEBUG expansion is wrapped in do { } while (0) so that it always
 * forms exactly one statement.  A bare "if (memseg_debug) printf(...)"
 * would silently capture the "else" of an if/else written around it.
 */
#ifdef	DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(...)				\
	do {						\
		if (memseg_debug)			\
			printf(__VA_ARGS__);		\
	/* CONSTCOND */					\
	} while (0)
#else
#define	MEMSEG_DEBUG(...)
#endif
113 
114 /*
115  * Add a chunk of memory to the system.
116  * base: starting PAGESIZE page of new memory.
117  * npgs: length in PAGESIZE pages.
118  *
119  * Adding mem this way doesn't increase the size of the hash tables;
120  * growing them would be too hard.  This should be OK, but adding memory
121  * dynamically most likely means more hash misses, since the tables will
122  * be smaller than they otherwise would be.
123  */
124 int
125 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
126 {
127 	page_t *pp;
128 	page_t		*opp, *oepp, *segpp;
129 	struct memseg	*seg;
130 	uint64_t	avmem;
131 	pfn_t		pfn;
132 	pfn_t		pt_base = base;
133 	pgcnt_t		tpgs = npgs;
134 	pgcnt_t		metapgs = 0;
135 	int		exhausted;
136 	pfn_t		pnum;
137 	int		mnode;
138 	caddr_t		vaddr;
139 	int		reuse;
140 	int		mlret;
141 	int		rv;
142 	int		flags;
143 	int		meta_alloc = 0;
144 	void		*mapva;
145 	void		*metabase = (void *)base;
146 	pgcnt_t		nkpmpgs = 0;
147 	offset_t	kpm_pages_off;
148 
149 	cmn_err(CE_CONT,
150 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
151 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
152 
153 	/*
154 	 * Add this span in the delete list to prevent interactions.
155 	 */
156 	if (!delspan_reserve(base, npgs)) {
157 		return (KPHYSM_ESPAN);
158 	}
159 	/*
160 	 * Check to see if any of the memory span has been added
161 	 * by trying an add to the installed memory list. This
162 	 * forms the interlocking process for add.
163 	 */
164 
165 	memlist_write_lock();
166 
167 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
168 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
169 
170 	if (mlret == MEML_SPANOP_OK)
171 		installed_top_size(phys_install, &physmax, &physinstalled);
172 
173 	memlist_write_unlock();
174 
175 	if (mlret != MEML_SPANOP_OK) {
176 		if (mlret == MEML_SPANOP_EALLOC) {
177 			delspan_unreserve(pt_base, tpgs);
178 			return (KPHYSM_ERESOURCE);
179 		} else if (mlret == MEML_SPANOP_ESPAN) {
180 			delspan_unreserve(pt_base, tpgs);
181 			return (KPHYSM_ESPAN);
182 		} else {
183 			delspan_unreserve(pt_base, tpgs);
184 			return (KPHYSM_ERESOURCE);
185 		}
186 	}
187 
188 	if (meta_alloc_enable) {
189 		/*
190 		 * Allocate the page_t's from existing memory;
191 		 * if that fails, allocate from the incoming memory.
192 		 */
193 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
194 		if (rv == KPHYSM_OK) {
195 			ASSERT(metapgs);
196 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
197 			meta_alloc = 1;
198 			goto mapalloc;
199 		}
200 	}
201 
202 	/*
203 	 * We store the page_t's for this new memory in the first
204 	 * few pages of the chunk. Here, we go and get'em ...
205 	 */
206 
207 	/*
208 	 * The expression after the '-' gives the number of pages
209 	 * that will fit in the new memory based on a requirement
210 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
211 	 */
212 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
213 	    (PAGESIZE + sizeof (page_t)));
214 
215 	npgs -= metapgs;
216 	base += metapgs;
217 
218 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
219 
220 	exhausted = (metapgs == 0 || npgs == 0);
221 
222 	if (kpm_enable && !exhausted) {
223 		pgcnt_t start, end, nkpmpgs_prelim;
224 		size_t	ptsz;
225 
226 		/*
227 		 * A viable kpm large page mapping must not overlap two
228 		 * dynamic memsegs. Therefore the total size is checked
229 		 * to be at least kpm_pgsz and also whether start and end
230 		 * points are at least kpm_pgsz aligned.
231 		 */
232 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
233 		    pmodkpmp(base + npgs)) {
234 
235 			kphysm_addmem_error_undospan(pt_base, tpgs);
236 
237 			/*
238 			 * There is no specific error code for violating
239 			 * kpm granularity constraints.
240 			 */
241 			return (KPHYSM_ENOTVIABLE);
242 		}
243 
244 		start = kpmptop(ptokpmp(base));
245 		end = kpmptop(ptokpmp(base + npgs));
246 		nkpmpgs_prelim = ptokpmp(end - start);
247 		ptsz = npgs * sizeof (page_t);
248 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
249 		exhausted = (tpgs <= metapgs);
250 		if (!exhausted) {
251 			npgs = tpgs - metapgs;
252 			base = pt_base + metapgs;
253 
254 			/* final nkpmpgs */
255 			start = kpmptop(ptokpmp(base));
256 			nkpmpgs = ptokpmp(end - start);
257 			kpm_pages_off = ptsz +
258 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
259 		}
260 	}
261 
262 	/*
263 	 * Is memory area supplied too small?
264 	 */
265 	if (exhausted) {
266 		kphysm_addmem_error_undospan(pt_base, tpgs);
267 		/*
268 		 * There is no specific error code for 'too small'.
269 		 */
270 		return (KPHYSM_ERESOURCE);
271 	}
272 
273 mapalloc:
274 	/*
275 	 * We may re-use a previously allocated VA space for the page_ts
276 	 * eventually, but we need to initialize and lock the pages first.
277 	 */
278 
279 	/*
280 	 * Get an address in the kernel address map, map
281 	 * the page_t pages and see if we can touch them.
282 	 */
283 
284 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
285 	if (mapva == NULL) {
286 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
287 		    " Can't allocate VA for page_ts");
288 
289 		if (meta_alloc)
290 			memseg_free_meta(metabase, metapgs);
291 		kphysm_addmem_error_undospan(pt_base, tpgs);
292 
293 		return (KPHYSM_ERESOURCE);
294 	}
295 	pp = mapva;
296 
297 	if (physmax < (pt_base + tpgs))
298 		physmax = (pt_base + tpgs);
299 
300 	/*
301 	 * In the remapping code we map one page at a time so we must do
302 	 * the same here to match mapping sizes.
303 	 */
304 	pfn = pt_base;
305 	vaddr = (caddr_t)pp;
306 	for (pnum = 0; pnum < metapgs; pnum++) {
307 		if (meta_alloc)
308 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
309 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
310 		    PROT_READ | PROT_WRITE,
311 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
312 		pfn++;
313 		vaddr += ptob(1);
314 	}
315 
316 	if (ddi_peek32((dev_info_t *)NULL,
317 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
318 
319 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
320 		    " Can't access pp array at 0x%p [phys 0x%lx]",
321 		    (void *)pp, pt_base);
322 
323 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
324 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
325 
326 		vmem_free(heap_arena, mapva, ptob(metapgs));
327 		if (meta_alloc)
328 			memseg_free_meta(metabase, metapgs);
329 		kphysm_addmem_error_undospan(pt_base, tpgs);
330 
331 		return (KPHYSM_EFAULT);
332 	}
333 
334 	/*
335 	 * Add this memory slice to its memory node translation.
336 	 *
337 	 * Note that right now, each node may have only one slice;
338 	 * this may change with COD or in larger SSM systems with
339 	 * nested latency groups, so we must not assume that the
340 	 * node does not yet exist.
341 	 *
342 	 * Note that there may be multiple memory nodes associated with
343 	 * a single lgrp node on x86 systems.
344 	 */
345 	pnum = pt_base + tpgs - 1;
346 	mem_node_add_range(pt_base, pnum);
347 
348 	/*
349 	 * Allocate or resize page counters as necessary to accommodate
350 	 * the increase in memory pages.
351 	 */
352 	mnode = PFN_2_MEM_NODE(pnum);
353 	PAGE_CTRS_ADJUST(base, npgs, rv);
354 	if (rv) {
355 
356 		mem_node_del_range(pt_base, pnum);
357 
358 		/* cleanup the  page counters */
359 		page_ctrs_cleanup();
360 
361 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
362 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
363 
364 		vmem_free(heap_arena, mapva, ptob(metapgs));
365 		if (meta_alloc)
366 			memseg_free_meta(metabase, metapgs);
367 		kphysm_addmem_error_undospan(pt_base, tpgs);
368 
369 		return (KPHYSM_ERESOURCE);
370 	}
371 
372 	/*
373 	 * Update the phys_avail memory list.
374 	 * The phys_install list was done at the start.
375 	 */
376 
377 	memlist_write_lock();
378 
379 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
380 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
381 	ASSERT(mlret == MEML_SPANOP_OK);
382 
383 	memlist_write_unlock();
384 
385 	/* See if we can find a memseg to re-use. */
386 	if (meta_alloc) {
387 		seg = memseg_reuse(0);
388 		reuse = 1;	/* force unmapping of temp mapva */
389 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
390 		/*
391 		 * There is a 1:1 fixed relationship between a pfn
392 		 * and a page_t VA.  The pfn is used as an index into
393 		 * the ppvm_base page_t table in order to calculate
394 		 * the page_t base address for a given pfn range.
395 		 */
396 		segpp = ppvm_base + base;
397 	} else {
398 		seg = memseg_reuse(metapgs);
399 		reuse = (seg != NULL);
400 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
401 		segpp = pp;
402 	}
403 
404 	/*
405 	 * Initialize the memseg structure representing this memory
406 	 * and add it to the existing list of memsegs. Do some basic
407 	 * initialization and add the memory to the system.
408 	 * In order to prevent lock deadlocks, the add_physmem()
409 	 * code is repeated here, but split into several stages.
410 	 *
411 	 * If a memseg is reused, invalidate memseg pointers in
412 	 * all cpu vm caches.  We need to do this this since the check
413 	 * 	pp >= seg->pages && pp < seg->epages
414 	 * used in various places is not atomic and so the first compare
415 	 * can happen before reuse and the second compare after reuse.
416 	 * The invalidation ensures that a memseg is not deferenced while
417 	 * it's page/pfn pointers are changing.
418 	 */
419 	if (seg == NULL) {
420 		seg = memseg_alloc();
421 		ASSERT(seg != NULL);
422 		seg->msegflags = flags;
423 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
424 		    (void *)seg, (void *)(seg->pages));
425 		seg->pages = segpp;
426 	} else {
427 		ASSERT(seg->msegflags == flags);
428 		ASSERT(seg->pages_base == seg->pages_end);
429 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
430 		    (void *)seg, (void *)(seg->pages));
431 		if (meta_alloc) {
432 			memseg_cpu_vm_flush();
433 			seg->pages = segpp;
434 		}
435 	}
436 
437 	seg->epages = seg->pages + npgs;
438 	seg->pages_base = base;
439 	seg->pages_end = base + npgs;
440 
441 	/*
442 	 * Initialize metadata. The page_ts are set to locked state
443 	 * ready to be freed.
444 	 */
445 	bzero((caddr_t)pp, ptob(metapgs));
446 
447 	pfn = seg->pages_base;
448 	/* Save the original pp base in case we reuse a memseg. */
449 	opp = pp;
450 	oepp = opp + npgs;
451 	for (pp = opp; pp < oepp; pp++) {
452 		pp->p_pagenum = pfn;
453 		pfn++;
454 		page_iolock_init(pp);
455 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
456 			continue;
457 		pp->p_offset = (u_offset_t)-1;
458 	}
459 
460 	if (reuse) {
461 		/* Remap our page_ts to the re-used memseg VA space. */
462 		pfn = pt_base;
463 		vaddr = (caddr_t)seg->pages;
464 		for (pnum = 0; pnum < metapgs; pnum++) {
465 			if (meta_alloc)
466 				pfn = memseg_get_metapfn(metabase,
467 				    (pgcnt_t)pnum);
468 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
469 			    PROT_READ | PROT_WRITE,
470 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
471 			pfn++;
472 			vaddr += ptob(1);
473 		}
474 
475 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
476 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
477 
478 		vmem_free(heap_arena, mapva, ptob(metapgs));
479 	}
480 
481 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
482 
483 	memsegs_lock(1);
484 
485 	/*
486 	 * The new memseg is inserted at the beginning of the list.
487 	 * Not only does this save searching for the tail, but in the
488 	 * case of a re-used memseg, it solves the problem of what
489 	 * happens if some process has still got a pointer to the
490 	 * memseg and follows the next pointer to continue traversing
491 	 * the memsegs list.
492 	 */
493 
494 	hat_kpm_addmem_mseg_insert(seg);
495 
496 	seg->next = memsegs;
497 	membar_producer();
498 
499 	hat_kpm_addmem_memsegs_update(seg);
500 
501 	memsegs = seg;
502 
503 	build_pfn_hash();
504 
505 	total_pages += npgs;
506 
507 	/*
508 	 * Recalculate the paging parameters now total_pages has changed.
509 	 * This will also cause the clock hands to be reset before next use.
510 	 */
511 	setupclock(1);
512 
513 	memsegs_unlock(1);
514 
515 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
516 
517 	/*
518 	 * Free the pages outside the lock to avoid locking loops.
519 	 */
520 	for (pp = seg->pages; pp < seg->epages; pp++) {
521 		page_free(pp, 1);
522 	}
523 
524 	/*
525 	 * Now that we've updated the appropriate memory lists we
526 	 * need to reset a number of globals, since we've increased memory.
527 	 * Several have already been updated for us as noted above. The
528 	 * globals we're interested in at this point are:
529 	 *   physmax - highest page frame number.
530 	 *   physinstalled - number of pages currently installed (done earlier)
531 	 *   maxmem - max free pages in the system
532 	 *   physmem - physical memory pages available
533 	 *   availrmem - real memory available
534 	 */
535 
536 	mutex_enter(&freemem_lock);
537 	maxmem += npgs;
538 	physmem += npgs;
539 	availrmem += npgs;
540 	availrmem_initial += npgs;
541 
542 	mutex_exit(&freemem_lock);
543 
544 	dump_resize();
545 
546 	page_freelist_coalesce_all(mnode);
547 
548 	kphysm_setup_post_add(npgs);
549 
550 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
551 	    "(0x%" PRIx64 ")\n",
552 	    physinstalled << (PAGESHIFT - 10),
553 	    (uint64_t)physinstalled << PAGESHIFT);
554 
555 	avmem = (uint64_t)freemem << PAGESHIFT;
556 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
557 	    "avail mem = %" PRId64 "\n", avmem);
558 
559 	/*
560 	 * Update lgroup generation number on single lgroup systems
561 	 */
562 	if (nlgrps == 1)
563 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
564 
565 	/*
566 	 * Inform DDI of update
567 	 */
568 	ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
569 	    (uint64_t)(tpgs) << PAGESHIFT);
570 
571 	delspan_unreserve(pt_base, tpgs);
572 
573 	return (KPHYSM_OK);		/* Successfully added system memory */
574 }
575 
576 /*
577  * There are various error conditions in kphysm_add_memory_dynamic()
578  * which require a rollback of already changed global state.
579  */
580 static void
581 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
582 {
583 	int mlret;
584 
585 	/* Unreserve memory span. */
586 	memlist_write_lock();
587 
588 	mlret = memlist_delete_span(
589 	    (uint64_t)(pt_base) << PAGESHIFT,
590 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
591 
592 	ASSERT(mlret == MEML_SPANOP_OK);
593 	phys_install_has_changed();
594 	installed_top_size(phys_install, &physmax, &physinstalled);
595 
596 	memlist_write_unlock();
597 	delspan_unreserve(pt_base, tpgs);
598 }
599 
600 /*
601  * Only return an available memseg of exactly the right size
602  * if size is required.
603  * When the meta data area has it's own virtual address space
604  * we will need to manage this more carefully and do best fit
605  * allocations, possibly splitting an available area.
606  */
607 struct memseg *
608 memseg_reuse(pgcnt_t metapgs)
609 {
610 	int type;
611 	struct memseg **segpp, *seg;
612 
613 	mutex_enter(&memseg_lists_lock);
614 
615 	segpp = &memseg_va_avail;
616 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
617 		caddr_t end;
618 
619 		/*
620 		 * Make sure we are reusing the right segment type.
621 		 */
622 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
623 
624 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
625 		    != type)
626 			continue;
627 
628 		if (kpm_enable)
629 			end = hat_kpm_mseg_reuse(seg);
630 		else
631 			end = (caddr_t)seg->epages;
632 
633 		/*
634 		 * Check for the right size if it is provided.
635 		 */
636 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
637 			*segpp = seg->lnext;
638 			seg->lnext = NULL;
639 			break;
640 		}
641 	}
642 	mutex_exit(&memseg_lists_lock);
643 
644 	return (seg);
645 }
646 
/*
 * Generator for external handle values; incremented by
 * kphysm_allocate_mem_handle() with mem_handle_list_mutex held.
 */
647 static uint_t handle_gen;
648 
/*
 * One contiguous range of page frames taking part in a memory add or
 * delete operation.
 */
649 struct memdelspan {
650 	struct memdelspan *mds_next;	/* next span on the same list */
651 	pfn_t		mds_base;	/* first page frame of the span */
652 	pgcnt_t		mds_npgs;	/* length of the span in pages */
653 	uint_t		*mds_bitmap;	/* sized by MDS_BITMAPBYTES() */
654 	uint_t		*mds_bitmap_retired;
655 };
656 
/*
 * NBPBMW is the number of bits in one bitmap word (uint_t).
 * MDS_BITMAPBYTES() is the size in bytes of a bitmap holding one bit per
 * page of the span, rounded up to a whole number of bitmap words.
 */
657 #define	NBPBMW		(sizeof (uint_t) * NBBY)
658 #define	MDS_BITMAPBYTES(MDSP) \
659 	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
660 
/*
 * A list of spans belonging to one operation.  Transit lists are chained
 * through trl_next from transit_list_head.trh_head.
 */
661 struct transit_list {
662 	struct transit_list	*trl_next;
663 	struct memdelspan	*trl_spans;
664 	int			trl_collect;
665 };
666 
/*
 * Head of the chain of transit lists.  delspan_insert() and
 * delspan_remove() hold trh_lock while they examine or change the chain
 * and the spans on it.
 */
667 struct transit_list_head {
668 	kmutex_t		trh_lock;
669 	struct transit_list	*trh_head;
670 };
671 
672 static struct transit_list_head transit_list_head;
673 
674 struct mem_handle;
675 static void transit_list_collect(struct mem_handle *, int);
676 static void transit_list_insert(struct transit_list *);
677 static void transit_list_remove(struct transit_list *);
678 
/*
 * Memory delete statistics.  They are compiled in on DEBUG kernels only;
 * otherwise every MDSTAT_*() macro below expands to nothing.
 */
679 #ifdef DEBUG
680 #define	MEM_DEL_STATS
681 #endif /* DEBUG */
682 
683 #ifdef MEM_DEL_STATS
684 static int mem_del_stat_print = 0;
/* Counters kept per delete operation in mem_handle.mh_delstat. */
685 struct mem_del_stat {
686 	uint_t	nloop;
687 	uint_t	need_free;
688 	uint_t	free_loop;
689 	uint_t	free_low;
690 	uint_t	free_failed;
691 	uint_t	ncheck;
692 	uint_t	nopaget;
693 	uint_t	lockfail;
694 	uint_t	nfree;
695 	uint_t	nreloc;
696 	uint_t	nrelocfail;
697 	uint_t	already_done;
698 	uint_t	first_notfree;
699 	uint_t	npplocked;
700 	uint_t	nlockreloc;
701 	uint_t	nnorepl;
702 	uint_t	nmodreloc;
703 	uint_t	ndestroy;
704 	uint_t	nputpage;
705 	uint_t	nnoreclaim;
706 	uint_t	ndelay;
707 	uint_t	demotefail;
708 	uint64_t nticks_total;	/* accumulated by MDSTAT_TOTAL() */
709 	uint64_t nticks_pgrp;	/* accumulated by MDSTAT_PGRP() */
710 	uint_t	retired;
711 	uint_t	toxic;
712 	uint_t	failing;
713 	uint_t	modtoxic;
714 	uint_t	npplkdtoxic;
715 	uint_t	gptlmodfail;
716 	uint_t	gptllckfail;
717 };
718 /*
719  * The stat values are only incremented in the delete thread
720  * so no locking or atomic required.
 *
 * MDSTAT_INCR(MHP, FLD)	add one to counter FLD of handle MHP.
 * MDSTAT_TOTAL(MHP, ntck)	add ntck to nticks_total.
 * MDSTAT_PGRP(MHP, ntck)	add ntck to nticks_pgrp.
 * MDSTAT_PRINT(MHP)		call mem_del_stat_print_func() for MHP.
721  */
722 #define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
723 #define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
724 #define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
725 static void mem_del_stat_print_func(struct mem_handle *);
726 #define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
727 #else /* MEM_DEL_STATS */
728 #define	MDSTAT_INCR(MHP, FLD)
729 #define	MDSTAT_TOTAL(MHP, ntck)
730 #define	MDSTAT_PGRP(MHP, ntck)
731 #define	MDSTAT_PRINT(MHP)
732 #endif /* MEM_DEL_STATS */
733 
/*
 * State of a memory delete handle.  MHND_FREE is 0, so it is the state
 * of a handle fresh from kmem_zalloc(); kphysm_del_gethandle() moves a
 * new handle to MHND_INIT, the only state in which kphysm_del_span()
 * accepts it.
 */
734 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
735 	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
736 
737 /*
738  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
739  * The mutex may not be required for other fields, dependent on mh_state.
740  */
741 struct mem_handle {
742 	kmutex_t	mh_mutex;
743 	struct mem_handle *mh_next;	/* link on mem_handle_head list */
744 	memhandle_t	mh_exthandle;	/* value given to the caller */
745 	mhnd_state_t	mh_state;
746 	struct transit_list mh_transit;	/* spans added by kphysm_del_span() */
747 	pgcnt_t		mh_phys_pages;
748 	pgcnt_t		mh_vm_pages;
749 	pgcnt_t		mh_hold_todo;
750 	void		(*mh_delete_complete)(void *, int error);
751 	void		*mh_delete_complete_arg;
752 	volatile uint_t mh_cancel;
753 	volatile uint_t mh_dr_aio_cleanup_cancel;
754 	volatile uint_t mh_aio_cleanup_done;
755 	kcondvar_t	mh_cv;
756 	kthread_id_t	mh_thread_id;
757 	page_t		*mh_deleted;	/* link through p_next */
758 #ifdef MEM_DEL_STATS
759 	struct mem_del_stat mh_delstat;
760 #endif /* MEM_DEL_STATS */
761 };
762 
/*
 * List of all handles, linked through mh_next.  mem_handle_list_mutex
 * protects the list and handle_gen, and is acquired before a handle's
 * own mh_mutex.
 */
763 static struct mem_handle *mem_handle_head;
764 static kmutex_t mem_handle_list_mutex;
765 
766 static struct mem_handle *
767 kphysm_allocate_mem_handle()
768 {
769 	struct mem_handle *mhp;
770 
771 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
772 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
773 	mutex_enter(&mem_handle_list_mutex);
774 	mutex_enter(&mhp->mh_mutex);
775 	/* handle_gen is protected by list mutex. */
776 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
777 	mhp->mh_next = mem_handle_head;
778 	mem_handle_head = mhp;
779 	mutex_exit(&mem_handle_list_mutex);
780 
781 	return (mhp);
782 }
783 
784 static void
785 kphysm_free_mem_handle(struct mem_handle *mhp)
786 {
787 	struct mem_handle **mhpp;
788 
789 	ASSERT(mutex_owned(&mhp->mh_mutex));
790 	ASSERT(mhp->mh_state == MHND_FREE);
791 	/*
792 	 * Exit the mutex to preserve locking order. This is OK
793 	 * here as once in the FREE state, the handle cannot
794 	 * be found by a lookup.
795 	 */
796 	mutex_exit(&mhp->mh_mutex);
797 
798 	mutex_enter(&mem_handle_list_mutex);
799 	mhpp = &mem_handle_head;
800 	while (*mhpp != NULL && *mhpp != mhp)
801 		mhpp = &(*mhpp)->mh_next;
802 	ASSERT(*mhpp == mhp);
803 	/*
804 	 * No need to lock the handle (mh_mutex) as only
805 	 * mh_next changing and this is the only thread that
806 	 * can be referncing mhp.
807 	 */
808 	*mhpp = mhp->mh_next;
809 	mutex_exit(&mem_handle_list_mutex);
810 
811 	mutex_destroy(&mhp->mh_mutex);
812 	kmem_free(mhp, sizeof (struct mem_handle));
813 }
814 
815 /*
816  * This function finds the internal mem_handle corresponding to an
817  * external handle and returns it with the mh_mutex held.
818  */
819 static struct mem_handle *
820 kphysm_lookup_mem_handle(memhandle_t handle)
821 {
822 	struct mem_handle *mhp;
823 
824 	mutex_enter(&mem_handle_list_mutex);
825 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
826 		if (mhp->mh_exthandle == handle) {
827 			mutex_enter(&mhp->mh_mutex);
828 			/*
829 			 * The state of the handle could have been changed
830 			 * by kphysm_del_release() while waiting for mh_mutex.
831 			 */
832 			if (mhp->mh_state == MHND_FREE) {
833 				mutex_exit(&mhp->mh_mutex);
834 				continue;
835 			}
836 			break;
837 		}
838 	}
839 	mutex_exit(&mem_handle_list_mutex);
840 	return (mhp);
841 }
842 
843 int
844 kphysm_del_gethandle(memhandle_t *xmhp)
845 {
846 	struct mem_handle *mhp;
847 
848 	mhp = kphysm_allocate_mem_handle();
849 	/*
850 	 * The handle is allocated using KM_SLEEP, so cannot fail.
851 	 * If the implementation is changed, the correct error to return
852 	 * here would be KPHYSM_ENOHANDLES.
853 	 */
854 	ASSERT(mhp->mh_state == MHND_FREE);
855 	mhp->mh_state = MHND_INIT;
856 	*xmhp = mhp->mh_exthandle;
857 	mutex_exit(&mhp->mh_mutex);
858 	return (KPHYSM_OK);
859 }
860 
861 static int
862 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
863 {
864 	pfn_t e1, e2;
865 
866 	e1 = b1 + l1;
867 	e2 = b2 + l2;
868 
869 	return (!(b2 >= e1 || b1 >= e2));
870 }
871 
872 static int can_remove_pgs(pgcnt_t);
873 
874 static struct memdelspan *
875 span_to_install(pfn_t base, pgcnt_t npgs)
876 {
877 	struct memdelspan *mdsp;
878 	struct memdelspan *mdsp_new;
879 	uint64_t address, size, thislen;
880 	struct memlist *mlp;
881 
882 	mdsp_new = NULL;
883 
884 	address = (uint64_t)base << PAGESHIFT;
885 	size = (uint64_t)npgs << PAGESHIFT;
886 	while (size != 0) {
887 		memlist_read_lock();
888 		for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
889 			if (address >= (mlp->ml_address + mlp->ml_size))
890 				continue;
891 			if ((address + size) > mlp->ml_address)
892 				break;
893 		}
894 		if (mlp == NULL) {
895 			address += size;
896 			size = 0;
897 			thislen = 0;
898 		} else {
899 			if (address < mlp->ml_address) {
900 				size -= (mlp->ml_address - address);
901 				address = mlp->ml_address;
902 			}
903 			ASSERT(address >= mlp->ml_address);
904 			if ((address + size) >
905 			    (mlp->ml_address + mlp->ml_size)) {
906 				thislen =
907 				    mlp->ml_size - (address - mlp->ml_address);
908 			} else {
909 				thislen = size;
910 			}
911 		}
912 		memlist_read_unlock();
913 		/* TODO: phys_install could change now */
914 		if (thislen == 0)
915 			continue;
916 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
917 		mdsp->mds_base = btop(address);
918 		mdsp->mds_npgs = btop(thislen);
919 		mdsp->mds_next = mdsp_new;
920 		mdsp_new = mdsp;
921 		address += thislen;
922 		size -= thislen;
923 	}
924 	return (mdsp_new);
925 }
926 
927 static void
928 free_delspans(struct memdelspan *mdsp)
929 {
930 	struct memdelspan *amdsp;
931 
932 	while ((amdsp = mdsp) != NULL) {
933 		mdsp = amdsp->mds_next;
934 		kmem_free(amdsp, sizeof (struct memdelspan));
935 	}
936 }
937 
938 /*
939  * Concatenate lists. No list ordering is required.
940  */
941 
942 static void
943 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
944 {
945 	while (*mdspp != NULL)
946 		mdspp = &(*mdspp)->mds_next;
947 
948 	*mdspp = mdsp;
949 }
950 
951 /*
952  * Given a new list of delspans, check there is no overlap with
953  * all existing span activity (add or delete) and then concatenate
954  * the new spans to the given list.
955  * Return 1 for OK, 0 if overlapping.
956  */
957 static int
958 delspan_insert(
959 	struct transit_list *my_tlp,
960 	struct memdelspan *mdsp_new)
961 {
962 	struct transit_list_head *trh;
963 	struct transit_list *tlp;
964 	int ret;
965 
966 	trh = &transit_list_head;
967 
968 	ASSERT(my_tlp != NULL);
969 	ASSERT(mdsp_new != NULL);
970 
971 	ret = 1;
972 	mutex_enter(&trh->trh_lock);
973 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
974 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
975 		struct memdelspan *mdsp;
976 
977 		for (mdsp = tlp->trl_spans; mdsp != NULL;
978 		    mdsp = mdsp->mds_next) {
979 			struct memdelspan *nmdsp;
980 
981 			for (nmdsp = mdsp_new; nmdsp != NULL;
982 			    nmdsp = nmdsp->mds_next) {
983 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
984 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
985 					ret = 0;
986 					goto done;
987 				}
988 			}
989 		}
990 	}
991 done:
992 	if (ret != 0) {
993 		if (my_tlp->trl_spans == NULL)
994 			transit_list_insert(my_tlp);
995 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
996 	}
997 	mutex_exit(&trh->trh_lock);
998 	return (ret);
999 }
1000 
1001 static void
1002 delspan_remove(
1003 	struct transit_list *my_tlp,
1004 	pfn_t base,
1005 	pgcnt_t npgs)
1006 {
1007 	struct transit_list_head *trh;
1008 	struct memdelspan *mdsp;
1009 
1010 	trh = &transit_list_head;
1011 
1012 	ASSERT(my_tlp != NULL);
1013 
1014 	mutex_enter(&trh->trh_lock);
1015 	if ((mdsp = my_tlp->trl_spans) != NULL) {
1016 		if (npgs == 0) {
1017 			my_tlp->trl_spans = NULL;
1018 			free_delspans(mdsp);
1019 			transit_list_remove(my_tlp);
1020 		} else {
1021 			struct memdelspan **prv;
1022 
1023 			prv = &my_tlp->trl_spans;
1024 			while (mdsp != NULL) {
1025 				pfn_t p_end;
1026 
1027 				p_end = mdsp->mds_base + mdsp->mds_npgs;
1028 				if (mdsp->mds_base >= base &&
1029 				    p_end <= (base + npgs)) {
1030 					*prv = mdsp->mds_next;
1031 					mdsp->mds_next = NULL;
1032 					free_delspans(mdsp);
1033 				} else {
1034 					prv = &mdsp->mds_next;
1035 				}
1036 				mdsp = *prv;
1037 			}
1038 			if (my_tlp->trl_spans == NULL)
1039 				transit_list_remove(my_tlp);
1040 		}
1041 	}
1042 	mutex_exit(&trh->trh_lock);
1043 }
1044 
1045 /*
1046  * Reserve interface for add to stop delete before add finished.
1047  * This list is only accessed through the delspan_insert/remove
1048  * functions and so is fully protected by transit_list_head.trh_lock.
1049  */
1050 
1051 static struct transit_list reserve_transit;
1052 
1053 static int
1054 delspan_reserve(pfn_t base, pgcnt_t npgs)
1055 {
1056 	struct memdelspan *mdsp;
1057 	int ret;
1058 
1059 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1060 	mdsp->mds_base = base;
1061 	mdsp->mds_npgs = npgs;
1062 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1063 		free_delspans(mdsp);
1064 	}
1065 	return (ret);
1066 }
1067 
1068 static void
1069 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1070 {
1071 	delspan_remove(&reserve_transit, base, npgs);
1072 }
1073 
1074 /*
1075  * Return whether memseg was created by kphysm_add_memory_dynamic().
1076  */
1077 static int
1078 memseg_is_dynamic(struct memseg *seg)
1079 {
1080 	return (seg->msegflags & MEMSEG_DYNAMIC);
1081 }
1082 
/*
 * Add the span [base, base + npgs) to the set of spans to be deleted
 * under the given memory delete handle.
 *
 * The handle must be in state MHND_INIT.  The span is first intersected
 * with installed memory by span_to_install() and the resulting spans are
 * put on the handle's transit list with delspan_insert().  Every memseg
 * overlapping those spans is then checked: a memseg for which
 * memseg_includes_meta() is true must lie completely within a span, any
 * other memseg that straddles a span boundary is split with
 * kphysm_split_memseg(), and no page of a covered memseg may be
 * PP_ISNORELOC().  On success the page counts are accumulated in
 * mh_phys_pages and mh_vm_pages; on failure the spans lying within
 * [base, base + npgs) are removed from the handle again.
 *
 * Returns:
 *	KPHYSM_OK		span accepted, or no installed memory in range
 *	KPHYSM_EHANDLE		handle lookup failed
 *	KPHYSM_ESEQUENCE	handle is not in state MHND_INIT
 *	KPHYSM_EDUP		span overlaps one already on this handle
 *	KPHYSM_EBUSY		delspan_insert() refused the span for another
 *				reason, or a memseg that includes its own
 *				metadata is only partly covered by the span
 *	KPHYSM_ERESOURCE	a required memseg split failed
 *	KPHYSM_ENONRELOC	a non-relocatable page was found, or the
 *				memsegs do not account for the whole span
 */
1083 int
1084 kphysm_del_span(
1085 	memhandle_t handle,
1086 	pfn_t base,
1087 	pgcnt_t npgs)
1088 {
1089 	struct mem_handle *mhp;
1090 	struct memseg *seg;
1091 	struct memdelspan *mdsp;
1092 	struct memdelspan *mdsp_new;
1093 	pgcnt_t phys_pages, vm_pages;
1094 	pfn_t p_end;
1095 	page_t *pp;
1096 	int ret;
1097 
	/*
	 * A successful lookup leaves mh_mutex held: every return path
	 * below drops it before returning.
	 */
1098 	mhp = kphysm_lookup_mem_handle(handle);
1099 	if (mhp == NULL) {
1100 		return (KPHYSM_EHANDLE);
1101 	}
1102 	if (mhp->mh_state != MHND_INIT) {
1103 		mutex_exit(&mhp->mh_mutex);
1104 		return (KPHYSM_ESEQUENCE);
1105 	}
1106 
1107 	/*
1108 	 * Intersect the span with the installed memory list (phys_install).
1109 	 */
1110 	mdsp_new = span_to_install(base, npgs);
1111 	if (mdsp_new == NULL) {
1112 		/*
1113 		 * No physical memory in this range. Is this an
1114 		 * error? If an attempt to start the delete is made
1115 		 * for OK returns from del_span such as this, start will
1116 		 * return an error.
1117 		 * Could return KPHYSM_ENOWORK.
1118 		 */
1119 		/*
1120 		 * It is assumed that there are no error returns
1121 		 * from span_to_install() due to kmem_alloc failure.
1122 		 */
1123 		mutex_exit(&mhp->mh_mutex);
1124 		return (KPHYSM_OK);
1125 	}
1126 	/*
1127 	 * Does this span overlap an existing span?
1128 	 */
1129 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1130 		/*
1131 		 * Differentiate between already on list for this handle
1132 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1133 		 */
1134 		ret = KPHYSM_EBUSY;
1135 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1136 		    mdsp = mdsp->mds_next) {
1137 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1138 			    base, npgs)) {
1139 				ret = KPHYSM_EDUP;
1140 				break;
1141 			}
1142 		}
1143 		mutex_exit(&mhp->mh_mutex);
		/* The insert was refused, so the new spans are still ours. */
1144 		free_delspans(mdsp_new);
1145 		return (ret);
1146 	}
1147 	/*
1148 	 * At this point the spans in mdsp_new have been inserted into the
1149 	 * list of spans for this handle and thereby to the global list of
1150 	 * spans being processed. Each of these spans must now be checked
1151 	 * for relocatability. As a side-effect segments in the memseg list
1152 	 * may be split.
1153 	 *
1154 	 * Note that mdsp_new can no longer be used as it is now part of
1155 	 * a larger list. Select elements of this larger list based
1156 	 * on base and npgs.
1157 	 */
1158 restart:
	/*
	 * The counts are recomputed from scratch on every pass because a
	 * memseg split (below) jumps back here.
	 */
1159 	phys_pages = 0;
1160 	vm_pages = 0;
1161 	ret = KPHYSM_OK;
1162 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1163 	    mdsp = mdsp->mds_next) {
1164 		pgcnt_t pages_checked;
1165 
		/* Only look at spans that belong to this request. */
1166 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1167 			continue;
1168 		}
1169 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1170 		/*
1171 		 * The pages_checked count is a hack. All pages should be
1172 		 * checked for relocatability. Those not covered by memsegs
1173 		 * should be tested with arch_kphysm_del_span_ok().
1174 		 */
1175 		pages_checked = 0;
1176 		for (seg = memsegs; seg; seg = seg->next) {
1177 			pfn_t mseg_start;
1178 
1179 			if (seg->pages_base >= p_end ||
1180 			    seg->pages_end <= mdsp->mds_base) {
1181 				/* Span and memseg don't overlap. */
1182 				continue;
1183 			}
1184 			mseg_start = memseg_get_start(seg);
1185 			/* Check that segment is suitable for delete. */
1186 			if (memseg_includes_meta(seg)) {
1187 				/*
1188 				 * Check that this segment is completely
1189 				 * within the span.
1190 				 */
1191 				if (mseg_start < mdsp->mds_base ||
1192 				    seg->pages_end > p_end) {
1193 					ret = KPHYSM_EBUSY;
1194 					break;
1195 				}
1196 				pages_checked += seg->pages_end - mseg_start;
1197 			} else {
1198 				/*
1199 				 * If this segment is larger than the span,
1200 				 * try to split it. After the split, it
1201 				 * is necessary to restart.
1202 				 */
1203 				if (seg->pages_base < mdsp->mds_base ||
1204 				    seg->pages_end > p_end) {
1205 					pfn_t abase;
1206 					pgcnt_t anpgs;
1207 					int s_ret;
1208 
1209 					/* Split required.  */
					/*
					 * [abase, abase + anpgs) is the part
					 * of the memseg inside the span.
					 */
1210 					if (mdsp->mds_base < seg->pages_base)
1211 						abase = seg->pages_base;
1212 					else
1213 						abase = mdsp->mds_base;
1214 					if (p_end > seg->pages_end)
1215 						anpgs = seg->pages_end - abase;
1216 					else
1217 						anpgs = p_end - abase;
1218 					s_ret = kphysm_split_memseg(abase,
1219 					    anpgs);
1220 					if (s_ret == 0) {
1221 						/* Split failed. */
1222 						ret = KPHYSM_ERESOURCE;
1223 						break;
1224 					}
1225 					goto restart;
1226 				}
1227 				pages_checked +=
1228 				    seg->pages_end - seg->pages_base;
1229 			}
1230 			/*
1231 			 * The memseg is wholly within the delete span.
1232 			 * The individual pages can now be checked.
1233 			 */
1234 			/* Cage test. */
1235 			for (pp = seg->pages; pp < seg->epages; pp++) {
1236 				if (PP_ISNORELOC(pp)) {
1237 					ret = KPHYSM_ENONRELOC;
1238 					break;
1239 				}
1240 			}
1241 			if (ret != KPHYSM_OK) {
1242 				break;
1243 			}
1244 			phys_pages += (seg->pages_end - mseg_start);
1245 			vm_pages += MSEG_NPAGES(seg);
1246 		}
1247 		if (ret != KPHYSM_OK)
1248 			break;
		/* Part of the span is not accounted for by any memseg. */
1249 		if (pages_checked != mdsp->mds_npgs) {
1250 			ret = KPHYSM_ENONRELOC;
1251 			break;
1252 		}
1253 	}
1254 
1255 	if (ret == KPHYSM_OK) {
1256 		mhp->mh_phys_pages += phys_pages;
1257 		mhp->mh_vm_pages += vm_pages;
1258 	} else {
1259 		/*
1260 		 * Keep holding the mh_mutex to prevent it going away.
1261 		 */
1262 		delspan_remove(&mhp->mh_transit, base, npgs);
1263 	}
1264 	mutex_exit(&mhp->mh_mutex);
1265 	return (ret);
1266 }
1267 
/*
 * Report on the deletability of the span [base, base + npgs) without
 * reserving anything or touching any delete handle.
 *
 * *mqp is zeroed and then filled in with:
 *	phys_pages		pages of the span that span_to_install()
 *				reports as installed
 *	managed			pages of the span examined through a
 *				memseg's page_t array
 *	nonrelocatable		pages found non-relocatable, either by
 *				PP_ISNORELOC() or, for pages not covered by
 *				a memseg, by arch_kphysm_del_span_ok()
 *	first_nonrelocatable,
 *	last_nonrelocatable	pfns of the first and the last such pages
 *				encountered (both stay 0 if there are none)
 *
 * Always returns KPHYSM_OK.
 */
1268 int
1269 kphysm_del_span_query(
1270 	pfn_t base,
1271 	pgcnt_t npgs,
1272 	memquery_t *mqp)
1273 {
1274 	struct memdelspan *mdsp;
1275 	struct memdelspan *mdsp_new;
1276 	int done_first_nonreloc;
1277 
1278 	mqp->phys_pages = 0;
1279 	mqp->managed = 0;
1280 	mqp->nonrelocatable = 0;
1281 	mqp->first_nonrelocatable = 0;
1282 	mqp->last_nonrelocatable = 0;
1283 
1284 	mdsp_new = span_to_install(base, npgs);
1285 	/*
1286 	 * It is OK to proceed here if mdsp_new == NULL.
1287 	 */
1288 	done_first_nonreloc = 0;
1289 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1290 		pfn_t sbase;
1291 		pgcnt_t snpgs;
1292 
1293 		mqp->phys_pages += mdsp->mds_npgs;
		/*
		 * sbase/snpgs describe the part of this span that has not
		 * been examined yet; each pass of the loop below consumes
		 * a gap and/or the pages of one memseg.
		 */
1294 		sbase = mdsp->mds_base;
1295 		snpgs = mdsp->mds_npgs;
1296 		while (snpgs != 0) {
1297 			struct memseg *lseg, *seg;
1298 			pfn_t p_end;
1299 			page_t *pp;
1300 			pfn_t mseg_start;
1301 
1302 			p_end = sbase + snpgs;
1303 			/*
1304 			 * Find the lowest addressed memseg that starts
1305 			 * after sbase and account for it.
1306 			 * This is to catch dynamic memsegs whose start
1307 			 * is hidden.
1308 			 */
1309 			seg = NULL;
1310 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1311 				if ((lseg->pages_base >= sbase) ||
1312 				    (lseg->pages_base < p_end &&
1313 				    lseg->pages_end > sbase)) {
1314 					if (seg == NULL ||
1315 					    seg->pages_base > lseg->pages_base)
1316 						seg = lseg;
1317 				}
1318 			}
1319 			if (seg != NULL) {
1320 				mseg_start = memseg_get_start(seg);
1321 				/*
1322 				 * Now have the full extent of the memseg so
1323 				 * do the range check.
1324 				 */
1325 				if (mseg_start >= p_end ||
1326 				    seg->pages_end <= sbase) {
1327 					/* Span does not overlap memseg. */
1328 					seg = NULL;
1329 				}
1330 			}
1331 			/*
1332 			 * Account for gap either before the segment if
1333 			 * there is one or to the end of the span.
1334 			 */
			/* mseg_start is only read here when seg != NULL. */
1335 			if (seg == NULL || mseg_start > sbase) {
1336 				pfn_t a_end;
1337 
1338 				a_end = (seg == NULL) ? p_end : mseg_start;
1339 				/*
1340 				 * Check with arch layer for relocatability.
1341 				 */
1342 				if (arch_kphysm_del_span_ok(sbase,
1343 				    (a_end - sbase))) {
1344 					/*
1345 					 * No non-relocatable pages in this
1346 					 * area, avoid the fine-grained
1347 					 * test.
1348 					 */
1349 					snpgs -= (a_end - sbase);
1350 					sbase = a_end;
1351 				}
				/*
				 * Fine-grained test, one page at a time; it
				 * is skipped when the whole gap passed above
				 * because sbase is then already a_end.
				 */
1352 				while (sbase < a_end) {
1353 					if (!arch_kphysm_del_span_ok(sbase,
1354 					    1)) {
1355 						mqp->nonrelocatable++;
1356 						if (!done_first_nonreloc) {
1357 							mqp->
1358 							    first_nonrelocatable
1359 							    = sbase;
1360 							done_first_nonreloc = 1;
1361 						}
1362 						mqp->last_nonrelocatable =
1363 						    sbase;
1364 					}
1365 					sbase++;
1366 					snpgs--;
1367 				}
1368 			}
1369 			if (seg != NULL) {
1370 				ASSERT(mseg_start <= sbase);
1371 				if (seg->pages_base != mseg_start &&
1372 				    seg->pages_base > sbase) {
1373 					pgcnt_t skip_pgs;
1374 
1375 					/*
1376 					 * Skip the page_t area of a
1377 					 * dynamic memseg.
1378 					 */
1379 					skip_pgs = seg->pages_base - sbase;
1380 					if (snpgs <= skip_pgs) {
						/* Span ends in that area. */
1381 						sbase += snpgs;
1382 						snpgs = 0;
1383 						continue;
1384 					}
1385 					snpgs -= skip_pgs;
1386 					sbase += skip_pgs;
1387 				}
1388 				ASSERT(snpgs != 0);
1389 				ASSERT(seg->pages_base <= sbase);
1390 				/*
1391 				 * The individual pages can now be checked.
1392 				 */
1393 				for (pp = seg->pages +
1394 				    (sbase - seg->pages_base);
1395 				    snpgs != 0 && pp < seg->epages; pp++) {
1396 					mqp->managed++;
1397 					if (PP_ISNORELOC(pp)) {
1398 						mqp->nonrelocatable++;
1399 						if (!done_first_nonreloc) {
1400 							mqp->
1401 							    first_nonrelocatable
1402 							    = sbase;
1403 							done_first_nonreloc = 1;
1404 						}
1405 						mqp->last_nonrelocatable =
1406 						    sbase;
1407 					}
1408 					sbase++;
1409 					snpgs--;
1410 				}
1411 			}
1412 		}
1413 	}
1414 
	/* The span list was private to this query; release it. */
1415 	free_delspans(mdsp_new);
1416 
1417 	return (KPHYSM_OK);
1418 }
1419 
1420 /*
1421  * This release function can be called at any stage as follows:
1422  *	_gethandle only called
1423  *	_span(s) only called
1424  *	_start called but failed
1425  *	delete thread exited
1426  */
1427 int
1428 kphysm_del_release(memhandle_t handle)
1429 {
1430 	struct mem_handle *mhp;
1431 
1432 	mhp = kphysm_lookup_mem_handle(handle);
1433 	if (mhp == NULL) {
1434 		return (KPHYSM_EHANDLE);
1435 	}
1436 	switch (mhp->mh_state) {
1437 	case MHND_STARTING:
1438 	case MHND_RUNNING:
1439 		mutex_exit(&mhp->mh_mutex);
1440 		return (KPHYSM_ENOTFINISHED);
1441 	case MHND_FREE:
1442 		ASSERT(mhp->mh_state != MHND_FREE);
1443 		mutex_exit(&mhp->mh_mutex);
1444 		return (KPHYSM_EHANDLE);
1445 	case MHND_INIT:
1446 		break;
1447 	case MHND_DONE:
1448 		break;
1449 	case MHND_RELEASE:
1450 		mutex_exit(&mhp->mh_mutex);
1451 		return (KPHYSM_ESEQUENCE);
1452 	default:
1453 #ifdef DEBUG
1454 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1455 		    (void *)mhp, mhp->mh_state);
1456 #endif /* DEBUG */
1457 		mutex_exit(&mhp->mh_mutex);
1458 		return (KPHYSM_EHANDLE);
1459 	}
1460 	/*
1461 	 * Set state so that we can wait if necessary.
1462 	 * Also this means that we have read/write access to all
1463 	 * fields except mh_exthandle and mh_state.
1464 	 */
1465 	mhp->mh_state = MHND_RELEASE;
1466 	/*
1467 	 * The mem_handle cannot be de-allocated by any other operation
1468 	 * now, so no need to hold mh_mutex.
1469 	 */
1470 	mutex_exit(&mhp->mh_mutex);
1471 
1472 	delspan_remove(&mhp->mh_transit, 0, 0);
1473 	mhp->mh_phys_pages = 0;
1474 	mhp->mh_vm_pages = 0;
1475 	mhp->mh_hold_todo = 0;
1476 	mhp->mh_delete_complete = NULL;
1477 	mhp->mh_delete_complete_arg = NULL;
1478 	mhp->mh_cancel = 0;
1479 
1480 	mutex_enter(&mhp->mh_mutex);
1481 	ASSERT(mhp->mh_state == MHND_RELEASE);
1482 	mhp->mh_state = MHND_FREE;
1483 
1484 	kphysm_free_mem_handle(mhp);
1485 
1486 	return (KPHYSM_OK);
1487 }
1488 
1489 /*
1490  * This cancel function can only be called with the thread running.
1491  */
1492 int
1493 kphysm_del_cancel(memhandle_t handle)
1494 {
1495 	struct mem_handle *mhp;
1496 
1497 	mhp = kphysm_lookup_mem_handle(handle);
1498 	if (mhp == NULL) {
1499 		return (KPHYSM_EHANDLE);
1500 	}
1501 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1502 		mutex_exit(&mhp->mh_mutex);
1503 		return (KPHYSM_ENOTRUNNING);
1504 	}
1505 	/*
1506 	 * Set the cancel flag and wake the delete thread up.
1507 	 * The thread may be waiting on I/O, so the effect of the cancel
1508 	 * may be delayed.
1509 	 */
1510 	if (mhp->mh_cancel == 0) {
1511 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1512 		cv_signal(&mhp->mh_cv);
1513 	}
1514 	mutex_exit(&mhp->mh_mutex);
1515 	return (KPHYSM_OK);
1516 }
1517 
1518 int
1519 kphysm_del_status(
1520 	memhandle_t handle,
1521 	memdelstat_t *mdstp)
1522 {
1523 	struct mem_handle *mhp;
1524 
1525 	mhp = kphysm_lookup_mem_handle(handle);
1526 	if (mhp == NULL) {
1527 		return (KPHYSM_EHANDLE);
1528 	}
1529 	/*
1530 	 * Calling kphysm_del_status() is allowed before the delete
1531 	 * is started to allow for status display.
1532 	 */
1533 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1534 	    mhp->mh_state != MHND_RUNNING) {
1535 		mutex_exit(&mhp->mh_mutex);
1536 		return (KPHYSM_ENOTRUNNING);
1537 	}
1538 	mdstp->phys_pages = mhp->mh_phys_pages;
1539 	mdstp->managed = mhp->mh_vm_pages;
1540 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1541 	mutex_exit(&mhp->mh_mutex);
1542 	return (KPHYSM_OK);
1543 }
1544 
1545 static int mem_delete_additional_pages = 100;
1546 
1547 static int
1548 can_remove_pgs(pgcnt_t npgs)
1549 {
1550 	/*
1551 	 * If all pageable pages were paged out, freemem would
1552 	 * equal availrmem.  There is a minimum requirement for
1553 	 * availrmem.
1554 	 */
1555 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1556 	    < npgs)
1557 		return (0);
1558 	/* TODO: check swap space, etc. */
1559 	return (1);
1560 }
1561 
1562 static int
1563 get_availrmem(pgcnt_t npgs)
1564 {
1565 	int ret;
1566 
1567 	mutex_enter(&freemem_lock);
1568 	ret = can_remove_pgs(npgs);
1569 	if (ret != 0)
1570 		availrmem -= npgs;
1571 	mutex_exit(&freemem_lock);
1572 	return (ret);
1573 }
1574 
1575 static void
1576 put_availrmem(pgcnt_t npgs)
1577 {
1578 	mutex_enter(&freemem_lock);
1579 	availrmem += npgs;
1580 	mutex_exit(&freemem_lock);
1581 }
1582 
/*
 * freemem_incr is the largest number of pages delthr_get_freemem() asks
 * for in one call; FREEMEM_INCR is its initial value.
 */
1583 #define	FREEMEM_INCR	100
1584 static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * DEL_FREE_WAIT_TICKS is hz / DEL_FREE_WAIT_FRAC rounded up.  It is the
 * timeout delthr_get_freemem() passes to cv_reltimedwait() between
 * attempts to obtain free pages.
 */
1585 #define	DEL_FREE_WAIT_FRAC	4
1586 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1587 
/* DEL_BUSY_WAIT_TICKS is hz / DEL_BUSY_WAIT_FRAC rounded up. */
1588 #define	DEL_BUSY_WAIT_FRAC	20
1589 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1590 
/* Forward declarations for functions defined later in this file. */
1591 static void kphysm_del_cleanup(struct mem_handle *);
1592 
1593 static void page_delete_collect(page_t *, struct mem_handle *);
1594 
/*
 * Obtain free pages on behalf of the delete thread.
 *
 * Must be called with mhp->mh_mutex held.  The mutex is dropped while
 * freemem is examined and page_create_wait() is called, and is held
 * again on every return.  The request size is freemem_incr pages, capped
 * at mhp->mh_hold_todo.  While the request cannot be satisfied, pageout
 * is asked to free pages and the thread waits on mh_cv for up to
 * DEL_FREE_WAIT_TICKS before trying again.
 *
 * Returns the number of pages obtained, or 0 if mh_cancel was found set
 * before any pages could be obtained.
 */
1595 static pgcnt_t
1596 delthr_get_freemem(struct mem_handle *mhp)
1597 {
1598 	pgcnt_t free_get;
1599 	int ret;
1600 
1601 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1602 
1603 	MDSTAT_INCR(mhp, need_free);
1604 	/*
1605 	 * Get up to freemem_incr pages.
1606 	 */
1607 	free_get = freemem_incr;
1608 	if (free_get > mhp->mh_hold_todo)
1609 		free_get = mhp->mh_hold_todo;
1610 	/*
1611 	 * Take free_get pages away from freemem,
1612 	 * waiting if necessary.
1613 	 */
1614 
1615 	while (!mhp->mh_cancel) {
		/* mh_mutex is not held across the freemem test and wait. */
1616 		mutex_exit(&mhp->mh_mutex);
1617 		MDSTAT_INCR(mhp, free_loop);
1618 		/*
1619 		 * Duplicate test from page_create_throttle()
1620 		 * but don't override with !PG_WAIT.
1621 		 */
1622 		if (freemem < (free_get + throttlefree)) {
1623 			MDSTAT_INCR(mhp, free_low);
1624 			ret = 0;
1625 		} else {
1626 			ret = page_create_wait(free_get, 0);
1627 			if (ret == 0) {
1628 				/* EMPTY */
1629 				MDSTAT_INCR(mhp, free_failed);
1630 			}
1631 		}
1632 		if (ret != 0) {
			/* Got the pages: retake the mutex for the caller. */
1633 			mutex_enter(&mhp->mh_mutex);
1634 			return (free_get);
1635 		}
1636 
1637 		/*
1638 		 * Put pressure on pageout.
1639 		 */
1640 		page_needfree(free_get);
1641 		cv_signal(&proc_pageout->p_cv);
1642 
		/*
		 * Wait for the timeout or for a signal on mh_cv (e.g. from
		 * kphysm_del_cancel()); the result is deliberately ignored
		 * since the loop condition re-checks mh_cancel.
		 */
1643 		mutex_enter(&mhp->mh_mutex);
1644 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1645 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1646 		mutex_exit(&mhp->mh_mutex);
		/* Withdraw the page_needfree(free_get) request made above. */
1647 		page_needfree(-(spgcnt_t)free_get);
1648 
1649 		mutex_enter(&mhp->mh_mutex);
1650 	}
	/* The loop ended because mh_cancel is set: nothing was obtained. */
1651 	return (0);
1652 }
1653 
/*
 * DR_AIO_CLEANUP_DELAY is the pause taken by dr_aio_cleanup_thread()
 * between scans of the process list.  DR_AIO_CLEANUP_MAXLOOPS_NODELAY is
 * the number of scans that did clean something up after which the pause
 * is taken anyway.
 */
1654 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1655 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1656 /*
1657  * This function is run as a helper thread for delete_memory_thread.
1658  * It is needed in order to force kaio cleanup, so that pages used in kaio
1659  * will be unlocked and subsequently relocated by delete_memory_thread.
1660  * The address of the delete_memory_threads's mem_handle is passed in to
1661  * this thread function, and is used to set the mh_aio_cleanup_done member
1662  * prior to calling thread_exit().
1663  */
/*
 * The thread keeps scanning until mh_dr_aio_cleanup_cancel in the
 * mem_handle becomes non-zero.  It also exits early, after setting
 * mh_aio_cleanup_done and logging a warning, if the kaio module cannot
 * be loaded or does not provide aio_cleanup_dr_delete_memory.
 */
1664 static void
1665 dr_aio_cleanup_thread(caddr_t amhp)
1666 {
1667 	proc_t *procp;
1668 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1669 	int cleaned;
1670 	int n = 0;
1671 	struct mem_handle *mhp;
1672 	volatile uint_t *pcancel;
1673 
1674 	mhp = (struct mem_handle *)amhp;
1675 	ASSERT(mhp != NULL);
	/* The cancel flag is polled through a volatile pointer, unlocked. */
1676 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1677 	if (modload("sys", "kaio") == -1) {
1678 		mhp->mh_aio_cleanup_done = 1;
1679 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1680 		thread_exit();
1681 	}
	/* Look the cleanup routine up by name instead of linking to it. */
1682 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1683 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1684 	if (aio_cleanup_dr_delete_memory == NULL) {
1685 		mhp->mh_aio_cleanup_done = 1;
1686 		cmn_err(CE_WARN,
1687 	    "aio_cleanup_dr_delete_memory not found in kaio");
1688 		thread_exit();
1689 	}
1690 	do {
1691 		cleaned = 0;
		/*
		 * Walk the active process list under pidlock, taking each
		 * process's p_lock around its cleanup call; the walk stops
		 * early once the cancel flag is set.
		 */
1692 		mutex_enter(&pidlock);
1693 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1694 		    procp = procp->p_next) {
1695 			mutex_enter(&procp->p_lock);
1696 			if (procp->p_aio != NULL) {
1697 				/* cleanup proc's outstanding kaio */
1698 				cleaned +=
1699 				    (*aio_cleanup_dr_delete_memory)(procp);
1700 			}
1701 			mutex_exit(&procp->p_lock);
1702 		}
1703 		mutex_exit(&pidlock);
		/*
		 * Pause if this scan cleaned nothing, or on the
		 * DR_AIO_CLEANUP_MAXLOOPS_NODELAY'th scan that did clean
		 * something since the last pause (n counts only those).
		 */
1704 		if ((*pcancel == 0) &&
1705 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1706 			/* delay a bit before retrying all procs again */
1707 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1708 			n = 0;
1709 		}
1710 	} while (*pcancel == 0);
	/* Record that this helper has finished before exiting. */
1711 	mhp->mh_aio_cleanup_done = 1;
1712 	thread_exit();
1713 }
1714 
1715 static void
1716 delete_memory_thread(caddr_t amhp)
1717 {
1718 	struct mem_handle *mhp;
1719 	struct memdelspan *mdsp;
1720 	callb_cpr_t cprinfo;
1721 	page_t *pp_targ;
1722 	spgcnt_t freemem_left;
1723 	void (*del_complete_funcp)(void *, int error);
1724 	void *del_complete_arg;
1725 	int comp_code;
1726 	int ret;
1727 	int first_scan;
1728 	uint_t szc;
1729 #ifdef MEM_DEL_STATS
1730 	uint64_t start_total, ntick_total;
1731 	uint64_t start_pgrp, ntick_pgrp;
1732 #endif /* MEM_DEL_STATS */
1733 
1734 	mhp = (struct mem_handle *)amhp;
1735 
1736 #ifdef MEM_DEL_STATS
1737 	start_total = ddi_get_lbolt();
1738 #endif /* MEM_DEL_STATS */
1739 
1740 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1741 	    callb_generic_cpr, "memdel");
1742 
1743 	mutex_enter(&mhp->mh_mutex);
1744 	ASSERT(mhp->mh_state == MHND_STARTING);
1745 
1746 	mhp->mh_state = MHND_RUNNING;
1747 	mhp->mh_thread_id = curthread;
1748 
1749 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1750 	mutex_exit(&mhp->mh_mutex);
1751 
1752 	/* Allocate the remap pages now, if necessary. */
1753 	memseg_remap_init();
1754 
1755 	/*
1756 	 * Subtract from availrmem now if possible as availrmem
1757 	 * may not be available by the end of the delete.
1758 	 */
1759 	if (!get_availrmem(mhp->mh_vm_pages)) {
1760 		comp_code = KPHYSM_ENOTVIABLE;
1761 		mutex_enter(&mhp->mh_mutex);
1762 		goto early_exit;
1763 	}
1764 
1765 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1766 
1767 	mutex_enter(&mhp->mh_mutex);
1768 
1769 	if (ret != 0) {
1770 		mhp->mh_cancel = KPHYSM_EREFUSED;
1771 		goto refused;
1772 	}
1773 
1774 	transit_list_collect(mhp, 1);
1775 
1776 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1777 	    mdsp = mdsp->mds_next) {
1778 		ASSERT(mdsp->mds_bitmap == NULL);
1779 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1780 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1781 		    KM_SLEEP);
1782 	}
1783 
1784 	first_scan = 1;
1785 	freemem_left = 0;
1786 	/*
1787 	 * Start dr_aio_cleanup_thread, which periodically iterates
1788 	 * through the process list and invokes aio cleanup.  This
1789 	 * is needed in order to avoid a deadly embrace between the
1790 	 * delete_memory_thread (waiting on writer lock for page, with the
1791 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1792 	 * reader lock on the same page that is wanted by the
1793 	 * delete_memory_thread), and threads waiting for kaio completion
1794 	 * (blocked on spt_amp->lock).
1795 	 */
1796 	mhp->mh_dr_aio_cleanup_cancel = 0;
1797 	mhp->mh_aio_cleanup_done = 0;
1798 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1799 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1800 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1801 		pgcnt_t collected;
1802 
1803 		MDSTAT_INCR(mhp, nloop);
1804 		collected = 0;
1805 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1806 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1807 			pfn_t pfn, p_end;
1808 
1809 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1810 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1811 			    (mhp->mh_cancel == 0); pfn++) {
1812 				page_t *pp, *tpp, *tpp_targ;
1813 				pgcnt_t bit;
1814 				struct vnode *vp;
1815 				u_offset_t offset;
1816 				int mod, result;
1817 				spgcnt_t pgcnt;
1818 
1819 				bit = pfn - mdsp->mds_base;
1820 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1821 				    (1 << (bit % NBPBMW))) != 0) {
1822 					MDSTAT_INCR(mhp, already_done);
1823 					continue;
1824 				}
1825 				if (freemem_left == 0) {
1826 					freemem_left += delthr_get_freemem(mhp);
1827 					if (freemem_left == 0)
1828 						break;
1829 				}
1830 
1831 				/*
1832 				 * Release mh_mutex - some of this
1833 				 * stuff takes some time (eg PUTPAGE).
1834 				 */
1835 
1836 				mutex_exit(&mhp->mh_mutex);
1837 				MDSTAT_INCR(mhp, ncheck);
1838 
1839 				pp = page_numtopp_nolock(pfn);
1840 				if (pp == NULL) {
1841 					/*
1842 					 * Not covered by a page_t - will
1843 					 * be dealt with elsewhere.
1844 					 */
1845 					MDSTAT_INCR(mhp, nopaget);
1846 					mutex_enter(&mhp->mh_mutex);
1847 					mdsp->mds_bitmap[bit / NBPBMW] |=
1848 					    (1 << (bit % NBPBMW));
1849 					continue;
1850 				}
1851 
1852 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1853 				    SE_EXCL_WANTED | SE_RETIRED)) {
1854 					/*
1855 					 * Page in use elsewhere.  Skip it.
1856 					 */
1857 					MDSTAT_INCR(mhp, lockfail);
1858 					mutex_enter(&mhp->mh_mutex);
1859 					continue;
1860 				}
1861 				/*
1862 				 * See if the cage expanded into the delete.
1863 				 * This can happen as we have to allow the
1864 				 * cage to expand.
1865 				 */
1866 				if (PP_ISNORELOC(pp)) {
1867 					page_unlock(pp);
1868 					mutex_enter(&mhp->mh_mutex);
1869 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1870 					break;
1871 				}
1872 				if (PP_RETIRED(pp)) {
1873 					/*
1874 					 * Page has been retired and is
1875 					 * not part of the cage so we
1876 					 * can now do the accounting for
1877 					 * it.
1878 					 */
1879 					MDSTAT_INCR(mhp, retired);
1880 					mutex_enter(&mhp->mh_mutex);
1881 					mdsp->mds_bitmap[bit / NBPBMW]
1882 					    |= (1 << (bit % NBPBMW));
1883 					mdsp->mds_bitmap_retired[bit /
1884 					    NBPBMW] |=
1885 					    (1 << (bit % NBPBMW));
1886 					mhp->mh_hold_todo--;
1887 					continue;
1888 				}
1889 				ASSERT(freemem_left != 0);
1890 				if (PP_ISFREE(pp)) {
1891 					/*
1892 					 * Like page_reclaim() only 'freemem'
1893 					 * processing is already done.
1894 					 */
1895 					MDSTAT_INCR(mhp, nfree);
1896 				free_page_collect:
1897 					if (PP_ISAGED(pp)) {
1898 						page_list_sub(pp,
1899 						    PG_FREE_LIST);
1900 					} else {
1901 						page_list_sub(pp,
1902 						    PG_CACHE_LIST);
1903 					}
1904 					PP_CLRFREE(pp);
1905 					PP_CLRAGED(pp);
1906 					collected++;
1907 					mutex_enter(&mhp->mh_mutex);
1908 					page_delete_collect(pp, mhp);
1909 					mdsp->mds_bitmap[bit / NBPBMW] |=
1910 					    (1 << (bit % NBPBMW));
1911 					freemem_left--;
1912 					continue;
1913 				}
1914 				ASSERT(pp->p_vnode != NULL);
1915 				if (first_scan) {
1916 					MDSTAT_INCR(mhp, first_notfree);
1917 					page_unlock(pp);
1918 					mutex_enter(&mhp->mh_mutex);
1919 					continue;
1920 				}
1921 				/*
1922 				 * Keep stats on pages encountered that
1923 				 * are marked for retirement.
1924 				 */
1925 				if (PP_TOXIC(pp)) {
1926 					MDSTAT_INCR(mhp, toxic);
1927 				} else if (PP_PR_REQ(pp)) {
1928 					MDSTAT_INCR(mhp, failing);
1929 				}
1930 				/*
1931 				 * In certain cases below, special exceptions
1932 				 * are made for pages that are toxic.  This
1933 				 * is because the current meaning of toxic
1934 				 * is that an uncorrectable error has been
1935 				 * previously associated with the page.
1936 				 */
1937 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1938 					if (!PP_TOXIC(pp)) {
1939 						/*
1940 						 * Must relocate locked in
1941 						 * memory pages.
1942 						 */
1943 #ifdef MEM_DEL_STATS
1944 						start_pgrp = ddi_get_lbolt();
1945 #endif /* MEM_DEL_STATS */
1946 						/*
1947 						 * Lock all constituent pages
1948 						 * of a large page to ensure
1949 						 * that p_szc won't change.
1950 						 */
1951 						if (!group_page_trylock(pp,
1952 						    SE_EXCL)) {
1953 							MDSTAT_INCR(mhp,
1954 							    gptllckfail);
1955 							page_unlock(pp);
1956 							mutex_enter(
1957 							    &mhp->mh_mutex);
1958 							continue;
1959 						}
1960 						MDSTAT_INCR(mhp, npplocked);
1961 						pp_targ =
1962 						    page_get_replacement_page(
1963 						    pp, NULL, 0);
1964 						if (pp_targ != NULL) {
1965 #ifdef MEM_DEL_STATS
1966 							ntick_pgrp =
1967 							    (uint64_t)
1968 							    ddi_get_lbolt() -
1969 							    start_pgrp;
1970 #endif /* MEM_DEL_STATS */
1971 							MDSTAT_PGRP(mhp,
1972 							    ntick_pgrp);
1973 							MDSTAT_INCR(mhp,
1974 							    nlockreloc);
1975 							goto reloc;
1976 						}
1977 						group_page_unlock(pp);
1978 						page_unlock(pp);
1979 #ifdef MEM_DEL_STATS
1980 						ntick_pgrp =
1981 						    (uint64_t)ddi_get_lbolt() -
1982 						    start_pgrp;
1983 #endif /* MEM_DEL_STATS */
1984 						MDSTAT_PGRP(mhp, ntick_pgrp);
1985 						MDSTAT_INCR(mhp, nnorepl);
1986 						mutex_enter(&mhp->mh_mutex);
1987 						continue;
1988 					} else {
1989 						/*
1990 						 * Cannot do anything about
1991 						 * this page because it is
1992 						 * toxic.
1993 						 */
1994 						MDSTAT_INCR(mhp, npplkdtoxic);
1995 						page_unlock(pp);
1996 						mutex_enter(&mhp->mh_mutex);
1997 						continue;
1998 					}
1999 				}
2000 				/*
2001 				 * Unload the mappings and check if mod bit
2002 				 * is set.
2003 				 */
2004 				ASSERT(!PP_ISKAS(pp));
2005 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2006 				mod = hat_ismod(pp);
2007 
2008 #ifdef MEM_DEL_STATS
2009 				start_pgrp = ddi_get_lbolt();
2010 #endif /* MEM_DEL_STATS */
2011 				if (mod && !PP_TOXIC(pp)) {
2012 					/*
2013 					 * Lock all constituent pages
2014 					 * of a large page to ensure
2015 					 * that p_szc won't change.
2016 					 */
2017 					if (!group_page_trylock(pp, SE_EXCL)) {
2018 						MDSTAT_INCR(mhp, gptlmodfail);
2019 						page_unlock(pp);
2020 						mutex_enter(&mhp->mh_mutex);
2021 						continue;
2022 					}
2023 					pp_targ = page_get_replacement_page(pp,
2024 					    NULL, 0);
2025 					if (pp_targ != NULL) {
2026 						MDSTAT_INCR(mhp, nmodreloc);
2027 #ifdef MEM_DEL_STATS
2028 						ntick_pgrp =
2029 						    (uint64_t)ddi_get_lbolt() -
2030 						    start_pgrp;
2031 #endif /* MEM_DEL_STATS */
2032 						MDSTAT_PGRP(mhp, ntick_pgrp);
2033 						goto reloc;
2034 					}
2035 					group_page_unlock(pp);
2036 				}
2037 
2038 				if (!page_try_demote_pages(pp)) {
2039 					MDSTAT_INCR(mhp, demotefail);
2040 					page_unlock(pp);
2041 #ifdef MEM_DEL_STATS
2042 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2043 					    start_pgrp;
2044 #endif /* MEM_DEL_STATS */
2045 					MDSTAT_PGRP(mhp, ntick_pgrp);
2046 					mutex_enter(&mhp->mh_mutex);
2047 					continue;
2048 				}
2049 
2050 				/*
2051 				 * Regular 'page-out'.
2052 				 */
2053 				if (!mod) {
2054 					MDSTAT_INCR(mhp, ndestroy);
2055 					page_destroy(pp, 1);
2056 					/*
2057 					 * page_destroy was called with
2058 					 * dontfree. As long as p_lckcnt
2059 					 * and p_cowcnt are both zero, the
2060 					 * only additional action of
2061 					 * page_destroy with !dontfree is to
2062 					 * call page_free, so we can collect
2063 					 * the page here.
2064 					 */
2065 					collected++;
2066 #ifdef MEM_DEL_STATS
2067 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2068 					    start_pgrp;
2069 #endif /* MEM_DEL_STATS */
2070 					MDSTAT_PGRP(mhp, ntick_pgrp);
2071 					mutex_enter(&mhp->mh_mutex);
2072 					page_delete_collect(pp, mhp);
2073 					mdsp->mds_bitmap[bit / NBPBMW] |=
2074 					    (1 << (bit % NBPBMW));
2075 					continue;
2076 				}
2077 				/*
2078 				 * The page is toxic and the mod bit is
2079 				 * set, we cannot do anything here to deal
2080 				 * with it.
2081 				 */
2082 				if (PP_TOXIC(pp)) {
2083 					page_unlock(pp);
2084 #ifdef MEM_DEL_STATS
2085 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2086 					    start_pgrp;
2087 #endif /* MEM_DEL_STATS */
2088 					MDSTAT_PGRP(mhp, ntick_pgrp);
2089 					MDSTAT_INCR(mhp, modtoxic);
2090 					mutex_enter(&mhp->mh_mutex);
2091 					continue;
2092 				}
2093 				MDSTAT_INCR(mhp, nputpage);
2094 				vp = pp->p_vnode;
2095 				offset = pp->p_offset;
2096 				VN_HOLD(vp);
2097 				page_unlock(pp);
2098 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2099 				    B_INVAL|B_FORCE, kcred, NULL);
2100 				VN_RELE(vp);
2101 #ifdef MEM_DEL_STATS
2102 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2103 				    start_pgrp;
2104 #endif /* MEM_DEL_STATS */
2105 				MDSTAT_PGRP(mhp, ntick_pgrp);
2106 				/*
2107 				 * Try to get the page back immediately
2108 				 * so that it can be collected.
2109 				 */
2110 				pp = page_numtopp_nolock(pfn);
2111 				if (pp == NULL) {
2112 					MDSTAT_INCR(mhp, nnoreclaim);
2113 					/*
2114 					 * This should not happen as this
2115 					 * thread is deleting the page.
2116 					 * If this code is generalized, this
2117 					 * becomes a reality.
2118 					 */
2119 #ifdef DEBUG
2120 					cmn_err(CE_WARN,
2121 					    "delete_memory_thread(0x%p) "
2122 					    "pfn 0x%lx has no page_t",
2123 					    (void *)mhp, pfn);
2124 #endif /* DEBUG */
2125 					mutex_enter(&mhp->mh_mutex);
2126 					continue;
2127 				}
2128 				if (page_try_reclaim_lock(pp, SE_EXCL,
2129 				    SE_EXCL_WANTED | SE_RETIRED)) {
2130 					if (PP_ISFREE(pp)) {
2131 						goto free_page_collect;
2132 					}
2133 					page_unlock(pp);
2134 				}
2135 				MDSTAT_INCR(mhp, nnoreclaim);
2136 				mutex_enter(&mhp->mh_mutex);
2137 				continue;
2138 
2139 			reloc:
2140 				/*
2141 				 * Got some freemem and a target
2142 				 * page, so move the data to avoid
2143 				 * I/O and lock problems.
2144 				 */
2145 				ASSERT(!page_iolock_assert(pp));
2146 				MDSTAT_INCR(mhp, nreloc);
2147 				/*
2148 				 * page_relocate() will return pgcnt: the
2149 				 * number of consecutive pages relocated.
2150 				 * If it is successful, pp will be a
2151 				 * linked list of the page structs that
2152 				 * were relocated. If page_relocate() is
2153 				 * unsuccessful, pp will be unmodified.
2154 				 */
2155 #ifdef MEM_DEL_STATS
2156 				start_pgrp = ddi_get_lbolt();
2157 #endif /* MEM_DEL_STATS */
2158 				result = page_relocate(&pp, &pp_targ, 0, 0,
2159 				    &pgcnt, NULL);
2160 #ifdef MEM_DEL_STATS
2161 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2162 				    start_pgrp;
2163 #endif /* MEM_DEL_STATS */
2164 				MDSTAT_PGRP(mhp, ntick_pgrp);
2165 				if (result != 0) {
2166 					MDSTAT_INCR(mhp, nrelocfail);
2167 					/*
2168 					 * We did not succeed. We need
2169 					 * to give the pp_targ pages back.
2170 					 * page_free(pp_targ, 1) without
2171 					 * the freemem accounting.
2172 					 */
2173 					group_page_unlock(pp);
2174 					page_free_replacement_page(pp_targ);
2175 					page_unlock(pp);
2176 					mutex_enter(&mhp->mh_mutex);
2177 					continue;
2178 				}
2179 
2180 				/*
2181 				 * We will then collect pgcnt pages.
2182 				 */
2183 				ASSERT(pgcnt > 0);
2184 				mutex_enter(&mhp->mh_mutex);
2185 				/*
2186 				 * We need to make sure freemem_left is
2187 				 * large enough.
2188 				 */
2189 				while ((freemem_left < pgcnt) &&
2190 				    (!mhp->mh_cancel)) {
2191 					freemem_left +=
2192 					    delthr_get_freemem(mhp);
2193 				}
2194 
2195 				/*
2196 				 * Do not proceed if mh_cancel is set.
2197 				 */
2198 				if (mhp->mh_cancel) {
2199 					while (pp_targ != NULL) {
2200 						/*
2201 						 * Unlink and unlock each page.
2202 						 */
2203 						tpp_targ = pp_targ;
2204 						page_sub(&pp_targ, tpp_targ);
2205 						page_unlock(tpp_targ);
2206 					}
2207 					/*
2208 					 * We need to give the pp pages back.
2209 					 * page_free(pp, 1) without the
2210 					 * freemem accounting.
2211 					 */
2212 					page_free_replacement_page(pp);
2213 					break;
2214 				}
2215 
2216 				/* Now remove pgcnt from freemem_left */
2217 				freemem_left -= pgcnt;
2218 				ASSERT(freemem_left >= 0);
2219 				szc = pp->p_szc;
2220 				while (pp != NULL) {
2221 					/*
2222 					 * pp and pp_targ were passed back as
2223 					 * a linked list of pages.
2224 					 * Unlink and unlock each page.
2225 					 */
2226 					tpp_targ = pp_targ;
2227 					page_sub(&pp_targ, tpp_targ);
2228 					page_unlock(tpp_targ);
2229 					/*
2230 					 * The original page is now free
2231 					 * so remove it from the linked
2232 					 * list and collect it.
2233 					 */
2234 					tpp = pp;
2235 					page_sub(&pp, tpp);
2236 					pfn = page_pptonum(tpp);
2237 					collected++;
2238 					ASSERT(PAGE_EXCL(tpp));
2239 					ASSERT(tpp->p_vnode == NULL);
2240 					ASSERT(!hat_page_is_mapped(tpp));
2241 					ASSERT(tpp->p_szc == szc);
2242 					tpp->p_szc = 0;
2243 					page_delete_collect(tpp, mhp);
2244 					bit = pfn - mdsp->mds_base;
2245 					mdsp->mds_bitmap[bit / NBPBMW] |=
2246 					    (1 << (bit % NBPBMW));
2247 				}
2248 				ASSERT(pp_targ == NULL);
2249 			}
2250 		}
2251 		first_scan = 0;
2252 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2253 		    (collected == 0)) {
2254 			/*
2255 			 * This code is needed as we cannot wait
2256 			 * for a page to be locked OR the delete to
2257 			 * be cancelled.  Also, we must delay so
2258 			 * that other threads get a chance to run
2259 			 * on our cpu, otherwise page locks may be
2260 			 * held indefinitely by those threads.
2261 			 */
2262 			MDSTAT_INCR(mhp, ndelay);
2263 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2264 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2265 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2266 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2267 		}
2268 	}
2269 	/* stop the dr aio cleanup thread */
2270 	mhp->mh_dr_aio_cleanup_cancel = 1;
2271 	transit_list_collect(mhp, 0);
2272 	if (freemem_left != 0) {
2273 		/* Return any surplus. */
2274 		page_create_putback(freemem_left);
2275 		freemem_left = 0;
2276 	}
2277 #ifdef MEM_DEL_STATS
2278 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2279 #endif /* MEM_DEL_STATS */
2280 	MDSTAT_TOTAL(mhp, ntick_total);
2281 	MDSTAT_PRINT(mhp);
2282 
2283 	/*
2284 	 * If the memory delete was cancelled, exclusive-wanted bits must
2285 	 * be cleared. If there are retired pages being deleted, they need
2286 	 * to be unretired.
2287 	 */
2288 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2289 	    mdsp = mdsp->mds_next) {
2290 		pfn_t pfn, p_end;
2291 
2292 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2293 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2294 			page_t *pp;
2295 			pgcnt_t bit;
2296 
2297 			bit = pfn - mdsp->mds_base;
2298 			if (mhp->mh_cancel) {
2299 				pp = page_numtopp_nolock(pfn);
2300 				if (pp != NULL) {
2301 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2302 					    (1 << (bit % NBPBMW))) == 0) {
2303 						page_lock_clr_exclwanted(pp);
2304 					}
2305 				}
2306 			} else {
2307 				pp = NULL;
2308 			}
2309 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2310 			    (1 << (bit % NBPBMW))) != 0) {
2311 				/* do we already have pp? */
2312 				if (pp == NULL) {
2313 					pp = page_numtopp_nolock(pfn);
2314 				}
2315 				ASSERT(pp != NULL);
2316 				ASSERT(PP_RETIRED(pp));
2317 				if (mhp->mh_cancel != 0) {
2318 					page_unlock(pp);
2319 					/*
2320 					 * To satisfy ASSERT below in
2321 					 * cancel code.
2322 					 */
2323 					mhp->mh_hold_todo++;
2324 				} else {
2325 					(void) page_unretire_pp(pp,
2326 					    PR_UNR_CLEAN);
2327 				}
2328 			}
2329 		}
2330 	}
2331 	/*
2332 	 * Free retired page bitmap and collected page bitmap
2333 	 */
2334 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2335 	    mdsp = mdsp->mds_next) {
2336 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2337 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2338 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2339 		ASSERT(mdsp->mds_bitmap != NULL);
2340 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2341 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2342 	}
2343 
2344 	/* wait for our dr aio cancel thread to exit */
2345 	while (!(mhp->mh_aio_cleanup_done)) {
2346 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2347 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2348 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2349 	}
2350 refused:
2351 	if (mhp->mh_cancel != 0) {
2352 		page_t *pp;
2353 
2354 		comp_code = mhp->mh_cancel;
2355 		/*
2356 		 * Go through list of deleted pages (mh_deleted) freeing
2357 		 * them.
2358 		 */
2359 		while ((pp = mhp->mh_deleted) != NULL) {
2360 			mhp->mh_deleted = pp->p_next;
2361 			mhp->mh_hold_todo++;
2362 			mutex_exit(&mhp->mh_mutex);
2363 			/* Restore p_next. */
2364 			pp->p_next = pp->p_prev;
2365 			if (PP_ISFREE(pp)) {
2366 				cmn_err(CE_PANIC,
2367 				    "page %p is free",
2368 				    (void *)pp);
2369 			}
2370 			page_free(pp, 1);
2371 			mutex_enter(&mhp->mh_mutex);
2372 		}
2373 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2374 
2375 		mutex_exit(&mhp->mh_mutex);
2376 		put_availrmem(mhp->mh_vm_pages);
2377 		mutex_enter(&mhp->mh_mutex);
2378 
2379 		goto t_exit;
2380 	}
2381 
2382 	/*
2383 	 * All the pages are no longer in use and are exclusively locked.
2384 	 */
2385 
2386 	mhp->mh_deleted = NULL;
2387 
2388 	kphysm_del_cleanup(mhp);
2389 
2390 	/*
2391 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
2392 	 * that the mem_node_config[] will remain intact for the cleanup.
2393 	 */
2394 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2395 	    mdsp = mdsp->mds_next) {
2396 		mem_node_del_range(mdsp->mds_base,
2397 		    mdsp->mds_base + mdsp->mds_npgs - 1);
2398 	}
2399 	/* cleanup the page counters */
2400 	page_ctrs_cleanup();
2401 
2402 	comp_code = KPHYSM_OK;
2403 
2404 t_exit:
2405 	mutex_exit(&mhp->mh_mutex);
2406 	kphysm_setup_post_del(mhp->mh_vm_pages,
2407 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2408 	mutex_enter(&mhp->mh_mutex);
2409 
2410 early_exit:
2411 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2412 	mhp->mh_state = MHND_DONE;
2413 	del_complete_funcp = mhp->mh_delete_complete;
2414 	del_complete_arg = mhp->mh_delete_complete_arg;
2415 	CALLB_CPR_EXIT(&cprinfo);
2416 	(*del_complete_funcp)(del_complete_arg, comp_code);
2417 	thread_exit();
2418 	/*NOTREACHED*/
2419 }
2420 
2421 /*
2422  * Start the delete of the memory from the system.
2423  */
2424 int
2425 kphysm_del_start(
2426 	memhandle_t handle,
2427 	void (*complete)(void *, int),
2428 	void *complete_arg)
2429 {
2430 	struct mem_handle *mhp;
2431 
2432 	mhp = kphysm_lookup_mem_handle(handle);
2433 	if (mhp == NULL) {
2434 		return (KPHYSM_EHANDLE);
2435 	}
2436 	switch (mhp->mh_state) {
2437 	case MHND_FREE:
2438 		ASSERT(mhp->mh_state != MHND_FREE);
2439 		mutex_exit(&mhp->mh_mutex);
2440 		return (KPHYSM_EHANDLE);
2441 	case MHND_INIT:
2442 		break;
2443 	case MHND_STARTING:
2444 	case MHND_RUNNING:
2445 		mutex_exit(&mhp->mh_mutex);
2446 		return (KPHYSM_ESEQUENCE);
2447 	case MHND_DONE:
2448 		mutex_exit(&mhp->mh_mutex);
2449 		return (KPHYSM_ESEQUENCE);
2450 	case MHND_RELEASE:
2451 		mutex_exit(&mhp->mh_mutex);
2452 		return (KPHYSM_ESEQUENCE);
2453 	default:
2454 #ifdef DEBUG
2455 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2456 		    (void *)mhp, mhp->mh_state);
2457 #endif /* DEBUG */
2458 		mutex_exit(&mhp->mh_mutex);
2459 		return (KPHYSM_EHANDLE);
2460 	}
2461 
2462 	if (mhp->mh_transit.trl_spans == NULL) {
2463 		mutex_exit(&mhp->mh_mutex);
2464 		return (KPHYSM_ENOWORK);
2465 	}
2466 
2467 	ASSERT(complete != NULL);
2468 	mhp->mh_delete_complete = complete;
2469 	mhp->mh_delete_complete_arg = complete_arg;
2470 	mhp->mh_state = MHND_STARTING;
2471 	/*
2472 	 * Release the mutex in case thread_create sleeps.
2473 	 */
2474 	mutex_exit(&mhp->mh_mutex);
2475 
2476 	/*
2477 	 * The "obvious" process for this thread is pageout (proc_pageout)
2478 	 * but this gives the thread too much power over freemem
2479 	 * which results in freemem starvation.
2480 	 */
2481 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2482 	    TS_RUN, maxclsyspri - 1);
2483 
2484 	return (KPHYSM_OK);
2485 }
2486 
2487 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2488 static caddr_t pp_dummy;
2489 static pgcnt_t pp_dummy_npages;
2490 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2491 
2492 static void
2493 memseg_remap_init_pages(page_t *pages, page_t *epages)
2494 {
2495 	page_t *pp;
2496 
2497 	for (pp = pages; pp < epages; pp++) {
2498 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2499 		pp->p_offset = (u_offset_t)-1;
2500 		page_iolock_init(pp);
2501 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2502 			continue;
2503 		page_lock_delete(pp);
2504 	}
2505 }
2506 
2507 void
2508 memseg_remap_init()
2509 {
2510 	mutex_enter(&pp_dummy_lock);
2511 	if (pp_dummy == NULL) {
2512 		uint_t dpages;
2513 		int i;
2514 
2515 		/*
2516 		 * dpages starts off as the size of the structure and
2517 		 * ends up as the minimum number of pages that will
2518 		 * hold a whole number of page_t structures.
2519 		 */
2520 		dpages = sizeof (page_t);
2521 		ASSERT(dpages != 0);
2522 		ASSERT(dpages <= MMU_PAGESIZE);
2523 
2524 		while ((dpages & 1) == 0)
2525 			dpages >>= 1;
2526 
2527 		pp_dummy_npages = dpages;
2528 		/*
2529 		 * Allocate pp_dummy pages directly from static_arena,
2530 		 * since these are whole page allocations and are
2531 		 * referenced by physical address.  This also has the
2532 		 * nice fringe benefit of hiding the memory from
2533 		 * ::findleaks since it doesn't deal well with allocated
2534 		 * kernel heap memory that doesn't have any mappings.
2535 		 */
2536 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2537 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2538 		bzero(pp_dummy, ptob(pp_dummy_npages));
2539 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2540 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2541 		    pp_dummy_npages, KM_SLEEP);
2542 		for (i = 0; i < pp_dummy_npages; i++) {
2543 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2544 			    &pp_dummy[MMU_PAGESIZE * i]);
2545 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2546 		}
2547 		/*
2548 		 * Initialize the page_t's to a known 'deleted' state
2549 		 * that matches the state of deleted pages.
2550 		 */
2551 		memseg_remap_init_pages((page_t *)pp_dummy,
2552 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2553 		/* Remove kmem mappings for the pages for safety. */
2554 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2555 		    HAT_UNLOAD_UNLOCK);
2556 		/* Leave pp_dummy pointer set as flag that init is done. */
2557 	}
2558 	mutex_exit(&pp_dummy_lock);
2559 }
2560 
2561 /*
2562  * Remap a page-aglined range of page_t's to dummy pages.
2563  */
2564 void
2565 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2566 {
2567 	int phase;
2568 
2569 	ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2570 
2571 	/*
2572 	 * We may start remapping at a non-zero page offset
2573 	 * within the dummy pages since the low/high ends
2574 	 * of the outgoing pp's could be shared by other
2575 	 * memsegs (see memseg_remap_meta).
2576 	 */
2577 	phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2578 	/*CONSTCOND*/
2579 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2580 
2581 	while (metapgs != 0) {
2582 		pgcnt_t n;
2583 		int i, j;
2584 
2585 		n = pp_dummy_npages;
2586 		if (n > metapgs)
2587 			n = metapgs;
2588 		for (i = 0; i < n; i++) {
2589 			j = (i + phase) % pp_dummy_npages;
2590 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2591 			    PROT_READ,
2592 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2593 			    HAT_LOAD_REMAP);
2594 			va += ptob(1);
2595 		}
2596 		metapgs -= n;
2597 	}
2598 }
2599 
2600 static void
2601 memseg_remap_to_dummy(struct memseg *seg)
2602 {
2603 	caddr_t pp;
2604 	pgcnt_t metapgs;
2605 
2606 	ASSERT(memseg_is_dynamic(seg));
2607 	ASSERT(pp_dummy != NULL);
2608 
2609 
2610 	if (!memseg_includes_meta(seg)) {
2611 		memseg_remap_meta(seg);
2612 		return;
2613 	}
2614 
2615 	pp = (caddr_t)seg->pages;
2616 	metapgs = seg->pages_base - memseg_get_start(seg);
2617 	ASSERT(metapgs != 0);
2618 
2619 	seg->pages_end = seg->pages_base;
2620 
2621 	remap_to_dummy(pp, metapgs);
2622 }
2623 
2624 /*
2625  * Transition all the deleted pages to the deleted state so that
2626  * page_lock will not wait. The page_lock_delete call will
2627  * also wake up any waiters.
2628  */
2629 static void
2630 memseg_lock_delete_all(struct memseg *seg)
2631 {
2632 	page_t *pp;
2633 
2634 	for (pp = seg->pages; pp < seg->epages; pp++) {
2635 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2636 		page_lock_delete(pp);
2637 	}
2638 }
2639 
2640 static void
2641 kphysm_del_cleanup(struct mem_handle *mhp)
2642 {
2643 	struct memdelspan	*mdsp;
2644 	struct memseg		*seg;
2645 	struct memseg   	**segpp;
2646 	struct memseg		*seglist;
2647 	pfn_t			p_end;
2648 	uint64_t		avmem;
2649 	pgcnt_t			avpgs;
2650 	pgcnt_t			npgs;
2651 
2652 	avpgs = mhp->mh_vm_pages;
2653 
2654 	memsegs_lock(1);
2655 
2656 	/*
2657 	 * remove from main segment list.
2658 	 */
2659 	npgs = 0;
2660 	seglist = NULL;
2661 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2662 	    mdsp = mdsp->mds_next) {
2663 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2664 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2665 			if (seg->pages_base >= p_end ||
2666 			    seg->pages_end <= mdsp->mds_base) {
2667 				/* Span and memseg don't overlap. */
2668 				segpp = &((*segpp)->next);
2669 				continue;
2670 			}
2671 			ASSERT(seg->pages_base >= mdsp->mds_base);
2672 			ASSERT(seg->pages_end <= p_end);
2673 
2674 			PLCNT_MODIFY_MAX(seg->pages_base,
2675 			    seg->pages_base - seg->pages_end);
2676 
2677 			/* Hide the memseg from future scans. */
2678 			hat_kpm_delmem_mseg_update(seg, segpp);
2679 			*segpp = seg->next;
2680 			membar_producer();	/* TODO: Needed? */
2681 			npgs += MSEG_NPAGES(seg);
2682 
2683 			/*
2684 			 * Leave the deleted segment's next pointer intact
2685 			 * in case a memsegs scanning loop is walking this
2686 			 * segment concurrently.
2687 			 */
2688 			seg->lnext = seglist;
2689 			seglist = seg;
2690 		}
2691 	}
2692 
2693 	build_pfn_hash();
2694 
2695 	ASSERT(npgs < total_pages);
2696 	total_pages -= npgs;
2697 
2698 	/*
2699 	 * Recalculate the paging parameters now total_pages has changed.
2700 	 * This will also cause the clock hands to be reset before next use.
2701 	 */
2702 	setupclock(1);
2703 
2704 	memsegs_unlock(1);
2705 
2706 	mutex_exit(&mhp->mh_mutex);
2707 
2708 	while ((seg = seglist) != NULL) {
2709 		pfn_t mseg_start;
2710 		pfn_t mseg_base, mseg_end;
2711 		pgcnt_t mseg_npgs;
2712 		int mlret;
2713 
2714 		seglist = seg->lnext;
2715 
2716 		/*
2717 		 * Put the page_t's into the deleted state to stop
2718 		 * cv_wait()s on the pages. When we remap, the dummy
2719 		 * page_t's will be in the same state.
2720 		 */
2721 		memseg_lock_delete_all(seg);
2722 		/*
2723 		 * Collect up information based on pages_base and pages_end
2724 		 * early so that we can flag early that the memseg has been
2725 		 * deleted by setting pages_end == pages_base.
2726 		 */
2727 		mseg_base = seg->pages_base;
2728 		mseg_end = seg->pages_end;
2729 		mseg_npgs = MSEG_NPAGES(seg);
2730 		mseg_start = memseg_get_start(seg);
2731 
2732 		if (memseg_is_dynamic(seg)) {
2733 			/* Remap the meta data to our special dummy area. */
2734 			memseg_remap_to_dummy(seg);
2735 
2736 			mutex_enter(&memseg_lists_lock);
2737 			seg->lnext = memseg_va_avail;
2738 			memseg_va_avail = seg;
2739 			mutex_exit(&memseg_lists_lock);
2740 		} else {
2741 			/*
2742 			 * For memory whose page_ts were allocated
2743 			 * at boot, we need to find a new use for
2744 			 * the page_t memory.
2745 			 * For the moment, just leak it.
2746 			 * (It is held in the memseg_delete_junk list.)
2747 			 */
2748 			seg->pages_end = seg->pages_base;
2749 
2750 			mutex_enter(&memseg_lists_lock);
2751 			seg->lnext = memseg_delete_junk;
2752 			memseg_delete_junk = seg;
2753 			mutex_exit(&memseg_lists_lock);
2754 		}
2755 
2756 		/* Must not use seg now as it could be re-used. */
2757 
2758 		memlist_write_lock();
2759 
2760 		mlret = memlist_delete_span(
2761 		    (uint64_t)(mseg_base) << PAGESHIFT,
2762 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2763 		    &phys_avail);
2764 		ASSERT(mlret == MEML_SPANOP_OK);
2765 
2766 		mlret = memlist_delete_span(
2767 		    (uint64_t)(mseg_start) << PAGESHIFT,
2768 		    (uint64_t)(mseg_end - mseg_start) <<
2769 		    PAGESHIFT,
2770 		    &phys_install);
2771 		ASSERT(mlret == MEML_SPANOP_OK);
2772 		phys_install_has_changed();
2773 
2774 		memlist_write_unlock();
2775 	}
2776 
2777 	memlist_read_lock();
2778 	installed_top_size(phys_install, &physmax, &physinstalled);
2779 	memlist_read_unlock();
2780 
2781 	mutex_enter(&freemem_lock);
2782 	maxmem -= avpgs;
2783 	physmem -= avpgs;
2784 	/* availrmem is adjusted during the delete. */
2785 	availrmem_initial -= avpgs;
2786 
2787 	mutex_exit(&freemem_lock);
2788 
2789 	dump_resize();
2790 
2791 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2792 	    "(0x%" PRIx64 ")\n",
2793 	    physinstalled << (PAGESHIFT - 10),
2794 	    (uint64_t)physinstalled << PAGESHIFT);
2795 
2796 	avmem = (uint64_t)freemem << PAGESHIFT;
2797 	cmn_err(CE_CONT, "?kphysm_delete: "
2798 	    "avail mem = %" PRId64 "\n", avmem);
2799 
2800 	/*
2801 	 * Update lgroup generation number on single lgroup systems
2802 	 */
2803 	if (nlgrps == 1)
2804 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2805 
2806 	/* Successfully deleted system memory */
2807 	mutex_enter(&mhp->mh_mutex);
2808 }
2809 
2810 static uint_t mdel_nullvp_waiter;
2811 
2812 static void
2813 page_delete_collect(
2814 	page_t *pp,
2815 	struct mem_handle *mhp)
2816 {
2817 	if (pp->p_vnode) {
2818 		page_hashout(pp, (kmutex_t *)NULL);
2819 		/* do not do PP_SETAGED(pp); */
2820 	} else {
2821 		kmutex_t *sep;
2822 
2823 		sep = page_se_mutex(pp);
2824 		mutex_enter(sep);
2825 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2826 			mdel_nullvp_waiter++;
2827 			cv_broadcast(&pp->p_cv);
2828 		}
2829 		mutex_exit(sep);
2830 	}
2831 	ASSERT(pp->p_next == pp->p_prev);
2832 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2833 	pp->p_next = mhp->mh_deleted;
2834 	mhp->mh_deleted = pp;
2835 	ASSERT(mhp->mh_hold_todo != 0);
2836 	mhp->mh_hold_todo--;
2837 }
2838 
2839 static void
2840 transit_list_collect(struct mem_handle *mhp, int v)
2841 {
2842 	struct transit_list_head *trh;
2843 
2844 	trh = &transit_list_head;
2845 	mutex_enter(&trh->trh_lock);
2846 	mhp->mh_transit.trl_collect = v;
2847 	mutex_exit(&trh->trh_lock);
2848 }
2849 
2850 static void
2851 transit_list_insert(struct transit_list *tlp)
2852 {
2853 	struct transit_list_head *trh;
2854 
2855 	trh = &transit_list_head;
2856 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2857 	tlp->trl_next = trh->trh_head;
2858 	trh->trh_head = tlp;
2859 }
2860 
2861 static void
2862 transit_list_remove(struct transit_list *tlp)
2863 {
2864 	struct transit_list_head *trh;
2865 	struct transit_list **tlpp;
2866 
2867 	trh = &transit_list_head;
2868 	tlpp = &trh->trh_head;
2869 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2870 	while (*tlpp != NULL && *tlpp != tlp)
2871 		tlpp = &(*tlpp)->trl_next;
2872 	ASSERT(*tlpp != NULL);
2873 	if (*tlpp == tlp)
2874 		*tlpp = tlp->trl_next;
2875 	tlp->trl_next = NULL;
2876 }
2877 
2878 static struct transit_list *
2879 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2880 {
2881 	struct transit_list *tlp;
2882 
2883 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2884 		struct memdelspan *mdsp;
2885 
2886 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2887 		    mdsp = mdsp->mds_next) {
2888 			if (pfnum >= mdsp->mds_base &&
2889 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2890 				return (tlp);
2891 			}
2892 		}
2893 	}
2894 	return (NULL);
2895 }
2896 
2897 int
2898 pfn_is_being_deleted(pfn_t pfnum)
2899 {
2900 	struct transit_list_head *trh;
2901 	struct transit_list *tlp;
2902 	int ret;
2903 
2904 	trh = &transit_list_head;
2905 	if (trh->trh_head == NULL)
2906 		return (0);
2907 
2908 	mutex_enter(&trh->trh_lock);
2909 	tlp = pfnum_to_transit_list(trh, pfnum);
2910 	ret = (tlp != NULL && tlp->trl_collect);
2911 	mutex_exit(&trh->trh_lock);
2912 
2913 	return (ret);
2914 }
2915 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.  Nothing is
 * printed unless mem_del_stat_print is set.  The heading identifies the
 * delete by the base and page count of the first span on the handle's
 * transit list; the two tick totals are also shown converted to minutes
 * and seconds using hz.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	struct memdelspan *first;
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	first = mhp->mh_transit.trl_spans;
	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)first->mds_base,
	    (uint_t)first->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	/* Simple event counters. */
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Elapsed time for the whole delete. */
	secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	/* Elapsed time spent in page_relocate(). */
	secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2969 
2970 struct mem_callback {
2971 	kphysm_setup_vector_t	*vec;
2972 	void			*arg;
2973 };
2974 
2975 #define	NMEMCALLBACKS		100
2976 
2977 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2978 static uint_t nmemcallbacks;
2979 static krwlock_t mem_callback_rwlock;
2980 
2981 int
2982 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2983 {
2984 	uint_t i, found;
2985 
2986 	/*
2987 	 * This test will become more complicated when the version must
2988 	 * change.
2989 	 */
2990 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2991 		return (EINVAL);
2992 
2993 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2994 	    vec->post_del == NULL)
2995 		return (EINVAL);
2996 
2997 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2998 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2999 		if (mem_callbacks[i].vec == NULL && found == 0)
3000 			found = i + 1;
3001 		if (mem_callbacks[i].vec == vec &&
3002 		    mem_callbacks[i].arg == arg) {
3003 #ifdef DEBUG
3004 			/* Catch this in DEBUG kernels. */
3005 			cmn_err(CE_WARN, "kphysm_setup_func_register"
3006 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
3007 			    (void *)vec, arg, (void *)caller());
3008 #endif /* DEBUG */
3009 			rw_exit(&mem_callback_rwlock);
3010 			return (EEXIST);
3011 		}
3012 	}
3013 	if (found != 0) {
3014 		i = found - 1;
3015 	} else {
3016 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
3017 		if (nmemcallbacks == NMEMCALLBACKS) {
3018 			rw_exit(&mem_callback_rwlock);
3019 			return (ENOMEM);
3020 		}
3021 		i = nmemcallbacks++;
3022 	}
3023 	mem_callbacks[i].vec = vec;
3024 	mem_callbacks[i].arg = arg;
3025 	rw_exit(&mem_callback_rwlock);
3026 	return (0);
3027 }
3028 
3029 void
3030 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3031 {
3032 	uint_t i;
3033 
3034 	rw_enter(&mem_callback_rwlock, RW_WRITER);
3035 	for (i = 0; i < nmemcallbacks; i++) {
3036 		if (mem_callbacks[i].vec == vec &&
3037 		    mem_callbacks[i].arg == arg) {
3038 			mem_callbacks[i].vec = NULL;
3039 			mem_callbacks[i].arg = NULL;
3040 			if (i == (nmemcallbacks - 1))
3041 				nmemcallbacks--;
3042 			break;
3043 		}
3044 	}
3045 	rw_exit(&mem_callback_rwlock);
3046 }
3047 
3048 static void
3049 kphysm_setup_post_add(pgcnt_t delta_pages)
3050 {
3051 	uint_t i;
3052 
3053 	rw_enter(&mem_callback_rwlock, RW_READER);
3054 	for (i = 0; i < nmemcallbacks; i++) {
3055 		if (mem_callbacks[i].vec != NULL) {
3056 			(*mem_callbacks[i].vec->post_add)
3057 			    (mem_callbacks[i].arg, delta_pages);
3058 		}
3059 	}
3060 	rw_exit(&mem_callback_rwlock);
3061 }
3062 
3063 /*
3064  * Note the locking between pre_del and post_del: The reader lock is held
3065  * between the two calls to stop the set of functions from changing.
3066  */
3067 
3068 static int
3069 kphysm_setup_pre_del(pgcnt_t delta_pages)
3070 {
3071 	uint_t i;
3072 	int ret;
3073 	int aret;
3074 
3075 	ret = 0;
3076 	rw_enter(&mem_callback_rwlock, RW_READER);
3077 	for (i = 0; i < nmemcallbacks; i++) {
3078 		if (mem_callbacks[i].vec != NULL) {
3079 			aret = (*mem_callbacks[i].vec->pre_del)
3080 			    (mem_callbacks[i].arg, delta_pages);
3081 			ret |= aret;
3082 		}
3083 	}
3084 
3085 	return (ret);
3086 }
3087 
3088 static void
3089 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3090 {
3091 	uint_t i;
3092 
3093 	for (i = 0; i < nmemcallbacks; i++) {
3094 		if (mem_callbacks[i].vec != NULL) {
3095 			(*mem_callbacks[i].vec->post_del)
3096 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3097 		}
3098 	}
3099 	rw_exit(&mem_callback_rwlock);
3100 }
3101 
3102 static int
3103 kphysm_split_memseg(
3104 	pfn_t base,
3105 	pgcnt_t npgs)
3106 {
3107 	struct memseg *seg;
3108 	struct memseg **segpp;
3109 	pgcnt_t size_low, size_high;
3110 	struct memseg *seg_low, *seg_mid, *seg_high;
3111 
3112 	/*
3113 	 * Lock the memsegs list against other updates now
3114 	 */
3115 	memsegs_lock(1);
3116 
3117 	/*
3118 	 * Find boot time memseg that wholly covers this area.
3119 	 */
3120 
3121 	/* First find the memseg with page 'base' in it. */
3122 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3123 	    segpp = &((*segpp)->next)) {
3124 		if (base >= seg->pages_base && base < seg->pages_end)
3125 			break;
3126 	}
3127 	if (seg == NULL) {
3128 		memsegs_unlock(1);
3129 		return (0);
3130 	}
3131 	if (memseg_includes_meta(seg)) {
3132 		memsegs_unlock(1);
3133 		return (0);
3134 	}
3135 	if ((base + npgs) > seg->pages_end) {
3136 		memsegs_unlock(1);
3137 		return (0);
3138 	}
3139 
3140 	/*
3141 	 * Work out the size of the two segments that will
3142 	 * surround the new segment, one for low address
3143 	 * and one for high.
3144 	 */
3145 	ASSERT(base >= seg->pages_base);
3146 	size_low = base - seg->pages_base;
3147 	ASSERT(seg->pages_end >= (base + npgs));
3148 	size_high = seg->pages_end - (base + npgs);
3149 
3150 	/*
3151 	 * Sanity check.
3152 	 */
3153 	if ((size_low + size_high) == 0) {
3154 		memsegs_unlock(1);
3155 		return (0);
3156 	}
3157 
3158 	/*
3159 	 * Allocate the new structures. The old memseg will not be freed
3160 	 * as there may be a reference to it.
3161 	 */
3162 	seg_low = NULL;
3163 	seg_high = NULL;
3164 
3165 	if (size_low != 0)
3166 		seg_low = memseg_alloc();
3167 
3168 	seg_mid = memseg_alloc();
3169 
3170 	if (size_high != 0)
3171 		seg_high = memseg_alloc();
3172 
3173 	/*
3174 	 * All allocation done now.
3175 	 */
3176 	if (size_low != 0) {
3177 		seg_low->pages = seg->pages;
3178 		seg_low->epages = seg_low->pages + size_low;
3179 		seg_low->pages_base = seg->pages_base;
3180 		seg_low->pages_end = seg_low->pages_base + size_low;
3181 		seg_low->next = seg_mid;
3182 		seg_low->msegflags = seg->msegflags;
3183 	}
3184 	if (size_high != 0) {
3185 		seg_high->pages = seg->epages - size_high;
3186 		seg_high->epages = seg_high->pages + size_high;
3187 		seg_high->pages_base = seg->pages_end - size_high;
3188 		seg_high->pages_end = seg_high->pages_base + size_high;
3189 		seg_high->next = seg->next;
3190 		seg_high->msegflags = seg->msegflags;
3191 	}
3192 
3193 	seg_mid->pages = seg->pages + size_low;
3194 	seg_mid->pages_base = seg->pages_base + size_low;
3195 	seg_mid->epages = seg->epages - size_high;
3196 	seg_mid->pages_end = seg->pages_end - size_high;
3197 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3198 	seg_mid->msegflags = seg->msegflags;
3199 
3200 	/*
3201 	 * Update hat_kpm specific info of all involved memsegs and
3202 	 * allow hat_kpm specific global chain updates.
3203 	 */
3204 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3205 
3206 	/*
3207 	 * At this point we have two equivalent memseg sub-chains,
3208 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3209 	 * the same place in the global chain. By re-writing the pointer
3210 	 * in the previous element we switch atomically from using the old
3211 	 * (seg) to the new.
3212 	 */
3213 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3214 
3215 	membar_enter();
3216 
3217 	build_pfn_hash();
3218 	memsegs_unlock(1);
3219 
3220 	/*
3221 	 * We leave the old segment, 'seg', intact as there may be
3222 	 * references to it. Also, as the value of total_pages has not
3223 	 * changed and the memsegs list is effectively the same when
3224 	 * accessed via the old or the new pointer, we do not have to
3225 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3226 	 *
3227 	 * We currently do not re-use or reclaim the page_t memory.
3228 	 * If we do, then this may have to change.
3229 	 */
3230 
3231 	mutex_enter(&memseg_lists_lock);
3232 	seg->lnext = memseg_edit_junk;
3233 	memseg_edit_junk = seg;
3234 	mutex_exit(&memseg_lists_lock);
3235 
3236 	return (1);
3237 }
3238 
3239 /*
3240  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3241  * structure using physical addresses. Therefore a kmem_cache is
3242  * used with KMC_NOHASH to avoid page crossings within a memseg
3243  * structure. KMC_NOHASH requires that no external (outside of
3244  * slab) information is allowed. This, in turn, implies that the
3245  * cache's slabsize must be exactly a single page, since per-slab
3246  * information (e.g. the freelist for the slab) is kept at the
3247  * end of the slab, where it is easy to locate. Should be changed
3248  * when a more obvious kmem_cache interface/flag will become
3249  * available.
3250  */
3251 void
3252 mem_config_init()
3253 {
3254 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3255 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3256 }
3257 
3258 struct memseg *
3259 memseg_alloc()
3260 {
3261 	struct memseg *seg;
3262 
3263 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3264 	bzero(seg, sizeof (struct memseg));
3265 
3266 	return (seg);
3267 }
3268 
3269 /*
3270  * Return whether the page_t memory for this memseg
3271  * is included in the memseg itself.
3272  */
3273 static int
3274 memseg_includes_meta(struct memseg *seg)
3275 {
3276 	return (seg->msegflags & MEMSEG_META_INCL);
3277 }
3278 
3279 pfn_t
3280 memseg_get_start(struct memseg *seg)
3281 {
3282 	pfn_t		pt_start;
3283 
3284 	if (memseg_includes_meta(seg)) {
3285 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3286 
3287 		/* Meta data is required to be at the beginning */
3288 		ASSERT(pt_start < seg->pages_base);
3289 	} else
3290 		pt_start = seg->pages_base;
3291 
3292 	return (pt_start);
3293 }
3294 
3295 /*
3296  * Invalidate memseg pointers in cpu private vm data caches.
3297  */
3298 static void
3299 memseg_cpu_vm_flush()
3300 {
3301 	cpu_t *cp;
3302 	vm_cpu_data_t *vc;
3303 
3304 	mutex_enter(&cpu_lock);
3305 	pause_cpus(NULL, NULL);
3306 
3307 	cp = cpu_list;
3308 	do {
3309 		vc = cp->cpu_vm_data;
3310 		vc->vc_pnum_memseg = NULL;
3311 		vc->vc_pnext_memseg = NULL;
3312 
3313 	} while ((cp = cp->cpu_next) != cpu_list);
3314 
3315 	start_cpus();
3316 	mutex_exit(&cpu_lock);
3317 }
3318