xref: /illumos-gate/usr/src/uts/common/os/mem_config.c (revision aedf2b3bb56b025fcaf87b49ec6c8aeea07f16d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
60 extern struct memlist *phys_avail;
61 
62 extern void mem_node_add(pfn_t, pfn_t);
63 extern void mem_node_del(pfn_t, pfn_t);
64 
65 extern uint_t page_ctrs_adjust(int);
66 static void kphysm_setup_post_add(pgcnt_t);
67 static int kphysm_setup_pre_del(pgcnt_t);
68 static void kphysm_setup_post_del(pgcnt_t, int);
69 
70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
71 
72 static int delspan_reserve(pfn_t, pgcnt_t);
73 static void delspan_unreserve(pfn_t, pgcnt_t);
74 
/*
 * memseg_va_avail heads a list, linked through lnext, of memsegs that
 * memseg_reuse() may hand back to a later memory add.  memseg_reuse()
 * walks and unlinks entries while holding memseg_lists_lock.
 * NOTE(review): memseg_delete_junk and memseg_edit_junk are not used in
 * this part of the file; presumably they collect retired memsegs under
 * the same lock -- confirm against the delete/split code.
 */
75 kmutex_t memseg_lists_lock;
76 struct memseg *memseg_va_avail;
77 struct memseg *memseg_alloc(void);
78 static struct memseg *memseg_delete_junk;
79 static struct memseg *memseg_edit_junk;
80 void memseg_remap_init(void);
81 static void memseg_remap_to_dummy(struct memseg *);
82 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
83 static struct memseg *memseg_reuse(pgcnt_t);
84 
85 static struct kmem_cache *memseg_cache;
86 
87 /*
88  * Interfaces to manage externally allocated
89  * page_t memory (metadata) for a memseg.
90  */
91 #pragma weak	memseg_alloc_meta
92 #pragma weak	memseg_free_meta
93 #pragma weak	memseg_get_metapfn
94 #pragma weak	memseg_remap_meta
95 
96 extern int ppvm_enable;
97 extern page_t *ppvm_base;
98 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
99 extern void memseg_free_meta(void *, pgcnt_t);
100 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
101 extern void memseg_remap_meta(struct memseg *);
102 static int memseg_is_dynamic(struct memseg *);
103 static int memseg_includes_meta(struct memseg *);
104 static pfn_t memseg_get_start(struct memseg *);
105 static void memseg_cpu_vm_flush(void);
106 
/*
 * When nonzero, kphysm_add_memory_dynamic() first asks memseg_alloc_meta()
 * for page_t storage and only falls back to carving the page_ts out of
 * the incoming memory if that call does not return KPHYSM_OK.
 */
107 int meta_alloc_enable;
108 
109 /*
110  * Add a chunk of memory to the system.
111  * base: starting PAGESIZE page of new memory.
112  * npgs: length in PAGESIZE pages.
113  *
114  * Adding mem this way doesn't increase the size of the hash tables;
115  * growing them would be too hard.  This should be OK, but adding memory
116  * dynamically most likely means more hash misses, since the tables will
117  * be smaller than they otherwise would be.
118  */
/*
 * MEMSEG_DEBUG() takes printf-style arguments and prints them only when
 * the memseg_debug tunable is nonzero.  In non-DEBUG kernels it expands
 * to an empty expression.  Both forms are single statements so that the
 * macro can be used safely as the body of an unbraced if/else.
 */
#ifdef	DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(args...) \
	do { if (memseg_debug) printf(args); } while (0)
#else
#define	MEMSEG_DEBUG(...)	((void)0)
#endif
125 
126 int
127 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
128 {
129 	page_t *pp;
130 	page_t		*opp, *oepp, *segpp;
131 	struct memseg	*seg;
132 	uint64_t	avmem;
133 	pfn_t		pfn;
134 	pfn_t		pt_base = base;
135 	pgcnt_t		tpgs = npgs;
136 	pgcnt_t		metapgs = 0;
137 	int		exhausted;
138 	pfn_t		pnum;
139 	int		mnode;
140 	caddr_t		vaddr;
141 	int		reuse;
142 	int		mlret;
143 	int		rv;
144 	int		flags;
145 	int		meta_alloc = 0;
146 	void		*mapva;
147 	void		*metabase = (void *)base;
148 	pgcnt_t		nkpmpgs = 0;
149 	offset_t	kpm_pages_off;
150 
151 	cmn_err(CE_CONT,
152 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
153 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
154 
155 	/*
156 	 * Add this span in the delete list to prevent interactions.
157 	 */
158 	if (!delspan_reserve(base, npgs)) {
159 		return (KPHYSM_ESPAN);
160 	}
161 	/*
162 	 * Check to see if any of the memory span has been added
163 	 * by trying an add to the installed memory list. This
164 	 * forms the interlocking process for add.
165 	 */
166 
167 	memlist_write_lock();
168 
169 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
170 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
171 
172 	if (mlret == MEML_SPANOP_OK)
173 		installed_top_size(phys_install, &physmax, &physinstalled);
174 
175 	memlist_write_unlock();
176 
177 	if (mlret != MEML_SPANOP_OK) {
178 		if (mlret == MEML_SPANOP_EALLOC) {
179 			delspan_unreserve(pt_base, tpgs);
180 			return (KPHYSM_ERESOURCE);
181 		} else if (mlret == MEML_SPANOP_ESPAN) {
182 			delspan_unreserve(pt_base, tpgs);
183 			return (KPHYSM_ESPAN);
184 		} else {
185 			delspan_unreserve(pt_base, tpgs);
186 			return (KPHYSM_ERESOURCE);
187 		}
188 	}
189 
190 	if (meta_alloc_enable) {
191 		/*
192 		 * Allocate the page_t's from existing memory;
193 		 * if that fails, allocate from the incoming memory.
194 		 */
195 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
196 		if (rv == KPHYSM_OK) {
197 			ASSERT(metapgs);
198 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
199 			meta_alloc = 1;
200 			goto mapalloc;
201 		}
202 	}
203 
204 	/*
205 	 * We store the page_t's for this new memory in the first
206 	 * few pages of the chunk. Here, we go and get'em ...
207 	 */
208 
209 	/*
210 	 * The expression after the '-' gives the number of pages
211 	 * that will fit in the new memory based on a requirement
212 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
213 	 */
214 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
215 	    (PAGESIZE + sizeof (page_t)));
216 
217 	npgs -= metapgs;
218 	base += metapgs;
219 
220 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
221 
222 	exhausted = (metapgs == 0 || npgs == 0);
223 
224 	if (kpm_enable && !exhausted) {
225 		pgcnt_t start, end, nkpmpgs_prelim;
226 		size_t	ptsz;
227 
228 		/*
229 		 * A viable kpm large page mapping must not overlap two
230 		 * dynamic memsegs. Therefore the total size is checked
231 		 * to be at least kpm_pgsz and also whether start and end
232 		 * points are at least kpm_pgsz aligned.
233 		 */
234 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
235 		    pmodkpmp(base + npgs)) {
236 
237 			kphysm_addmem_error_undospan(pt_base, tpgs);
238 
239 			/*
240 			 * There is no specific error code for violating
241 			 * kpm granularity constraints.
242 			 */
243 			return (KPHYSM_ENOTVIABLE);
244 		}
245 
246 		start = kpmptop(ptokpmp(base));
247 		end = kpmptop(ptokpmp(base + npgs));
248 		nkpmpgs_prelim = ptokpmp(end - start);
249 		ptsz = npgs * sizeof (page_t);
250 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
251 		exhausted = (tpgs <= metapgs);
252 		if (!exhausted) {
253 			npgs = tpgs - metapgs;
254 			base = pt_base + metapgs;
255 
256 			/* final nkpmpgs */
257 			start = kpmptop(ptokpmp(base));
258 			nkpmpgs = ptokpmp(end - start);
259 			kpm_pages_off = ptsz +
260 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
261 		}
262 	}
263 
264 	/*
265 	 * Is memory area supplied too small?
266 	 */
267 	if (exhausted) {
268 		kphysm_addmem_error_undospan(pt_base, tpgs);
269 		/*
270 		 * There is no specific error code for 'too small'.
271 		 */
272 		return (KPHYSM_ERESOURCE);
273 	}
274 
275 mapalloc:
276 	/*
277 	 * We may re-use a previously allocated VA space for the page_ts
278 	 * eventually, but we need to initialize and lock the pages first.
279 	 */
280 
281 	/*
282 	 * Get an address in the kernel address map, map
283 	 * the page_t pages and see if we can touch them.
284 	 */
285 
286 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
287 	if (mapva == NULL) {
288 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
289 		    " Can't allocate VA for page_ts");
290 
291 		if (meta_alloc)
292 			memseg_free_meta(metabase, metapgs);
293 		kphysm_addmem_error_undospan(pt_base, tpgs);
294 
295 		return (KPHYSM_ERESOURCE);
296 	}
297 	pp = mapva;
298 
299 	if (physmax < (pt_base + tpgs))
300 		physmax = (pt_base + tpgs);
301 
302 	/*
303 	 * In the remapping code we map one page at a time so we must do
304 	 * the same here to match mapping sizes.
305 	 */
306 	pfn = pt_base;
307 	vaddr = (caddr_t)pp;
308 	for (pnum = 0; pnum < metapgs; pnum++) {
309 		if (meta_alloc)
310 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
311 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
312 		    PROT_READ | PROT_WRITE,
313 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
314 		pfn++;
315 		vaddr += ptob(1);
316 	}
317 
318 	if (ddi_peek32((dev_info_t *)NULL,
319 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
320 
321 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
322 		    " Can't access pp array at 0x%p [phys 0x%lx]",
323 		    (void *)pp, pt_base);
324 
325 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
326 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
327 
328 		vmem_free(heap_arena, mapva, ptob(metapgs));
329 		if (meta_alloc)
330 			memseg_free_meta(metabase, metapgs);
331 		kphysm_addmem_error_undospan(pt_base, tpgs);
332 
333 		return (KPHYSM_EFAULT);
334 	}
335 
336 	/*
337 	 * Add this memory slice to its memory node translation.
338 	 *
339 	 * Note that right now, each node may have only one slice;
340 	 * this may change with COD or in larger SSM systems with
341 	 * nested latency groups, so we must not assume that the
342 	 * node does not yet exist.
343 	 */
344 	pnum = pt_base + tpgs - 1;
345 	mem_node_add_range(pt_base, pnum);
346 
347 	/*
348 	 * Allocate or resize page counters as necessary to accommodate
349 	 * the increase in memory pages.
350 	 */
351 	mnode = PFN_2_MEM_NODE(pnum);
352 	PAGE_CTRS_ADJUST(base, npgs, rv);
353 	if (rv) {
354 
355 		mem_node_del_range(pt_base, pnum);
356 
357 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
358 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
359 
360 		vmem_free(heap_arena, mapva, ptob(metapgs));
361 		if (meta_alloc)
362 			memseg_free_meta(metabase, metapgs);
363 		kphysm_addmem_error_undospan(pt_base, tpgs);
364 
365 		return (KPHYSM_ERESOURCE);
366 	}
367 
368 	/*
369 	 * Update the phys_avail memory list.
370 	 * The phys_install list was done at the start.
371 	 */
372 
373 	memlist_write_lock();
374 
375 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
376 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
377 	ASSERT(mlret == MEML_SPANOP_OK);
378 
379 	memlist_write_unlock();
380 
381 	/* See if we can find a memseg to re-use. */
382 	if (meta_alloc) {
383 		seg = memseg_reuse(0);
384 		reuse = 1;	/* force unmapping of temp mapva */
385 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
386 		/*
387 		 * There is a 1:1 fixed relationship between a pfn
388 		 * and a page_t VA.  The pfn is used as an index into
389 		 * the ppvm_base page_t table in order to calculate
390 		 * the page_t base address for a given pfn range.
391 		 */
392 		segpp = ppvm_base + base;
393 	} else {
394 		seg = memseg_reuse(metapgs);
395 		reuse = (seg != NULL);
396 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
397 		segpp = pp;
398 	}
399 
400 	/*
401 	 * Initialize the memseg structure representing this memory
402 	 * and add it to the existing list of memsegs. Do some basic
403 	 * initialization and add the memory to the system.
404 	 * In order to prevent lock deadlocks, the add_physmem()
405 	 * code is repeated here, but split into several stages.
406 	 *
407 	 * If a memseg is reused, invalidate memseg pointers in
408 	 * all cpu vm caches.  We need to do this this since the check
409 	 * 	pp >= seg->pages && pp < seg->epages
410 	 * used in various places is not atomic and so the first compare
411 	 * can happen before reuse and the second compare after reuse.
412 	 * The invalidation ensures that a memseg is not deferenced while
413 	 * it's page/pfn pointers are changing.
414 	 */
415 	if (seg == NULL) {
416 		seg = memseg_alloc();
417 		ASSERT(seg != NULL);
418 		seg->msegflags = flags;
419 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
420 		    (void *)seg, (void *)(seg->pages));
421 		seg->pages = segpp;
422 	} else {
423 		ASSERT(seg->msegflags == flags);
424 		ASSERT(seg->pages_base == seg->pages_end);
425 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
426 		    (void *)seg, (void *)(seg->pages));
427 		if (meta_alloc) {
428 			memseg_cpu_vm_flush();
429 			seg->pages = segpp;
430 		}
431 	}
432 
433 	seg->epages = seg->pages + npgs;
434 	seg->pages_base = base;
435 	seg->pages_end = base + npgs;
436 
437 	/*
438 	 * Initialize metadata. The page_ts are set to locked state
439 	 * ready to be freed.
440 	 */
441 	bzero((caddr_t)pp, ptob(metapgs));
442 
443 	pfn = seg->pages_base;
444 	/* Save the original pp base in case we reuse a memseg. */
445 	opp = pp;
446 	oepp = opp + npgs;
447 	for (pp = opp; pp < oepp; pp++) {
448 		pp->p_pagenum = pfn;
449 		pfn++;
450 		page_iolock_init(pp);
451 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
452 			continue;
453 		pp->p_offset = (u_offset_t)-1;
454 	}
455 
456 	if (reuse) {
457 		/* Remap our page_ts to the re-used memseg VA space. */
458 		pfn = pt_base;
459 		vaddr = (caddr_t)seg->pages;
460 		for (pnum = 0; pnum < metapgs; pnum++) {
461 			if (meta_alloc)
462 				pfn = memseg_get_metapfn(metabase,
463 				    (pgcnt_t)pnum);
464 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
465 			    PROT_READ | PROT_WRITE,
466 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
467 			pfn++;
468 			vaddr += ptob(1);
469 		}
470 
471 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
472 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
473 
474 		vmem_free(heap_arena, mapva, ptob(metapgs));
475 	}
476 
477 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
478 
479 	memsegs_lock(1);
480 
481 	/*
482 	 * The new memseg is inserted at the beginning of the list.
483 	 * Not only does this save searching for the tail, but in the
484 	 * case of a re-used memseg, it solves the problem of what
485 	 * happens if some process has still got a pointer to the
486 	 * memseg and follows the next pointer to continue traversing
487 	 * the memsegs list.
488 	 */
489 
490 	hat_kpm_addmem_mseg_insert(seg);
491 
492 	seg->next = memsegs;
493 	membar_producer();
494 
495 	hat_kpm_addmem_memsegs_update(seg);
496 
497 	memsegs = seg;
498 
499 	build_pfn_hash();
500 
501 	total_pages += npgs;
502 
503 	/*
504 	 * Recalculate the paging parameters now total_pages has changed.
505 	 * This will also cause the clock hands to be reset before next use.
506 	 */
507 	setupclock(1);
508 
509 	memsegs_unlock(1);
510 
511 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
512 
513 	/*
514 	 * Free the pages outside the lock to avoid locking loops.
515 	 */
516 	for (pp = seg->pages; pp < seg->epages; pp++) {
517 		page_free(pp, 1);
518 	}
519 
520 	/*
521 	 * Now that we've updated the appropriate memory lists we
522 	 * need to reset a number of globals, since we've increased memory.
523 	 * Several have already been updated for us as noted above. The
524 	 * globals we're interested in at this point are:
525 	 *   physmax - highest page frame number.
526 	 *   physinstalled - number of pages currently installed (done earlier)
527 	 *   maxmem - max free pages in the system
528 	 *   physmem - physical memory pages available
529 	 *   availrmem - real memory available
530 	 */
531 
532 	mutex_enter(&freemem_lock);
533 	maxmem += npgs;
534 	physmem += npgs;
535 	availrmem += npgs;
536 	availrmem_initial += npgs;
537 
538 	mutex_exit(&freemem_lock);
539 
540 	dump_resize();
541 
542 	page_freelist_coalesce_all(mnode);
543 
544 	kphysm_setup_post_add(npgs);
545 
546 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
547 	    "(0x%" PRIx64 ")\n",
548 	    physinstalled << (PAGESHIFT - 10),
549 	    (uint64_t)physinstalled << PAGESHIFT);
550 
551 	avmem = (uint64_t)freemem << PAGESHIFT;
552 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
553 	    "avail mem = %" PRId64 "\n", avmem);
554 
555 	/*
556 	 * Update lgroup generation number on single lgroup systems
557 	 */
558 	if (nlgrps == 1)
559 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
560 
561 	delspan_unreserve(pt_base, tpgs);
562 	return (KPHYSM_OK);		/* Successfully added system memory */
563 
564 }
565 
566 /*
567  * There are various error conditions in kphysm_add_memory_dynamic()
568  * which require a rollback of already changed global state.
569  */
570 static void
571 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
572 {
573 	int mlret;
574 
575 	/* Unreserve memory span. */
576 	memlist_write_lock();
577 
578 	mlret = memlist_delete_span(
579 	    (uint64_t)(pt_base) << PAGESHIFT,
580 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
581 
582 	ASSERT(mlret == MEML_SPANOP_OK);
583 	phys_install_has_changed();
584 	installed_top_size(phys_install, &physmax, &physinstalled);
585 
586 	memlist_write_unlock();
587 	delspan_unreserve(pt_base, tpgs);
588 }
589 
590 /*
591  * Only return an available memseg of exactly the right size
592  * if size is required.
593  * When the meta data area has it's own virtual address space
594  * we will need to manage this more carefully and do best fit
595  * allocations, possibly splitting an available area.
596  */
597 struct memseg *
598 memseg_reuse(pgcnt_t metapgs)
599 {
600 	int type;
601 	struct memseg **segpp, *seg;
602 
603 	mutex_enter(&memseg_lists_lock);
604 
605 	segpp = &memseg_va_avail;
606 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
607 		caddr_t end;
608 
609 		/*
610 		 * Make sure we are reusing the right segment type.
611 		 */
612 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
613 
614 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
615 		    != type)
616 			continue;
617 
618 		if (kpm_enable)
619 			end = hat_kpm_mseg_reuse(seg);
620 		else
621 			end = (caddr_t)seg->epages;
622 
623 		/*
624 		 * Check for the right size if it is provided.
625 		 */
626 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
627 			*segpp = seg->lnext;
628 			seg->lnext = NULL;
629 			break;
630 		}
631 	}
632 	mutex_exit(&memseg_lists_lock);
633 
634 	return (seg);
635 }
636 
637 static uint_t handle_gen;
638 
/*
 * One contiguous span of page frames involved in an add or delete.
 * Spans are chained through mds_next into the trl_spans list of a
 * struct transit_list.
 */
639 struct memdelspan {
640 	struct memdelspan *mds_next;	/* next span on the same list */
641 	pfn_t		mds_base;	/* first page frame of the span */
642 	pgcnt_t		mds_npgs;	/* span length in pages */
	/*
	 * NOTE(review): the bitmaps are not touched in this part of the
	 * file; MDS_BITMAPBYTES() sizes them at one bit per page of the
	 * span -- confirm what each bit records in the delete code.
	 */
643 	uint_t		*mds_bitmap;
644 	uint_t		*mds_bitmap_retired;
645 };
646 
/* NBPBMW: number of bits in one bitmap word (uint_t). */
647 #define	NBPBMW		(sizeof (uint_t) * NBBY)
/* Bytes needed for a bitmap of mds_npgs bits, rounded up to whole words. */
648 #define	MDS_BITMAPBYTES(MDSP) \
649 	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
650 
/*
 * The set of spans owned by one operation.  A transit_list is linked on
 * the global transit_list_head through trl_next while it has spans.
 * NOTE(review): trl_collect is not referenced in this part of the file.
 */
651 struct transit_list {
652 	struct transit_list	*trl_next;
653 	struct memdelspan	*trl_spans;
654 	int			trl_collect;
655 };
656 
/*
 * Head of the global list of transit lists.  delspan_insert() and
 * delspan_remove() hold trh_lock while examining or changing any
 * transit list or its spans.
 */
657 struct transit_list_head {
658 	kmutex_t		trh_lock;
659 	struct transit_list	*trh_head;
660 };
661 
662 static struct transit_list_head transit_list_head;
663 
664 struct mem_handle;
665 static void transit_list_collect(struct mem_handle *, int);
666 static void transit_list_insert(struct transit_list *);
667 static void transit_list_remove(struct transit_list *);
668 
/*
 * Memory delete statistics.  They are compiled in only when
 * MEM_DEL_STATS is defined, which happens automatically for DEBUG
 * kernels.  Without it every MDSTAT_* macro expands to nothing, so call
 * sites need no conditional compilation of their own.
 * NOTE(review): the counters are updated by delete code outside this
 * part of the file; confirm the meaning of each field there.
 */
669 #ifdef DEBUG
670 #define	MEM_DEL_STATS
671 #endif /* DEBUG */
672 
673 #ifdef MEM_DEL_STATS
674 static int mem_del_stat_print = 0;
675 struct mem_del_stat {
676 	uint_t	nloop;
677 	uint_t	need_free;
678 	uint_t	free_loop;
679 	uint_t	free_low;
680 	uint_t	free_failed;
681 	uint_t	ncheck;
682 	uint_t	nopaget;
683 	uint_t	lockfail;
684 	uint_t	nfree;
685 	uint_t	nreloc;
686 	uint_t	nrelocfail;
687 	uint_t	already_done;
688 	uint_t	first_notfree;
689 	uint_t	npplocked;
690 	uint_t	nlockreloc;
691 	uint_t	nnorepl;
692 	uint_t	nmodreloc;
693 	uint_t	ndestroy;
694 	uint_t	nputpage;
695 	uint_t	nnoreclaim;
696 	uint_t	ndelay;
697 	uint_t	demotefail;
698 	uint64_t nticks_total;
699 	uint64_t nticks_pgrp;
700 	uint_t	retired;
701 	uint_t	toxic;
702 	uint_t	failing;
703 	uint_t	modtoxic;
704 	uint_t	npplkdtoxic;
705 	uint_t	gptlmodfail;
706 	uint_t	gptllckfail;
707 };
708 /*
709  * The stat values are only incremented in the delete thread
710  * so no locking or atomic required.
711  */
/* MDSTAT_INCR bumps one counter; MDSTAT_TOTAL/PGRP accumulate tick counts. */
712 #define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
713 #define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
714 #define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
715 static void mem_del_stat_print_func(struct mem_handle *);
716 #define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
717 #else /* MEM_DEL_STATS */
/* Statistics disabled: all of the macros below are no-ops. */
718 #define	MDSTAT_INCR(MHP, FLD)
719 #define	MDSTAT_TOTAL(MHP, ntck)
720 #define	MDSTAT_PGRP(MHP, ntck)
721 #define	MDSTAT_PRINT(MHP)
722 #endif /* MEM_DEL_STATS */
723 
/*
 * Life-cycle states of a delete handle.  MHND_FREE is zero, so a handle
 * fresh from kmem_zalloc() starts out FREE; kphysm_del_gethandle() moves
 * it to MHND_INIT, the only state in which kphysm_del_span() accepts
 * new spans.
 */
724 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
725 	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
726 
727 /*
728  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
729  * The mutex may not be required for other fields, dependent on mh_state.
730  */
731 struct mem_handle {
732 	kmutex_t	mh_mutex;
733 	struct mem_handle *mh_next;	/* link on mem_handle_head list */
734 	memhandle_t	mh_exthandle;	/* handle value given to callers */
735 	mhnd_state_t	mh_state;
736 	struct transit_list mh_transit;	/* spans added by kphysm_del_span() */
737 	pgcnt_t		mh_phys_pages;
738 	pgcnt_t		mh_vm_pages;
739 	pgcnt_t		mh_hold_todo;
740 	void		(*mh_delete_complete)(void *, int error);
741 	void		*mh_delete_complete_arg;
742 	volatile uint_t mh_cancel;
743 	volatile uint_t mh_dr_aio_cleanup_cancel;
744 	volatile uint_t mh_aio_cleanup_done;
745 	kcondvar_t	mh_cv;
746 	kthread_id_t	mh_thread_id;
747 	page_t		*mh_deleted;	/* link through p_next */
748 #ifdef MEM_DEL_STATS
749 	struct mem_del_stat mh_delstat;
750 #endif /* MEM_DEL_STATS */
751 };
752 
/*
 * All handles are linked through mh_next from mem_handle_head.  The list
 * and the handle_gen counter are protected by mem_handle_list_mutex,
 * which is acquired before any mh_mutex.
 */
753 static struct mem_handle *mem_handle_head;
754 static kmutex_t mem_handle_list_mutex;
755 
756 static struct mem_handle *
757 kphysm_allocate_mem_handle()
758 {
759 	struct mem_handle *mhp;
760 
761 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
762 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
763 	mutex_enter(&mem_handle_list_mutex);
764 	mutex_enter(&mhp->mh_mutex);
765 	/* handle_gen is protected by list mutex. */
766 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
767 	mhp->mh_next = mem_handle_head;
768 	mem_handle_head = mhp;
769 	mutex_exit(&mem_handle_list_mutex);
770 
771 	return (mhp);
772 }
773 
774 static void
775 kphysm_free_mem_handle(struct mem_handle *mhp)
776 {
777 	struct mem_handle **mhpp;
778 
779 	ASSERT(mutex_owned(&mhp->mh_mutex));
780 	ASSERT(mhp->mh_state == MHND_FREE);
781 	/*
782 	 * Exit the mutex to preserve locking order. This is OK
783 	 * here as once in the FREE state, the handle cannot
784 	 * be found by a lookup.
785 	 */
786 	mutex_exit(&mhp->mh_mutex);
787 
788 	mutex_enter(&mem_handle_list_mutex);
789 	mhpp = &mem_handle_head;
790 	while (*mhpp != NULL && *mhpp != mhp)
791 		mhpp = &(*mhpp)->mh_next;
792 	ASSERT(*mhpp == mhp);
793 	/*
794 	 * No need to lock the handle (mh_mutex) as only
795 	 * mh_next changing and this is the only thread that
796 	 * can be referncing mhp.
797 	 */
798 	*mhpp = mhp->mh_next;
799 	mutex_exit(&mem_handle_list_mutex);
800 
801 	mutex_destroy(&mhp->mh_mutex);
802 	kmem_free(mhp, sizeof (struct mem_handle));
803 }
804 
805 /*
806  * This function finds the internal mem_handle corresponding to an
807  * external handle and returns it with the mh_mutex held.
808  */
809 static struct mem_handle *
810 kphysm_lookup_mem_handle(memhandle_t handle)
811 {
812 	struct mem_handle *mhp;
813 
814 	mutex_enter(&mem_handle_list_mutex);
815 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
816 		if (mhp->mh_exthandle == handle) {
817 			mutex_enter(&mhp->mh_mutex);
818 			/*
819 			 * The state of the handle could have been changed
820 			 * by kphysm_del_release() while waiting for mh_mutex.
821 			 */
822 			if (mhp->mh_state == MHND_FREE) {
823 				mutex_exit(&mhp->mh_mutex);
824 				continue;
825 			}
826 			break;
827 		}
828 	}
829 	mutex_exit(&mem_handle_list_mutex);
830 	return (mhp);
831 }
832 
833 int
834 kphysm_del_gethandle(memhandle_t *xmhp)
835 {
836 	struct mem_handle *mhp;
837 
838 	mhp = kphysm_allocate_mem_handle();
839 	/*
840 	 * The handle is allocated using KM_SLEEP, so cannot fail.
841 	 * If the implementation is changed, the correct error to return
842 	 * here would be KPHYSM_ENOHANDLES.
843 	 */
844 	ASSERT(mhp->mh_state == MHND_FREE);
845 	mhp->mh_state = MHND_INIT;
846 	*xmhp = mhp->mh_exthandle;
847 	mutex_exit(&mhp->mh_mutex);
848 	return (KPHYSM_OK);
849 }
850 
851 static int
852 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
853 {
854 	pfn_t e1, e2;
855 
856 	e1 = b1 + l1;
857 	e2 = b2 + l2;
858 
859 	return (!(b2 >= e1 || b1 >= e2));
860 }
861 
862 static int can_remove_pgs(pgcnt_t);
863 
864 static struct memdelspan *
865 span_to_install(pfn_t base, pgcnt_t npgs)
866 {
867 	struct memdelspan *mdsp;
868 	struct memdelspan *mdsp_new;
869 	uint64_t address, size, thislen;
870 	struct memlist *mlp;
871 
872 	mdsp_new = NULL;
873 
874 	address = (uint64_t)base << PAGESHIFT;
875 	size = (uint64_t)npgs << PAGESHIFT;
876 	while (size != 0) {
877 		memlist_read_lock();
878 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
879 			if (address >= (mlp->address + mlp->size))
880 				continue;
881 			if ((address + size) > mlp->address)
882 				break;
883 		}
884 		if (mlp == NULL) {
885 			address += size;
886 			size = 0;
887 			thislen = 0;
888 		} else {
889 			if (address < mlp->address) {
890 				size -= (mlp->address - address);
891 				address = mlp->address;
892 			}
893 			ASSERT(address >= mlp->address);
894 			if ((address + size) > (mlp->address + mlp->size)) {
895 				thislen = mlp->size - (address - mlp->address);
896 			} else {
897 				thislen = size;
898 			}
899 		}
900 		memlist_read_unlock();
901 		/* TODO: phys_install could change now */
902 		if (thislen == 0)
903 			continue;
904 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
905 		mdsp->mds_base = btop(address);
906 		mdsp->mds_npgs = btop(thislen);
907 		mdsp->mds_next = mdsp_new;
908 		mdsp_new = mdsp;
909 		address += thislen;
910 		size -= thislen;
911 	}
912 	return (mdsp_new);
913 }
914 
915 static void
916 free_delspans(struct memdelspan *mdsp)
917 {
918 	struct memdelspan *amdsp;
919 
920 	while ((amdsp = mdsp) != NULL) {
921 		mdsp = amdsp->mds_next;
922 		kmem_free(amdsp, sizeof (struct memdelspan));
923 	}
924 }
925 
926 /*
927  * Concatenate lists. No list ordering is required.
928  */
929 
930 static void
931 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
932 {
933 	while (*mdspp != NULL)
934 		mdspp = &(*mdspp)->mds_next;
935 
936 	*mdspp = mdsp;
937 }
938 
939 /*
940  * Given a new list of delspans, check there is no overlap with
941  * all existing span activity (add or delete) and then concatenate
942  * the new spans to the given list.
943  * Return 1 for OK, 0 if overlapping.
944  */
945 static int
946 delspan_insert(
947 	struct transit_list *my_tlp,
948 	struct memdelspan *mdsp_new)
949 {
950 	struct transit_list_head *trh;
951 	struct transit_list *tlp;
952 	int ret;
953 
954 	trh = &transit_list_head;
955 
956 	ASSERT(my_tlp != NULL);
957 	ASSERT(mdsp_new != NULL);
958 
959 	ret = 1;
960 	mutex_enter(&trh->trh_lock);
961 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
962 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
963 		struct memdelspan *mdsp;
964 
965 		for (mdsp = tlp->trl_spans; mdsp != NULL;
966 		    mdsp = mdsp->mds_next) {
967 			struct memdelspan *nmdsp;
968 
969 			for (nmdsp = mdsp_new; nmdsp != NULL;
970 			    nmdsp = nmdsp->mds_next) {
971 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
972 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
973 					ret = 0;
974 					goto done;
975 				}
976 			}
977 		}
978 	}
979 done:
980 	if (ret != 0) {
981 		if (my_tlp->trl_spans == NULL)
982 			transit_list_insert(my_tlp);
983 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
984 	}
985 	mutex_exit(&trh->trh_lock);
986 	return (ret);
987 }
988 
989 static void
990 delspan_remove(
991 	struct transit_list *my_tlp,
992 	pfn_t base,
993 	pgcnt_t npgs)
994 {
995 	struct transit_list_head *trh;
996 	struct memdelspan *mdsp;
997 
998 	trh = &transit_list_head;
999 
1000 	ASSERT(my_tlp != NULL);
1001 
1002 	mutex_enter(&trh->trh_lock);
1003 	if ((mdsp = my_tlp->trl_spans) != NULL) {
1004 		if (npgs == 0) {
1005 			my_tlp->trl_spans = NULL;
1006 			free_delspans(mdsp);
1007 			transit_list_remove(my_tlp);
1008 		} else {
1009 			struct memdelspan **prv;
1010 
1011 			prv = &my_tlp->trl_spans;
1012 			while (mdsp != NULL) {
1013 				pfn_t p_end;
1014 
1015 				p_end = mdsp->mds_base + mdsp->mds_npgs;
1016 				if (mdsp->mds_base >= base &&
1017 				    p_end <= (base + npgs)) {
1018 					*prv = mdsp->mds_next;
1019 					mdsp->mds_next = NULL;
1020 					free_delspans(mdsp);
1021 				} else {
1022 					prv = &mdsp->mds_next;
1023 				}
1024 				mdsp = *prv;
1025 			}
1026 			if (my_tlp->trl_spans == NULL)
1027 				transit_list_remove(my_tlp);
1028 		}
1029 	}
1030 	mutex_exit(&trh->trh_lock);
1031 }
1032 
1033 /*
1034  * Reserve interface for add to stop delete before add finished.
1035  * This list is only accessed through the delspan_insert/remove
1036  * functions and so is fully protected by transit_list_head.trh_lock.
1037  */
1038 
1039 static struct transit_list reserve_transit;
1040 
1041 static int
1042 delspan_reserve(pfn_t base, pgcnt_t npgs)
1043 {
1044 	struct memdelspan *mdsp;
1045 	int ret;
1046 
1047 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1048 	mdsp->mds_base = base;
1049 	mdsp->mds_npgs = npgs;
1050 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1051 		free_delspans(mdsp);
1052 	}
1053 	return (ret);
1054 }
1055 
/*
 * Release a reservation made by delspan_reserve().  Reserved spans that
 * lie entirely inside [base, base + npgs) are removed from
 * reserve_transit and freed.  As implemented by delspan_remove(), an
 * npgs of 0 releases every reserved span.
 */
1056 static void
1057 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1058 {
1059 	delspan_remove(&reserve_transit, base, npgs);
1060 }
1061 
1062 /*
1063  * Return whether memseg was created by kphysm_add_memory_dynamic().
 * The result is the MEMSEG_DYNAMIC bit of msegflags itself, so callers
 * get zero or nonzero rather than a normalized 0/1.
1064  */
1065 static int
1066 memseg_is_dynamic(struct memseg *seg)
1067 {
1068 	return (seg->msegflags & MEMSEG_DYNAMIC);
1069 }
1070 
1071 int
1072 kphysm_del_span(
1073 	memhandle_t handle,
1074 	pfn_t base,
1075 	pgcnt_t npgs)
1076 {
1077 	struct mem_handle *mhp;
1078 	struct memseg *seg;
1079 	struct memdelspan *mdsp;
1080 	struct memdelspan *mdsp_new;
1081 	pgcnt_t phys_pages, vm_pages;
1082 	pfn_t p_end;
1083 	page_t *pp;
1084 	int ret;
1085 
1086 	mhp = kphysm_lookup_mem_handle(handle);
1087 	if (mhp == NULL) {
1088 		return (KPHYSM_EHANDLE);
1089 	}
1090 	if (mhp->mh_state != MHND_INIT) {
1091 		mutex_exit(&mhp->mh_mutex);
1092 		return (KPHYSM_ESEQUENCE);
1093 	}
1094 
1095 	/*
1096 	 * Intersect the span with the installed memory list (phys_install).
1097 	 */
1098 	mdsp_new = span_to_install(base, npgs);
1099 	if (mdsp_new == NULL) {
1100 		/*
1101 		 * No physical memory in this range. Is this an
1102 		 * error? If an attempt to start the delete is made
1103 		 * for OK returns from del_span such as this, start will
1104 		 * return an error.
1105 		 * Could return KPHYSM_ENOWORK.
1106 		 */
1107 		/*
1108 		 * It is assumed that there are no error returns
1109 		 * from span_to_install() due to kmem_alloc failure.
1110 		 */
1111 		mutex_exit(&mhp->mh_mutex);
1112 		return (KPHYSM_OK);
1113 	}
1114 	/*
1115 	 * Does this span overlap an existing span?
1116 	 */
1117 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1118 		/*
1119 		 * Differentiate between already on list for this handle
1120 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1121 		 */
1122 		ret = KPHYSM_EBUSY;
1123 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1124 		    mdsp = mdsp->mds_next) {
1125 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1126 			    base, npgs)) {
1127 				ret = KPHYSM_EDUP;
1128 				break;
1129 			}
1130 		}
1131 		mutex_exit(&mhp->mh_mutex);
1132 		free_delspans(mdsp_new);
1133 		return (ret);
1134 	}
1135 	/*
1136 	 * At this point the spans in mdsp_new have been inserted into the
1137 	 * list of spans for this handle and thereby to the global list of
1138 	 * spans being processed. Each of these spans must now be checked
1139 	 * for relocatability. As a side-effect segments in the memseg list
1140 	 * may be split.
1141 	 *
1142 	 * Note that mdsp_new can no longer be used as it is now part of
1143 	 * a larger list. Select elements of this larger list based
1144 	 * on base and npgs.
1145 	 */
1146 restart:
1147 	phys_pages = 0;
1148 	vm_pages = 0;
1149 	ret = KPHYSM_OK;
1150 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1151 	    mdsp = mdsp->mds_next) {
1152 		pgcnt_t pages_checked;
1153 
1154 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1155 			continue;
1156 		}
1157 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1158 		/*
1159 		 * The pages_checked count is a hack. All pages should be
1160 		 * checked for relocatability. Those not covered by memsegs
1161 		 * should be tested with arch_kphysm_del_span_ok().
1162 		 */
1163 		pages_checked = 0;
1164 		for (seg = memsegs; seg; seg = seg->next) {
1165 			pfn_t mseg_start;
1166 
1167 			if (seg->pages_base >= p_end ||
1168 			    seg->pages_end <= mdsp->mds_base) {
1169 				/* Span and memseg don't overlap. */
1170 				continue;
1171 			}
1172 			mseg_start = memseg_get_start(seg);
1173 			/* Check that segment is suitable for delete. */
1174 			if (memseg_includes_meta(seg)) {
1175 				/*
1176 				 * Check that this segment is completely
1177 				 * within the span.
1178 				 */
1179 				if (mseg_start < mdsp->mds_base ||
1180 				    seg->pages_end > p_end) {
1181 					ret = KPHYSM_EBUSY;
1182 					break;
1183 				}
1184 				pages_checked += seg->pages_end - mseg_start;
1185 			} else {
1186 				/*
1187 				 * If this segment is larger than the span,
1188 				 * try to split it. After the split, it
1189 				 * is necessary to restart.
1190 				 */
1191 				if (seg->pages_base < mdsp->mds_base ||
1192 				    seg->pages_end > p_end) {
1193 					pfn_t abase;
1194 					pgcnt_t anpgs;
1195 					int s_ret;
1196 
1197 					/* Split required.  */
1198 					if (mdsp->mds_base < seg->pages_base)
1199 						abase = seg->pages_base;
1200 					else
1201 						abase = mdsp->mds_base;
1202 					if (p_end > seg->pages_end)
1203 						anpgs = seg->pages_end - abase;
1204 					else
1205 						anpgs = p_end - abase;
1206 					s_ret = kphysm_split_memseg(abase,
1207 					    anpgs);
1208 					if (s_ret == 0) {
1209 						/* Split failed. */
1210 						ret = KPHYSM_ERESOURCE;
1211 						break;
1212 					}
1213 					goto restart;
1214 				}
1215 				pages_checked +=
1216 				    seg->pages_end - seg->pages_base;
1217 			}
1218 			/*
1219 			 * The memseg is wholly within the delete span.
1220 			 * The individual pages can now be checked.
1221 			 */
1222 			/* Cage test. */
1223 			for (pp = seg->pages; pp < seg->epages; pp++) {
1224 				if (PP_ISNORELOC(pp)) {
1225 					ret = KPHYSM_ENONRELOC;
1226 					break;
1227 				}
1228 			}
1229 			if (ret != KPHYSM_OK) {
1230 				break;
1231 			}
1232 			phys_pages += (seg->pages_end - mseg_start);
1233 			vm_pages += MSEG_NPAGES(seg);
1234 		}
1235 		if (ret != KPHYSM_OK)
1236 			break;
1237 		if (pages_checked != mdsp->mds_npgs) {
1238 			ret = KPHYSM_ENONRELOC;
1239 			break;
1240 		}
1241 	}
1242 
1243 	if (ret == KPHYSM_OK) {
1244 		mhp->mh_phys_pages += phys_pages;
1245 		mhp->mh_vm_pages += vm_pages;
1246 	} else {
1247 		/*
1248 		 * Keep holding the mh_mutex to prevent it going away.
1249 		 */
1250 		delspan_remove(&mhp->mh_transit, base, npgs);
1251 	}
1252 	mutex_exit(&mhp->mh_mutex);
1253 	return (ret);
1254 }
1255 
1256 int
1257 kphysm_del_span_query(
1258 	pfn_t base,
1259 	pgcnt_t npgs,
1260 	memquery_t *mqp)
1261 {
1262 	struct memdelspan *mdsp;
1263 	struct memdelspan *mdsp_new;
1264 	int done_first_nonreloc;
1265 
1266 	mqp->phys_pages = 0;
1267 	mqp->managed = 0;
1268 	mqp->nonrelocatable = 0;
1269 	mqp->first_nonrelocatable = 0;
1270 	mqp->last_nonrelocatable = 0;
1271 
1272 	mdsp_new = span_to_install(base, npgs);
1273 	/*
1274 	 * It is OK to proceed here if mdsp_new == NULL.
1275 	 */
1276 	done_first_nonreloc = 0;
1277 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1278 		pfn_t sbase;
1279 		pgcnt_t snpgs;
1280 
1281 		mqp->phys_pages += mdsp->mds_npgs;
1282 		sbase = mdsp->mds_base;
1283 		snpgs = mdsp->mds_npgs;
1284 		while (snpgs != 0) {
1285 			struct memseg *lseg, *seg;
1286 			pfn_t p_end;
1287 			page_t *pp;
1288 			pfn_t mseg_start;
1289 
1290 			p_end = sbase + snpgs;
1291 			/*
1292 			 * Find the lowest addressed memseg that starts
1293 			 * after sbase and account for it.
1294 			 * This is to catch dynamic memsegs whose start
1295 			 * is hidden.
1296 			 */
1297 			seg = NULL;
1298 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1299 				if ((lseg->pages_base >= sbase) ||
1300 				    (lseg->pages_base < p_end &&
1301 				    lseg->pages_end > sbase)) {
1302 					if (seg == NULL ||
1303 					    seg->pages_base > lseg->pages_base)
1304 						seg = lseg;
1305 				}
1306 			}
1307 			if (seg != NULL) {
1308 				mseg_start = memseg_get_start(seg);
1309 				/*
1310 				 * Now have the full extent of the memseg so
1311 				 * do the range check.
1312 				 */
1313 				if (mseg_start >= p_end ||
1314 				    seg->pages_end <= sbase) {
1315 					/* Span does not overlap memseg. */
1316 					seg = NULL;
1317 				}
1318 			}
1319 			/*
1320 			 * Account for gap either before the segment if
1321 			 * there is one or to the end of the span.
1322 			 */
1323 			if (seg == NULL || mseg_start > sbase) {
1324 				pfn_t a_end;
1325 
1326 				a_end = (seg == NULL) ? p_end : mseg_start;
1327 				/*
1328 				 * Check with arch layer for relocatability.
1329 				 */
1330 				if (arch_kphysm_del_span_ok(sbase,
1331 				    (a_end - sbase))) {
1332 					/*
1333 					 * No non-relocatble pages in this
1334 					 * area, avoid the fine-grained
1335 					 * test.
1336 					 */
1337 					snpgs -= (a_end - sbase);
1338 					sbase = a_end;
1339 				}
1340 				while (sbase < a_end) {
1341 					if (!arch_kphysm_del_span_ok(sbase,
1342 					    1)) {
1343 						mqp->nonrelocatable++;
1344 						if (!done_first_nonreloc) {
1345 							mqp->
1346 							    first_nonrelocatable
1347 							    = sbase;
1348 							done_first_nonreloc = 1;
1349 						}
1350 						mqp->last_nonrelocatable =
1351 						    sbase;
1352 					}
1353 					sbase++;
1354 					snpgs--;
1355 				}
1356 			}
1357 			if (seg != NULL) {
1358 				ASSERT(mseg_start <= sbase);
1359 				if (seg->pages_base != mseg_start &&
1360 				    seg->pages_base > sbase) {
1361 					pgcnt_t skip_pgs;
1362 
1363 					/*
1364 					 * Skip the page_t area of a
1365 					 * dynamic memseg.
1366 					 */
1367 					skip_pgs = seg->pages_base - sbase;
1368 					if (snpgs <= skip_pgs) {
1369 						sbase += snpgs;
1370 						snpgs = 0;
1371 						continue;
1372 					}
1373 					snpgs -= skip_pgs;
1374 					sbase += skip_pgs;
1375 				}
1376 				ASSERT(snpgs != 0);
1377 				ASSERT(seg->pages_base <= sbase);
1378 				/*
1379 				 * The individual pages can now be checked.
1380 				 */
1381 				for (pp = seg->pages +
1382 				    (sbase - seg->pages_base);
1383 				    snpgs != 0 && pp < seg->epages; pp++) {
1384 					mqp->managed++;
1385 					if (PP_ISNORELOC(pp)) {
1386 						mqp->nonrelocatable++;
1387 						if (!done_first_nonreloc) {
1388 							mqp->
1389 							    first_nonrelocatable
1390 							    = sbase;
1391 							done_first_nonreloc = 1;
1392 						}
1393 						mqp->last_nonrelocatable =
1394 						    sbase;
1395 					}
1396 					sbase++;
1397 					snpgs--;
1398 				}
1399 			}
1400 		}
1401 	}
1402 
1403 	free_delspans(mdsp_new);
1404 
1405 	return (KPHYSM_OK);
1406 }
1407 
1408 /*
1409  * This release function can be called at any stage as follows:
1410  *	_gethandle only called
1411  *	_span(s) only called
1412  *	_start called but failed
1413  *	delete thread exited
1414  */
1415 int
1416 kphysm_del_release(memhandle_t handle)
1417 {
1418 	struct mem_handle *mhp;
1419 
1420 	mhp = kphysm_lookup_mem_handle(handle);
1421 	if (mhp == NULL) {
1422 		return (KPHYSM_EHANDLE);
1423 	}
1424 	switch (mhp->mh_state) {
1425 	case MHND_STARTING:
1426 	case MHND_RUNNING:
1427 		mutex_exit(&mhp->mh_mutex);
1428 		return (KPHYSM_ENOTFINISHED);
1429 	case MHND_FREE:
1430 		ASSERT(mhp->mh_state != MHND_FREE);
1431 		mutex_exit(&mhp->mh_mutex);
1432 		return (KPHYSM_EHANDLE);
1433 	case MHND_INIT:
1434 		break;
1435 	case MHND_DONE:
1436 		break;
1437 	case MHND_RELEASE:
1438 		mutex_exit(&mhp->mh_mutex);
1439 		return (KPHYSM_ESEQUENCE);
1440 	default:
1441 #ifdef DEBUG
1442 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1443 		    (void *)mhp, mhp->mh_state);
1444 #endif /* DEBUG */
1445 		mutex_exit(&mhp->mh_mutex);
1446 		return (KPHYSM_EHANDLE);
1447 	}
1448 	/*
1449 	 * Set state so that we can wait if necessary.
1450 	 * Also this means that we have read/write access to all
1451 	 * fields except mh_exthandle and mh_state.
1452 	 */
1453 	mhp->mh_state = MHND_RELEASE;
1454 	/*
1455 	 * The mem_handle cannot be de-allocated by any other operation
1456 	 * now, so no need to hold mh_mutex.
1457 	 */
1458 	mutex_exit(&mhp->mh_mutex);
1459 
1460 	delspan_remove(&mhp->mh_transit, 0, 0);
1461 	mhp->mh_phys_pages = 0;
1462 	mhp->mh_vm_pages = 0;
1463 	mhp->mh_hold_todo = 0;
1464 	mhp->mh_delete_complete = NULL;
1465 	mhp->mh_delete_complete_arg = NULL;
1466 	mhp->mh_cancel = 0;
1467 
1468 	mutex_enter(&mhp->mh_mutex);
1469 	ASSERT(mhp->mh_state == MHND_RELEASE);
1470 	mhp->mh_state = MHND_FREE;
1471 
1472 	kphysm_free_mem_handle(mhp);
1473 
1474 	return (KPHYSM_OK);
1475 }
1476 
1477 /*
1478  * This cancel function can only be called with the thread running.
1479  */
1480 int
1481 kphysm_del_cancel(memhandle_t handle)
1482 {
1483 	struct mem_handle *mhp;
1484 
1485 	mhp = kphysm_lookup_mem_handle(handle);
1486 	if (mhp == NULL) {
1487 		return (KPHYSM_EHANDLE);
1488 	}
1489 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1490 		mutex_exit(&mhp->mh_mutex);
1491 		return (KPHYSM_ENOTRUNNING);
1492 	}
1493 	/*
1494 	 * Set the cancel flag and wake the delete thread up.
1495 	 * The thread may be waiting on I/O, so the effect of the cancel
1496 	 * may be delayed.
1497 	 */
1498 	if (mhp->mh_cancel == 0) {
1499 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1500 		cv_signal(&mhp->mh_cv);
1501 	}
1502 	mutex_exit(&mhp->mh_mutex);
1503 	return (KPHYSM_OK);
1504 }
1505 
1506 int
1507 kphysm_del_status(
1508 	memhandle_t handle,
1509 	memdelstat_t *mdstp)
1510 {
1511 	struct mem_handle *mhp;
1512 
1513 	mhp = kphysm_lookup_mem_handle(handle);
1514 	if (mhp == NULL) {
1515 		return (KPHYSM_EHANDLE);
1516 	}
1517 	/*
1518 	 * Calling kphysm_del_status() is allowed before the delete
1519 	 * is started to allow for status display.
1520 	 */
1521 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1522 	    mhp->mh_state != MHND_RUNNING) {
1523 		mutex_exit(&mhp->mh_mutex);
1524 		return (KPHYSM_ENOTRUNNING);
1525 	}
1526 	mdstp->phys_pages = mhp->mh_phys_pages;
1527 	mdstp->managed = mhp->mh_vm_pages;
1528 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1529 	mutex_exit(&mhp->mh_mutex);
1530 	return (KPHYSM_OK);
1531 }
1532 
1533 static int mem_delete_additional_pages = 100;
1534 
1535 static int
1536 can_remove_pgs(pgcnt_t npgs)
1537 {
1538 	/*
1539 	 * If all pageable pages were paged out, freemem would
1540 	 * equal availrmem.  There is a minimum requirement for
1541 	 * availrmem.
1542 	 */
1543 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1544 	    < npgs)
1545 		return (0);
1546 	/* TODO: check swap space, etc. */
1547 	return (1);
1548 }
1549 
1550 static int
1551 get_availrmem(pgcnt_t npgs)
1552 {
1553 	int ret;
1554 
1555 	mutex_enter(&freemem_lock);
1556 	ret = can_remove_pgs(npgs);
1557 	if (ret != 0)
1558 		availrmem -= npgs;
1559 	mutex_exit(&freemem_lock);
1560 	return (ret);
1561 }
1562 
1563 static void
1564 put_availrmem(pgcnt_t npgs)
1565 {
1566 	mutex_enter(&freemem_lock);
1567 	availrmem += npgs;
1568 	mutex_exit(&freemem_lock);
1569 }
1570 
1571 #define	FREEMEM_INCR	100
1572 static pgcnt_t freemem_incr = FREEMEM_INCR;
1573 #define	DEL_FREE_WAIT_FRAC	4
1574 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1575 
1576 #define	DEL_BUSY_WAIT_FRAC	20
1577 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1578 
1579 static void kphysm_del_cleanup(struct mem_handle *);
1580 
1581 static void page_delete_collect(page_t *, struct mem_handle *);
1582 
1583 static pgcnt_t
1584 delthr_get_freemem(struct mem_handle *mhp)
1585 {
1586 	pgcnt_t free_get;
1587 	int ret;
1588 
1589 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1590 
1591 	MDSTAT_INCR(mhp, need_free);
1592 	/*
1593 	 * Get up to freemem_incr pages.
1594 	 */
1595 	free_get = freemem_incr;
1596 	if (free_get > mhp->mh_hold_todo)
1597 		free_get = mhp->mh_hold_todo;
1598 	/*
1599 	 * Take free_get pages away from freemem,
1600 	 * waiting if necessary.
1601 	 */
1602 
1603 	while (!mhp->mh_cancel) {
1604 		mutex_exit(&mhp->mh_mutex);
1605 		MDSTAT_INCR(mhp, free_loop);
1606 		/*
1607 		 * Duplicate test from page_create_throttle()
1608 		 * but don't override with !PG_WAIT.
1609 		 */
1610 		if (freemem < (free_get + throttlefree)) {
1611 			MDSTAT_INCR(mhp, free_low);
1612 			ret = 0;
1613 		} else {
1614 			ret = page_create_wait(free_get, 0);
1615 			if (ret == 0) {
1616 				/* EMPTY */
1617 				MDSTAT_INCR(mhp, free_failed);
1618 			}
1619 		}
1620 		if (ret != 0) {
1621 			mutex_enter(&mhp->mh_mutex);
1622 			return (free_get);
1623 		}
1624 
1625 		/*
1626 		 * Put pressure on pageout.
1627 		 */
1628 		page_needfree(free_get);
1629 		cv_signal(&proc_pageout->p_cv);
1630 
1631 		mutex_enter(&mhp->mh_mutex);
1632 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1633 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1634 		mutex_exit(&mhp->mh_mutex);
1635 		page_needfree(-(spgcnt_t)free_get);
1636 
1637 		mutex_enter(&mhp->mh_mutex);
1638 	}
1639 	return (0);
1640 }
1641 
1642 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1643 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1644 /*
1645  * This function is run as a helper thread for delete_memory_thread.
1646  * It is needed in order to force kaio cleanup, so that pages used in kaio
1647  * will be unlocked and subsequently relocated by delete_memory_thread.
1648  * The address of the delete_memory_threads's mem_handle is passed in to
1649  * this thread function, and is used to set the mh_aio_cleanup_done member
1650  * prior to calling thread_exit().
1651  */
1652 static void
1653 dr_aio_cleanup_thread(caddr_t amhp)
1654 {
1655 	proc_t *procp;
1656 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1657 	int cleaned;
1658 	int n = 0;
1659 	struct mem_handle *mhp;
1660 	volatile uint_t *pcancel;
1661 
1662 	mhp = (struct mem_handle *)amhp;
1663 	ASSERT(mhp != NULL);
1664 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1665 	if (modload("sys", "kaio") == -1) {
1666 		mhp->mh_aio_cleanup_done = 1;
1667 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1668 		thread_exit();
1669 	}
1670 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1671 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1672 	if (aio_cleanup_dr_delete_memory == NULL) {
1673 		mhp->mh_aio_cleanup_done = 1;
1674 		cmn_err(CE_WARN,
1675 	    "aio_cleanup_dr_delete_memory not found in kaio");
1676 		thread_exit();
1677 	}
1678 	do {
1679 		cleaned = 0;
1680 		mutex_enter(&pidlock);
1681 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1682 		    procp = procp->p_next) {
1683 			mutex_enter(&procp->p_lock);
1684 			if (procp->p_aio != NULL) {
1685 				/* cleanup proc's outstanding kaio */
1686 				cleaned +=
1687 				    (*aio_cleanup_dr_delete_memory)(procp);
1688 			}
1689 			mutex_exit(&procp->p_lock);
1690 		}
1691 		mutex_exit(&pidlock);
1692 		if ((*pcancel == 0) &&
1693 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1694 			/* delay a bit before retrying all procs again */
1695 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1696 			n = 0;
1697 		}
1698 	} while (*pcancel == 0);
1699 	mhp->mh_aio_cleanup_done = 1;
1700 	thread_exit();
1701 }
1702 
1703 static void
1704 delete_memory_thread(caddr_t amhp)
1705 {
1706 	struct mem_handle *mhp;
1707 	struct memdelspan *mdsp;
1708 	callb_cpr_t cprinfo;
1709 	page_t *pp_targ;
1710 	spgcnt_t freemem_left;
1711 	void (*del_complete_funcp)(void *, int error);
1712 	void *del_complete_arg;
1713 	int comp_code;
1714 	int ret;
1715 	int first_scan;
1716 	uint_t szc;
1717 #ifdef MEM_DEL_STATS
1718 	uint64_t start_total, ntick_total;
1719 	uint64_t start_pgrp, ntick_pgrp;
1720 #endif /* MEM_DEL_STATS */
1721 
1722 	mhp = (struct mem_handle *)amhp;
1723 
1724 #ifdef MEM_DEL_STATS
1725 	start_total = ddi_get_lbolt();
1726 #endif /* MEM_DEL_STATS */
1727 
1728 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1729 	    callb_generic_cpr, "memdel");
1730 
1731 	mutex_enter(&mhp->mh_mutex);
1732 	ASSERT(mhp->mh_state == MHND_STARTING);
1733 
1734 	mhp->mh_state = MHND_RUNNING;
1735 	mhp->mh_thread_id = curthread;
1736 
1737 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1738 	mutex_exit(&mhp->mh_mutex);
1739 
1740 	/* Allocate the remap pages now, if necessary. */
1741 	memseg_remap_init();
1742 
1743 	/*
1744 	 * Subtract from availrmem now if possible as availrmem
1745 	 * may not be available by the end of the delete.
1746 	 */
1747 	if (!get_availrmem(mhp->mh_vm_pages)) {
1748 		comp_code = KPHYSM_ENOTVIABLE;
1749 		mutex_enter(&mhp->mh_mutex);
1750 		goto early_exit;
1751 	}
1752 
1753 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1754 
1755 	mutex_enter(&mhp->mh_mutex);
1756 
1757 	if (ret != 0) {
1758 		mhp->mh_cancel = KPHYSM_EREFUSED;
1759 		goto refused;
1760 	}
1761 
1762 	transit_list_collect(mhp, 1);
1763 
1764 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1765 	    mdsp = mdsp->mds_next) {
1766 		ASSERT(mdsp->mds_bitmap == NULL);
1767 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1768 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1769 		    KM_SLEEP);
1770 	}
1771 
1772 	first_scan = 1;
1773 	freemem_left = 0;
1774 	/*
1775 	 * Start dr_aio_cleanup_thread, which periodically iterates
1776 	 * through the process list and invokes aio cleanup.  This
1777 	 * is needed in order to avoid a deadly embrace between the
1778 	 * delete_memory_thread (waiting on writer lock for page, with the
1779 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1780 	 * reader lock on the same page that is wanted by the
1781 	 * delete_memory_thread), and threads waiting for kaio completion
1782 	 * (blocked on spt_amp->lock).
1783 	 */
1784 	mhp->mh_dr_aio_cleanup_cancel = 0;
1785 	mhp->mh_aio_cleanup_done = 0;
1786 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1787 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1788 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1789 		pgcnt_t collected;
1790 
1791 		MDSTAT_INCR(mhp, nloop);
1792 		collected = 0;
1793 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1794 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1795 			pfn_t pfn, p_end;
1796 
1797 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1798 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1799 			    (mhp->mh_cancel == 0); pfn++) {
1800 				page_t *pp, *tpp, *tpp_targ;
1801 				pgcnt_t bit;
1802 				struct vnode *vp;
1803 				u_offset_t offset;
1804 				int mod, result;
1805 				spgcnt_t pgcnt;
1806 
1807 				bit = pfn - mdsp->mds_base;
1808 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1809 				    (1 << (bit % NBPBMW))) != 0) {
1810 					MDSTAT_INCR(mhp, already_done);
1811 					continue;
1812 				}
1813 				if (freemem_left == 0) {
1814 					freemem_left += delthr_get_freemem(mhp);
1815 					if (freemem_left == 0)
1816 						break;
1817 				}
1818 
1819 				/*
1820 				 * Release mh_mutex - some of this
1821 				 * stuff takes some time (eg PUTPAGE).
1822 				 */
1823 
1824 				mutex_exit(&mhp->mh_mutex);
1825 				MDSTAT_INCR(mhp, ncheck);
1826 
1827 				pp = page_numtopp_nolock(pfn);
1828 				if (pp == NULL) {
1829 					/*
1830 					 * Not covered by a page_t - will
1831 					 * be dealt with elsewhere.
1832 					 */
1833 					MDSTAT_INCR(mhp, nopaget);
1834 					mutex_enter(&mhp->mh_mutex);
1835 					mdsp->mds_bitmap[bit / NBPBMW] |=
1836 					    (1 << (bit % NBPBMW));
1837 					continue;
1838 				}
1839 
1840 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1841 				    SE_EXCL_WANTED | SE_RETIRED)) {
1842 					/*
1843 					 * Page in use elsewhere.  Skip it.
1844 					 */
1845 					MDSTAT_INCR(mhp, lockfail);
1846 					mutex_enter(&mhp->mh_mutex);
1847 					continue;
1848 				}
1849 				/*
1850 				 * See if the cage expanded into the delete.
1851 				 * This can happen as we have to allow the
1852 				 * cage to expand.
1853 				 */
1854 				if (PP_ISNORELOC(pp)) {
1855 					page_unlock(pp);
1856 					mutex_enter(&mhp->mh_mutex);
1857 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1858 					break;
1859 				}
1860 				if (PP_RETIRED(pp)) {
1861 					/*
1862 					 * Page has been retired and is
1863 					 * not part of the cage so we
1864 					 * can now do the accounting for
1865 					 * it.
1866 					 */
1867 					MDSTAT_INCR(mhp, retired);
1868 					mutex_enter(&mhp->mh_mutex);
1869 					mdsp->mds_bitmap[bit / NBPBMW]
1870 					    |= (1 << (bit % NBPBMW));
1871 					mdsp->mds_bitmap_retired[bit /
1872 					    NBPBMW] |=
1873 					    (1 << (bit % NBPBMW));
1874 					mhp->mh_hold_todo--;
1875 					continue;
1876 				}
1877 				ASSERT(freemem_left != 0);
1878 				if (PP_ISFREE(pp)) {
1879 					/*
1880 					 * Like page_reclaim() only 'freemem'
1881 					 * processing is already done.
1882 					 */
1883 					MDSTAT_INCR(mhp, nfree);
1884 				free_page_collect:
1885 					if (PP_ISAGED(pp)) {
1886 						page_list_sub(pp,
1887 						    PG_FREE_LIST);
1888 					} else {
1889 						page_list_sub(pp,
1890 						    PG_CACHE_LIST);
1891 					}
1892 					PP_CLRFREE(pp);
1893 					PP_CLRAGED(pp);
1894 					collected++;
1895 					mutex_enter(&mhp->mh_mutex);
1896 					page_delete_collect(pp, mhp);
1897 					mdsp->mds_bitmap[bit / NBPBMW] |=
1898 					    (1 << (bit % NBPBMW));
1899 					freemem_left--;
1900 					continue;
1901 				}
1902 				ASSERT(pp->p_vnode != NULL);
1903 				if (first_scan) {
1904 					MDSTAT_INCR(mhp, first_notfree);
1905 					page_unlock(pp);
1906 					mutex_enter(&mhp->mh_mutex);
1907 					continue;
1908 				}
1909 				/*
1910 				 * Keep stats on pages encountered that
1911 				 * are marked for retirement.
1912 				 */
1913 				if (PP_TOXIC(pp)) {
1914 					MDSTAT_INCR(mhp, toxic);
1915 				} else if (PP_PR_REQ(pp)) {
1916 					MDSTAT_INCR(mhp, failing);
1917 				}
1918 				/*
1919 				 * In certain cases below, special exceptions
1920 				 * are made for pages that are toxic.  This
1921 				 * is because the current meaning of toxic
1922 				 * is that an uncorrectable error has been
1923 				 * previously associated with the page.
1924 				 */
1925 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1926 					if (!PP_TOXIC(pp)) {
1927 						/*
1928 						 * Must relocate locked in
1929 						 * memory pages.
1930 						 */
1931 #ifdef MEM_DEL_STATS
1932 						start_pgrp = ddi_get_lbolt();
1933 #endif /* MEM_DEL_STATS */
1934 						/*
1935 						 * Lock all constituent pages
1936 						 * of a large page to ensure
1937 						 * that p_szc won't change.
1938 						 */
1939 						if (!group_page_trylock(pp,
1940 						    SE_EXCL)) {
1941 							MDSTAT_INCR(mhp,
1942 							    gptllckfail);
1943 							page_unlock(pp);
1944 							mutex_enter(
1945 							    &mhp->mh_mutex);
1946 							continue;
1947 						}
1948 						MDSTAT_INCR(mhp, npplocked);
1949 						pp_targ =
1950 						    page_get_replacement_page(
1951 						    pp, NULL, 0);
1952 						if (pp_targ != NULL) {
1953 #ifdef MEM_DEL_STATS
1954 							ntick_pgrp =
1955 							    (uint64_t)
1956 							    ddi_get_lbolt() -
1957 							    start_pgrp;
1958 #endif /* MEM_DEL_STATS */
1959 							MDSTAT_PGRP(mhp,
1960 							    ntick_pgrp);
1961 							MDSTAT_INCR(mhp,
1962 							    nlockreloc);
1963 							goto reloc;
1964 						}
1965 						group_page_unlock(pp);
1966 						page_unlock(pp);
1967 #ifdef MEM_DEL_STATS
1968 						ntick_pgrp =
1969 						    (uint64_t)ddi_get_lbolt() -
1970 						    start_pgrp;
1971 #endif /* MEM_DEL_STATS */
1972 						MDSTAT_PGRP(mhp, ntick_pgrp);
1973 						MDSTAT_INCR(mhp, nnorepl);
1974 						mutex_enter(&mhp->mh_mutex);
1975 						continue;
1976 					} else {
1977 						/*
1978 						 * Cannot do anything about
1979 						 * this page because it is
1980 						 * toxic.
1981 						 */
1982 						MDSTAT_INCR(mhp, npplkdtoxic);
1983 						page_unlock(pp);
1984 						mutex_enter(&mhp->mh_mutex);
1985 						continue;
1986 					}
1987 				}
1988 				/*
1989 				 * Unload the mappings and check if mod bit
1990 				 * is set.
1991 				 */
1992 				ASSERT(!PP_ISKAS(pp));
1993 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1994 				mod = hat_ismod(pp);
1995 
1996 #ifdef MEM_DEL_STATS
1997 				start_pgrp = ddi_get_lbolt();
1998 #endif /* MEM_DEL_STATS */
1999 				if (mod && !PP_TOXIC(pp)) {
2000 					/*
2001 					 * Lock all constituent pages
2002 					 * of a large page to ensure
2003 					 * that p_szc won't change.
2004 					 */
2005 					if (!group_page_trylock(pp, SE_EXCL)) {
2006 						MDSTAT_INCR(mhp, gptlmodfail);
2007 						page_unlock(pp);
2008 						mutex_enter(&mhp->mh_mutex);
2009 						continue;
2010 					}
2011 					pp_targ = page_get_replacement_page(pp,
2012 					    NULL, 0);
2013 					if (pp_targ != NULL) {
2014 						MDSTAT_INCR(mhp, nmodreloc);
2015 #ifdef MEM_DEL_STATS
2016 						ntick_pgrp =
2017 						    (uint64_t)ddi_get_lbolt() -
2018 						    start_pgrp;
2019 #endif /* MEM_DEL_STATS */
2020 						MDSTAT_PGRP(mhp, ntick_pgrp);
2021 						goto reloc;
2022 					}
2023 					group_page_unlock(pp);
2024 				}
2025 
2026 				if (!page_try_demote_pages(pp)) {
2027 					MDSTAT_INCR(mhp, demotefail);
2028 					page_unlock(pp);
2029 #ifdef MEM_DEL_STATS
2030 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2031 					    start_pgrp;
2032 #endif /* MEM_DEL_STATS */
2033 					MDSTAT_PGRP(mhp, ntick_pgrp);
2034 					mutex_enter(&mhp->mh_mutex);
2035 					continue;
2036 				}
2037 
2038 				/*
2039 				 * Regular 'page-out'.
2040 				 */
2041 				if (!mod) {
2042 					MDSTAT_INCR(mhp, ndestroy);
2043 					page_destroy(pp, 1);
2044 					/*
2045 					 * page_destroy was called with
2046 					 * dontfree. As long as p_lckcnt
2047 					 * and p_cowcnt are both zero, the
2048 					 * only additional action of
2049 					 * page_destroy with !dontfree is to
2050 					 * call page_free, so we can collect
2051 					 * the page here.
2052 					 */
2053 					collected++;
2054 #ifdef MEM_DEL_STATS
2055 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2056 					    start_pgrp;
2057 #endif /* MEM_DEL_STATS */
2058 					MDSTAT_PGRP(mhp, ntick_pgrp);
2059 					mutex_enter(&mhp->mh_mutex);
2060 					page_delete_collect(pp, mhp);
2061 					mdsp->mds_bitmap[bit / NBPBMW] |=
2062 					    (1 << (bit % NBPBMW));
2063 					continue;
2064 				}
2065 				/*
2066 				 * The page is toxic and the mod bit is
2067 				 * set, we cannot do anything here to deal
2068 				 * with it.
2069 				 */
2070 				if (PP_TOXIC(pp)) {
2071 					page_unlock(pp);
2072 #ifdef MEM_DEL_STATS
2073 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2074 					    start_pgrp;
2075 #endif /* MEM_DEL_STATS */
2076 					MDSTAT_PGRP(mhp, ntick_pgrp);
2077 					MDSTAT_INCR(mhp, modtoxic);
2078 					mutex_enter(&mhp->mh_mutex);
2079 					continue;
2080 				}
2081 				MDSTAT_INCR(mhp, nputpage);
2082 				vp = pp->p_vnode;
2083 				offset = pp->p_offset;
2084 				VN_HOLD(vp);
2085 				page_unlock(pp);
2086 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2087 				    B_INVAL|B_FORCE, kcred, NULL);
2088 				VN_RELE(vp);
2089 #ifdef MEM_DEL_STATS
2090 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2091 				    start_pgrp;
2092 #endif /* MEM_DEL_STATS */
2093 				MDSTAT_PGRP(mhp, ntick_pgrp);
2094 				/*
2095 				 * Try to get the page back immediately
2096 				 * so that it can be collected.
2097 				 */
2098 				pp = page_numtopp_nolock(pfn);
2099 				if (pp == NULL) {
2100 					MDSTAT_INCR(mhp, nnoreclaim);
2101 					/*
2102 					 * This should not happen as this
2103 					 * thread is deleting the page.
2104 					 * If this code is generalized, this
2105 					 * becomes a reality.
2106 					 */
2107 #ifdef DEBUG
2108 					cmn_err(CE_WARN,
2109 					    "delete_memory_thread(0x%p) "
2110 					    "pfn 0x%lx has no page_t",
2111 					    (void *)mhp, pfn);
2112 #endif /* DEBUG */
2113 					mutex_enter(&mhp->mh_mutex);
2114 					continue;
2115 				}
2116 				if (page_try_reclaim_lock(pp, SE_EXCL,
2117 				    SE_EXCL_WANTED | SE_RETIRED)) {
2118 					if (PP_ISFREE(pp)) {
2119 						goto free_page_collect;
2120 					}
2121 					page_unlock(pp);
2122 				}
2123 				MDSTAT_INCR(mhp, nnoreclaim);
2124 				mutex_enter(&mhp->mh_mutex);
2125 				continue;
2126 
2127 			reloc:
2128 				/*
2129 				 * Got some freemem and a target
2130 				 * page, so move the data to avoid
2131 				 * I/O and lock problems.
2132 				 */
2133 				ASSERT(!page_iolock_assert(pp));
2134 				MDSTAT_INCR(mhp, nreloc);
2135 				/*
2136 				 * page_relocate() will return pgcnt: the
2137 				 * number of consecutive pages relocated.
2138 				 * If it is successful, pp will be a
2139 				 * linked list of the page structs that
2140 				 * were relocated. If page_relocate() is
2141 				 * unsuccessful, pp will be unmodified.
2142 				 */
2143 #ifdef MEM_DEL_STATS
2144 				start_pgrp = ddi_get_lbolt();
2145 #endif /* MEM_DEL_STATS */
2146 				result = page_relocate(&pp, &pp_targ, 0, 0,
2147 				    &pgcnt, NULL);
2148 #ifdef MEM_DEL_STATS
2149 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2150 				    start_pgrp;
2151 #endif /* MEM_DEL_STATS */
2152 				MDSTAT_PGRP(mhp, ntick_pgrp);
2153 				if (result != 0) {
2154 					MDSTAT_INCR(mhp, nrelocfail);
2155 					/*
2156 					 * We did not succeed. We need
2157 					 * to give the pp_targ pages back.
2158 					 * page_free(pp_targ, 1) without
2159 					 * the freemem accounting.
2160 					 */
2161 					group_page_unlock(pp);
2162 					page_free_replacement_page(pp_targ);
2163 					page_unlock(pp);
2164 					mutex_enter(&mhp->mh_mutex);
2165 					continue;
2166 				}
2167 
2168 				/*
2169 				 * We will then collect pgcnt pages.
2170 				 */
2171 				ASSERT(pgcnt > 0);
2172 				mutex_enter(&mhp->mh_mutex);
2173 				/*
2174 				 * We need to make sure freemem_left is
2175 				 * large enough.
2176 				 */
2177 				while ((freemem_left < pgcnt) &&
2178 				    (!mhp->mh_cancel)) {
2179 					freemem_left +=
2180 					    delthr_get_freemem(mhp);
2181 				}
2182 
2183 				/*
2184 				 * Do not proceed if mh_cancel is set.
2185 				 */
2186 				if (mhp->mh_cancel) {
2187 					while (pp_targ != NULL) {
2188 						/*
2189 						 * Unlink and unlock each page.
2190 						 */
2191 						tpp_targ = pp_targ;
2192 						page_sub(&pp_targ, tpp_targ);
2193 						page_unlock(tpp_targ);
2194 					}
2195 					/*
2196 					 * We need to give the pp pages back.
2197 					 * page_free(pp, 1) without the
2198 					 * freemem accounting.
2199 					 */
2200 					page_free_replacement_page(pp);
2201 					break;
2202 				}
2203 
2204 				/* Now remove pgcnt from freemem_left */
2205 				freemem_left -= pgcnt;
2206 				ASSERT(freemem_left >= 0);
2207 				szc = pp->p_szc;
2208 				while (pp != NULL) {
2209 					/*
2210 					 * pp and pp_targ were passed back as
2211 					 * a linked list of pages.
2212 					 * Unlink and unlock each page.
2213 					 */
2214 					tpp_targ = pp_targ;
2215 					page_sub(&pp_targ, tpp_targ);
2216 					page_unlock(tpp_targ);
2217 					/*
2218 					 * The original page is now free
2219 					 * so remove it from the linked
2220 					 * list and collect it.
2221 					 */
2222 					tpp = pp;
2223 					page_sub(&pp, tpp);
2224 					pfn = page_pptonum(tpp);
2225 					collected++;
2226 					ASSERT(PAGE_EXCL(tpp));
2227 					ASSERT(tpp->p_vnode == NULL);
2228 					ASSERT(!hat_page_is_mapped(tpp));
2229 					ASSERT(tpp->p_szc == szc);
2230 					tpp->p_szc = 0;
2231 					page_delete_collect(tpp, mhp);
2232 					bit = pfn - mdsp->mds_base;
2233 					mdsp->mds_bitmap[bit / NBPBMW] |=
2234 					    (1 << (bit % NBPBMW));
2235 				}
2236 				ASSERT(pp_targ == NULL);
2237 			}
2238 		}
2239 		first_scan = 0;
2240 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2241 		    (collected == 0)) {
2242 			/*
2243 			 * This code is needed as we cannot wait
2244 			 * for a page to be locked OR the delete to
2245 			 * be cancelled.  Also, we must delay so
2246 			 * that other threads get a chance to run
2247 			 * on our cpu, otherwise page locks may be
2248 			 * held indefinitely by those threads.
2249 			 */
2250 			MDSTAT_INCR(mhp, ndelay);
2251 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2252 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2253 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2254 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2255 		}
2256 	}
2257 	/* stop the dr aio cleanup thread */
2258 	mhp->mh_dr_aio_cleanup_cancel = 1;
2259 	transit_list_collect(mhp, 0);
2260 	if (freemem_left != 0) {
2261 		/* Return any surplus. */
2262 		page_create_putback(freemem_left);
2263 		freemem_left = 0;
2264 	}
2265 #ifdef MEM_DEL_STATS
2266 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2267 #endif /* MEM_DEL_STATS */
2268 	MDSTAT_TOTAL(mhp, ntick_total);
2269 	MDSTAT_PRINT(mhp);
2270 
2271 	/*
2272 	 * If the memory delete was cancelled, exclusive-wanted bits must
2273 	 * be cleared. If there are retired pages being deleted, they need
2274 	 * to be unretired.
2275 	 */
2276 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2277 	    mdsp = mdsp->mds_next) {
2278 		pfn_t pfn, p_end;
2279 
2280 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2281 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2282 			page_t *pp;
2283 			pgcnt_t bit;
2284 
2285 			bit = pfn - mdsp->mds_base;
2286 			if (mhp->mh_cancel) {
2287 				pp = page_numtopp_nolock(pfn);
2288 				if (pp != NULL) {
2289 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2290 					    (1 << (bit % NBPBMW))) == 0) {
2291 						page_lock_clr_exclwanted(pp);
2292 					}
2293 				}
2294 			} else {
2295 				pp = NULL;
2296 			}
2297 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2298 			    (1 << (bit % NBPBMW))) != 0) {
2299 				/* do we already have pp? */
2300 				if (pp == NULL) {
2301 					pp = page_numtopp_nolock(pfn);
2302 				}
2303 				ASSERT(pp != NULL);
2304 				ASSERT(PP_RETIRED(pp));
2305 				if (mhp->mh_cancel != 0) {
2306 					page_unlock(pp);
2307 					/*
2308 					 * To satisfy ASSERT below in
2309 					 * cancel code.
2310 					 */
2311 					mhp->mh_hold_todo++;
2312 				} else {
2313 					(void) page_unretire_pp(pp,
2314 					    PR_UNR_CLEAN);
2315 				}
2316 			}
2317 		}
2318 	}
2319 	/*
2320 	 * Free retired page bitmap and collected page bitmap
2321 	 */
2322 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2323 	    mdsp = mdsp->mds_next) {
2324 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2325 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2326 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2327 		ASSERT(mdsp->mds_bitmap != NULL);
2328 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2329 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2330 	}
2331 
2332 	/* wait for our dr aio cancel thread to exit */
2333 	while (!(mhp->mh_aio_cleanup_done)) {
2334 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2335 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2336 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2337 	}
2338 refused:
2339 	if (mhp->mh_cancel != 0) {
2340 		page_t *pp;
2341 
2342 		comp_code = mhp->mh_cancel;
2343 		/*
2344 		 * Go through list of deleted pages (mh_deleted) freeing
2345 		 * them.
2346 		 */
2347 		while ((pp = mhp->mh_deleted) != NULL) {
2348 			mhp->mh_deleted = pp->p_next;
2349 			mhp->mh_hold_todo++;
2350 			mutex_exit(&mhp->mh_mutex);
2351 			/* Restore p_next. */
2352 			pp->p_next = pp->p_prev;
2353 			if (PP_ISFREE(pp)) {
2354 				cmn_err(CE_PANIC,
2355 				    "page %p is free",
2356 				    (void *)pp);
2357 			}
2358 			page_free(pp, 1);
2359 			mutex_enter(&mhp->mh_mutex);
2360 		}
2361 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2362 
2363 		mutex_exit(&mhp->mh_mutex);
2364 		put_availrmem(mhp->mh_vm_pages);
2365 		mutex_enter(&mhp->mh_mutex);
2366 
2367 		goto t_exit;
2368 	}
2369 
2370 	/*
2371 	 * All the pages are no longer in use and are exclusively locked.
2372 	 */
2373 
2374 	mhp->mh_deleted = NULL;
2375 
2376 	kphysm_del_cleanup(mhp);
2377 
2378 	/*
2379 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
2380 	 * that the mem_node_config[] will remain intact for the cleanup.
2381 	 */
2382 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2383 	    mdsp = mdsp->mds_next) {
2384 		mem_node_del_range(mdsp->mds_base,
2385 		    mdsp->mds_base + mdsp->mds_npgs - 1);
2386 	}
2387 
2388 	comp_code = KPHYSM_OK;
2389 
2390 t_exit:
2391 	mutex_exit(&mhp->mh_mutex);
2392 	kphysm_setup_post_del(mhp->mh_vm_pages,
2393 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2394 	mutex_enter(&mhp->mh_mutex);
2395 
2396 early_exit:
2397 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2398 	mhp->mh_state = MHND_DONE;
2399 	del_complete_funcp = mhp->mh_delete_complete;
2400 	del_complete_arg = mhp->mh_delete_complete_arg;
2401 	CALLB_CPR_EXIT(&cprinfo);
2402 	(*del_complete_funcp)(del_complete_arg, comp_code);
2403 	thread_exit();
2404 	/*NOTREACHED*/
2405 }
2406 
2407 /*
2408  * Start the delete of the memory from the system.
2409  */
2410 int
2411 kphysm_del_start(
2412 	memhandle_t handle,
2413 	void (*complete)(void *, int),
2414 	void *complete_arg)
2415 {
2416 	struct mem_handle *mhp;
2417 
2418 	mhp = kphysm_lookup_mem_handle(handle);
2419 	if (mhp == NULL) {
2420 		return (KPHYSM_EHANDLE);
2421 	}
2422 	switch (mhp->mh_state) {
2423 	case MHND_FREE:
2424 		ASSERT(mhp->mh_state != MHND_FREE);
2425 		mutex_exit(&mhp->mh_mutex);
2426 		return (KPHYSM_EHANDLE);
2427 	case MHND_INIT:
2428 		break;
2429 	case MHND_STARTING:
2430 	case MHND_RUNNING:
2431 		mutex_exit(&mhp->mh_mutex);
2432 		return (KPHYSM_ESEQUENCE);
2433 	case MHND_DONE:
2434 		mutex_exit(&mhp->mh_mutex);
2435 		return (KPHYSM_ESEQUENCE);
2436 	case MHND_RELEASE:
2437 		mutex_exit(&mhp->mh_mutex);
2438 		return (KPHYSM_ESEQUENCE);
2439 	default:
2440 #ifdef DEBUG
2441 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2442 		    (void *)mhp, mhp->mh_state);
2443 #endif /* DEBUG */
2444 		mutex_exit(&mhp->mh_mutex);
2445 		return (KPHYSM_EHANDLE);
2446 	}
2447 
2448 	if (mhp->mh_transit.trl_spans == NULL) {
2449 		mutex_exit(&mhp->mh_mutex);
2450 		return (KPHYSM_ENOWORK);
2451 	}
2452 
2453 	ASSERT(complete != NULL);
2454 	mhp->mh_delete_complete = complete;
2455 	mhp->mh_delete_complete_arg = complete_arg;
2456 	mhp->mh_state = MHND_STARTING;
2457 	/*
2458 	 * Release the mutex in case thread_create sleeps.
2459 	 */
2460 	mutex_exit(&mhp->mh_mutex);
2461 
2462 	/*
2463 	 * The "obvious" process for this thread is pageout (proc_pageout)
2464 	 * but this gives the thread too much power over freemem
2465 	 * which results in freemem starvation.
2466 	 */
2467 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2468 	    TS_RUN, maxclsyspri - 1);
2469 
2470 	return (KPHYSM_OK);
2471 }
2472 
2473 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2474 static caddr_t pp_dummy;
2475 static pgcnt_t pp_dummy_npages;
2476 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2477 
2478 static void
2479 memseg_remap_init_pages(page_t *pages, page_t *epages)
2480 {
2481 	page_t *pp;
2482 
2483 	for (pp = pages; pp < epages; pp++) {
2484 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2485 		pp->p_offset = (u_offset_t)-1;
2486 		page_iolock_init(pp);
2487 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2488 			continue;
2489 		page_lock_delete(pp);
2490 	}
2491 }
2492 
2493 void
2494 memseg_remap_init()
2495 {
2496 	mutex_enter(&pp_dummy_lock);
2497 	if (pp_dummy == NULL) {
2498 		uint_t dpages;
2499 		int i;
2500 
2501 		/*
2502 		 * dpages starts off as the size of the structure and
2503 		 * ends up as the minimum number of pages that will
2504 		 * hold a whole number of page_t structures.
2505 		 */
2506 		dpages = sizeof (page_t);
2507 		ASSERT(dpages != 0);
2508 		ASSERT(dpages <= MMU_PAGESIZE);
2509 
2510 		while ((dpages & 1) == 0)
2511 			dpages >>= 1;
2512 
2513 		pp_dummy_npages = dpages;
2514 		/*
2515 		 * Allocate pp_dummy pages directly from static_arena,
2516 		 * since these are whole page allocations and are
2517 		 * referenced by physical address.  This also has the
2518 		 * nice fringe benefit of hiding the memory from
2519 		 * ::findleaks since it doesn't deal well with allocated
2520 		 * kernel heap memory that doesn't have any mappings.
2521 		 */
2522 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2523 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2524 		bzero(pp_dummy, ptob(pp_dummy_npages));
2525 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2526 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2527 		    pp_dummy_npages, KM_SLEEP);
2528 		for (i = 0; i < pp_dummy_npages; i++) {
2529 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2530 			    &pp_dummy[MMU_PAGESIZE * i]);
2531 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2532 		}
2533 		/*
2534 		 * Initialize the page_t's to a known 'deleted' state
2535 		 * that matches the state of deleted pages.
2536 		 */
2537 		memseg_remap_init_pages((page_t *)pp_dummy,
2538 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2539 		/* Remove kmem mappings for the pages for safety. */
2540 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2541 		    HAT_UNLOAD_UNLOCK);
2542 		/* Leave pp_dummy pointer set as flag that init is done. */
2543 	}
2544 	mutex_exit(&pp_dummy_lock);
2545 }
2546 
2547 /*
2548  * Remap a page-aglined range of page_t's to dummy pages.
2549  */
2550 void
2551 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2552 {
2553 	int phase;
2554 
2555 	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));
2556 
2557 	/*
2558 	 * We may start remapping at a non-zero page offset
2559 	 * within the dummy pages since the low/high ends
2560 	 * of the outgoing pp's could be shared by other
2561 	 * memsegs (see memseg_remap_meta).
2562 	 */
2563 	phase = btop((uint64_t)va) % pp_dummy_npages;
2564 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2565 
2566 	while (metapgs != 0) {
2567 		pgcnt_t n;
2568 		int i, j;
2569 
2570 		n = pp_dummy_npages;
2571 		if (n > metapgs)
2572 			n = metapgs;
2573 		for (i = 0; i < n; i++) {
2574 			j = (i + phase) % pp_dummy_npages;
2575 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2576 			    PROT_READ,
2577 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2578 			    HAT_LOAD_REMAP);
2579 			va += ptob(1);
2580 		}
2581 		metapgs -= n;
2582 	}
2583 }
2584 
2585 static void
2586 memseg_remap_to_dummy(struct memseg *seg)
2587 {
2588 	caddr_t pp;
2589 	pgcnt_t metapgs;
2590 
2591 	ASSERT(memseg_is_dynamic(seg));
2592 	ASSERT(pp_dummy != NULL);
2593 
2594 
2595 	if (!memseg_includes_meta(seg)) {
2596 		memseg_remap_meta(seg);
2597 		return;
2598 	}
2599 
2600 	pp = (caddr_t)seg->pages;
2601 	metapgs = seg->pages_base - memseg_get_start(seg);
2602 	ASSERT(metapgs != 0);
2603 
2604 	seg->pages_end = seg->pages_base;
2605 
2606 	remap_to_dummy(pp, metapgs);
2607 }
2608 
2609 /*
2610  * Transition all the deleted pages to the deleted state so that
2611  * page_lock will not wait. The page_lock_delete call will
2612  * also wake up any waiters.
2613  */
2614 static void
2615 memseg_lock_delete_all(struct memseg *seg)
2616 {
2617 	page_t *pp;
2618 
2619 	for (pp = seg->pages; pp < seg->epages; pp++) {
2620 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2621 		page_lock_delete(pp);
2622 	}
2623 }
2624 
2625 static void
2626 kphysm_del_cleanup(struct mem_handle *mhp)
2627 {
2628 	struct memdelspan	*mdsp;
2629 	struct memseg		*seg;
2630 	struct memseg   	**segpp;
2631 	struct memseg		*seglist;
2632 	pfn_t			p_end;
2633 	uint64_t		avmem;
2634 	pgcnt_t			avpgs;
2635 	pgcnt_t			npgs;
2636 
2637 	avpgs = mhp->mh_vm_pages;
2638 
2639 	memsegs_lock(1);
2640 
2641 	/*
2642 	 * remove from main segment list.
2643 	 */
2644 	npgs = 0;
2645 	seglist = NULL;
2646 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2647 	    mdsp = mdsp->mds_next) {
2648 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2649 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2650 			if (seg->pages_base >= p_end ||
2651 			    seg->pages_end <= mdsp->mds_base) {
2652 				/* Span and memseg don't overlap. */
2653 				segpp = &((*segpp)->next);
2654 				continue;
2655 			}
2656 			ASSERT(seg->pages_base >= mdsp->mds_base);
2657 			ASSERT(seg->pages_end <= p_end);
2658 
2659 			PLCNT_MODIFY_MAX(seg->pages_base,
2660 			    seg->pages_base - seg->pages_end);
2661 
2662 			/* Hide the memseg from future scans. */
2663 			hat_kpm_delmem_mseg_update(seg, segpp);
2664 			*segpp = seg->next;
2665 			membar_producer();	/* TODO: Needed? */
2666 			npgs += MSEG_NPAGES(seg);
2667 
2668 			/*
2669 			 * Leave the deleted segment's next pointer intact
2670 			 * in case a memsegs scanning loop is walking this
2671 			 * segment concurrently.
2672 			 */
2673 			seg->lnext = seglist;
2674 			seglist = seg;
2675 		}
2676 	}
2677 
2678 	build_pfn_hash();
2679 
2680 	ASSERT(npgs < total_pages);
2681 	total_pages -= npgs;
2682 
2683 	/*
2684 	 * Recalculate the paging parameters now total_pages has changed.
2685 	 * This will also cause the clock hands to be reset before next use.
2686 	 */
2687 	setupclock(1);
2688 
2689 	memsegs_unlock(1);
2690 
2691 	mutex_exit(&mhp->mh_mutex);
2692 
2693 	while ((seg = seglist) != NULL) {
2694 		pfn_t mseg_start;
2695 		pfn_t mseg_base, mseg_end;
2696 		pgcnt_t mseg_npgs;
2697 		int mlret;
2698 
2699 		seglist = seg->lnext;
2700 
2701 		/*
2702 		 * Put the page_t's into the deleted state to stop
2703 		 * cv_wait()s on the pages. When we remap, the dummy
2704 		 * page_t's will be in the same state.
2705 		 */
2706 		memseg_lock_delete_all(seg);
2707 		/*
2708 		 * Collect up information based on pages_base and pages_end
2709 		 * early so that we can flag early that the memseg has been
2710 		 * deleted by setting pages_end == pages_base.
2711 		 */
2712 		mseg_base = seg->pages_base;
2713 		mseg_end = seg->pages_end;
2714 		mseg_npgs = MSEG_NPAGES(seg);
2715 		mseg_start = memseg_get_start(seg);
2716 
2717 		if (memseg_is_dynamic(seg)) {
2718 			/* Remap the meta data to our special dummy area. */
2719 			memseg_remap_to_dummy(seg);
2720 
2721 			mutex_enter(&memseg_lists_lock);
2722 			seg->lnext = memseg_va_avail;
2723 			memseg_va_avail = seg;
2724 			mutex_exit(&memseg_lists_lock);
2725 		} else {
2726 			/*
2727 			 * For memory whose page_ts were allocated
2728 			 * at boot, we need to find a new use for
2729 			 * the page_t memory.
2730 			 * For the moment, just leak it.
2731 			 * (It is held in the memseg_delete_junk list.)
2732 			 */
2733 			seg->pages_end = seg->pages_base;
2734 
2735 			mutex_enter(&memseg_lists_lock);
2736 			seg->lnext = memseg_delete_junk;
2737 			memseg_delete_junk = seg;
2738 			mutex_exit(&memseg_lists_lock);
2739 		}
2740 
2741 		/* Must not use seg now as it could be re-used. */
2742 
2743 		memlist_write_lock();
2744 
2745 		mlret = memlist_delete_span(
2746 		    (uint64_t)(mseg_base) << PAGESHIFT,
2747 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2748 		    &phys_avail);
2749 		ASSERT(mlret == MEML_SPANOP_OK);
2750 
2751 		mlret = memlist_delete_span(
2752 		    (uint64_t)(mseg_start) << PAGESHIFT,
2753 		    (uint64_t)(mseg_end - mseg_start) <<
2754 		    PAGESHIFT,
2755 		    &phys_install);
2756 		ASSERT(mlret == MEML_SPANOP_OK);
2757 		phys_install_has_changed();
2758 
2759 		memlist_write_unlock();
2760 	}
2761 
2762 	memlist_read_lock();
2763 	installed_top_size(phys_install, &physmax, &physinstalled);
2764 	memlist_read_unlock();
2765 
2766 	mutex_enter(&freemem_lock);
2767 	maxmem -= avpgs;
2768 	physmem -= avpgs;
2769 	/* availrmem is adjusted during the delete. */
2770 	availrmem_initial -= avpgs;
2771 
2772 	mutex_exit(&freemem_lock);
2773 
2774 	dump_resize();
2775 
2776 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2777 	    "(0x%" PRIx64 ")\n",
2778 	    physinstalled << (PAGESHIFT - 10),
2779 	    (uint64_t)physinstalled << PAGESHIFT);
2780 
2781 	avmem = (uint64_t)freemem << PAGESHIFT;
2782 	cmn_err(CE_CONT, "?kphysm_delete: "
2783 	    "avail mem = %" PRId64 "\n", avmem);
2784 
2785 	/*
2786 	 * Update lgroup generation number on single lgroup systems
2787 	 */
2788 	if (nlgrps == 1)
2789 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2790 
2791 	/* Successfully deleted system memory */
2792 	mutex_enter(&mhp->mh_mutex);
2793 }
2794 
2795 static uint_t mdel_nullvp_waiter;
2796 
2797 static void
2798 page_delete_collect(
2799 	page_t *pp,
2800 	struct mem_handle *mhp)
2801 {
2802 	if (pp->p_vnode) {
2803 		page_hashout(pp, (kmutex_t *)NULL);
2804 		/* do not do PP_SETAGED(pp); */
2805 	} else {
2806 		kmutex_t *sep;
2807 
2808 		sep = page_se_mutex(pp);
2809 		mutex_enter(sep);
2810 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2811 			mdel_nullvp_waiter++;
2812 			cv_broadcast(&pp->p_cv);
2813 		}
2814 		mutex_exit(sep);
2815 	}
2816 	ASSERT(pp->p_next == pp->p_prev);
2817 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2818 	pp->p_next = mhp->mh_deleted;
2819 	mhp->mh_deleted = pp;
2820 	ASSERT(mhp->mh_hold_todo != 0);
2821 	mhp->mh_hold_todo--;
2822 }
2823 
2824 static void
2825 transit_list_collect(struct mem_handle *mhp, int v)
2826 {
2827 	struct transit_list_head *trh;
2828 
2829 	trh = &transit_list_head;
2830 	mutex_enter(&trh->trh_lock);
2831 	mhp->mh_transit.trl_collect = v;
2832 	mutex_exit(&trh->trh_lock);
2833 }
2834 
2835 static void
2836 transit_list_insert(struct transit_list *tlp)
2837 {
2838 	struct transit_list_head *trh;
2839 
2840 	trh = &transit_list_head;
2841 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2842 	tlp->trl_next = trh->trh_head;
2843 	trh->trh_head = tlp;
2844 }
2845 
2846 static void
2847 transit_list_remove(struct transit_list *tlp)
2848 {
2849 	struct transit_list_head *trh;
2850 	struct transit_list **tlpp;
2851 
2852 	trh = &transit_list_head;
2853 	tlpp = &trh->trh_head;
2854 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2855 	while (*tlpp != NULL && *tlpp != tlp)
2856 		tlpp = &(*tlpp)->trl_next;
2857 	ASSERT(*tlpp != NULL);
2858 	if (*tlpp == tlp)
2859 		*tlpp = tlp->trl_next;
2860 	tlp->trl_next = NULL;
2861 }
2862 
2863 static struct transit_list *
2864 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2865 {
2866 	struct transit_list *tlp;
2867 
2868 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2869 		struct memdelspan *mdsp;
2870 
2871 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2872 		    mdsp = mdsp->mds_next) {
2873 			if (pfnum >= mdsp->mds_base &&
2874 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2875 				return (tlp);
2876 			}
2877 		}
2878 	}
2879 	return (NULL);
2880 }
2881 
2882 int
2883 pfn_is_being_deleted(pfn_t pfnum)
2884 {
2885 	struct transit_list_head *trh;
2886 	struct transit_list *tlp;
2887 	int ret;
2888 
2889 	trh = &transit_list_head;
2890 	if (trh->trh_head == NULL)
2891 		return (0);
2892 
2893 	mutex_enter(&trh->trh_lock);
2894 	tlp = pfnum_to_transit_list(trh, pfnum);
2895 	ret = (tlp != NULL && tlp->trl_collect);
2896 	mutex_exit(&trh->trh_lock);
2897 
2898 	return (ret);
2899 }
2900 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the statistics gathered for one memory delete operation.
 * Nothing is printed unless mem_del_stat_print is set.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	struct memdelspan *first;
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	/* The header identifies the operation by its first span. */
	first = mhp->mh_transit.trl_spans;
	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)first->mds_base,
	    (uint_t)first->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick counters are also shown converted to minutes/seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2954 
2955 struct mem_callback {
2956 	kphysm_setup_vector_t	*vec;
2957 	void			*arg;
2958 };
2959 
2960 #define	NMEMCALLBACKS		100
2961 
2962 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2963 static uint_t nmemcallbacks;
2964 static krwlock_t mem_callback_rwlock;
2965 
2966 int
2967 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2968 {
2969 	uint_t i, found;
2970 
2971 	/*
2972 	 * This test will become more complicated when the version must
2973 	 * change.
2974 	 */
2975 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2976 		return (EINVAL);
2977 
2978 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2979 	    vec->post_del == NULL)
2980 		return (EINVAL);
2981 
2982 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2983 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2984 		if (mem_callbacks[i].vec == NULL && found == 0)
2985 			found = i + 1;
2986 		if (mem_callbacks[i].vec == vec &&
2987 		    mem_callbacks[i].arg == arg) {
2988 #ifdef DEBUG
2989 			/* Catch this in DEBUG kernels. */
2990 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2991 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2992 			    (void *)vec, arg, (void *)caller());
2993 #endif /* DEBUG */
2994 			rw_exit(&mem_callback_rwlock);
2995 			return (EEXIST);
2996 		}
2997 	}
2998 	if (found != 0) {
2999 		i = found - 1;
3000 	} else {
3001 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
3002 		if (nmemcallbacks == NMEMCALLBACKS) {
3003 			rw_exit(&mem_callback_rwlock);
3004 			return (ENOMEM);
3005 		}
3006 		i = nmemcallbacks++;
3007 	}
3008 	mem_callbacks[i].vec = vec;
3009 	mem_callbacks[i].arg = arg;
3010 	rw_exit(&mem_callback_rwlock);
3011 	return (0);
3012 }
3013 
3014 void
3015 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3016 {
3017 	uint_t i;
3018 
3019 	rw_enter(&mem_callback_rwlock, RW_WRITER);
3020 	for (i = 0; i < nmemcallbacks; i++) {
3021 		if (mem_callbacks[i].vec == vec &&
3022 		    mem_callbacks[i].arg == arg) {
3023 			mem_callbacks[i].vec = NULL;
3024 			mem_callbacks[i].arg = NULL;
3025 			if (i == (nmemcallbacks - 1))
3026 				nmemcallbacks--;
3027 			break;
3028 		}
3029 	}
3030 	rw_exit(&mem_callback_rwlock);
3031 }
3032 
3033 static void
3034 kphysm_setup_post_add(pgcnt_t delta_pages)
3035 {
3036 	uint_t i;
3037 
3038 	rw_enter(&mem_callback_rwlock, RW_READER);
3039 	for (i = 0; i < nmemcallbacks; i++) {
3040 		if (mem_callbacks[i].vec != NULL) {
3041 			(*mem_callbacks[i].vec->post_add)
3042 			    (mem_callbacks[i].arg, delta_pages);
3043 		}
3044 	}
3045 	rw_exit(&mem_callback_rwlock);
3046 }
3047 
3048 /*
3049  * Note the locking between pre_del and post_del: The reader lock is held
3050  * between the two calls to stop the set of functions from changing.
3051  */
3052 
3053 static int
3054 kphysm_setup_pre_del(pgcnt_t delta_pages)
3055 {
3056 	uint_t i;
3057 	int ret;
3058 	int aret;
3059 
3060 	ret = 0;
3061 	rw_enter(&mem_callback_rwlock, RW_READER);
3062 	for (i = 0; i < nmemcallbacks; i++) {
3063 		if (mem_callbacks[i].vec != NULL) {
3064 			aret = (*mem_callbacks[i].vec->pre_del)
3065 			    (mem_callbacks[i].arg, delta_pages);
3066 			ret |= aret;
3067 		}
3068 	}
3069 
3070 	return (ret);
3071 }
3072 
3073 static void
3074 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3075 {
3076 	uint_t i;
3077 
3078 	for (i = 0; i < nmemcallbacks; i++) {
3079 		if (mem_callbacks[i].vec != NULL) {
3080 			(*mem_callbacks[i].vec->post_del)
3081 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3082 		}
3083 	}
3084 	rw_exit(&mem_callback_rwlock);
3085 }
3086 
3087 static int
3088 kphysm_split_memseg(
3089 	pfn_t base,
3090 	pgcnt_t npgs)
3091 {
3092 	struct memseg *seg;
3093 	struct memseg **segpp;
3094 	pgcnt_t size_low, size_high;
3095 	struct memseg *seg_low, *seg_mid, *seg_high;
3096 
3097 	/*
3098 	 * Lock the memsegs list against other updates now
3099 	 */
3100 	memsegs_lock(1);
3101 
3102 	/*
3103 	 * Find boot time memseg that wholly covers this area.
3104 	 */
3105 
3106 	/* First find the memseg with page 'base' in it. */
3107 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3108 	    segpp = &((*segpp)->next)) {
3109 		if (base >= seg->pages_base && base < seg->pages_end)
3110 			break;
3111 	}
3112 	if (seg == NULL) {
3113 		memsegs_unlock(1);
3114 		return (0);
3115 	}
3116 	if (memseg_includes_meta(seg)) {
3117 		memsegs_unlock(1);
3118 		return (0);
3119 	}
3120 	if ((base + npgs) > seg->pages_end) {
3121 		memsegs_unlock(1);
3122 		return (0);
3123 	}
3124 
3125 	/*
3126 	 * Work out the size of the two segments that will
3127 	 * surround the new segment, one for low address
3128 	 * and one for high.
3129 	 */
3130 	ASSERT(base >= seg->pages_base);
3131 	size_low = base - seg->pages_base;
3132 	ASSERT(seg->pages_end >= (base + npgs));
3133 	size_high = seg->pages_end - (base + npgs);
3134 
3135 	/*
3136 	 * Sanity check.
3137 	 */
3138 	if ((size_low + size_high) == 0) {
3139 		memsegs_unlock(1);
3140 		return (0);
3141 	}
3142 
3143 	/*
3144 	 * Allocate the new structures. The old memseg will not be freed
3145 	 * as there may be a reference to it.
3146 	 */
3147 	seg_low = NULL;
3148 	seg_high = NULL;
3149 
3150 	if (size_low != 0)
3151 		seg_low = memseg_alloc();
3152 
3153 	seg_mid = memseg_alloc();
3154 
3155 	if (size_high != 0)
3156 		seg_high = memseg_alloc();
3157 
3158 	/*
3159 	 * All allocation done now.
3160 	 */
3161 	if (size_low != 0) {
3162 		seg_low->pages = seg->pages;
3163 		seg_low->epages = seg_low->pages + size_low;
3164 		seg_low->pages_base = seg->pages_base;
3165 		seg_low->pages_end = seg_low->pages_base + size_low;
3166 		seg_low->next = seg_mid;
3167 		seg_low->msegflags = seg->msegflags;
3168 	}
3169 	if (size_high != 0) {
3170 		seg_high->pages = seg->epages - size_high;
3171 		seg_high->epages = seg_high->pages + size_high;
3172 		seg_high->pages_base = seg->pages_end - size_high;
3173 		seg_high->pages_end = seg_high->pages_base + size_high;
3174 		seg_high->next = seg->next;
3175 		seg_high->msegflags = seg->msegflags;
3176 	}
3177 
3178 	seg_mid->pages = seg->pages + size_low;
3179 	seg_mid->pages_base = seg->pages_base + size_low;
3180 	seg_mid->epages = seg->epages - size_high;
3181 	seg_mid->pages_end = seg->pages_end - size_high;
3182 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3183 	seg_mid->msegflags = seg->msegflags;
3184 
3185 	/*
3186 	 * Update hat_kpm specific info of all involved memsegs and
3187 	 * allow hat_kpm specific global chain updates.
3188 	 */
3189 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3190 
3191 	/*
3192 	 * At this point we have two equivalent memseg sub-chains,
3193 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3194 	 * the same place in the global chain. By re-writing the pointer
3195 	 * in the previous element we switch atomically from using the old
3196 	 * (seg) to the new.
3197 	 */
3198 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3199 
3200 	membar_enter();
3201 
3202 	build_pfn_hash();
3203 	memsegs_unlock(1);
3204 
3205 	/*
3206 	 * We leave the old segment, 'seg', intact as there may be
3207 	 * references to it. Also, as the value of total_pages has not
3208 	 * changed and the memsegs list is effectively the same when
3209 	 * accessed via the old or the new pointer, we do not have to
3210 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3211 	 *
3212 	 * We currently do not re-use or reclaim the page_t memory.
3213 	 * If we do, then this may have to change.
3214 	 */
3215 
3216 	mutex_enter(&memseg_lists_lock);
3217 	seg->lnext = memseg_edit_junk;
3218 	memseg_edit_junk = seg;
3219 	mutex_exit(&memseg_lists_lock);
3220 
3221 	return (1);
3222 }
3223 
3224 /*
3225  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3226  * structure using physical addresses. Therefore a kmem_cache is
3227  * used with KMC_NOHASH to avoid page crossings within a memseg
3228  * structure. KMC_NOHASH requires that no external (outside of
3229  * slab) information is allowed. This, in turn, implies that the
3230  * cache's slabsize must be exactly a single page, since per-slab
3231  * information (e.g. the freelist for the slab) is kept at the
3232  * end of the slab, where it is easy to locate. Should be changed
3233  * when a more obvious kmem_cache interface/flag will become
3234  * available.
3235  */
3236 void
3237 mem_config_init()
3238 {
3239 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3240 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3241 }
3242 
3243 struct memseg *
3244 memseg_alloc()
3245 {
3246 	struct memseg *seg;
3247 
3248 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3249 	bzero(seg, sizeof (struct memseg));
3250 
3251 	return (seg);
3252 }
3253 
3254 /*
3255  * Return whether the page_t memory for this memseg
3256  * is included in the memseg itself.
3257  */
3258 static int
3259 memseg_includes_meta(struct memseg *seg)
3260 {
3261 	return (seg->msegflags & MEMSEG_META_INCL);
3262 }
3263 
3264 pfn_t
3265 memseg_get_start(struct memseg *seg)
3266 {
3267 	pfn_t		pt_start;
3268 
3269 	if (memseg_includes_meta(seg)) {
3270 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3271 
3272 		/* Meta data is required to be at the beginning */
3273 		ASSERT(pt_start < seg->pages_base);
3274 	} else
3275 		pt_start = seg->pages_base;
3276 
3277 	return (pt_start);
3278 }
3279 
3280 /*
3281  * Invalidate memseg pointers in cpu private vm data caches.
3282  */
3283 static void
3284 memseg_cpu_vm_flush()
3285 {
3286 	cpu_t *cp;
3287 	vm_cpu_data_t *vc;
3288 
3289 	mutex_enter(&cpu_lock);
3290 	pause_cpus(NULL);
3291 
3292 	cp = cpu_list;
3293 	do {
3294 		vc = cp->cpu_vm_data;
3295 		vc->vc_pnum_memseg = NULL;
3296 		vc->vc_pnext_memseg = NULL;
3297 
3298 	} while ((cp = cp->cpu_next) != cpu_list);
3299 
3300 	start_cpus();
3301 	mutex_exit(&cpu_lock);
3302 }
3303