xref: /titanic_52/usr/src/uts/common/os/mem_config.c (revision 73a3eccd27d9673a6407274ea0de350699562fd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
60 extern struct memlist *phys_avail;
61 
62 extern void mem_node_add(pfn_t, pfn_t);
63 extern void mem_node_del(pfn_t, pfn_t);
64 
65 extern uint_t page_ctrs_adjust(int);
66 void page_ctrs_cleanup(void);
67 static void kphysm_setup_post_add(pgcnt_t);
68 static int kphysm_setup_pre_del(pgcnt_t);
69 static void kphysm_setup_post_del(pgcnt_t, int);
70 
71 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
72 
73 static int delspan_reserve(pfn_t, pgcnt_t);
74 static void delspan_unreserve(pfn_t, pgcnt_t);
75 
/*
 * Held by memseg_reuse() while it walks and unlinks entries from
 * memseg_va_avail.
 */
kmutex_t memseg_lists_lock;
/*
 * Head of the list (linked through lnext) of memsegs that memseg_reuse()
 * may hand back to kphysm_add_memory_dynamic().
 */
struct memseg *memseg_va_avail;
struct memseg *memseg_alloc(void);
/*
 * NOTE(review): the two "junk" lists are not referenced in this part of the
 * file; presumably they hold memsegs retired by delete/split -- confirm.
 */
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(struct memseg *);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

/* NOTE(review): presumably the kmem cache backing memseg_alloc() -- confirm. */
static struct kmem_cache *memseg_cache;
87 
88 /*
89  * Interfaces to manage externally allocated
90  * page_t memory (metadata) for a memseg.
91  */
92 #pragma weak	memseg_alloc_meta
93 #pragma weak	memseg_free_meta
94 #pragma weak	memseg_get_metapfn
95 #pragma weak	memseg_remap_meta
96 
97 extern int ppvm_enable;
98 extern page_t *ppvm_base;
99 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
100 extern void memseg_free_meta(void *, pgcnt_t);
101 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
102 extern void memseg_remap_meta(struct memseg *);
103 static int memseg_is_dynamic(struct memseg *);
104 static int memseg_includes_meta(struct memseg *);
105 pfn_t memseg_get_start(struct memseg *);
106 static void memseg_cpu_vm_flush(void);
107 
/*
 * When non-zero, kphysm_add_memory_dynamic() first asks memseg_alloc_meta()
 * for page_t storage and only falls back to carving the page_t's out of the
 * incoming span if that call does not return KPHYSM_OK.
 */
int meta_alloc_enable;
109 
110 /*
111  * Add a chunk of memory to the system.
112  * base: starting PAGESIZE page of new memory.
113  * npgs: length in PAGESIZE pages.
114  *
115  * Adding mem this way doesn't increase the size of the hash tables;
116  * growing them would be too hard.  This should be OK, but adding memory
117  * dynamically most likely means more hash misses, since the tables will
118  * be smaller than they otherwise would be.
119  */
/*
 * MEMSEG_DEBUG() emits a printf-style trace message on DEBUG kernels when
 * memseg_debug is non-zero and expands to nothing otherwise.  The DEBUG
 * form is wrapped in do { } while (0) so that it behaves as a single
 * statement and cannot capture a following "else".
 */
#ifdef	DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(...)			\
	do {					\
		if (memseg_debug)		\
			printf(__VA_ARGS__);	\
	} while (0)
#else
#define	MEMSEG_DEBUG(...)
#endif
126 
127 int
128 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
129 {
130 	page_t *pp;
131 	page_t		*opp, *oepp, *segpp;
132 	struct memseg	*seg;
133 	uint64_t	avmem;
134 	pfn_t		pfn;
135 	pfn_t		pt_base = base;
136 	pgcnt_t		tpgs = npgs;
137 	pgcnt_t		metapgs = 0;
138 	int		exhausted;
139 	pfn_t		pnum;
140 	int		mnode;
141 	caddr_t		vaddr;
142 	int		reuse;
143 	int		mlret;
144 	int		rv;
145 	int		flags;
146 	int		meta_alloc = 0;
147 	void		*mapva;
148 	void		*metabase = (void *)base;
149 	pgcnt_t		nkpmpgs = 0;
150 	offset_t	kpm_pages_off;
151 
152 	cmn_err(CE_CONT,
153 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
154 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
155 
156 	/*
157 	 * Add this span in the delete list to prevent interactions.
158 	 */
159 	if (!delspan_reserve(base, npgs)) {
160 		return (KPHYSM_ESPAN);
161 	}
162 	/*
163 	 * Check to see if any of the memory span has been added
164 	 * by trying an add to the installed memory list. This
165 	 * forms the interlocking process for add.
166 	 */
167 
168 	memlist_write_lock();
169 
170 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
171 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
172 
173 	if (mlret == MEML_SPANOP_OK)
174 		installed_top_size(phys_install, &physmax, &physinstalled);
175 
176 	memlist_write_unlock();
177 
178 	if (mlret != MEML_SPANOP_OK) {
179 		if (mlret == MEML_SPANOP_EALLOC) {
180 			delspan_unreserve(pt_base, tpgs);
181 			return (KPHYSM_ERESOURCE);
182 		} else if (mlret == MEML_SPANOP_ESPAN) {
183 			delspan_unreserve(pt_base, tpgs);
184 			return (KPHYSM_ESPAN);
185 		} else {
186 			delspan_unreserve(pt_base, tpgs);
187 			return (KPHYSM_ERESOURCE);
188 		}
189 	}
190 
191 	if (meta_alloc_enable) {
192 		/*
193 		 * Allocate the page_t's from existing memory;
194 		 * if that fails, allocate from the incoming memory.
195 		 */
196 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
197 		if (rv == KPHYSM_OK) {
198 			ASSERT(metapgs);
199 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
200 			meta_alloc = 1;
201 			goto mapalloc;
202 		}
203 	}
204 
205 	/*
206 	 * We store the page_t's for this new memory in the first
207 	 * few pages of the chunk. Here, we go and get'em ...
208 	 */
209 
210 	/*
211 	 * The expression after the '-' gives the number of pages
212 	 * that will fit in the new memory based on a requirement
213 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
214 	 */
215 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
216 	    (PAGESIZE + sizeof (page_t)));
217 
218 	npgs -= metapgs;
219 	base += metapgs;
220 
221 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
222 
223 	exhausted = (metapgs == 0 || npgs == 0);
224 
225 	if (kpm_enable && !exhausted) {
226 		pgcnt_t start, end, nkpmpgs_prelim;
227 		size_t	ptsz;
228 
229 		/*
230 		 * A viable kpm large page mapping must not overlap two
231 		 * dynamic memsegs. Therefore the total size is checked
232 		 * to be at least kpm_pgsz and also whether start and end
233 		 * points are at least kpm_pgsz aligned.
234 		 */
235 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
236 		    pmodkpmp(base + npgs)) {
237 
238 			kphysm_addmem_error_undospan(pt_base, tpgs);
239 
240 			/*
241 			 * There is no specific error code for violating
242 			 * kpm granularity constraints.
243 			 */
244 			return (KPHYSM_ENOTVIABLE);
245 		}
246 
247 		start = kpmptop(ptokpmp(base));
248 		end = kpmptop(ptokpmp(base + npgs));
249 		nkpmpgs_prelim = ptokpmp(end - start);
250 		ptsz = npgs * sizeof (page_t);
251 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
252 		exhausted = (tpgs <= metapgs);
253 		if (!exhausted) {
254 			npgs = tpgs - metapgs;
255 			base = pt_base + metapgs;
256 
257 			/* final nkpmpgs */
258 			start = kpmptop(ptokpmp(base));
259 			nkpmpgs = ptokpmp(end - start);
260 			kpm_pages_off = ptsz +
261 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
262 		}
263 	}
264 
265 	/*
266 	 * Is memory area supplied too small?
267 	 */
268 	if (exhausted) {
269 		kphysm_addmem_error_undospan(pt_base, tpgs);
270 		/*
271 		 * There is no specific error code for 'too small'.
272 		 */
273 		return (KPHYSM_ERESOURCE);
274 	}
275 
276 mapalloc:
277 	/*
278 	 * We may re-use a previously allocated VA space for the page_ts
279 	 * eventually, but we need to initialize and lock the pages first.
280 	 */
281 
282 	/*
283 	 * Get an address in the kernel address map, map
284 	 * the page_t pages and see if we can touch them.
285 	 */
286 
287 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
288 	if (mapva == NULL) {
289 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
290 		    " Can't allocate VA for page_ts");
291 
292 		if (meta_alloc)
293 			memseg_free_meta(metabase, metapgs);
294 		kphysm_addmem_error_undospan(pt_base, tpgs);
295 
296 		return (KPHYSM_ERESOURCE);
297 	}
298 	pp = mapva;
299 
300 	if (physmax < (pt_base + tpgs))
301 		physmax = (pt_base + tpgs);
302 
303 	/*
304 	 * In the remapping code we map one page at a time so we must do
305 	 * the same here to match mapping sizes.
306 	 */
307 	pfn = pt_base;
308 	vaddr = (caddr_t)pp;
309 	for (pnum = 0; pnum < metapgs; pnum++) {
310 		if (meta_alloc)
311 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
312 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
313 		    PROT_READ | PROT_WRITE,
314 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
315 		pfn++;
316 		vaddr += ptob(1);
317 	}
318 
319 	if (ddi_peek32((dev_info_t *)NULL,
320 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
321 
322 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
323 		    " Can't access pp array at 0x%p [phys 0x%lx]",
324 		    (void *)pp, pt_base);
325 
326 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
327 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
328 
329 		vmem_free(heap_arena, mapva, ptob(metapgs));
330 		if (meta_alloc)
331 			memseg_free_meta(metabase, metapgs);
332 		kphysm_addmem_error_undospan(pt_base, tpgs);
333 
334 		return (KPHYSM_EFAULT);
335 	}
336 
337 	/*
338 	 * Add this memory slice to its memory node translation.
339 	 *
340 	 * Note that right now, each node may have only one slice;
341 	 * this may change with COD or in larger SSM systems with
342 	 * nested latency groups, so we must not assume that the
343 	 * node does not yet exist.
344 	 */
345 	pnum = pt_base + tpgs - 1;
346 	mem_node_add_range(pt_base, pnum);
347 
348 	/*
349 	 * Allocate or resize page counters as necessary to accommodate
350 	 * the increase in memory pages.
351 	 */
352 	mnode = PFN_2_MEM_NODE(pnum);
353 	PAGE_CTRS_ADJUST(base, npgs, rv);
354 	if (rv) {
355 
356 		mem_node_del_range(pt_base, pnum);
357 
358 		/* cleanup the  page counters */
359 		page_ctrs_cleanup();
360 
361 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
362 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
363 
364 		vmem_free(heap_arena, mapva, ptob(metapgs));
365 		if (meta_alloc)
366 			memseg_free_meta(metabase, metapgs);
367 		kphysm_addmem_error_undospan(pt_base, tpgs);
368 
369 		return (KPHYSM_ERESOURCE);
370 	}
371 
372 	/*
373 	 * Update the phys_avail memory list.
374 	 * The phys_install list was done at the start.
375 	 */
376 
377 	memlist_write_lock();
378 
379 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
380 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
381 	ASSERT(mlret == MEML_SPANOP_OK);
382 
383 	memlist_write_unlock();
384 
385 	/* See if we can find a memseg to re-use. */
386 	if (meta_alloc) {
387 		seg = memseg_reuse(0);
388 		reuse = 1;	/* force unmapping of temp mapva */
389 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
390 		/*
391 		 * There is a 1:1 fixed relationship between a pfn
392 		 * and a page_t VA.  The pfn is used as an index into
393 		 * the ppvm_base page_t table in order to calculate
394 		 * the page_t base address for a given pfn range.
395 		 */
396 		segpp = ppvm_base + base;
397 	} else {
398 		seg = memseg_reuse(metapgs);
399 		reuse = (seg != NULL);
400 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
401 		segpp = pp;
402 	}
403 
404 	/*
405 	 * Initialize the memseg structure representing this memory
406 	 * and add it to the existing list of memsegs. Do some basic
407 	 * initialization and add the memory to the system.
408 	 * In order to prevent lock deadlocks, the add_physmem()
409 	 * code is repeated here, but split into several stages.
410 	 *
411 	 * If a memseg is reused, invalidate memseg pointers in
412 	 * all cpu vm caches.  We need to do this this since the check
413 	 * 	pp >= seg->pages && pp < seg->epages
414 	 * used in various places is not atomic and so the first compare
415 	 * can happen before reuse and the second compare after reuse.
416 	 * The invalidation ensures that a memseg is not deferenced while
417 	 * it's page/pfn pointers are changing.
418 	 */
419 	if (seg == NULL) {
420 		seg = memseg_alloc();
421 		ASSERT(seg != NULL);
422 		seg->msegflags = flags;
423 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
424 		    (void *)seg, (void *)(seg->pages));
425 		seg->pages = segpp;
426 	} else {
427 		ASSERT(seg->msegflags == flags);
428 		ASSERT(seg->pages_base == seg->pages_end);
429 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
430 		    (void *)seg, (void *)(seg->pages));
431 		if (meta_alloc) {
432 			memseg_cpu_vm_flush();
433 			seg->pages = segpp;
434 		}
435 	}
436 
437 	seg->epages = seg->pages + npgs;
438 	seg->pages_base = base;
439 	seg->pages_end = base + npgs;
440 
441 	/*
442 	 * Initialize metadata. The page_ts are set to locked state
443 	 * ready to be freed.
444 	 */
445 	bzero((caddr_t)pp, ptob(metapgs));
446 
447 	pfn = seg->pages_base;
448 	/* Save the original pp base in case we reuse a memseg. */
449 	opp = pp;
450 	oepp = opp + npgs;
451 	for (pp = opp; pp < oepp; pp++) {
452 		pp->p_pagenum = pfn;
453 		pfn++;
454 		page_iolock_init(pp);
455 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
456 			continue;
457 		pp->p_offset = (u_offset_t)-1;
458 	}
459 
460 	if (reuse) {
461 		/* Remap our page_ts to the re-used memseg VA space. */
462 		pfn = pt_base;
463 		vaddr = (caddr_t)seg->pages;
464 		for (pnum = 0; pnum < metapgs; pnum++) {
465 			if (meta_alloc)
466 				pfn = memseg_get_metapfn(metabase,
467 				    (pgcnt_t)pnum);
468 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
469 			    PROT_READ | PROT_WRITE,
470 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
471 			pfn++;
472 			vaddr += ptob(1);
473 		}
474 
475 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
476 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
477 
478 		vmem_free(heap_arena, mapva, ptob(metapgs));
479 	}
480 
481 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
482 
483 	memsegs_lock(1);
484 
485 	/*
486 	 * The new memseg is inserted at the beginning of the list.
487 	 * Not only does this save searching for the tail, but in the
488 	 * case of a re-used memseg, it solves the problem of what
489 	 * happens if some process has still got a pointer to the
490 	 * memseg and follows the next pointer to continue traversing
491 	 * the memsegs list.
492 	 */
493 
494 	hat_kpm_addmem_mseg_insert(seg);
495 
496 	seg->next = memsegs;
497 	membar_producer();
498 
499 	hat_kpm_addmem_memsegs_update(seg);
500 
501 	memsegs = seg;
502 
503 	build_pfn_hash();
504 
505 	total_pages += npgs;
506 
507 	/*
508 	 * Recalculate the paging parameters now total_pages has changed.
509 	 * This will also cause the clock hands to be reset before next use.
510 	 */
511 	setupclock(1);
512 
513 	memsegs_unlock(1);
514 
515 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
516 
517 	/*
518 	 * Free the pages outside the lock to avoid locking loops.
519 	 */
520 	for (pp = seg->pages; pp < seg->epages; pp++) {
521 		page_free(pp, 1);
522 	}
523 
524 	/*
525 	 * Now that we've updated the appropriate memory lists we
526 	 * need to reset a number of globals, since we've increased memory.
527 	 * Several have already been updated for us as noted above. The
528 	 * globals we're interested in at this point are:
529 	 *   physmax - highest page frame number.
530 	 *   physinstalled - number of pages currently installed (done earlier)
531 	 *   maxmem - max free pages in the system
532 	 *   physmem - physical memory pages available
533 	 *   availrmem - real memory available
534 	 */
535 
536 	mutex_enter(&freemem_lock);
537 	maxmem += npgs;
538 	physmem += npgs;
539 	availrmem += npgs;
540 	availrmem_initial += npgs;
541 
542 	mutex_exit(&freemem_lock);
543 
544 	dump_resize();
545 
546 	page_freelist_coalesce_all(mnode);
547 
548 	kphysm_setup_post_add(npgs);
549 
550 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
551 	    "(0x%" PRIx64 ")\n",
552 	    physinstalled << (PAGESHIFT - 10),
553 	    (uint64_t)physinstalled << PAGESHIFT);
554 
555 	avmem = (uint64_t)freemem << PAGESHIFT;
556 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
557 	    "avail mem = %" PRId64 "\n", avmem);
558 
559 	/*
560 	 * Update lgroup generation number on single lgroup systems
561 	 */
562 	if (nlgrps == 1)
563 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
564 
565 	delspan_unreserve(pt_base, tpgs);
566 	return (KPHYSM_OK);		/* Successfully added system memory */
567 
568 }
569 
570 /*
571  * There are various error conditions in kphysm_add_memory_dynamic()
572  * which require a rollback of already changed global state.
573  */
574 static void
575 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
576 {
577 	int mlret;
578 
579 	/* Unreserve memory span. */
580 	memlist_write_lock();
581 
582 	mlret = memlist_delete_span(
583 	    (uint64_t)(pt_base) << PAGESHIFT,
584 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
585 
586 	ASSERT(mlret == MEML_SPANOP_OK);
587 	phys_install_has_changed();
588 	installed_top_size(phys_install, &physmax, &physinstalled);
589 
590 	memlist_write_unlock();
591 	delspan_unreserve(pt_base, tpgs);
592 }
593 
594 /*
595  * Only return an available memseg of exactly the right size
596  * if size is required.
597  * When the meta data area has it's own virtual address space
598  * we will need to manage this more carefully and do best fit
599  * allocations, possibly splitting an available area.
600  */
601 struct memseg *
602 memseg_reuse(pgcnt_t metapgs)
603 {
604 	int type;
605 	struct memseg **segpp, *seg;
606 
607 	mutex_enter(&memseg_lists_lock);
608 
609 	segpp = &memseg_va_avail;
610 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
611 		caddr_t end;
612 
613 		/*
614 		 * Make sure we are reusing the right segment type.
615 		 */
616 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
617 
618 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
619 		    != type)
620 			continue;
621 
622 		if (kpm_enable)
623 			end = hat_kpm_mseg_reuse(seg);
624 		else
625 			end = (caddr_t)seg->epages;
626 
627 		/*
628 		 * Check for the right size if it is provided.
629 		 */
630 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
631 			*segpp = seg->lnext;
632 			seg->lnext = NULL;
633 			break;
634 		}
635 	}
636 	mutex_exit(&memseg_lists_lock);
637 
638 	return (seg);
639 }
640 
/*
 * Source of external handle values: kphysm_allocate_mem_handle()
 * pre-increments it while holding mem_handle_list_mutex.
 */
static uint_t handle_gen;
642 
/*
 * One contiguous range of page frames held on a transit list.
 */
struct memdelspan {
	/* Next span on the same list; NULL terminates it. */
	struct memdelspan *mds_next;
	/* First pfn of the range. */
	pfn_t		mds_base;
	/* Length of the range in pages. */
	pgcnt_t		mds_npgs;
	/*
	 * NOTE(review): not set up in this part of the file; presumably
	 * per-page bitmaps sized by MDS_BITMAPBYTES() -- confirm.
	 */
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};
650 
/* Number of bits in one bitmap word (a uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Size in bytes of a bitmap holding mds_npgs bits, rounded up to a
 * whole number of uint_t words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
654 
/*
 * A set of spans belonging to one owner (a delete handle, or the add
 * reservation list).  delspan_insert() walks these through trl_next
 * starting at transit_list_head.trh_head.
 */
struct transit_list {
	/* Next transit list on the global list. */
	struct transit_list	*trl_next;
	/* Spans owned by this list, linked through mds_next. */
	struct memdelspan	*trl_spans;
	/* NOTE(review): not used in this part of the file -- confirm meaning. */
	int			trl_collect;
};
660 
/*
 * Anchor of the global list of transit lists.  delspan_insert() and
 * delspan_remove() hold trh_lock while they examine or change the
 * lists and their spans.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	/* First transit list, or NULL when there are none. */
	struct transit_list	*trh_head;
};

/* The single global instance used by the delspan_*() functions. */
static struct transit_list_head transit_list_head;
667 
668 struct mem_handle;
669 static void transit_list_collect(struct mem_handle *, int);
670 static void transit_list_insert(struct transit_list *);
671 static void transit_list_remove(struct transit_list *);
672 
/*
 * Delete statistics are compiled in on DEBUG kernels only.  Without
 * MEM_DEL_STATS all of the MDSTAT_*() macros expand to nothing.
 */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/*
 * Counters kept per mem_handle (see mh_delstat).
 * NOTE(review): the individual counters are updated outside this part of
 * the file; their meanings are not documented here.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Increment counter FLD of the handle's statistics. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Accumulate ntck into the nticks_total / nticks_pgrp counters. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
727 
/*
 * States of a mem_handle.  MHND_FREE is zero, so a handle fresh from
 * kmem_zalloc() starts out FREE; kphysm_del_gethandle() moves it to
 * MHND_INIT and kphysm_del_span() only accepts handles in MHND_INIT.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
730 
731 /*
732  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
733  * The mutex may not be required for other fields, dependent on mh_state.
734  */
struct mem_handle {
	kmutex_t	mh_mutex;
	/* Link on the list headed by mem_handle_head. */
	struct mem_handle *mh_next;
	/* Value handed out to callers and matched by the lookup function. */
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	/* Spans attached to this handle by kphysm_del_span(). */
	struct transit_list mh_transit;
	/*
	 * NOTE(review): the remaining fields are maintained outside this
	 * part of the file; see the delete thread code for their use.
	 */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};
756 
/*
 * List of all allocated handles, linked through mh_next.  The list and
 * handle_gen are only touched with mem_handle_list_mutex held.
 */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
759 
760 static struct mem_handle *
761 kphysm_allocate_mem_handle()
762 {
763 	struct mem_handle *mhp;
764 
765 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
766 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
767 	mutex_enter(&mem_handle_list_mutex);
768 	mutex_enter(&mhp->mh_mutex);
769 	/* handle_gen is protected by list mutex. */
770 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
771 	mhp->mh_next = mem_handle_head;
772 	mem_handle_head = mhp;
773 	mutex_exit(&mem_handle_list_mutex);
774 
775 	return (mhp);
776 }
777 
778 static void
779 kphysm_free_mem_handle(struct mem_handle *mhp)
780 {
781 	struct mem_handle **mhpp;
782 
783 	ASSERT(mutex_owned(&mhp->mh_mutex));
784 	ASSERT(mhp->mh_state == MHND_FREE);
785 	/*
786 	 * Exit the mutex to preserve locking order. This is OK
787 	 * here as once in the FREE state, the handle cannot
788 	 * be found by a lookup.
789 	 */
790 	mutex_exit(&mhp->mh_mutex);
791 
792 	mutex_enter(&mem_handle_list_mutex);
793 	mhpp = &mem_handle_head;
794 	while (*mhpp != NULL && *mhpp != mhp)
795 		mhpp = &(*mhpp)->mh_next;
796 	ASSERT(*mhpp == mhp);
797 	/*
798 	 * No need to lock the handle (mh_mutex) as only
799 	 * mh_next changing and this is the only thread that
800 	 * can be referncing mhp.
801 	 */
802 	*mhpp = mhp->mh_next;
803 	mutex_exit(&mem_handle_list_mutex);
804 
805 	mutex_destroy(&mhp->mh_mutex);
806 	kmem_free(mhp, sizeof (struct mem_handle));
807 }
808 
809 /*
810  * This function finds the internal mem_handle corresponding to an
811  * external handle and returns it with the mh_mutex held.
812  */
813 static struct mem_handle *
814 kphysm_lookup_mem_handle(memhandle_t handle)
815 {
816 	struct mem_handle *mhp;
817 
818 	mutex_enter(&mem_handle_list_mutex);
819 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
820 		if (mhp->mh_exthandle == handle) {
821 			mutex_enter(&mhp->mh_mutex);
822 			/*
823 			 * The state of the handle could have been changed
824 			 * by kphysm_del_release() while waiting for mh_mutex.
825 			 */
826 			if (mhp->mh_state == MHND_FREE) {
827 				mutex_exit(&mhp->mh_mutex);
828 				continue;
829 			}
830 			break;
831 		}
832 	}
833 	mutex_exit(&mem_handle_list_mutex);
834 	return (mhp);
835 }
836 
837 int
838 kphysm_del_gethandle(memhandle_t *xmhp)
839 {
840 	struct mem_handle *mhp;
841 
842 	mhp = kphysm_allocate_mem_handle();
843 	/*
844 	 * The handle is allocated using KM_SLEEP, so cannot fail.
845 	 * If the implementation is changed, the correct error to return
846 	 * here would be KPHYSM_ENOHANDLES.
847 	 */
848 	ASSERT(mhp->mh_state == MHND_FREE);
849 	mhp->mh_state = MHND_INIT;
850 	*xmhp = mhp->mh_exthandle;
851 	mutex_exit(&mhp->mh_mutex);
852 	return (KPHYSM_OK);
853 }
854 
855 static int
856 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
857 {
858 	pfn_t e1, e2;
859 
860 	e1 = b1 + l1;
861 	e2 = b2 + l2;
862 
863 	return (!(b2 >= e1 || b1 >= e2));
864 }
865 
866 static int can_remove_pgs(pgcnt_t);
867 
868 static struct memdelspan *
869 span_to_install(pfn_t base, pgcnt_t npgs)
870 {
871 	struct memdelspan *mdsp;
872 	struct memdelspan *mdsp_new;
873 	uint64_t address, size, thislen;
874 	struct memlist *mlp;
875 
876 	mdsp_new = NULL;
877 
878 	address = (uint64_t)base << PAGESHIFT;
879 	size = (uint64_t)npgs << PAGESHIFT;
880 	while (size != 0) {
881 		memlist_read_lock();
882 		for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
883 			if (address >= (mlp->ml_address + mlp->ml_size))
884 				continue;
885 			if ((address + size) > mlp->ml_address)
886 				break;
887 		}
888 		if (mlp == NULL) {
889 			address += size;
890 			size = 0;
891 			thislen = 0;
892 		} else {
893 			if (address < mlp->ml_address) {
894 				size -= (mlp->ml_address - address);
895 				address = mlp->ml_address;
896 			}
897 			ASSERT(address >= mlp->ml_address);
898 			if ((address + size) >
899 			    (mlp->ml_address + mlp->ml_size)) {
900 				thislen =
901 				    mlp->ml_size - (address - mlp->ml_address);
902 			} else {
903 				thislen = size;
904 			}
905 		}
906 		memlist_read_unlock();
907 		/* TODO: phys_install could change now */
908 		if (thislen == 0)
909 			continue;
910 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
911 		mdsp->mds_base = btop(address);
912 		mdsp->mds_npgs = btop(thislen);
913 		mdsp->mds_next = mdsp_new;
914 		mdsp_new = mdsp;
915 		address += thislen;
916 		size -= thislen;
917 	}
918 	return (mdsp_new);
919 }
920 
921 static void
922 free_delspans(struct memdelspan *mdsp)
923 {
924 	struct memdelspan *amdsp;
925 
926 	while ((amdsp = mdsp) != NULL) {
927 		mdsp = amdsp->mds_next;
928 		kmem_free(amdsp, sizeof (struct memdelspan));
929 	}
930 }
931 
932 /*
933  * Concatenate lists. No list ordering is required.
934  */
935 
936 static void
937 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
938 {
939 	while (*mdspp != NULL)
940 		mdspp = &(*mdspp)->mds_next;
941 
942 	*mdspp = mdsp;
943 }
944 
945 /*
946  * Given a new list of delspans, check there is no overlap with
947  * all existing span activity (add or delete) and then concatenate
948  * the new spans to the given list.
949  * Return 1 for OK, 0 if overlapping.
950  */
951 static int
952 delspan_insert(
953 	struct transit_list *my_tlp,
954 	struct memdelspan *mdsp_new)
955 {
956 	struct transit_list_head *trh;
957 	struct transit_list *tlp;
958 	int ret;
959 
960 	trh = &transit_list_head;
961 
962 	ASSERT(my_tlp != NULL);
963 	ASSERT(mdsp_new != NULL);
964 
965 	ret = 1;
966 	mutex_enter(&trh->trh_lock);
967 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
968 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
969 		struct memdelspan *mdsp;
970 
971 		for (mdsp = tlp->trl_spans; mdsp != NULL;
972 		    mdsp = mdsp->mds_next) {
973 			struct memdelspan *nmdsp;
974 
975 			for (nmdsp = mdsp_new; nmdsp != NULL;
976 			    nmdsp = nmdsp->mds_next) {
977 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
978 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
979 					ret = 0;
980 					goto done;
981 				}
982 			}
983 		}
984 	}
985 done:
986 	if (ret != 0) {
987 		if (my_tlp->trl_spans == NULL)
988 			transit_list_insert(my_tlp);
989 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
990 	}
991 	mutex_exit(&trh->trh_lock);
992 	return (ret);
993 }
994 
995 static void
996 delspan_remove(
997 	struct transit_list *my_tlp,
998 	pfn_t base,
999 	pgcnt_t npgs)
1000 {
1001 	struct transit_list_head *trh;
1002 	struct memdelspan *mdsp;
1003 
1004 	trh = &transit_list_head;
1005 
1006 	ASSERT(my_tlp != NULL);
1007 
1008 	mutex_enter(&trh->trh_lock);
1009 	if ((mdsp = my_tlp->trl_spans) != NULL) {
1010 		if (npgs == 0) {
1011 			my_tlp->trl_spans = NULL;
1012 			free_delspans(mdsp);
1013 			transit_list_remove(my_tlp);
1014 		} else {
1015 			struct memdelspan **prv;
1016 
1017 			prv = &my_tlp->trl_spans;
1018 			while (mdsp != NULL) {
1019 				pfn_t p_end;
1020 
1021 				p_end = mdsp->mds_base + mdsp->mds_npgs;
1022 				if (mdsp->mds_base >= base &&
1023 				    p_end <= (base + npgs)) {
1024 					*prv = mdsp->mds_next;
1025 					mdsp->mds_next = NULL;
1026 					free_delspans(mdsp);
1027 				} else {
1028 					prv = &mdsp->mds_next;
1029 				}
1030 				mdsp = *prv;
1031 			}
1032 			if (my_tlp->trl_spans == NULL)
1033 				transit_list_remove(my_tlp);
1034 		}
1035 	}
1036 	mutex_exit(&trh->trh_lock);
1037 }
1038 
/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex (trh_lock) in
 * struct transit_list_head.
 */
1044 
1045 static struct transit_list reserve_transit;
1046 
1047 static int
1048 delspan_reserve(pfn_t base, pgcnt_t npgs)
1049 {
1050 	struct memdelspan *mdsp;
1051 	int ret;
1052 
1053 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1054 	mdsp->mds_base = base;
1055 	mdsp->mds_npgs = npgs;
1056 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1057 		free_delspans(mdsp);
1058 	}
1059 	return (ret);
1060 }
1061 
1062 static void
1063 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1064 {
1065 	delspan_remove(&reserve_transit, base, npgs);
1066 }
1067 
1068 /*
1069  * Return whether memseg was created by kphysm_add_memory_dynamic().
1070  */
1071 static int
1072 memseg_is_dynamic(struct memseg *seg)
1073 {
1074 	return (seg->msegflags & MEMSEG_DYNAMIC);
1075 }
1076 
/*
 * Add the span [base, base + npgs) to the set of spans to be deleted under
 * the delete handle 'handle'.
 *
 * The span is first intersected with installed memory and the resulting
 * sub-spans are inserted on the handle's transit list.  Every memseg that
 * overlaps an inserted sub-span is then checked: memsegs that include their
 * own metadata must lie wholly inside the sub-span, other memsegs are split
 * to fit, and every page_t of a fitting memseg is tested with
 * PP_ISNORELOC().
 *
 * Returns:
 *	KPHYSM_OK		span accepted (also returned when the span
 *				contains no installed memory at all);
 *				mh_phys_pages and mh_vm_pages are increased
 *	KPHYSM_EHANDLE		handle lookup failed
 *	KPHYSM_ESEQUENCE	handle is not in the MHND_INIT state
 *	KPHYSM_EDUP		span overlaps one already on this handle
 *	KPHYSM_EBUSY		span could not be inserted for another
 *				reason, or a memseg with metadata straddles
 *				the span boundary
 *	KPHYSM_ERESOURCE	a required memseg split failed
 *	KPHYSM_ENONRELOC	a non-relocatable page was found, or the
 *				memsegs do not account for every page of
 *				the span
 *
 * On any failure after insertion the spans are removed from the handle
 * again with delspan_remove().
 */
1077 int
1078 kphysm_del_span(
1079 	memhandle_t handle,
1080 	pfn_t base,
1081 	pgcnt_t npgs)
1082 {
1083 	struct mem_handle *mhp;
1084 	struct memseg *seg;
1085 	struct memdelspan *mdsp;
1086 	struct memdelspan *mdsp_new;
1087 	pgcnt_t phys_pages, vm_pages;
1088 	pfn_t p_end;
1089 	page_t *pp;
1090 	int ret;
1091 
	/*
	 * Every return path after a successful lookup drops mh_mutex, so
	 * the lookup evidently hands the handle back with mh_mutex held.
	 */
1092 	mhp = kphysm_lookup_mem_handle(handle);
1093 	if (mhp == NULL) {
1094 		return (KPHYSM_EHANDLE);
1095 	}
1096 	if (mhp->mh_state != MHND_INIT) {
1097 		mutex_exit(&mhp->mh_mutex);
1098 		return (KPHYSM_ESEQUENCE);
1099 	}
1100 
1101 	/*
1102 	 * Intersect the span with the installed memory list (phys_install).
1103 	 */
1104 	mdsp_new = span_to_install(base, npgs);
1105 	if (mdsp_new == NULL) {
1106 		/*
1107 		 * No physical memory in this range. Is this an
1108 		 * error? If an attempt to start the delete is made
1109 		 * for OK returns from del_span such as this, start will
1110 		 * return an error.
1111 		 * Could return KPHYSM_ENOWORK.
1112 		 */
1113 		/*
1114 		 * It is assumed that there are no error returns
1115 		 * from span_to_install() due to kmem_alloc failure.
1116 		 */
1117 		mutex_exit(&mhp->mh_mutex);
1118 		return (KPHYSM_OK);
1119 	}
1120 	/*
1121 	 * Does this span overlap an existing span?
1122 	 */
1123 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1124 		/*
1125 		 * Differentiate between already on list for this handle
1126 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1127 		 */
1128 		ret = KPHYSM_EBUSY;
1129 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1130 		    mdsp = mdsp->mds_next) {
1131 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1132 			    base, npgs)) {
1133 				ret = KPHYSM_EDUP;
1134 				break;
1135 			}
1136 		}
1137 		mutex_exit(&mhp->mh_mutex);
		/* The insert was refused, so the new spans are still ours. */
1138 		free_delspans(mdsp_new);
1139 		return (ret);
1140 	}
1141 	/*
1142 	 * At this point the spans in mdsp_new have been inserted into the
1143 	 * list of spans for this handle and thereby to the global list of
1144 	 * spans being processed. Each of these spans must now be checked
1145 	 * for relocatability. As a side-effect segments in the memseg list
1146 	 * may be split.
1147 	 *
1148 	 * Note that mdsp_new can no longer be used as it is now part of
1149 	 * a larger list. Select elements of this larger list based
1150 	 * on base and npgs.
1151 	 */
1152 restart:
	/*
	 * The scan starts over from here after every successful memseg
	 * split, so the running totals and the result are reset each time.
	 */
1153 	phys_pages = 0;
1154 	vm_pages = 0;
1155 	ret = KPHYSM_OK;
1156 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1157 	    mdsp = mdsp->mds_next) {
1158 		pgcnt_t pages_checked;
1159 
		/* Only look at the spans that belong to this request. */
1160 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1161 			continue;
1162 		}
1163 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1164 		/*
1165 		 * The pages_checked count is a hack. All pages should be
1166 		 * checked for relocatability. Those not covered by memsegs
1167 		 * should be tested with arch_kphysm_del_span_ok().
1168 		 */
1169 		pages_checked = 0;
1170 		for (seg = memsegs; seg; seg = seg->next) {
1171 			pfn_t mseg_start;
1172 
1173 			if (seg->pages_base >= p_end ||
1174 			    seg->pages_end <= mdsp->mds_base) {
1175 				/* Span and memseg don't overlap. */
1176 				continue;
1177 			}
1178 			mseg_start = memseg_get_start(seg);
1179 			/* Check that segment is suitable for delete. */
1180 			if (memseg_includes_meta(seg)) {
1181 				/*
1182 				 * Check that this segment is completely
1183 				 * within the span.
1184 				 */
1185 				if (mseg_start < mdsp->mds_base ||
1186 				    seg->pages_end > p_end) {
1187 					ret = KPHYSM_EBUSY;
1188 					break;
1189 				}
1190 				pages_checked += seg->pages_end - mseg_start;
1191 			} else {
1192 				/*
1193 				 * If this segment is larger than the span,
1194 				 * try to split it. After the split, it
1195 				 * is necessary to restart.
1196 				 */
1197 				if (seg->pages_base < mdsp->mds_base ||
1198 				    seg->pages_end > p_end) {
1199 					pfn_t abase;
1200 					pgcnt_t anpgs;
1201 					int s_ret;
1202 
1203 					/* Split required.  */
					/*
					 * [abase, abase + anpgs) is the part
					 * of the memseg that lies inside the
					 * span.
					 */
1204 					if (mdsp->mds_base < seg->pages_base)
1205 						abase = seg->pages_base;
1206 					else
1207 						abase = mdsp->mds_base;
1208 					if (p_end > seg->pages_end)
1209 						anpgs = seg->pages_end - abase;
1210 					else
1211 						anpgs = p_end - abase;
1212 					s_ret = kphysm_split_memseg(abase,
1213 					    anpgs);
1214 					if (s_ret == 0) {
1215 						/* Split failed. */
1216 						ret = KPHYSM_ERESOURCE;
1217 						break;
1218 					}
1219 					goto restart;
1220 				}
1221 				pages_checked +=
1222 				    seg->pages_end - seg->pages_base;
1223 			}
1224 			/*
1225 			 * The memseg is wholly within the delete span.
1226 			 * The individual pages can now be checked.
1227 			 */
1228 			/* Cage test. */
1229 			for (pp = seg->pages; pp < seg->epages; pp++) {
1230 				if (PP_ISNORELOC(pp)) {
1231 					ret = KPHYSM_ENONRELOC;
1232 					break;
1233 				}
1234 			}
			/* The break above only left the page loop. */
1235 			if (ret != KPHYSM_OK) {
1236 				break;
1237 			}
1238 			phys_pages += (seg->pages_end - mseg_start);
1239 			vm_pages += MSEG_NPAGES(seg);
1240 		}
1241 		if (ret != KPHYSM_OK)
1242 			break;
		/*
		 * The memsegs found did not add up to the whole span, so
		 * some of its pages were never examined; refuse the span.
		 */
1243 		if (pages_checked != mdsp->mds_npgs) {
1244 			ret = KPHYSM_ENONRELOC;
1245 			break;
1246 		}
1247 	}
1248 
1249 	if (ret == KPHYSM_OK) {
1250 		mhp->mh_phys_pages += phys_pages;
1251 		mhp->mh_vm_pages += vm_pages;
1252 	} else {
1253 		/*
1254 		 * Keep holding the mh_mutex to prevent it going away.
1255 		 */
1256 		delspan_remove(&mhp->mh_transit, base, npgs);
1257 	}
1258 	mutex_exit(&mhp->mh_mutex);
1259 	return (ret);
1260 }
1261 
/*
 * Survey the span [base, base + npgs) and report in *mqp what a delete of
 * it would encounter.  No delete handle is involved and nothing is put on
 * a transit list; the installed-memory spans obtained from
 * span_to_install() are freed again before returning.
 *
 * Fields filled in:
 *	phys_pages		total pages in the installed sub-spans
 *	managed			pages examined through a memseg page_t
 *	nonrelocatable		pages found non-relocatable, either by
 *				PP_ISNORELOC() or, for pages without a
 *				page_t, by arch_kphysm_del_span_ok()
 *	first_nonrelocatable	pfn of the first such page (0 if none)
 *	last_nonrelocatable	pfn of the last such page (0 if none)
 *
 * Always returns KPHYSM_OK.
 */
1262 int
1263 kphysm_del_span_query(
1264 	pfn_t base,
1265 	pgcnt_t npgs,
1266 	memquery_t *mqp)
1267 {
1268 	struct memdelspan *mdsp;
1269 	struct memdelspan *mdsp_new;
1270 	int done_first_nonreloc;
1271 
1272 	mqp->phys_pages = 0;
1273 	mqp->managed = 0;
1274 	mqp->nonrelocatable = 0;
1275 	mqp->first_nonrelocatable = 0;
1276 	mqp->last_nonrelocatable = 0;
1277 
1278 	mdsp_new = span_to_install(base, npgs);
1279 	/*
1280 	 * It is OK to proceed here if mdsp_new == NULL.
1281 	 */
1282 	done_first_nonreloc = 0;
1283 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1284 		pfn_t sbase;
1285 		pgcnt_t snpgs;
1286 
1287 		mqp->phys_pages += mdsp->mds_npgs;
		/*
		 * sbase/snpgs describe the part of this span still to be
		 * examined; each pass of the loop below consumes the front
		 * of it.
		 */
1288 		sbase = mdsp->mds_base;
1289 		snpgs = mdsp->mds_npgs;
1290 		while (snpgs != 0) {
1291 			struct memseg *lseg, *seg;
1292 			pfn_t p_end;
1293 			page_t *pp;
1294 			pfn_t mseg_start;
1295 
1296 			p_end = sbase + snpgs;
1297 			/*
1298 			 * Find the lowest addressed memseg that starts
1299 			 * after sbase and account for it.
1300 			 * This is to catch dynamic memsegs whose start
1301 			 * is hidden.
1302 			 */
1303 			seg = NULL;
1304 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1305 				if ((lseg->pages_base >= sbase) ||
1306 				    (lseg->pages_base < p_end &&
1307 				    lseg->pages_end > sbase)) {
1308 					if (seg == NULL ||
1309 					    seg->pages_base > lseg->pages_base)
1310 						seg = lseg;
1311 				}
1312 			}
1313 			if (seg != NULL) {
1314 				mseg_start = memseg_get_start(seg);
1315 				/*
1316 				 * Now have the full extent of the memseg so
1317 				 * do the range check.
1318 				 */
1319 				if (mseg_start >= p_end ||
1320 				    seg->pages_end <= sbase) {
1321 					/* Span does not overlap memseg. */
1322 					seg = NULL;
1323 				}
1324 			}
1325 			/*
1326 			 * Account for gap either before the segment if
1327 			 * there is one or to the end of the span.
1328 			 */
			/*
			 * mseg_start is only read when seg != NULL, and it
			 * was assigned above whenever a memseg was found.
			 */
1329 			if (seg == NULL || mseg_start > sbase) {
1330 				pfn_t a_end;
1331 
1332 				a_end = (seg == NULL) ? p_end : mseg_start;
1333 				/*
1334 				 * Check with arch layer for relocatability.
1335 				 */
1336 				if (arch_kphysm_del_span_ok(sbase,
1337 				    (a_end - sbase))) {
1338 					/*
1339 					 * No non-relocatable pages in this
1340 					 * area, avoid the fine-grained
1341 					 * test.
1342 					 */
1343 					snpgs -= (a_end - sbase);
1344 					sbase = a_end;
1345 				}
				/*
				 * Page-by-page test of the gap; skipped
				 * entirely when the whole-gap test above
				 * already advanced sbase to a_end.
				 */
1346 				while (sbase < a_end) {
1347 					if (!arch_kphysm_del_span_ok(sbase,
1348 					    1)) {
1349 						mqp->nonrelocatable++;
1350 						if (!done_first_nonreloc) {
1351 							mqp->
1352 							    first_nonrelocatable
1353 							    = sbase;
1354 							done_first_nonreloc = 1;
1355 						}
1356 						mqp->last_nonrelocatable =
1357 						    sbase;
1358 					}
1359 					sbase++;
1360 					snpgs--;
1361 				}
1362 			}
1363 			if (seg != NULL) {
1364 				ASSERT(mseg_start <= sbase);
1365 				if (seg->pages_base != mseg_start &&
1366 				    seg->pages_base > sbase) {
1367 					pgcnt_t skip_pgs;
1368 
1369 					/*
1370 					 * Skip the page_t area of a
1371 					 * dynamic memseg.
1372 					 */
1373 					skip_pgs = seg->pages_base - sbase;
1374 					if (snpgs <= skip_pgs) {
						/*
						 * The rest of the span ends
						 * inside the skipped area.
						 */
1375 						sbase += snpgs;
1376 						snpgs = 0;
1377 						continue;
1378 					}
1379 					snpgs -= skip_pgs;
1380 					sbase += skip_pgs;
1381 				}
1382 				ASSERT(snpgs != 0);
1383 				ASSERT(seg->pages_base <= sbase);
1384 				/*
1385 				 * The individual pages can now be checked.
1386 				 */
				/*
				 * Walk page_t's from the one for sbase until
				 * the span or the memseg runs out.
				 */
1387 				for (pp = seg->pages +
1388 				    (sbase - seg->pages_base);
1389 				    snpgs != 0 && pp < seg->epages; pp++) {
1390 					mqp->managed++;
1391 					if (PP_ISNORELOC(pp)) {
1392 						mqp->nonrelocatable++;
1393 						if (!done_first_nonreloc) {
1394 							mqp->
1395 							    first_nonrelocatable
1396 							    = sbase;
1397 							done_first_nonreloc = 1;
1398 						}
1399 						mqp->last_nonrelocatable =
1400 						    sbase;
1401 					}
1402 					sbase++;
1403 					snpgs--;
1404 				}
1405 			}
1406 		}
1407 	}
1408 
1409 	free_delspans(mdsp_new);
1410 
1411 	return (KPHYSM_OK);
1412 }
1413 
1414 /*
1415  * This release function can be called at any stage as follows:
1416  *	_gethandle only called
1417  *	_span(s) only called
1418  *	_start called but failed
1419  *	delete thread exited
1420  */
1421 int
1422 kphysm_del_release(memhandle_t handle)
1423 {
1424 	struct mem_handle *mhp;
1425 
1426 	mhp = kphysm_lookup_mem_handle(handle);
1427 	if (mhp == NULL) {
1428 		return (KPHYSM_EHANDLE);
1429 	}
1430 	switch (mhp->mh_state) {
1431 	case MHND_STARTING:
1432 	case MHND_RUNNING:
1433 		mutex_exit(&mhp->mh_mutex);
1434 		return (KPHYSM_ENOTFINISHED);
1435 	case MHND_FREE:
1436 		ASSERT(mhp->mh_state != MHND_FREE);
1437 		mutex_exit(&mhp->mh_mutex);
1438 		return (KPHYSM_EHANDLE);
1439 	case MHND_INIT:
1440 		break;
1441 	case MHND_DONE:
1442 		break;
1443 	case MHND_RELEASE:
1444 		mutex_exit(&mhp->mh_mutex);
1445 		return (KPHYSM_ESEQUENCE);
1446 	default:
1447 #ifdef DEBUG
1448 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1449 		    (void *)mhp, mhp->mh_state);
1450 #endif /* DEBUG */
1451 		mutex_exit(&mhp->mh_mutex);
1452 		return (KPHYSM_EHANDLE);
1453 	}
1454 	/*
1455 	 * Set state so that we can wait if necessary.
1456 	 * Also this means that we have read/write access to all
1457 	 * fields except mh_exthandle and mh_state.
1458 	 */
1459 	mhp->mh_state = MHND_RELEASE;
1460 	/*
1461 	 * The mem_handle cannot be de-allocated by any other operation
1462 	 * now, so no need to hold mh_mutex.
1463 	 */
1464 	mutex_exit(&mhp->mh_mutex);
1465 
1466 	delspan_remove(&mhp->mh_transit, 0, 0);
1467 	mhp->mh_phys_pages = 0;
1468 	mhp->mh_vm_pages = 0;
1469 	mhp->mh_hold_todo = 0;
1470 	mhp->mh_delete_complete = NULL;
1471 	mhp->mh_delete_complete_arg = NULL;
1472 	mhp->mh_cancel = 0;
1473 
1474 	mutex_enter(&mhp->mh_mutex);
1475 	ASSERT(mhp->mh_state == MHND_RELEASE);
1476 	mhp->mh_state = MHND_FREE;
1477 
1478 	kphysm_free_mem_handle(mhp);
1479 
1480 	return (KPHYSM_OK);
1481 }
1482 
1483 /*
1484  * This cancel function can only be called with the thread running.
1485  */
1486 int
1487 kphysm_del_cancel(memhandle_t handle)
1488 {
1489 	struct mem_handle *mhp;
1490 
1491 	mhp = kphysm_lookup_mem_handle(handle);
1492 	if (mhp == NULL) {
1493 		return (KPHYSM_EHANDLE);
1494 	}
1495 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1496 		mutex_exit(&mhp->mh_mutex);
1497 		return (KPHYSM_ENOTRUNNING);
1498 	}
1499 	/*
1500 	 * Set the cancel flag and wake the delete thread up.
1501 	 * The thread may be waiting on I/O, so the effect of the cancel
1502 	 * may be delayed.
1503 	 */
1504 	if (mhp->mh_cancel == 0) {
1505 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1506 		cv_signal(&mhp->mh_cv);
1507 	}
1508 	mutex_exit(&mhp->mh_mutex);
1509 	return (KPHYSM_OK);
1510 }
1511 
1512 int
1513 kphysm_del_status(
1514 	memhandle_t handle,
1515 	memdelstat_t *mdstp)
1516 {
1517 	struct mem_handle *mhp;
1518 
1519 	mhp = kphysm_lookup_mem_handle(handle);
1520 	if (mhp == NULL) {
1521 		return (KPHYSM_EHANDLE);
1522 	}
1523 	/*
1524 	 * Calling kphysm_del_status() is allowed before the delete
1525 	 * is started to allow for status display.
1526 	 */
1527 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1528 	    mhp->mh_state != MHND_RUNNING) {
1529 		mutex_exit(&mhp->mh_mutex);
1530 		return (KPHYSM_ENOTRUNNING);
1531 	}
1532 	mdstp->phys_pages = mhp->mh_phys_pages;
1533 	mdstp->managed = mhp->mh_vm_pages;
1534 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1535 	mutex_exit(&mhp->mh_mutex);
1536 	return (KPHYSM_OK);
1537 }
1538 
1539 static int mem_delete_additional_pages = 100;
1540 
1541 static int
1542 can_remove_pgs(pgcnt_t npgs)
1543 {
1544 	/*
1545 	 * If all pageable pages were paged out, freemem would
1546 	 * equal availrmem.  There is a minimum requirement for
1547 	 * availrmem.
1548 	 */
1549 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1550 	    < npgs)
1551 		return (0);
1552 	/* TODO: check swap space, etc. */
1553 	return (1);
1554 }
1555 
1556 static int
1557 get_availrmem(pgcnt_t npgs)
1558 {
1559 	int ret;
1560 
1561 	mutex_enter(&freemem_lock);
1562 	ret = can_remove_pgs(npgs);
1563 	if (ret != 0)
1564 		availrmem -= npgs;
1565 	mutex_exit(&freemem_lock);
1566 	return (ret);
1567 }
1568 
1569 static void
1570 put_availrmem(pgcnt_t npgs)
1571 {
1572 	mutex_enter(&freemem_lock);
1573 	availrmem += npgs;
1574 	mutex_exit(&freemem_lock);
1575 }
1576 
/*
 * freemem_incr is the largest number of pages delthr_get_freemem() tries
 * to take from freemem in one call; FREEMEM_INCR is its initial value.
 */
1577 #define	FREEMEM_INCR	100
1578 static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * DEL_FREE_WAIT_TICKS is hz / DEL_FREE_WAIT_FRAC rounded up.  It is the
 * timeout delthr_get_freemem() passes to cv_reltimedwait() between
 * attempts to obtain free pages.
 * NOTE(review): presumably hz is clock ticks per second, making this a
 * quarter of a second - confirm.
 */
1579 #define	DEL_FREE_WAIT_FRAC	4
1580 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1581 
/*
 * DEL_BUSY_WAIT_TICKS is hz / DEL_BUSY_WAIT_FRAC rounded up.
 * NOTE(review): no use of it appears in the nearby code - check the rest
 * of the file for its consumer.
 */
1582 #define	DEL_BUSY_WAIT_FRAC	20
1583 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1584 
/* Forward declarations for helpers used by the delete thread. */
1585 static void kphysm_del_cleanup(struct mem_handle *);
1586 
1587 static void page_delete_collect(page_t *, struct mem_handle *);
1588 
1589 static pgcnt_t
1590 delthr_get_freemem(struct mem_handle *mhp)
1591 {
1592 	pgcnt_t free_get;
1593 	int ret;
1594 
1595 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1596 
1597 	MDSTAT_INCR(mhp, need_free);
1598 	/*
1599 	 * Get up to freemem_incr pages.
1600 	 */
1601 	free_get = freemem_incr;
1602 	if (free_get > mhp->mh_hold_todo)
1603 		free_get = mhp->mh_hold_todo;
1604 	/*
1605 	 * Take free_get pages away from freemem,
1606 	 * waiting if necessary.
1607 	 */
1608 
1609 	while (!mhp->mh_cancel) {
1610 		mutex_exit(&mhp->mh_mutex);
1611 		MDSTAT_INCR(mhp, free_loop);
1612 		/*
1613 		 * Duplicate test from page_create_throttle()
1614 		 * but don't override with !PG_WAIT.
1615 		 */
1616 		if (freemem < (free_get + throttlefree)) {
1617 			MDSTAT_INCR(mhp, free_low);
1618 			ret = 0;
1619 		} else {
1620 			ret = page_create_wait(free_get, 0);
1621 			if (ret == 0) {
1622 				/* EMPTY */
1623 				MDSTAT_INCR(mhp, free_failed);
1624 			}
1625 		}
1626 		if (ret != 0) {
1627 			mutex_enter(&mhp->mh_mutex);
1628 			return (free_get);
1629 		}
1630 
1631 		/*
1632 		 * Put pressure on pageout.
1633 		 */
1634 		page_needfree(free_get);
1635 		cv_signal(&proc_pageout->p_cv);
1636 
1637 		mutex_enter(&mhp->mh_mutex);
1638 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1639 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1640 		mutex_exit(&mhp->mh_mutex);
1641 		page_needfree(-(spgcnt_t)free_get);
1642 
1643 		mutex_enter(&mhp->mh_mutex);
1644 	}
1645 	return (0);
1646 }
1647 
1648 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1649 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1650 /*
1651  * This function is run as a helper thread for delete_memory_thread.
1652  * It is needed in order to force kaio cleanup, so that pages used in kaio
1653  * will be unlocked and subsequently relocated by delete_memory_thread.
1654  * The address of the delete_memory_threads's mem_handle is passed in to
1655  * this thread function, and is used to set the mh_aio_cleanup_done member
1656  * prior to calling thread_exit().
1657  */
1658 static void
1659 dr_aio_cleanup_thread(caddr_t amhp)
1660 {
1661 	proc_t *procp;
1662 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1663 	int cleaned;
1664 	int n = 0;
1665 	struct mem_handle *mhp;
1666 	volatile uint_t *pcancel;
1667 
1668 	mhp = (struct mem_handle *)amhp;
1669 	ASSERT(mhp != NULL);
1670 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1671 	if (modload("sys", "kaio") == -1) {
1672 		mhp->mh_aio_cleanup_done = 1;
1673 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1674 		thread_exit();
1675 	}
1676 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1677 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1678 	if (aio_cleanup_dr_delete_memory == NULL) {
1679 		mhp->mh_aio_cleanup_done = 1;
1680 		cmn_err(CE_WARN,
1681 	    "aio_cleanup_dr_delete_memory not found in kaio");
1682 		thread_exit();
1683 	}
1684 	do {
1685 		cleaned = 0;
1686 		mutex_enter(&pidlock);
1687 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1688 		    procp = procp->p_next) {
1689 			mutex_enter(&procp->p_lock);
1690 			if (procp->p_aio != NULL) {
1691 				/* cleanup proc's outstanding kaio */
1692 				cleaned +=
1693 				    (*aio_cleanup_dr_delete_memory)(procp);
1694 			}
1695 			mutex_exit(&procp->p_lock);
1696 		}
1697 		mutex_exit(&pidlock);
1698 		if ((*pcancel == 0) &&
1699 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1700 			/* delay a bit before retrying all procs again */
1701 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1702 			n = 0;
1703 		}
1704 	} while (*pcancel == 0);
1705 	mhp->mh_aio_cleanup_done = 1;
1706 	thread_exit();
1707 }
1708 
1709 static void
1710 delete_memory_thread(caddr_t amhp)
1711 {
1712 	struct mem_handle *mhp;
1713 	struct memdelspan *mdsp;
1714 	callb_cpr_t cprinfo;
1715 	page_t *pp_targ;
1716 	spgcnt_t freemem_left;
1717 	void (*del_complete_funcp)(void *, int error);
1718 	void *del_complete_arg;
1719 	int comp_code;
1720 	int ret;
1721 	int first_scan;
1722 	uint_t szc;
1723 #ifdef MEM_DEL_STATS
1724 	uint64_t start_total, ntick_total;
1725 	uint64_t start_pgrp, ntick_pgrp;
1726 #endif /* MEM_DEL_STATS */
1727 
1728 	mhp = (struct mem_handle *)amhp;
1729 
1730 #ifdef MEM_DEL_STATS
1731 	start_total = ddi_get_lbolt();
1732 #endif /* MEM_DEL_STATS */
1733 
1734 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1735 	    callb_generic_cpr, "memdel");
1736 
1737 	mutex_enter(&mhp->mh_mutex);
1738 	ASSERT(mhp->mh_state == MHND_STARTING);
1739 
1740 	mhp->mh_state = MHND_RUNNING;
1741 	mhp->mh_thread_id = curthread;
1742 
1743 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1744 	mutex_exit(&mhp->mh_mutex);
1745 
1746 	/* Allocate the remap pages now, if necessary. */
1747 	memseg_remap_init();
1748 
1749 	/*
1750 	 * Subtract from availrmem now if possible as availrmem
1751 	 * may not be available by the end of the delete.
1752 	 */
1753 	if (!get_availrmem(mhp->mh_vm_pages)) {
1754 		comp_code = KPHYSM_ENOTVIABLE;
1755 		mutex_enter(&mhp->mh_mutex);
1756 		goto early_exit;
1757 	}
1758 
1759 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1760 
1761 	mutex_enter(&mhp->mh_mutex);
1762 
1763 	if (ret != 0) {
1764 		mhp->mh_cancel = KPHYSM_EREFUSED;
1765 		goto refused;
1766 	}
1767 
1768 	transit_list_collect(mhp, 1);
1769 
1770 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1771 	    mdsp = mdsp->mds_next) {
1772 		ASSERT(mdsp->mds_bitmap == NULL);
1773 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1774 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1775 		    KM_SLEEP);
1776 	}
1777 
1778 	first_scan = 1;
1779 	freemem_left = 0;
1780 	/*
1781 	 * Start dr_aio_cleanup_thread, which periodically iterates
1782 	 * through the process list and invokes aio cleanup.  This
1783 	 * is needed in order to avoid a deadly embrace between the
1784 	 * delete_memory_thread (waiting on writer lock for page, with the
1785 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1786 	 * reader lock on the same page that is wanted by the
1787 	 * delete_memory_thread), and threads waiting for kaio completion
1788 	 * (blocked on spt_amp->lock).
1789 	 */
1790 	mhp->mh_dr_aio_cleanup_cancel = 0;
1791 	mhp->mh_aio_cleanup_done = 0;
1792 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1793 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1794 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1795 		pgcnt_t collected;
1796 
1797 		MDSTAT_INCR(mhp, nloop);
1798 		collected = 0;
1799 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1800 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1801 			pfn_t pfn, p_end;
1802 
1803 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1804 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1805 			    (mhp->mh_cancel == 0); pfn++) {
1806 				page_t *pp, *tpp, *tpp_targ;
1807 				pgcnt_t bit;
1808 				struct vnode *vp;
1809 				u_offset_t offset;
1810 				int mod, result;
1811 				spgcnt_t pgcnt;
1812 
1813 				bit = pfn - mdsp->mds_base;
1814 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1815 				    (1 << (bit % NBPBMW))) != 0) {
1816 					MDSTAT_INCR(mhp, already_done);
1817 					continue;
1818 				}
1819 				if (freemem_left == 0) {
1820 					freemem_left += delthr_get_freemem(mhp);
1821 					if (freemem_left == 0)
1822 						break;
1823 				}
1824 
1825 				/*
1826 				 * Release mh_mutex - some of this
1827 				 * stuff takes some time (eg PUTPAGE).
1828 				 */
1829 
1830 				mutex_exit(&mhp->mh_mutex);
1831 				MDSTAT_INCR(mhp, ncheck);
1832 
1833 				pp = page_numtopp_nolock(pfn);
1834 				if (pp == NULL) {
1835 					/*
1836 					 * Not covered by a page_t - will
1837 					 * be dealt with elsewhere.
1838 					 */
1839 					MDSTAT_INCR(mhp, nopaget);
1840 					mutex_enter(&mhp->mh_mutex);
1841 					mdsp->mds_bitmap[bit / NBPBMW] |=
1842 					    (1 << (bit % NBPBMW));
1843 					continue;
1844 				}
1845 
1846 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1847 				    SE_EXCL_WANTED | SE_RETIRED)) {
1848 					/*
1849 					 * Page in use elsewhere.  Skip it.
1850 					 */
1851 					MDSTAT_INCR(mhp, lockfail);
1852 					mutex_enter(&mhp->mh_mutex);
1853 					continue;
1854 				}
1855 				/*
1856 				 * See if the cage expanded into the delete.
1857 				 * This can happen as we have to allow the
1858 				 * cage to expand.
1859 				 */
1860 				if (PP_ISNORELOC(pp)) {
1861 					page_unlock(pp);
1862 					mutex_enter(&mhp->mh_mutex);
1863 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1864 					break;
1865 				}
1866 				if (PP_RETIRED(pp)) {
1867 					/*
1868 					 * Page has been retired and is
1869 					 * not part of the cage so we
1870 					 * can now do the accounting for
1871 					 * it.
1872 					 */
1873 					MDSTAT_INCR(mhp, retired);
1874 					mutex_enter(&mhp->mh_mutex);
1875 					mdsp->mds_bitmap[bit / NBPBMW]
1876 					    |= (1 << (bit % NBPBMW));
1877 					mdsp->mds_bitmap_retired[bit /
1878 					    NBPBMW] |=
1879 					    (1 << (bit % NBPBMW));
1880 					mhp->mh_hold_todo--;
1881 					continue;
1882 				}
1883 				ASSERT(freemem_left != 0);
1884 				if (PP_ISFREE(pp)) {
1885 					/*
1886 					 * Like page_reclaim() only 'freemem'
1887 					 * processing is already done.
1888 					 */
1889 					MDSTAT_INCR(mhp, nfree);
1890 				free_page_collect:
1891 					if (PP_ISAGED(pp)) {
1892 						page_list_sub(pp,
1893 						    PG_FREE_LIST);
1894 					} else {
1895 						page_list_sub(pp,
1896 						    PG_CACHE_LIST);
1897 					}
1898 					PP_CLRFREE(pp);
1899 					PP_CLRAGED(pp);
1900 					collected++;
1901 					mutex_enter(&mhp->mh_mutex);
1902 					page_delete_collect(pp, mhp);
1903 					mdsp->mds_bitmap[bit / NBPBMW] |=
1904 					    (1 << (bit % NBPBMW));
1905 					freemem_left--;
1906 					continue;
1907 				}
1908 				ASSERT(pp->p_vnode != NULL);
1909 				if (first_scan) {
1910 					MDSTAT_INCR(mhp, first_notfree);
1911 					page_unlock(pp);
1912 					mutex_enter(&mhp->mh_mutex);
1913 					continue;
1914 				}
1915 				/*
1916 				 * Keep stats on pages encountered that
1917 				 * are marked for retirement.
1918 				 */
1919 				if (PP_TOXIC(pp)) {
1920 					MDSTAT_INCR(mhp, toxic);
1921 				} else if (PP_PR_REQ(pp)) {
1922 					MDSTAT_INCR(mhp, failing);
1923 				}
1924 				/*
1925 				 * In certain cases below, special exceptions
1926 				 * are made for pages that are toxic.  This
1927 				 * is because the current meaning of toxic
1928 				 * is that an uncorrectable error has been
1929 				 * previously associated with the page.
1930 				 */
1931 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1932 					if (!PP_TOXIC(pp)) {
1933 						/*
1934 						 * Must relocate locked in
1935 						 * memory pages.
1936 						 */
1937 #ifdef MEM_DEL_STATS
1938 						start_pgrp = ddi_get_lbolt();
1939 #endif /* MEM_DEL_STATS */
1940 						/*
1941 						 * Lock all constituent pages
1942 						 * of a large page to ensure
1943 						 * that p_szc won't change.
1944 						 */
1945 						if (!group_page_trylock(pp,
1946 						    SE_EXCL)) {
1947 							MDSTAT_INCR(mhp,
1948 							    gptllckfail);
1949 							page_unlock(pp);
1950 							mutex_enter(
1951 							    &mhp->mh_mutex);
1952 							continue;
1953 						}
1954 						MDSTAT_INCR(mhp, npplocked);
1955 						pp_targ =
1956 						    page_get_replacement_page(
1957 						    pp, NULL, 0);
1958 						if (pp_targ != NULL) {
1959 #ifdef MEM_DEL_STATS
1960 							ntick_pgrp =
1961 							    (uint64_t)
1962 							    ddi_get_lbolt() -
1963 							    start_pgrp;
1964 #endif /* MEM_DEL_STATS */
1965 							MDSTAT_PGRP(mhp,
1966 							    ntick_pgrp);
1967 							MDSTAT_INCR(mhp,
1968 							    nlockreloc);
1969 							goto reloc;
1970 						}
1971 						group_page_unlock(pp);
1972 						page_unlock(pp);
1973 #ifdef MEM_DEL_STATS
1974 						ntick_pgrp =
1975 						    (uint64_t)ddi_get_lbolt() -
1976 						    start_pgrp;
1977 #endif /* MEM_DEL_STATS */
1978 						MDSTAT_PGRP(mhp, ntick_pgrp);
1979 						MDSTAT_INCR(mhp, nnorepl);
1980 						mutex_enter(&mhp->mh_mutex);
1981 						continue;
1982 					} else {
1983 						/*
1984 						 * Cannot do anything about
1985 						 * this page because it is
1986 						 * toxic.
1987 						 */
1988 						MDSTAT_INCR(mhp, npplkdtoxic);
1989 						page_unlock(pp);
1990 						mutex_enter(&mhp->mh_mutex);
1991 						continue;
1992 					}
1993 				}
1994 				/*
1995 				 * Unload the mappings and check if mod bit
1996 				 * is set.
1997 				 */
1998 				ASSERT(!PP_ISKAS(pp));
1999 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2000 				mod = hat_ismod(pp);
2001 
2002 #ifdef MEM_DEL_STATS
2003 				start_pgrp = ddi_get_lbolt();
2004 #endif /* MEM_DEL_STATS */
2005 				if (mod && !PP_TOXIC(pp)) {
2006 					/*
2007 					 * Lock all constituent pages
2008 					 * of a large page to ensure
2009 					 * that p_szc won't change.
2010 					 */
2011 					if (!group_page_trylock(pp, SE_EXCL)) {
2012 						MDSTAT_INCR(mhp, gptlmodfail);
2013 						page_unlock(pp);
2014 						mutex_enter(&mhp->mh_mutex);
2015 						continue;
2016 					}
2017 					pp_targ = page_get_replacement_page(pp,
2018 					    NULL, 0);
2019 					if (pp_targ != NULL) {
2020 						MDSTAT_INCR(mhp, nmodreloc);
2021 #ifdef MEM_DEL_STATS
2022 						ntick_pgrp =
2023 						    (uint64_t)ddi_get_lbolt() -
2024 						    start_pgrp;
2025 #endif /* MEM_DEL_STATS */
2026 						MDSTAT_PGRP(mhp, ntick_pgrp);
2027 						goto reloc;
2028 					}
2029 					group_page_unlock(pp);
2030 				}
2031 
2032 				if (!page_try_demote_pages(pp)) {
2033 					MDSTAT_INCR(mhp, demotefail);
2034 					page_unlock(pp);
2035 #ifdef MEM_DEL_STATS
2036 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2037 					    start_pgrp;
2038 #endif /* MEM_DEL_STATS */
2039 					MDSTAT_PGRP(mhp, ntick_pgrp);
2040 					mutex_enter(&mhp->mh_mutex);
2041 					continue;
2042 				}
2043 
2044 				/*
2045 				 * Regular 'page-out'.
2046 				 */
2047 				if (!mod) {
2048 					MDSTAT_INCR(mhp, ndestroy);
2049 					page_destroy(pp, 1);
2050 					/*
2051 					 * page_destroy was called with
2052 					 * dontfree. As long as p_lckcnt
2053 					 * and p_cowcnt are both zero, the
2054 					 * only additional action of
2055 					 * page_destroy with !dontfree is to
2056 					 * call page_free, so we can collect
2057 					 * the page here.
2058 					 */
2059 					collected++;
2060 #ifdef MEM_DEL_STATS
2061 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2062 					    start_pgrp;
2063 #endif /* MEM_DEL_STATS */
2064 					MDSTAT_PGRP(mhp, ntick_pgrp);
2065 					mutex_enter(&mhp->mh_mutex);
2066 					page_delete_collect(pp, mhp);
2067 					mdsp->mds_bitmap[bit / NBPBMW] |=
2068 					    (1 << (bit % NBPBMW));
2069 					continue;
2070 				}
2071 				/*
2072 				 * The page is toxic and the mod bit is
2073 				 * set, we cannot do anything here to deal
2074 				 * with it.
2075 				 */
2076 				if (PP_TOXIC(pp)) {
2077 					page_unlock(pp);
2078 #ifdef MEM_DEL_STATS
2079 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2080 					    start_pgrp;
2081 #endif /* MEM_DEL_STATS */
2082 					MDSTAT_PGRP(mhp, ntick_pgrp);
2083 					MDSTAT_INCR(mhp, modtoxic);
2084 					mutex_enter(&mhp->mh_mutex);
2085 					continue;
2086 				}
2087 				MDSTAT_INCR(mhp, nputpage);
2088 				vp = pp->p_vnode;
2089 				offset = pp->p_offset;
2090 				VN_HOLD(vp);
2091 				page_unlock(pp);
2092 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2093 				    B_INVAL|B_FORCE, kcred, NULL);
2094 				VN_RELE(vp);
2095 #ifdef MEM_DEL_STATS
2096 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2097 				    start_pgrp;
2098 #endif /* MEM_DEL_STATS */
2099 				MDSTAT_PGRP(mhp, ntick_pgrp);
2100 				/*
2101 				 * Try to get the page back immediately
2102 				 * so that it can be collected.
2103 				 */
2104 				pp = page_numtopp_nolock(pfn);
2105 				if (pp == NULL) {
2106 					MDSTAT_INCR(mhp, nnoreclaim);
2107 					/*
2108 					 * This should not happen as this
2109 					 * thread is deleting the page.
2110 					 * If this code is generalized, this
2111 					 * becomes a reality.
2112 					 */
2113 #ifdef DEBUG
2114 					cmn_err(CE_WARN,
2115 					    "delete_memory_thread(0x%p) "
2116 					    "pfn 0x%lx has no page_t",
2117 					    (void *)mhp, pfn);
2118 #endif /* DEBUG */
2119 					mutex_enter(&mhp->mh_mutex);
2120 					continue;
2121 				}
2122 				if (page_try_reclaim_lock(pp, SE_EXCL,
2123 				    SE_EXCL_WANTED | SE_RETIRED)) {
2124 					if (PP_ISFREE(pp)) {
2125 						goto free_page_collect;
2126 					}
2127 					page_unlock(pp);
2128 				}
2129 				MDSTAT_INCR(mhp, nnoreclaim);
2130 				mutex_enter(&mhp->mh_mutex);
2131 				continue;
2132 
2133 			reloc:
2134 				/*
2135 				 * Got some freemem and a target
2136 				 * page, so move the data to avoid
2137 				 * I/O and lock problems.
2138 				 */
2139 				ASSERT(!page_iolock_assert(pp));
2140 				MDSTAT_INCR(mhp, nreloc);
2141 				/*
2142 				 * page_relocate() will return pgcnt: the
2143 				 * number of consecutive pages relocated.
2144 				 * If it is successful, pp will be a
2145 				 * linked list of the page structs that
2146 				 * were relocated. If page_relocate() is
2147 				 * unsuccessful, pp will be unmodified.
2148 				 */
2149 #ifdef MEM_DEL_STATS
2150 				start_pgrp = ddi_get_lbolt();
2151 #endif /* MEM_DEL_STATS */
2152 				result = page_relocate(&pp, &pp_targ, 0, 0,
2153 				    &pgcnt, NULL);
2154 #ifdef MEM_DEL_STATS
2155 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2156 				    start_pgrp;
2157 #endif /* MEM_DEL_STATS */
2158 				MDSTAT_PGRP(mhp, ntick_pgrp);
2159 				if (result != 0) {
2160 					MDSTAT_INCR(mhp, nrelocfail);
2161 					/*
2162 					 * We did not succeed. We need
2163 					 * to give the pp_targ pages back.
2164 					 * page_free(pp_targ, 1) without
2165 					 * the freemem accounting.
2166 					 */
2167 					group_page_unlock(pp);
2168 					page_free_replacement_page(pp_targ);
2169 					page_unlock(pp);
2170 					mutex_enter(&mhp->mh_mutex);
2171 					continue;
2172 				}
2173 
2174 				/*
2175 				 * We will then collect pgcnt pages.
2176 				 */
2177 				ASSERT(pgcnt > 0);
2178 				mutex_enter(&mhp->mh_mutex);
2179 				/*
2180 				 * We need to make sure freemem_left is
2181 				 * large enough.
2182 				 */
2183 				while ((freemem_left < pgcnt) &&
2184 				    (!mhp->mh_cancel)) {
2185 					freemem_left +=
2186 					    delthr_get_freemem(mhp);
2187 				}
2188 
2189 				/*
2190 				 * Do not proceed if mh_cancel is set.
2191 				 */
2192 				if (mhp->mh_cancel) {
2193 					while (pp_targ != NULL) {
2194 						/*
2195 						 * Unlink and unlock each page.
2196 						 */
2197 						tpp_targ = pp_targ;
2198 						page_sub(&pp_targ, tpp_targ);
2199 						page_unlock(tpp_targ);
2200 					}
2201 					/*
2202 					 * We need to give the pp pages back.
2203 					 * page_free(pp, 1) without the
2204 					 * freemem accounting.
2205 					 */
2206 					page_free_replacement_page(pp);
2207 					break;
2208 				}
2209 
2210 				/* Now remove pgcnt from freemem_left */
2211 				freemem_left -= pgcnt;
2212 				ASSERT(freemem_left >= 0);
2213 				szc = pp->p_szc;
2214 				while (pp != NULL) {
2215 					/*
2216 					 * pp and pp_targ were passed back as
2217 					 * a linked list of pages.
2218 					 * Unlink and unlock each page.
2219 					 */
2220 					tpp_targ = pp_targ;
2221 					page_sub(&pp_targ, tpp_targ);
2222 					page_unlock(tpp_targ);
2223 					/*
2224 					 * The original page is now free
2225 					 * so remove it from the linked
2226 					 * list and collect it.
2227 					 */
2228 					tpp = pp;
2229 					page_sub(&pp, tpp);
2230 					pfn = page_pptonum(tpp);
2231 					collected++;
2232 					ASSERT(PAGE_EXCL(tpp));
2233 					ASSERT(tpp->p_vnode == NULL);
2234 					ASSERT(!hat_page_is_mapped(tpp));
2235 					ASSERT(tpp->p_szc == szc);
2236 					tpp->p_szc = 0;
2237 					page_delete_collect(tpp, mhp);
2238 					bit = pfn - mdsp->mds_base;
2239 					mdsp->mds_bitmap[bit / NBPBMW] |=
2240 					    (1 << (bit % NBPBMW));
2241 				}
2242 				ASSERT(pp_targ == NULL);
2243 			}
2244 		}
2245 		first_scan = 0;
2246 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2247 		    (collected == 0)) {
2248 			/*
2249 			 * This code is needed as we cannot wait
2250 			 * for a page to be locked OR the delete to
2251 			 * be cancelled.  Also, we must delay so
2252 			 * that other threads get a chance to run
2253 			 * on our cpu, otherwise page locks may be
2254 			 * held indefinitely by those threads.
2255 			 */
2256 			MDSTAT_INCR(mhp, ndelay);
2257 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2258 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2259 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2260 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2261 		}
2262 	}
2263 	/* stop the dr aio cleanup thread */
2264 	mhp->mh_dr_aio_cleanup_cancel = 1;
2265 	transit_list_collect(mhp, 0);
2266 	if (freemem_left != 0) {
2267 		/* Return any surplus. */
2268 		page_create_putback(freemem_left);
2269 		freemem_left = 0;
2270 	}
2271 #ifdef MEM_DEL_STATS
2272 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2273 #endif /* MEM_DEL_STATS */
2274 	MDSTAT_TOTAL(mhp, ntick_total);
2275 	MDSTAT_PRINT(mhp);
2276 
2277 	/*
2278 	 * If the memory delete was cancelled, exclusive-wanted bits must
2279 	 * be cleared. If there are retired pages being deleted, they need
2280 	 * to be unretired.
2281 	 */
2282 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2283 	    mdsp = mdsp->mds_next) {
2284 		pfn_t pfn, p_end;
2285 
2286 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2287 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2288 			page_t *pp;
2289 			pgcnt_t bit;
2290 
2291 			bit = pfn - mdsp->mds_base;
2292 			if (mhp->mh_cancel) {
2293 				pp = page_numtopp_nolock(pfn);
2294 				if (pp != NULL) {
2295 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2296 					    (1 << (bit % NBPBMW))) == 0) {
2297 						page_lock_clr_exclwanted(pp);
2298 					}
2299 				}
2300 			} else {
2301 				pp = NULL;
2302 			}
2303 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2304 			    (1 << (bit % NBPBMW))) != 0) {
2305 				/* do we already have pp? */
2306 				if (pp == NULL) {
2307 					pp = page_numtopp_nolock(pfn);
2308 				}
2309 				ASSERT(pp != NULL);
2310 				ASSERT(PP_RETIRED(pp));
2311 				if (mhp->mh_cancel != 0) {
2312 					page_unlock(pp);
2313 					/*
2314 					 * To satisfy ASSERT below in
2315 					 * cancel code.
2316 					 */
2317 					mhp->mh_hold_todo++;
2318 				} else {
2319 					(void) page_unretire_pp(pp,
2320 					    PR_UNR_CLEAN);
2321 				}
2322 			}
2323 		}
2324 	}
2325 	/*
2326 	 * Free retired page bitmap and collected page bitmap
2327 	 */
2328 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2329 	    mdsp = mdsp->mds_next) {
2330 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2331 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2332 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2333 		ASSERT(mdsp->mds_bitmap != NULL);
2334 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2335 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2336 	}
2337 
2338 	/* wait for our dr aio cancel thread to exit */
2339 	while (!(mhp->mh_aio_cleanup_done)) {
2340 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2341 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2342 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2343 	}
2344 refused:
2345 	if (mhp->mh_cancel != 0) {
2346 		page_t *pp;
2347 
2348 		comp_code = mhp->mh_cancel;
2349 		/*
2350 		 * Go through list of deleted pages (mh_deleted) freeing
2351 		 * them.
2352 		 */
2353 		while ((pp = mhp->mh_deleted) != NULL) {
2354 			mhp->mh_deleted = pp->p_next;
2355 			mhp->mh_hold_todo++;
2356 			mutex_exit(&mhp->mh_mutex);
2357 			/* Restore p_next. */
2358 			pp->p_next = pp->p_prev;
2359 			if (PP_ISFREE(pp)) {
2360 				cmn_err(CE_PANIC,
2361 				    "page %p is free",
2362 				    (void *)pp);
2363 			}
2364 			page_free(pp, 1);
2365 			mutex_enter(&mhp->mh_mutex);
2366 		}
2367 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2368 
2369 		mutex_exit(&mhp->mh_mutex);
2370 		put_availrmem(mhp->mh_vm_pages);
2371 		mutex_enter(&mhp->mh_mutex);
2372 
2373 		goto t_exit;
2374 	}
2375 
2376 	/*
2377 	 * All the pages are no longer in use and are exclusively locked.
2378 	 */
2379 
2380 	mhp->mh_deleted = NULL;
2381 
2382 	kphysm_del_cleanup(mhp);
2383 
2384 	/*
2385 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
2386 	 * that the mem_node_config[] will remain intact for the cleanup.
2387 	 */
2388 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2389 	    mdsp = mdsp->mds_next) {
2390 		mem_node_del_range(mdsp->mds_base,
2391 		    mdsp->mds_base + mdsp->mds_npgs - 1);
2392 	}
2393 	/* cleanup the page counters */
2394 	page_ctrs_cleanup();
2395 
2396 	comp_code = KPHYSM_OK;
2397 
2398 t_exit:
2399 	mutex_exit(&mhp->mh_mutex);
2400 	kphysm_setup_post_del(mhp->mh_vm_pages,
2401 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2402 	mutex_enter(&mhp->mh_mutex);
2403 
2404 early_exit:
2405 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2406 	mhp->mh_state = MHND_DONE;
2407 	del_complete_funcp = mhp->mh_delete_complete;
2408 	del_complete_arg = mhp->mh_delete_complete_arg;
2409 	CALLB_CPR_EXIT(&cprinfo);
2410 	(*del_complete_funcp)(del_complete_arg, comp_code);
2411 	thread_exit();
2412 	/*NOTREACHED*/
2413 }
2414 
2415 /*
2416  * Start the delete of the memory from the system.
2417  */
2418 int
2419 kphysm_del_start(
2420 	memhandle_t handle,
2421 	void (*complete)(void *, int),
2422 	void *complete_arg)
2423 {
2424 	struct mem_handle *mhp;
2425 
2426 	mhp = kphysm_lookup_mem_handle(handle);
2427 	if (mhp == NULL) {
2428 		return (KPHYSM_EHANDLE);
2429 	}
2430 	switch (mhp->mh_state) {
2431 	case MHND_FREE:
2432 		ASSERT(mhp->mh_state != MHND_FREE);
2433 		mutex_exit(&mhp->mh_mutex);
2434 		return (KPHYSM_EHANDLE);
2435 	case MHND_INIT:
2436 		break;
2437 	case MHND_STARTING:
2438 	case MHND_RUNNING:
2439 		mutex_exit(&mhp->mh_mutex);
2440 		return (KPHYSM_ESEQUENCE);
2441 	case MHND_DONE:
2442 		mutex_exit(&mhp->mh_mutex);
2443 		return (KPHYSM_ESEQUENCE);
2444 	case MHND_RELEASE:
2445 		mutex_exit(&mhp->mh_mutex);
2446 		return (KPHYSM_ESEQUENCE);
2447 	default:
2448 #ifdef DEBUG
2449 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2450 		    (void *)mhp, mhp->mh_state);
2451 #endif /* DEBUG */
2452 		mutex_exit(&mhp->mh_mutex);
2453 		return (KPHYSM_EHANDLE);
2454 	}
2455 
2456 	if (mhp->mh_transit.trl_spans == NULL) {
2457 		mutex_exit(&mhp->mh_mutex);
2458 		return (KPHYSM_ENOWORK);
2459 	}
2460 
2461 	ASSERT(complete != NULL);
2462 	mhp->mh_delete_complete = complete;
2463 	mhp->mh_delete_complete_arg = complete_arg;
2464 	mhp->mh_state = MHND_STARTING;
2465 	/*
2466 	 * Release the mutex in case thread_create sleeps.
2467 	 */
2468 	mutex_exit(&mhp->mh_mutex);
2469 
2470 	/*
2471 	 * The "obvious" process for this thread is pageout (proc_pageout)
2472 	 * but this gives the thread too much power over freemem
2473 	 * which results in freemem starvation.
2474 	 */
2475 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2476 	    TS_RUN, maxclsyspri - 1);
2477 
2478 	return (KPHYSM_OK);
2479 }
2480 
2481 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2482 static caddr_t pp_dummy;
2483 static pgcnt_t pp_dummy_npages;
2484 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2485 
2486 static void
2487 memseg_remap_init_pages(page_t *pages, page_t *epages)
2488 {
2489 	page_t *pp;
2490 
2491 	for (pp = pages; pp < epages; pp++) {
2492 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2493 		pp->p_offset = (u_offset_t)-1;
2494 		page_iolock_init(pp);
2495 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2496 			continue;
2497 		page_lock_delete(pp);
2498 	}
2499 }
2500 
2501 void
2502 memseg_remap_init()
2503 {
2504 	mutex_enter(&pp_dummy_lock);
2505 	if (pp_dummy == NULL) {
2506 		uint_t dpages;
2507 		int i;
2508 
2509 		/*
2510 		 * dpages starts off as the size of the structure and
2511 		 * ends up as the minimum number of pages that will
2512 		 * hold a whole number of page_t structures.
2513 		 */
2514 		dpages = sizeof (page_t);
2515 		ASSERT(dpages != 0);
2516 		ASSERT(dpages <= MMU_PAGESIZE);
2517 
2518 		while ((dpages & 1) == 0)
2519 			dpages >>= 1;
2520 
2521 		pp_dummy_npages = dpages;
2522 		/*
2523 		 * Allocate pp_dummy pages directly from static_arena,
2524 		 * since these are whole page allocations and are
2525 		 * referenced by physical address.  This also has the
2526 		 * nice fringe benefit of hiding the memory from
2527 		 * ::findleaks since it doesn't deal well with allocated
2528 		 * kernel heap memory that doesn't have any mappings.
2529 		 */
2530 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2531 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2532 		bzero(pp_dummy, ptob(pp_dummy_npages));
2533 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2534 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2535 		    pp_dummy_npages, KM_SLEEP);
2536 		for (i = 0; i < pp_dummy_npages; i++) {
2537 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2538 			    &pp_dummy[MMU_PAGESIZE * i]);
2539 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2540 		}
2541 		/*
2542 		 * Initialize the page_t's to a known 'deleted' state
2543 		 * that matches the state of deleted pages.
2544 		 */
2545 		memseg_remap_init_pages((page_t *)pp_dummy,
2546 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2547 		/* Remove kmem mappings for the pages for safety. */
2548 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2549 		    HAT_UNLOAD_UNLOCK);
2550 		/* Leave pp_dummy pointer set as flag that init is done. */
2551 	}
2552 	mutex_exit(&pp_dummy_lock);
2553 }
2554 
2555 /*
2556  * Remap a page-aglined range of page_t's to dummy pages.
2557  */
2558 void
2559 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2560 {
2561 	int phase;
2562 
2563 	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));
2564 
2565 	/*
2566 	 * We may start remapping at a non-zero page offset
2567 	 * within the dummy pages since the low/high ends
2568 	 * of the outgoing pp's could be shared by other
2569 	 * memsegs (see memseg_remap_meta).
2570 	 */
2571 	phase = btop((uint64_t)va) % pp_dummy_npages;
2572 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2573 
2574 	while (metapgs != 0) {
2575 		pgcnt_t n;
2576 		int i, j;
2577 
2578 		n = pp_dummy_npages;
2579 		if (n > metapgs)
2580 			n = metapgs;
2581 		for (i = 0; i < n; i++) {
2582 			j = (i + phase) % pp_dummy_npages;
2583 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2584 			    PROT_READ,
2585 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2586 			    HAT_LOAD_REMAP);
2587 			va += ptob(1);
2588 		}
2589 		metapgs -= n;
2590 	}
2591 }
2592 
2593 static void
2594 memseg_remap_to_dummy(struct memseg *seg)
2595 {
2596 	caddr_t pp;
2597 	pgcnt_t metapgs;
2598 
2599 	ASSERT(memseg_is_dynamic(seg));
2600 	ASSERT(pp_dummy != NULL);
2601 
2602 
2603 	if (!memseg_includes_meta(seg)) {
2604 		memseg_remap_meta(seg);
2605 		return;
2606 	}
2607 
2608 	pp = (caddr_t)seg->pages;
2609 	metapgs = seg->pages_base - memseg_get_start(seg);
2610 	ASSERT(metapgs != 0);
2611 
2612 	seg->pages_end = seg->pages_base;
2613 
2614 	remap_to_dummy(pp, metapgs);
2615 }
2616 
2617 /*
2618  * Transition all the deleted pages to the deleted state so that
2619  * page_lock will not wait. The page_lock_delete call will
2620  * also wake up any waiters.
2621  */
2622 static void
2623 memseg_lock_delete_all(struct memseg *seg)
2624 {
2625 	page_t *pp;
2626 
2627 	for (pp = seg->pages; pp < seg->epages; pp++) {
2628 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2629 		page_lock_delete(pp);
2630 	}
2631 }
2632 
2633 static void
2634 kphysm_del_cleanup(struct mem_handle *mhp)
2635 {
2636 	struct memdelspan	*mdsp;
2637 	struct memseg		*seg;
2638 	struct memseg   	**segpp;
2639 	struct memseg		*seglist;
2640 	pfn_t			p_end;
2641 	uint64_t		avmem;
2642 	pgcnt_t			avpgs;
2643 	pgcnt_t			npgs;
2644 
2645 	avpgs = mhp->mh_vm_pages;
2646 
2647 	memsegs_lock(1);
2648 
2649 	/*
2650 	 * remove from main segment list.
2651 	 */
2652 	npgs = 0;
2653 	seglist = NULL;
2654 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2655 	    mdsp = mdsp->mds_next) {
2656 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2657 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2658 			if (seg->pages_base >= p_end ||
2659 			    seg->pages_end <= mdsp->mds_base) {
2660 				/* Span and memseg don't overlap. */
2661 				segpp = &((*segpp)->next);
2662 				continue;
2663 			}
2664 			ASSERT(seg->pages_base >= mdsp->mds_base);
2665 			ASSERT(seg->pages_end <= p_end);
2666 
2667 			PLCNT_MODIFY_MAX(seg->pages_base,
2668 			    seg->pages_base - seg->pages_end);
2669 
2670 			/* Hide the memseg from future scans. */
2671 			hat_kpm_delmem_mseg_update(seg, segpp);
2672 			*segpp = seg->next;
2673 			membar_producer();	/* TODO: Needed? */
2674 			npgs += MSEG_NPAGES(seg);
2675 
2676 			/*
2677 			 * Leave the deleted segment's next pointer intact
2678 			 * in case a memsegs scanning loop is walking this
2679 			 * segment concurrently.
2680 			 */
2681 			seg->lnext = seglist;
2682 			seglist = seg;
2683 		}
2684 	}
2685 
2686 	build_pfn_hash();
2687 
2688 	ASSERT(npgs < total_pages);
2689 	total_pages -= npgs;
2690 
2691 	/*
2692 	 * Recalculate the paging parameters now total_pages has changed.
2693 	 * This will also cause the clock hands to be reset before next use.
2694 	 */
2695 	setupclock(1);
2696 
2697 	memsegs_unlock(1);
2698 
2699 	mutex_exit(&mhp->mh_mutex);
2700 
2701 	while ((seg = seglist) != NULL) {
2702 		pfn_t mseg_start;
2703 		pfn_t mseg_base, mseg_end;
2704 		pgcnt_t mseg_npgs;
2705 		int mlret;
2706 
2707 		seglist = seg->lnext;
2708 
2709 		/*
2710 		 * Put the page_t's into the deleted state to stop
2711 		 * cv_wait()s on the pages. When we remap, the dummy
2712 		 * page_t's will be in the same state.
2713 		 */
2714 		memseg_lock_delete_all(seg);
2715 		/*
2716 		 * Collect up information based on pages_base and pages_end
2717 		 * early so that we can flag early that the memseg has been
2718 		 * deleted by setting pages_end == pages_base.
2719 		 */
2720 		mseg_base = seg->pages_base;
2721 		mseg_end = seg->pages_end;
2722 		mseg_npgs = MSEG_NPAGES(seg);
2723 		mseg_start = memseg_get_start(seg);
2724 
2725 		if (memseg_is_dynamic(seg)) {
2726 			/* Remap the meta data to our special dummy area. */
2727 			memseg_remap_to_dummy(seg);
2728 
2729 			mutex_enter(&memseg_lists_lock);
2730 			seg->lnext = memseg_va_avail;
2731 			memseg_va_avail = seg;
2732 			mutex_exit(&memseg_lists_lock);
2733 		} else {
2734 			/*
2735 			 * For memory whose page_ts were allocated
2736 			 * at boot, we need to find a new use for
2737 			 * the page_t memory.
2738 			 * For the moment, just leak it.
2739 			 * (It is held in the memseg_delete_junk list.)
2740 			 */
2741 			seg->pages_end = seg->pages_base;
2742 
2743 			mutex_enter(&memseg_lists_lock);
2744 			seg->lnext = memseg_delete_junk;
2745 			memseg_delete_junk = seg;
2746 			mutex_exit(&memseg_lists_lock);
2747 		}
2748 
2749 		/* Must not use seg now as it could be re-used. */
2750 
2751 		memlist_write_lock();
2752 
2753 		mlret = memlist_delete_span(
2754 		    (uint64_t)(mseg_base) << PAGESHIFT,
2755 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2756 		    &phys_avail);
2757 		ASSERT(mlret == MEML_SPANOP_OK);
2758 
2759 		mlret = memlist_delete_span(
2760 		    (uint64_t)(mseg_start) << PAGESHIFT,
2761 		    (uint64_t)(mseg_end - mseg_start) <<
2762 		    PAGESHIFT,
2763 		    &phys_install);
2764 		ASSERT(mlret == MEML_SPANOP_OK);
2765 		phys_install_has_changed();
2766 
2767 		memlist_write_unlock();
2768 	}
2769 
2770 	memlist_read_lock();
2771 	installed_top_size(phys_install, &physmax, &physinstalled);
2772 	memlist_read_unlock();
2773 
2774 	mutex_enter(&freemem_lock);
2775 	maxmem -= avpgs;
2776 	physmem -= avpgs;
2777 	/* availrmem is adjusted during the delete. */
2778 	availrmem_initial -= avpgs;
2779 
2780 	mutex_exit(&freemem_lock);
2781 
2782 	dump_resize();
2783 
2784 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2785 	    "(0x%" PRIx64 ")\n",
2786 	    physinstalled << (PAGESHIFT - 10),
2787 	    (uint64_t)physinstalled << PAGESHIFT);
2788 
2789 	avmem = (uint64_t)freemem << PAGESHIFT;
2790 	cmn_err(CE_CONT, "?kphysm_delete: "
2791 	    "avail mem = %" PRId64 "\n", avmem);
2792 
2793 	/*
2794 	 * Update lgroup generation number on single lgroup systems
2795 	 */
2796 	if (nlgrps == 1)
2797 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2798 
2799 	/* Successfully deleted system memory */
2800 	mutex_enter(&mhp->mh_mutex);
2801 }
2802 
2803 static uint_t mdel_nullvp_waiter;
2804 
2805 static void
2806 page_delete_collect(
2807 	page_t *pp,
2808 	struct mem_handle *mhp)
2809 {
2810 	if (pp->p_vnode) {
2811 		page_hashout(pp, (kmutex_t *)NULL);
2812 		/* do not do PP_SETAGED(pp); */
2813 	} else {
2814 		kmutex_t *sep;
2815 
2816 		sep = page_se_mutex(pp);
2817 		mutex_enter(sep);
2818 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2819 			mdel_nullvp_waiter++;
2820 			cv_broadcast(&pp->p_cv);
2821 		}
2822 		mutex_exit(sep);
2823 	}
2824 	ASSERT(pp->p_next == pp->p_prev);
2825 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2826 	pp->p_next = mhp->mh_deleted;
2827 	mhp->mh_deleted = pp;
2828 	ASSERT(mhp->mh_hold_todo != 0);
2829 	mhp->mh_hold_todo--;
2830 }
2831 
2832 static void
2833 transit_list_collect(struct mem_handle *mhp, int v)
2834 {
2835 	struct transit_list_head *trh;
2836 
2837 	trh = &transit_list_head;
2838 	mutex_enter(&trh->trh_lock);
2839 	mhp->mh_transit.trl_collect = v;
2840 	mutex_exit(&trh->trh_lock);
2841 }
2842 
2843 static void
2844 transit_list_insert(struct transit_list *tlp)
2845 {
2846 	struct transit_list_head *trh;
2847 
2848 	trh = &transit_list_head;
2849 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2850 	tlp->trl_next = trh->trh_head;
2851 	trh->trh_head = tlp;
2852 }
2853 
2854 static void
2855 transit_list_remove(struct transit_list *tlp)
2856 {
2857 	struct transit_list_head *trh;
2858 	struct transit_list **tlpp;
2859 
2860 	trh = &transit_list_head;
2861 	tlpp = &trh->trh_head;
2862 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2863 	while (*tlpp != NULL && *tlpp != tlp)
2864 		tlpp = &(*tlpp)->trl_next;
2865 	ASSERT(*tlpp != NULL);
2866 	if (*tlpp == tlp)
2867 		*tlpp = tlp->trl_next;
2868 	tlp->trl_next = NULL;
2869 }
2870 
2871 static struct transit_list *
2872 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2873 {
2874 	struct transit_list *tlp;
2875 
2876 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2877 		struct memdelspan *mdsp;
2878 
2879 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2880 		    mdsp = mdsp->mds_next) {
2881 			if (pfnum >= mdsp->mds_base &&
2882 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2883 				return (tlp);
2884 			}
2885 		}
2886 	}
2887 	return (NULL);
2888 }
2889 
2890 int
2891 pfn_is_being_deleted(pfn_t pfnum)
2892 {
2893 	struct transit_list_head *trh;
2894 	struct transit_list *tlp;
2895 	int ret;
2896 
2897 	trh = &transit_list_head;
2898 	if (trh->trh_head == NULL)
2899 		return (0);
2900 
2901 	mutex_enter(&trh->trh_lock);
2902 	tlp = pfnum_to_transit_list(trh, pfnum);
2903 	ret = (tlp != NULL && tlp->trl_collect);
2904 	mutex_exit(&trh->trh_lock);
2905 
2906 	return (ret);
2907 }
2908 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print one counter of (mhp)->mh_delstat on its own line, labelled
 * with the name of the field.
 */
#define	MDSTAT_SHOW(mhp, field)	\
	printf("\t%8u " #field "\n", (mhp)->mh_delstat.field)

/*
 * Print the statistics gathered for a memory delete.  Nothing is
 * printed unless mem_del_stat_print is non-zero.  The two tick totals
 * are also shown converted to minutes and seconds using hz.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	MDSTAT_SHOW(mhp, nloop);
	MDSTAT_SHOW(mhp, need_free);
	MDSTAT_SHOW(mhp, free_loop);
	MDSTAT_SHOW(mhp, free_low);
	MDSTAT_SHOW(mhp, free_failed);
	MDSTAT_SHOW(mhp, ncheck);
	MDSTAT_SHOW(mhp, nopaget);
	MDSTAT_SHOW(mhp, lockfail);
	MDSTAT_SHOW(mhp, nfree);
	MDSTAT_SHOW(mhp, nreloc);
	MDSTAT_SHOW(mhp, nrelocfail);
	MDSTAT_SHOW(mhp, already_done);
	MDSTAT_SHOW(mhp, first_notfree);
	MDSTAT_SHOW(mhp, npplocked);
	MDSTAT_SHOW(mhp, nlockreloc);
	MDSTAT_SHOW(mhp, nnorepl);
	MDSTAT_SHOW(mhp, nmodreloc);
	MDSTAT_SHOW(mhp, ndestroy);
	MDSTAT_SHOW(mhp, nputpage);
	MDSTAT_SHOW(mhp, nnoreclaim);
	MDSTAT_SHOW(mhp, ndelay);
	MDSTAT_SHOW(mhp, demotefail);
	MDSTAT_SHOW(mhp, retired);
	MDSTAT_SHOW(mhp, toxic);
	MDSTAT_SHOW(mhp, failing);
	MDSTAT_SHOW(mhp, modtoxic);
	MDSTAT_SHOW(mhp, npplkdtoxic);
	MDSTAT_SHOW(mhp, gptlmodfail);
	MDSTAT_SHOW(mhp, gptllckfail);

	secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}

#undef	MDSTAT_SHOW
#endif /* MEM_DEL_STATS */
2962 
2963 struct mem_callback {
2964 	kphysm_setup_vector_t	*vec;
2965 	void			*arg;
2966 };
2967 
2968 #define	NMEMCALLBACKS		100
2969 
2970 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2971 static uint_t nmemcallbacks;
2972 static krwlock_t mem_callback_rwlock;
2973 
2974 int
2975 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2976 {
2977 	uint_t i, found;
2978 
2979 	/*
2980 	 * This test will become more complicated when the version must
2981 	 * change.
2982 	 */
2983 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2984 		return (EINVAL);
2985 
2986 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2987 	    vec->post_del == NULL)
2988 		return (EINVAL);
2989 
2990 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2991 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2992 		if (mem_callbacks[i].vec == NULL && found == 0)
2993 			found = i + 1;
2994 		if (mem_callbacks[i].vec == vec &&
2995 		    mem_callbacks[i].arg == arg) {
2996 #ifdef DEBUG
2997 			/* Catch this in DEBUG kernels. */
2998 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2999 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
3000 			    (void *)vec, arg, (void *)caller());
3001 #endif /* DEBUG */
3002 			rw_exit(&mem_callback_rwlock);
3003 			return (EEXIST);
3004 		}
3005 	}
3006 	if (found != 0) {
3007 		i = found - 1;
3008 	} else {
3009 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
3010 		if (nmemcallbacks == NMEMCALLBACKS) {
3011 			rw_exit(&mem_callback_rwlock);
3012 			return (ENOMEM);
3013 		}
3014 		i = nmemcallbacks++;
3015 	}
3016 	mem_callbacks[i].vec = vec;
3017 	mem_callbacks[i].arg = arg;
3018 	rw_exit(&mem_callback_rwlock);
3019 	return (0);
3020 }
3021 
3022 void
3023 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3024 {
3025 	uint_t i;
3026 
3027 	rw_enter(&mem_callback_rwlock, RW_WRITER);
3028 	for (i = 0; i < nmemcallbacks; i++) {
3029 		if (mem_callbacks[i].vec == vec &&
3030 		    mem_callbacks[i].arg == arg) {
3031 			mem_callbacks[i].vec = NULL;
3032 			mem_callbacks[i].arg = NULL;
3033 			if (i == (nmemcallbacks - 1))
3034 				nmemcallbacks--;
3035 			break;
3036 		}
3037 	}
3038 	rw_exit(&mem_callback_rwlock);
3039 }
3040 
3041 static void
3042 kphysm_setup_post_add(pgcnt_t delta_pages)
3043 {
3044 	uint_t i;
3045 
3046 	rw_enter(&mem_callback_rwlock, RW_READER);
3047 	for (i = 0; i < nmemcallbacks; i++) {
3048 		if (mem_callbacks[i].vec != NULL) {
3049 			(*mem_callbacks[i].vec->post_add)
3050 			    (mem_callbacks[i].arg, delta_pages);
3051 		}
3052 	}
3053 	rw_exit(&mem_callback_rwlock);
3054 }
3055 
3056 /*
3057  * Note the locking between pre_del and post_del: The reader lock is held
3058  * between the two calls to stop the set of functions from changing.
3059  */
3060 
3061 static int
3062 kphysm_setup_pre_del(pgcnt_t delta_pages)
3063 {
3064 	uint_t i;
3065 	int ret;
3066 	int aret;
3067 
3068 	ret = 0;
3069 	rw_enter(&mem_callback_rwlock, RW_READER);
3070 	for (i = 0; i < nmemcallbacks; i++) {
3071 		if (mem_callbacks[i].vec != NULL) {
3072 			aret = (*mem_callbacks[i].vec->pre_del)
3073 			    (mem_callbacks[i].arg, delta_pages);
3074 			ret |= aret;
3075 		}
3076 	}
3077 
3078 	return (ret);
3079 }
3080 
3081 static void
3082 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3083 {
3084 	uint_t i;
3085 
3086 	for (i = 0; i < nmemcallbacks; i++) {
3087 		if (mem_callbacks[i].vec != NULL) {
3088 			(*mem_callbacks[i].vec->post_del)
3089 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3090 		}
3091 	}
3092 	rw_exit(&mem_callback_rwlock);
3093 }
3094 
3095 static int
3096 kphysm_split_memseg(
3097 	pfn_t base,
3098 	pgcnt_t npgs)
3099 {
3100 	struct memseg *seg;
3101 	struct memseg **segpp;
3102 	pgcnt_t size_low, size_high;
3103 	struct memseg *seg_low, *seg_mid, *seg_high;
3104 
3105 	/*
3106 	 * Lock the memsegs list against other updates now
3107 	 */
3108 	memsegs_lock(1);
3109 
3110 	/*
3111 	 * Find boot time memseg that wholly covers this area.
3112 	 */
3113 
3114 	/* First find the memseg with page 'base' in it. */
3115 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3116 	    segpp = &((*segpp)->next)) {
3117 		if (base >= seg->pages_base && base < seg->pages_end)
3118 			break;
3119 	}
3120 	if (seg == NULL) {
3121 		memsegs_unlock(1);
3122 		return (0);
3123 	}
3124 	if (memseg_includes_meta(seg)) {
3125 		memsegs_unlock(1);
3126 		return (0);
3127 	}
3128 	if ((base + npgs) > seg->pages_end) {
3129 		memsegs_unlock(1);
3130 		return (0);
3131 	}
3132 
3133 	/*
3134 	 * Work out the size of the two segments that will
3135 	 * surround the new segment, one for low address
3136 	 * and one for high.
3137 	 */
3138 	ASSERT(base >= seg->pages_base);
3139 	size_low = base - seg->pages_base;
3140 	ASSERT(seg->pages_end >= (base + npgs));
3141 	size_high = seg->pages_end - (base + npgs);
3142 
3143 	/*
3144 	 * Sanity check.
3145 	 */
3146 	if ((size_low + size_high) == 0) {
3147 		memsegs_unlock(1);
3148 		return (0);
3149 	}
3150 
3151 	/*
3152 	 * Allocate the new structures. The old memseg will not be freed
3153 	 * as there may be a reference to it.
3154 	 */
3155 	seg_low = NULL;
3156 	seg_high = NULL;
3157 
3158 	if (size_low != 0)
3159 		seg_low = memseg_alloc();
3160 
3161 	seg_mid = memseg_alloc();
3162 
3163 	if (size_high != 0)
3164 		seg_high = memseg_alloc();
3165 
3166 	/*
3167 	 * All allocation done now.
3168 	 */
3169 	if (size_low != 0) {
3170 		seg_low->pages = seg->pages;
3171 		seg_low->epages = seg_low->pages + size_low;
3172 		seg_low->pages_base = seg->pages_base;
3173 		seg_low->pages_end = seg_low->pages_base + size_low;
3174 		seg_low->next = seg_mid;
3175 		seg_low->msegflags = seg->msegflags;
3176 	}
3177 	if (size_high != 0) {
3178 		seg_high->pages = seg->epages - size_high;
3179 		seg_high->epages = seg_high->pages + size_high;
3180 		seg_high->pages_base = seg->pages_end - size_high;
3181 		seg_high->pages_end = seg_high->pages_base + size_high;
3182 		seg_high->next = seg->next;
3183 		seg_high->msegflags = seg->msegflags;
3184 	}
3185 
3186 	seg_mid->pages = seg->pages + size_low;
3187 	seg_mid->pages_base = seg->pages_base + size_low;
3188 	seg_mid->epages = seg->epages - size_high;
3189 	seg_mid->pages_end = seg->pages_end - size_high;
3190 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3191 	seg_mid->msegflags = seg->msegflags;
3192 
3193 	/*
3194 	 * Update hat_kpm specific info of all involved memsegs and
3195 	 * allow hat_kpm specific global chain updates.
3196 	 */
3197 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3198 
3199 	/*
3200 	 * At this point we have two equivalent memseg sub-chains,
3201 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3202 	 * the same place in the global chain. By re-writing the pointer
3203 	 * in the previous element we switch atomically from using the old
3204 	 * (seg) to the new.
3205 	 */
3206 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3207 
3208 	membar_enter();
3209 
3210 	build_pfn_hash();
3211 	memsegs_unlock(1);
3212 
3213 	/*
3214 	 * We leave the old segment, 'seg', intact as there may be
3215 	 * references to it. Also, as the value of total_pages has not
3216 	 * changed and the memsegs list is effectively the same when
3217 	 * accessed via the old or the new pointer, we do not have to
3218 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3219 	 *
3220 	 * We currently do not re-use or reclaim the page_t memory.
3221 	 * If we do, then this may have to change.
3222 	 */
3223 
3224 	mutex_enter(&memseg_lists_lock);
3225 	seg->lnext = memseg_edit_junk;
3226 	memseg_edit_junk = seg;
3227 	mutex_exit(&memseg_lists_lock);
3228 
3229 	return (1);
3230 }
3231 
3232 /*
3233  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3234  * structure using physical addresses. Therefore a kmem_cache is
3235  * used with KMC_NOHASH to avoid page crossings within a memseg
3236  * structure. KMC_NOHASH requires that no external (outside of
3237  * slab) information is allowed. This, in turn, implies that the
3238  * cache's slabsize must be exactly a single page, since per-slab
3239  * information (e.g. the freelist for the slab) is kept at the
3240  * end of the slab, where it is easy to locate. Should be changed
3241  * when a more obvious kmem_cache interface/flag will become
3242  * available.
3243  */
3244 void
3245 mem_config_init()
3246 {
3247 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3248 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3249 }
3250 
3251 struct memseg *
3252 memseg_alloc()
3253 {
3254 	struct memseg *seg;
3255 
3256 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3257 	bzero(seg, sizeof (struct memseg));
3258 
3259 	return (seg);
3260 }
3261 
3262 /*
3263  * Return whether the page_t memory for this memseg
3264  * is included in the memseg itself.
3265  */
3266 static int
3267 memseg_includes_meta(struct memseg *seg)
3268 {
3269 	return (seg->msegflags & MEMSEG_META_INCL);
3270 }
3271 
3272 pfn_t
3273 memseg_get_start(struct memseg *seg)
3274 {
3275 	pfn_t		pt_start;
3276 
3277 	if (memseg_includes_meta(seg)) {
3278 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3279 
3280 		/* Meta data is required to be at the beginning */
3281 		ASSERT(pt_start < seg->pages_base);
3282 	} else
3283 		pt_start = seg->pages_base;
3284 
3285 	return (pt_start);
3286 }
3287 
3288 /*
3289  * Invalidate memseg pointers in cpu private vm data caches.
3290  */
3291 static void
3292 memseg_cpu_vm_flush()
3293 {
3294 	cpu_t *cp;
3295 	vm_cpu_data_t *vc;
3296 
3297 	mutex_enter(&cpu_lock);
3298 	pause_cpus(NULL);
3299 
3300 	cp = cpu_list;
3301 	do {
3302 		vc = cp->cpu_vm_data;
3303 		vc->vc_pnum_memseg = NULL;
3304 		vc->vc_pnext_memseg = NULL;
3305 
3306 	} while ((cp = cp->cpu_next) != cpu_list);
3307 
3308 	start_cpus();
3309 	mutex_exit(&cpu_lock);
3310 }
3311