xref: /illumos-gate/usr/src/uts/common/os/mem_config.c (revision ab5a7454a6d76e82a121d74c74d5589cc3d37a8f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
60 extern struct memlist *phys_avail;
61 
62 extern void mem_node_add(pfn_t, pfn_t);
63 extern void mem_node_del(pfn_t, pfn_t);
64 
65 extern uint_t page_ctrs_adjust(int);
66 void page_ctrs_cleanup(void);
67 static void kphysm_setup_post_add(pgcnt_t);
68 static int kphysm_setup_pre_del(pgcnt_t);
69 static void kphysm_setup_post_del(pgcnt_t, int);
70 
71 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
72 
73 static int delspan_reserve(pfn_t, pgcnt_t);
74 static void delspan_unreserve(pfn_t, pgcnt_t);
75 
76 kmutex_t memseg_lists_lock;
77 struct memseg *memseg_va_avail;
78 struct memseg *memseg_alloc(void);
79 static struct memseg *memseg_delete_junk;
80 static struct memseg *memseg_edit_junk;
81 void memseg_remap_init(void);
82 static void memseg_remap_to_dummy(struct memseg *);
83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
84 static struct memseg *memseg_reuse(pgcnt_t);
85 
86 static struct kmem_cache *memseg_cache;
87 
88 /*
89  * Interfaces to manage externally allocated
90  * page_t memory (metadata) for a memseg.
91  */
92 #pragma weak	memseg_alloc_meta
93 #pragma weak	memseg_free_meta
94 #pragma weak	memseg_get_metapfn
95 #pragma weak	memseg_remap_meta
96 
97 extern int ppvm_enable;
98 extern page_t *ppvm_base;
99 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
100 extern void memseg_free_meta(void *, pgcnt_t);
101 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
102 extern void memseg_remap_meta(struct memseg *);
103 static int memseg_is_dynamic(struct memseg *);
104 static int memseg_includes_meta(struct memseg *);
105 pfn_t memseg_get_start(struct memseg *);
106 static void memseg_cpu_vm_flush(void);
107 
108 int meta_alloc_enable;
109 
110 /*
111  * Add a chunk of memory to the system.
112  * base: starting PAGESIZE page of new memory.
113  * npgs: length in PAGESIZE pages.
114  *
115  * Adding mem this way doesn't increase the size of the hash tables;
116  * growing them would be too hard.  This should be OK, but adding memory
117  * dynamically most likely means more hash misses, since the tables will
118  * be smaller than they otherwise would be.
119  */
/*
 * MEMSEG_DEBUG() emits a printf-style trace when memseg_debug is non-zero
 * on DEBUG kernels and expands to nothing otherwise.  The DEBUG form is
 * wrapped in do { } while (0) so that it always behaves as one statement,
 * even when used as the unbraced body of an if/else.
 */
#ifdef	DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(args...) \
	do { if (memseg_debug) printf(args); } while (0)
#else
#define	MEMSEG_DEBUG(...)
#endif
126 
127 int
128 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
129 {
130 	page_t *pp;
131 	page_t		*opp, *oepp, *segpp;
132 	struct memseg	*seg;
133 	uint64_t	avmem;
134 	pfn_t		pfn;
135 	pfn_t		pt_base = base;
136 	pgcnt_t		tpgs = npgs;
137 	pgcnt_t		metapgs = 0;
138 	int		exhausted;
139 	pfn_t		pnum;
140 	int		mnode;
141 	caddr_t		vaddr;
142 	int		reuse;
143 	int		mlret;
144 	int		rv;
145 	int		flags;
146 	int		meta_alloc = 0;
147 	void		*mapva;
148 	void		*metabase = (void *)base;
149 	pgcnt_t		nkpmpgs = 0;
150 	offset_t	kpm_pages_off;
151 
152 	cmn_err(CE_CONT,
153 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
154 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
155 
156 	/*
157 	 * Add this span in the delete list to prevent interactions.
158 	 */
159 	if (!delspan_reserve(base, npgs)) {
160 		return (KPHYSM_ESPAN);
161 	}
162 	/*
163 	 * Check to see if any of the memory span has been added
164 	 * by trying an add to the installed memory list. This
165 	 * forms the interlocking process for add.
166 	 */
167 
168 	memlist_write_lock();
169 
170 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
171 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
172 
173 	if (mlret == MEML_SPANOP_OK)
174 		installed_top_size(phys_install, &physmax, &physinstalled);
175 
176 	memlist_write_unlock();
177 
178 	if (mlret != MEML_SPANOP_OK) {
179 		if (mlret == MEML_SPANOP_EALLOC) {
180 			delspan_unreserve(pt_base, tpgs);
181 			return (KPHYSM_ERESOURCE);
182 		} else if (mlret == MEML_SPANOP_ESPAN) {
183 			delspan_unreserve(pt_base, tpgs);
184 			return (KPHYSM_ESPAN);
185 		} else {
186 			delspan_unreserve(pt_base, tpgs);
187 			return (KPHYSM_ERESOURCE);
188 		}
189 	}
190 
191 	if (meta_alloc_enable) {
192 		/*
193 		 * Allocate the page_t's from existing memory;
194 		 * if that fails, allocate from the incoming memory.
195 		 */
196 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
197 		if (rv == KPHYSM_OK) {
198 			ASSERT(metapgs);
199 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
200 			meta_alloc = 1;
201 			goto mapalloc;
202 		}
203 	}
204 
205 	/*
206 	 * We store the page_t's for this new memory in the first
207 	 * few pages of the chunk. Here, we go and get'em ...
208 	 */
209 
210 	/*
211 	 * The expression after the '-' gives the number of pages
212 	 * that will fit in the new memory based on a requirement
213 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
214 	 */
215 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
216 	    (PAGESIZE + sizeof (page_t)));
217 
218 	npgs -= metapgs;
219 	base += metapgs;
220 
221 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
222 
223 	exhausted = (metapgs == 0 || npgs == 0);
224 
225 	if (kpm_enable && !exhausted) {
226 		pgcnt_t start, end, nkpmpgs_prelim;
227 		size_t	ptsz;
228 
229 		/*
230 		 * A viable kpm large page mapping must not overlap two
231 		 * dynamic memsegs. Therefore the total size is checked
232 		 * to be at least kpm_pgsz and also whether start and end
233 		 * points are at least kpm_pgsz aligned.
234 		 */
235 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
236 		    pmodkpmp(base + npgs)) {
237 
238 			kphysm_addmem_error_undospan(pt_base, tpgs);
239 
240 			/*
241 			 * There is no specific error code for violating
242 			 * kpm granularity constraints.
243 			 */
244 			return (KPHYSM_ENOTVIABLE);
245 		}
246 
247 		start = kpmptop(ptokpmp(base));
248 		end = kpmptop(ptokpmp(base + npgs));
249 		nkpmpgs_prelim = ptokpmp(end - start);
250 		ptsz = npgs * sizeof (page_t);
251 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
252 		exhausted = (tpgs <= metapgs);
253 		if (!exhausted) {
254 			npgs = tpgs - metapgs;
255 			base = pt_base + metapgs;
256 
257 			/* final nkpmpgs */
258 			start = kpmptop(ptokpmp(base));
259 			nkpmpgs = ptokpmp(end - start);
260 			kpm_pages_off = ptsz +
261 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
262 		}
263 	}
264 
265 	/*
266 	 * Is memory area supplied too small?
267 	 */
268 	if (exhausted) {
269 		kphysm_addmem_error_undospan(pt_base, tpgs);
270 		/*
271 		 * There is no specific error code for 'too small'.
272 		 */
273 		return (KPHYSM_ERESOURCE);
274 	}
275 
276 mapalloc:
277 	/*
278 	 * We may re-use a previously allocated VA space for the page_ts
279 	 * eventually, but we need to initialize and lock the pages first.
280 	 */
281 
282 	/*
283 	 * Get an address in the kernel address map, map
284 	 * the page_t pages and see if we can touch them.
285 	 */
286 
287 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
288 	if (mapva == NULL) {
289 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
290 		    " Can't allocate VA for page_ts");
291 
292 		if (meta_alloc)
293 			memseg_free_meta(metabase, metapgs);
294 		kphysm_addmem_error_undospan(pt_base, tpgs);
295 
296 		return (KPHYSM_ERESOURCE);
297 	}
298 	pp = mapva;
299 
300 	if (physmax < (pt_base + tpgs))
301 		physmax = (pt_base + tpgs);
302 
303 	/*
304 	 * In the remapping code we map one page at a time so we must do
305 	 * the same here to match mapping sizes.
306 	 */
307 	pfn = pt_base;
308 	vaddr = (caddr_t)pp;
309 	for (pnum = 0; pnum < metapgs; pnum++) {
310 		if (meta_alloc)
311 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
312 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
313 		    PROT_READ | PROT_WRITE,
314 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
315 		pfn++;
316 		vaddr += ptob(1);
317 	}
318 
319 	if (ddi_peek32((dev_info_t *)NULL,
320 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
321 
322 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
323 		    " Can't access pp array at 0x%p [phys 0x%lx]",
324 		    (void *)pp, pt_base);
325 
326 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
327 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
328 
329 		vmem_free(heap_arena, mapva, ptob(metapgs));
330 		if (meta_alloc)
331 			memseg_free_meta(metabase, metapgs);
332 		kphysm_addmem_error_undospan(pt_base, tpgs);
333 
334 		return (KPHYSM_EFAULT);
335 	}
336 
337 	/*
338 	 * Add this memory slice to its memory node translation.
339 	 *
340 	 * Note that right now, each node may have only one slice;
341 	 * this may change with COD or in larger SSM systems with
342 	 * nested latency groups, so we must not assume that the
343 	 * node does not yet exist.
344 	 */
345 	pnum = pt_base + tpgs - 1;
346 	mem_node_add_range(pt_base, pnum);
347 
348 	/*
349 	 * Allocate or resize page counters as necessary to accommodate
350 	 * the increase in memory pages.
351 	 */
352 	mnode = PFN_2_MEM_NODE(pnum);
353 	PAGE_CTRS_ADJUST(base, npgs, rv);
354 	if (rv) {
355 
356 		mem_node_del_range(pt_base, pnum);
357 
358 		/* cleanup the  page counters */
359 		page_ctrs_cleanup();
360 
361 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
362 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
363 
364 		vmem_free(heap_arena, mapva, ptob(metapgs));
365 		if (meta_alloc)
366 			memseg_free_meta(metabase, metapgs);
367 		kphysm_addmem_error_undospan(pt_base, tpgs);
368 
369 		return (KPHYSM_ERESOURCE);
370 	}
371 
372 	/*
373 	 * Update the phys_avail memory list.
374 	 * The phys_install list was done at the start.
375 	 */
376 
377 	memlist_write_lock();
378 
379 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
380 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
381 	ASSERT(mlret == MEML_SPANOP_OK);
382 
383 	memlist_write_unlock();
384 
385 	/* See if we can find a memseg to re-use. */
386 	if (meta_alloc) {
387 		seg = memseg_reuse(0);
388 		reuse = 1;	/* force unmapping of temp mapva */
389 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
390 		/*
391 		 * There is a 1:1 fixed relationship between a pfn
392 		 * and a page_t VA.  The pfn is used as an index into
393 		 * the ppvm_base page_t table in order to calculate
394 		 * the page_t base address for a given pfn range.
395 		 */
396 		segpp = ppvm_base + base;
397 	} else {
398 		seg = memseg_reuse(metapgs);
399 		reuse = (seg != NULL);
400 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
401 		segpp = pp;
402 	}
403 
404 	/*
405 	 * Initialize the memseg structure representing this memory
406 	 * and add it to the existing list of memsegs. Do some basic
407 	 * initialization and add the memory to the system.
408 	 * In order to prevent lock deadlocks, the add_physmem()
409 	 * code is repeated here, but split into several stages.
410 	 *
411 	 * If a memseg is reused, invalidate memseg pointers in
412 	 * all cpu vm caches.  We need to do this this since the check
413 	 * 	pp >= seg->pages && pp < seg->epages
414 	 * used in various places is not atomic and so the first compare
415 	 * can happen before reuse and the second compare after reuse.
416 	 * The invalidation ensures that a memseg is not deferenced while
417 	 * it's page/pfn pointers are changing.
418 	 */
419 	if (seg == NULL) {
420 		seg = memseg_alloc();
421 		ASSERT(seg != NULL);
422 		seg->msegflags = flags;
423 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
424 		    (void *)seg, (void *)(seg->pages));
425 		seg->pages = segpp;
426 	} else {
427 		ASSERT(seg->msegflags == flags);
428 		ASSERT(seg->pages_base == seg->pages_end);
429 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
430 		    (void *)seg, (void *)(seg->pages));
431 		if (meta_alloc) {
432 			memseg_cpu_vm_flush();
433 			seg->pages = segpp;
434 		}
435 	}
436 
437 	seg->epages = seg->pages + npgs;
438 	seg->pages_base = base;
439 	seg->pages_end = base + npgs;
440 
441 	/*
442 	 * Initialize metadata. The page_ts are set to locked state
443 	 * ready to be freed.
444 	 */
445 	bzero((caddr_t)pp, ptob(metapgs));
446 
447 	pfn = seg->pages_base;
448 	/* Save the original pp base in case we reuse a memseg. */
449 	opp = pp;
450 	oepp = opp + npgs;
451 	for (pp = opp; pp < oepp; pp++) {
452 		pp->p_pagenum = pfn;
453 		pfn++;
454 		page_iolock_init(pp);
455 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
456 			continue;
457 		pp->p_offset = (u_offset_t)-1;
458 	}
459 
460 	if (reuse) {
461 		/* Remap our page_ts to the re-used memseg VA space. */
462 		pfn = pt_base;
463 		vaddr = (caddr_t)seg->pages;
464 		for (pnum = 0; pnum < metapgs; pnum++) {
465 			if (meta_alloc)
466 				pfn = memseg_get_metapfn(metabase,
467 				    (pgcnt_t)pnum);
468 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
469 			    PROT_READ | PROT_WRITE,
470 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
471 			pfn++;
472 			vaddr += ptob(1);
473 		}
474 
475 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
476 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
477 
478 		vmem_free(heap_arena, mapva, ptob(metapgs));
479 	}
480 
481 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
482 
483 	memsegs_lock(1);
484 
485 	/*
486 	 * The new memseg is inserted at the beginning of the list.
487 	 * Not only does this save searching for the tail, but in the
488 	 * case of a re-used memseg, it solves the problem of what
489 	 * happens if some process has still got a pointer to the
490 	 * memseg and follows the next pointer to continue traversing
491 	 * the memsegs list.
492 	 */
493 
494 	hat_kpm_addmem_mseg_insert(seg);
495 
496 	seg->next = memsegs;
497 	membar_producer();
498 
499 	hat_kpm_addmem_memsegs_update(seg);
500 
501 	memsegs = seg;
502 
503 	build_pfn_hash();
504 
505 	total_pages += npgs;
506 
507 	/*
508 	 * Recalculate the paging parameters now total_pages has changed.
509 	 * This will also cause the clock hands to be reset before next use.
510 	 */
511 	setupclock(1);
512 
513 	memsegs_unlock(1);
514 
515 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
516 
517 	/*
518 	 * Free the pages outside the lock to avoid locking loops.
519 	 */
520 	for (pp = seg->pages; pp < seg->epages; pp++) {
521 		page_free(pp, 1);
522 	}
523 
524 	/*
525 	 * Now that we've updated the appropriate memory lists we
526 	 * need to reset a number of globals, since we've increased memory.
527 	 * Several have already been updated for us as noted above. The
528 	 * globals we're interested in at this point are:
529 	 *   physmax - highest page frame number.
530 	 *   physinstalled - number of pages currently installed (done earlier)
531 	 *   maxmem - max free pages in the system
532 	 *   physmem - physical memory pages available
533 	 *   availrmem - real memory available
534 	 */
535 
536 	mutex_enter(&freemem_lock);
537 	maxmem += npgs;
538 	physmem += npgs;
539 	availrmem += npgs;
540 	availrmem_initial += npgs;
541 
542 	mutex_exit(&freemem_lock);
543 
544 	dump_resize();
545 
546 	page_freelist_coalesce_all(mnode);
547 
548 	kphysm_setup_post_add(npgs);
549 
550 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
551 	    "(0x%" PRIx64 ")\n",
552 	    physinstalled << (PAGESHIFT - 10),
553 	    (uint64_t)physinstalled << PAGESHIFT);
554 
555 	avmem = (uint64_t)freemem << PAGESHIFT;
556 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
557 	    "avail mem = %" PRId64 "\n", avmem);
558 
559 	/*
560 	 * Update lgroup generation number on single lgroup systems
561 	 */
562 	if (nlgrps == 1)
563 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
564 
565 	delspan_unreserve(pt_base, tpgs);
566 	return (KPHYSM_OK);		/* Successfully added system memory */
567 
568 }
569 
570 /*
571  * There are various error conditions in kphysm_add_memory_dynamic()
572  * which require a rollback of already changed global state.
573  */
574 static void
575 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
576 {
577 	int mlret;
578 
579 	/* Unreserve memory span. */
580 	memlist_write_lock();
581 
582 	mlret = memlist_delete_span(
583 	    (uint64_t)(pt_base) << PAGESHIFT,
584 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
585 
586 	ASSERT(mlret == MEML_SPANOP_OK);
587 	phys_install_has_changed();
588 	installed_top_size(phys_install, &physmax, &physinstalled);
589 
590 	memlist_write_unlock();
591 	delspan_unreserve(pt_base, tpgs);
592 }
593 
594 /*
595  * Only return an available memseg of exactly the right size
596  * if size is required.
597  * When the meta data area has it's own virtual address space
598  * we will need to manage this more carefully and do best fit
599  * allocations, possibly splitting an available area.
600  */
601 struct memseg *
602 memseg_reuse(pgcnt_t metapgs)
603 {
604 	int type;
605 	struct memseg **segpp, *seg;
606 
607 	mutex_enter(&memseg_lists_lock);
608 
609 	segpp = &memseg_va_avail;
610 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
611 		caddr_t end;
612 
613 		/*
614 		 * Make sure we are reusing the right segment type.
615 		 */
616 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
617 
618 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
619 		    != type)
620 			continue;
621 
622 		if (kpm_enable)
623 			end = hat_kpm_mseg_reuse(seg);
624 		else
625 			end = (caddr_t)seg->epages;
626 
627 		/*
628 		 * Check for the right size if it is provided.
629 		 */
630 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
631 			*segpp = seg->lnext;
632 			seg->lnext = NULL;
633 			break;
634 		}
635 	}
636 	mutex_exit(&memseg_lists_lock);
637 
638 	return (seg);
639 }
640 
/*
 * Generator for external handle values; it is pre-incremented each time
 * a mem_handle is allocated and is protected by mem_handle_list_mutex.
 */
static uint_t handle_gen;

/*
 * One contiguous run of pages taking part in an add or delete operation.
 * Spans are chained through mds_next into per-operation lists.
 */
struct memdelspan {
	struct memdelspan *mds_next;	/* next span on the same list */
	pfn_t		mds_base;	/* first page frame of the span */
	pgcnt_t		mds_npgs;	/* length of the span in pages */
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one bitmap word (uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Size in bytes of a bitmap holding one bit per page of the span,
 * rounded up to a whole number of bitmap words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
654 
/*
 * A list of spans owned by one operation.  Transit lists that currently
 * hold spans are themselves linked through trl_next, which is how
 * delspan_insert() walks every span in flight when checking for overlap.
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next transit list */
	struct memdelspan	*trl_spans;	/* spans owned by this list */
	int			trl_collect;
};

/*
 * Head of the chain of transit lists.  trh_lock is held by
 * delspan_insert() and delspan_remove() while they examine or modify
 * the chain and the span lists hanging off it.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;
667 
668 struct mem_handle;
669 static void transit_list_collect(struct mem_handle *, int);
670 static void transit_list_insert(struct transit_list *);
671 static void transit_list_remove(struct transit_list *);
672 
/*
 * Delete statistics are compiled in only when MEM_DEL_STATS is defined,
 * which happens automatically for DEBUG builds.  Without it the MDSTAT_*
 * macros below expand to nothing.
 */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/*
 * Per-handle statistics; embedded in struct mem_handle as mh_delstat.
 * The uint_t members are event counters bumped with MDSTAT_INCR(); the
 * two nticks_* members accumulate tick counts through MDSTAT_TOTAL()
 * and MDSTAT_PGRP().
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Increment counter FLD in the statistics of handle MHP. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Add ntck ticks to the overall tick total of handle MHP. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
/* Add ntck ticks to the nticks_pgrp total of handle MHP. */
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
/* Hand the statistics of handle MHP to mem_del_stat_print_func(). */
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
/* Statistics disabled: every MDSTAT_* use compiles away. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
727 
/*
 * Life-cycle states of a mem_handle.  MHND_FREE is zero, so a handle
 * fresh from kmem_zalloc() starts out free; kphysm_del_gethandle() moves
 * it to MHND_INIT, the only state in which kphysm_del_span() accepts it.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;	/* link on mem_handle_head list */
	memhandle_t	mh_exthandle;	/* value handed out to callers */
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;	/* spans added by kphysm_del_span() */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;	/* updated via MDSTAT_* macros */
#endif /* MEM_DEL_STATS */
};

/*
 * All allocated handles, linked through mh_next.  The list (and
 * handle_gen) is protected by mem_handle_list_mutex.
 */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
759 
760 static struct mem_handle *
761 kphysm_allocate_mem_handle()
762 {
763 	struct mem_handle *mhp;
764 
765 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
766 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
767 	mutex_enter(&mem_handle_list_mutex);
768 	mutex_enter(&mhp->mh_mutex);
769 	/* handle_gen is protected by list mutex. */
770 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
771 	mhp->mh_next = mem_handle_head;
772 	mem_handle_head = mhp;
773 	mutex_exit(&mem_handle_list_mutex);
774 
775 	return (mhp);
776 }
777 
778 static void
779 kphysm_free_mem_handle(struct mem_handle *mhp)
780 {
781 	struct mem_handle **mhpp;
782 
783 	ASSERT(mutex_owned(&mhp->mh_mutex));
784 	ASSERT(mhp->mh_state == MHND_FREE);
785 	/*
786 	 * Exit the mutex to preserve locking order. This is OK
787 	 * here as once in the FREE state, the handle cannot
788 	 * be found by a lookup.
789 	 */
790 	mutex_exit(&mhp->mh_mutex);
791 
792 	mutex_enter(&mem_handle_list_mutex);
793 	mhpp = &mem_handle_head;
794 	while (*mhpp != NULL && *mhpp != mhp)
795 		mhpp = &(*mhpp)->mh_next;
796 	ASSERT(*mhpp == mhp);
797 	/*
798 	 * No need to lock the handle (mh_mutex) as only
799 	 * mh_next changing and this is the only thread that
800 	 * can be referncing mhp.
801 	 */
802 	*mhpp = mhp->mh_next;
803 	mutex_exit(&mem_handle_list_mutex);
804 
805 	mutex_destroy(&mhp->mh_mutex);
806 	kmem_free(mhp, sizeof (struct mem_handle));
807 }
808 
809 /*
810  * This function finds the internal mem_handle corresponding to an
811  * external handle and returns it with the mh_mutex held.
812  */
813 static struct mem_handle *
814 kphysm_lookup_mem_handle(memhandle_t handle)
815 {
816 	struct mem_handle *mhp;
817 
818 	mutex_enter(&mem_handle_list_mutex);
819 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
820 		if (mhp->mh_exthandle == handle) {
821 			mutex_enter(&mhp->mh_mutex);
822 			/*
823 			 * The state of the handle could have been changed
824 			 * by kphysm_del_release() while waiting for mh_mutex.
825 			 */
826 			if (mhp->mh_state == MHND_FREE) {
827 				mutex_exit(&mhp->mh_mutex);
828 				continue;
829 			}
830 			break;
831 		}
832 	}
833 	mutex_exit(&mem_handle_list_mutex);
834 	return (mhp);
835 }
836 
837 int
838 kphysm_del_gethandle(memhandle_t *xmhp)
839 {
840 	struct mem_handle *mhp;
841 
842 	mhp = kphysm_allocate_mem_handle();
843 	/*
844 	 * The handle is allocated using KM_SLEEP, so cannot fail.
845 	 * If the implementation is changed, the correct error to return
846 	 * here would be KPHYSM_ENOHANDLES.
847 	 */
848 	ASSERT(mhp->mh_state == MHND_FREE);
849 	mhp->mh_state = MHND_INIT;
850 	*xmhp = mhp->mh_exthandle;
851 	mutex_exit(&mhp->mh_mutex);
852 	return (KPHYSM_OK);
853 }
854 
855 static int
856 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
857 {
858 	pfn_t e1, e2;
859 
860 	e1 = b1 + l1;
861 	e2 = b2 + l2;
862 
863 	return (!(b2 >= e1 || b1 >= e2));
864 }
865 
866 static int can_remove_pgs(pgcnt_t);
867 
868 static struct memdelspan *
869 span_to_install(pfn_t base, pgcnt_t npgs)
870 {
871 	struct memdelspan *mdsp;
872 	struct memdelspan *mdsp_new;
873 	uint64_t address, size, thislen;
874 	struct memlist *mlp;
875 
876 	mdsp_new = NULL;
877 
878 	address = (uint64_t)base << PAGESHIFT;
879 	size = (uint64_t)npgs << PAGESHIFT;
880 	while (size != 0) {
881 		memlist_read_lock();
882 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
883 			if (address >= (mlp->address + mlp->size))
884 				continue;
885 			if ((address + size) > mlp->address)
886 				break;
887 		}
888 		if (mlp == NULL) {
889 			address += size;
890 			size = 0;
891 			thislen = 0;
892 		} else {
893 			if (address < mlp->address) {
894 				size -= (mlp->address - address);
895 				address = mlp->address;
896 			}
897 			ASSERT(address >= mlp->address);
898 			if ((address + size) > (mlp->address + mlp->size)) {
899 				thislen = mlp->size - (address - mlp->address);
900 			} else {
901 				thislen = size;
902 			}
903 		}
904 		memlist_read_unlock();
905 		/* TODO: phys_install could change now */
906 		if (thislen == 0)
907 			continue;
908 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
909 		mdsp->mds_base = btop(address);
910 		mdsp->mds_npgs = btop(thislen);
911 		mdsp->mds_next = mdsp_new;
912 		mdsp_new = mdsp;
913 		address += thislen;
914 		size -= thislen;
915 	}
916 	return (mdsp_new);
917 }
918 
919 static void
920 free_delspans(struct memdelspan *mdsp)
921 {
922 	struct memdelspan *amdsp;
923 
924 	while ((amdsp = mdsp) != NULL) {
925 		mdsp = amdsp->mds_next;
926 		kmem_free(amdsp, sizeof (struct memdelspan));
927 	}
928 }
929 
930 /*
931  * Concatenate lists. No list ordering is required.
932  */
933 
934 static void
935 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
936 {
937 	while (*mdspp != NULL)
938 		mdspp = &(*mdspp)->mds_next;
939 
940 	*mdspp = mdsp;
941 }
942 
943 /*
944  * Given a new list of delspans, check there is no overlap with
945  * all existing span activity (add or delete) and then concatenate
946  * the new spans to the given list.
947  * Return 1 for OK, 0 if overlapping.
948  */
949 static int
950 delspan_insert(
951 	struct transit_list *my_tlp,
952 	struct memdelspan *mdsp_new)
953 {
954 	struct transit_list_head *trh;
955 	struct transit_list *tlp;
956 	int ret;
957 
958 	trh = &transit_list_head;
959 
960 	ASSERT(my_tlp != NULL);
961 	ASSERT(mdsp_new != NULL);
962 
963 	ret = 1;
964 	mutex_enter(&trh->trh_lock);
965 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
966 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
967 		struct memdelspan *mdsp;
968 
969 		for (mdsp = tlp->trl_spans; mdsp != NULL;
970 		    mdsp = mdsp->mds_next) {
971 			struct memdelspan *nmdsp;
972 
973 			for (nmdsp = mdsp_new; nmdsp != NULL;
974 			    nmdsp = nmdsp->mds_next) {
975 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
976 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
977 					ret = 0;
978 					goto done;
979 				}
980 			}
981 		}
982 	}
983 done:
984 	if (ret != 0) {
985 		if (my_tlp->trl_spans == NULL)
986 			transit_list_insert(my_tlp);
987 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
988 	}
989 	mutex_exit(&trh->trh_lock);
990 	return (ret);
991 }
992 
993 static void
994 delspan_remove(
995 	struct transit_list *my_tlp,
996 	pfn_t base,
997 	pgcnt_t npgs)
998 {
999 	struct transit_list_head *trh;
1000 	struct memdelspan *mdsp;
1001 
1002 	trh = &transit_list_head;
1003 
1004 	ASSERT(my_tlp != NULL);
1005 
1006 	mutex_enter(&trh->trh_lock);
1007 	if ((mdsp = my_tlp->trl_spans) != NULL) {
1008 		if (npgs == 0) {
1009 			my_tlp->trl_spans = NULL;
1010 			free_delspans(mdsp);
1011 			transit_list_remove(my_tlp);
1012 		} else {
1013 			struct memdelspan **prv;
1014 
1015 			prv = &my_tlp->trl_spans;
1016 			while (mdsp != NULL) {
1017 				pfn_t p_end;
1018 
1019 				p_end = mdsp->mds_base + mdsp->mds_npgs;
1020 				if (mdsp->mds_base >= base &&
1021 				    p_end <= (base + npgs)) {
1022 					*prv = mdsp->mds_next;
1023 					mdsp->mds_next = NULL;
1024 					free_delspans(mdsp);
1025 				} else {
1026 					prv = &mdsp->mds_next;
1027 				}
1028 				mdsp = *prv;
1029 			}
1030 			if (my_tlp->trl_spans == NULL)
1031 				transit_list_remove(my_tlp);
1032 		}
1033 	}
1034 	mutex_exit(&trh->trh_lock);
1035 }
1036 
1037 /*
1038  * Reserve interface for add to stop delete before add finished.
1039  * This list is only accessed through the delspan_insert/remove
1040  * functions and so is fully protected by the mutex in struct transit_list.
1041  */
1042 
1043 static struct transit_list reserve_transit;
1044 
1045 static int
1046 delspan_reserve(pfn_t base, pgcnt_t npgs)
1047 {
1048 	struct memdelspan *mdsp;
1049 	int ret;
1050 
1051 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1052 	mdsp->mds_base = base;
1053 	mdsp->mds_npgs = npgs;
1054 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1055 		free_delspans(mdsp);
1056 	}
1057 	return (ret);
1058 }
1059 
/*
 * Drop a reservation made with delspan_reserve() by removing the spans
 * for [base, base + npgs) from reserve_transit via delspan_remove().
 * Note that delspan_remove() treats npgs == 0 as "remove every span".
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}
1065 
/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 * The result is the MEMSEG_DYNAMIC bit of msegflags itself, i.e. zero
 * or a non-zero value that is not necessarily 1.
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_DYNAMIC);
}
1074 
1075 int
1076 kphysm_del_span(
1077 	memhandle_t handle,
1078 	pfn_t base,
1079 	pgcnt_t npgs)
1080 {
1081 	struct mem_handle *mhp;
1082 	struct memseg *seg;
1083 	struct memdelspan *mdsp;
1084 	struct memdelspan *mdsp_new;
1085 	pgcnt_t phys_pages, vm_pages;
1086 	pfn_t p_end;
1087 	page_t *pp;
1088 	int ret;
1089 
1090 	mhp = kphysm_lookup_mem_handle(handle);
1091 	if (mhp == NULL) {
1092 		return (KPHYSM_EHANDLE);
1093 	}
1094 	if (mhp->mh_state != MHND_INIT) {
1095 		mutex_exit(&mhp->mh_mutex);
1096 		return (KPHYSM_ESEQUENCE);
1097 	}
1098 
1099 	/*
1100 	 * Intersect the span with the installed memory list (phys_install).
1101 	 */
1102 	mdsp_new = span_to_install(base, npgs);
1103 	if (mdsp_new == NULL) {
1104 		/*
1105 		 * No physical memory in this range. Is this an
1106 		 * error? If an attempt to start the delete is made
1107 		 * for OK returns from del_span such as this, start will
1108 		 * return an error.
1109 		 * Could return KPHYSM_ENOWORK.
1110 		 */
1111 		/*
1112 		 * It is assumed that there are no error returns
1113 		 * from span_to_install() due to kmem_alloc failure.
1114 		 */
1115 		mutex_exit(&mhp->mh_mutex);
1116 		return (KPHYSM_OK);
1117 	}
1118 	/*
1119 	 * Does this span overlap an existing span?
1120 	 */
1121 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1122 		/*
1123 		 * Differentiate between already on list for this handle
1124 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1125 		 */
1126 		ret = KPHYSM_EBUSY;
1127 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1128 		    mdsp = mdsp->mds_next) {
1129 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1130 			    base, npgs)) {
1131 				ret = KPHYSM_EDUP;
1132 				break;
1133 			}
1134 		}
1135 		mutex_exit(&mhp->mh_mutex);
1136 		free_delspans(mdsp_new);
1137 		return (ret);
1138 	}
1139 	/*
1140 	 * At this point the spans in mdsp_new have been inserted into the
1141 	 * list of spans for this handle and thereby to the global list of
1142 	 * spans being processed. Each of these spans must now be checked
1143 	 * for relocatability. As a side-effect segments in the memseg list
1144 	 * may be split.
1145 	 *
1146 	 * Note that mdsp_new can no longer be used as it is now part of
1147 	 * a larger list. Select elements of this larger list based
1148 	 * on base and npgs.
1149 	 */
1150 restart:
1151 	phys_pages = 0;
1152 	vm_pages = 0;
1153 	ret = KPHYSM_OK;
1154 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1155 	    mdsp = mdsp->mds_next) {
1156 		pgcnt_t pages_checked;
1157 
1158 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1159 			continue;
1160 		}
1161 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1162 		/*
1163 		 * The pages_checked count is a hack. All pages should be
1164 		 * checked for relocatability. Those not covered by memsegs
1165 		 * should be tested with arch_kphysm_del_span_ok().
1166 		 */
1167 		pages_checked = 0;
1168 		for (seg = memsegs; seg; seg = seg->next) {
1169 			pfn_t mseg_start;
1170 
1171 			if (seg->pages_base >= p_end ||
1172 			    seg->pages_end <= mdsp->mds_base) {
1173 				/* Span and memseg don't overlap. */
1174 				continue;
1175 			}
1176 			mseg_start = memseg_get_start(seg);
1177 			/* Check that segment is suitable for delete. */
1178 			if (memseg_includes_meta(seg)) {
1179 				/*
1180 				 * Check that this segment is completely
1181 				 * within the span.
1182 				 */
1183 				if (mseg_start < mdsp->mds_base ||
1184 				    seg->pages_end > p_end) {
1185 					ret = KPHYSM_EBUSY;
1186 					break;
1187 				}
1188 				pages_checked += seg->pages_end - mseg_start;
1189 			} else {
1190 				/*
1191 				 * If this segment is larger than the span,
1192 				 * try to split it. After the split, it
1193 				 * is necessary to restart.
1194 				 */
1195 				if (seg->pages_base < mdsp->mds_base ||
1196 				    seg->pages_end > p_end) {
1197 					pfn_t abase;
1198 					pgcnt_t anpgs;
1199 					int s_ret;
1200 
1201 					/* Split required.  */
1202 					if (mdsp->mds_base < seg->pages_base)
1203 						abase = seg->pages_base;
1204 					else
1205 						abase = mdsp->mds_base;
1206 					if (p_end > seg->pages_end)
1207 						anpgs = seg->pages_end - abase;
1208 					else
1209 						anpgs = p_end - abase;
1210 					s_ret = kphysm_split_memseg(abase,
1211 					    anpgs);
1212 					if (s_ret == 0) {
1213 						/* Split failed. */
1214 						ret = KPHYSM_ERESOURCE;
1215 						break;
1216 					}
1217 					goto restart;
1218 				}
1219 				pages_checked +=
1220 				    seg->pages_end - seg->pages_base;
1221 			}
1222 			/*
1223 			 * The memseg is wholly within the delete span.
1224 			 * The individual pages can now be checked.
1225 			 */
1226 			/* Cage test. */
1227 			for (pp = seg->pages; pp < seg->epages; pp++) {
1228 				if (PP_ISNORELOC(pp)) {
1229 					ret = KPHYSM_ENONRELOC;
1230 					break;
1231 				}
1232 			}
1233 			if (ret != KPHYSM_OK) {
1234 				break;
1235 			}
1236 			phys_pages += (seg->pages_end - mseg_start);
1237 			vm_pages += MSEG_NPAGES(seg);
1238 		}
1239 		if (ret != KPHYSM_OK)
1240 			break;
1241 		if (pages_checked != mdsp->mds_npgs) {
1242 			ret = KPHYSM_ENONRELOC;
1243 			break;
1244 		}
1245 	}
1246 
1247 	if (ret == KPHYSM_OK) {
1248 		mhp->mh_phys_pages += phys_pages;
1249 		mhp->mh_vm_pages += vm_pages;
1250 	} else {
1251 		/*
1252 		 * Keep holding the mh_mutex to prevent it going away.
1253 		 */
1254 		delspan_remove(&mhp->mh_transit, base, npgs);
1255 	}
1256 	mutex_exit(&mhp->mh_mutex);
1257 	return (ret);
1258 }
1259 
1260 int
1261 kphysm_del_span_query(
1262 	pfn_t base,
1263 	pgcnt_t npgs,
1264 	memquery_t *mqp)
1265 {
1266 	struct memdelspan *mdsp;
1267 	struct memdelspan *mdsp_new;
1268 	int done_first_nonreloc;
1269 
1270 	mqp->phys_pages = 0;
1271 	mqp->managed = 0;
1272 	mqp->nonrelocatable = 0;
1273 	mqp->first_nonrelocatable = 0;
1274 	mqp->last_nonrelocatable = 0;
1275 
1276 	mdsp_new = span_to_install(base, npgs);
1277 	/*
1278 	 * It is OK to proceed here if mdsp_new == NULL.
1279 	 */
1280 	done_first_nonreloc = 0;
1281 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1282 		pfn_t sbase;
1283 		pgcnt_t snpgs;
1284 
1285 		mqp->phys_pages += mdsp->mds_npgs;
1286 		sbase = mdsp->mds_base;
1287 		snpgs = mdsp->mds_npgs;
1288 		while (snpgs != 0) {
1289 			struct memseg *lseg, *seg;
1290 			pfn_t p_end;
1291 			page_t *pp;
1292 			pfn_t mseg_start;
1293 
1294 			p_end = sbase + snpgs;
1295 			/*
1296 			 * Find the lowest addressed memseg that starts
1297 			 * after sbase and account for it.
1298 			 * This is to catch dynamic memsegs whose start
1299 			 * is hidden.
1300 			 */
1301 			seg = NULL;
1302 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1303 				if ((lseg->pages_base >= sbase) ||
1304 				    (lseg->pages_base < p_end &&
1305 				    lseg->pages_end > sbase)) {
1306 					if (seg == NULL ||
1307 					    seg->pages_base > lseg->pages_base)
1308 						seg = lseg;
1309 				}
1310 			}
1311 			if (seg != NULL) {
1312 				mseg_start = memseg_get_start(seg);
1313 				/*
1314 				 * Now have the full extent of the memseg so
1315 				 * do the range check.
1316 				 */
1317 				if (mseg_start >= p_end ||
1318 				    seg->pages_end <= sbase) {
1319 					/* Span does not overlap memseg. */
1320 					seg = NULL;
1321 				}
1322 			}
1323 			/*
1324 			 * Account for gap either before the segment if
1325 			 * there is one or to the end of the span.
1326 			 */
1327 			if (seg == NULL || mseg_start > sbase) {
1328 				pfn_t a_end;
1329 
1330 				a_end = (seg == NULL) ? p_end : mseg_start;
1331 				/*
1332 				 * Check with arch layer for relocatability.
1333 				 */
1334 				if (arch_kphysm_del_span_ok(sbase,
1335 				    (a_end - sbase))) {
1336 					/*
1337 					 * No non-relocatble pages in this
1338 					 * area, avoid the fine-grained
1339 					 * test.
1340 					 */
1341 					snpgs -= (a_end - sbase);
1342 					sbase = a_end;
1343 				}
1344 				while (sbase < a_end) {
1345 					if (!arch_kphysm_del_span_ok(sbase,
1346 					    1)) {
1347 						mqp->nonrelocatable++;
1348 						if (!done_first_nonreloc) {
1349 							mqp->
1350 							    first_nonrelocatable
1351 							    = sbase;
1352 							done_first_nonreloc = 1;
1353 						}
1354 						mqp->last_nonrelocatable =
1355 						    sbase;
1356 					}
1357 					sbase++;
1358 					snpgs--;
1359 				}
1360 			}
1361 			if (seg != NULL) {
1362 				ASSERT(mseg_start <= sbase);
1363 				if (seg->pages_base != mseg_start &&
1364 				    seg->pages_base > sbase) {
1365 					pgcnt_t skip_pgs;
1366 
1367 					/*
1368 					 * Skip the page_t area of a
1369 					 * dynamic memseg.
1370 					 */
1371 					skip_pgs = seg->pages_base - sbase;
1372 					if (snpgs <= skip_pgs) {
1373 						sbase += snpgs;
1374 						snpgs = 0;
1375 						continue;
1376 					}
1377 					snpgs -= skip_pgs;
1378 					sbase += skip_pgs;
1379 				}
1380 				ASSERT(snpgs != 0);
1381 				ASSERT(seg->pages_base <= sbase);
1382 				/*
1383 				 * The individual pages can now be checked.
1384 				 */
1385 				for (pp = seg->pages +
1386 				    (sbase - seg->pages_base);
1387 				    snpgs != 0 && pp < seg->epages; pp++) {
1388 					mqp->managed++;
1389 					if (PP_ISNORELOC(pp)) {
1390 						mqp->nonrelocatable++;
1391 						if (!done_first_nonreloc) {
1392 							mqp->
1393 							    first_nonrelocatable
1394 							    = sbase;
1395 							done_first_nonreloc = 1;
1396 						}
1397 						mqp->last_nonrelocatable =
1398 						    sbase;
1399 					}
1400 					sbase++;
1401 					snpgs--;
1402 				}
1403 			}
1404 		}
1405 	}
1406 
1407 	free_delspans(mdsp_new);
1408 
1409 	return (KPHYSM_OK);
1410 }
1411 
1412 /*
1413  * This release function can be called at any stage as follows:
1414  *	_gethandle only called
1415  *	_span(s) only called
1416  *	_start called but failed
1417  *	delete thread exited
1418  */
1419 int
1420 kphysm_del_release(memhandle_t handle)
1421 {
1422 	struct mem_handle *mhp;
1423 
1424 	mhp = kphysm_lookup_mem_handle(handle);
1425 	if (mhp == NULL) {
1426 		return (KPHYSM_EHANDLE);
1427 	}
1428 	switch (mhp->mh_state) {
1429 	case MHND_STARTING:
1430 	case MHND_RUNNING:
1431 		mutex_exit(&mhp->mh_mutex);
1432 		return (KPHYSM_ENOTFINISHED);
1433 	case MHND_FREE:
1434 		ASSERT(mhp->mh_state != MHND_FREE);
1435 		mutex_exit(&mhp->mh_mutex);
1436 		return (KPHYSM_EHANDLE);
1437 	case MHND_INIT:
1438 		break;
1439 	case MHND_DONE:
1440 		break;
1441 	case MHND_RELEASE:
1442 		mutex_exit(&mhp->mh_mutex);
1443 		return (KPHYSM_ESEQUENCE);
1444 	default:
1445 #ifdef DEBUG
1446 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1447 		    (void *)mhp, mhp->mh_state);
1448 #endif /* DEBUG */
1449 		mutex_exit(&mhp->mh_mutex);
1450 		return (KPHYSM_EHANDLE);
1451 	}
1452 	/*
1453 	 * Set state so that we can wait if necessary.
1454 	 * Also this means that we have read/write access to all
1455 	 * fields except mh_exthandle and mh_state.
1456 	 */
1457 	mhp->mh_state = MHND_RELEASE;
1458 	/*
1459 	 * The mem_handle cannot be de-allocated by any other operation
1460 	 * now, so no need to hold mh_mutex.
1461 	 */
1462 	mutex_exit(&mhp->mh_mutex);
1463 
1464 	delspan_remove(&mhp->mh_transit, 0, 0);
1465 	mhp->mh_phys_pages = 0;
1466 	mhp->mh_vm_pages = 0;
1467 	mhp->mh_hold_todo = 0;
1468 	mhp->mh_delete_complete = NULL;
1469 	mhp->mh_delete_complete_arg = NULL;
1470 	mhp->mh_cancel = 0;
1471 
1472 	mutex_enter(&mhp->mh_mutex);
1473 	ASSERT(mhp->mh_state == MHND_RELEASE);
1474 	mhp->mh_state = MHND_FREE;
1475 
1476 	kphysm_free_mem_handle(mhp);
1477 
1478 	return (KPHYSM_OK);
1479 }
1480 
1481 /*
1482  * This cancel function can only be called with the thread running.
1483  */
1484 int
1485 kphysm_del_cancel(memhandle_t handle)
1486 {
1487 	struct mem_handle *mhp;
1488 
1489 	mhp = kphysm_lookup_mem_handle(handle);
1490 	if (mhp == NULL) {
1491 		return (KPHYSM_EHANDLE);
1492 	}
1493 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1494 		mutex_exit(&mhp->mh_mutex);
1495 		return (KPHYSM_ENOTRUNNING);
1496 	}
1497 	/*
1498 	 * Set the cancel flag and wake the delete thread up.
1499 	 * The thread may be waiting on I/O, so the effect of the cancel
1500 	 * may be delayed.
1501 	 */
1502 	if (mhp->mh_cancel == 0) {
1503 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1504 		cv_signal(&mhp->mh_cv);
1505 	}
1506 	mutex_exit(&mhp->mh_mutex);
1507 	return (KPHYSM_OK);
1508 }
1509 
1510 int
1511 kphysm_del_status(
1512 	memhandle_t handle,
1513 	memdelstat_t *mdstp)
1514 {
1515 	struct mem_handle *mhp;
1516 
1517 	mhp = kphysm_lookup_mem_handle(handle);
1518 	if (mhp == NULL) {
1519 		return (KPHYSM_EHANDLE);
1520 	}
1521 	/*
1522 	 * Calling kphysm_del_status() is allowed before the delete
1523 	 * is started to allow for status display.
1524 	 */
1525 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1526 	    mhp->mh_state != MHND_RUNNING) {
1527 		mutex_exit(&mhp->mh_mutex);
1528 		return (KPHYSM_ENOTRUNNING);
1529 	}
1530 	mdstp->phys_pages = mhp->mh_phys_pages;
1531 	mdstp->managed = mhp->mh_vm_pages;
1532 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1533 	mutex_exit(&mhp->mh_mutex);
1534 	return (KPHYSM_OK);
1535 }
1536 
1537 static int mem_delete_additional_pages = 100;
1538 
1539 static int
1540 can_remove_pgs(pgcnt_t npgs)
1541 {
1542 	/*
1543 	 * If all pageable pages were paged out, freemem would
1544 	 * equal availrmem.  There is a minimum requirement for
1545 	 * availrmem.
1546 	 */
1547 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1548 	    < npgs)
1549 		return (0);
1550 	/* TODO: check swap space, etc. */
1551 	return (1);
1552 }
1553 
1554 static int
1555 get_availrmem(pgcnt_t npgs)
1556 {
1557 	int ret;
1558 
1559 	mutex_enter(&freemem_lock);
1560 	ret = can_remove_pgs(npgs);
1561 	if (ret != 0)
1562 		availrmem -= npgs;
1563 	mutex_exit(&freemem_lock);
1564 	return (ret);
1565 }
1566 
1567 static void
1568 put_availrmem(pgcnt_t npgs)
1569 {
1570 	mutex_enter(&freemem_lock);
1571 	availrmem += npgs;
1572 	mutex_exit(&freemem_lock);
1573 }
1574 
1575 #define	FREEMEM_INCR	100
1576 static pgcnt_t freemem_incr = FREEMEM_INCR;
1577 #define	DEL_FREE_WAIT_FRAC	4
1578 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1579 
1580 #define	DEL_BUSY_WAIT_FRAC	20
1581 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1582 
1583 static void kphysm_del_cleanup(struct mem_handle *);
1584 
1585 static void page_delete_collect(page_t *, struct mem_handle *);
1586 
1587 static pgcnt_t
1588 delthr_get_freemem(struct mem_handle *mhp)
1589 {
1590 	pgcnt_t free_get;
1591 	int ret;
1592 
1593 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1594 
1595 	MDSTAT_INCR(mhp, need_free);
1596 	/*
1597 	 * Get up to freemem_incr pages.
1598 	 */
1599 	free_get = freemem_incr;
1600 	if (free_get > mhp->mh_hold_todo)
1601 		free_get = mhp->mh_hold_todo;
1602 	/*
1603 	 * Take free_get pages away from freemem,
1604 	 * waiting if necessary.
1605 	 */
1606 
1607 	while (!mhp->mh_cancel) {
1608 		mutex_exit(&mhp->mh_mutex);
1609 		MDSTAT_INCR(mhp, free_loop);
1610 		/*
1611 		 * Duplicate test from page_create_throttle()
1612 		 * but don't override with !PG_WAIT.
1613 		 */
1614 		if (freemem < (free_get + throttlefree)) {
1615 			MDSTAT_INCR(mhp, free_low);
1616 			ret = 0;
1617 		} else {
1618 			ret = page_create_wait(free_get, 0);
1619 			if (ret == 0) {
1620 				/* EMPTY */
1621 				MDSTAT_INCR(mhp, free_failed);
1622 			}
1623 		}
1624 		if (ret != 0) {
1625 			mutex_enter(&mhp->mh_mutex);
1626 			return (free_get);
1627 		}
1628 
1629 		/*
1630 		 * Put pressure on pageout.
1631 		 */
1632 		page_needfree(free_get);
1633 		cv_signal(&proc_pageout->p_cv);
1634 
1635 		mutex_enter(&mhp->mh_mutex);
1636 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1637 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1638 		mutex_exit(&mhp->mh_mutex);
1639 		page_needfree(-(spgcnt_t)free_get);
1640 
1641 		mutex_enter(&mhp->mh_mutex);
1642 	}
1643 	return (0);
1644 }
1645 
1646 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1647 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1648 /*
1649  * This function is run as a helper thread for delete_memory_thread.
1650  * It is needed in order to force kaio cleanup, so that pages used in kaio
1651  * will be unlocked and subsequently relocated by delete_memory_thread.
1652  * The address of the delete_memory_threads's mem_handle is passed in to
1653  * this thread function, and is used to set the mh_aio_cleanup_done member
1654  * prior to calling thread_exit().
1655  */
1656 static void
1657 dr_aio_cleanup_thread(caddr_t amhp)
1658 {
1659 	proc_t *procp;
1660 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1661 	int cleaned;
1662 	int n = 0;
1663 	struct mem_handle *mhp;
1664 	volatile uint_t *pcancel;
1665 
1666 	mhp = (struct mem_handle *)amhp;
1667 	ASSERT(mhp != NULL);
1668 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1669 	if (modload("sys", "kaio") == -1) {
1670 		mhp->mh_aio_cleanup_done = 1;
1671 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1672 		thread_exit();
1673 	}
1674 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1675 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1676 	if (aio_cleanup_dr_delete_memory == NULL) {
1677 		mhp->mh_aio_cleanup_done = 1;
1678 		cmn_err(CE_WARN,
1679 	    "aio_cleanup_dr_delete_memory not found in kaio");
1680 		thread_exit();
1681 	}
1682 	do {
1683 		cleaned = 0;
1684 		mutex_enter(&pidlock);
1685 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1686 		    procp = procp->p_next) {
1687 			mutex_enter(&procp->p_lock);
1688 			if (procp->p_aio != NULL) {
1689 				/* cleanup proc's outstanding kaio */
1690 				cleaned +=
1691 				    (*aio_cleanup_dr_delete_memory)(procp);
1692 			}
1693 			mutex_exit(&procp->p_lock);
1694 		}
1695 		mutex_exit(&pidlock);
1696 		if ((*pcancel == 0) &&
1697 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1698 			/* delay a bit before retrying all procs again */
1699 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1700 			n = 0;
1701 		}
1702 	} while (*pcancel == 0);
1703 	mhp->mh_aio_cleanup_done = 1;
1704 	thread_exit();
1705 }
1706 
1707 static void
1708 delete_memory_thread(caddr_t amhp)
1709 {
1710 	struct mem_handle *mhp;
1711 	struct memdelspan *mdsp;
1712 	callb_cpr_t cprinfo;
1713 	page_t *pp_targ;
1714 	spgcnt_t freemem_left;
1715 	void (*del_complete_funcp)(void *, int error);
1716 	void *del_complete_arg;
1717 	int comp_code;
1718 	int ret;
1719 	int first_scan;
1720 	uint_t szc;
1721 #ifdef MEM_DEL_STATS
1722 	uint64_t start_total, ntick_total;
1723 	uint64_t start_pgrp, ntick_pgrp;
1724 #endif /* MEM_DEL_STATS */
1725 
1726 	mhp = (struct mem_handle *)amhp;
1727 
1728 #ifdef MEM_DEL_STATS
1729 	start_total = ddi_get_lbolt();
1730 #endif /* MEM_DEL_STATS */
1731 
1732 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1733 	    callb_generic_cpr, "memdel");
1734 
1735 	mutex_enter(&mhp->mh_mutex);
1736 	ASSERT(mhp->mh_state == MHND_STARTING);
1737 
1738 	mhp->mh_state = MHND_RUNNING;
1739 	mhp->mh_thread_id = curthread;
1740 
1741 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1742 	mutex_exit(&mhp->mh_mutex);
1743 
1744 	/* Allocate the remap pages now, if necessary. */
1745 	memseg_remap_init();
1746 
1747 	/*
1748 	 * Subtract from availrmem now if possible as availrmem
1749 	 * may not be available by the end of the delete.
1750 	 */
1751 	if (!get_availrmem(mhp->mh_vm_pages)) {
1752 		comp_code = KPHYSM_ENOTVIABLE;
1753 		mutex_enter(&mhp->mh_mutex);
1754 		goto early_exit;
1755 	}
1756 
1757 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1758 
1759 	mutex_enter(&mhp->mh_mutex);
1760 
1761 	if (ret != 0) {
1762 		mhp->mh_cancel = KPHYSM_EREFUSED;
1763 		goto refused;
1764 	}
1765 
1766 	transit_list_collect(mhp, 1);
1767 
1768 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1769 	    mdsp = mdsp->mds_next) {
1770 		ASSERT(mdsp->mds_bitmap == NULL);
1771 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1772 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1773 		    KM_SLEEP);
1774 	}
1775 
1776 	first_scan = 1;
1777 	freemem_left = 0;
1778 	/*
1779 	 * Start dr_aio_cleanup_thread, which periodically iterates
1780 	 * through the process list and invokes aio cleanup.  This
1781 	 * is needed in order to avoid a deadly embrace between the
1782 	 * delete_memory_thread (waiting on writer lock for page, with the
1783 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1784 	 * reader lock on the same page that is wanted by the
1785 	 * delete_memory_thread), and threads waiting for kaio completion
1786 	 * (blocked on spt_amp->lock).
1787 	 */
1788 	mhp->mh_dr_aio_cleanup_cancel = 0;
1789 	mhp->mh_aio_cleanup_done = 0;
1790 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1791 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1792 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1793 		pgcnt_t collected;
1794 
1795 		MDSTAT_INCR(mhp, nloop);
1796 		collected = 0;
1797 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1798 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1799 			pfn_t pfn, p_end;
1800 
1801 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1802 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1803 			    (mhp->mh_cancel == 0); pfn++) {
1804 				page_t *pp, *tpp, *tpp_targ;
1805 				pgcnt_t bit;
1806 				struct vnode *vp;
1807 				u_offset_t offset;
1808 				int mod, result;
1809 				spgcnt_t pgcnt;
1810 
1811 				bit = pfn - mdsp->mds_base;
1812 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1813 				    (1 << (bit % NBPBMW))) != 0) {
1814 					MDSTAT_INCR(mhp, already_done);
1815 					continue;
1816 				}
1817 				if (freemem_left == 0) {
1818 					freemem_left += delthr_get_freemem(mhp);
1819 					if (freemem_left == 0)
1820 						break;
1821 				}
1822 
1823 				/*
1824 				 * Release mh_mutex - some of this
1825 				 * stuff takes some time (eg PUTPAGE).
1826 				 */
1827 
1828 				mutex_exit(&mhp->mh_mutex);
1829 				MDSTAT_INCR(mhp, ncheck);
1830 
1831 				pp = page_numtopp_nolock(pfn);
1832 				if (pp == NULL) {
1833 					/*
1834 					 * Not covered by a page_t - will
1835 					 * be dealt with elsewhere.
1836 					 */
1837 					MDSTAT_INCR(mhp, nopaget);
1838 					mutex_enter(&mhp->mh_mutex);
1839 					mdsp->mds_bitmap[bit / NBPBMW] |=
1840 					    (1 << (bit % NBPBMW));
1841 					continue;
1842 				}
1843 
1844 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1845 				    SE_EXCL_WANTED | SE_RETIRED)) {
1846 					/*
1847 					 * Page in use elsewhere.  Skip it.
1848 					 */
1849 					MDSTAT_INCR(mhp, lockfail);
1850 					mutex_enter(&mhp->mh_mutex);
1851 					continue;
1852 				}
1853 				/*
1854 				 * See if the cage expanded into the delete.
1855 				 * This can happen as we have to allow the
1856 				 * cage to expand.
1857 				 */
1858 				if (PP_ISNORELOC(pp)) {
1859 					page_unlock(pp);
1860 					mutex_enter(&mhp->mh_mutex);
1861 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1862 					break;
1863 				}
1864 				if (PP_RETIRED(pp)) {
1865 					/*
1866 					 * Page has been retired and is
1867 					 * not part of the cage so we
1868 					 * can now do the accounting for
1869 					 * it.
1870 					 */
1871 					MDSTAT_INCR(mhp, retired);
1872 					mutex_enter(&mhp->mh_mutex);
1873 					mdsp->mds_bitmap[bit / NBPBMW]
1874 					    |= (1 << (bit % NBPBMW));
1875 					mdsp->mds_bitmap_retired[bit /
1876 					    NBPBMW] |=
1877 					    (1 << (bit % NBPBMW));
1878 					mhp->mh_hold_todo--;
1879 					continue;
1880 				}
1881 				ASSERT(freemem_left != 0);
1882 				if (PP_ISFREE(pp)) {
1883 					/*
1884 					 * Like page_reclaim() only 'freemem'
1885 					 * processing is already done.
1886 					 */
1887 					MDSTAT_INCR(mhp, nfree);
1888 				free_page_collect:
1889 					if (PP_ISAGED(pp)) {
1890 						page_list_sub(pp,
1891 						    PG_FREE_LIST);
1892 					} else {
1893 						page_list_sub(pp,
1894 						    PG_CACHE_LIST);
1895 					}
1896 					PP_CLRFREE(pp);
1897 					PP_CLRAGED(pp);
1898 					collected++;
1899 					mutex_enter(&mhp->mh_mutex);
1900 					page_delete_collect(pp, mhp);
1901 					mdsp->mds_bitmap[bit / NBPBMW] |=
1902 					    (1 << (bit % NBPBMW));
1903 					freemem_left--;
1904 					continue;
1905 				}
1906 				ASSERT(pp->p_vnode != NULL);
1907 				if (first_scan) {
1908 					MDSTAT_INCR(mhp, first_notfree);
1909 					page_unlock(pp);
1910 					mutex_enter(&mhp->mh_mutex);
1911 					continue;
1912 				}
1913 				/*
1914 				 * Keep stats on pages encountered that
1915 				 * are marked for retirement.
1916 				 */
1917 				if (PP_TOXIC(pp)) {
1918 					MDSTAT_INCR(mhp, toxic);
1919 				} else if (PP_PR_REQ(pp)) {
1920 					MDSTAT_INCR(mhp, failing);
1921 				}
1922 				/*
1923 				 * In certain cases below, special exceptions
1924 				 * are made for pages that are toxic.  This
1925 				 * is because the current meaning of toxic
1926 				 * is that an uncorrectable error has been
1927 				 * previously associated with the page.
1928 				 */
1929 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1930 					if (!PP_TOXIC(pp)) {
1931 						/*
1932 						 * Must relocate locked in
1933 						 * memory pages.
1934 						 */
1935 #ifdef MEM_DEL_STATS
1936 						start_pgrp = ddi_get_lbolt();
1937 #endif /* MEM_DEL_STATS */
1938 						/*
1939 						 * Lock all constituent pages
1940 						 * of a large page to ensure
1941 						 * that p_szc won't change.
1942 						 */
1943 						if (!group_page_trylock(pp,
1944 						    SE_EXCL)) {
1945 							MDSTAT_INCR(mhp,
1946 							    gptllckfail);
1947 							page_unlock(pp);
1948 							mutex_enter(
1949 							    &mhp->mh_mutex);
1950 							continue;
1951 						}
1952 						MDSTAT_INCR(mhp, npplocked);
1953 						pp_targ =
1954 						    page_get_replacement_page(
1955 						    pp, NULL, 0);
1956 						if (pp_targ != NULL) {
1957 #ifdef MEM_DEL_STATS
1958 							ntick_pgrp =
1959 							    (uint64_t)
1960 							    ddi_get_lbolt() -
1961 							    start_pgrp;
1962 #endif /* MEM_DEL_STATS */
1963 							MDSTAT_PGRP(mhp,
1964 							    ntick_pgrp);
1965 							MDSTAT_INCR(mhp,
1966 							    nlockreloc);
1967 							goto reloc;
1968 						}
1969 						group_page_unlock(pp);
1970 						page_unlock(pp);
1971 #ifdef MEM_DEL_STATS
1972 						ntick_pgrp =
1973 						    (uint64_t)ddi_get_lbolt() -
1974 						    start_pgrp;
1975 #endif /* MEM_DEL_STATS */
1976 						MDSTAT_PGRP(mhp, ntick_pgrp);
1977 						MDSTAT_INCR(mhp, nnorepl);
1978 						mutex_enter(&mhp->mh_mutex);
1979 						continue;
1980 					} else {
1981 						/*
1982 						 * Cannot do anything about
1983 						 * this page because it is
1984 						 * toxic.
1985 						 */
1986 						MDSTAT_INCR(mhp, npplkdtoxic);
1987 						page_unlock(pp);
1988 						mutex_enter(&mhp->mh_mutex);
1989 						continue;
1990 					}
1991 				}
1992 				/*
1993 				 * Unload the mappings and check if mod bit
1994 				 * is set.
1995 				 */
1996 				ASSERT(!PP_ISKAS(pp));
1997 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1998 				mod = hat_ismod(pp);
1999 
2000 #ifdef MEM_DEL_STATS
2001 				start_pgrp = ddi_get_lbolt();
2002 #endif /* MEM_DEL_STATS */
2003 				if (mod && !PP_TOXIC(pp)) {
2004 					/*
2005 					 * Lock all constituent pages
2006 					 * of a large page to ensure
2007 					 * that p_szc won't change.
2008 					 */
2009 					if (!group_page_trylock(pp, SE_EXCL)) {
2010 						MDSTAT_INCR(mhp, gptlmodfail);
2011 						page_unlock(pp);
2012 						mutex_enter(&mhp->mh_mutex);
2013 						continue;
2014 					}
2015 					pp_targ = page_get_replacement_page(pp,
2016 					    NULL, 0);
2017 					if (pp_targ != NULL) {
2018 						MDSTAT_INCR(mhp, nmodreloc);
2019 #ifdef MEM_DEL_STATS
2020 						ntick_pgrp =
2021 						    (uint64_t)ddi_get_lbolt() -
2022 						    start_pgrp;
2023 #endif /* MEM_DEL_STATS */
2024 						MDSTAT_PGRP(mhp, ntick_pgrp);
2025 						goto reloc;
2026 					}
2027 					group_page_unlock(pp);
2028 				}
2029 
2030 				if (!page_try_demote_pages(pp)) {
2031 					MDSTAT_INCR(mhp, demotefail);
2032 					page_unlock(pp);
2033 #ifdef MEM_DEL_STATS
2034 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2035 					    start_pgrp;
2036 #endif /* MEM_DEL_STATS */
2037 					MDSTAT_PGRP(mhp, ntick_pgrp);
2038 					mutex_enter(&mhp->mh_mutex);
2039 					continue;
2040 				}
2041 
2042 				/*
2043 				 * Regular 'page-out'.
2044 				 */
2045 				if (!mod) {
2046 					MDSTAT_INCR(mhp, ndestroy);
2047 					page_destroy(pp, 1);
2048 					/*
2049 					 * page_destroy was called with
2050 					 * dontfree. As long as p_lckcnt
2051 					 * and p_cowcnt are both zero, the
2052 					 * only additional action of
2053 					 * page_destroy with !dontfree is to
2054 					 * call page_free, so we can collect
2055 					 * the page here.
2056 					 */
2057 					collected++;
2058 #ifdef MEM_DEL_STATS
2059 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2060 					    start_pgrp;
2061 #endif /* MEM_DEL_STATS */
2062 					MDSTAT_PGRP(mhp, ntick_pgrp);
2063 					mutex_enter(&mhp->mh_mutex);
2064 					page_delete_collect(pp, mhp);
2065 					mdsp->mds_bitmap[bit / NBPBMW] |=
2066 					    (1 << (bit % NBPBMW));
2067 					continue;
2068 				}
2069 				/*
2070 				 * The page is toxic and the mod bit is
2071 				 * set, we cannot do anything here to deal
2072 				 * with it.
2073 				 */
2074 				if (PP_TOXIC(pp)) {
2075 					page_unlock(pp);
2076 #ifdef MEM_DEL_STATS
2077 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2078 					    start_pgrp;
2079 #endif /* MEM_DEL_STATS */
2080 					MDSTAT_PGRP(mhp, ntick_pgrp);
2081 					MDSTAT_INCR(mhp, modtoxic);
2082 					mutex_enter(&mhp->mh_mutex);
2083 					continue;
2084 				}
2085 				MDSTAT_INCR(mhp, nputpage);
2086 				vp = pp->p_vnode;
2087 				offset = pp->p_offset;
2088 				VN_HOLD(vp);
2089 				page_unlock(pp);
2090 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2091 				    B_INVAL|B_FORCE, kcred, NULL);
2092 				VN_RELE(vp);
2093 #ifdef MEM_DEL_STATS
2094 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2095 				    start_pgrp;
2096 #endif /* MEM_DEL_STATS */
2097 				MDSTAT_PGRP(mhp, ntick_pgrp);
2098 				/*
2099 				 * Try to get the page back immediately
2100 				 * so that it can be collected.
2101 				 */
2102 				pp = page_numtopp_nolock(pfn);
2103 				if (pp == NULL) {
2104 					MDSTAT_INCR(mhp, nnoreclaim);
2105 					/*
2106 					 * This should not happen as this
2107 					 * thread is deleting the page.
2108 					 * If this code is generalized, this
2109 					 * becomes a reality.
2110 					 */
2111 #ifdef DEBUG
2112 					cmn_err(CE_WARN,
2113 					    "delete_memory_thread(0x%p) "
2114 					    "pfn 0x%lx has no page_t",
2115 					    (void *)mhp, pfn);
2116 #endif /* DEBUG */
2117 					mutex_enter(&mhp->mh_mutex);
2118 					continue;
2119 				}
2120 				if (page_try_reclaim_lock(pp, SE_EXCL,
2121 				    SE_EXCL_WANTED | SE_RETIRED)) {
2122 					if (PP_ISFREE(pp)) {
2123 						goto free_page_collect;
2124 					}
2125 					page_unlock(pp);
2126 				}
2127 				MDSTAT_INCR(mhp, nnoreclaim);
2128 				mutex_enter(&mhp->mh_mutex);
2129 				continue;
2130 
2131 			reloc:
2132 				/*
2133 				 * Got some freemem and a target
2134 				 * page, so move the data to avoid
2135 				 * I/O and lock problems.
2136 				 */
2137 				ASSERT(!page_iolock_assert(pp));
2138 				MDSTAT_INCR(mhp, nreloc);
2139 				/*
2140 				 * page_relocate() will return pgcnt: the
2141 				 * number of consecutive pages relocated.
2142 				 * If it is successful, pp will be a
2143 				 * linked list of the page structs that
2144 				 * were relocated. If page_relocate() is
2145 				 * unsuccessful, pp will be unmodified.
2146 				 */
2147 #ifdef MEM_DEL_STATS
2148 				start_pgrp = ddi_get_lbolt();
2149 #endif /* MEM_DEL_STATS */
2150 				result = page_relocate(&pp, &pp_targ, 0, 0,
2151 				    &pgcnt, NULL);
2152 #ifdef MEM_DEL_STATS
2153 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2154 				    start_pgrp;
2155 #endif /* MEM_DEL_STATS */
2156 				MDSTAT_PGRP(mhp, ntick_pgrp);
2157 				if (result != 0) {
2158 					MDSTAT_INCR(mhp, nrelocfail);
2159 					/*
2160 					 * We did not succeed. We need
2161 					 * to give the pp_targ pages back.
2162 					 * page_free(pp_targ, 1) without
2163 					 * the freemem accounting.
2164 					 */
2165 					group_page_unlock(pp);
2166 					page_free_replacement_page(pp_targ);
2167 					page_unlock(pp);
2168 					mutex_enter(&mhp->mh_mutex);
2169 					continue;
2170 				}
2171 
2172 				/*
2173 				 * We will then collect pgcnt pages.
2174 				 */
2175 				ASSERT(pgcnt > 0);
2176 				mutex_enter(&mhp->mh_mutex);
2177 				/*
2178 				 * We need to make sure freemem_left is
2179 				 * large enough.
2180 				 */
2181 				while ((freemem_left < pgcnt) &&
2182 				    (!mhp->mh_cancel)) {
2183 					freemem_left +=
2184 					    delthr_get_freemem(mhp);
2185 				}
2186 
2187 				/*
2188 				 * Do not proceed if mh_cancel is set.
2189 				 */
2190 				if (mhp->mh_cancel) {
2191 					while (pp_targ != NULL) {
2192 						/*
2193 						 * Unlink and unlock each page.
2194 						 */
2195 						tpp_targ = pp_targ;
2196 						page_sub(&pp_targ, tpp_targ);
2197 						page_unlock(tpp_targ);
2198 					}
2199 					/*
2200 					 * We need to give the pp pages back.
2201 					 * page_free(pp, 1) without the
2202 					 * freemem accounting.
2203 					 */
2204 					page_free_replacement_page(pp);
2205 					break;
2206 				}
2207 
2208 				/* Now remove pgcnt from freemem_left */
2209 				freemem_left -= pgcnt;
2210 				ASSERT(freemem_left >= 0);
2211 				szc = pp->p_szc;
2212 				while (pp != NULL) {
2213 					/*
2214 					 * pp and pp_targ were passed back as
2215 					 * a linked list of pages.
2216 					 * Unlink and unlock each page.
2217 					 */
2218 					tpp_targ = pp_targ;
2219 					page_sub(&pp_targ, tpp_targ);
2220 					page_unlock(tpp_targ);
2221 					/*
2222 					 * The original page is now free
2223 					 * so remove it from the linked
2224 					 * list and collect it.
2225 					 */
2226 					tpp = pp;
2227 					page_sub(&pp, tpp);
2228 					pfn = page_pptonum(tpp);
2229 					collected++;
2230 					ASSERT(PAGE_EXCL(tpp));
2231 					ASSERT(tpp->p_vnode == NULL);
2232 					ASSERT(!hat_page_is_mapped(tpp));
2233 					ASSERT(tpp->p_szc == szc);
2234 					tpp->p_szc = 0;
2235 					page_delete_collect(tpp, mhp);
2236 					bit = pfn - mdsp->mds_base;
2237 					mdsp->mds_bitmap[bit / NBPBMW] |=
2238 					    (1 << (bit % NBPBMW));
2239 				}
2240 				ASSERT(pp_targ == NULL);
2241 			}
2242 		}
2243 		first_scan = 0;
2244 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2245 		    (collected == 0)) {
2246 			/*
2247 			 * This code is needed as we cannot wait
2248 			 * for a page to be locked OR the delete to
2249 			 * be cancelled.  Also, we must delay so
2250 			 * that other threads get a chance to run
2251 			 * on our cpu, otherwise page locks may be
2252 			 * held indefinitely by those threads.
2253 			 */
2254 			MDSTAT_INCR(mhp, ndelay);
2255 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2256 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2257 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2258 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2259 		}
2260 	}
2261 	/* stop the dr aio cleanup thread */
2262 	mhp->mh_dr_aio_cleanup_cancel = 1;
2263 	transit_list_collect(mhp, 0);
2264 	if (freemem_left != 0) {
2265 		/* Return any surplus. */
2266 		page_create_putback(freemem_left);
2267 		freemem_left = 0;
2268 	}
2269 #ifdef MEM_DEL_STATS
2270 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2271 #endif /* MEM_DEL_STATS */
2272 	MDSTAT_TOTAL(mhp, ntick_total);
2273 	MDSTAT_PRINT(mhp);
2274 
2275 	/*
2276 	 * If the memory delete was cancelled, exclusive-wanted bits must
2277 	 * be cleared. If there are retired pages being deleted, they need
2278 	 * to be unretired.
2279 	 */
2280 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2281 	    mdsp = mdsp->mds_next) {
2282 		pfn_t pfn, p_end;
2283 
2284 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2285 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2286 			page_t *pp;
2287 			pgcnt_t bit;
2288 
2289 			bit = pfn - mdsp->mds_base;
2290 			if (mhp->mh_cancel) {
2291 				pp = page_numtopp_nolock(pfn);
2292 				if (pp != NULL) {
2293 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2294 					    (1 << (bit % NBPBMW))) == 0) {
2295 						page_lock_clr_exclwanted(pp);
2296 					}
2297 				}
2298 			} else {
2299 				pp = NULL;
2300 			}
2301 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2302 			    (1 << (bit % NBPBMW))) != 0) {
2303 				/* do we already have pp? */
2304 				if (pp == NULL) {
2305 					pp = page_numtopp_nolock(pfn);
2306 				}
2307 				ASSERT(pp != NULL);
2308 				ASSERT(PP_RETIRED(pp));
2309 				if (mhp->mh_cancel != 0) {
2310 					page_unlock(pp);
2311 					/*
2312 					 * To satisfy ASSERT below in
2313 					 * cancel code.
2314 					 */
2315 					mhp->mh_hold_todo++;
2316 				} else {
2317 					(void) page_unretire_pp(pp,
2318 					    PR_UNR_CLEAN);
2319 				}
2320 			}
2321 		}
2322 	}
2323 	/*
2324 	 * Free retired page bitmap and collected page bitmap
2325 	 */
2326 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2327 	    mdsp = mdsp->mds_next) {
2328 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2329 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2330 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2331 		ASSERT(mdsp->mds_bitmap != NULL);
2332 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2333 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2334 	}
2335 
2336 	/* wait for our dr aio cancel thread to exit */
2337 	while (!(mhp->mh_aio_cleanup_done)) {
2338 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2339 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2340 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2341 	}
2342 refused:
2343 	if (mhp->mh_cancel != 0) {
2344 		page_t *pp;
2345 
2346 		comp_code = mhp->mh_cancel;
2347 		/*
2348 		 * Go through list of deleted pages (mh_deleted) freeing
2349 		 * them.
2350 		 */
2351 		while ((pp = mhp->mh_deleted) != NULL) {
2352 			mhp->mh_deleted = pp->p_next;
2353 			mhp->mh_hold_todo++;
2354 			mutex_exit(&mhp->mh_mutex);
2355 			/* Restore p_next. */
2356 			pp->p_next = pp->p_prev;
2357 			if (PP_ISFREE(pp)) {
2358 				cmn_err(CE_PANIC,
2359 				    "page %p is free",
2360 				    (void *)pp);
2361 			}
2362 			page_free(pp, 1);
2363 			mutex_enter(&mhp->mh_mutex);
2364 		}
2365 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2366 
2367 		mutex_exit(&mhp->mh_mutex);
2368 		put_availrmem(mhp->mh_vm_pages);
2369 		mutex_enter(&mhp->mh_mutex);
2370 
2371 		goto t_exit;
2372 	}
2373 
2374 	/*
2375 	 * All the pages are no longer in use and are exclusively locked.
2376 	 */
2377 
2378 	mhp->mh_deleted = NULL;
2379 
2380 	kphysm_del_cleanup(mhp);
2381 
2382 	/*
2383 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
2384 	 * that the mem_node_config[] will remain intact for the cleanup.
2385 	 */
2386 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2387 	    mdsp = mdsp->mds_next) {
2388 		mem_node_del_range(mdsp->mds_base,
2389 		    mdsp->mds_base + mdsp->mds_npgs - 1);
2390 	}
2391 	/* cleanup the page counters */
2392 	page_ctrs_cleanup();
2393 
2394 	comp_code = KPHYSM_OK;
2395 
2396 t_exit:
2397 	mutex_exit(&mhp->mh_mutex);
2398 	kphysm_setup_post_del(mhp->mh_vm_pages,
2399 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2400 	mutex_enter(&mhp->mh_mutex);
2401 
2402 early_exit:
2403 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2404 	mhp->mh_state = MHND_DONE;
2405 	del_complete_funcp = mhp->mh_delete_complete;
2406 	del_complete_arg = mhp->mh_delete_complete_arg;
2407 	CALLB_CPR_EXIT(&cprinfo);
2408 	(*del_complete_funcp)(del_complete_arg, comp_code);
2409 	thread_exit();
2410 	/*NOTREACHED*/
2411 }
2412 
2413 /*
2414  * Start the delete of the memory from the system.
2415  */
2416 int
2417 kphysm_del_start(
2418 	memhandle_t handle,
2419 	void (*complete)(void *, int),
2420 	void *complete_arg)
2421 {
2422 	struct mem_handle *mhp;
2423 
2424 	mhp = kphysm_lookup_mem_handle(handle);
2425 	if (mhp == NULL) {
2426 		return (KPHYSM_EHANDLE);
2427 	}
2428 	switch (mhp->mh_state) {
2429 	case MHND_FREE:
2430 		ASSERT(mhp->mh_state != MHND_FREE);
2431 		mutex_exit(&mhp->mh_mutex);
2432 		return (KPHYSM_EHANDLE);
2433 	case MHND_INIT:
2434 		break;
2435 	case MHND_STARTING:
2436 	case MHND_RUNNING:
2437 		mutex_exit(&mhp->mh_mutex);
2438 		return (KPHYSM_ESEQUENCE);
2439 	case MHND_DONE:
2440 		mutex_exit(&mhp->mh_mutex);
2441 		return (KPHYSM_ESEQUENCE);
2442 	case MHND_RELEASE:
2443 		mutex_exit(&mhp->mh_mutex);
2444 		return (KPHYSM_ESEQUENCE);
2445 	default:
2446 #ifdef DEBUG
2447 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2448 		    (void *)mhp, mhp->mh_state);
2449 #endif /* DEBUG */
2450 		mutex_exit(&mhp->mh_mutex);
2451 		return (KPHYSM_EHANDLE);
2452 	}
2453 
2454 	if (mhp->mh_transit.trl_spans == NULL) {
2455 		mutex_exit(&mhp->mh_mutex);
2456 		return (KPHYSM_ENOWORK);
2457 	}
2458 
2459 	ASSERT(complete != NULL);
2460 	mhp->mh_delete_complete = complete;
2461 	mhp->mh_delete_complete_arg = complete_arg;
2462 	mhp->mh_state = MHND_STARTING;
2463 	/*
2464 	 * Release the mutex in case thread_create sleeps.
2465 	 */
2466 	mutex_exit(&mhp->mh_mutex);
2467 
2468 	/*
2469 	 * The "obvious" process for this thread is pageout (proc_pageout)
2470 	 * but this gives the thread too much power over freemem
2471 	 * which results in freemem starvation.
2472 	 */
2473 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2474 	    TS_RUN, maxclsyspri - 1);
2475 
2476 	return (KPHYSM_OK);
2477 }
2478 
/*
 * State for the shared "dummy" page_t pages.  It is set up once by
 * memseg_remap_init() and used by remap_to_dummy().
 */
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
/* Base VA of the dummy pages; non-NULL once initialization is done. */
static caddr_t pp_dummy;
/* Number of dummy pages, and of entries in pp_dummy_pfn[]. */
static pgcnt_t pp_dummy_npages;
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2483 
2484 static void
2485 memseg_remap_init_pages(page_t *pages, page_t *epages)
2486 {
2487 	page_t *pp;
2488 
2489 	for (pp = pages; pp < epages; pp++) {
2490 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2491 		pp->p_offset = (u_offset_t)-1;
2492 		page_iolock_init(pp);
2493 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2494 			continue;
2495 		page_lock_delete(pp);
2496 	}
2497 }
2498 
2499 void
2500 memseg_remap_init()
2501 {
2502 	mutex_enter(&pp_dummy_lock);
2503 	if (pp_dummy == NULL) {
2504 		uint_t dpages;
2505 		int i;
2506 
2507 		/*
2508 		 * dpages starts off as the size of the structure and
2509 		 * ends up as the minimum number of pages that will
2510 		 * hold a whole number of page_t structures.
2511 		 */
2512 		dpages = sizeof (page_t);
2513 		ASSERT(dpages != 0);
2514 		ASSERT(dpages <= MMU_PAGESIZE);
2515 
2516 		while ((dpages & 1) == 0)
2517 			dpages >>= 1;
2518 
2519 		pp_dummy_npages = dpages;
2520 		/*
2521 		 * Allocate pp_dummy pages directly from static_arena,
2522 		 * since these are whole page allocations and are
2523 		 * referenced by physical address.  This also has the
2524 		 * nice fringe benefit of hiding the memory from
2525 		 * ::findleaks since it doesn't deal well with allocated
2526 		 * kernel heap memory that doesn't have any mappings.
2527 		 */
2528 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2529 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2530 		bzero(pp_dummy, ptob(pp_dummy_npages));
2531 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2532 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2533 		    pp_dummy_npages, KM_SLEEP);
2534 		for (i = 0; i < pp_dummy_npages; i++) {
2535 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2536 			    &pp_dummy[MMU_PAGESIZE * i]);
2537 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2538 		}
2539 		/*
2540 		 * Initialize the page_t's to a known 'deleted' state
2541 		 * that matches the state of deleted pages.
2542 		 */
2543 		memseg_remap_init_pages((page_t *)pp_dummy,
2544 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2545 		/* Remove kmem mappings for the pages for safety. */
2546 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2547 		    HAT_UNLOAD_UNLOCK);
2548 		/* Leave pp_dummy pointer set as flag that init is done. */
2549 	}
2550 	mutex_exit(&pp_dummy_lock);
2551 }
2552 
2553 /*
2554  * Remap a page-aglined range of page_t's to dummy pages.
2555  */
2556 void
2557 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2558 {
2559 	int phase;
2560 
2561 	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));
2562 
2563 	/*
2564 	 * We may start remapping at a non-zero page offset
2565 	 * within the dummy pages since the low/high ends
2566 	 * of the outgoing pp's could be shared by other
2567 	 * memsegs (see memseg_remap_meta).
2568 	 */
2569 	phase = btop((uint64_t)va) % pp_dummy_npages;
2570 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2571 
2572 	while (metapgs != 0) {
2573 		pgcnt_t n;
2574 		int i, j;
2575 
2576 		n = pp_dummy_npages;
2577 		if (n > metapgs)
2578 			n = metapgs;
2579 		for (i = 0; i < n; i++) {
2580 			j = (i + phase) % pp_dummy_npages;
2581 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2582 			    PROT_READ,
2583 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2584 			    HAT_LOAD_REMAP);
2585 			va += ptob(1);
2586 		}
2587 		metapgs -= n;
2588 	}
2589 }
2590 
2591 static void
2592 memseg_remap_to_dummy(struct memseg *seg)
2593 {
2594 	caddr_t pp;
2595 	pgcnt_t metapgs;
2596 
2597 	ASSERT(memseg_is_dynamic(seg));
2598 	ASSERT(pp_dummy != NULL);
2599 
2600 
2601 	if (!memseg_includes_meta(seg)) {
2602 		memseg_remap_meta(seg);
2603 		return;
2604 	}
2605 
2606 	pp = (caddr_t)seg->pages;
2607 	metapgs = seg->pages_base - memseg_get_start(seg);
2608 	ASSERT(metapgs != 0);
2609 
2610 	seg->pages_end = seg->pages_base;
2611 
2612 	remap_to_dummy(pp, metapgs);
2613 }
2614 
2615 /*
2616  * Transition all the deleted pages to the deleted state so that
2617  * page_lock will not wait. The page_lock_delete call will
2618  * also wake up any waiters.
2619  */
2620 static void
2621 memseg_lock_delete_all(struct memseg *seg)
2622 {
2623 	page_t *pp;
2624 
2625 	for (pp = seg->pages; pp < seg->epages; pp++) {
2626 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2627 		page_lock_delete(pp);
2628 	}
2629 }
2630 
/*
 * Final bookkeeping for a successful memory delete.
 *
 * Entered with mhp->mh_mutex held; the mutex is dropped part way
 * through and re-acquired before returning.
 *
 * The work is done in these steps:
 *  1. With the memsegs list locked, unlink every memseg overlapping a
 *     span of the delete, lower total_pages and rebuild the pfn hash.
 *  2. For each unlinked memseg, put its page_t's into the deleted
 *     state, retire the memseg onto memseg_va_avail (dynamic) or
 *     memseg_delete_junk (boot-time), and remove its range from
 *     phys_avail and phys_install.
 *  3. Recompute physmax/physinstalled, lower maxmem, physmem and
 *     availrmem_initial, resize the dump and log the new totals.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan	*mdsp;
	struct memseg		*seg;
	struct memseg   	**segpp;
	struct memseg		*seglist;
	pfn_t			p_end;
	uint64_t		avmem;
	pgcnt_t			avpgs;
	pgcnt_t			npgs;

	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * remove from main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/* segpp only advances when the current memseg is kept. */
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			/* The delta is negative: base minus end. */
			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	/* The rest runs without mh_mutex; it is re-taken before return. */
	mutex_exit(&mhp->mh_mutex);

	/* Dispose of each memseg unlinked above, one at a time. */
	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect up information based on pages_base and pages_end
		 * early so that we can flag early that the memseg has been
		 * deleted by setting pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		mseg_start = memseg_get_start(seg);

		if (memseg_is_dynamic(seg)) {
			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy(seg);

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */
			seg->pages_end = seg->pages_base;

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		/* phys_avail loses only the pages from pages_base up. */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		/* phys_install loses the range from mseg_start to the end. */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	/* Refresh physmax and physinstalled from the new phys_install. */
	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	mutex_enter(&mhp->mh_mutex);
}
2800 
2801 static uint_t mdel_nullvp_waiter;
2802 
2803 static void
2804 page_delete_collect(
2805 	page_t *pp,
2806 	struct mem_handle *mhp)
2807 {
2808 	if (pp->p_vnode) {
2809 		page_hashout(pp, (kmutex_t *)NULL);
2810 		/* do not do PP_SETAGED(pp); */
2811 	} else {
2812 		kmutex_t *sep;
2813 
2814 		sep = page_se_mutex(pp);
2815 		mutex_enter(sep);
2816 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2817 			mdel_nullvp_waiter++;
2818 			cv_broadcast(&pp->p_cv);
2819 		}
2820 		mutex_exit(sep);
2821 	}
2822 	ASSERT(pp->p_next == pp->p_prev);
2823 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2824 	pp->p_next = mhp->mh_deleted;
2825 	mhp->mh_deleted = pp;
2826 	ASSERT(mhp->mh_hold_todo != 0);
2827 	mhp->mh_hold_todo--;
2828 }
2829 
2830 static void
2831 transit_list_collect(struct mem_handle *mhp, int v)
2832 {
2833 	struct transit_list_head *trh;
2834 
2835 	trh = &transit_list_head;
2836 	mutex_enter(&trh->trh_lock);
2837 	mhp->mh_transit.trl_collect = v;
2838 	mutex_exit(&trh->trh_lock);
2839 }
2840 
2841 static void
2842 transit_list_insert(struct transit_list *tlp)
2843 {
2844 	struct transit_list_head *trh;
2845 
2846 	trh = &transit_list_head;
2847 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2848 	tlp->trl_next = trh->trh_head;
2849 	trh->trh_head = tlp;
2850 }
2851 
2852 static void
2853 transit_list_remove(struct transit_list *tlp)
2854 {
2855 	struct transit_list_head *trh;
2856 	struct transit_list **tlpp;
2857 
2858 	trh = &transit_list_head;
2859 	tlpp = &trh->trh_head;
2860 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2861 	while (*tlpp != NULL && *tlpp != tlp)
2862 		tlpp = &(*tlpp)->trl_next;
2863 	ASSERT(*tlpp != NULL);
2864 	if (*tlpp == tlp)
2865 		*tlpp = tlp->trl_next;
2866 	tlp->trl_next = NULL;
2867 }
2868 
2869 static struct transit_list *
2870 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2871 {
2872 	struct transit_list *tlp;
2873 
2874 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2875 		struct memdelspan *mdsp;
2876 
2877 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2878 		    mdsp = mdsp->mds_next) {
2879 			if (pfnum >= mdsp->mds_base &&
2880 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2881 				return (tlp);
2882 			}
2883 		}
2884 	}
2885 	return (NULL);
2886 }
2887 
2888 int
2889 pfn_is_being_deleted(pfn_t pfnum)
2890 {
2891 	struct transit_list_head *trh;
2892 	struct transit_list *tlp;
2893 	int ret;
2894 
2895 	trh = &transit_list_head;
2896 	if (trh->trh_head == NULL)
2897 		return (0);
2898 
2899 	mutex_enter(&trh->trh_lock);
2900 	tlp = pfnum_to_transit_list(trh, pfnum);
2901 	ret = (tlp != NULL && tlp->trl_collect);
2902 	mutex_exit(&trh->trh_lock);
2903 
2904 	return (ret);
2905 }
2906 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the statistics gathered for one memory delete.  Output is
 * produced only when mem_del_stat_print is non-zero.  The two tick
 * totals are also shown converted to minutes and seconds using hz.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t tmp;

	if (!mem_del_stat_print)
		return;

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Whole delete: ticks, then the same time as min/sec. */
	tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

	/* Same presentation for the nticks_pgrp total. */
	tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
}
#endif /* MEM_DEL_STATS */
2960 
/*
 * One registered set of memory-configuration callbacks: the caller's
 * vector and the opaque argument handed back on every call.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;	/* NULL marks an unused slot */
	void			*arg;	/* first argument to each callback */
};

/* Capacity of the mem_callbacks[] table. */
#define	NMEMCALLBACKS		100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
/* Slots of mem_callbacks[] in use, counting any interior NULL holes. */
static uint_t nmemcallbacks;
/* Held as writer to (un)register, as reader while callbacks run. */
static krwlock_t mem_callback_rwlock;
2971 
2972 int
2973 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2974 {
2975 	uint_t i, found;
2976 
2977 	/*
2978 	 * This test will become more complicated when the version must
2979 	 * change.
2980 	 */
2981 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2982 		return (EINVAL);
2983 
2984 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2985 	    vec->post_del == NULL)
2986 		return (EINVAL);
2987 
2988 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2989 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2990 		if (mem_callbacks[i].vec == NULL && found == 0)
2991 			found = i + 1;
2992 		if (mem_callbacks[i].vec == vec &&
2993 		    mem_callbacks[i].arg == arg) {
2994 #ifdef DEBUG
2995 			/* Catch this in DEBUG kernels. */
2996 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2997 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2998 			    (void *)vec, arg, (void *)caller());
2999 #endif /* DEBUG */
3000 			rw_exit(&mem_callback_rwlock);
3001 			return (EEXIST);
3002 		}
3003 	}
3004 	if (found != 0) {
3005 		i = found - 1;
3006 	} else {
3007 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
3008 		if (nmemcallbacks == NMEMCALLBACKS) {
3009 			rw_exit(&mem_callback_rwlock);
3010 			return (ENOMEM);
3011 		}
3012 		i = nmemcallbacks++;
3013 	}
3014 	mem_callbacks[i].vec = vec;
3015 	mem_callbacks[i].arg = arg;
3016 	rw_exit(&mem_callback_rwlock);
3017 	return (0);
3018 }
3019 
3020 void
3021 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3022 {
3023 	uint_t i;
3024 
3025 	rw_enter(&mem_callback_rwlock, RW_WRITER);
3026 	for (i = 0; i < nmemcallbacks; i++) {
3027 		if (mem_callbacks[i].vec == vec &&
3028 		    mem_callbacks[i].arg == arg) {
3029 			mem_callbacks[i].vec = NULL;
3030 			mem_callbacks[i].arg = NULL;
3031 			if (i == (nmemcallbacks - 1))
3032 				nmemcallbacks--;
3033 			break;
3034 		}
3035 	}
3036 	rw_exit(&mem_callback_rwlock);
3037 }
3038 
3039 static void
3040 kphysm_setup_post_add(pgcnt_t delta_pages)
3041 {
3042 	uint_t i;
3043 
3044 	rw_enter(&mem_callback_rwlock, RW_READER);
3045 	for (i = 0; i < nmemcallbacks; i++) {
3046 		if (mem_callbacks[i].vec != NULL) {
3047 			(*mem_callbacks[i].vec->post_add)
3048 			    (mem_callbacks[i].arg, delta_pages);
3049 		}
3050 	}
3051 	rw_exit(&mem_callback_rwlock);
3052 }
3053 
3054 /*
3055  * Note the locking between pre_del and post_del: The reader lock is held
3056  * between the two calls to stop the set of functions from changing.
3057  */
3058 
3059 static int
3060 kphysm_setup_pre_del(pgcnt_t delta_pages)
3061 {
3062 	uint_t i;
3063 	int ret;
3064 	int aret;
3065 
3066 	ret = 0;
3067 	rw_enter(&mem_callback_rwlock, RW_READER);
3068 	for (i = 0; i < nmemcallbacks; i++) {
3069 		if (mem_callbacks[i].vec != NULL) {
3070 			aret = (*mem_callbacks[i].vec->pre_del)
3071 			    (mem_callbacks[i].arg, delta_pages);
3072 			ret |= aret;
3073 		}
3074 	}
3075 
3076 	return (ret);
3077 }
3078 
3079 static void
3080 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3081 {
3082 	uint_t i;
3083 
3084 	for (i = 0; i < nmemcallbacks; i++) {
3085 		if (mem_callbacks[i].vec != NULL) {
3086 			(*mem_callbacks[i].vec->post_del)
3087 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3088 		}
3089 	}
3090 	rw_exit(&mem_callback_rwlock);
3091 }
3092 
3093 static int
3094 kphysm_split_memseg(
3095 	pfn_t base,
3096 	pgcnt_t npgs)
3097 {
3098 	struct memseg *seg;
3099 	struct memseg **segpp;
3100 	pgcnt_t size_low, size_high;
3101 	struct memseg *seg_low, *seg_mid, *seg_high;
3102 
3103 	/*
3104 	 * Lock the memsegs list against other updates now
3105 	 */
3106 	memsegs_lock(1);
3107 
3108 	/*
3109 	 * Find boot time memseg that wholly covers this area.
3110 	 */
3111 
3112 	/* First find the memseg with page 'base' in it. */
3113 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3114 	    segpp = &((*segpp)->next)) {
3115 		if (base >= seg->pages_base && base < seg->pages_end)
3116 			break;
3117 	}
3118 	if (seg == NULL) {
3119 		memsegs_unlock(1);
3120 		return (0);
3121 	}
3122 	if (memseg_includes_meta(seg)) {
3123 		memsegs_unlock(1);
3124 		return (0);
3125 	}
3126 	if ((base + npgs) > seg->pages_end) {
3127 		memsegs_unlock(1);
3128 		return (0);
3129 	}
3130 
3131 	/*
3132 	 * Work out the size of the two segments that will
3133 	 * surround the new segment, one for low address
3134 	 * and one for high.
3135 	 */
3136 	ASSERT(base >= seg->pages_base);
3137 	size_low = base - seg->pages_base;
3138 	ASSERT(seg->pages_end >= (base + npgs));
3139 	size_high = seg->pages_end - (base + npgs);
3140 
3141 	/*
3142 	 * Sanity check.
3143 	 */
3144 	if ((size_low + size_high) == 0) {
3145 		memsegs_unlock(1);
3146 		return (0);
3147 	}
3148 
3149 	/*
3150 	 * Allocate the new structures. The old memseg will not be freed
3151 	 * as there may be a reference to it.
3152 	 */
3153 	seg_low = NULL;
3154 	seg_high = NULL;
3155 
3156 	if (size_low != 0)
3157 		seg_low = memseg_alloc();
3158 
3159 	seg_mid = memseg_alloc();
3160 
3161 	if (size_high != 0)
3162 		seg_high = memseg_alloc();
3163 
3164 	/*
3165 	 * All allocation done now.
3166 	 */
3167 	if (size_low != 0) {
3168 		seg_low->pages = seg->pages;
3169 		seg_low->epages = seg_low->pages + size_low;
3170 		seg_low->pages_base = seg->pages_base;
3171 		seg_low->pages_end = seg_low->pages_base + size_low;
3172 		seg_low->next = seg_mid;
3173 		seg_low->msegflags = seg->msegflags;
3174 	}
3175 	if (size_high != 0) {
3176 		seg_high->pages = seg->epages - size_high;
3177 		seg_high->epages = seg_high->pages + size_high;
3178 		seg_high->pages_base = seg->pages_end - size_high;
3179 		seg_high->pages_end = seg_high->pages_base + size_high;
3180 		seg_high->next = seg->next;
3181 		seg_high->msegflags = seg->msegflags;
3182 	}
3183 
3184 	seg_mid->pages = seg->pages + size_low;
3185 	seg_mid->pages_base = seg->pages_base + size_low;
3186 	seg_mid->epages = seg->epages - size_high;
3187 	seg_mid->pages_end = seg->pages_end - size_high;
3188 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3189 	seg_mid->msegflags = seg->msegflags;
3190 
3191 	/*
3192 	 * Update hat_kpm specific info of all involved memsegs and
3193 	 * allow hat_kpm specific global chain updates.
3194 	 */
3195 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3196 
3197 	/*
3198 	 * At this point we have two equivalent memseg sub-chains,
3199 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3200 	 * the same place in the global chain. By re-writing the pointer
3201 	 * in the previous element we switch atomically from using the old
3202 	 * (seg) to the new.
3203 	 */
3204 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3205 
3206 	membar_enter();
3207 
3208 	build_pfn_hash();
3209 	memsegs_unlock(1);
3210 
3211 	/*
3212 	 * We leave the old segment, 'seg', intact as there may be
3213 	 * references to it. Also, as the value of total_pages has not
3214 	 * changed and the memsegs list is effectively the same when
3215 	 * accessed via the old or the new pointer, we do not have to
3216 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3217 	 *
3218 	 * We currently do not re-use or reclaim the page_t memory.
3219 	 * If we do, then this may have to change.
3220 	 */
3221 
3222 	mutex_enter(&memseg_lists_lock);
3223 	seg->lnext = memseg_edit_junk;
3224 	memseg_edit_junk = seg;
3225 	mutex_exit(&memseg_lists_lock);
3226 
3227 	return (1);
3228 }
3229 
3230 /*
3231  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3232  * structure using physical addresses. Therefore a kmem_cache is
3233  * used with KMC_NOHASH to avoid page crossings within a memseg
3234  * structure. KMC_NOHASH requires that no external (outside of
3235  * slab) information is allowed. This, in turn, implies that the
3236  * cache's slabsize must be exactly a single page, since per-slab
3237  * information (e.g. the freelist for the slab) is kept at the
3238  * end of the slab, where it is easy to locate. Should be changed
3239  * when a more obvious kmem_cache interface/flag will become
3240  * available.
3241  */
3242 void
3243 mem_config_init()
3244 {
3245 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3246 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3247 }
3248 
3249 struct memseg *
3250 memseg_alloc()
3251 {
3252 	struct memseg *seg;
3253 
3254 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3255 	bzero(seg, sizeof (struct memseg));
3256 
3257 	return (seg);
3258 }
3259 
3260 /*
3261  * Return whether the page_t memory for this memseg
3262  * is included in the memseg itself.
3263  */
3264 static int
3265 memseg_includes_meta(struct memseg *seg)
3266 {
3267 	return (seg->msegflags & MEMSEG_META_INCL);
3268 }
3269 
3270 pfn_t
3271 memseg_get_start(struct memseg *seg)
3272 {
3273 	pfn_t		pt_start;
3274 
3275 	if (memseg_includes_meta(seg)) {
3276 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3277 
3278 		/* Meta data is required to be at the beginning */
3279 		ASSERT(pt_start < seg->pages_base);
3280 	} else
3281 		pt_start = seg->pages_base;
3282 
3283 	return (pt_start);
3284 }
3285 
3286 /*
3287  * Invalidate memseg pointers in cpu private vm data caches.
3288  */
3289 static void
3290 memseg_cpu_vm_flush()
3291 {
3292 	cpu_t *cp;
3293 	vm_cpu_data_t *vc;
3294 
3295 	mutex_enter(&cpu_lock);
3296 	pause_cpus(NULL);
3297 
3298 	cp = cpu_list;
3299 	do {
3300 		vc = cp->cpu_vm_data;
3301 		vc->vc_pnum_memseg = NULL;
3302 		vc->vc_pnext_memseg = NULL;
3303 
3304 	} while ((cp = cp->cpu_next) != cpu_list);
3305 
3306 	start_cpus();
3307 	mutex_exit(&cpu_lock);
3308 }
3309