xref: /illumos-gate/usr/src/uts/common/os/mem_config.c (revision 622200ad88c6c6382403a01985a94e22484baac6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/cmn_err.h>
31 #include <sys/vmem.h>
32 #include <sys/kmem.h>
33 #include <sys/systm.h>
34 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
35 #include <sys/errno.h>
36 #include <sys/memnode.h>
37 #include <sys/memlist.h>
38 #include <sys/memlist_impl.h>
39 #include <sys/tuneable.h>
40 #include <sys/proc.h>
41 #include <sys/disp.h>
42 #include <sys/debug.h>
43 #include <sys/vm.h>
44 #include <sys/callb.h>
45 #include <sys/memlist_plat.h>	/* for installed_top_size() */
46 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
47 #include <sys/dumphdr.h>	/* for dump_resize() */
48 #include <sys/atomic.h>		/* for use in stats collection */
49 #include <sys/rwlock.h>
50 #include <sys/cpuvar.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_kpm.h>
53 #include <vm/page.h>
54 #include <vm/vm_dep.h>
55 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
56 #include <sys/sunddi.h>
57 #include <sys/mem_config.h>
58 #include <sys/mem_cage.h>
59 #include <sys/lgrp.h>
60 #include <sys/ddi.h>
61 #include <sys/modctl.h>
62 
/*
 * Locking for the memlists.  In this file the write lock is taken around
 * updates of phys_install and phys_avail, and the read lock around
 * traversals of phys_install.
 */
extern void memlist_read_lock(void);
extern void memlist_read_unlock(void);
extern void memlist_write_lock(void);
extern void memlist_write_unlock(void);

/* Memlist updated by kphysm_add_memory_dynamic() once a span is usable. */
extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

/* Called with a memory node id after a slice is added; non-zero is failure. */
extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

/* Reserve/unreserve a pfn span so that add and delete cannot interleave. */
static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

/* Held by memseg_reuse() while it walks and unlinks from memseg_va_avail. */
static kmutex_t memseg_lists_lock;
/* memsegs (linked through lnext) whose metadata VA may be re-used. */
static struct memseg *memseg_va_avail;
/* NOTE(review): the users of the two junk lists are not visible here. */
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

/* kmem cache from which new struct memseg's are allocated on add. */
static struct kmem_cache *memseg_cache;
93 
94 /*
95  * Add a chunk of memory to the system.  page_t's for this memory
96  * are allocated in the first few pages of the chunk.
97  * base: starting PAGESIZE page of new memory.
98  * npgs: length in PAGESIZE pages.
99  *
100  * Adding mem this way doesn't increase the size of the hash tables;
101  * growing them would be too hard.  This should be OK, but adding memory
102  * dynamically most likely means more hash misses, since the tables will
103  * be smaller than they otherwise would be.
104  */
105 int
106 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
107 {
108 	page_t		*pp;
109 	page_t		*opp, *oepp;
110 	struct memseg	*seg;
111 	uint64_t	avmem;
112 	pfn_t		pfn;
113 	pfn_t		pt_base = base;
114 	pgcnt_t		tpgs = npgs;
115 	pgcnt_t		metapgs;
116 	int		exhausted;
117 	pfn_t		pnum;
118 	int		mnode;
119 	caddr_t		vaddr;
120 	int		reuse;
121 	int		mlret;
122 	void		*mapva;
123 	pgcnt_t		nkpmpgs = 0;
124 	offset_t	kpm_pages_off;
125 
126 	cmn_err(CE_CONT,
127 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
128 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
129 
130 	/*
131 	 * Add this span in the delete list to prevent interactions.
132 	 */
133 	if (!delspan_reserve(base, npgs)) {
134 		return (KPHYSM_ESPAN);
135 	}
136 	/*
137 	 * Check to see if any of the memory span has been added
138 	 * by trying an add to the installed memory list. This
139 	 * forms the interlocking process for add.
140 	 */
141 
142 	memlist_write_lock();
143 
144 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
145 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
146 
147 	if (mlret == MEML_SPANOP_OK)
148 		installed_top_size(phys_install, &physmax, &physinstalled);
149 
150 	memlist_write_unlock();
151 
152 	if (mlret != MEML_SPANOP_OK) {
153 		if (mlret == MEML_SPANOP_EALLOC) {
154 			delspan_unreserve(pt_base, tpgs);
155 			return (KPHYSM_ERESOURCE);
156 		} else
157 		if (mlret == MEML_SPANOP_ESPAN) {
158 			delspan_unreserve(pt_base, tpgs);
159 			return (KPHYSM_ESPAN);
160 		} else {
161 			delspan_unreserve(pt_base, tpgs);
162 			return (KPHYSM_ERESOURCE);
163 		}
164 	}
165 
166 	/*
167 	 * We store the page_t's for this new memory in the first
168 	 * few pages of the chunk. Here, we go and get'em ...
169 	 */
170 
171 	/*
172 	 * The expression after the '-' gives the number of pages
173 	 * that will fit in the new memory based on a requirement
174 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
175 	 */
176 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
177 	    (PAGESIZE + sizeof (page_t)));
178 
179 	npgs -= metapgs;
180 	base += metapgs;
181 
182 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
183 
184 	exhausted = (metapgs == 0 || npgs == 0);
185 
186 	if (kpm_enable && !exhausted) {
187 		pgcnt_t start, end, nkpmpgs_prelim;
188 		size_t	ptsz;
189 
190 		/*
191 		 * A viable kpm large page mapping must not overlap two
192 		 * dynamic memsegs. Therefore the total size is checked
193 		 * to be at least kpm_pgsz and also whether start and end
194 		 * points are at least kpm_pgsz aligned.
195 		 */
196 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
197 		    pmodkpmp(base + npgs)) {
198 
199 			kphysm_addmem_error_undospan(pt_base, tpgs);
200 
201 			/*
202 			 * There is no specific error code for violating
203 			 * kpm granularity constraints.
204 			 */
205 			return (KPHYSM_ENOTVIABLE);
206 		}
207 
208 		start = kpmptop(ptokpmp(base));
209 		end = kpmptop(ptokpmp(base + npgs));
210 		nkpmpgs_prelim = ptokpmp(end - start);
211 		ptsz = npgs * sizeof (page_t);
212 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
213 		exhausted = (tpgs <= metapgs);
214 		if (!exhausted) {
215 			npgs = tpgs - metapgs;
216 			base = pt_base + metapgs;
217 
218 			/* final nkpmpgs */
219 			start = kpmptop(ptokpmp(base));
220 			nkpmpgs = ptokpmp(end - start);
221 			kpm_pages_off = ptsz +
222 				(nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
223 		}
224 	}
225 
226 	/*
227 	 * Is memory area supplied too small?
228 	 */
229 	if (exhausted) {
230 		kphysm_addmem_error_undospan(pt_base, tpgs);
231 
232 		/*
233 		 * There is no specific error code for 'too small'.
234 		 */
235 		return (KPHYSM_ERESOURCE);
236 	}
237 
238 	/*
239 	 * We may re-use a previously allocated VA space for the page_ts
240 	 * eventually, but we need to initialize and lock the pages first.
241 	 */
242 
243 	/*
244 	 * Get an address in the kernel address map, map
245 	 * the page_t pages and see if we can touch them.
246 	 */
247 
248 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
249 	if (mapva == NULL) {
250 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
251 		    " Can't allocate VA for page_ts");
252 
253 		kphysm_addmem_error_undospan(pt_base, tpgs);
254 
255 		return (KPHYSM_ERESOURCE);
256 	}
257 	pp = mapva;
258 
259 	if (physmax < (pt_base + tpgs))
260 		physmax = (pt_base + tpgs);
261 
262 	/*
263 	 * In the remapping code we map one page at a time so we must do
264 	 * the same here to match mapping sizes.
265 	 */
266 	pfn = pt_base;
267 	vaddr = (caddr_t)pp;
268 	for (pnum = 0; pnum < metapgs; pnum++) {
269 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
270 		    PROT_READ | PROT_WRITE,
271 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
272 		pfn++;
273 		vaddr += ptob(1);
274 	}
275 
276 	if (ddi_peek32((dev_info_t *)NULL,
277 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
278 
279 		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
280 		    " Can't access pp array at 0x%p [phys 0x%lx]",
281 		    (void *)pp, pt_base);
282 
283 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
284 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
285 
286 		vmem_free(heap_arena, mapva, ptob(metapgs));
287 
288 		kphysm_addmem_error_undospan(pt_base, tpgs);
289 
290 		return (KPHYSM_EFAULT);
291 	}
292 
293 	/*
294 	 * Add this memory slice to its memory node translation.
295 	 *
296 	 * Note that right now, each node may have only one slice;
297 	 * this may change with COD or in larger SSM systems with
298 	 * nested latency groups, so we must not assume that the
299 	 * node does not yet exist.
300 	 */
301 	pnum = base + npgs - 1;
302 	mem_node_add_slice(base, pnum);
303 
304 	/*
305 	 * Allocate or resize page counters as necessary to accomodate
306 	 * the increase in memory pages.
307 	 */
308 	mnode = PFN_2_MEM_NODE(pnum);
309 	if (page_ctrs_adjust(mnode) != 0) {
310 
311 		mem_node_pre_del_slice(base, pnum);
312 		mem_node_post_del_slice(base, pnum, 0);
313 
314 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
315 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
316 
317 		vmem_free(heap_arena, mapva, ptob(metapgs));
318 
319 		kphysm_addmem_error_undospan(pt_base, tpgs);
320 
321 		return (KPHYSM_ERESOURCE);
322 	}
323 
324 	/*
325 	 * Update the phys_avail memory list.
326 	 * The phys_install list was done at the start.
327 	 */
328 
329 	memlist_write_lock();
330 
331 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
332 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
333 	ASSERT(mlret == MEML_SPANOP_OK);
334 
335 	memlist_write_unlock();
336 
337 	/* See if we can find a memseg to re-use. */
338 	seg = memseg_reuse(metapgs);
339 
340 	reuse = (seg != NULL);
341 
342 	/*
343 	 * Initialize the memseg structure representing this memory
344 	 * and add it to the existing list of memsegs. Do some basic
345 	 * initialization and add the memory to the system.
346 	 * In order to prevent lock deadlocks, the add_physmem()
347 	 * code is repeated here, but split into several stages.
348 	 */
349 	if (seg == NULL) {
350 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
351 		bzero(seg, sizeof (struct memseg));
352 		seg->msegflags = MEMSEG_DYNAMIC;
353 		seg->pages = pp;
354 	} else {
355 		/*EMPTY*/
356 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
357 	}
358 
359 	seg->epages = seg->pages + npgs;
360 	seg->pages_base = base;
361 	seg->pages_end = base + npgs;
362 
363 	/*
364 	 * Initialize metadata. The page_ts are set to locked state
365 	 * ready to be freed.
366 	 */
367 	bzero((caddr_t)pp, ptob(metapgs));
368 
369 	pfn = seg->pages_base;
370 	/* Save the original pp base in case we reuse a memseg. */
371 	opp = pp;
372 	oepp = opp + npgs;
373 	for (pp = opp; pp < oepp; pp++) {
374 		pp->p_pagenum = pfn;
375 		pfn++;
376 		page_iolock_init(pp);
377 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
378 			continue;
379 		pp->p_offset = (u_offset_t)-1;
380 	}
381 
382 	if (reuse) {
383 		/* Remap our page_ts to the re-used memseg VA space. */
384 		pfn = pt_base;
385 		vaddr = (caddr_t)seg->pages;
386 		for (pnum = 0; pnum < metapgs; pnum++) {
387 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
388 			    PROT_READ | PROT_WRITE,
389 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
390 			pfn++;
391 			vaddr += ptob(1);
392 		}
393 
394 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
395 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
396 
397 		vmem_free(heap_arena, mapva, ptob(metapgs));
398 	}
399 
400 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
401 
402 	memsegs_lock(1);
403 
404 	/*
405 	 * The new memseg is inserted at the beginning of the list.
406 	 * Not only does this save searching for the tail, but in the
407 	 * case of a re-used memseg, it solves the problem of what
408 	 * happens of some process has still got a pointer to the
409 	 * memseg and follows the next pointer to continue traversing
410 	 * the memsegs list.
411 	 */
412 
413 	hat_kpm_addmem_mseg_insert(seg);
414 
415 	seg->next = memsegs;
416 	membar_producer();
417 
418 	hat_kpm_addmem_memsegs_update(seg);
419 
420 	memsegs = seg;
421 
422 	build_pfn_hash();
423 
424 	total_pages += npgs;
425 
426 	/*
427 	 * Recalculate the paging parameters now total_pages has changed.
428 	 * This will also cause the clock hands to be reset before next use.
429 	 */
430 	setupclock(1);
431 
432 	memsegs_unlock(1);
433 
434 	/*
435 	 * Free the pages outside the lock to avoid locking loops.
436 	 */
437 	for (pp = seg->pages; pp < seg->epages; pp++) {
438 		page_free(pp, 1);
439 	}
440 
441 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
442 
443 	/*
444 	 * Now that we've updated the appropriate memory lists we
445 	 * need to reset a number of globals, since we've increased memory.
446 	 * Several have already been updated for us as noted above. The
447 	 * globals we're interested in at this point are:
448 	 *   physmax - highest page frame number.
449 	 *   physinstalled - number of pages currently installed (done earlier)
450 	 *   maxmem - max free pages in the system
451 	 *   physmem - physical memory pages available
452 	 *   availrmem - real memory available
453 	 */
454 
455 	mutex_enter(&freemem_lock);
456 	maxmem += npgs;
457 	physmem += npgs;
458 	availrmem += npgs;
459 	availrmem_initial += npgs;
460 
461 	mutex_exit(&freemem_lock);
462 
463 	dump_resize();
464 
465 	page_freelist_coalesce_all(mnode);
466 
467 	kphysm_setup_post_add(npgs);
468 
469 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
470 	    "(0x%" PRIx64 ")\n",
471 	    physinstalled << (PAGESHIFT - 10),
472 	    (uint64_t)physinstalled << PAGESHIFT);
473 
474 	avmem = (uint64_t)freemem << PAGESHIFT;
475 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
476 	    "avail mem = %" PRId64 "\n", avmem);
477 
478 	/*
479 	 * Update lgroup generation number on single lgroup systems
480 	 */
481 	if (nlgrps == 1)
482 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
483 
484 	delspan_unreserve(pt_base, tpgs);
485 	return (KPHYSM_OK);		/* Successfully added system memory */
486 
487 }
488 
489 /*
490  * There are various error conditions in kphysm_add_memory_dynamic()
491  * which require a rollback of already changed global state.
492  */
493 static void
494 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
495 {
496 	int mlret;
497 
498 	/* Unreserve memory span. */
499 	memlist_write_lock();
500 
501 	mlret = memlist_delete_span(
502 	    (uint64_t)(pt_base) << PAGESHIFT,
503 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
504 
505 	ASSERT(mlret == MEML_SPANOP_OK);
506 	phys_install_has_changed();
507 	installed_top_size(phys_install, &physmax, &physinstalled);
508 
509 	memlist_write_unlock();
510 	delspan_unreserve(pt_base, tpgs);
511 }
512 
513 /*
514  * Only return an available memseg of exactly the right size.
515  * When the meta data area has it's own virtual address space
516  * we will need to manage this more carefully and do best fit
517  * allocations, possibly splitting an availble area.
518  */
519 static struct memseg *
520 memseg_reuse(pgcnt_t metapgs)
521 {
522 	struct memseg **segpp, *seg;
523 
524 	mutex_enter(&memseg_lists_lock);
525 
526 	segpp = &memseg_va_avail;
527 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
528 		caddr_t end;
529 
530 		if (kpm_enable)
531 			end = hat_kpm_mseg_reuse(seg);
532 		else
533 			end = (caddr_t)seg->epages;
534 
535 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
536 			*segpp = seg->lnext;
537 			seg->lnext = NULL;
538 			break;
539 		}
540 	}
541 	mutex_exit(&memseg_lists_lock);
542 
543 	return (seg);
544 }
545 
/*
 * Source of external handle values; incremented under
 * mem_handle_list_mutex each time a mem_handle is allocated.
 */
static uint_t handle_gen;

/*
 * A contiguous span of page frames, mds_npgs pages starting at pfn
 * mds_base, linked into a list through mds_next.
 * NOTE(review): the bitmaps are presumably one bit per page in the span
 * (see MDS_BITMAPBYTES); their users are not visible in this part of
 * the file.
 */
struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one bitmap word (a uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Bytes needed for a bitmap holding one bit per page of the span,
 * rounded up to a whole number of uint_t words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
559 
/*
 * A list of spans belonging to one owner (a delete handle, or the add
 * reservation list).  Transit lists are chained together through
 * trl_next from transit_list_head.trh_head.
 */
struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

/*
 * Head of the chain of transit lists.  trh_lock is held by
 * delspan_insert() and delspan_remove() while they examine or modify
 * the chain and the span lists hanging off it.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);
577 
/* Delete statistics are compiled in only for DEBUG kernels. */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/*
 * Per-handle counters, embedded in struct mem_handle as mh_delstat and
 * updated through the MDSTAT_* macros below.
 * NOTE(review): the meaning of each counter is defined by its update
 * sites, which are not visible in this part of the file.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Increment counter FLD of handle MHP. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Add ntck to the nticks_total / nticks_pgrp accumulators of MHP. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
/* Without MEM_DEL_STATS every stats macro expands to nothing. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
632 
/*
 * Life cycle states of a delete handle.  MHND_FREE is zero, so a handle
 * fresh from kmem_zalloc() starts out FREE; kphysm_del_gethandle() moves
 * it to MHND_INIT, which is the only state kphysm_del_span() accepts.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;	/* link on mem_handle_head list */
	memhandle_t	mh_exthandle;	/* value given out to callers */
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;	/* spans added via kphysm_del_span() */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

/* List of all allocated handles; protected by mem_handle_list_mutex. */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
664 
665 static struct mem_handle *
666 kphysm_allocate_mem_handle()
667 {
668 	struct mem_handle *mhp;
669 
670 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
671 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
672 	mutex_enter(&mem_handle_list_mutex);
673 	mutex_enter(&mhp->mh_mutex);
674 	/* handle_gen is protected by list mutex. */
675 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
676 	mhp->mh_next = mem_handle_head;
677 	mem_handle_head = mhp;
678 	mutex_exit(&mem_handle_list_mutex);
679 
680 	return (mhp);
681 }
682 
683 static void
684 kphysm_free_mem_handle(struct mem_handle *mhp)
685 {
686 	struct mem_handle **mhpp;
687 
688 	ASSERT(mutex_owned(&mhp->mh_mutex));
689 	ASSERT(mhp->mh_state == MHND_FREE);
690 	/*
691 	 * Exit the mutex to preserve locking order. This is OK
692 	 * here as once in the FREE state, the handle cannot
693 	 * be found by a lookup.
694 	 */
695 	mutex_exit(&mhp->mh_mutex);
696 
697 	mutex_enter(&mem_handle_list_mutex);
698 	mhpp = &mem_handle_head;
699 	while (*mhpp != NULL && *mhpp != mhp)
700 		mhpp = &(*mhpp)->mh_next;
701 	ASSERT(*mhpp == mhp);
702 	/*
703 	 * No need to lock the handle (mh_mutex) as only
704 	 * mh_next changing and this is the only thread that
705 	 * can be referncing mhp.
706 	 */
707 	*mhpp = mhp->mh_next;
708 	mutex_exit(&mem_handle_list_mutex);
709 
710 	mutex_destroy(&mhp->mh_mutex);
711 	kmem_free(mhp, sizeof (struct mem_handle));
712 }
713 
714 /*
715  * This function finds the internal mem_handle corresponding to an
716  * external handle and returns it with the mh_mutex held.
717  */
718 static struct mem_handle *
719 kphysm_lookup_mem_handle(memhandle_t handle)
720 {
721 	struct mem_handle *mhp;
722 
723 	mutex_enter(&mem_handle_list_mutex);
724 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
725 		if (mhp->mh_exthandle == handle) {
726 			mutex_enter(&mhp->mh_mutex);
727 			/*
728 			 * The state of the handle could have been changed
729 			 * by kphysm_del_release() while waiting for mh_mutex.
730 			 */
731 			if (mhp->mh_state == MHND_FREE) {
732 				mutex_exit(&mhp->mh_mutex);
733 				continue;
734 			}
735 			break;
736 		}
737 	}
738 	mutex_exit(&mem_handle_list_mutex);
739 	return (mhp);
740 }
741 
742 int
743 kphysm_del_gethandle(memhandle_t *xmhp)
744 {
745 	struct mem_handle *mhp;
746 
747 	mhp = kphysm_allocate_mem_handle();
748 	/*
749 	 * The handle is allocated using KM_SLEEP, so cannot fail.
750 	 * If the implementation is changed, the correct error to return
751 	 * here would be KPHYSM_ENOHANDLES.
752 	 */
753 	ASSERT(mhp->mh_state == MHND_FREE);
754 	mhp->mh_state = MHND_INIT;
755 	*xmhp = mhp->mh_exthandle;
756 	mutex_exit(&mhp->mh_mutex);
757 	return (KPHYSM_OK);
758 }
759 
760 static int
761 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
762 {
763 	pfn_t e1, e2;
764 
765 	e1 = b1 + l1;
766 	e2 = b2 + l2;
767 
768 	return (!(b2 >= e1 || b1 >= e2));
769 }
770 
771 static int can_remove_pgs(pgcnt_t);
772 
773 static struct memdelspan *
774 span_to_install(pfn_t base, pgcnt_t npgs)
775 {
776 	struct memdelspan *mdsp;
777 	struct memdelspan *mdsp_new;
778 	uint64_t address, size, thislen;
779 	struct memlist *mlp;
780 
781 	mdsp_new = NULL;
782 
783 	address = (uint64_t)base << PAGESHIFT;
784 	size = (uint64_t)npgs << PAGESHIFT;
785 	while (size != 0) {
786 		memlist_read_lock();
787 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
788 			if (address >= (mlp->address + mlp->size))
789 				continue;
790 			if ((address + size) > mlp->address)
791 				break;
792 		}
793 		if (mlp == NULL) {
794 			address += size;
795 			size = 0;
796 			thislen = 0;
797 		} else {
798 			if (address < mlp->address) {
799 				size -= (mlp->address - address);
800 				address = mlp->address;
801 			}
802 			ASSERT(address >= mlp->address);
803 			if ((address + size) > (mlp->address + mlp->size)) {
804 				thislen = mlp->size - (address - mlp->address);
805 			} else {
806 				thislen = size;
807 			}
808 		}
809 		memlist_read_unlock();
810 		/* TODO: phys_install could change now */
811 		if (thislen == 0)
812 			continue;
813 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
814 		mdsp->mds_base = btop(address);
815 		mdsp->mds_npgs = btop(thislen);
816 		mdsp->mds_next = mdsp_new;
817 		mdsp_new = mdsp;
818 		address += thislen;
819 		size -= thislen;
820 	}
821 	return (mdsp_new);
822 }
823 
824 static void
825 free_delspans(struct memdelspan *mdsp)
826 {
827 	struct memdelspan *amdsp;
828 
829 	while ((amdsp = mdsp) != NULL) {
830 		mdsp = amdsp->mds_next;
831 		kmem_free(amdsp, sizeof (struct memdelspan));
832 	}
833 }
834 
835 /*
836  * Concatenate lists. No list ordering is required.
837  */
838 
839 static void
840 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
841 {
842 	while (*mdspp != NULL)
843 		mdspp = &(*mdspp)->mds_next;
844 
845 	*mdspp = mdsp;
846 }
847 
848 /*
849  * Given a new list of delspans, check there is no overlap with
850  * all existing span activity (add or delete) and then concatenate
851  * the new spans to the given list.
852  * Return 1 for OK, 0 if overlapping.
853  */
854 static int
855 delspan_insert(
856 	struct transit_list *my_tlp,
857 	struct memdelspan *mdsp_new)
858 {
859 	struct transit_list_head *trh;
860 	struct transit_list *tlp;
861 	int ret;
862 
863 	trh = &transit_list_head;
864 
865 	ASSERT(my_tlp != NULL);
866 	ASSERT(mdsp_new != NULL);
867 
868 	ret = 1;
869 	mutex_enter(&trh->trh_lock);
870 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
871 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
872 		struct memdelspan *mdsp;
873 
874 		for (mdsp = tlp->trl_spans; mdsp != NULL;
875 		    mdsp = mdsp->mds_next) {
876 			struct memdelspan *nmdsp;
877 
878 			for (nmdsp = mdsp_new; nmdsp != NULL;
879 			    nmdsp = nmdsp->mds_next) {
880 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
881 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
882 					ret = 0;
883 					goto done;
884 				}
885 			}
886 		}
887 	}
888 done:
889 	if (ret != 0) {
890 		if (my_tlp->trl_spans == NULL)
891 			transit_list_insert(my_tlp);
892 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
893 	}
894 	mutex_exit(&trh->trh_lock);
895 	return (ret);
896 }
897 
898 static void
899 delspan_remove(
900 	struct transit_list *my_tlp,
901 	pfn_t base,
902 	pgcnt_t npgs)
903 {
904 	struct transit_list_head *trh;
905 	struct memdelspan *mdsp;
906 
907 	trh = &transit_list_head;
908 
909 	ASSERT(my_tlp != NULL);
910 
911 	mutex_enter(&trh->trh_lock);
912 	if ((mdsp = my_tlp->trl_spans) != NULL) {
913 		if (npgs == 0) {
914 			my_tlp->trl_spans = NULL;
915 			free_delspans(mdsp);
916 			transit_list_remove(my_tlp);
917 		} else {
918 			struct memdelspan **prv;
919 
920 			prv = &my_tlp->trl_spans;
921 			while (mdsp != NULL) {
922 				pfn_t p_end;
923 
924 				p_end = mdsp->mds_base + mdsp->mds_npgs;
925 				if (mdsp->mds_base >= base &&
926 				    p_end <= (base + npgs)) {
927 					*prv = mdsp->mds_next;
928 					mdsp->mds_next = NULL;
929 					free_delspans(mdsp);
930 				} else {
931 					prv = &mdsp->mds_next;
932 				}
933 				mdsp = *prv;
934 			}
935 			if (my_tlp->trl_spans == NULL)
936 				transit_list_remove(my_tlp);
937 		}
938 	}
939 	mutex_exit(&trh->trh_lock);
940 }
941 
942 /*
943  * Reserve interface for add to stop delete before add finished.
944  * This list is only accessed through the delspan_insert/remove
945  * functions and so is fully protected by the mutex in struct transit_list.
946  */
947 
948 static struct transit_list reserve_transit;
949 
950 static int
951 delspan_reserve(pfn_t base, pgcnt_t npgs)
952 {
953 	struct memdelspan *mdsp;
954 	int ret;
955 
956 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
957 	mdsp->mds_base = base;
958 	mdsp->mds_npgs = npgs;
959 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
960 		free_delspans(mdsp);
961 	}
962 	return (ret);
963 }
964 
/*
 * Release a reservation made by delspan_reserve(): remove from the
 * reserve list every span lying wholly within [base, base + npgs).
 * As implemented by delspan_remove(), npgs == 0 drops all reserved spans.
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}
970 
971 /*
972  * Return whether memseg was created by kphysm_add_memory_dynamic().
973  * If this is the case and startp non zero, return also the start pfn
974  * of the meta data via startp.
975  */
976 static int
977 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
978 {
979 	pfn_t		pt_start;
980 
981 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
982 		return (0);
983 
984 	/* Meta data is required to be at the beginning */
985 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
986 
987 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
988 	if (startp != NULL)
989 		*startp = pt_start;
990 
991 	return (1);
992 }
993 
994 int
995 kphysm_del_span(
996 	memhandle_t handle,
997 	pfn_t base,
998 	pgcnt_t npgs)
999 {
1000 	struct mem_handle *mhp;
1001 	struct memseg *seg;
1002 	struct memdelspan *mdsp;
1003 	struct memdelspan *mdsp_new;
1004 	pgcnt_t phys_pages, vm_pages;
1005 	pfn_t p_end;
1006 	page_t *pp;
1007 	int ret;
1008 
1009 	mhp = kphysm_lookup_mem_handle(handle);
1010 	if (mhp == NULL) {
1011 		return (KPHYSM_EHANDLE);
1012 	}
1013 	if (mhp->mh_state != MHND_INIT) {
1014 		mutex_exit(&mhp->mh_mutex);
1015 		return (KPHYSM_ESEQUENCE);
1016 	}
1017 
1018 	/*
1019 	 * Intersect the span with the installed memory list (phys_install).
1020 	 */
1021 	mdsp_new = span_to_install(base, npgs);
1022 	if (mdsp_new == NULL) {
1023 		/*
1024 		 * No physical memory in this range. Is this an
1025 		 * error? If an attempt to start the delete is made
1026 		 * for OK returns from del_span such as this, start will
1027 		 * return an error.
1028 		 * Could return KPHYSM_ENOWORK.
1029 		 */
1030 		/*
1031 		 * It is assumed that there are no error returns
1032 		 * from span_to_install() due to kmem_alloc failure.
1033 		 */
1034 		mutex_exit(&mhp->mh_mutex);
1035 		return (KPHYSM_OK);
1036 	}
1037 	/*
1038 	 * Does this span overlap an existing span?
1039 	 */
1040 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1041 		/*
1042 		 * Differentiate between already on list for this handle
1043 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1044 		 */
1045 		ret = KPHYSM_EBUSY;
1046 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1047 		    mdsp = mdsp->mds_next) {
1048 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1049 			    base, npgs)) {
1050 				ret = KPHYSM_EDUP;
1051 				break;
1052 			}
1053 		}
1054 		mutex_exit(&mhp->mh_mutex);
1055 		free_delspans(mdsp_new);
1056 		return (ret);
1057 	}
1058 	/*
1059 	 * At this point the spans in mdsp_new have been inserted into the
1060 	 * list of spans for this handle and thereby to the global list of
1061 	 * spans being processed. Each of these spans must now be checked
1062 	 * for relocatability. As a side-effect segments in the memseg list
1063 	 * may be split.
1064 	 *
1065 	 * Note that mdsp_new can no longer be used as it is now part of
1066 	 * a larger list. Select elements of this larger list based
1067 	 * on base and npgs.
1068 	 */
1069 restart:
1070 	phys_pages = 0;
1071 	vm_pages = 0;
1072 	ret = KPHYSM_OK;
1073 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1074 	    mdsp = mdsp->mds_next) {
1075 		pgcnt_t pages_checked;
1076 
1077 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1078 			continue;
1079 		}
1080 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1081 		/*
1082 		 * The pages_checked count is a hack. All pages should be
1083 		 * checked for relocatability. Those not covered by memsegs
1084 		 * should be tested with arch_kphysm_del_span_ok().
1085 		 */
1086 		pages_checked = 0;
1087 		for (seg = memsegs; seg; seg = seg->next) {
1088 			pfn_t mseg_start;
1089 
1090 			if (seg->pages_base >= p_end ||
1091 			    seg->pages_end <= mdsp->mds_base) {
1092 				/* Span and memseg don't overlap. */
1093 				continue;
1094 			}
1095 			/* Check that segment is suitable for delete. */
1096 			if (memseg_is_dynamic(seg, &mseg_start)) {
1097 				/*
1098 				 * Can only delete whole added segments
1099 				 * for the moment.
1100 				 * Check that this is completely within the
1101 				 * span.
1102 				 */
1103 				if (mseg_start < mdsp->mds_base ||
1104 				    seg->pages_end > p_end) {
1105 					ret = KPHYSM_EBUSY;
1106 					break;
1107 				}
1108 				pages_checked += seg->pages_end - mseg_start;
1109 			} else {
1110 				/*
1111 				 * Set mseg_start for accounting below.
1112 				 */
1113 				mseg_start = seg->pages_base;
1114 				/*
1115 				 * If this segment is larger than the span,
1116 				 * try to split it. After the split, it
1117 				 * is necessary to restart.
1118 				 */
1119 				if (seg->pages_base < mdsp->mds_base ||
1120 				    seg->pages_end > p_end) {
1121 					pfn_t abase;
1122 					pgcnt_t anpgs;
1123 					int s_ret;
1124 
1125 					/* Split required.  */
1126 					if (mdsp->mds_base < seg->pages_base)
1127 						abase = seg->pages_base;
1128 					else
1129 						abase = mdsp->mds_base;
1130 					if (p_end > seg->pages_end)
1131 						anpgs = seg->pages_end - abase;
1132 					else
1133 						anpgs = p_end - abase;
1134 					s_ret = kphysm_split_memseg(abase,
1135 					    anpgs);
1136 					if (s_ret == 0) {
1137 						/* Split failed. */
1138 						ret = KPHYSM_ERESOURCE;
1139 						break;
1140 					}
1141 					goto restart;
1142 				}
1143 				pages_checked +=
1144 				    seg->pages_end - seg->pages_base;
1145 			}
1146 			/*
1147 			 * The memseg is wholly within the delete span.
1148 			 * The individual pages can now be checked.
1149 			 */
1150 			/* Cage test. */
1151 			for (pp = seg->pages; pp < seg->epages; pp++) {
1152 				if (PP_ISNORELOC(pp)) {
1153 					ret = KPHYSM_ENONRELOC;
1154 					break;
1155 				}
1156 			}
1157 			if (ret != KPHYSM_OK) {
1158 				break;
1159 			}
1160 			phys_pages += (seg->pages_end - mseg_start);
1161 			vm_pages += MSEG_NPAGES(seg);
1162 		}
1163 		if (ret != KPHYSM_OK)
1164 			break;
1165 		if (pages_checked != mdsp->mds_npgs) {
1166 			ret = KPHYSM_ENONRELOC;
1167 			break;
1168 		}
1169 	}
1170 
1171 	if (ret == KPHYSM_OK) {
1172 		mhp->mh_phys_pages += phys_pages;
1173 		mhp->mh_vm_pages += vm_pages;
1174 	} else {
1175 		/*
1176 		 * Keep holding the mh_mutex to prevent it going away.
1177 		 */
1178 		delspan_remove(&mhp->mh_transit, base, npgs);
1179 	}
1180 	mutex_exit(&mhp->mh_mutex);
1181 	return (ret);
1182 }
1183 
1184 int
1185 kphysm_del_span_query(
1186 	pfn_t base,
1187 	pgcnt_t npgs,
1188 	memquery_t *mqp)
1189 {
1190 	struct memdelspan *mdsp;
1191 	struct memdelspan *mdsp_new;
1192 	int done_first_nonreloc;
1193 
1194 	mqp->phys_pages = 0;
1195 	mqp->managed = 0;
1196 	mqp->nonrelocatable = 0;
1197 	mqp->first_nonrelocatable = 0;
1198 	mqp->last_nonrelocatable = 0;
1199 
1200 	mdsp_new = span_to_install(base, npgs);
1201 	/*
1202 	 * It is OK to proceed here if mdsp_new == NULL.
1203 	 */
1204 	done_first_nonreloc = 0;
1205 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1206 		pfn_t sbase;
1207 		pgcnt_t snpgs;
1208 
1209 		mqp->phys_pages += mdsp->mds_npgs;
1210 		sbase = mdsp->mds_base;
1211 		snpgs = mdsp->mds_npgs;
1212 		while (snpgs != 0) {
1213 			struct memseg *lseg, *seg;
1214 			pfn_t p_end;
1215 			page_t *pp;
1216 			pfn_t mseg_start;
1217 
1218 			p_end = sbase + snpgs;
1219 			/*
1220 			 * Find the lowest addressed memseg that starts
1221 			 * after sbase and account for it.
1222 			 * This is to catch dynamic memsegs whose start
1223 			 * is hidden.
1224 			 */
1225 			seg = NULL;
1226 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1227 				if ((lseg->pages_base >= sbase) ||
1228 				    (lseg->pages_base < p_end &&
1229 				    lseg->pages_end > sbase)) {
1230 					if (seg == NULL ||
1231 					    seg->pages_base > lseg->pages_base)
1232 						seg = lseg;
1233 				}
1234 			}
1235 			if (seg != NULL) {
1236 				if (!memseg_is_dynamic(seg, &mseg_start)) {
1237 					mseg_start = seg->pages_base;
1238 				}
1239 				/*
1240 				 * Now have the full extent of the memseg so
1241 				 * do the range check.
1242 				 */
1243 				if (mseg_start >= p_end ||
1244 				    seg->pages_end <= sbase) {
1245 					/* Span does not overlap memseg. */
1246 					seg = NULL;
1247 				}
1248 			}
1249 			/*
1250 			 * Account for gap either before the segment if
1251 			 * there is one or to the end of the span.
1252 			 */
1253 			if (seg == NULL || mseg_start > sbase) {
1254 				pfn_t a_end;
1255 
1256 				a_end = (seg == NULL) ? p_end : mseg_start;
1257 				/*
1258 				 * Check with arch layer for relocatability.
1259 				 */
1260 				if (arch_kphysm_del_span_ok(sbase,
1261 				    (a_end - sbase))) {
1262 					/*
1263 					 * No non-relocatble pages in this
1264 					 * area, avoid the fine-grained
1265 					 * test.
1266 					 */
1267 					snpgs -= (a_end - sbase);
1268 					sbase = a_end;
1269 				}
1270 				while (sbase < a_end) {
1271 					if (!arch_kphysm_del_span_ok(sbase,
1272 					    1)) {
1273 						mqp->nonrelocatable++;
1274 						if (!done_first_nonreloc) {
1275 							mqp->
1276 							    first_nonrelocatable
1277 							    = sbase;
1278 							done_first_nonreloc = 1;
1279 						}
1280 						mqp->last_nonrelocatable =
1281 						    sbase;
1282 					}
1283 					sbase++;
1284 					snpgs--;
1285 				}
1286 			}
1287 			if (seg != NULL) {
1288 				ASSERT(mseg_start <= sbase);
1289 				if (seg->pages_base != mseg_start &&
1290 				    seg->pages_base > sbase) {
1291 					pgcnt_t skip_pgs;
1292 
1293 					/*
1294 					 * Skip the page_t area of a
1295 					 * dynamic memseg.
1296 					 */
1297 					skip_pgs = seg->pages_base - sbase;
1298 					if (snpgs <= skip_pgs) {
1299 						sbase += snpgs;
1300 						snpgs = 0;
1301 						continue;
1302 					}
1303 					snpgs -= skip_pgs;
1304 					sbase += skip_pgs;
1305 				}
1306 				ASSERT(snpgs != 0);
1307 				ASSERT(seg->pages_base <= sbase);
1308 				/*
1309 				 * The individual pages can now be checked.
1310 				 */
1311 				for (pp = seg->pages +
1312 				    (sbase - seg->pages_base);
1313 				    snpgs != 0 && pp < seg->epages; pp++) {
1314 					mqp->managed++;
1315 					if (PP_ISNORELOC(pp)) {
1316 						mqp->nonrelocatable++;
1317 						if (!done_first_nonreloc) {
1318 							mqp->
1319 							    first_nonrelocatable
1320 							    = sbase;
1321 							done_first_nonreloc = 1;
1322 						}
1323 						mqp->last_nonrelocatable =
1324 						    sbase;
1325 					}
1326 					sbase++;
1327 					snpgs--;
1328 				}
1329 			}
1330 		}
1331 	}
1332 
1333 	free_delspans(mdsp_new);
1334 
1335 	return (KPHYSM_OK);
1336 }
1337 
1338 /*
1339  * This release function can be called at any stage as follows:
1340  *	_gethandle only called
1341  *	_span(s) only called
1342  *	_start called but failed
1343  *	delete thread exited
1344  */
1345 int
1346 kphysm_del_release(memhandle_t handle)
1347 {
1348 	struct mem_handle *mhp;
1349 
1350 	mhp = kphysm_lookup_mem_handle(handle);
1351 	if (mhp == NULL) {
1352 		return (KPHYSM_EHANDLE);
1353 	}
1354 	switch (mhp->mh_state) {
1355 	case MHND_STARTING:
1356 	case MHND_RUNNING:
1357 		mutex_exit(&mhp->mh_mutex);
1358 		return (KPHYSM_ENOTFINISHED);
1359 	case MHND_FREE:
1360 		ASSERT(mhp->mh_state != MHND_FREE);
1361 		mutex_exit(&mhp->mh_mutex);
1362 		return (KPHYSM_EHANDLE);
1363 	case MHND_INIT:
1364 		break;
1365 	case MHND_DONE:
1366 		break;
1367 	case MHND_RELEASE:
1368 		mutex_exit(&mhp->mh_mutex);
1369 		return (KPHYSM_ESEQUENCE);
1370 	default:
1371 #ifdef DEBUG
1372 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1373 		    (void *)mhp, mhp->mh_state);
1374 #endif /* DEBUG */
1375 		mutex_exit(&mhp->mh_mutex);
1376 		return (KPHYSM_EHANDLE);
1377 	}
1378 	/*
1379 	 * Set state so that we can wait if necessary.
1380 	 * Also this means that we have read/write access to all
1381 	 * fields except mh_exthandle and mh_state.
1382 	 */
1383 	mhp->mh_state = MHND_RELEASE;
1384 	/*
1385 	 * The mem_handle cannot be de-allocated by any other operation
1386 	 * now, so no need to hold mh_mutex.
1387 	 */
1388 	mutex_exit(&mhp->mh_mutex);
1389 
1390 	delspan_remove(&mhp->mh_transit, 0, 0);
1391 	mhp->mh_phys_pages = 0;
1392 	mhp->mh_vm_pages = 0;
1393 	mhp->mh_hold_todo = 0;
1394 	mhp->mh_delete_complete = NULL;
1395 	mhp->mh_delete_complete_arg = NULL;
1396 	mhp->mh_cancel = 0;
1397 
1398 	mutex_enter(&mhp->mh_mutex);
1399 	ASSERT(mhp->mh_state == MHND_RELEASE);
1400 	mhp->mh_state = MHND_FREE;
1401 
1402 	kphysm_free_mem_handle(mhp);
1403 
1404 	return (KPHYSM_OK);
1405 }
1406 
1407 /*
1408  * This cancel function can only be called with the thread running.
1409  */
1410 int
1411 kphysm_del_cancel(memhandle_t handle)
1412 {
1413 	struct mem_handle *mhp;
1414 
1415 	mhp = kphysm_lookup_mem_handle(handle);
1416 	if (mhp == NULL) {
1417 		return (KPHYSM_EHANDLE);
1418 	}
1419 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1420 		mutex_exit(&mhp->mh_mutex);
1421 		return (KPHYSM_ENOTRUNNING);
1422 	}
1423 	/*
1424 	 * Set the cancel flag and wake the delete thread up.
1425 	 * The thread may be waiting on I/O, so the effect of the cancel
1426 	 * may be delayed.
1427 	 */
1428 	if (mhp->mh_cancel == 0) {
1429 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1430 		cv_signal(&mhp->mh_cv);
1431 	}
1432 	mutex_exit(&mhp->mh_mutex);
1433 	return (KPHYSM_OK);
1434 }
1435 
1436 int
1437 kphysm_del_status(
1438 	memhandle_t handle,
1439 	memdelstat_t *mdstp)
1440 {
1441 	struct mem_handle *mhp;
1442 
1443 	mhp = kphysm_lookup_mem_handle(handle);
1444 	if (mhp == NULL) {
1445 		return (KPHYSM_EHANDLE);
1446 	}
1447 	/*
1448 	 * Calling kphysm_del_status() is allowed before the delete
1449 	 * is started to allow for status display.
1450 	 */
1451 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1452 	    mhp->mh_state != MHND_RUNNING) {
1453 		mutex_exit(&mhp->mh_mutex);
1454 		return (KPHYSM_ENOTRUNNING);
1455 	}
1456 	mdstp->phys_pages = mhp->mh_phys_pages;
1457 	mdstp->managed = mhp->mh_vm_pages;
1458 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1459 	mutex_exit(&mhp->mh_mutex);
1460 	return (KPHYSM_OK);
1461 }
1462 
1463 static int mem_delete_additional_pages = 100;
1464 
1465 static int
1466 can_remove_pgs(pgcnt_t npgs)
1467 {
1468 	/*
1469 	 * If all pageable pages were paged out, freemem would
1470 	 * equal availrmem.  There is a minimum requirement for
1471 	 * availrmem.
1472 	 */
1473 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1474 	    < npgs)
1475 		return (0);
1476 	/* TODO: check swap space, etc. */
1477 	return (1);
1478 }
1479 
1480 static int
1481 get_availrmem(pgcnt_t npgs)
1482 {
1483 	int ret;
1484 
1485 	mutex_enter(&freemem_lock);
1486 	ret = can_remove_pgs(npgs);
1487 	if (ret != 0)
1488 		availrmem -= npgs;
1489 	mutex_exit(&freemem_lock);
1490 	return (ret);
1491 }
1492 
1493 static void
1494 put_availrmem(pgcnt_t npgs)
1495 {
1496 	mutex_enter(&freemem_lock);
1497 	availrmem += npgs;
1498 	mutex_exit(&freemem_lock);
1499 }
1500 
1501 #define	FREEMEM_INCR	100
1502 static pgcnt_t freemem_incr = FREEMEM_INCR;
1503 #define	DEL_FREE_WAIT_FRAC	4
1504 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1505 
1506 #define	DEL_BUSY_WAIT_FRAC	20
1507 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1508 
1509 static void kphysm_del_cleanup(struct mem_handle *);
1510 
1511 static void page_delete_collect(page_t *, struct mem_handle *);
1512 
1513 static pgcnt_t
1514 delthr_get_freemem(struct mem_handle *mhp)
1515 {
1516 	pgcnt_t free_get;
1517 	int ret;
1518 
1519 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1520 
1521 	MDSTAT_INCR(mhp, need_free);
1522 	/*
1523 	 * Get up to freemem_incr pages.
1524 	 */
1525 	free_get = freemem_incr;
1526 	if (free_get > mhp->mh_hold_todo)
1527 		free_get = mhp->mh_hold_todo;
1528 	/*
1529 	 * Take free_get pages away from freemem,
1530 	 * waiting if necessary.
1531 	 */
1532 
1533 	while (!mhp->mh_cancel) {
1534 		mutex_exit(&mhp->mh_mutex);
1535 		MDSTAT_INCR(mhp, free_loop);
1536 		/*
1537 		 * Duplicate test from page_create_throttle()
1538 		 * but don't override with !PG_WAIT.
1539 		 */
1540 		if (freemem < (free_get + throttlefree)) {
1541 			MDSTAT_INCR(mhp, free_low);
1542 			ret = 0;
1543 		} else {
1544 			ret = page_create_wait(free_get, 0);
1545 			if (ret == 0) {
1546 				/* EMPTY */
1547 				MDSTAT_INCR(mhp, free_failed);
1548 			}
1549 		}
1550 		if (ret != 0) {
1551 			mutex_enter(&mhp->mh_mutex);
1552 			return (free_get);
1553 		}
1554 
1555 		/*
1556 		 * Put pressure on pageout.
1557 		 */
1558 		page_needfree(free_get);
1559 		cv_signal(&proc_pageout->p_cv);
1560 
1561 		mutex_enter(&mhp->mh_mutex);
1562 		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
1563 		    (lbolt + DEL_FREE_WAIT_TICKS));
1564 		mutex_exit(&mhp->mh_mutex);
1565 		page_needfree(-(spgcnt_t)free_get);
1566 
1567 		mutex_enter(&mhp->mh_mutex);
1568 	}
1569 	return (0);
1570 }
1571 
1572 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1573 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1574 /*
1575  * This function is run as a helper thread for delete_memory_thread.
1576  * It is needed in order to force kaio cleanup, so that pages used in kaio
1577  * will be unlocked and subsequently relocated by delete_memory_thread.
1578  * The address of the delete_memory_threads's mem_handle is passed in to
1579  * this thread function, and is used to set the mh_aio_cleanup_done member
1580  * prior to calling thread_exit().
1581  */
1582 static void
1583 dr_aio_cleanup_thread(caddr_t amhp)
1584 {
1585 	proc_t *procp;
1586 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1587 	int cleaned;
1588 	int n = 0;
1589 	struct mem_handle *mhp;
1590 	volatile uint_t *pcancel;
1591 
1592 	mhp = (struct mem_handle *)amhp;
1593 	ASSERT(mhp != NULL);
1594 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1595 	if (modload("sys", "kaio") == -1) {
1596 		mhp->mh_aio_cleanup_done = 1;
1597 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1598 		thread_exit();
1599 	}
1600 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1601 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1602 	if (aio_cleanup_dr_delete_memory == NULL) {
1603 		mhp->mh_aio_cleanup_done = 1;
1604 		cmn_err(CE_WARN,
1605 	    "aio_cleanup_dr_delete_memory not found in kaio");
1606 		thread_exit();
1607 	}
1608 	do {
1609 		cleaned = 0;
1610 		mutex_enter(&pidlock);
1611 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1612 		    procp = procp->p_next) {
1613 			mutex_enter(&procp->p_lock);
1614 			if (procp->p_aio != NULL) {
1615 				/* cleanup proc's outstanding kaio */
1616 				cleaned +=
1617 				    (*aio_cleanup_dr_delete_memory)(procp);
1618 			}
1619 			mutex_exit(&procp->p_lock);
1620 		}
1621 		mutex_exit(&pidlock);
1622 		if ((*pcancel == 0) &&
1623 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1624 			/* delay a bit before retrying all procs again */
1625 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1626 			n = 0;
1627 		}
1628 	} while (*pcancel == 0);
1629 	mhp->mh_aio_cleanup_done = 1;
1630 	thread_exit();
1631 }
1632 
1633 static void
1634 delete_memory_thread(caddr_t amhp)
1635 {
1636 	struct mem_handle *mhp;
1637 	struct memdelspan *mdsp;
1638 	callb_cpr_t cprinfo;
1639 	page_t *pp_targ;
1640 	spgcnt_t freemem_left;
1641 	void (*del_complete_funcp)(void *, int error);
1642 	void *del_complete_arg;
1643 	int comp_code;
1644 	int ret;
1645 	int first_scan;
1646 	uint_t szc;
1647 #ifdef MEM_DEL_STATS
1648 	uint64_t start_total, ntick_total;
1649 	uint64_t start_pgrp, ntick_pgrp;
1650 #endif /* MEM_DEL_STATS */
1651 
1652 	mhp = (struct mem_handle *)amhp;
1653 
1654 #ifdef MEM_DEL_STATS
1655 	start_total = ddi_get_lbolt();
1656 #endif /* MEM_DEL_STATS */
1657 
1658 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1659 	    callb_generic_cpr, "memdel");
1660 
1661 	mutex_enter(&mhp->mh_mutex);
1662 	ASSERT(mhp->mh_state == MHND_STARTING);
1663 
1664 	mhp->mh_state = MHND_RUNNING;
1665 	mhp->mh_thread_id = curthread;
1666 
1667 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1668 	mutex_exit(&mhp->mh_mutex);
1669 
1670 	/* Allocate the remap pages now, if necessary. */
1671 	memseg_remap_init();
1672 
1673 	/*
1674 	 * Subtract from availrmem now if possible as availrmem
1675 	 * may not be available by the end of the delete.
1676 	 */
1677 	if (!get_availrmem(mhp->mh_vm_pages)) {
1678 		comp_code = KPHYSM_ENOTVIABLE;
1679 		mutex_enter(&mhp->mh_mutex);
1680 		goto early_exit;
1681 	}
1682 
1683 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1684 
1685 	mutex_enter(&mhp->mh_mutex);
1686 
1687 	if (ret != 0) {
1688 		mhp->mh_cancel = KPHYSM_EREFUSED;
1689 		goto refused;
1690 	}
1691 
1692 	transit_list_collect(mhp, 1);
1693 
1694 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1695 	    mdsp = mdsp->mds_next) {
1696 		ASSERT(mdsp->mds_bitmap == NULL);
1697 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1698 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1699 							KM_SLEEP);
1700 	}
1701 
1702 	first_scan = 1;
1703 	freemem_left = 0;
1704 	/*
1705 	 * Start dr_aio_cleanup_thread, which periodically iterates
1706 	 * through the process list and invokes aio cleanup.  This
1707 	 * is needed in order to avoid a deadly embrace between the
1708 	 * delete_memory_thread (waiting on writer lock for page, with the
1709 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1710 	 * reader lock on the same page that is wanted by the
1711 	 * delete_memory_thread), and threads waiting for kaio completion
1712 	 * (blocked on spt_amp->lock).
1713 	 */
1714 	mhp->mh_dr_aio_cleanup_cancel = 0;
1715 	mhp->mh_aio_cleanup_done = 0;
1716 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1717 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1718 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1719 		pgcnt_t collected;
1720 
1721 		MDSTAT_INCR(mhp, nloop);
1722 		collected = 0;
1723 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1724 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1725 			pfn_t pfn, p_end;
1726 
1727 			if (first_scan) {
1728 				mem_node_pre_del_slice(mdsp->mds_base,
1729 					mdsp->mds_base + mdsp->mds_npgs - 1);
1730 			}
1731 
1732 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1733 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1734 			    (mhp->mh_cancel == 0); pfn++) {
1735 				page_t *pp, *tpp, *tpp_targ;
1736 				pgcnt_t bit;
1737 				struct vnode *vp;
1738 				u_offset_t offset;
1739 				int mod, result;
1740 				spgcnt_t pgcnt;
1741 
1742 				bit = pfn - mdsp->mds_base;
1743 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1744 				    (1 << (bit % NBPBMW))) != 0) {
1745 					MDSTAT_INCR(mhp, already_done);
1746 					continue;
1747 				}
1748 				if (freemem_left == 0) {
1749 					freemem_left += delthr_get_freemem(mhp);
1750 					if (freemem_left == 0)
1751 						break;
1752 				}
1753 
1754 				/*
1755 				 * Release mh_mutex - some of this
1756 				 * stuff takes some time (eg PUTPAGE).
1757 				 */
1758 
1759 				mutex_exit(&mhp->mh_mutex);
1760 				MDSTAT_INCR(mhp, ncheck);
1761 
1762 				pp = page_numtopp_nolock(pfn);
1763 				if (pp == NULL) {
1764 					/*
1765 					 * Not covered by a page_t - will
1766 					 * be dealt with elsewhere.
1767 					 */
1768 					MDSTAT_INCR(mhp, nopaget);
1769 					mutex_enter(&mhp->mh_mutex);
1770 					mdsp->mds_bitmap[bit / NBPBMW] |=
1771 					    (1 << (bit % NBPBMW));
1772 					continue;
1773 				}
1774 
1775 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1776 				    SE_EXCL_WANTED | SE_RETIRED)) {
1777 					/*
1778 					 * Page in use elsewhere.  Skip it.
1779 					 */
1780 					MDSTAT_INCR(mhp, lockfail);
1781 					mutex_enter(&mhp->mh_mutex);
1782 					continue;
1783 				}
1784 				/*
1785 				 * See if the cage expanded into the delete.
1786 				 * This can happen as we have to allow the
1787 				 * cage to expand.
1788 				 */
1789 				if (PP_ISNORELOC(pp)) {
1790 					page_unlock(pp);
1791 					mutex_enter(&mhp->mh_mutex);
1792 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1793 					break;
1794 				}
1795 				if (PP_RETIRED(pp)) {
1796 					/*
1797 					 * Page has been retired and is
1798 					 * not part of the cage so we
1799 					 * can now do the accounting for
1800 					 * it.
1801 					 */
1802 					MDSTAT_INCR(mhp, retired);
1803 					mutex_enter(&mhp->mh_mutex);
1804 					mdsp->mds_bitmap[bit / NBPBMW]
1805 					    |= (1 << (bit % NBPBMW));
1806 					mdsp->mds_bitmap_retired[bit /
1807 					    NBPBMW] |=
1808 					    (1 << (bit % NBPBMW));
1809 					mhp->mh_hold_todo--;
1810 					continue;
1811 				}
1812 				ASSERT(freemem_left != 0);
1813 				if (PP_ISFREE(pp)) {
1814 					/*
1815 					 * Like page_reclaim() only 'freemem'
1816 					 * processing is already done.
1817 					 */
1818 					MDSTAT_INCR(mhp, nfree);
1819 				free_page_collect:
1820 					if (PP_ISAGED(pp)) {
1821 						page_list_sub(pp,
1822 						    PG_FREE_LIST);
1823 					} else {
1824 						page_list_sub(pp,
1825 						    PG_CACHE_LIST);
1826 					}
1827 					PP_CLRFREE(pp);
1828 					PP_CLRAGED(pp);
1829 					collected++;
1830 					mutex_enter(&mhp->mh_mutex);
1831 					page_delete_collect(pp, mhp);
1832 					mdsp->mds_bitmap[bit / NBPBMW] |=
1833 					    (1 << (bit % NBPBMW));
1834 					freemem_left--;
1835 					continue;
1836 				}
1837 				ASSERT(pp->p_vnode != NULL);
1838 				if (first_scan) {
1839 					MDSTAT_INCR(mhp, first_notfree);
1840 					page_unlock(pp);
1841 					mutex_enter(&mhp->mh_mutex);
1842 					continue;
1843 				}
1844 				/*
1845 				 * Keep stats on pages encountered that
1846 				 * are marked for retirement.
1847 				 */
1848 				if (PP_TOXIC(pp)) {
1849 					MDSTAT_INCR(mhp, toxic);
1850 				} else if (PP_PR_REQ(pp)) {
1851 					MDSTAT_INCR(mhp, failing);
1852 				}
1853 				/*
1854 				 * In certain cases below, special exceptions
1855 				 * are made for pages that are toxic.  This
1856 				 * is because the current meaning of toxic
1857 				 * is that an uncorrectable error has been
1858 				 * previously associated with the page.
1859 				 */
1860 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1861 					if (!PP_TOXIC(pp)) {
1862 						/*
1863 						 * Must relocate locked in
1864 						 * memory pages.
1865 						 */
1866 #ifdef MEM_DEL_STATS
1867 						start_pgrp = ddi_get_lbolt();
1868 #endif /* MEM_DEL_STATS */
1869 						/*
1870 						 * Lock all constituent pages
1871 						 * of a large page to ensure
1872 						 * that p_szc won't change.
1873 						 */
1874 						if (!group_page_trylock(pp,
1875 						    SE_EXCL)) {
1876 							MDSTAT_INCR(mhp,
1877 							    gptllckfail);
1878 							page_unlock(pp);
1879 							mutex_enter(
1880 							    &mhp->mh_mutex);
1881 							continue;
1882 						}
1883 						MDSTAT_INCR(mhp, npplocked);
1884 						pp_targ =
1885 						    page_get_replacement_page(
1886 							pp, NULL, 0);
1887 						if (pp_targ != NULL) {
1888 #ifdef MEM_DEL_STATS
1889 							ntick_pgrp =
1890 							    (uint64_t)
1891 							    ddi_get_lbolt() -
1892 							    start_pgrp;
1893 #endif /* MEM_DEL_STATS */
1894 							MDSTAT_PGRP(mhp,
1895 							    ntick_pgrp);
1896 							MDSTAT_INCR(mhp,
1897 							    nlockreloc);
1898 							goto reloc;
1899 						}
1900 						group_page_unlock(pp);
1901 						page_unlock(pp);
1902 #ifdef MEM_DEL_STATS
1903 						ntick_pgrp =
1904 						    (uint64_t)ddi_get_lbolt() -
1905 						    start_pgrp;
1906 #endif /* MEM_DEL_STATS */
1907 						MDSTAT_PGRP(mhp, ntick_pgrp);
1908 						MDSTAT_INCR(mhp, nnorepl);
1909 						mutex_enter(&mhp->mh_mutex);
1910 						continue;
1911 					} else {
1912 						/*
1913 						 * Cannot do anything about
1914 						 * this page because it is
1915 						 * toxic.
1916 						 */
1917 						MDSTAT_INCR(mhp, npplkdtoxic);
1918 						page_unlock(pp);
1919 						mutex_enter(&mhp->mh_mutex);
1920 						continue;
1921 					}
1922 				}
1923 				/*
1924 				 * Unload the mappings and check if mod bit
1925 				 * is set.
1926 				 */
1927 				ASSERT(pp->p_vnode != &kvp);
1928 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1929 				mod = hat_ismod(pp);
1930 
1931 #ifdef MEM_DEL_STATS
1932 				start_pgrp = ddi_get_lbolt();
1933 #endif /* MEM_DEL_STATS */
1934 				if (mod && !PP_TOXIC(pp)) {
1935 					/*
1936 					 * Lock all constituent pages
1937 					 * of a large page to ensure
1938 					 * that p_szc won't change.
1939 					 */
1940 					if (!group_page_trylock(pp, SE_EXCL)) {
1941 						MDSTAT_INCR(mhp, gptlmodfail);
1942 						page_unlock(pp);
1943 						mutex_enter(&mhp->mh_mutex);
1944 						continue;
1945 					}
1946 					pp_targ = page_get_replacement_page(pp,
1947 					    NULL, 0);
1948 					if (pp_targ != NULL) {
1949 						MDSTAT_INCR(mhp, nmodreloc);
1950 #ifdef MEM_DEL_STATS
1951 						ntick_pgrp =
1952 						    (uint64_t)ddi_get_lbolt() -
1953 							start_pgrp;
1954 #endif /* MEM_DEL_STATS */
1955 						MDSTAT_PGRP(mhp, ntick_pgrp);
1956 						goto reloc;
1957 					}
1958 					group_page_unlock(pp);
1959 				}
1960 
1961 				if (!page_try_demote_pages(pp)) {
1962 					MDSTAT_INCR(mhp, demotefail);
1963 					page_unlock(pp);
1964 #ifdef MEM_DEL_STATS
1965 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1966 					    start_pgrp;
1967 #endif /* MEM_DEL_STATS */
1968 					MDSTAT_PGRP(mhp, ntick_pgrp);
1969 					mutex_enter(&mhp->mh_mutex);
1970 					continue;
1971 				}
1972 
1973 				/*
1974 				 * Regular 'page-out'.
1975 				 */
1976 				if (!mod) {
1977 					MDSTAT_INCR(mhp, ndestroy);
1978 					page_destroy(pp, 1);
1979 					/*
1980 					 * page_destroy was called with
1981 					 * dontfree. As long as p_lckcnt
1982 					 * and p_cowcnt are both zero, the
1983 					 * only additional action of
1984 					 * page_destroy with !dontfree is to
1985 					 * call page_free, so we can collect
1986 					 * the page here.
1987 					 */
1988 					collected++;
1989 #ifdef MEM_DEL_STATS
1990 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1991 					    start_pgrp;
1992 #endif /* MEM_DEL_STATS */
1993 					MDSTAT_PGRP(mhp, ntick_pgrp);
1994 					mutex_enter(&mhp->mh_mutex);
1995 					page_delete_collect(pp, mhp);
1996 					mdsp->mds_bitmap[bit / NBPBMW] |=
1997 					    (1 << (bit % NBPBMW));
1998 					continue;
1999 				}
2000 				/*
2001 				 * The page is toxic and the mod bit is
2002 				 * set, we cannot do anything here to deal
2003 				 * with it.
2004 				 */
2005 				if (PP_TOXIC(pp)) {
2006 					page_unlock(pp);
2007 #ifdef MEM_DEL_STATS
2008 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2009 					    start_pgrp;
2010 #endif /* MEM_DEL_STATS */
2011 					MDSTAT_PGRP(mhp, ntick_pgrp);
2012 					MDSTAT_INCR(mhp, modtoxic);
2013 					mutex_enter(&mhp->mh_mutex);
2014 					continue;
2015 				}
2016 				MDSTAT_INCR(mhp, nputpage);
2017 				vp = pp->p_vnode;
2018 				offset = pp->p_offset;
2019 				VN_HOLD(vp);
2020 				page_unlock(pp);
2021 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2022 				    B_INVAL|B_FORCE, kcred);
2023 				VN_RELE(vp);
2024 #ifdef MEM_DEL_STATS
2025 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2026 				    start_pgrp;
2027 #endif /* MEM_DEL_STATS */
2028 				MDSTAT_PGRP(mhp, ntick_pgrp);
2029 				/*
2030 				 * Try to get the page back immediately
2031 				 * so that it can be collected.
2032 				 */
2033 				pp = page_numtopp_nolock(pfn);
2034 				if (pp == NULL) {
2035 					MDSTAT_INCR(mhp, nnoreclaim);
2036 					/*
2037 					 * This should not happen as this
2038 					 * thread is deleting the page.
2039 					 * If this code is generalized, this
2040 					 * becomes a reality.
2041 					 */
2042 #ifdef DEBUG
2043 					cmn_err(CE_WARN,
2044 					    "delete_memory_thread(0x%p) "
2045 					    "pfn 0x%lx has no page_t",
2046 					    (void *)mhp, pfn);
2047 #endif /* DEBUG */
2048 					mutex_enter(&mhp->mh_mutex);
2049 					continue;
2050 				}
2051 				if (page_try_reclaim_lock(pp, SE_EXCL,
2052 				    SE_EXCL_WANTED | SE_RETIRED)) {
2053 					if (PP_ISFREE(pp)) {
2054 						goto free_page_collect;
2055 					}
2056 					page_unlock(pp);
2057 				}
2058 				MDSTAT_INCR(mhp, nnoreclaim);
2059 				mutex_enter(&mhp->mh_mutex);
2060 				continue;
2061 
2062 			reloc:
2063 				/*
2064 				 * Got some freemem and a target
2065 				 * page, so move the data to avoid
2066 				 * I/O and lock problems.
2067 				 */
2068 				ASSERT(!page_iolock_assert(pp));
2069 				MDSTAT_INCR(mhp, nreloc);
2070 				/*
2071 				 * page_relocate() will return pgcnt: the
2072 				 * number of consecutive pages relocated.
2073 				 * If it is successful, pp will be a
2074 				 * linked list of the page structs that
2075 				 * were relocated. If page_relocate() is
2076 				 * unsuccessful, pp will be unmodified.
2077 				 */
2078 #ifdef MEM_DEL_STATS
2079 				start_pgrp = ddi_get_lbolt();
2080 #endif /* MEM_DEL_STATS */
2081 				result = page_relocate(&pp, &pp_targ, 0, 0,
2082 				    &pgcnt, NULL);
2083 #ifdef MEM_DEL_STATS
2084 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2085 				    start_pgrp;
2086 #endif /* MEM_DEL_STATS */
2087 				MDSTAT_PGRP(mhp, ntick_pgrp);
2088 				if (result != 0) {
2089 					MDSTAT_INCR(mhp, nrelocfail);
2090 					/*
2091 					 * We did not succeed. We need
2092 					 * to give the pp_targ pages back.
2093 					 * page_free(pp_targ, 1) without
2094 					 * the freemem accounting.
2095 					 */
2096 					group_page_unlock(pp);
2097 					page_free_replacement_page(pp_targ);
2098 					page_unlock(pp);
2099 					mutex_enter(&mhp->mh_mutex);
2100 					continue;
2101 				}
2102 
2103 				/*
2104 				 * We will then collect pgcnt pages.
2105 				 */
2106 				ASSERT(pgcnt > 0);
2107 				mutex_enter(&mhp->mh_mutex);
2108 				/*
2109 				 * We need to make sure freemem_left is
2110 				 * large enough.
2111 				 */
2112 				while ((freemem_left < pgcnt) &&
2113 					(!mhp->mh_cancel)) {
2114 					freemem_left +=
2115 						delthr_get_freemem(mhp);
2116 				}
2117 
2118 				/*
2119 				 * Do not proceed if mh_cancel is set.
2120 				 */
2121 				if (mhp->mh_cancel) {
2122 					while (pp_targ != NULL) {
2123 						/*
2124 						 * Unlink and unlock each page.
2125 						 */
2126 						tpp_targ = pp_targ;
2127 						page_sub(&pp_targ, tpp_targ);
2128 						page_unlock(tpp_targ);
2129 					}
2130 					/*
2131 					 * We need to give the pp pages back.
2132 					 * page_free(pp, 1) without the
2133 					 * freemem accounting.
2134 					 */
2135 					page_free_replacement_page(pp);
2136 					break;
2137 				}
2138 
2139 				/* Now remove pgcnt from freemem_left */
2140 				freemem_left -= pgcnt;
2141 				ASSERT(freemem_left >= 0);
2142 				szc = pp->p_szc;
2143 				while (pp != NULL) {
2144 					/*
2145 					 * pp and pp_targ were passed back as
2146 					 * a linked list of pages.
2147 					 * Unlink and unlock each page.
2148 					 */
2149 					tpp_targ = pp_targ;
2150 					page_sub(&pp_targ, tpp_targ);
2151 					page_unlock(tpp_targ);
2152 					/*
2153 					 * The original page is now free
2154 					 * so remove it from the linked
2155 					 * list and collect it.
2156 					 */
2157 					tpp = pp;
2158 					page_sub(&pp, tpp);
2159 					pfn = page_pptonum(tpp);
2160 					collected++;
2161 					ASSERT(PAGE_EXCL(tpp));
2162 					ASSERT(tpp->p_vnode == NULL);
2163 					ASSERT(!hat_page_is_mapped(tpp));
2164 					ASSERT(tpp->p_szc == szc);
2165 					tpp->p_szc = 0;
2166 					page_delete_collect(tpp, mhp);
2167 					bit = pfn - mdsp->mds_base;
2168 					mdsp->mds_bitmap[bit / NBPBMW] |=
2169 					(1 << (bit % NBPBMW));
2170 				}
2171 				ASSERT(pp_targ == NULL);
2172 			}
2173 		}
2174 		first_scan = 0;
2175 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2176 			(collected == 0)) {
2177 			/*
2178 			 * This code is needed as we cannot wait
2179 			 * for a page to be locked OR the delete to
2180 			 * be cancelled.  Also, we must delay so
2181 			 * that other threads get a chance to run
2182 			 * on our cpu, otherwise page locks may be
2183 			 * held indefinitely by those threads.
2184 			 */
2185 			MDSTAT_INCR(mhp, ndelay);
2186 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2187 			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
2188 			    (lbolt + DEL_BUSY_WAIT_TICKS));
2189 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2190 		}
2191 	}
2192 	/* stop the dr aio cleanup thread */
2193 	mhp->mh_dr_aio_cleanup_cancel = 1;
2194 	transit_list_collect(mhp, 0);
2195 	if (freemem_left != 0) {
2196 		/* Return any surplus. */
2197 		page_create_putback(freemem_left);
2198 		freemem_left = 0;
2199 	}
2200 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2201 	    mdsp = mdsp->mds_next) {
2202 		mem_node_post_del_slice(mdsp->mds_base,
2203 				mdsp->mds_base + mdsp->mds_npgs - 1,
2204 				(mhp->mh_cancel != 0));
2205 	}
2206 #ifdef MEM_DEL_STATS
2207 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2208 #endif /* MEM_DEL_STATS */
2209 	MDSTAT_TOTAL(mhp, ntick_total);
2210 	MDSTAT_PRINT(mhp);
2211 
2212 	/*
2213 	 * If the memory delete was cancelled, exclusive-wanted bits must
2214 	 * be cleared. If there are retired pages being deleted, they need
2215 	 * to be unretired.
2216 	 */
2217 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2218 	    mdsp = mdsp->mds_next) {
2219 		pfn_t pfn, p_end;
2220 
2221 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2222 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2223 			page_t *pp;
2224 			pgcnt_t bit;
2225 
2226 			bit = pfn - mdsp->mds_base;
2227 			if (mhp->mh_cancel) {
2228 				pp = page_numtopp_nolock(pfn);
2229 				if (pp != NULL) {
2230 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2231 					    (1 << (bit % NBPBMW))) == 0) {
2232 						page_lock_clr_exclwanted(pp);
2233 					}
2234 				}
2235 			} else {
2236 				pp = NULL;
2237 			}
2238 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2239 			    (1 << (bit % NBPBMW))) != 0) {
2240 				/* do we already have pp? */
2241 				if (pp == NULL) {
2242 					pp = page_numtopp_nolock(pfn);
2243 				}
2244 				ASSERT(pp != NULL);
2245 				ASSERT(PP_RETIRED(pp));
2246 				if (mhp->mh_cancel != 0) {
2247 					page_unlock(pp);
2248 					/*
2249 					 * To satisfy ASSERT below in
2250 					 * cancel code.
2251 					 */
2252 					mhp->mh_hold_todo++;
2253 				} else {
2254 					(void) page_unretire_pp(pp, 0);
2255 				}
2256 			}
2257 		}
2258 	}
2259 	/*
2260 	 * Free retired page bitmap and collected page bitmap
2261 	 */
2262 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2263 	    mdsp = mdsp->mds_next) {
2264 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2265 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2266 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2267 		ASSERT(mdsp->mds_bitmap != NULL);
2268 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2269 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2270 	}
2271 
2272 	/* wait for our dr aio cancel thread to exit */
2273 	while (!(mhp->mh_aio_cleanup_done)) {
2274 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2275 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2276 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2277 	}
2278 refused:
2279 	if (mhp->mh_cancel != 0) {
2280 		page_t *pp;
2281 
2282 		comp_code = mhp->mh_cancel;
2283 		/*
2284 		 * Go through list of deleted pages (mh_deleted) freeing
2285 		 * them.
2286 		 */
2287 		while ((pp = mhp->mh_deleted) != NULL) {
2288 			mhp->mh_deleted = pp->p_next;
2289 			mhp->mh_hold_todo++;
2290 			mutex_exit(&mhp->mh_mutex);
2291 			/* Restore p_next. */
2292 			pp->p_next = pp->p_prev;
2293 			if (PP_ISFREE(pp)) {
2294 				cmn_err(CE_PANIC,
2295 				    "page %p is free",
2296 				    (void *)pp);
2297 			}
2298 			page_free(pp, 1);
2299 			mutex_enter(&mhp->mh_mutex);
2300 		}
2301 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2302 
2303 		mutex_exit(&mhp->mh_mutex);
2304 		put_availrmem(mhp->mh_vm_pages);
2305 		mutex_enter(&mhp->mh_mutex);
2306 
2307 		goto t_exit;
2308 	}
2309 
2310 	/*
2311 	 * All the pages are no longer in use and are exclusively locked.
2312 	 */
2313 
2314 	mhp->mh_deleted = NULL;
2315 
2316 	kphysm_del_cleanup(mhp);
2317 
2318 	comp_code = KPHYSM_OK;
2319 
2320 t_exit:
2321 	mutex_exit(&mhp->mh_mutex);
2322 	kphysm_setup_post_del(mhp->mh_vm_pages,
2323 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2324 	mutex_enter(&mhp->mh_mutex);
2325 
2326 early_exit:
2327 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2328 	mhp->mh_state = MHND_DONE;
2329 	del_complete_funcp = mhp->mh_delete_complete;
2330 	del_complete_arg = mhp->mh_delete_complete_arg;
2331 	CALLB_CPR_EXIT(&cprinfo);
2332 	(*del_complete_funcp)(del_complete_arg, comp_code);
2333 	thread_exit();
2334 	/*NOTREACHED*/
2335 }
2336 
2337 /*
2338  * Start the delete of the memory from the system.
2339  */
2340 int
2341 kphysm_del_start(
2342 	memhandle_t handle,
2343 	void (*complete)(void *, int),
2344 	void *complete_arg)
2345 {
2346 	struct mem_handle *mhp;
2347 
2348 	mhp = kphysm_lookup_mem_handle(handle);
2349 	if (mhp == NULL) {
2350 		return (KPHYSM_EHANDLE);
2351 	}
2352 	switch (mhp->mh_state) {
2353 	case MHND_FREE:
2354 		ASSERT(mhp->mh_state != MHND_FREE);
2355 		mutex_exit(&mhp->mh_mutex);
2356 		return (KPHYSM_EHANDLE);
2357 	case MHND_INIT:
2358 		break;
2359 	case MHND_STARTING:
2360 	case MHND_RUNNING:
2361 		mutex_exit(&mhp->mh_mutex);
2362 		return (KPHYSM_ESEQUENCE);
2363 	case MHND_DONE:
2364 		mutex_exit(&mhp->mh_mutex);
2365 		return (KPHYSM_ESEQUENCE);
2366 	case MHND_RELEASE:
2367 		mutex_exit(&mhp->mh_mutex);
2368 		return (KPHYSM_ESEQUENCE);
2369 	default:
2370 #ifdef DEBUG
2371 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2372 		    (void *)mhp, mhp->mh_state);
2373 #endif /* DEBUG */
2374 		mutex_exit(&mhp->mh_mutex);
2375 		return (KPHYSM_EHANDLE);
2376 	}
2377 
2378 	if (mhp->mh_transit.trl_spans == NULL) {
2379 		mutex_exit(&mhp->mh_mutex);
2380 		return (KPHYSM_ENOWORK);
2381 	}
2382 
2383 	ASSERT(complete != NULL);
2384 	mhp->mh_delete_complete = complete;
2385 	mhp->mh_delete_complete_arg = complete_arg;
2386 	mhp->mh_state = MHND_STARTING;
2387 	/*
2388 	 * Release the mutex in case thread_create sleeps.
2389 	 */
2390 	mutex_exit(&mhp->mh_mutex);
2391 
2392 	/*
2393 	 * The "obvious" process for this thread is pageout (proc_pageout)
2394 	 * but this gives the thread too much power over freemem
2395 	 * which results in freemem starvation.
2396 	 */
2397 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2398 	    TS_RUN, maxclsyspri - 1);
2399 
2400 	return (KPHYSM_OK);
2401 }
2402 
2403 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2404 static caddr_t pp_dummy;
2405 static pgcnt_t pp_dummy_npages;
2406 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2407 
2408 static void
2409 memseg_remap_init_pages(page_t *pages, page_t *epages)
2410 {
2411 	page_t *pp;
2412 
2413 	for (pp = pages; pp < epages; pp++) {
2414 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2415 		pp->p_offset = (u_offset_t)-1;
2416 		page_iolock_init(pp);
2417 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2418 			continue;
2419 		page_lock_delete(pp);
2420 	}
2421 }
2422 
2423 void
2424 memseg_remap_init()
2425 {
2426 	mutex_enter(&pp_dummy_lock);
2427 	if (pp_dummy == NULL) {
2428 		uint_t dpages;
2429 		int i;
2430 
2431 		/*
2432 		 * dpages starts off as the size of the structure and
2433 		 * ends up as the minimum number of pages that will
2434 		 * hold a whole number of page_t structures.
2435 		 */
2436 		dpages = sizeof (page_t);
2437 		ASSERT(dpages != 0);
2438 		ASSERT(dpages <= MMU_PAGESIZE);
2439 
2440 		while ((dpages & 1) == 0)
2441 			dpages >>= 1;
2442 
2443 		pp_dummy_npages = dpages;
2444 		/*
2445 		 * Allocate pp_dummy pages directly from static_arena,
2446 		 * since these are whole page allocations and are
2447 		 * referenced by physical address.  This also has the
2448 		 * nice fringe benefit of hiding the memory from
2449 		 * ::findleaks since it doesn't deal well with allocated
2450 		 * kernel heap memory that doesn't have any mappings.
2451 		 */
2452 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2453 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2454 		bzero(pp_dummy, ptob(pp_dummy_npages));
2455 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2456 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2457 		    pp_dummy_npages, KM_SLEEP);
2458 		for (i = 0; i < pp_dummy_npages; i++) {
2459 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2460 			    &pp_dummy[MMU_PAGESIZE * i]);
2461 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2462 		}
2463 		/*
2464 		 * Initialize the page_t's to a known 'deleted' state
2465 		 * that matches the state of deleted pages.
2466 		 */
2467 		memseg_remap_init_pages((page_t *)pp_dummy,
2468 					(page_t *)(pp_dummy +
2469 					    ptob(pp_dummy_npages)));
2470 		/* Remove kmem mappings for the pages for safety. */
2471 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2472 		    HAT_UNLOAD_UNLOCK);
2473 		/* Leave pp_dummy pointer set as flag that init is done. */
2474 	}
2475 	mutex_exit(&pp_dummy_lock);
2476 }
2477 
2478 static void
2479 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
2480 {
2481 	ASSERT(pp_dummy != NULL);
2482 
2483 	while (metapgs != 0) {
2484 		pgcnt_t n;
2485 		int i;
2486 
2487 		n = pp_dummy_npages;
2488 		if (n > metapgs)
2489 			n = metapgs;
2490 		for (i = 0; i < n; i++) {
2491 			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
2492 			    PROT_READ,
2493 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2494 			    HAT_LOAD_REMAP);
2495 			pp += ptob(1);
2496 		}
2497 		metapgs -= n;
2498 	}
2499 }
2500 
2501 /*
2502  * Transition all the deleted pages to the deleted state so that
2503  * page_lock will not wait. The page_lock_delete call will
2504  * also wake up any waiters.
2505  */
2506 static void
2507 memseg_lock_delete_all(struct memseg *seg)
2508 {
2509 	page_t *pp;
2510 
2511 	for (pp = seg->pages; pp < seg->epages; pp++) {
2512 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2513 		page_lock_delete(pp);
2514 	}
2515 }
2516 
2517 static void
2518 kphysm_del_cleanup(struct mem_handle *mhp)
2519 {
2520 	struct memdelspan	*mdsp;
2521 	struct memseg		*seg;
2522 	struct memseg   	**segpp;
2523 	struct memseg		*seglist;
2524 	pfn_t			p_end;
2525 	uint64_t		avmem;
2526 	pgcnt_t			avpgs;
2527 	pgcnt_t			npgs;
2528 
2529 	avpgs = mhp->mh_vm_pages;
2530 
2531 	memsegs_lock(1);
2532 
2533 	/*
2534 	 * remove from main segment list.
2535 	 */
2536 	npgs = 0;
2537 	seglist = NULL;
2538 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2539 	    mdsp = mdsp->mds_next) {
2540 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2541 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2542 			if (seg->pages_base >= p_end ||
2543 			    seg->pages_end <= mdsp->mds_base) {
2544 				/* Span and memseg don't overlap. */
2545 				segpp = &((*segpp)->next);
2546 				continue;
2547 			}
2548 			ASSERT(seg->pages_base >= mdsp->mds_base);
2549 			ASSERT(seg->pages_end <= p_end);
2550 
2551 			PLCNT_MODIFY_MAX(seg->pages_base,
2552 			    seg->pages_base - seg->pages_end);
2553 
2554 			/* Hide the memseg from future scans. */
2555 			hat_kpm_delmem_mseg_update(seg, segpp);
2556 			*segpp = seg->next;
2557 			membar_producer();	/* TODO: Needed? */
2558 			npgs += MSEG_NPAGES(seg);
2559 
2560 			/*
2561 			 * Leave the deleted segment's next pointer intact
2562 			 * in case a memsegs scanning loop is walking this
2563 			 * segment concurrently.
2564 			 */
2565 			seg->lnext = seglist;
2566 			seglist = seg;
2567 		}
2568 	}
2569 
2570 	build_pfn_hash();
2571 
2572 	ASSERT(npgs < total_pages);
2573 	total_pages -= npgs;
2574 
2575 	/*
2576 	 * Recalculate the paging parameters now total_pages has changed.
2577 	 * This will also cause the clock hands to be reset before next use.
2578 	 */
2579 	setupclock(1);
2580 
2581 	memsegs_unlock(1);
2582 
2583 	mutex_exit(&mhp->mh_mutex);
2584 
2585 	while ((seg = seglist) != NULL) {
2586 		pfn_t mseg_start;
2587 		pfn_t mseg_base, mseg_end;
2588 		pgcnt_t mseg_npgs;
2589 		page_t *pp;
2590 		pgcnt_t metapgs;
2591 		int dynamic;
2592 		int mlret;
2593 
2594 		seglist = seg->lnext;
2595 
2596 		/*
2597 		 * Put the page_t's into the deleted state to stop
2598 		 * cv_wait()s on the pages. When we remap, the dummy
2599 		 * page_t's will be in the same state.
2600 		 */
2601 		memseg_lock_delete_all(seg);
2602 		/*
2603 		 * Collect up information based on pages_base and pages_end
2604 		 * early so that we can flag early that the memseg has been
2605 		 * deleted by setting pages_end == pages_base.
2606 		 */
2607 		mseg_base = seg->pages_base;
2608 		mseg_end = seg->pages_end;
2609 		mseg_npgs = MSEG_NPAGES(seg);
2610 		dynamic = memseg_is_dynamic(seg, &mseg_start);
2611 
2612 		seg->pages_end = seg->pages_base;
2613 
2614 		if (dynamic) {
2615 			pp = seg->pages;
2616 			metapgs = mseg_base - mseg_start;
2617 			ASSERT(metapgs != 0);
2618 
2619 			/* Remap the meta data to our special dummy area. */
2620 			memseg_remap_to_dummy((caddr_t)pp, metapgs);
2621 
2622 			mutex_enter(&memseg_lists_lock);
2623 			seg->lnext = memseg_va_avail;
2624 			memseg_va_avail = seg;
2625 			mutex_exit(&memseg_lists_lock);
2626 		} else {
2627 			/*
2628 			 * Set for clean-up below.
2629 			 */
2630 			mseg_start = seg->pages_base;
2631 			/*
2632 			 * For memory whose page_ts were allocated
2633 			 * at boot, we need to find a new use for
2634 			 * the page_t memory.
2635 			 * For the moment, just leak it.
2636 			 * (It is held in the memseg_delete_junk list.)
2637 			 */
2638 
2639 			mutex_enter(&memseg_lists_lock);
2640 			seg->lnext = memseg_delete_junk;
2641 			memseg_delete_junk = seg;
2642 			mutex_exit(&memseg_lists_lock);
2643 		}
2644 
2645 		/* Must not use seg now as it could be re-used. */
2646 
2647 		memlist_write_lock();
2648 
2649 		mlret = memlist_delete_span(
2650 		    (uint64_t)(mseg_base) << PAGESHIFT,
2651 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2652 		    &phys_avail);
2653 		ASSERT(mlret == MEML_SPANOP_OK);
2654 
2655 		mlret = memlist_delete_span(
2656 		    (uint64_t)(mseg_start) << PAGESHIFT,
2657 		    (uint64_t)(mseg_end - mseg_start) <<
2658 		    PAGESHIFT,
2659 		    &phys_install);
2660 		ASSERT(mlret == MEML_SPANOP_OK);
2661 		phys_install_has_changed();
2662 
2663 		memlist_write_unlock();
2664 	}
2665 
2666 	memlist_read_lock();
2667 	installed_top_size(phys_install, &physmax, &physinstalled);
2668 	memlist_read_unlock();
2669 
2670 	mutex_enter(&freemem_lock);
2671 	maxmem -= avpgs;
2672 	physmem -= avpgs;
2673 	/* availrmem is adjusted during the delete. */
2674 	availrmem_initial -= avpgs;
2675 
2676 	mutex_exit(&freemem_lock);
2677 
2678 	dump_resize();
2679 
2680 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2681 	    "(0x%" PRIx64 ")\n",
2682 	    physinstalled << (PAGESHIFT - 10),
2683 	    (uint64_t)physinstalled << PAGESHIFT);
2684 
2685 	avmem = (uint64_t)freemem << PAGESHIFT;
2686 	cmn_err(CE_CONT, "?kphysm_delete: "
2687 	    "avail mem = %" PRId64 "\n", avmem);
2688 
2689 	/*
2690 	 * Update lgroup generation number on single lgroup systems
2691 	 */
2692 	if (nlgrps == 1)
2693 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2694 
2695 	/* Successfully deleted system memory */
2696 	mutex_enter(&mhp->mh_mutex);
2697 }
2698 
2699 static uint_t mdel_nullvp_waiter;
2700 
2701 static void
2702 page_delete_collect(
2703 	page_t *pp,
2704 	struct mem_handle *mhp)
2705 {
2706 	if (pp->p_vnode) {
2707 		page_hashout(pp, (kmutex_t *)NULL);
2708 		/* do not do PP_SETAGED(pp); */
2709 	} else {
2710 		kmutex_t *sep;
2711 
2712 		sep = page_se_mutex(pp);
2713 		mutex_enter(sep);
2714 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2715 			mdel_nullvp_waiter++;
2716 			cv_broadcast(&pp->p_cv);
2717 		}
2718 		mutex_exit(sep);
2719 	}
2720 	ASSERT(pp->p_next == pp->p_prev);
2721 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2722 	pp->p_next = mhp->mh_deleted;
2723 	mhp->mh_deleted = pp;
2724 	ASSERT(mhp->mh_hold_todo != 0);
2725 	mhp->mh_hold_todo--;
2726 }
2727 
2728 static void
2729 transit_list_collect(struct mem_handle *mhp, int v)
2730 {
2731 	struct transit_list_head *trh;
2732 
2733 	trh = &transit_list_head;
2734 	mutex_enter(&trh->trh_lock);
2735 	mhp->mh_transit.trl_collect = v;
2736 	mutex_exit(&trh->trh_lock);
2737 }
2738 
2739 static void
2740 transit_list_insert(struct transit_list *tlp)
2741 {
2742 	struct transit_list_head *trh;
2743 
2744 	trh = &transit_list_head;
2745 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2746 	tlp->trl_next = trh->trh_head;
2747 	trh->trh_head = tlp;
2748 }
2749 
2750 static void
2751 transit_list_remove(struct transit_list *tlp)
2752 {
2753 	struct transit_list_head *trh;
2754 	struct transit_list **tlpp;
2755 
2756 	trh = &transit_list_head;
2757 	tlpp = &trh->trh_head;
2758 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2759 	while (*tlpp != NULL && *tlpp != tlp)
2760 		tlpp = &(*tlpp)->trl_next;
2761 	ASSERT(*tlpp != NULL);
2762 	if (*tlpp == tlp)
2763 		*tlpp = tlp->trl_next;
2764 	tlp->trl_next = NULL;
2765 }
2766 
2767 static struct transit_list *
2768 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2769 {
2770 	struct transit_list *tlp;
2771 
2772 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2773 		struct memdelspan *mdsp;
2774 
2775 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2776 		    mdsp = mdsp->mds_next) {
2777 			if (pfnum >= mdsp->mds_base &&
2778 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2779 				return (tlp);
2780 			}
2781 		}
2782 	}
2783 	return (NULL);
2784 }
2785 
2786 int
2787 pfn_is_being_deleted(pfn_t pfnum)
2788 {
2789 	struct transit_list_head *trh;
2790 	struct transit_list *tlp;
2791 	int ret;
2792 
2793 	trh = &transit_list_head;
2794 	if (trh->trh_head == NULL)
2795 		return (0);
2796 
2797 	mutex_enter(&trh->trh_lock);
2798 	tlp = pfnum_to_transit_list(trh, pfnum);
2799 	ret = (tlp != NULL && tlp->trl_collect);
2800 	mutex_exit(&trh->trh_lock);
2801 
2802 	return (ret);
2803 }
2804 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics held in mhp->mh_delstat, headed by the
 * base and size of the handle's first span.  Nothing is printed unless
 * mem_del_stat_print is set.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	struct memdelspan *first;
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	first = mhp->mh_transit.trl_spans;
	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)first->mds_base,
	    (uint_t)first->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	/* Event counters. */
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick totals, also shown as minutes and seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2858 
2859 struct mem_callback {
2860 	kphysm_setup_vector_t	*vec;
2861 	void			*arg;
2862 };
2863 
2864 #define	NMEMCALLBACKS		100
2865 
2866 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2867 static uint_t nmemcallbacks;
2868 static krwlock_t mem_callback_rwlock;
2869 
2870 int
2871 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2872 {
2873 	uint_t i, found;
2874 
2875 	/*
2876 	 * This test will become more complicated when the version must
2877 	 * change.
2878 	 */
2879 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2880 		return (EINVAL);
2881 
2882 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2883 	    vec->post_del == NULL)
2884 		return (EINVAL);
2885 
2886 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2887 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2888 		if (mem_callbacks[i].vec == NULL && found == 0)
2889 			found = i + 1;
2890 		if (mem_callbacks[i].vec == vec &&
2891 		    mem_callbacks[i].arg == arg) {
2892 #ifdef DEBUG
2893 			/* Catch this in DEBUG kernels. */
2894 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2895 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2896 			    (void *)vec, arg, (void *)caller());
2897 #endif /* DEBUG */
2898 			rw_exit(&mem_callback_rwlock);
2899 			return (EEXIST);
2900 		}
2901 	}
2902 	if (found != 0) {
2903 		i = found - 1;
2904 	} else {
2905 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
2906 		if (nmemcallbacks == NMEMCALLBACKS) {
2907 			rw_exit(&mem_callback_rwlock);
2908 			return (ENOMEM);
2909 		}
2910 		i = nmemcallbacks++;
2911 	}
2912 	mem_callbacks[i].vec = vec;
2913 	mem_callbacks[i].arg = arg;
2914 	rw_exit(&mem_callback_rwlock);
2915 	return (0);
2916 }
2917 
2918 void
2919 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
2920 {
2921 	uint_t i;
2922 
2923 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2924 	for (i = 0; i < nmemcallbacks; i++) {
2925 		if (mem_callbacks[i].vec == vec &&
2926 		    mem_callbacks[i].arg == arg) {
2927 			mem_callbacks[i].vec = NULL;
2928 			mem_callbacks[i].arg = NULL;
2929 			if (i == (nmemcallbacks - 1))
2930 				nmemcallbacks--;
2931 			break;
2932 		}
2933 	}
2934 	rw_exit(&mem_callback_rwlock);
2935 }
2936 
2937 static void
2938 kphysm_setup_post_add(pgcnt_t delta_pages)
2939 {
2940 	uint_t i;
2941 
2942 	rw_enter(&mem_callback_rwlock, RW_READER);
2943 	for (i = 0; i < nmemcallbacks; i++) {
2944 		if (mem_callbacks[i].vec != NULL) {
2945 			(*mem_callbacks[i].vec->post_add)
2946 			    (mem_callbacks[i].arg, delta_pages);
2947 		}
2948 	}
2949 	rw_exit(&mem_callback_rwlock);
2950 }
2951 
2952 /*
2953  * Note the locking between pre_del and post_del: The reader lock is held
2954  * between the two calls to stop the set of functions from changing.
2955  */
2956 
2957 static int
2958 kphysm_setup_pre_del(pgcnt_t delta_pages)
2959 {
2960 	uint_t i;
2961 	int ret;
2962 	int aret;
2963 
2964 	ret = 0;
2965 	rw_enter(&mem_callback_rwlock, RW_READER);
2966 	for (i = 0; i < nmemcallbacks; i++) {
2967 		if (mem_callbacks[i].vec != NULL) {
2968 			aret = (*mem_callbacks[i].vec->pre_del)
2969 			    (mem_callbacks[i].arg, delta_pages);
2970 			ret |= aret;
2971 		}
2972 	}
2973 
2974 	return (ret);
2975 }
2976 
2977 static void
2978 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
2979 {
2980 	uint_t i;
2981 
2982 	for (i = 0; i < nmemcallbacks; i++) {
2983 		if (mem_callbacks[i].vec != NULL) {
2984 			(*mem_callbacks[i].vec->post_del)
2985 			    (mem_callbacks[i].arg, delta_pages, cancelled);
2986 		}
2987 	}
2988 	rw_exit(&mem_callback_rwlock);
2989 }
2990 
2991 static int
2992 kphysm_split_memseg(
2993 	pfn_t base,
2994 	pgcnt_t npgs)
2995 {
2996 	struct memseg *seg;
2997 	struct memseg **segpp;
2998 	pgcnt_t size_low, size_high;
2999 	struct memseg *seg_low, *seg_mid, *seg_high;
3000 
3001 	/*
3002 	 * Lock the memsegs list against other updates now
3003 	 */
3004 	memsegs_lock(1);
3005 
3006 	/*
3007 	 * Find boot time memseg that wholly covers this area.
3008 	 */
3009 
3010 	/* First find the memseg with page 'base' in it. */
3011 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3012 	    segpp = &((*segpp)->next)) {
3013 		if (base >= seg->pages_base && base < seg->pages_end)
3014 			break;
3015 	}
3016 	if (seg == NULL) {
3017 		memsegs_unlock(1);
3018 		return (0);
3019 	}
3020 	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
3021 		memsegs_unlock(1);
3022 		return (0);
3023 	}
3024 	if ((base + npgs) > seg->pages_end) {
3025 		memsegs_unlock(1);
3026 		return (0);
3027 	}
3028 
3029 	/*
3030 	 * Work out the size of the two segments that will
3031 	 * surround the new segment, one for low address
3032 	 * and one for high.
3033 	 */
3034 	ASSERT(base >= seg->pages_base);
3035 	size_low = base - seg->pages_base;
3036 	ASSERT(seg->pages_end >= (base + npgs));
3037 	size_high = seg->pages_end - (base + npgs);
3038 
3039 	/*
3040 	 * Sanity check.
3041 	 */
3042 	if ((size_low + size_high) == 0) {
3043 		memsegs_unlock(1);
3044 		return (0);
3045 	}
3046 
3047 	/*
3048 	 * Allocate the new structures. The old memseg will not be freed
3049 	 * as there may be a reference to it.
3050 	 */
3051 	seg_low = NULL;
3052 	seg_high = NULL;
3053 
3054 	if (size_low != 0) {
3055 		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3056 		bzero(seg_low, sizeof (struct memseg));
3057 	}
3058 
3059 	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3060 	bzero(seg_mid, sizeof (struct memseg));
3061 
3062 	if (size_high != 0) {
3063 		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3064 		bzero(seg_high, sizeof (struct memseg));
3065 	}
3066 
3067 	/*
3068 	 * All allocation done now.
3069 	 */
3070 	if (size_low != 0) {
3071 		seg_low->pages = seg->pages;
3072 		seg_low->epages = seg_low->pages + size_low;
3073 		seg_low->pages_base = seg->pages_base;
3074 		seg_low->pages_end = seg_low->pages_base + size_low;
3075 		seg_low->next = seg_mid;
3076 	}
3077 	if (size_high != 0) {
3078 		seg_high->pages = seg->epages - size_high;
3079 		seg_high->epages = seg_high->pages + size_high;
3080 		seg_high->pages_base = seg->pages_end - size_high;
3081 		seg_high->pages_end = seg_high->pages_base + size_high;
3082 		seg_high->next = seg->next;
3083 	}
3084 
3085 	seg_mid->pages = seg->pages + size_low;
3086 	seg_mid->pages_base = seg->pages_base + size_low;
3087 	seg_mid->epages = seg->epages - size_high;
3088 	seg_mid->pages_end = seg->pages_end - size_high;
3089 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3090 
3091 	/*
3092 	 * Update hat_kpm specific info of all involved memsegs and
3093 	 * allow hat_kpm specific global chain updates.
3094 	 */
3095 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3096 
3097 	/*
3098 	 * At this point we have two equivalent memseg sub-chains,
3099 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3100 	 * the same place in the global chain. By re-writing the pointer
3101 	 * in the previous element we switch atomically from using the old
3102 	 * (seg) to the new.
3103 	 */
3104 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3105 
3106 	membar_enter();
3107 
3108 	build_pfn_hash();
3109 	memsegs_unlock(1);
3110 
3111 	/*
3112 	 * We leave the old segment, 'seg', intact as there may be
3113 	 * references to it. Also, as the value of total_pages has not
3114 	 * changed and the memsegs list is effectively the same when
3115 	 * accessed via the old or the new pointer, we do not have to
3116 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3117 	 *
3118 	 * We currently do not re-use or reclaim the page_t memory.
3119 	 * If we do, then this may have to change.
3120 	 */
3121 
3122 	mutex_enter(&memseg_lists_lock);
3123 	seg->lnext = memseg_edit_junk;
3124 	memseg_edit_junk = seg;
3125 	mutex_exit(&memseg_lists_lock);
3126 
3127 	return (1);
3128 }
3129 
3130 /*
3131  * The memsegs lock is only taken when modifying the memsegs list
3132  * and rebuilding the pfn hash table (after boot).
3133  * No lock is needed for read as memseg structure are never de-allocated
3134  * and the pointer linkage is never updated until the memseg is ready.
3135  */
3136 krwlock_t memsegslock;
3137 
3138 void
3139 memsegs_lock(int writer)
3140 {
3141 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
3142 }
3143 
3144 /*ARGSUSED*/
3145 void
3146 memsegs_unlock(int writer)
3147 {
3148 	rw_exit(&memsegslock);
3149 }
3150 
3151 /*
3152  * memlist (phys_install, phys_avail) locking.
3153  */
3154 
3155 /*
3156  * A read/write lock might be better here.
3157  */
3158 static kmutex_t memlists_mutex;
3159 
3160 void
3161 memlist_read_lock()
3162 {
3163 	mutex_enter(&memlists_mutex);
3164 }
3165 
3166 void
3167 memlist_read_unlock()
3168 {
3169 	mutex_exit(&memlists_mutex);
3170 }
3171 
3172 void
3173 memlist_write_lock()
3174 {
3175 	mutex_enter(&memlists_mutex);
3176 }
3177 
3178 void
3179 memlist_write_unlock()
3180 {
3181 	mutex_exit(&memlists_mutex);
3182 }
3183 
3184 /*
3185  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3186  * structure using physical addresses. Therefore a kmem_cache is
3187  * used with KMC_NOHASH to avoid page crossings within a memseg
3188  * structure. KMC_NOHASH requires that no external (outside of
3189  * slab) information is allowed. This, in turn, implies that the
3190  * cache's slabsize must be exactly a single page, since per-slab
3191  * information (e.g. the freelist for the slab) is kept at the
3192  * end of the slab, where it is easy to locate. Should be changed
3193  * when a more obvious kmem_cache interface/flag will become
3194  * available.
3195  */
3196 void
3197 mem_config_init()
3198 {
3199 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3200 		0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3201 }
3202