xref: /titanic_50/usr/src/uts/common/os/mem_config.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/cmn_err.h>
31 #include <sys/vmem.h>
32 #include <sys/kmem.h>
33 #include <sys/systm.h>
34 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
35 #include <sys/errno.h>
36 #include <sys/memnode.h>
37 #include <sys/memlist.h>
38 #include <sys/memlist_impl.h>
39 #include <sys/tuneable.h>
40 #include <sys/proc.h>
41 #include <sys/disp.h>
42 #include <sys/debug.h>
43 #include <sys/vm.h>
44 #include <sys/callb.h>
45 #include <sys/memlist_plat.h>	/* for installed_top_size() */
46 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
47 #include <sys/dumphdr.h>	/* for dump_resize() */
48 #include <sys/atomic.h>		/* for use in stats collection */
49 #include <sys/rwlock.h>
50 #include <sys/cpuvar.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_kpm.h>
53 #include <vm/page.h>
54 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
55 #include <sys/sunddi.h>
56 #include <sys/mem_config.h>
57 #include <sys/mem_cage.h>
58 #include <sys/lgrp.h>
59 #include <sys/ddi.h>
60 #include <sys/modctl.h>
61 
/*
 * Memlist lock routines. In this file the write lock is taken around
 * memlist_add_span()/memlist_delete_span() on phys_install and phys_avail,
 * and the read lock around the phys_install walk in span_to_install().
 */
62 extern void memlist_read_lock(void);
63 extern void memlist_read_unlock(void);
64 extern void memlist_write_lock(void);
65 extern void memlist_write_unlock(void);
66 
/* Updated by kphysm_add_memory_dynamic() once the new pages are usable. */
67 extern struct memlist *phys_avail;
68 
69 extern void mem_node_add(pfn_t, pfn_t);
70 extern void mem_node_del(pfn_t, pfn_t);
71 
/* A non-zero return is treated as failure by kphysm_add_memory_dynamic(). */
72 extern uint_t page_ctrs_adjust(int);
73 static void kphysm_setup_post_add(pgcnt_t);
74 static int kphysm_setup_pre_del(pgcnt_t);
75 static void kphysm_setup_post_del(pgcnt_t, int);
76 
/* kphysm_del_span() treats a 0 return from this as "split failed". */
77 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
78 
79 static int delspan_reserve(pfn_t, pgcnt_t);
80 static void delspan_unreserve(pfn_t, pgcnt_t);
81 
/* Held by memseg_reuse() while it walks and unlinks from memseg_va_avail. */
82 static kmutex_t memseg_lists_lock;
/* Memsegs (linked through lnext) whose page_t VA range may be re-used. */
83 static struct memseg *memseg_va_avail;
84 static struct memseg *memseg_delete_junk;
85 static struct memseg *memseg_edit_junk;
86 void memseg_remap_init(void);
87 static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
88 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
89 static struct memseg *memseg_reuse(pgcnt_t);
90 
/* Cache from which kphysm_add_memory_dynamic() allocates struct memseg. */
91 static struct kmem_cache *memseg_cache;
92 
93 /*
94  * Add a chunk of memory to the system.  page_t's for this memory
95  * are allocated in the first few pages of the chunk.
96  * base: starting PAGESIZE page of new memory.
97  * npgs: length in PAGESIZE pages.
98  *
99  * Adding mem this way doesn't increase the size of the hash tables;
100  * growing them would be too hard.  This should be OK, but adding memory
101  * dynamically most likely means more hash misses, since the tables will
102  * be smaller than they otherwise would be.
103  */
104 int
105 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
106 {
107 	page_t		*pp;
108 	page_t		*opp, *oepp;
109 	struct memseg	*seg;
110 	uint64_t	avmem;
111 	pfn_t		pfn;
112 	pfn_t		pt_base = base;
113 	pgcnt_t		tpgs = npgs;
114 	pgcnt_t		metapgs;
115 	int		exhausted;
116 	pfn_t		pnum;
117 	int		mnode;
118 	caddr_t		vaddr;
119 	int		reuse;
120 	int		mlret;
121 	void		*mapva;
122 	pgcnt_t		nkpmpgs = 0;
123 	offset_t	kpm_pages_off;
124 
125 	cmn_err(CE_CONT,
126 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
127 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
128 
129 	/*
130 	 * Add this span in the delete list to prevent interactions.
131 	 */
132 	if (!delspan_reserve(base, npgs)) {
133 		return (KPHYSM_ESPAN);
134 	}
135 	/*
136 	 * Check to see if any of the memory span has been added
137 	 * by trying an add to the installed memory list. This
138 	 * forms the interlocking process for add.
139 	 */
140 
141 	memlist_write_lock();
142 
143 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
144 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
145 
146 	if (mlret == MEML_SPANOP_OK)
147 		installed_top_size(phys_install, &physmax, &physinstalled);
148 
149 	memlist_write_unlock();
150 
151 	if (mlret != MEML_SPANOP_OK) {
152 		if (mlret == MEML_SPANOP_EALLOC) {
153 			delspan_unreserve(pt_base, tpgs);
154 			return (KPHYSM_ERESOURCE);
155 		} else
156 		if (mlret == MEML_SPANOP_ESPAN) {
157 			delspan_unreserve(pt_base, tpgs);
158 			return (KPHYSM_ESPAN);
159 		} else {
160 			delspan_unreserve(pt_base, tpgs);
161 			return (KPHYSM_ERESOURCE);
162 		}
163 	}
164 
165 	/*
166 	 * We store the page_t's for this new memory in the first
167 	 * few pages of the chunk. Here, we go and get'em ...
168 	 */
169 
170 	/*
171 	 * The expression after the '-' gives the number of pages
172 	 * that will fit in the new memory based on a requirement
173 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
174 	 */
175 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
176 	    (PAGESIZE + sizeof (page_t)));
177 
178 	npgs -= metapgs;
179 	base += metapgs;
180 
181 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
182 
183 	exhausted = (metapgs == 0 || npgs == 0);
184 
185 	if (kpm_enable && !exhausted) {
186 		pgcnt_t start, end, nkpmpgs_prelim;
187 		size_t	ptsz;
188 
189 		/*
190 		 * A viable kpm large page mapping must not overlap two
191 		 * dynamic memsegs. Therefore the total size is checked
192 		 * to be at least kpm_pgsz and also whether start and end
193 		 * points are at least kpm_pgsz aligned.
194 		 */
195 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
196 		    pmodkpmp(base + npgs)) {
197 
198 			kphysm_addmem_error_undospan(pt_base, tpgs);
199 
200 			/*
201 			 * There is no specific error code for violating
202 			 * kpm granularity constraints.
203 			 */
204 			return (KPHYSM_ENOTVIABLE);
205 		}
206 
207 		start = kpmptop(ptokpmp(base));
208 		end = kpmptop(ptokpmp(base + npgs));
209 		nkpmpgs_prelim = ptokpmp(end - start);
210 		ptsz = npgs * sizeof (page_t);
211 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
212 		exhausted = (tpgs <= metapgs);
213 		if (!exhausted) {
214 			npgs = tpgs - metapgs;
215 			base = pt_base + metapgs;
216 
217 			/* final nkpmpgs */
218 			start = kpmptop(ptokpmp(base));
219 			nkpmpgs = ptokpmp(end - start);
220 			kpm_pages_off = ptsz +
221 				(nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
222 		}
223 	}
224 
225 	/*
226 	 * Is memory area supplied too small?
227 	 */
228 	if (exhausted) {
229 		kphysm_addmem_error_undospan(pt_base, tpgs);
230 
231 		/*
232 		 * There is no specific error code for 'too small'.
233 		 */
234 		return (KPHYSM_ERESOURCE);
235 	}
236 
237 	/*
238 	 * We may re-use a previously allocated VA space for the page_ts
239 	 * eventually, but we need to initialize and lock the pages first.
240 	 */
241 
242 	/*
243 	 * Get an address in the kernel address map, map
244 	 * the page_t pages and see if we can touch them.
245 	 */
246 
247 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
248 	if (mapva == NULL) {
249 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
250 		    " Can't allocate VA for page_ts");
251 
252 		kphysm_addmem_error_undospan(pt_base, tpgs);
253 
254 		return (KPHYSM_ERESOURCE);
255 	}
256 	pp = mapva;
257 
258 	if (physmax < (pt_base + tpgs))
259 		physmax = (pt_base + tpgs);
260 
261 	/*
262 	 * In the remapping code we map one page at a time so we must do
263 	 * the same here to match mapping sizes.
264 	 */
265 	pfn = pt_base;
266 	vaddr = (caddr_t)pp;
267 	for (pnum = 0; pnum < metapgs; pnum++) {
268 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
269 		    PROT_READ | PROT_WRITE,
270 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
271 		pfn++;
272 		vaddr += ptob(1);
273 	}
274 
275 	if (ddi_peek32((dev_info_t *)NULL,
276 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
277 
278 		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
279 		    " Can't access pp array at 0x%p [phys 0x%lx]",
280 		    (void *)pp, pt_base);
281 
282 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
283 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
284 
285 		vmem_free(heap_arena, mapva, ptob(metapgs));
286 
287 		kphysm_addmem_error_undospan(pt_base, tpgs);
288 
289 		return (KPHYSM_EFAULT);
290 	}
291 
292 	/*
293 	 * Add this memory slice to its memory node translation.
294 	 *
295 	 * Note that right now, each node may have only one slice;
296 	 * this may change with COD or in larger SSM systems with
297 	 * nested latency groups, so we must not assume that the
298 	 * node does not yet exist.
299 	 */
300 	pnum = base + npgs - 1;
301 	mem_node_add_slice(base, pnum);
302 
303 	/*
304 	 * Allocate or resize page counters as necessary to accomodate
305 	 * the increase in memory pages.
306 	 */
307 	mnode = PFN_2_MEM_NODE(pnum);
308 	if (page_ctrs_adjust(mnode) != 0) {
309 
310 		mem_node_pre_del_slice(base, pnum);
311 		mem_node_post_del_slice(base, pnum, 0);
312 
313 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
314 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
315 
316 		vmem_free(heap_arena, mapva, ptob(metapgs));
317 
318 		kphysm_addmem_error_undospan(pt_base, tpgs);
319 
320 		return (KPHYSM_ERESOURCE);
321 	}
322 
323 	/*
324 	 * Update the phys_avail memory list.
325 	 * The phys_install list was done at the start.
326 	 */
327 
328 	memlist_write_lock();
329 
330 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
331 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
332 	ASSERT(mlret == MEML_SPANOP_OK);
333 
334 	memlist_write_unlock();
335 
336 	/* See if we can find a memseg to re-use. */
337 	seg = memseg_reuse(metapgs);
338 
339 	reuse = (seg != NULL);
340 
341 	/*
342 	 * Initialize the memseg structure representing this memory
343 	 * and add it to the existing list of memsegs. Do some basic
344 	 * initialization and add the memory to the system.
345 	 * In order to prevent lock deadlocks, the add_physmem()
346 	 * code is repeated here, but split into several stages.
347 	 */
348 	if (seg == NULL) {
349 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
350 		bzero(seg, sizeof (struct memseg));
351 		seg->msegflags = MEMSEG_DYNAMIC;
352 		seg->pages = pp;
353 	} else {
354 		/*EMPTY*/
355 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
356 	}
357 
358 	seg->epages = seg->pages + npgs;
359 	seg->pages_base = base;
360 	seg->pages_end = base + npgs;
361 
362 	/*
363 	 * Initialize metadata. The page_ts are set to locked state
364 	 * ready to be freed.
365 	 */
366 	bzero((caddr_t)pp, ptob(metapgs));
367 
368 	pfn = seg->pages_base;
369 	/* Save the original pp base in case we reuse a memseg. */
370 	opp = pp;
371 	oepp = opp + npgs;
372 	for (pp = opp; pp < oepp; pp++) {
373 		pp->p_pagenum = pfn;
374 		pfn++;
375 		page_iolock_init(pp);
376 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
377 			continue;
378 		pp->p_offset = (u_offset_t)-1;
379 	}
380 
381 	if (reuse) {
382 		/* Remap our page_ts to the re-used memseg VA space. */
383 		pfn = pt_base;
384 		vaddr = (caddr_t)seg->pages;
385 		for (pnum = 0; pnum < metapgs; pnum++) {
386 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
387 			    PROT_READ | PROT_WRITE,
388 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
389 			pfn++;
390 			vaddr += ptob(1);
391 		}
392 
393 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
394 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
395 
396 		vmem_free(heap_arena, mapva, ptob(metapgs));
397 	}
398 
399 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
400 
401 	memsegs_lock(1);
402 
403 	/*
404 	 * The new memseg is inserted at the beginning of the list.
405 	 * Not only does this save searching for the tail, but in the
406 	 * case of a re-used memseg, it solves the problem of what
407 	 * happens of some process has still got a pointer to the
408 	 * memseg and follows the next pointer to continue traversing
409 	 * the memsegs list.
410 	 */
411 
412 	hat_kpm_addmem_mseg_insert(seg);
413 
414 	seg->next = memsegs;
415 	membar_producer();
416 
417 	hat_kpm_addmem_memsegs_update(seg);
418 
419 	memsegs = seg;
420 
421 	build_pfn_hash();
422 
423 	total_pages += npgs;
424 
425 	/*
426 	 * Recalculate the paging parameters now total_pages has changed.
427 	 * This will also cause the clock hands to be reset before next use.
428 	 */
429 	setupclock(1);
430 
431 	memsegs_unlock(1);
432 
433 	/*
434 	 * Free the pages outside the lock to avoid locking loops.
435 	 */
436 	for (pp = seg->pages; pp < seg->epages; pp++) {
437 		page_free(pp, 1);
438 	}
439 
440 	/*
441 	 * Now that we've updated the appropriate memory lists we
442 	 * need to reset a number of globals, since we've increased memory.
443 	 * Several have already been updated for us as noted above. The
444 	 * globals we're interested in at this point are:
445 	 *   physmax - highest page frame number.
446 	 *   physinstalled - number of pages currently installed (done earlier)
447 	 *   maxmem - max free pages in the system
448 	 *   physmem - physical memory pages available
449 	 *   availrmem - real memory available
450 	 */
451 
452 	mutex_enter(&freemem_lock);
453 	maxmem += npgs;
454 	physmem += npgs;
455 	availrmem += npgs;
456 	availrmem_initial += npgs;
457 
458 	mutex_exit(&freemem_lock);
459 
460 	dump_resize();
461 
462 	page_freelist_coalesce_all(mnode);
463 
464 	kphysm_setup_post_add(npgs);
465 
466 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
467 	    "(0x%" PRIx64 ")\n",
468 	    physinstalled << (PAGESHIFT - 10),
469 	    (uint64_t)physinstalled << PAGESHIFT);
470 
471 	avmem = (uint64_t)freemem << PAGESHIFT;
472 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
473 	    "avail mem = %" PRId64 "\n", avmem);
474 
475 	/*
476 	 * Update lgroup generation number on single lgroup systems
477 	 */
478 	if (nlgrps == 1)
479 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
480 
481 	delspan_unreserve(pt_base, tpgs);
482 	return (KPHYSM_OK);		/* Successfully added system memory */
483 
484 }
485 
486 /*
487  * There are various error conditions in kphysm_add_memory_dynamic()
488  * which require a rollback of already changed global state.
489  */
490 static void
491 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
492 {
493 	int mlret;
494 
495 	/* Unreserve memory span. */
496 	memlist_write_lock();
497 
498 	mlret = memlist_delete_span(
499 	    (uint64_t)(pt_base) << PAGESHIFT,
500 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
501 
502 	ASSERT(mlret == MEML_SPANOP_OK);
503 	phys_install_has_changed();
504 	installed_top_size(phys_install, &physmax, &physinstalled);
505 
506 	memlist_write_unlock();
507 	delspan_unreserve(pt_base, tpgs);
508 }
509 
510 /*
511  * Only return an available memseg of exactly the right size.
512  * When the meta data area has it's own virtual address space
513  * we will need to manage this more carefully and do best fit
514  * allocations, possibly splitting an availble area.
515  */
516 static struct memseg *
517 memseg_reuse(pgcnt_t metapgs)
518 {
519 	struct memseg **segpp, *seg;
520 
521 	mutex_enter(&memseg_lists_lock);
522 
523 	segpp = &memseg_va_avail;
524 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
525 		caddr_t end;
526 
527 		if (kpm_enable)
528 			end = hat_kpm_mseg_reuse(seg);
529 		else
530 			end = (caddr_t)seg->epages;
531 
532 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
533 			*segpp = seg->lnext;
534 			seg->lnext = NULL;
535 			break;
536 		}
537 	}
538 	mutex_exit(&memseg_lists_lock);
539 
540 	return (seg);
541 }
542 
/*
 * Source of external handle values: pre-incremented under
 * mem_handle_list_mutex by kphysm_allocate_mem_handle().
 */
543 static uint_t handle_gen;
544 
/*
 * One contiguous span of page frames that is the subject of an add or
 * delete.  Spans are chained through mds_next on a struct transit_list.
 */
545 struct memdelspan {
546 	struct memdelspan *mds_next;
547 	pfn_t		mds_base;	/* first pfn of the span */
548 	pgcnt_t		mds_npgs;	/* number of pages in the span */
549 	uint_t		*mds_bitmap;
550 	uint_t		*mds_bitmap_retired;
551 };
552 
/*
 * NBPBMW is the number of bits in one uint_t bitmap word (assuming NBBY is
 * the number of bits per byte).  MDS_BITMAPBYTES() gives the size in bytes
 * of a bitmap with one bit per page of the span, rounded up to a whole
 * number of uint_t words.
 */
553 #define	NBPBMW		(sizeof (uint_t) * NBBY)
554 #define	MDS_BITMAPBYTES(MDSP) \
555 	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
556 
/*
 * A list of spans belonging to one owner (a mem_handle's mh_transit, or
 * reserve_transit for adds in progress).
 */
557 struct transit_list {
558 	struct transit_list	*trl_next;	/* next list on the head */
559 	struct memdelspan	*trl_spans;	/* spans owned by this list */
560 	int			trl_collect;
561 };
562 
/*
 * Head of all transit lists.  delspan_insert() and delspan_remove() hold
 * trh_lock while walking trh_head and while editing any list's trl_spans.
 */
563 struct transit_list_head {
564 	kmutex_t		trh_lock;
565 	struct transit_list	*trh_head;
566 };
567 
568 static struct transit_list_head transit_list_head;
569 
570 struct mem_handle;
571 static void transit_list_collect(struct mem_handle *, int);
/* Both are called from delspan_insert()/delspan_remove() with trh_lock held. */
572 static void transit_list_insert(struct transit_list *);
573 static void transit_list_remove(struct transit_list *);
574 
/*
 * Delete statistics are compiled in only when MEM_DEL_STATS is defined,
 * which happens automatically for DEBUG builds.
 */
575 #ifdef DEBUG
576 #define	MEM_DEL_STATS
577 #endif /* DEBUG */
578 
579 #ifdef MEM_DEL_STATS
580 static int mem_del_stat_print = 0;
/* Counters kept per mem_handle in its mh_delstat member. */
581 struct mem_del_stat {
582 	uint_t	nloop;
583 	uint_t	need_free;
584 	uint_t	free_loop;
585 	uint_t	free_low;
586 	uint_t	free_failed;
587 	uint_t	ncheck;
588 	uint_t	nopaget;
589 	uint_t	lockfail;
590 	uint_t	nfree;
591 	uint_t	nreloc;
592 	uint_t	nrelocfail;
593 	uint_t	already_done;
594 	uint_t	first_notfree;
595 	uint_t	npplocked;
596 	uint_t	nlockreloc;
597 	uint_t	nnorepl;
598 	uint_t	nmodreloc;
599 	uint_t	ndestroy;
600 	uint_t	nputpage;
601 	uint_t	nnoreclaim;
602 	uint_t	ndelay;
603 	uint_t	demotefail;
604 	uint64_t nticks_total;
605 	uint64_t nticks_pgrp;
606 	uint_t	retired;
607 	uint_t	toxic;
608 	uint_t	failing;
609 	uint_t	modtoxic;
610 	uint_t	npplkdtoxic;
611 	uint_t	gptlmodfail;
612 	uint_t	gptllckfail;
613 };
614 /*
615  * The stat values are only incremented in the delete thread
616  * so no locking or atomic required.
617  */
/* MDSTAT_INCR bumps one counter; MDSTAT_TOTAL/MDSTAT_PGRP add tick counts. */
618 #define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
619 #define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
620 #define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
621 static void mem_del_stat_print_func(struct mem_handle *);
622 #define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
623 #else /* MEM_DEL_STATS */
/* Without MEM_DEL_STATS every MDSTAT_* macro expands to nothing. */
624 #define	MDSTAT_INCR(MHP, FLD)
625 #define	MDSTAT_TOTAL(MHP, ntck)
626 #define	MDSTAT_PGRP(MHP, ntck)
627 #define	MDSTAT_PRINT(MHP)
628 #endif /* MEM_DEL_STATS */
629 
/*
 * States of a mem_handle.  MHND_FREE is 0, so a handle obtained from
 * kmem_zalloc() starts out FREE; kphysm_del_gethandle() moves it to
 * MHND_INIT, which is the only state in which kphysm_del_span() accepts it.
 */
630 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
631 	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
632 
633 /*
634  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
635  * The mutex may not be required for other fields, dependent on mh_state.
636  */
637 struct mem_handle {
638 	kmutex_t	mh_mutex;
639 	struct mem_handle *mh_next;	/* link on mem_handle_head list */
640 	memhandle_t	mh_exthandle;	/* value handed out to callers */
641 	mhnd_state_t	mh_state;
642 	struct transit_list mh_transit;	/* spans added via kphysm_del_span() */
643 	pgcnt_t		mh_phys_pages;
644 	pgcnt_t		mh_vm_pages;
645 	pgcnt_t		mh_hold_todo;
646 	void		(*mh_delete_complete)(void *, int error);
647 	void		*mh_delete_complete_arg;
648 	volatile uint_t mh_cancel;
649 	volatile uint_t mh_dr_aio_cleanup_cancel;
650 	volatile uint_t mh_aio_cleanup_done;
651 	kcondvar_t	mh_cv;
652 	kthread_id_t	mh_thread_id;
653 	page_t		*mh_deleted;	/* link through p_next */
654 #ifdef MEM_DEL_STATS
655 	struct mem_del_stat mh_delstat;
656 #endif /* MEM_DEL_STATS */
657 };
658 
/*
 * List of all allocated handles, linked through mh_next.  The list and
 * handle_gen are protected by mem_handle_list_mutex, which is acquired
 * before a handle's mh_mutex when both are needed.
 */
659 static struct mem_handle *mem_handle_head;
660 static kmutex_t mem_handle_list_mutex;
661 
662 static struct mem_handle *
663 kphysm_allocate_mem_handle()
664 {
665 	struct mem_handle *mhp;
666 
667 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
668 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
669 	mutex_enter(&mem_handle_list_mutex);
670 	mutex_enter(&mhp->mh_mutex);
671 	/* handle_gen is protected by list mutex. */
672 	mhp->mh_exthandle = (memhandle_t)(++handle_gen);
673 	mhp->mh_next = mem_handle_head;
674 	mem_handle_head = mhp;
675 	mutex_exit(&mem_handle_list_mutex);
676 
677 	return (mhp);
678 }
679 
680 static void
681 kphysm_free_mem_handle(struct mem_handle *mhp)
682 {
683 	struct mem_handle **mhpp;
684 
685 	ASSERT(mutex_owned(&mhp->mh_mutex));
686 	ASSERT(mhp->mh_state == MHND_FREE);
687 	/*
688 	 * Exit the mutex to preserve locking order. This is OK
689 	 * here as once in the FREE state, the handle cannot
690 	 * be found by a lookup.
691 	 */
692 	mutex_exit(&mhp->mh_mutex);
693 
694 	mutex_enter(&mem_handle_list_mutex);
695 	mhpp = &mem_handle_head;
696 	while (*mhpp != NULL && *mhpp != mhp)
697 		mhpp = &(*mhpp)->mh_next;
698 	ASSERT(*mhpp == mhp);
699 	/*
700 	 * No need to lock the handle (mh_mutex) as only
701 	 * mh_next changing and this is the only thread that
702 	 * can be referncing mhp.
703 	 */
704 	*mhpp = mhp->mh_next;
705 	mutex_exit(&mem_handle_list_mutex);
706 
707 	mutex_destroy(&mhp->mh_mutex);
708 	kmem_free(mhp, sizeof (struct mem_handle));
709 }
710 
711 /*
712  * This function finds the internal mem_handle corresponding to an
713  * external handle and returns it with the mh_mutex held.
714  */
715 static struct mem_handle *
716 kphysm_lookup_mem_handle(memhandle_t handle)
717 {
718 	struct mem_handle *mhp;
719 
720 	mutex_enter(&mem_handle_list_mutex);
721 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
722 		if (mhp->mh_exthandle == handle) {
723 			mutex_enter(&mhp->mh_mutex);
724 			/*
725 			 * The state of the handle could have been changed
726 			 * by kphysm_del_release() while waiting for mh_mutex.
727 			 */
728 			if (mhp->mh_state == MHND_FREE) {
729 				mutex_exit(&mhp->mh_mutex);
730 				continue;
731 			}
732 			break;
733 		}
734 	}
735 	mutex_exit(&mem_handle_list_mutex);
736 	return (mhp);
737 }
738 
739 int
740 kphysm_del_gethandle(memhandle_t *xmhp)
741 {
742 	struct mem_handle *mhp;
743 
744 	mhp = kphysm_allocate_mem_handle();
745 	/*
746 	 * The handle is allocated using KM_SLEEP, so cannot fail.
747 	 * If the implementation is changed, the correct error to return
748 	 * here would be KPHYSM_ENOHANDLES.
749 	 */
750 	ASSERT(mhp->mh_state == MHND_FREE);
751 	mhp->mh_state = MHND_INIT;
752 	*xmhp = mhp->mh_exthandle;
753 	mutex_exit(&mhp->mh_mutex);
754 	return (KPHYSM_OK);
755 }
756 
757 static int
758 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
759 {
760 	pfn_t e1, e2;
761 
762 	e1 = b1 + l1;
763 	e2 = b2 + l2;
764 
765 	return (!(b2 >= e1 || b1 >= e2));
766 }
767 
768 static int can_remove_pgs(pgcnt_t);
769 
770 static struct memdelspan *
771 span_to_install(pfn_t base, pgcnt_t npgs)
772 {
773 	struct memdelspan *mdsp;
774 	struct memdelspan *mdsp_new;
775 	uint64_t address, size, thislen;
776 	struct memlist *mlp;
777 
778 	mdsp_new = NULL;
779 
780 	address = (uint64_t)base << PAGESHIFT;
781 	size = (uint64_t)npgs << PAGESHIFT;
782 	while (size != 0) {
783 		memlist_read_lock();
784 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
785 			if (address >= (mlp->address + mlp->size))
786 				continue;
787 			if ((address + size) > mlp->address)
788 				break;
789 		}
790 		if (mlp == NULL) {
791 			address += size;
792 			size = 0;
793 			thislen = 0;
794 		} else {
795 			if (address < mlp->address) {
796 				size -= (mlp->address - address);
797 				address = mlp->address;
798 			}
799 			ASSERT(address >= mlp->address);
800 			if ((address + size) > (mlp->address + mlp->size)) {
801 				thislen = mlp->size - (address - mlp->address);
802 			} else {
803 				thislen = size;
804 			}
805 		}
806 		memlist_read_unlock();
807 		/* TODO: phys_install could change now */
808 		if (thislen == 0)
809 			continue;
810 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
811 		mdsp->mds_base = btop(address);
812 		mdsp->mds_npgs = btop(thislen);
813 		mdsp->mds_next = mdsp_new;
814 		mdsp_new = mdsp;
815 		address += thislen;
816 		size -= thislen;
817 	}
818 	return (mdsp_new);
819 }
820 
821 static void
822 free_delspans(struct memdelspan *mdsp)
823 {
824 	struct memdelspan *amdsp;
825 
826 	while ((amdsp = mdsp) != NULL) {
827 		mdsp = amdsp->mds_next;
828 		kmem_free(amdsp, sizeof (struct memdelspan));
829 	}
830 }
831 
832 /*
833  * Concatenate lists. No list ordering is required.
834  */
835 
836 static void
837 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
838 {
839 	while (*mdspp != NULL)
840 		mdspp = &(*mdspp)->mds_next;
841 
842 	*mdspp = mdsp;
843 }
844 
845 /*
846  * Given a new list of delspans, check there is no overlap with
847  * all existing span activity (add or delete) and then concatenate
848  * the new spans to the given list.
849  * Return 1 for OK, 0 if overlapping.
850  */
851 static int
852 delspan_insert(
853 	struct transit_list *my_tlp,
854 	struct memdelspan *mdsp_new)
855 {
856 	struct transit_list_head *trh;
857 	struct transit_list *tlp;
858 	int ret;
859 
860 	trh = &transit_list_head;
861 
862 	ASSERT(my_tlp != NULL);
863 	ASSERT(mdsp_new != NULL);
864 
865 	ret = 1;
866 	mutex_enter(&trh->trh_lock);
867 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
868 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
869 		struct memdelspan *mdsp;
870 
871 		for (mdsp = tlp->trl_spans; mdsp != NULL;
872 		    mdsp = mdsp->mds_next) {
873 			struct memdelspan *nmdsp;
874 
875 			for (nmdsp = mdsp_new; nmdsp != NULL;
876 			    nmdsp = nmdsp->mds_next) {
877 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
878 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
879 					ret = 0;
880 					goto done;
881 				}
882 			}
883 		}
884 	}
885 done:
886 	if (ret != 0) {
887 		if (my_tlp->trl_spans == NULL)
888 			transit_list_insert(my_tlp);
889 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
890 	}
891 	mutex_exit(&trh->trh_lock);
892 	return (ret);
893 }
894 
895 static void
896 delspan_remove(
897 	struct transit_list *my_tlp,
898 	pfn_t base,
899 	pgcnt_t npgs)
900 {
901 	struct transit_list_head *trh;
902 	struct memdelspan *mdsp;
903 
904 	trh = &transit_list_head;
905 
906 	ASSERT(my_tlp != NULL);
907 
908 	mutex_enter(&trh->trh_lock);
909 	if ((mdsp = my_tlp->trl_spans) != NULL) {
910 		if (npgs == 0) {
911 			my_tlp->trl_spans = NULL;
912 			free_delspans(mdsp);
913 			transit_list_remove(my_tlp);
914 		} else {
915 			struct memdelspan **prv;
916 
917 			prv = &my_tlp->trl_spans;
918 			while (mdsp != NULL) {
919 				pfn_t p_end;
920 
921 				p_end = mdsp->mds_base + mdsp->mds_npgs;
922 				if (mdsp->mds_base >= base &&
923 				    p_end <= (base + npgs)) {
924 					*prv = mdsp->mds_next;
925 					mdsp->mds_next = NULL;
926 					free_delspans(mdsp);
927 				} else {
928 					prv = &mdsp->mds_next;
929 				}
930 				mdsp = *prv;
931 			}
932 			if (my_tlp->trl_spans == NULL)
933 				transit_list_remove(my_tlp);
934 		}
935 	}
936 	mutex_exit(&trh->trh_lock);
937 }
938 
939 /*
940  * Reserve interface for add to stop delete before add finished.
941  * This list is only accessed through the delspan_insert/remove
942  * functions and so is fully protected by the mutex in struct transit_list.
943  */
944 
945 static struct transit_list reserve_transit;
946 
947 static int
948 delspan_reserve(pfn_t base, pgcnt_t npgs)
949 {
950 	struct memdelspan *mdsp;
951 	int ret;
952 
953 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
954 	mdsp->mds_base = base;
955 	mdsp->mds_npgs = npgs;
956 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
957 		free_delspans(mdsp);
958 	}
959 	return (ret);
960 }
961 
962 static void
963 delspan_unreserve(pfn_t base, pgcnt_t npgs)
964 {
965 	delspan_remove(&reserve_transit, base, npgs);
966 }
967 
968 /*
969  * Return whether memseg was created by kphysm_add_memory_dynamic().
970  * If this is the case and startp non zero, return also the start pfn
971  * of the meta data via startp.
972  */
973 static int
974 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
975 {
976 	pfn_t		pt_start;
977 
978 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
979 		return (0);
980 
981 	/* Meta data is required to be at the beginning */
982 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
983 
984 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
985 	if (startp != NULL)
986 		*startp = pt_start;
987 
988 	return (1);
989 }
990 
991 int
992 kphysm_del_span(
993 	memhandle_t handle,
994 	pfn_t base,
995 	pgcnt_t npgs)
996 {
997 	struct mem_handle *mhp;
998 	struct memseg *seg;
999 	struct memdelspan *mdsp;
1000 	struct memdelspan *mdsp_new;
1001 	pgcnt_t phys_pages, vm_pages;
1002 	pfn_t p_end;
1003 	page_t *pp;
1004 	int ret;
1005 
1006 	mhp = kphysm_lookup_mem_handle(handle);
1007 	if (mhp == NULL) {
1008 		return (KPHYSM_EHANDLE);
1009 	}
1010 	if (mhp->mh_state != MHND_INIT) {
1011 		mutex_exit(&mhp->mh_mutex);
1012 		return (KPHYSM_ESEQUENCE);
1013 	}
1014 
1015 	/*
1016 	 * Intersect the span with the installed memory list (phys_install).
1017 	 */
1018 	mdsp_new = span_to_install(base, npgs);
1019 	if (mdsp_new == NULL) {
1020 		/*
1021 		 * No physical memory in this range. Is this an
1022 		 * error? If an attempt to start the delete is made
1023 		 * for OK returns from del_span such as this, start will
1024 		 * return an error.
1025 		 * Could return KPHYSM_ENOWORK.
1026 		 */
1027 		/*
1028 		 * It is assumed that there are no error returns
1029 		 * from span_to_install() due to kmem_alloc failure.
1030 		 */
1031 		mutex_exit(&mhp->mh_mutex);
1032 		return (KPHYSM_OK);
1033 	}
1034 	/*
1035 	 * Does this span overlap an existing span?
1036 	 */
1037 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1038 		/*
1039 		 * Differentiate between already on list for this handle
1040 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1041 		 */
1042 		ret = KPHYSM_EBUSY;
1043 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1044 		    mdsp = mdsp->mds_next) {
1045 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1046 			    base, npgs)) {
1047 				ret = KPHYSM_EDUP;
1048 				break;
1049 			}
1050 		}
1051 		mutex_exit(&mhp->mh_mutex);
1052 		free_delspans(mdsp_new);
1053 		return (ret);
1054 	}
1055 	/*
1056 	 * At this point the spans in mdsp_new have been inserted into the
1057 	 * list of spans for this handle and thereby to the global list of
1058 	 * spans being processed. Each of these spans must now be checked
1059 	 * for relocatability. As a side-effect segments in the memseg list
1060 	 * may be split.
1061 	 *
1062 	 * Note that mdsp_new can no longer be used as it is now part of
1063 	 * a larger list. Select elements of this larger list based
1064 	 * on base and npgs.
1065 	 */
1066 restart:
1067 	phys_pages = 0;
1068 	vm_pages = 0;
1069 	ret = KPHYSM_OK;
1070 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1071 	    mdsp = mdsp->mds_next) {
1072 		pgcnt_t pages_checked;
1073 
1074 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1075 			continue;
1076 		}
1077 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1078 		/*
1079 		 * The pages_checked count is a hack. All pages should be
1080 		 * checked for relocatability. Those not covered by memsegs
1081 		 * should be tested with arch_kphysm_del_span_ok().
1082 		 */
1083 		pages_checked = 0;
1084 		for (seg = memsegs; seg; seg = seg->next) {
1085 			pfn_t mseg_start;
1086 
1087 			if (seg->pages_base >= p_end ||
1088 			    seg->pages_end <= mdsp->mds_base) {
1089 				/* Span and memseg don't overlap. */
1090 				continue;
1091 			}
1092 			/* Check that segment is suitable for delete. */
1093 			if (memseg_is_dynamic(seg, &mseg_start)) {
1094 				/*
1095 				 * Can only delete whole added segments
1096 				 * for the moment.
1097 				 * Check that this is completely within the
1098 				 * span.
1099 				 */
1100 				if (mseg_start < mdsp->mds_base ||
1101 				    seg->pages_end > p_end) {
1102 					ret = KPHYSM_EBUSY;
1103 					break;
1104 				}
1105 				pages_checked += seg->pages_end - mseg_start;
1106 			} else {
1107 				/*
1108 				 * Set mseg_start for accounting below.
1109 				 */
1110 				mseg_start = seg->pages_base;
1111 				/*
1112 				 * If this segment is larger than the span,
1113 				 * try to split it. After the split, it
1114 				 * is necessary to restart.
1115 				 */
1116 				if (seg->pages_base < mdsp->mds_base ||
1117 				    seg->pages_end > p_end) {
1118 					pfn_t abase;
1119 					pgcnt_t anpgs;
1120 					int s_ret;
1121 
1122 					/* Split required.  */
1123 					if (mdsp->mds_base < seg->pages_base)
1124 						abase = seg->pages_base;
1125 					else
1126 						abase = mdsp->mds_base;
1127 					if (p_end > seg->pages_end)
1128 						anpgs = seg->pages_end - abase;
1129 					else
1130 						anpgs = p_end - abase;
1131 					s_ret = kphysm_split_memseg(abase,
1132 					    anpgs);
1133 					if (s_ret == 0) {
1134 						/* Split failed. */
1135 						ret = KPHYSM_ERESOURCE;
1136 						break;
1137 					}
1138 					goto restart;
1139 				}
1140 				pages_checked +=
1141 				    seg->pages_end - seg->pages_base;
1142 			}
1143 			/*
1144 			 * The memseg is wholly within the delete span.
1145 			 * The individual pages can now be checked.
1146 			 */
1147 			/* Cage test. */
1148 			for (pp = seg->pages; pp < seg->epages; pp++) {
1149 				if (PP_ISNORELOC(pp)) {
1150 					ret = KPHYSM_ENONRELOC;
1151 					break;
1152 				}
1153 			}
1154 			if (ret != KPHYSM_OK) {
1155 				break;
1156 			}
1157 			phys_pages += (seg->pages_end - mseg_start);
1158 			vm_pages += MSEG_NPAGES(seg);
1159 		}
1160 		if (ret != KPHYSM_OK)
1161 			break;
1162 		if (pages_checked != mdsp->mds_npgs) {
1163 			ret = KPHYSM_ENONRELOC;
1164 			break;
1165 		}
1166 	}
1167 
1168 	if (ret == KPHYSM_OK) {
1169 		mhp->mh_phys_pages += phys_pages;
1170 		mhp->mh_vm_pages += vm_pages;
1171 	} else {
1172 		/*
1173 		 * Keep holding the mh_mutex to prevent it going away.
1174 		 */
1175 		delspan_remove(&mhp->mh_transit, base, npgs);
1176 	}
1177 	mutex_exit(&mhp->mh_mutex);
1178 	return (ret);
1179 }
1180 
1181 int
1182 kphysm_del_span_query(
1183 	pfn_t base,
1184 	pgcnt_t npgs,
1185 	memquery_t *mqp)
1186 {
1187 	struct memdelspan *mdsp;
1188 	struct memdelspan *mdsp_new;
1189 	int done_first_nonreloc;
1190 
1191 	mqp->phys_pages = 0;
1192 	mqp->managed = 0;
1193 	mqp->nonrelocatable = 0;
1194 	mqp->first_nonrelocatable = 0;
1195 	mqp->last_nonrelocatable = 0;
1196 
1197 	mdsp_new = span_to_install(base, npgs);
1198 	/*
1199 	 * It is OK to proceed here if mdsp_new == NULL.
1200 	 */
1201 	done_first_nonreloc = 0;
1202 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1203 		pfn_t sbase;
1204 		pgcnt_t snpgs;
1205 
1206 		mqp->phys_pages += mdsp->mds_npgs;
1207 		sbase = mdsp->mds_base;
1208 		snpgs = mdsp->mds_npgs;
1209 		while (snpgs != 0) {
1210 			struct memseg *lseg, *seg;
1211 			pfn_t p_end;
1212 			page_t *pp;
1213 			pfn_t mseg_start;
1214 
1215 			p_end = sbase + snpgs;
1216 			/*
1217 			 * Find the lowest addressed memseg that starts
1218 			 * after sbase and account for it.
1219 			 * This is to catch dynamic memsegs whose start
1220 			 * is hidden.
1221 			 */
1222 			seg = NULL;
1223 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1224 				if ((lseg->pages_base >= sbase) ||
1225 				    (lseg->pages_base < p_end &&
1226 				    lseg->pages_end > sbase)) {
1227 					if (seg == NULL ||
1228 					    seg->pages_base > lseg->pages_base)
1229 						seg = lseg;
1230 				}
1231 			}
1232 			if (seg != NULL) {
1233 				if (!memseg_is_dynamic(seg, &mseg_start)) {
1234 					mseg_start = seg->pages_base;
1235 				}
1236 				/*
1237 				 * Now have the full extent of the memseg so
1238 				 * do the range check.
1239 				 */
1240 				if (mseg_start >= p_end ||
1241 				    seg->pages_end <= sbase) {
1242 					/* Span does not overlap memseg. */
1243 					seg = NULL;
1244 				}
1245 			}
1246 			/*
1247 			 * Account for gap either before the segment if
1248 			 * there is one or to the end of the span.
1249 			 */
1250 			if (seg == NULL || mseg_start > sbase) {
1251 				pfn_t a_end;
1252 
1253 				a_end = (seg == NULL) ? p_end : mseg_start;
1254 				/*
1255 				 * Check with arch layer for relocatability.
1256 				 */
1257 				if (arch_kphysm_del_span_ok(sbase,
1258 				    (a_end - sbase))) {
1259 					/*
1260 					 * No non-relocatble pages in this
1261 					 * area, avoid the fine-grained
1262 					 * test.
1263 					 */
1264 					snpgs -= (a_end - sbase);
1265 					sbase = a_end;
1266 				}
1267 				while (sbase < a_end) {
1268 					if (!arch_kphysm_del_span_ok(sbase,
1269 					    1)) {
1270 						mqp->nonrelocatable++;
1271 						if (!done_first_nonreloc) {
1272 							mqp->
1273 							    first_nonrelocatable
1274 							    = sbase;
1275 							done_first_nonreloc = 1;
1276 						}
1277 						mqp->last_nonrelocatable =
1278 						    sbase;
1279 					}
1280 					sbase++;
1281 					snpgs--;
1282 				}
1283 			}
1284 			if (seg != NULL) {
1285 				ASSERT(mseg_start <= sbase);
1286 				if (seg->pages_base != mseg_start &&
1287 				    seg->pages_base > sbase) {
1288 					pgcnt_t skip_pgs;
1289 
1290 					/*
1291 					 * Skip the page_t area of a
1292 					 * dynamic memseg.
1293 					 */
1294 					skip_pgs = seg->pages_base - sbase;
1295 					if (snpgs <= skip_pgs) {
1296 						sbase += snpgs;
1297 						snpgs = 0;
1298 						continue;
1299 					}
1300 					snpgs -= skip_pgs;
1301 					sbase += skip_pgs;
1302 				}
1303 				ASSERT(snpgs != 0);
1304 				ASSERT(seg->pages_base <= sbase);
1305 				/*
1306 				 * The individual pages can now be checked.
1307 				 */
1308 				for (pp = seg->pages +
1309 				    (sbase - seg->pages_base);
1310 				    snpgs != 0 && pp < seg->epages; pp++) {
1311 					mqp->managed++;
1312 					if (PP_ISNORELOC(pp)) {
1313 						mqp->nonrelocatable++;
1314 						if (!done_first_nonreloc) {
1315 							mqp->
1316 							    first_nonrelocatable
1317 							    = sbase;
1318 							done_first_nonreloc = 1;
1319 						}
1320 						mqp->last_nonrelocatable =
1321 						    sbase;
1322 					}
1323 					sbase++;
1324 					snpgs--;
1325 				}
1326 			}
1327 		}
1328 	}
1329 
1330 	free_delspans(mdsp_new);
1331 
1332 	return (KPHYSM_OK);
1333 }
1334 
1335 /*
1336  * This release function can be called at any stage as follows:
1337  *	_gethandle only called
1338  *	_span(s) only called
1339  *	_start called but failed
1340  *	delete thread exited
1341  */
1342 int
1343 kphysm_del_release(memhandle_t handle)
1344 {
1345 	struct mem_handle *mhp;
1346 
1347 	mhp = kphysm_lookup_mem_handle(handle);
1348 	if (mhp == NULL) {
1349 		return (KPHYSM_EHANDLE);
1350 	}
1351 	switch (mhp->mh_state) {
1352 	case MHND_STARTING:
1353 	case MHND_RUNNING:
1354 		mutex_exit(&mhp->mh_mutex);
1355 		return (KPHYSM_ENOTFINISHED);
1356 	case MHND_FREE:
1357 		ASSERT(mhp->mh_state != MHND_FREE);
1358 		mutex_exit(&mhp->mh_mutex);
1359 		return (KPHYSM_EHANDLE);
1360 	case MHND_INIT:
1361 		break;
1362 	case MHND_DONE:
1363 		break;
1364 	case MHND_RELEASE:
1365 		mutex_exit(&mhp->mh_mutex);
1366 		return (KPHYSM_ESEQUENCE);
1367 	default:
1368 #ifdef DEBUG
1369 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1370 		    (void *)mhp, mhp->mh_state);
1371 #endif /* DEBUG */
1372 		mutex_exit(&mhp->mh_mutex);
1373 		return (KPHYSM_EHANDLE);
1374 	}
1375 	/*
1376 	 * Set state so that we can wait if necessary.
1377 	 * Also this means that we have read/write access to all
1378 	 * fields except mh_exthandle and mh_state.
1379 	 */
1380 	mhp->mh_state = MHND_RELEASE;
1381 	/*
1382 	 * The mem_handle cannot be de-allocated by any other operation
1383 	 * now, so no need to hold mh_mutex.
1384 	 */
1385 	mutex_exit(&mhp->mh_mutex);
1386 
1387 	delspan_remove(&mhp->mh_transit, 0, 0);
1388 	mhp->mh_phys_pages = 0;
1389 	mhp->mh_vm_pages = 0;
1390 	mhp->mh_hold_todo = 0;
1391 	mhp->mh_delete_complete = NULL;
1392 	mhp->mh_delete_complete_arg = NULL;
1393 	mhp->mh_cancel = 0;
1394 
1395 	mutex_enter(&mhp->mh_mutex);
1396 	ASSERT(mhp->mh_state == MHND_RELEASE);
1397 	mhp->mh_state = MHND_FREE;
1398 
1399 	kphysm_free_mem_handle(mhp);
1400 
1401 	return (KPHYSM_OK);
1402 }
1403 
1404 /*
1405  * This cancel function can only be called with the thread running.
1406  */
1407 int
1408 kphysm_del_cancel(memhandle_t handle)
1409 {
1410 	struct mem_handle *mhp;
1411 
1412 	mhp = kphysm_lookup_mem_handle(handle);
1413 	if (mhp == NULL) {
1414 		return (KPHYSM_EHANDLE);
1415 	}
1416 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1417 		mutex_exit(&mhp->mh_mutex);
1418 		return (KPHYSM_ENOTRUNNING);
1419 	}
1420 	/*
1421 	 * Set the cancel flag and wake the delete thread up.
1422 	 * The thread may be waiting on I/O, so the effect of the cancel
1423 	 * may be delayed.
1424 	 */
1425 	if (mhp->mh_cancel == 0) {
1426 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1427 		cv_signal(&mhp->mh_cv);
1428 	}
1429 	mutex_exit(&mhp->mh_mutex);
1430 	return (KPHYSM_OK);
1431 }
1432 
1433 int
1434 kphysm_del_status(
1435 	memhandle_t handle,
1436 	memdelstat_t *mdstp)
1437 {
1438 	struct mem_handle *mhp;
1439 
1440 	mhp = kphysm_lookup_mem_handle(handle);
1441 	if (mhp == NULL) {
1442 		return (KPHYSM_EHANDLE);
1443 	}
1444 	/*
1445 	 * Calling kphysm_del_status() is allowed before the delete
1446 	 * is started to allow for status display.
1447 	 */
1448 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1449 	    mhp->mh_state != MHND_RUNNING) {
1450 		mutex_exit(&mhp->mh_mutex);
1451 		return (KPHYSM_ENOTRUNNING);
1452 	}
1453 	mdstp->phys_pages = mhp->mh_phys_pages;
1454 	mdstp->managed = mhp->mh_vm_pages;
1455 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1456 	mutex_exit(&mhp->mh_mutex);
1457 	return (KPHYSM_OK);
1458 }
1459 
1460 static int mem_delete_additional_pages = 100;
1461 
1462 static int
1463 can_remove_pgs(pgcnt_t npgs)
1464 {
1465 	/*
1466 	 * If all pageable pages were paged out, freemem would
1467 	 * equal availrmem.  There is a minimum requirement for
1468 	 * availrmem.
1469 	 */
1470 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1471 	    < npgs)
1472 		return (0);
1473 	/* TODO: check swap space, etc. */
1474 	return (1);
1475 }
1476 
1477 static int
1478 get_availrmem(pgcnt_t npgs)
1479 {
1480 	int ret;
1481 
1482 	mutex_enter(&freemem_lock);
1483 	ret = can_remove_pgs(npgs);
1484 	if (ret != 0)
1485 		availrmem -= npgs;
1486 	mutex_exit(&freemem_lock);
1487 	return (ret);
1488 }
1489 
1490 static void
1491 put_availrmem(pgcnt_t npgs)
1492 {
1493 	mutex_enter(&freemem_lock);
1494 	availrmem += npgs;
1495 	mutex_exit(&freemem_lock);
1496 }
1497 
1498 #define	FREEMEM_INCR	100
1499 static pgcnt_t freemem_incr = FREEMEM_INCR;
1500 #define	DEL_FREE_WAIT_FRAC	4
1501 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1502 
1503 #define	DEL_BUSY_WAIT_FRAC	20
1504 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1505 
1506 static void kphysm_del_cleanup(struct mem_handle *);
1507 
1508 static void page_delete_collect(page_t *, struct mem_handle *);
1509 
1510 static pgcnt_t
1511 delthr_get_freemem(struct mem_handle *mhp)
1512 {
1513 	pgcnt_t free_get;
1514 	int ret;
1515 
1516 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1517 
1518 	MDSTAT_INCR(mhp, need_free);
1519 	/*
1520 	 * Get up to freemem_incr pages.
1521 	 */
1522 	free_get = freemem_incr;
1523 	if (free_get > mhp->mh_hold_todo)
1524 		free_get = mhp->mh_hold_todo;
1525 	/*
1526 	 * Take free_get pages away from freemem,
1527 	 * waiting if necessary.
1528 	 */
1529 
1530 	while (!mhp->mh_cancel) {
1531 		mutex_exit(&mhp->mh_mutex);
1532 		MDSTAT_INCR(mhp, free_loop);
1533 		/*
1534 		 * Duplicate test from page_create_throttle()
1535 		 * but don't override with !PG_WAIT.
1536 		 */
1537 		if (freemem < (free_get + throttlefree)) {
1538 			MDSTAT_INCR(mhp, free_low);
1539 			ret = 0;
1540 		} else {
1541 			ret = page_create_wait(free_get, 0);
1542 			if (ret == 0) {
1543 				/* EMPTY */
1544 				MDSTAT_INCR(mhp, free_failed);
1545 			}
1546 		}
1547 		if (ret != 0) {
1548 			mutex_enter(&mhp->mh_mutex);
1549 			return (free_get);
1550 		}
1551 
1552 		/*
1553 		 * Put pressure on pageout.
1554 		 */
1555 		page_needfree(free_get);
1556 		cv_signal(&proc_pageout->p_cv);
1557 
1558 		mutex_enter(&mhp->mh_mutex);
1559 		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
1560 		    (lbolt + DEL_FREE_WAIT_TICKS));
1561 		mutex_exit(&mhp->mh_mutex);
1562 		page_needfree(-(spgcnt_t)free_get);
1563 
1564 		mutex_enter(&mhp->mh_mutex);
1565 	}
1566 	return (0);
1567 }
1568 
1569 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1570 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1571 /*
1572  * This function is run as a helper thread for delete_memory_thread.
1573  * It is needed in order to force kaio cleanup, so that pages used in kaio
1574  * will be unlocked and subsequently relocated by delete_memory_thread.
1575  * The address of the delete_memory_threads's mem_handle is passed in to
1576  * this thread function, and is used to set the mh_aio_cleanup_done member
1577  * prior to calling thread_exit().
1578  */
1579 static void
1580 dr_aio_cleanup_thread(caddr_t amhp)
1581 {
1582 	proc_t *procp;
1583 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1584 	int cleaned;
1585 	int n = 0;
1586 	struct mem_handle *mhp;
1587 	volatile uint_t *pcancel;
1588 
1589 	mhp = (struct mem_handle *)amhp;
1590 	ASSERT(mhp != NULL);
1591 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1592 	if (modload("sys", "kaio") == -1) {
1593 		mhp->mh_aio_cleanup_done = 1;
1594 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1595 		thread_exit();
1596 	}
1597 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1598 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1599 	if (aio_cleanup_dr_delete_memory == NULL) {
1600 		mhp->mh_aio_cleanup_done = 1;
1601 		cmn_err(CE_WARN,
1602 	    "aio_cleanup_dr_delete_memory not found in kaio");
1603 		thread_exit();
1604 	}
1605 	do {
1606 		cleaned = 0;
1607 		mutex_enter(&pidlock);
1608 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1609 		    procp = procp->p_next) {
1610 			mutex_enter(&procp->p_lock);
1611 			if (procp->p_aio != NULL) {
1612 				/* cleanup proc's outstanding kaio */
1613 				cleaned +=
1614 				    (*aio_cleanup_dr_delete_memory)(procp);
1615 			}
1616 			mutex_exit(&procp->p_lock);
1617 		}
1618 		mutex_exit(&pidlock);
1619 		if ((*pcancel == 0) &&
1620 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1621 			/* delay a bit before retrying all procs again */
1622 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1623 			n = 0;
1624 		}
1625 	} while (*pcancel == 0);
1626 	mhp->mh_aio_cleanup_done = 1;
1627 	thread_exit();
1628 }
1629 
1630 static void
1631 delete_memory_thread(caddr_t amhp)
1632 {
1633 	struct mem_handle *mhp;
1634 	struct memdelspan *mdsp;
1635 	callb_cpr_t cprinfo;
1636 	page_t *pp_targ;
1637 	spgcnt_t freemem_left;
1638 	void (*del_complete_funcp)(void *, int error);
1639 	void *del_complete_arg;
1640 	int comp_code;
1641 	int ret;
1642 	int first_scan;
1643 	uint_t szc;
1644 #ifdef MEM_DEL_STATS
1645 	uint64_t start_total, ntick_total;
1646 	uint64_t start_pgrp, ntick_pgrp;
1647 #endif /* MEM_DEL_STATS */
1648 
1649 	mhp = (struct mem_handle *)amhp;
1650 
1651 #ifdef MEM_DEL_STATS
1652 	start_total = ddi_get_lbolt();
1653 #endif /* MEM_DEL_STATS */
1654 
1655 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1656 	    callb_generic_cpr, "memdel");
1657 
1658 	mutex_enter(&mhp->mh_mutex);
1659 	ASSERT(mhp->mh_state == MHND_STARTING);
1660 
1661 	mhp->mh_state = MHND_RUNNING;
1662 	mhp->mh_thread_id = curthread;
1663 
1664 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1665 	mutex_exit(&mhp->mh_mutex);
1666 
1667 	/* Allocate the remap pages now, if necessary. */
1668 	memseg_remap_init();
1669 
1670 	/*
1671 	 * Subtract from availrmem now if possible as availrmem
1672 	 * may not be available by the end of the delete.
1673 	 */
1674 	if (!get_availrmem(mhp->mh_vm_pages)) {
1675 		comp_code = KPHYSM_ENOTVIABLE;
1676 		mutex_enter(&mhp->mh_mutex);
1677 		goto early_exit;
1678 	}
1679 
1680 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1681 
1682 	mutex_enter(&mhp->mh_mutex);
1683 
1684 	if (ret != 0) {
1685 		mhp->mh_cancel = KPHYSM_EREFUSED;
1686 		goto refused;
1687 	}
1688 
1689 	transit_list_collect(mhp, 1);
1690 
1691 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1692 	    mdsp = mdsp->mds_next) {
1693 		ASSERT(mdsp->mds_bitmap == NULL);
1694 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1695 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1696 							KM_SLEEP);
1697 	}
1698 
1699 	first_scan = 1;
1700 	freemem_left = 0;
1701 	/*
1702 	 * Start dr_aio_cleanup_thread, which periodically iterates
1703 	 * through the process list and invokes aio cleanup.  This
1704 	 * is needed in order to avoid a deadly embrace between the
1705 	 * delete_memory_thread (waiting on writer lock for page, with the
1706 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1707 	 * reader lock on the same page that is wanted by the
1708 	 * delete_memory_thread), and threads waiting for kaio completion
1709 	 * (blocked on spt_amp->lock).
1710 	 */
1711 	mhp->mh_dr_aio_cleanup_cancel = 0;
1712 	mhp->mh_aio_cleanup_done = 0;
1713 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1714 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1715 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1716 		pgcnt_t collected;
1717 
1718 		MDSTAT_INCR(mhp, nloop);
1719 		collected = 0;
1720 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1721 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1722 			pfn_t pfn, p_end;
1723 
1724 			if (first_scan) {
1725 				mem_node_pre_del_slice(mdsp->mds_base,
1726 					mdsp->mds_base + mdsp->mds_npgs - 1);
1727 			}
1728 
1729 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1730 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1731 			    (mhp->mh_cancel == 0); pfn++) {
1732 				page_t *pp, *tpp, *tpp_targ;
1733 				pgcnt_t bit;
1734 				struct vnode *vp;
1735 				u_offset_t offset;
1736 				int mod, result;
1737 				spgcnt_t pgcnt;
1738 
1739 				bit = pfn - mdsp->mds_base;
1740 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1741 				    (1 << (bit % NBPBMW))) != 0) {
1742 					MDSTAT_INCR(mhp, already_done);
1743 					continue;
1744 				}
1745 				if (freemem_left == 0) {
1746 					freemem_left += delthr_get_freemem(mhp);
1747 					if (freemem_left == 0)
1748 						break;
1749 				}
1750 
1751 				/*
1752 				 * Release mh_mutex - some of this
1753 				 * stuff takes some time (eg PUTPAGE).
1754 				 */
1755 
1756 				mutex_exit(&mhp->mh_mutex);
1757 				MDSTAT_INCR(mhp, ncheck);
1758 
1759 				pp = page_numtopp_nolock(pfn);
1760 				if (pp == NULL) {
1761 					/*
1762 					 * Not covered by a page_t - will
1763 					 * be dealt with elsewhere.
1764 					 */
1765 					MDSTAT_INCR(mhp, nopaget);
1766 					mutex_enter(&mhp->mh_mutex);
1767 					mdsp->mds_bitmap[bit / NBPBMW] |=
1768 					    (1 << (bit % NBPBMW));
1769 					continue;
1770 				}
1771 
1772 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1773 				    SE_EXCL_WANTED)) {
1774 					if (page_isretired(pp)) {
1775 						/*
1776 						 * Page has been retired.
1777 						 *
1778 						 * Its shared lock can and
1779 						 * must be upgraded to an
1780 						 * exclusive lock in order
1781 						 * to hashout the page when
1782 						 * the delete completes.
1783 						 */
1784 						page_lock_clr_exclwanted(pp);
1785 						if (!page_tryupgrade(pp)) {
1786 							mutex_enter(
1787 							    &mhp->mh_mutex);
1788 							continue;
1789 						}
1790 					} else {
1791 						/*
1792 						 * Page in use elsewhere.
1793 						 */
1794 						MDSTAT_INCR(mhp, lockfail);
1795 						mutex_enter(&mhp->mh_mutex);
1796 						continue;
1797 					}
1798 				}
1799 				/*
1800 				 * See if the cage expanded into the delete.
1801 				 * This can happen as we have to allow the
1802 				 * cage to expand.
1803 				 */
1804 				if (PP_ISNORELOC(pp)) {
1805 					if (page_isretired(pp))
1806 						page_downgrade(pp);
1807 					else
1808 						page_unlock(pp);
1809 					mutex_enter(&mhp->mh_mutex);
1810 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1811 					break;
1812 				}
1813 				if (page_isretired(pp)) {
1814 					/*
1815 					 * Page has been retired and is
1816 					 * not part of the cage so we
1817 					 * can now do the accounting for
1818 					 * it.
1819 					 */
1820 					MDSTAT_INCR(mhp, retired);
1821 					mutex_enter(&mhp->mh_mutex);
1822 					mdsp->mds_bitmap[bit / NBPBMW]
1823 					    |= (1 << (bit % NBPBMW));
1824 					mdsp->mds_bitmap_retired[bit /
1825 					    NBPBMW] |=
1826 					    (1 << (bit % NBPBMW));
1827 					mhp->mh_hold_todo--;
1828 					continue;
1829 				}
1830 				ASSERT(freemem_left != 0);
1831 				if (PP_ISFREE(pp)) {
1832 					/*
1833 					 * Like page_reclaim() only 'freemem'
1834 					 * processing is already done.
1835 					 */
1836 					MDSTAT_INCR(mhp, nfree);
1837 				free_page_collect:
1838 					if (PP_ISAGED(pp)) {
1839 						page_list_sub(pp,
1840 						    PG_FREE_LIST);
1841 					} else {
1842 						page_list_sub(pp,
1843 						    PG_CACHE_LIST);
1844 					}
1845 					PP_CLRFREE(pp);
1846 					PP_CLRAGED(pp);
1847 					collected++;
1848 					mutex_enter(&mhp->mh_mutex);
1849 					page_delete_collect(pp, mhp);
1850 					mdsp->mds_bitmap[bit / NBPBMW] |=
1851 					    (1 << (bit % NBPBMW));
1852 					freemem_left--;
1853 					continue;
1854 				}
1855 				ASSERT(pp->p_vnode != NULL);
1856 				if (first_scan) {
1857 					MDSTAT_INCR(mhp, first_notfree);
1858 					page_unlock(pp);
1859 					mutex_enter(&mhp->mh_mutex);
1860 					continue;
1861 				}
1862 				/*
1863 				 * Keep stats on pages encountered that
1864 				 * are toxic or failing but not retired.
1865 				 */
1866 				if (page_istoxic(pp)) {
1867 					MDSTAT_INCR(mhp, toxic);
1868 				} else if (page_isfailing(pp)) {
1869 					MDSTAT_INCR(mhp, failing);
1870 				}
1871 				/*
1872 				 * In certain cases below, special exceptions
1873 				 * are made for pages that are toxic.  This
1874 				 * is because the current meaning of toxic
1875 				 * is that an uncorrectable error has been
1876 				 * previously associated with the page.
1877 				 */
1878 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1879 					if (!page_istoxic(pp)) {
1880 						/*
1881 						 * Must relocate locked in
1882 						 * memory pages.
1883 						 */
1884 #ifdef MEM_DEL_STATS
1885 						start_pgrp = ddi_get_lbolt();
1886 #endif /* MEM_DEL_STATS */
1887 						/*
1888 						 * Lock all constituent pages
1889 						 * of a large page to ensure
1890 						 * that p_szc won't change.
1891 						 */
1892 						if (!group_page_trylock(pp,
1893 						    SE_EXCL)) {
1894 							MDSTAT_INCR(mhp,
1895 							    gptllckfail);
1896 							page_unlock(pp);
1897 							mutex_enter(
1898 							    &mhp->mh_mutex);
1899 							continue;
1900 						}
1901 						MDSTAT_INCR(mhp, npplocked);
1902 						pp_targ =
1903 						    page_get_replacement_page(
1904 							pp, NULL, 0);
1905 						if (pp_targ != NULL) {
1906 #ifdef MEM_DEL_STATS
1907 							ntick_pgrp =
1908 							    (uint64_t)
1909 							    ddi_get_lbolt() -
1910 							    start_pgrp;
1911 #endif /* MEM_DEL_STATS */
1912 							MDSTAT_PGRP(mhp,
1913 							    ntick_pgrp);
1914 							MDSTAT_INCR(mhp,
1915 							    nlockreloc);
1916 							goto reloc;
1917 						}
1918 						group_page_unlock(pp);
1919 						page_unlock(pp);
1920 #ifdef MEM_DEL_STATS
1921 						ntick_pgrp =
1922 						    (uint64_t)ddi_get_lbolt() -
1923 						    start_pgrp;
1924 #endif /* MEM_DEL_STATS */
1925 						MDSTAT_PGRP(mhp, ntick_pgrp);
1926 						MDSTAT_INCR(mhp, nnorepl);
1927 						mutex_enter(&mhp->mh_mutex);
1928 						continue;
1929 					} else {
1930 						/*
1931 						 * Cannot do anything about
1932 						 * this page because it is
1933 						 * toxic.
1934 						 */
1935 						MDSTAT_INCR(mhp, npplkdtoxic);
1936 						page_unlock(pp);
1937 						mutex_enter(&mhp->mh_mutex);
1938 						continue;
1939 					}
1940 				}
1941 				/*
1942 				 * Unload the mappings and check if mod bit
1943 				 * is set.
1944 				 */
1945 				ASSERT(pp->p_vnode != &kvp);
1946 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1947 				mod = hat_ismod(pp);
1948 
1949 #ifdef MEM_DEL_STATS
1950 				start_pgrp = ddi_get_lbolt();
1951 #endif /* MEM_DEL_STATS */
1952 				if (mod && !page_istoxic(pp)) {
1953 					/*
1954 					 * Lock all constituent pages
1955 					 * of a large page to ensure
1956 					 * that p_szc won't change.
1957 					 */
1958 					if (!group_page_trylock(pp, SE_EXCL)) {
1959 						MDSTAT_INCR(mhp, gptlmodfail);
1960 						page_unlock(pp);
1961 						mutex_enter(&mhp->mh_mutex);
1962 						continue;
1963 					}
1964 					pp_targ = page_get_replacement_page(pp,
1965 					    NULL, 0);
1966 					if (pp_targ != NULL) {
1967 						MDSTAT_INCR(mhp, nmodreloc);
1968 #ifdef MEM_DEL_STATS
1969 						ntick_pgrp =
1970 						    (uint64_t)ddi_get_lbolt() -
1971 							start_pgrp;
1972 #endif /* MEM_DEL_STATS */
1973 						MDSTAT_PGRP(mhp, ntick_pgrp);
1974 						goto reloc;
1975 					}
1976 					group_page_unlock(pp);
1977 				}
1978 
1979 				if (!page_try_demote_pages(pp)) {
1980 					MDSTAT_INCR(mhp, demotefail);
1981 					page_unlock(pp);
1982 #ifdef MEM_DEL_STATS
1983 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1984 					    start_pgrp;
1985 #endif /* MEM_DEL_STATS */
1986 					MDSTAT_PGRP(mhp, ntick_pgrp);
1987 					mutex_enter(&mhp->mh_mutex);
1988 					continue;
1989 				}
1990 
1991 				/*
1992 				 * Regular 'page-out'.
1993 				 */
1994 				if (!mod) {
1995 					MDSTAT_INCR(mhp, ndestroy);
1996 					page_destroy(pp, 1);
1997 					/*
1998 					 * page_destroy was called with
1999 					 * dontfree. As long as p_lckcnt
2000 					 * and p_cowcnt are both zero, the
2001 					 * only additional action of
2002 					 * page_destroy with !dontfree is to
2003 					 * call page_free, so we can collect
2004 					 * the page here.
2005 					 */
2006 					collected++;
2007 #ifdef MEM_DEL_STATS
2008 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2009 					    start_pgrp;
2010 #endif /* MEM_DEL_STATS */
2011 					MDSTAT_PGRP(mhp, ntick_pgrp);
2012 					mutex_enter(&mhp->mh_mutex);
2013 					page_delete_collect(pp, mhp);
2014 					mdsp->mds_bitmap[bit / NBPBMW] |=
2015 					    (1 << (bit % NBPBMW));
2016 					continue;
2017 				}
2018 				/*
2019 				 * The page is toxic and the mod bit is
2020 				 * set, we cannot do anything here to deal
2021 				 * with it.
2022 				 */
2023 				if (page_istoxic(pp)) {
2024 					page_unlock(pp);
2025 #ifdef MEM_DEL_STATS
2026 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2027 					    start_pgrp;
2028 #endif /* MEM_DEL_STATS */
2029 					MDSTAT_PGRP(mhp, ntick_pgrp);
2030 					MDSTAT_INCR(mhp, modtoxic);
2031 					mutex_enter(&mhp->mh_mutex);
2032 					continue;
2033 				}
2034 				MDSTAT_INCR(mhp, nputpage);
2035 				vp = pp->p_vnode;
2036 				offset = pp->p_offset;
2037 				VN_HOLD(vp);
2038 				page_unlock(pp);
2039 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2040 				    B_INVAL|B_FORCE, kcred);
2041 				VN_RELE(vp);
2042 #ifdef MEM_DEL_STATS
2043 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2044 				    start_pgrp;
2045 #endif /* MEM_DEL_STATS */
2046 				MDSTAT_PGRP(mhp, ntick_pgrp);
2047 				/*
2048 				 * Try to get the page back immediately
2049 				 * so that it can be collected.
2050 				 */
2051 				pp = page_numtopp_nolock(pfn);
2052 				if (pp == NULL) {
2053 					MDSTAT_INCR(mhp, nnoreclaim);
2054 					/*
2055 					 * This should not happen as this
2056 					 * thread is deleting the page.
2057 					 * If this code is generalized, this
2058 					 * becomes a reality.
2059 					 */
2060 #ifdef DEBUG
2061 					cmn_err(CE_WARN,
2062 					    "delete_memory_thread(0x%p) "
2063 					    "pfn 0x%lx has no page_t",
2064 					    (void *)mhp, pfn);
2065 #endif /* DEBUG */
2066 					mutex_enter(&mhp->mh_mutex);
2067 					continue;
2068 				}
2069 				if (page_try_reclaim_lock(pp, SE_EXCL,
2070 				    SE_EXCL_WANTED)) {
2071 					if (PP_ISFREE(pp)) {
2072 						goto free_page_collect;
2073 					}
2074 					page_unlock(pp);
2075 				}
2076 				MDSTAT_INCR(mhp, nnoreclaim);
2077 				mutex_enter(&mhp->mh_mutex);
2078 				continue;
2079 
2080 			reloc:
2081 				/*
2082 				 * Got some freemem and a target
2083 				 * page, so move the data to avoid
2084 				 * I/O and lock problems.
2085 				 */
2086 				ASSERT(!page_iolock_assert(pp));
2087 				MDSTAT_INCR(mhp, nreloc);
2088 				/*
2089 				 * page_relocate() will return pgcnt: the
2090 				 * number of consecutive pages relocated.
2091 				 * If it is successful, pp will be a
2092 				 * linked list of the page structs that
2093 				 * were relocated. If page_relocate() is
2094 				 * unsuccessful, pp will be unmodified.
2095 				 */
2096 #ifdef MEM_DEL_STATS
2097 				start_pgrp = ddi_get_lbolt();
2098 #endif /* MEM_DEL_STATS */
2099 				result = page_relocate(&pp, &pp_targ, 0, 0,
2100 				    &pgcnt, NULL);
2101 #ifdef MEM_DEL_STATS
2102 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2103 				    start_pgrp;
2104 #endif /* MEM_DEL_STATS */
2105 				MDSTAT_PGRP(mhp, ntick_pgrp);
2106 				if (result != 0) {
2107 					MDSTAT_INCR(mhp, nrelocfail);
2108 					/*
2109 					 * We did not succeed. We need
2110 					 * to give the pp_targ pages back.
2111 					 * page_free(pp_targ, 1) without
2112 					 * the freemem accounting.
2113 					 */
2114 					group_page_unlock(pp);
2115 					page_free_replacement_page(pp_targ);
2116 					page_unlock(pp);
2117 					mutex_enter(&mhp->mh_mutex);
2118 					continue;
2119 				}
2120 
2121 				/*
2122 				 * We will then collect pgcnt pages.
2123 				 */
2124 				ASSERT(pgcnt > 0);
2125 				mutex_enter(&mhp->mh_mutex);
2126 				/*
2127 				 * We need to make sure freemem_left is
2128 				 * large enough.
2129 				 */
2130 				while ((freemem_left < pgcnt) &&
2131 					(!mhp->mh_cancel)) {
2132 					freemem_left +=
2133 						delthr_get_freemem(mhp);
2134 				}
2135 
2136 				/*
2137 				 * Do not proceed if mh_cancel is set.
2138 				 */
2139 				if (mhp->mh_cancel) {
2140 					while (pp_targ != NULL) {
2141 						/*
2142 						 * Unlink and unlock each page.
2143 						 */
2144 						tpp_targ = pp_targ;
2145 						page_sub(&pp_targ, tpp_targ);
2146 						page_unlock(tpp_targ);
2147 					}
2148 					/*
2149 					 * We need to give the pp pages back.
2150 					 * page_free(pp, 1) without the
2151 					 * freemem accounting.
2152 					 */
2153 					page_free_replacement_page(pp);
2154 					break;
2155 				}
2156 
2157 				/* Now remove pgcnt from freemem_left */
2158 				freemem_left -= pgcnt;
2159 				ASSERT(freemem_left >= 0);
2160 				szc = pp->p_szc;
2161 				while (pp != NULL) {
2162 					/*
2163 					 * pp and pp_targ were passed back as
2164 					 * a linked list of pages.
2165 					 * Unlink and unlock each page.
2166 					 */
2167 					tpp_targ = pp_targ;
2168 					page_sub(&pp_targ, tpp_targ);
2169 					page_unlock(tpp_targ);
2170 					/*
2171 					 * The original page is now free
2172 					 * so remove it from the linked
2173 					 * list and collect it.
2174 					 */
2175 					tpp = pp;
2176 					page_sub(&pp, tpp);
2177 					pfn = page_pptonum(tpp);
2178 					collected++;
2179 					ASSERT(PAGE_EXCL(tpp));
2180 					ASSERT(tpp->p_vnode == NULL);
2181 					ASSERT(!hat_page_is_mapped(tpp));
2182 					ASSERT(tpp->p_szc == szc);
2183 					tpp->p_szc = 0;
2184 					page_delete_collect(tpp, mhp);
2185 					bit = pfn - mdsp->mds_base;
2186 					mdsp->mds_bitmap[bit / NBPBMW] |=
2187 					(1 << (bit % NBPBMW));
2188 				}
2189 				ASSERT(pp_targ == NULL);
2190 			}
2191 		}
2192 		first_scan = 0;
2193 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2194 			(collected == 0)) {
2195 			/*
2196 			 * This code is needed as we cannot wait
2197 			 * for a page to be locked OR the delete to
2198 			 * be cancelled.  Also, we must delay so
2199 			 * that other threads get a chance to run
2200 			 * on our cpu, otherwise page locks may be
2201 			 * held indefinitely by those threads.
2202 			 */
2203 			MDSTAT_INCR(mhp, ndelay);
2204 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2205 			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
2206 			    (lbolt + DEL_BUSY_WAIT_TICKS));
2207 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2208 		}
2209 	}
2210 	/* stop the dr aio cleanup thread */
2211 	mhp->mh_dr_aio_cleanup_cancel = 1;
2212 	transit_list_collect(mhp, 0);
2213 	if (freemem_left != 0) {
2214 		/* Return any surplus. */
2215 		page_create_putback(freemem_left);
2216 		freemem_left = 0;
2217 	}
2218 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2219 	    mdsp = mdsp->mds_next) {
2220 		mem_node_post_del_slice(mdsp->mds_base,
2221 				mdsp->mds_base + mdsp->mds_npgs - 1,
2222 				(mhp->mh_cancel != 0));
2223 	}
2224 #ifdef MEM_DEL_STATS
2225 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2226 #endif /* MEM_DEL_STATS */
2227 	MDSTAT_TOTAL(mhp, ntick_total);
2228 	MDSTAT_PRINT(mhp);
2229 
2230 	/*
2231 	 * If the memory delete was cancelled, exclusive-wanted bits must
2232 	 * be cleared, and also any retired pages that
2233 	 * were accounted for above must have their exclusive lock
2234 	 * downgraded to a shared lock to return them to their previous
2235 	 * state.
2236 	 * Otherwise, if the memory delete has completed, retired pages
2237 	 * must be hashed out.
2238 	 */
2239 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2240 	    mdsp = mdsp->mds_next) {
2241 		pfn_t pfn, p_end;
2242 
2243 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2244 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2245 			page_t *pp;
2246 			pgcnt_t bit;
2247 
2248 			bit = pfn - mdsp->mds_base;
2249 			if (mhp->mh_cancel) {
2250 				pp = page_numtopp_nolock(pfn);
2251 				if (pp != NULL) {
2252 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2253 					    (1 << (bit % NBPBMW))) == 0) {
2254 						page_lock_clr_exclwanted(pp);
2255 					}
2256 				}
2257 			} else {
2258 				pp = NULL;
2259 			}
2260 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2261 			    (1 << (bit % NBPBMW))) != 0) {
2262 				/* do we already have pp? */
2263 				if (pp == NULL) {
2264 					pp = page_numtopp_nolock(pfn);
2265 				}
2266 				ASSERT(pp != NULL);
2267 				ASSERT(page_isretired(pp));
2268 				if (mhp->mh_cancel != 0) {
2269 					page_downgrade(pp);
2270 					/*
2271 					 * To satisfy ASSERT below in
2272 					 * cancel code.
2273 					 */
2274 					mhp->mh_hold_todo++;
2275 				} else {
2276 					page_hashout(pp, (kmutex_t *)NULL);
2277 				}
2278 			}
2279 		}
2280 	}
2281 	/*
2282 	 * Free retired page bitmap and collected page bitmap
2283 	 */
2284 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2285 	    mdsp = mdsp->mds_next) {
2286 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2287 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2288 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2289 		ASSERT(mdsp->mds_bitmap != NULL);
2290 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2291 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2292 	}
2293 
2294 	/* wait for our dr aio cancel thread to exit */
2295 	while (!(mhp->mh_aio_cleanup_done)) {
2296 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2297 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2298 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2299 	}
2300 refused:
2301 	if (mhp->mh_cancel != 0) {
2302 		page_t *pp;
2303 
2304 		comp_code = mhp->mh_cancel;
2305 		/*
2306 		 * Go through list of deleted pages (mh_deleted) freeing
2307 		 * them.
2308 		 */
2309 		while ((pp = mhp->mh_deleted) != NULL) {
2310 			mhp->mh_deleted = pp->p_next;
2311 			mhp->mh_hold_todo++;
2312 			mutex_exit(&mhp->mh_mutex);
2313 			/* Restore p_next. */
2314 			pp->p_next = pp->p_prev;
2315 			if (PP_ISFREE(pp)) {
2316 				cmn_err(CE_PANIC,
2317 				    "page %p is free",
2318 				    (void *)pp);
2319 			}
2320 			page_free(pp, 1);
2321 			mutex_enter(&mhp->mh_mutex);
2322 		}
2323 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2324 
2325 		mutex_exit(&mhp->mh_mutex);
2326 		put_availrmem(mhp->mh_vm_pages);
2327 		mutex_enter(&mhp->mh_mutex);
2328 
2329 		goto t_exit;
2330 	}
2331 
2332 	/*
2333 	 * All the pages are no longer in use and are exclusively locked.
2334 	 */
2335 
2336 	mhp->mh_deleted = NULL;
2337 
2338 	kphysm_del_cleanup(mhp);
2339 
2340 	comp_code = KPHYSM_OK;
2341 
2342 t_exit:
2343 	mutex_exit(&mhp->mh_mutex);
2344 	kphysm_setup_post_del(mhp->mh_vm_pages,
2345 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2346 	mutex_enter(&mhp->mh_mutex);
2347 
2348 early_exit:
2349 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2350 	mhp->mh_state = MHND_DONE;
2351 	del_complete_funcp = mhp->mh_delete_complete;
2352 	del_complete_arg = mhp->mh_delete_complete_arg;
2353 	CALLB_CPR_EXIT(&cprinfo);
2354 	(*del_complete_funcp)(del_complete_arg, comp_code);
2355 	thread_exit();
2356 	/*NOTREACHED*/
2357 }
2358 
2359 /*
2360  * Start the delete of the memory from the system.
2361  */
2362 int
2363 kphysm_del_start(
2364 	memhandle_t handle,
2365 	void (*complete)(void *, int),
2366 	void *complete_arg)
2367 {
2368 	struct mem_handle *mhp;
2369 
2370 	mhp = kphysm_lookup_mem_handle(handle);
2371 	if (mhp == NULL) {
2372 		return (KPHYSM_EHANDLE);
2373 	}
2374 	switch (mhp->mh_state) {
2375 	case MHND_FREE:
2376 		ASSERT(mhp->mh_state != MHND_FREE);
2377 		mutex_exit(&mhp->mh_mutex);
2378 		return (KPHYSM_EHANDLE);
2379 	case MHND_INIT:
2380 		break;
2381 	case MHND_STARTING:
2382 	case MHND_RUNNING:
2383 		mutex_exit(&mhp->mh_mutex);
2384 		return (KPHYSM_ESEQUENCE);
2385 	case MHND_DONE:
2386 		mutex_exit(&mhp->mh_mutex);
2387 		return (KPHYSM_ESEQUENCE);
2388 	case MHND_RELEASE:
2389 		mutex_exit(&mhp->mh_mutex);
2390 		return (KPHYSM_ESEQUENCE);
2391 	default:
2392 #ifdef DEBUG
2393 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2394 		    (void *)mhp, mhp->mh_state);
2395 #endif /* DEBUG */
2396 		mutex_exit(&mhp->mh_mutex);
2397 		return (KPHYSM_EHANDLE);
2398 	}
2399 
2400 	if (mhp->mh_transit.trl_spans == NULL) {
2401 		mutex_exit(&mhp->mh_mutex);
2402 		return (KPHYSM_ENOWORK);
2403 	}
2404 
2405 	ASSERT(complete != NULL);
2406 	mhp->mh_delete_complete = complete;
2407 	mhp->mh_delete_complete_arg = complete_arg;
2408 	mhp->mh_state = MHND_STARTING;
2409 	/*
2410 	 * Release the mutex in case thread_create sleeps.
2411 	 */
2412 	mutex_exit(&mhp->mh_mutex);
2413 
2414 	/*
2415 	 * The "obvious" process for this thread is pageout (proc_pageout)
2416 	 * but this gives the thread too much power over freemem
2417 	 * which results in freemem starvation.
2418 	 */
2419 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2420 	    TS_RUN, maxclsyspri - 1);
2421 
2422 	return (KPHYSM_OK);
2423 }
2424 
/*
 * State of the dummy page_t area that memseg_remap_init() sets up and
 * memseg_remap_to_dummy() maps over the metadata of deleted memsegs.
 */
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
static caddr_t pp_dummy;	/* Base VA of area; non-NULL once init done. */
static pgcnt_t pp_dummy_npages;	/* Number of pages in the dummy area. */
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2429 
2430 static void
2431 memseg_remap_init_pages(page_t *pages, page_t *epages)
2432 {
2433 	page_t *pp;
2434 
2435 	for (pp = pages; pp < epages; pp++) {
2436 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2437 		pp->p_offset = (u_offset_t)-1;
2438 		page_iolock_init(pp);
2439 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2440 			continue;
2441 		page_lock_delete(pp);
2442 	}
2443 }
2444 
2445 void
2446 memseg_remap_init()
2447 {
2448 	mutex_enter(&pp_dummy_lock);
2449 	if (pp_dummy == NULL) {
2450 		uint_t dpages;
2451 		int i;
2452 
2453 		/*
2454 		 * dpages starts off as the size of the structure and
2455 		 * ends up as the minimum number of pages that will
2456 		 * hold a whole number of page_t structures.
2457 		 */
2458 		dpages = sizeof (page_t);
2459 		ASSERT(dpages != 0);
2460 		ASSERT(dpages <= MMU_PAGESIZE);
2461 
2462 		while ((dpages & 1) == 0)
2463 			dpages >>= 1;
2464 
2465 		pp_dummy_npages = dpages;
2466 		/*
2467 		 * Allocate pp_dummy pages directly from static_arena,
2468 		 * since these are whole page allocations and are
2469 		 * referenced by physical address.  This also has the
2470 		 * nice fringe benefit of hiding the memory from
2471 		 * ::findleaks since it doesn't deal well with allocated
2472 		 * kernel heap memory that doesn't have any mappings.
2473 		 */
2474 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2475 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2476 		bzero(pp_dummy, ptob(pp_dummy_npages));
2477 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2478 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2479 		    pp_dummy_npages, KM_SLEEP);
2480 		for (i = 0; i < pp_dummy_npages; i++) {
2481 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2482 			    &pp_dummy[MMU_PAGESIZE * i]);
2483 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2484 		}
2485 		/*
2486 		 * Initialize the page_t's to a known 'deleted' state
2487 		 * that matches the state of deleted pages.
2488 		 */
2489 		memseg_remap_init_pages((page_t *)pp_dummy,
2490 					(page_t *)(pp_dummy +
2491 					    ptob(pp_dummy_npages)));
2492 		/* Remove kmem mappings for the pages for safety. */
2493 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2494 		    HAT_UNLOAD_UNLOCK);
2495 		/* Leave pp_dummy pointer set as flag that init is done. */
2496 	}
2497 	mutex_exit(&pp_dummy_lock);
2498 }
2499 
2500 static void
2501 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
2502 {
2503 	ASSERT(pp_dummy != NULL);
2504 
2505 	while (metapgs != 0) {
2506 		pgcnt_t n;
2507 		int i;
2508 
2509 		n = pp_dummy_npages;
2510 		if (n > metapgs)
2511 			n = metapgs;
2512 		for (i = 0; i < n; i++) {
2513 			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
2514 			    PROT_READ,
2515 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2516 			    HAT_LOAD_REMAP);
2517 			pp += ptob(1);
2518 		}
2519 		metapgs -= n;
2520 	}
2521 }
2522 
2523 /*
2524  * Transition all the deleted pages to the deleted state so that
2525  * page_lock will not wait. The page_lock_delete call will
2526  * also wake up any waiters.
2527  */
2528 static void
2529 memseg_lock_delete_all(struct memseg *seg)
2530 {
2531 	page_t *pp;
2532 
2533 	for (pp = seg->pages; pp < seg->epages; pp++) {
2534 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2535 		page_lock_delete(pp);
2536 	}
2537 }
2538 
2539 static void
2540 kphysm_del_cleanup(struct mem_handle *mhp)
2541 {
2542 	struct memdelspan	*mdsp;
2543 	struct memseg		*seg;
2544 	struct memseg   	**segpp;
2545 	struct memseg		*seglist;
2546 	pfn_t			p_end;
2547 	uint64_t		avmem;
2548 	pgcnt_t			avpgs;
2549 	pgcnt_t			npgs;
2550 
2551 	avpgs = mhp->mh_vm_pages;
2552 
2553 	memsegs_lock(1);
2554 
2555 	/*
2556 	 * remove from main segment list.
2557 	 */
2558 	npgs = 0;
2559 	seglist = NULL;
2560 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2561 	    mdsp = mdsp->mds_next) {
2562 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2563 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2564 			if (seg->pages_base >= p_end ||
2565 			    seg->pages_end <= mdsp->mds_base) {
2566 				/* Span and memseg don't overlap. */
2567 				segpp = &((*segpp)->next);
2568 				continue;
2569 			}
2570 			ASSERT(seg->pages_base >= mdsp->mds_base);
2571 			ASSERT(seg->pages_end <= p_end);
2572 
2573 			/* Hide the memseg from future scans. */
2574 			hat_kpm_delmem_mseg_update(seg, segpp);
2575 			*segpp = seg->next;
2576 			membar_producer();	/* TODO: Needed? */
2577 			npgs += MSEG_NPAGES(seg);
2578 
2579 			/*
2580 			 * Leave the deleted segment's next pointer intact
2581 			 * in case a memsegs scanning loop is walking this
2582 			 * segment concurrently.
2583 			 */
2584 			seg->lnext = seglist;
2585 			seglist = seg;
2586 		}
2587 	}
2588 
2589 	build_pfn_hash();
2590 
2591 	ASSERT(npgs < total_pages);
2592 	total_pages -= npgs;
2593 
2594 	/*
2595 	 * Recalculate the paging parameters now total_pages has changed.
2596 	 * This will also cause the clock hands to be reset before next use.
2597 	 */
2598 	setupclock(1);
2599 
2600 	memsegs_unlock(1);
2601 
2602 	mutex_exit(&mhp->mh_mutex);
2603 
2604 	while ((seg = seglist) != NULL) {
2605 		pfn_t mseg_start;
2606 		pfn_t mseg_base, mseg_end;
2607 		pgcnt_t mseg_npgs;
2608 		page_t *pp;
2609 		pgcnt_t metapgs;
2610 		int dynamic;
2611 		int mlret;
2612 
2613 		seglist = seg->lnext;
2614 
2615 		/*
2616 		 * Put the page_t's into the deleted state to stop
2617 		 * cv_wait()s on the pages. When we remap, the dummy
2618 		 * page_t's will be in the same state.
2619 		 */
2620 		memseg_lock_delete_all(seg);
2621 		/*
2622 		 * Collect up information based on pages_base and pages_end
2623 		 * early so that we can flag early that the memseg has been
2624 		 * deleted by setting pages_end == pages_base.
2625 		 */
2626 		mseg_base = seg->pages_base;
2627 		mseg_end = seg->pages_end;
2628 		mseg_npgs = MSEG_NPAGES(seg);
2629 		dynamic = memseg_is_dynamic(seg, &mseg_start);
2630 
2631 		seg->pages_end = seg->pages_base;
2632 
2633 		if (dynamic) {
2634 			pp = seg->pages;
2635 			metapgs = mseg_base - mseg_start;
2636 			ASSERT(metapgs != 0);
2637 
2638 			/* Remap the meta data to our special dummy area. */
2639 			memseg_remap_to_dummy((caddr_t)pp, metapgs);
2640 
2641 			mutex_enter(&memseg_lists_lock);
2642 			seg->lnext = memseg_va_avail;
2643 			memseg_va_avail = seg;
2644 			mutex_exit(&memseg_lists_lock);
2645 		} else {
2646 			/*
2647 			 * Set for clean-up below.
2648 			 */
2649 			mseg_start = seg->pages_base;
2650 			/*
2651 			 * For memory whose page_ts were allocated
2652 			 * at boot, we need to find a new use for
2653 			 * the page_t memory.
2654 			 * For the moment, just leak it.
2655 			 * (It is held in the memseg_delete_junk list.)
2656 			 */
2657 
2658 			mutex_enter(&memseg_lists_lock);
2659 			seg->lnext = memseg_delete_junk;
2660 			memseg_delete_junk = seg;
2661 			mutex_exit(&memseg_lists_lock);
2662 		}
2663 
2664 		/* Must not use seg now as it could be re-used. */
2665 
2666 		memlist_write_lock();
2667 
2668 		mlret = memlist_delete_span(
2669 		    (uint64_t)(mseg_base) << PAGESHIFT,
2670 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2671 		    &phys_avail);
2672 		ASSERT(mlret == MEML_SPANOP_OK);
2673 
2674 		mlret = memlist_delete_span(
2675 		    (uint64_t)(mseg_start) << PAGESHIFT,
2676 		    (uint64_t)(mseg_end - mseg_start) <<
2677 		    PAGESHIFT,
2678 		    &phys_install);
2679 		ASSERT(mlret == MEML_SPANOP_OK);
2680 		phys_install_has_changed();
2681 
2682 		memlist_write_unlock();
2683 	}
2684 
2685 	memlist_read_lock();
2686 	installed_top_size(phys_install, &physmax, &physinstalled);
2687 	memlist_read_unlock();
2688 
2689 	mutex_enter(&freemem_lock);
2690 	maxmem -= avpgs;
2691 	physmem -= avpgs;
2692 	/* availrmem is adjusted during the delete. */
2693 	availrmem_initial -= avpgs;
2694 
2695 	mutex_exit(&freemem_lock);
2696 
2697 	dump_resize();
2698 
2699 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2700 	    "(0x%" PRIx64 ")\n",
2701 	    physinstalled << (PAGESHIFT - 10),
2702 	    (uint64_t)physinstalled << PAGESHIFT);
2703 
2704 	avmem = (uint64_t)freemem << PAGESHIFT;
2705 	cmn_err(CE_CONT, "?kphysm_delete: "
2706 	    "avail mem = %" PRId64 "\n", avmem);
2707 
2708 	/*
2709 	 * Update lgroup generation number on single lgroup systems
2710 	 */
2711 	if (nlgrps == 1)
2712 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2713 
2714 	/* Successfully deleted system memory */
2715 	mutex_enter(&mhp->mh_mutex);
2716 }
2717 
2718 static uint_t mdel_nullvp_waiter;
2719 
2720 static void
2721 page_delete_collect(
2722 	page_t *pp,
2723 	struct mem_handle *mhp)
2724 {
2725 	if (pp->p_vnode) {
2726 		page_hashout(pp, (kmutex_t *)NULL);
2727 		/* do not do PP_SETAGED(pp); */
2728 	} else {
2729 		kmutex_t *sep;
2730 
2731 		sep = page_se_mutex(pp);
2732 		mutex_enter(sep);
2733 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2734 			mdel_nullvp_waiter++;
2735 			cv_broadcast(&pp->p_cv);
2736 		}
2737 		mutex_exit(sep);
2738 	}
2739 	ASSERT(pp->p_next == pp->p_prev);
2740 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2741 	pp->p_next = mhp->mh_deleted;
2742 	mhp->mh_deleted = pp;
2743 	ASSERT(mhp->mh_hold_todo != 0);
2744 	mhp->mh_hold_todo--;
2745 }
2746 
2747 static void
2748 transit_list_collect(struct mem_handle *mhp, int v)
2749 {
2750 	struct transit_list_head *trh;
2751 
2752 	trh = &transit_list_head;
2753 	mutex_enter(&trh->trh_lock);
2754 	mhp->mh_transit.trl_collect = v;
2755 	mutex_exit(&trh->trh_lock);
2756 }
2757 
2758 static void
2759 transit_list_insert(struct transit_list *tlp)
2760 {
2761 	struct transit_list_head *trh;
2762 
2763 	trh = &transit_list_head;
2764 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2765 	tlp->trl_next = trh->trh_head;
2766 	trh->trh_head = tlp;
2767 }
2768 
2769 static void
2770 transit_list_remove(struct transit_list *tlp)
2771 {
2772 	struct transit_list_head *trh;
2773 	struct transit_list **tlpp;
2774 
2775 	trh = &transit_list_head;
2776 	tlpp = &trh->trh_head;
2777 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2778 	while (*tlpp != NULL && *tlpp != tlp)
2779 		tlpp = &(*tlpp)->trl_next;
2780 	ASSERT(*tlpp != NULL);
2781 	if (*tlpp == tlp)
2782 		*tlpp = tlp->trl_next;
2783 	tlp->trl_next = NULL;
2784 }
2785 
2786 static struct transit_list *
2787 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2788 {
2789 	struct transit_list *tlp;
2790 
2791 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2792 		struct memdelspan *mdsp;
2793 
2794 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2795 		    mdsp = mdsp->mds_next) {
2796 			if (pfnum >= mdsp->mds_base &&
2797 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2798 				return (tlp);
2799 			}
2800 		}
2801 	}
2802 	return (NULL);
2803 }
2804 
2805 int
2806 pfn_is_being_deleted(pfn_t pfnum)
2807 {
2808 	struct transit_list_head *trh;
2809 	struct transit_list *tlp;
2810 	int ret;
2811 
2812 	trh = &transit_list_head;
2813 	if (trh->trh_head == NULL)
2814 		return (0);
2815 
2816 	mutex_enter(&trh->trh_lock);
2817 	tlp = pfnum_to_transit_list(trh, pfnum);
2818 	ret = (tlp != NULL && tlp->trl_collect);
2819 	mutex_exit(&trh->trh_lock);
2820 
2821 	return (ret);
2822 }
2823 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.  Nothing is
 * printed unless mem_del_stat_print is non-zero.  The heading identifies
 * the delete by the base and page count of the handle's first span.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	struct memdelspan *first;
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	first = mhp->mh_transit.trl_spans;
	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)first->mds_base,
	    (uint_t)first->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	/* Event counters, one per line. */
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick totals, also shown as minutes and seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2877 
/*
 * One registered set of memory add/delete callbacks together with the
 * argument that is passed back to each of them.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;	/* callback vector; NULL if slot free */
	void			*arg;	/* first argument to each callback */
};

/* Capacity of the callback table. */
#define	NMEMCALLBACKS		100

/*
 * Callback table.  Slots [0, nmemcallbacks) have been used; a slot in
 * that range whose vec is NULL has been unregistered and may be reused.
 * Updates are made with mem_callback_rwlock held as writer.
 */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
2888 
2889 int
2890 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2891 {
2892 	uint_t i, found;
2893 
2894 	/*
2895 	 * This test will become more complicated when the version must
2896 	 * change.
2897 	 */
2898 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2899 		return (EINVAL);
2900 
2901 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2902 	    vec->post_del == NULL)
2903 		return (EINVAL);
2904 
2905 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2906 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2907 		if (mem_callbacks[i].vec == NULL && found == 0)
2908 			found = i + 1;
2909 		if (mem_callbacks[i].vec == vec &&
2910 		    mem_callbacks[i].arg == arg) {
2911 #ifdef DEBUG
2912 			/* Catch this in DEBUG kernels. */
2913 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2914 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2915 			    (void *)vec, arg, (void *)caller());
2916 #endif /* DEBUG */
2917 			rw_exit(&mem_callback_rwlock);
2918 			return (EEXIST);
2919 		}
2920 	}
2921 	if (found != 0) {
2922 		i = found - 1;
2923 	} else {
2924 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
2925 		if (nmemcallbacks == NMEMCALLBACKS) {
2926 			rw_exit(&mem_callback_rwlock);
2927 			return (ENOMEM);
2928 		}
2929 		i = nmemcallbacks++;
2930 	}
2931 	mem_callbacks[i].vec = vec;
2932 	mem_callbacks[i].arg = arg;
2933 	rw_exit(&mem_callback_rwlock);
2934 	return (0);
2935 }
2936 
2937 void
2938 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
2939 {
2940 	uint_t i;
2941 
2942 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2943 	for (i = 0; i < nmemcallbacks; i++) {
2944 		if (mem_callbacks[i].vec == vec &&
2945 		    mem_callbacks[i].arg == arg) {
2946 			mem_callbacks[i].vec = NULL;
2947 			mem_callbacks[i].arg = NULL;
2948 			if (i == (nmemcallbacks - 1))
2949 				nmemcallbacks--;
2950 			break;
2951 		}
2952 	}
2953 	rw_exit(&mem_callback_rwlock);
2954 }
2955 
2956 static void
2957 kphysm_setup_post_add(pgcnt_t delta_pages)
2958 {
2959 	uint_t i;
2960 
2961 	rw_enter(&mem_callback_rwlock, RW_READER);
2962 	for (i = 0; i < nmemcallbacks; i++) {
2963 		if (mem_callbacks[i].vec != NULL) {
2964 			(*mem_callbacks[i].vec->post_add)
2965 			    (mem_callbacks[i].arg, delta_pages);
2966 		}
2967 	}
2968 	rw_exit(&mem_callback_rwlock);
2969 }
2970 
2971 /*
2972  * Note the locking between pre_del and post_del: The reader lock is held
2973  * between the two calls to stop the set of functions from changing.
2974  */
2975 
2976 static int
2977 kphysm_setup_pre_del(pgcnt_t delta_pages)
2978 {
2979 	uint_t i;
2980 	int ret;
2981 	int aret;
2982 
2983 	ret = 0;
2984 	rw_enter(&mem_callback_rwlock, RW_READER);
2985 	for (i = 0; i < nmemcallbacks; i++) {
2986 		if (mem_callbacks[i].vec != NULL) {
2987 			aret = (*mem_callbacks[i].vec->pre_del)
2988 			    (mem_callbacks[i].arg, delta_pages);
2989 			ret |= aret;
2990 		}
2991 	}
2992 
2993 	return (ret);
2994 }
2995 
2996 static void
2997 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
2998 {
2999 	uint_t i;
3000 
3001 	for (i = 0; i < nmemcallbacks; i++) {
3002 		if (mem_callbacks[i].vec != NULL) {
3003 			(*mem_callbacks[i].vec->post_del)
3004 			    (mem_callbacks[i].arg, delta_pages, cancelled);
3005 		}
3006 	}
3007 	rw_exit(&mem_callback_rwlock);
3008 }
3009 
3010 static int
3011 kphysm_split_memseg(
3012 	pfn_t base,
3013 	pgcnt_t npgs)
3014 {
3015 	struct memseg *seg;
3016 	struct memseg **segpp;
3017 	pgcnt_t size_low, size_high;
3018 	struct memseg *seg_low, *seg_mid, *seg_high;
3019 
3020 	/*
3021 	 * Lock the memsegs list against other updates now
3022 	 */
3023 	memsegs_lock(1);
3024 
3025 	/*
3026 	 * Find boot time memseg that wholly covers this area.
3027 	 */
3028 
3029 	/* First find the memseg with page 'base' in it. */
3030 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3031 	    segpp = &((*segpp)->next)) {
3032 		if (base >= seg->pages_base && base < seg->pages_end)
3033 			break;
3034 	}
3035 	if (seg == NULL) {
3036 		memsegs_unlock(1);
3037 		return (0);
3038 	}
3039 	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
3040 		memsegs_unlock(1);
3041 		return (0);
3042 	}
3043 	if ((base + npgs) > seg->pages_end) {
3044 		memsegs_unlock(1);
3045 		return (0);
3046 	}
3047 
3048 	/*
3049 	 * Work out the size of the two segments that will
3050 	 * surround the new segment, one for low address
3051 	 * and one for high.
3052 	 */
3053 	ASSERT(base >= seg->pages_base);
3054 	size_low = base - seg->pages_base;
3055 	ASSERT(seg->pages_end >= (base + npgs));
3056 	size_high = seg->pages_end - (base + npgs);
3057 
3058 	/*
3059 	 * Sanity check.
3060 	 */
3061 	if ((size_low + size_high) == 0) {
3062 		memsegs_unlock(1);
3063 		return (0);
3064 	}
3065 
3066 	/*
3067 	 * Allocate the new structures. The old memseg will not be freed
3068 	 * as there may be a reference to it.
3069 	 */
3070 	seg_low = NULL;
3071 	seg_high = NULL;
3072 
3073 	if (size_low != 0) {
3074 		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3075 		bzero(seg_low, sizeof (struct memseg));
3076 	}
3077 
3078 	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3079 	bzero(seg_mid, sizeof (struct memseg));
3080 
3081 	if (size_high != 0) {
3082 		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3083 		bzero(seg_high, sizeof (struct memseg));
3084 	}
3085 
3086 	/*
3087 	 * All allocation done now.
3088 	 */
3089 	if (size_low != 0) {
3090 		seg_low->pages = seg->pages;
3091 		seg_low->epages = seg_low->pages + size_low;
3092 		seg_low->pages_base = seg->pages_base;
3093 		seg_low->pages_end = seg_low->pages_base + size_low;
3094 		seg_low->next = seg_mid;
3095 	}
3096 	if (size_high != 0) {
3097 		seg_high->pages = seg->epages - size_high;
3098 		seg_high->epages = seg_high->pages + size_high;
3099 		seg_high->pages_base = seg->pages_end - size_high;
3100 		seg_high->pages_end = seg_high->pages_base + size_high;
3101 		seg_high->next = seg->next;
3102 	}
3103 
3104 	seg_mid->pages = seg->pages + size_low;
3105 	seg_mid->pages_base = seg->pages_base + size_low;
3106 	seg_mid->epages = seg->epages - size_high;
3107 	seg_mid->pages_end = seg->pages_end - size_high;
3108 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3109 
3110 	/*
3111 	 * Update hat_kpm specific info of all involved memsegs and
3112 	 * allow hat_kpm specific global chain updates.
3113 	 */
3114 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3115 
3116 	/*
3117 	 * At this point we have two equivalent memseg sub-chains,
3118 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3119 	 * the same place in the global chain. By re-writing the pointer
3120 	 * in the previous element we switch atomically from using the old
3121 	 * (seg) to the new.
3122 	 */
3123 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3124 
3125 	membar_enter();
3126 
3127 	build_pfn_hash();
3128 	memsegs_unlock(1);
3129 
3130 	/*
3131 	 * We leave the old segment, 'seg', intact as there may be
3132 	 * references to it. Also, as the value of total_pages has not
3133 	 * changed and the memsegs list is effectively the same when
3134 	 * accessed via the old or the new pointer, we do not have to
3135 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3136 	 *
3137 	 * We currently do not re-use or reclaim the page_t memory.
3138 	 * If we do, then this may have to change.
3139 	 */
3140 
3141 	mutex_enter(&memseg_lists_lock);
3142 	seg->lnext = memseg_edit_junk;
3143 	memseg_edit_junk = seg;
3144 	mutex_exit(&memseg_lists_lock);
3145 
3146 	return (1);
3147 }
3148 
/*
 * The memsegs lock is only taken when modifying the memsegs list
 * and rebuilding the pfn hash table (after boot).
 * No lock is needed for reading, as memseg structures are never
 * de-allocated and the pointer linkage is never updated until the
 * memseg is ready.
 */
krwlock_t memsegslock;
3156 
3157 void
3158 memsegs_lock(int writer)
3159 {
3160 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
3161 }
3162 
/*
 * Release memsegslock.  'writer' is accepted for symmetry with
 * memsegs_lock() but is not used.
 */
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}
3169 
3170 /*
3171  * memlist (phys_install, phys_avail) locking.
3172  */
3173 
/*
 * Single mutex taken by all four memlist lock/unlock entry points, so
 * readers exclude each other as well as writers.
 * A read/write lock might be better here.
 */
static kmutex_t memlists_mutex;
3178 
/*
 * Take the memlist lock for reading.  Implemented with memlists_mutex,
 * so the hold is exclusive.
 */
void
memlist_read_lock()
{
	mutex_enter(&memlists_mutex);
}
3184 
/*
 * Drop the memlist lock taken by memlist_read_lock().
 */
void
memlist_read_unlock()
{
	mutex_exit(&memlists_mutex);
}
3190 
/*
 * Take the memlist lock for writing.  Uses the same mutex as the read
 * side.
 */
void
memlist_write_lock()
{
	mutex_enter(&memlists_mutex);
}
3196 
/*
 * Drop the memlist lock taken by memlist_write_lock().
 */
void
memlist_write_unlock()
{
	mutex_exit(&memlists_mutex);
}
3202 
/*
 * Create memseg_cache, the kmem cache from which memseg structures are
 * allocated.
 *
 * The sfmmu hat layer (e.g.) accesses some parts of the memseg
 * structure using physical addresses. Therefore a kmem_cache is
 * used with KMC_NOHASH to avoid page crossings within a memseg
 * structure. KMC_NOHASH requires that no external (outside of
 * slab) information is allowed. This, in turn, implies that the
 * cache's slabsize must be exactly a single page, since per-slab
 * information (e.g. the freelist for the slab) is kept at the
 * end of the slab, where it is easy to locate. This should be
 * changed when a more obvious kmem_cache interface/flag becomes
 * available.
 */
void
mem_config_init()
{
	/* No constructor, destructor or reclaim callbacks are supplied. */
	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
		0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
}
3221