xref: /titanic_51/usr/src/uts/common/os/mem_config.c (revision 78ed97a7b79b59ef2ef41f190c9be35c54d90119)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/cmn_err.h>
30 #include <sys/vmem.h>
31 #include <sys/kmem.h>
32 #include <sys/systm.h>
33 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
34 #include <sys/errno.h>
35 #include <sys/memnode.h>
36 #include <sys/memlist.h>
37 #include <sys/memlist_impl.h>
38 #include <sys/tuneable.h>
39 #include <sys/proc.h>
40 #include <sys/disp.h>
41 #include <sys/debug.h>
42 #include <sys/vm.h>
43 #include <sys/callb.h>
44 #include <sys/memlist_plat.h>	/* for installed_top_size() */
45 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
46 #include <sys/dumphdr.h>	/* for dump_resize() */
47 #include <sys/atomic.h>		/* for use in stats collection */
48 #include <sys/rwlock.h>
49 #include <sys/cpuvar.h>
50 #include <vm/seg_kmem.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/page.h>
53 #include <vm/vm_dep.h>
54 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
55 #include <sys/sunddi.h>
56 #include <sys/mem_config.h>
57 #include <sys/mem_cage.h>
58 #include <sys/lgrp.h>
59 #include <sys/ddi.h>
60 #include <sys/modctl.h>
61 
/*
 * Memory list that kphysm_add_memory_dynamic() extends (under the memlist
 * write lock) with the usable, non-metadata part of a newly added span.
 */
62 extern struct memlist *phys_avail;
63 
64 extern void mem_node_add(pfn_t, pfn_t);
65 extern void mem_node_del(pfn_t, pfn_t);
66 
/*
 * Called by kphysm_add_memory_dynamic() for the memory node of a newly
 * added slice; a non-zero return is treated there as a resource failure.
 */
67 extern uint_t page_ctrs_adjust(int);
/* Invoked with the number of usable pages once an add has completed. */
68 static void kphysm_setup_post_add(pgcnt_t);
69 static int kphysm_setup_pre_del(pgcnt_t);
70 static void kphysm_setup_post_del(pgcnt_t, int);
71 
/* Used by kphysm_del_span(); a return of 0 is treated as a failed split. */
72 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
73 
/*
 * Reserve/unreserve a pfn range on the add-side transit list so that it
 * cannot overlap any other span currently being added or deleted.
 */
74 static int delspan_reserve(pfn_t, pgcnt_t);
75 static void delspan_unreserve(pfn_t, pgcnt_t);
76 
/* Held by memseg_reuse() while it walks and edits memseg_va_avail. */
77 static kmutex_t memseg_lists_lock;
/*
 * List (linked through lnext) of memsegs whose page_t virtual address
 * range may be re-used by a later add; searched by memseg_reuse().
 */
78 static struct memseg *memseg_va_avail;
/* NOTE(review): the two junk lists are not used in the code reviewed here. */
79 static struct memseg *memseg_delete_junk;
80 static struct memseg *memseg_edit_junk;
81 void memseg_remap_init(void);
82 static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
/* Rolls back the phys_install update and span reservation of a failed add. */
83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
84 static struct memseg *memseg_reuse(pgcnt_t);
85 
/* kmem cache from which dynamic struct memseg's are allocated. */
86 static struct kmem_cache *memseg_cache;
87 
88 /*
89  * Add a chunk of memory to the system.  page_t's for this memory
90  * are allocated in the first few pages of the chunk.
91  * base: starting PAGESIZE page of new memory.
92  * npgs: length in PAGESIZE pages.
93  *
94  * Adding mem this way doesn't increase the size of the hash tables;
95  * growing them would be too hard.  This should be OK, but adding memory
96  * dynamically most likely means more hash misses, since the tables will
97  * be smaller than they otherwise would be.
98  */
99 int
100 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
101 {
102 	page_t		*pp;
103 	page_t		*opp, *oepp;
104 	struct memseg	*seg;
105 	uint64_t	avmem;
106 	pfn_t		pfn;
107 	pfn_t		pt_base = base;
108 	pgcnt_t		tpgs = npgs;
109 	pgcnt_t		metapgs;
110 	int		exhausted;
111 	pfn_t		pnum;
112 	int		mnode;
113 	caddr_t		vaddr;
114 	int		reuse;
115 	int		mlret;
116 	void		*mapva;
117 	pgcnt_t		nkpmpgs = 0;
118 	offset_t	kpm_pages_off;
119 
120 	cmn_err(CE_CONT,
121 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
122 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
123 
124 	/*
125 	 * Add this span in the delete list to prevent interactions.
126 	 */
127 	if (!delspan_reserve(base, npgs)) {
128 		return (KPHYSM_ESPAN);
129 	}
130 	/*
131 	 * Check to see if any of the memory span has been added
132 	 * by trying an add to the installed memory list. This
133 	 * forms the interlocking process for add.
134 	 */
135 
136 	memlist_write_lock();
137 
138 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
139 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
140 
141 	if (mlret == MEML_SPANOP_OK)
142 		installed_top_size(phys_install, &physmax, &physinstalled);
143 
144 	memlist_write_unlock();
145 
146 	if (mlret != MEML_SPANOP_OK) {
147 		if (mlret == MEML_SPANOP_EALLOC) {
148 			delspan_unreserve(pt_base, tpgs);
149 			return (KPHYSM_ERESOURCE);
150 		} else
151 		if (mlret == MEML_SPANOP_ESPAN) {
152 			delspan_unreserve(pt_base, tpgs);
153 			return (KPHYSM_ESPAN);
154 		} else {
155 			delspan_unreserve(pt_base, tpgs);
156 			return (KPHYSM_ERESOURCE);
157 		}
158 	}
159 
160 	/*
161 	 * We store the page_t's for this new memory in the first
162 	 * few pages of the chunk. Here, we go and get'em ...
163 	 */
164 
165 	/*
166 	 * The expression after the '-' gives the number of pages
167 	 * that will fit in the new memory based on a requirement
168 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
169 	 */
170 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
171 	    (PAGESIZE + sizeof (page_t)));
172 
173 	npgs -= metapgs;
174 	base += metapgs;
175 
176 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
177 
178 	exhausted = (metapgs == 0 || npgs == 0);
179 
180 	if (kpm_enable && !exhausted) {
181 		pgcnt_t start, end, nkpmpgs_prelim;
182 		size_t	ptsz;
183 
184 		/*
185 		 * A viable kpm large page mapping must not overlap two
186 		 * dynamic memsegs. Therefore the total size is checked
187 		 * to be at least kpm_pgsz and also whether start and end
188 		 * points are at least kpm_pgsz aligned.
189 		 */
190 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
191 		    pmodkpmp(base + npgs)) {
192 
193 			kphysm_addmem_error_undospan(pt_base, tpgs);
194 
195 			/*
196 			 * There is no specific error code for violating
197 			 * kpm granularity constraints.
198 			 */
199 			return (KPHYSM_ENOTVIABLE);
200 		}
201 
202 		start = kpmptop(ptokpmp(base));
203 		end = kpmptop(ptokpmp(base + npgs));
204 		nkpmpgs_prelim = ptokpmp(end - start);
205 		ptsz = npgs * sizeof (page_t);
206 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
207 		exhausted = (tpgs <= metapgs);
208 		if (!exhausted) {
209 			npgs = tpgs - metapgs;
210 			base = pt_base + metapgs;
211 
212 			/* final nkpmpgs */
213 			start = kpmptop(ptokpmp(base));
214 			nkpmpgs = ptokpmp(end - start);
215 			kpm_pages_off = ptsz +
216 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
217 		}
218 	}
219 
220 	/*
221 	 * Is memory area supplied too small?
222 	 */
223 	if (exhausted) {
224 		kphysm_addmem_error_undospan(pt_base, tpgs);
225 
226 		/*
227 		 * There is no specific error code for 'too small'.
228 		 */
229 		return (KPHYSM_ERESOURCE);
230 	}
231 
232 	/*
233 	 * We may re-use a previously allocated VA space for the page_ts
234 	 * eventually, but we need to initialize and lock the pages first.
235 	 */
236 
237 	/*
238 	 * Get an address in the kernel address map, map
239 	 * the page_t pages and see if we can touch them.
240 	 */
241 
242 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
243 	if (mapva == NULL) {
244 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
245 		    " Can't allocate VA for page_ts");
246 
247 		kphysm_addmem_error_undospan(pt_base, tpgs);
248 
249 		return (KPHYSM_ERESOURCE);
250 	}
251 	pp = mapva;
252 
253 	if (physmax < (pt_base + tpgs))
254 		physmax = (pt_base + tpgs);
255 
256 	/*
257 	 * In the remapping code we map one page at a time so we must do
258 	 * the same here to match mapping sizes.
259 	 */
260 	pfn = pt_base;
261 	vaddr = (caddr_t)pp;
262 	for (pnum = 0; pnum < metapgs; pnum++) {
263 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
264 		    PROT_READ | PROT_WRITE,
265 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
266 		pfn++;
267 		vaddr += ptob(1);
268 	}
269 
270 	if (ddi_peek32((dev_info_t *)NULL,
271 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
272 
273 		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
274 		    " Can't access pp array at 0x%p [phys 0x%lx]",
275 		    (void *)pp, pt_base);
276 
277 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
278 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
279 
280 		vmem_free(heap_arena, mapva, ptob(metapgs));
281 
282 		kphysm_addmem_error_undospan(pt_base, tpgs);
283 
284 		return (KPHYSM_EFAULT);
285 	}
286 
287 	/*
288 	 * Add this memory slice to its memory node translation.
289 	 *
290 	 * Note that right now, each node may have only one slice;
291 	 * this may change with COD or in larger SSM systems with
292 	 * nested latency groups, so we must not assume that the
293 	 * node does not yet exist.
294 	 */
295 	pnum = base + npgs - 1;
296 	mem_node_add_slice(base, pnum);
297 
298 	/*
299 	 * Allocate or resize page counters as necessary to accommodate
300 	 * the increase in memory pages.
301 	 */
302 	mnode = PFN_2_MEM_NODE(pnum);
303 	if (page_ctrs_adjust(mnode) != 0) {
304 
305 		mem_node_pre_del_slice(base, pnum);
306 		mem_node_post_del_slice(base, pnum, 0);
307 
308 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
309 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
310 
311 		vmem_free(heap_arena, mapva, ptob(metapgs));
312 
313 		kphysm_addmem_error_undospan(pt_base, tpgs);
314 
315 		return (KPHYSM_ERESOURCE);
316 	}
317 
318 	/*
319 	 * Update the phys_avail memory list.
320 	 * The phys_install list was done at the start.
321 	 */
322 
323 	memlist_write_lock();
324 
325 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
326 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
327 	ASSERT(mlret == MEML_SPANOP_OK);
328 
329 	memlist_write_unlock();
330 
331 	/* See if we can find a memseg to re-use. */
332 	seg = memseg_reuse(metapgs);
333 
334 	reuse = (seg != NULL);
335 
336 	/*
337 	 * Initialize the memseg structure representing this memory
338 	 * and add it to the existing list of memsegs. Do some basic
339 	 * initialization and add the memory to the system.
340 	 * In order to prevent lock deadlocks, the add_physmem()
341 	 * code is repeated here, but split into several stages.
342 	 */
343 	if (seg == NULL) {
344 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
345 		bzero(seg, sizeof (struct memseg));
346 		seg->msegflags = MEMSEG_DYNAMIC;
347 		seg->pages = pp;
348 	} else {
349 		/*EMPTY*/
350 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
351 	}
352 
353 	seg->epages = seg->pages + npgs;
354 	seg->pages_base = base;
355 	seg->pages_end = base + npgs;
356 
357 	/*
358 	 * Initialize metadata. The page_ts are set to locked state
359 	 * ready to be freed.
360 	 */
361 	bzero((caddr_t)pp, ptob(metapgs));
362 
363 	pfn = seg->pages_base;
364 	/* Save the original pp base in case we reuse a memseg. */
365 	opp = pp;
366 	oepp = opp + npgs;
367 	for (pp = opp; pp < oepp; pp++) {
368 		pp->p_pagenum = pfn;
369 		pfn++;
370 		page_iolock_init(pp);
371 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
372 			continue;
373 		pp->p_offset = (u_offset_t)-1;
374 	}
375 
376 	if (reuse) {
377 		/* Remap our page_ts to the re-used memseg VA space. */
378 		pfn = pt_base;
379 		vaddr = (caddr_t)seg->pages;
380 		for (pnum = 0; pnum < metapgs; pnum++) {
381 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
382 			    PROT_READ | PROT_WRITE,
383 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
384 			pfn++;
385 			vaddr += ptob(1);
386 		}
387 
388 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
389 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
390 
391 		vmem_free(heap_arena, mapva, ptob(metapgs));
392 	}
393 
394 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
395 
396 	memsegs_lock(1);
397 
398 	/*
399 	 * The new memseg is inserted at the beginning of the list.
400 	 * Not only does this save searching for the tail, but in the
401 	 * case of a re-used memseg, it solves the problem of what
402 	 * happens of some process has still got a pointer to the
403 	 * memseg and follows the next pointer to continue traversing
404 	 * the memsegs list.
405 	 */
406 
407 	hat_kpm_addmem_mseg_insert(seg);
408 
409 	seg->next = memsegs;
410 	membar_producer();
411 
412 	hat_kpm_addmem_memsegs_update(seg);
413 
414 	memsegs = seg;
415 
416 	build_pfn_hash();
417 
418 	total_pages += npgs;
419 
420 	/*
421 	 * Recalculate the paging parameters now total_pages has changed.
422 	 * This will also cause the clock hands to be reset before next use.
423 	 */
424 	setupclock(1);
425 
426 	memsegs_unlock(1);
427 
428 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
429 
430 	/*
431 	 * Free the pages outside the lock to avoid locking loops.
432 	 */
433 	for (pp = seg->pages; pp < seg->epages; pp++) {
434 		page_free(pp, 1);
435 	}
436 
437 	/*
438 	 * Now that we've updated the appropriate memory lists we
439 	 * need to reset a number of globals, since we've increased memory.
440 	 * Several have already been updated for us as noted above. The
441 	 * globals we're interested in at this point are:
442 	 *   physmax - highest page frame number.
443 	 *   physinstalled - number of pages currently installed (done earlier)
444 	 *   maxmem - max free pages in the system
445 	 *   physmem - physical memory pages available
446 	 *   availrmem - real memory available
447 	 */
448 
449 	mutex_enter(&freemem_lock);
450 	maxmem += npgs;
451 	physmem += npgs;
452 	availrmem += npgs;
453 	availrmem_initial += npgs;
454 
455 	mutex_exit(&freemem_lock);
456 
457 	dump_resize();
458 
459 	page_freelist_coalesce_all(mnode);
460 
461 	kphysm_setup_post_add(npgs);
462 
463 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
464 	    "(0x%" PRIx64 ")\n",
465 	    physinstalled << (PAGESHIFT - 10),
466 	    (uint64_t)physinstalled << PAGESHIFT);
467 
468 	avmem = (uint64_t)freemem << PAGESHIFT;
469 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
470 	    "avail mem = %" PRId64 "\n", avmem);
471 
472 	/*
473 	 * Update lgroup generation number on single lgroup systems
474 	 */
475 	if (nlgrps == 1)
476 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
477 
478 	delspan_unreserve(pt_base, tpgs);
479 	return (KPHYSM_OK);		/* Successfully added system memory */
480 
481 }
482 
483 /*
484  * There are various error conditions in kphysm_add_memory_dynamic()
485  * which require a rollback of already changed global state.
486  */
487 static void
488 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
489 {
490 	int mlret;
491 
492 	/* Unreserve memory span. */
493 	memlist_write_lock();
494 
495 	mlret = memlist_delete_span(
496 	    (uint64_t)(pt_base) << PAGESHIFT,
497 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
498 
499 	ASSERT(mlret == MEML_SPANOP_OK);
500 	phys_install_has_changed();
501 	installed_top_size(phys_install, &physmax, &physinstalled);
502 
503 	memlist_write_unlock();
504 	delspan_unreserve(pt_base, tpgs);
505 }
506 
507 /*
508  * Only return an available memseg of exactly the right size.
509  * When the meta data area has it's own virtual address space
510  * we will need to manage this more carefully and do best fit
511  * allocations, possibly splitting an available area.
512  */
513 static struct memseg *
514 memseg_reuse(pgcnt_t metapgs)
515 {
516 	struct memseg **segpp, *seg;
517 
518 	mutex_enter(&memseg_lists_lock);
519 
520 	segpp = &memseg_va_avail;
521 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
522 		caddr_t end;
523 
524 		if (kpm_enable)
525 			end = hat_kpm_mseg_reuse(seg);
526 		else
527 			end = (caddr_t)seg->epages;
528 
529 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
530 			*segpp = seg->lnext;
531 			seg->lnext = NULL;
532 			break;
533 		}
534 	}
535 	mutex_exit(&memseg_lists_lock);
536 
537 	return (seg);
538 }
539 
/*
 * Source of external handle values: incremented, with
 * mem_handle_list_mutex held, each time a mem_handle is allocated.
 */
540 static uint_t handle_gen;
541 
/*
 * One contiguous range of page frames taking part in an add or delete.
 * Spans are kept on singly linked lists hanging off a transit_list.
 */
542 struct memdelspan {
543 	struct memdelspan *mds_next;	/* next span on the same list */
544 	pfn_t		mds_base;	/* first pfn of the span */
545 	pgcnt_t		mds_npgs;	/* length of the span in pages */
	/*
	 * NOTE(review): the bitmaps are not used in the code reviewed here;
	 * presumably one bit per page, sized by MDS_BITMAPBYTES() - confirm.
	 */
546 	uint_t		*mds_bitmap;
547 	uint_t		*mds_bitmap_retired;
548 };
549 
/*
 * NBPBMW is the number of bits in one bitmap word (a uint_t).
 * MDS_BITMAPBYTES() gives the size in bytes of a bitmap holding one bit
 * per page of the span, rounded up to a whole number of bitmap words.
 */
550 #define	NBPBMW		(sizeof (uint_t) * NBBY)
551 #define	MDS_BITMAPBYTES(MDSP) \
552 	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
553 
/*
 * A set of spans belonging to one activity (one delete handle, or the
 * add-side reservation list).  delspan_insert() puts a transit_list on
 * the global list when it gains its first span, and delspan_remove()
 * takes it off again when its last span is removed.
 */
554 struct transit_list {
555 	struct transit_list	*trl_next;	/* next list on global list */
556 	struct memdelspan	*trl_spans;	/* spans of this activity */
	/* NOTE(review): trl_collect is not used in the code reviewed here. */
557 	int			trl_collect;
558 };
559 
/*
 * Head of the global list of transit lists.  trh_lock is held by
 * delspan_insert() and delspan_remove() across every traversal or
 * modification of the list and of the span lists hanging off it.
 */
560 struct transit_list_head {
561 	kmutex_t		trh_lock;
562 	struct transit_list	*trh_head;
563 };
564 
/* The single global instance used by delspan_insert()/delspan_remove(). */
565 static struct transit_list_head transit_list_head;
566 
567 struct mem_handle;
568 static void transit_list_collect(struct mem_handle *, int);
/* Both are called by delspan_insert()/delspan_remove() with trh_lock held. */
569 static void transit_list_insert(struct transit_list *);
570 static void transit_list_remove(struct transit_list *);
571 
/*
 * Delete statistics.  MEM_DEL_STATS is turned on for DEBUG builds; when
 * it is not defined the MDSTAT_* macros below expand to nothing, so the
 * call sites need no conditional compilation of their own.
 */
572 #ifdef DEBUG
573 #define	MEM_DEL_STATS
574 #endif /* DEBUG */
575 
576 #ifdef MEM_DEL_STATS
/* NOTE(review): presumably gates printing of the stats - confirm at use. */
577 static int mem_del_stat_print = 0;
/* Per-handle counters; embedded in struct mem_handle as mh_delstat. */
578 struct mem_del_stat {
579 	uint_t	nloop;
580 	uint_t	need_free;
581 	uint_t	free_loop;
582 	uint_t	free_low;
583 	uint_t	free_failed;
584 	uint_t	ncheck;
585 	uint_t	nopaget;
586 	uint_t	lockfail;
587 	uint_t	nfree;
588 	uint_t	nreloc;
589 	uint_t	nrelocfail;
590 	uint_t	already_done;
591 	uint_t	first_notfree;
592 	uint_t	npplocked;
593 	uint_t	nlockreloc;
594 	uint_t	nnorepl;
595 	uint_t	nmodreloc;
596 	uint_t	ndestroy;
597 	uint_t	nputpage;
598 	uint_t	nnoreclaim;
599 	uint_t	ndelay;
600 	uint_t	demotefail;
601 	uint64_t nticks_total;
602 	uint64_t nticks_pgrp;
603 	uint_t	retired;
604 	uint_t	toxic;
605 	uint_t	failing;
606 	uint_t	modtoxic;
607 	uint_t	npplkdtoxic;
608 	uint_t	gptlmodfail;
609 	uint_t	gptllckfail;
610 };
611 /*
612  * The stat values are only incremented in the delete thread
613  * so no locking or atomic required.
614  */
/* Increment counter FLD of the handle's mh_delstat. */
615 #define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Accumulate ntck into the handle's nticks_total / nticks_pgrp. */
616 #define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
617 #define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
618 static void mem_del_stat_print_func(struct mem_handle *);
619 #define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
620 #else /* MEM_DEL_STATS */
/* Statistics compiled out: every macro expands to nothing. */
621 #define	MDSTAT_INCR(MHP, FLD)
622 #define	MDSTAT_TOTAL(MHP, ntck)
623 #define	MDSTAT_PGRP(MHP, ntck)
624 #define	MDSTAT_PRINT(MHP)
625 #endif /* MEM_DEL_STATS */
626 
/*
 * State of a delete handle.  A handle is zero-allocated, so it starts
 * out as MHND_FREE; kphysm_del_gethandle() moves it to MHND_INIT, the
 * only state in which kphysm_del_span() accepts it.  Handles in the
 * MHND_FREE state are never returned by kphysm_lookup_mem_handle().
 */
627 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
628 	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
629 
630 /*
631  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
632  * The mutex may not be required for other fields, dependent on mh_state.
633  */
634 struct mem_handle {
635 	kmutex_t	mh_mutex;
636 	struct mem_handle *mh_next;	/* link on mem_handle_head list */
637 	memhandle_t	mh_exthandle;	/* value handed out to callers */
638 	mhnd_state_t	mh_state;
639 	struct transit_list mh_transit;	/* spans added to this handle */
640 	pgcnt_t		mh_phys_pages;	/* summed by kphysm_del_span() */
641 	pgcnt_t		mh_vm_pages;	/* summed by kphysm_del_span() */
642 	pgcnt_t		mh_hold_todo;
643 	void		(*mh_delete_complete)(void *, int error);
644 	void		*mh_delete_complete_arg;
645 	volatile uint_t mh_cancel;
646 	volatile uint_t mh_dr_aio_cleanup_cancel;
647 	volatile uint_t mh_aio_cleanup_done;
648 	kcondvar_t	mh_cv;
649 	kthread_id_t	mh_thread_id;
650 	page_t		*mh_deleted;	/* link through p_next */
651 #ifdef MEM_DEL_STATS
652 	struct mem_del_stat mh_delstat;
653 #endif /* MEM_DEL_STATS */
654 };
655 
/*
 * List of all allocated handles, linked through mh_next.  The list and
 * handle_gen are protected by mem_handle_list_mutex, which is acquired
 * before a handle's mh_mutex when both are needed.
 */
656 static struct mem_handle *mem_handle_head;
657 static kmutex_t mem_handle_list_mutex;
658 
659 static struct mem_handle *
660 kphysm_allocate_mem_handle()
661 {
662 	struct mem_handle *mhp;
663 
664 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
665 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
666 	mutex_enter(&mem_handle_list_mutex);
667 	mutex_enter(&mhp->mh_mutex);
668 	/* handle_gen is protected by list mutex. */
669 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
670 	mhp->mh_next = mem_handle_head;
671 	mem_handle_head = mhp;
672 	mutex_exit(&mem_handle_list_mutex);
673 
674 	return (mhp);
675 }
676 
677 static void
678 kphysm_free_mem_handle(struct mem_handle *mhp)
679 {
680 	struct mem_handle **mhpp;
681 
682 	ASSERT(mutex_owned(&mhp->mh_mutex));
683 	ASSERT(mhp->mh_state == MHND_FREE);
684 	/*
685 	 * Exit the mutex to preserve locking order. This is OK
686 	 * here as once in the FREE state, the handle cannot
687 	 * be found by a lookup.
688 	 */
689 	mutex_exit(&mhp->mh_mutex);
690 
691 	mutex_enter(&mem_handle_list_mutex);
692 	mhpp = &mem_handle_head;
693 	while (*mhpp != NULL && *mhpp != mhp)
694 		mhpp = &(*mhpp)->mh_next;
695 	ASSERT(*mhpp == mhp);
696 	/*
697 	 * No need to lock the handle (mh_mutex) as only
698 	 * mh_next changing and this is the only thread that
699 	 * can be referncing mhp.
700 	 */
701 	*mhpp = mhp->mh_next;
702 	mutex_exit(&mem_handle_list_mutex);
703 
704 	mutex_destroy(&mhp->mh_mutex);
705 	kmem_free(mhp, sizeof (struct mem_handle));
706 }
707 
708 /*
709  * This function finds the internal mem_handle corresponding to an
710  * external handle and returns it with the mh_mutex held.
711  */
712 static struct mem_handle *
713 kphysm_lookup_mem_handle(memhandle_t handle)
714 {
715 	struct mem_handle *mhp;
716 
717 	mutex_enter(&mem_handle_list_mutex);
718 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
719 		if (mhp->mh_exthandle == handle) {
720 			mutex_enter(&mhp->mh_mutex);
721 			/*
722 			 * The state of the handle could have been changed
723 			 * by kphysm_del_release() while waiting for mh_mutex.
724 			 */
725 			if (mhp->mh_state == MHND_FREE) {
726 				mutex_exit(&mhp->mh_mutex);
727 				continue;
728 			}
729 			break;
730 		}
731 	}
732 	mutex_exit(&mem_handle_list_mutex);
733 	return (mhp);
734 }
735 
736 int
737 kphysm_del_gethandle(memhandle_t *xmhp)
738 {
739 	struct mem_handle *mhp;
740 
741 	mhp = kphysm_allocate_mem_handle();
742 	/*
743 	 * The handle is allocated using KM_SLEEP, so cannot fail.
744 	 * If the implementation is changed, the correct error to return
745 	 * here would be KPHYSM_ENOHANDLES.
746 	 */
747 	ASSERT(mhp->mh_state == MHND_FREE);
748 	mhp->mh_state = MHND_INIT;
749 	*xmhp = mhp->mh_exthandle;
750 	mutex_exit(&mhp->mh_mutex);
751 	return (KPHYSM_OK);
752 }
753 
754 static int
755 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
756 {
757 	pfn_t e1, e2;
758 
759 	e1 = b1 + l1;
760 	e2 = b2 + l2;
761 
762 	return (!(b2 >= e1 || b1 >= e2));
763 }
764 
765 static int can_remove_pgs(pgcnt_t);
766 
767 static struct memdelspan *
768 span_to_install(pfn_t base, pgcnt_t npgs)
769 {
770 	struct memdelspan *mdsp;
771 	struct memdelspan *mdsp_new;
772 	uint64_t address, size, thislen;
773 	struct memlist *mlp;
774 
775 	mdsp_new = NULL;
776 
777 	address = (uint64_t)base << PAGESHIFT;
778 	size = (uint64_t)npgs << PAGESHIFT;
779 	while (size != 0) {
780 		memlist_read_lock();
781 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
782 			if (address >= (mlp->address + mlp->size))
783 				continue;
784 			if ((address + size) > mlp->address)
785 				break;
786 		}
787 		if (mlp == NULL) {
788 			address += size;
789 			size = 0;
790 			thislen = 0;
791 		} else {
792 			if (address < mlp->address) {
793 				size -= (mlp->address - address);
794 				address = mlp->address;
795 			}
796 			ASSERT(address >= mlp->address);
797 			if ((address + size) > (mlp->address + mlp->size)) {
798 				thislen = mlp->size - (address - mlp->address);
799 			} else {
800 				thislen = size;
801 			}
802 		}
803 		memlist_read_unlock();
804 		/* TODO: phys_install could change now */
805 		if (thislen == 0)
806 			continue;
807 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
808 		mdsp->mds_base = btop(address);
809 		mdsp->mds_npgs = btop(thislen);
810 		mdsp->mds_next = mdsp_new;
811 		mdsp_new = mdsp;
812 		address += thislen;
813 		size -= thislen;
814 	}
815 	return (mdsp_new);
816 }
817 
818 static void
819 free_delspans(struct memdelspan *mdsp)
820 {
821 	struct memdelspan *amdsp;
822 
823 	while ((amdsp = mdsp) != NULL) {
824 		mdsp = amdsp->mds_next;
825 		kmem_free(amdsp, sizeof (struct memdelspan));
826 	}
827 }
828 
829 /*
830  * Concatenate lists. No list ordering is required.
831  */
832 
833 static void
834 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
835 {
836 	while (*mdspp != NULL)
837 		mdspp = &(*mdspp)->mds_next;
838 
839 	*mdspp = mdsp;
840 }
841 
842 /*
843  * Given a new list of delspans, check there is no overlap with
844  * all existing span activity (add or delete) and then concatenate
845  * the new spans to the given list.
846  * Return 1 for OK, 0 if overlapping.
847  */
848 static int
849 delspan_insert(
850 	struct transit_list *my_tlp,
851 	struct memdelspan *mdsp_new)
852 {
853 	struct transit_list_head *trh;
854 	struct transit_list *tlp;
855 	int ret;
856 
857 	trh = &transit_list_head;
858 
859 	ASSERT(my_tlp != NULL);
860 	ASSERT(mdsp_new != NULL);
861 
862 	ret = 1;
863 	mutex_enter(&trh->trh_lock);
864 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
865 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
866 		struct memdelspan *mdsp;
867 
868 		for (mdsp = tlp->trl_spans; mdsp != NULL;
869 		    mdsp = mdsp->mds_next) {
870 			struct memdelspan *nmdsp;
871 
872 			for (nmdsp = mdsp_new; nmdsp != NULL;
873 			    nmdsp = nmdsp->mds_next) {
874 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
875 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
876 					ret = 0;
877 					goto done;
878 				}
879 			}
880 		}
881 	}
882 done:
883 	if (ret != 0) {
884 		if (my_tlp->trl_spans == NULL)
885 			transit_list_insert(my_tlp);
886 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
887 	}
888 	mutex_exit(&trh->trh_lock);
889 	return (ret);
890 }
891 
892 static void
893 delspan_remove(
894 	struct transit_list *my_tlp,
895 	pfn_t base,
896 	pgcnt_t npgs)
897 {
898 	struct transit_list_head *trh;
899 	struct memdelspan *mdsp;
900 
901 	trh = &transit_list_head;
902 
903 	ASSERT(my_tlp != NULL);
904 
905 	mutex_enter(&trh->trh_lock);
906 	if ((mdsp = my_tlp->trl_spans) != NULL) {
907 		if (npgs == 0) {
908 			my_tlp->trl_spans = NULL;
909 			free_delspans(mdsp);
910 			transit_list_remove(my_tlp);
911 		} else {
912 			struct memdelspan **prv;
913 
914 			prv = &my_tlp->trl_spans;
915 			while (mdsp != NULL) {
916 				pfn_t p_end;
917 
918 				p_end = mdsp->mds_base + mdsp->mds_npgs;
919 				if (mdsp->mds_base >= base &&
920 				    p_end <= (base + npgs)) {
921 					*prv = mdsp->mds_next;
922 					mdsp->mds_next = NULL;
923 					free_delspans(mdsp);
924 				} else {
925 					prv = &mdsp->mds_next;
926 				}
927 				mdsp = *prv;
928 			}
929 			if (my_tlp->trl_spans == NULL)
930 				transit_list_remove(my_tlp);
931 		}
932 	}
933 	mutex_exit(&trh->trh_lock);
934 }
935 
936 /*
937  * Reserve interface for add to stop delete before add finished.
938  * This list is only accessed through the delspan_insert/remove
939  * functions and so is fully protected by trh_lock in transit_list_head.
940  */
941 
942 static struct transit_list reserve_transit;
943 
944 static int
945 delspan_reserve(pfn_t base, pgcnt_t npgs)
946 {
947 	struct memdelspan *mdsp;
948 	int ret;
949 
950 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
951 	mdsp->mds_base = base;
952 	mdsp->mds_npgs = npgs;
953 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
954 		free_delspans(mdsp);
955 	}
956 	return (ret);
957 }
958 
/*
 * Release a reservation made by delspan_reserve(): remove the spans that
 * lie within [base, base + npgs) from the add-side reserve list.  Note
 * that delspan_remove() treats npgs == 0 as "remove every span".
 */
959 static void
960 delspan_unreserve(pfn_t base, pgcnt_t npgs)
961 {
962 	delspan_remove(&reserve_transit, base, npgs);
963 }
964 
965 /*
966  * Return whether memseg was created by kphysm_add_memory_dynamic().
967  * If this is the case and startp non zero, return also the start pfn
968  * of the meta data via startp.
969  */
970 static int
971 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
972 {
973 	pfn_t		pt_start;
974 
975 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
976 		return (0);
977 
978 	/* Meta data is required to be at the beginning */
979 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
980 
981 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
982 	if (startp != NULL)
983 		*startp = pt_start;
984 
985 	return (1);
986 }
987 
988 int
989 kphysm_del_span(
990 	memhandle_t handle,
991 	pfn_t base,
992 	pgcnt_t npgs)
993 {
994 	struct mem_handle *mhp;
995 	struct memseg *seg;
996 	struct memdelspan *mdsp;
997 	struct memdelspan *mdsp_new;
998 	pgcnt_t phys_pages, vm_pages;
999 	pfn_t p_end;
1000 	page_t *pp;
1001 	int ret;
1002 
1003 	mhp = kphysm_lookup_mem_handle(handle);
1004 	if (mhp == NULL) {
1005 		return (KPHYSM_EHANDLE);
1006 	}
1007 	if (mhp->mh_state != MHND_INIT) {
1008 		mutex_exit(&mhp->mh_mutex);
1009 		return (KPHYSM_ESEQUENCE);
1010 	}
1011 
1012 	/*
1013 	 * Intersect the span with the installed memory list (phys_install).
1014 	 */
1015 	mdsp_new = span_to_install(base, npgs);
1016 	if (mdsp_new == NULL) {
1017 		/*
1018 		 * No physical memory in this range. Is this an
1019 		 * error? If an attempt to start the delete is made
1020 		 * for OK returns from del_span such as this, start will
1021 		 * return an error.
1022 		 * Could return KPHYSM_ENOWORK.
1023 		 */
1024 		/*
1025 		 * It is assumed that there are no error returns
1026 		 * from span_to_install() due to kmem_alloc failure.
1027 		 */
1028 		mutex_exit(&mhp->mh_mutex);
1029 		return (KPHYSM_OK);
1030 	}
1031 	/*
1032 	 * Does this span overlap an existing span?
1033 	 */
1034 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1035 		/*
1036 		 * Differentiate between already on list for this handle
1037 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1038 		 */
1039 		ret = KPHYSM_EBUSY;
1040 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1041 		    mdsp = mdsp->mds_next) {
1042 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1043 			    base, npgs)) {
1044 				ret = KPHYSM_EDUP;
1045 				break;
1046 			}
1047 		}
1048 		mutex_exit(&mhp->mh_mutex);
1049 		free_delspans(mdsp_new);
1050 		return (ret);
1051 	}
1052 	/*
1053 	 * At this point the spans in mdsp_new have been inserted into the
1054 	 * list of spans for this handle and thereby to the global list of
1055 	 * spans being processed. Each of these spans must now be checked
1056 	 * for relocatability. As a side-effect segments in the memseg list
1057 	 * may be split.
1058 	 *
1059 	 * Note that mdsp_new can no longer be used as it is now part of
1060 	 * a larger list. Select elements of this larger list based
1061 	 * on base and npgs.
1062 	 */
1063 restart:
1064 	phys_pages = 0;
1065 	vm_pages = 0;
1066 	ret = KPHYSM_OK;
1067 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1068 	    mdsp = mdsp->mds_next) {
1069 		pgcnt_t pages_checked;
1070 
1071 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1072 			continue;
1073 		}
1074 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1075 		/*
1076 		 * The pages_checked count is a hack. All pages should be
1077 		 * checked for relocatability. Those not covered by memsegs
1078 		 * should be tested with arch_kphysm_del_span_ok().
1079 		 */
1080 		pages_checked = 0;
1081 		for (seg = memsegs; seg; seg = seg->next) {
1082 			pfn_t mseg_start;
1083 
1084 			if (seg->pages_base >= p_end ||
1085 			    seg->pages_end <= mdsp->mds_base) {
1086 				/* Span and memseg don't overlap. */
1087 				continue;
1088 			}
1089 			/* Check that segment is suitable for delete. */
1090 			if (memseg_is_dynamic(seg, &mseg_start)) {
1091 				/*
1092 				 * Can only delete whole added segments
1093 				 * for the moment.
1094 				 * Check that this is completely within the
1095 				 * span.
1096 				 */
1097 				if (mseg_start < mdsp->mds_base ||
1098 				    seg->pages_end > p_end) {
1099 					ret = KPHYSM_EBUSY;
1100 					break;
1101 				}
1102 				pages_checked += seg->pages_end - mseg_start;
1103 			} else {
1104 				/*
1105 				 * Set mseg_start for accounting below.
1106 				 */
1107 				mseg_start = seg->pages_base;
1108 				/*
1109 				 * If this segment is larger than the span,
1110 				 * try to split it. After the split, it
1111 				 * is necessary to restart.
1112 				 */
1113 				if (seg->pages_base < mdsp->mds_base ||
1114 				    seg->pages_end > p_end) {
1115 					pfn_t abase;
1116 					pgcnt_t anpgs;
1117 					int s_ret;
1118 
1119 					/* Split required.  */
1120 					if (mdsp->mds_base < seg->pages_base)
1121 						abase = seg->pages_base;
1122 					else
1123 						abase = mdsp->mds_base;
1124 					if (p_end > seg->pages_end)
1125 						anpgs = seg->pages_end - abase;
1126 					else
1127 						anpgs = p_end - abase;
1128 					s_ret = kphysm_split_memseg(abase,
1129 					    anpgs);
1130 					if (s_ret == 0) {
1131 						/* Split failed. */
1132 						ret = KPHYSM_ERESOURCE;
1133 						break;
1134 					}
1135 					goto restart;
1136 				}
1137 				pages_checked +=
1138 				    seg->pages_end - seg->pages_base;
1139 			}
1140 			/*
1141 			 * The memseg is wholly within the delete span.
1142 			 * The individual pages can now be checked.
1143 			 */
1144 			/* Cage test. */
1145 			for (pp = seg->pages; pp < seg->epages; pp++) {
1146 				if (PP_ISNORELOC(pp)) {
1147 					ret = KPHYSM_ENONRELOC;
1148 					break;
1149 				}
1150 			}
1151 			if (ret != KPHYSM_OK) {
1152 				break;
1153 			}
1154 			phys_pages += (seg->pages_end - mseg_start);
1155 			vm_pages += MSEG_NPAGES(seg);
1156 		}
1157 		if (ret != KPHYSM_OK)
1158 			break;
1159 		if (pages_checked != mdsp->mds_npgs) {
1160 			ret = KPHYSM_ENONRELOC;
1161 			break;
1162 		}
1163 	}
1164 
1165 	if (ret == KPHYSM_OK) {
1166 		mhp->mh_phys_pages += phys_pages;
1167 		mhp->mh_vm_pages += vm_pages;
1168 	} else {
1169 		/*
1170 		 * Keep holding the mh_mutex to prevent it going away.
1171 		 */
1172 		delspan_remove(&mhp->mh_transit, base, npgs);
1173 	}
1174 	mutex_exit(&mhp->mh_mutex);
1175 	return (ret);
1176 }
1177 
1178 int
1179 kphysm_del_span_query(
1180 	pfn_t base,
1181 	pgcnt_t npgs,
1182 	memquery_t *mqp)
1183 {
1184 	struct memdelspan *mdsp;
1185 	struct memdelspan *mdsp_new;
1186 	int done_first_nonreloc;
1187 
1188 	mqp->phys_pages = 0;
1189 	mqp->managed = 0;
1190 	mqp->nonrelocatable = 0;
1191 	mqp->first_nonrelocatable = 0;
1192 	mqp->last_nonrelocatable = 0;
1193 
1194 	mdsp_new = span_to_install(base, npgs);
1195 	/*
1196 	 * It is OK to proceed here if mdsp_new == NULL.
1197 	 */
1198 	done_first_nonreloc = 0;
1199 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1200 		pfn_t sbase;
1201 		pgcnt_t snpgs;
1202 
1203 		mqp->phys_pages += mdsp->mds_npgs;
1204 		sbase = mdsp->mds_base;
1205 		snpgs = mdsp->mds_npgs;
1206 		while (snpgs != 0) {
1207 			struct memseg *lseg, *seg;
1208 			pfn_t p_end;
1209 			page_t *pp;
1210 			pfn_t mseg_start;
1211 
1212 			p_end = sbase + snpgs;
1213 			/*
1214 			 * Find the lowest addressed memseg that starts
1215 			 * after sbase and account for it.
1216 			 * This is to catch dynamic memsegs whose start
1217 			 * is hidden.
1218 			 */
1219 			seg = NULL;
1220 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1221 				if ((lseg->pages_base >= sbase) ||
1222 				    (lseg->pages_base < p_end &&
1223 				    lseg->pages_end > sbase)) {
1224 					if (seg == NULL ||
1225 					    seg->pages_base > lseg->pages_base)
1226 						seg = lseg;
1227 				}
1228 			}
1229 			if (seg != NULL) {
1230 				if (!memseg_is_dynamic(seg, &mseg_start)) {
1231 					mseg_start = seg->pages_base;
1232 				}
1233 				/*
1234 				 * Now have the full extent of the memseg so
1235 				 * do the range check.
1236 				 */
1237 				if (mseg_start >= p_end ||
1238 				    seg->pages_end <= sbase) {
1239 					/* Span does not overlap memseg. */
1240 					seg = NULL;
1241 				}
1242 			}
1243 			/*
1244 			 * Account for gap either before the segment if
1245 			 * there is one or to the end of the span.
1246 			 */
1247 			if (seg == NULL || mseg_start > sbase) {
1248 				pfn_t a_end;
1249 
1250 				a_end = (seg == NULL) ? p_end : mseg_start;
1251 				/*
1252 				 * Check with arch layer for relocatability.
1253 				 */
1254 				if (arch_kphysm_del_span_ok(sbase,
1255 				    (a_end - sbase))) {
1256 					/*
1257 					 * No non-relocatble pages in this
1258 					 * area, avoid the fine-grained
1259 					 * test.
1260 					 */
1261 					snpgs -= (a_end - sbase);
1262 					sbase = a_end;
1263 				}
1264 				while (sbase < a_end) {
1265 					if (!arch_kphysm_del_span_ok(sbase,
1266 					    1)) {
1267 						mqp->nonrelocatable++;
1268 						if (!done_first_nonreloc) {
1269 							mqp->
1270 							    first_nonrelocatable
1271 							    = sbase;
1272 							done_first_nonreloc = 1;
1273 						}
1274 						mqp->last_nonrelocatable =
1275 						    sbase;
1276 					}
1277 					sbase++;
1278 					snpgs--;
1279 				}
1280 			}
1281 			if (seg != NULL) {
1282 				ASSERT(mseg_start <= sbase);
1283 				if (seg->pages_base != mseg_start &&
1284 				    seg->pages_base > sbase) {
1285 					pgcnt_t skip_pgs;
1286 
1287 					/*
1288 					 * Skip the page_t area of a
1289 					 * dynamic memseg.
1290 					 */
1291 					skip_pgs = seg->pages_base - sbase;
1292 					if (snpgs <= skip_pgs) {
1293 						sbase += snpgs;
1294 						snpgs = 0;
1295 						continue;
1296 					}
1297 					snpgs -= skip_pgs;
1298 					sbase += skip_pgs;
1299 				}
1300 				ASSERT(snpgs != 0);
1301 				ASSERT(seg->pages_base <= sbase);
1302 				/*
1303 				 * The individual pages can now be checked.
1304 				 */
1305 				for (pp = seg->pages +
1306 				    (sbase - seg->pages_base);
1307 				    snpgs != 0 && pp < seg->epages; pp++) {
1308 					mqp->managed++;
1309 					if (PP_ISNORELOC(pp)) {
1310 						mqp->nonrelocatable++;
1311 						if (!done_first_nonreloc) {
1312 							mqp->
1313 							    first_nonrelocatable
1314 							    = sbase;
1315 							done_first_nonreloc = 1;
1316 						}
1317 						mqp->last_nonrelocatable =
1318 						    sbase;
1319 					}
1320 					sbase++;
1321 					snpgs--;
1322 				}
1323 			}
1324 		}
1325 	}
1326 
1327 	free_delspans(mdsp_new);
1328 
1329 	return (KPHYSM_OK);
1330 }
1331 
1332 /*
1333  * This release function can be called at any stage as follows:
1334  *	_gethandle only called
1335  *	_span(s) only called
1336  *	_start called but failed
1337  *	delete thread exited
1338  */
1339 int
1340 kphysm_del_release(memhandle_t handle)
1341 {
1342 	struct mem_handle *mhp;
1343 
1344 	mhp = kphysm_lookup_mem_handle(handle);
1345 	if (mhp == NULL) {
1346 		return (KPHYSM_EHANDLE);
1347 	}
1348 	switch (mhp->mh_state) {
1349 	case MHND_STARTING:
1350 	case MHND_RUNNING:
1351 		mutex_exit(&mhp->mh_mutex);
1352 		return (KPHYSM_ENOTFINISHED);
1353 	case MHND_FREE:
1354 		ASSERT(mhp->mh_state != MHND_FREE);
1355 		mutex_exit(&mhp->mh_mutex);
1356 		return (KPHYSM_EHANDLE);
1357 	case MHND_INIT:
1358 		break;
1359 	case MHND_DONE:
1360 		break;
1361 	case MHND_RELEASE:
1362 		mutex_exit(&mhp->mh_mutex);
1363 		return (KPHYSM_ESEQUENCE);
1364 	default:
1365 #ifdef DEBUG
1366 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1367 		    (void *)mhp, mhp->mh_state);
1368 #endif /* DEBUG */
1369 		mutex_exit(&mhp->mh_mutex);
1370 		return (KPHYSM_EHANDLE);
1371 	}
1372 	/*
1373 	 * Set state so that we can wait if necessary.
1374 	 * Also this means that we have read/write access to all
1375 	 * fields except mh_exthandle and mh_state.
1376 	 */
1377 	mhp->mh_state = MHND_RELEASE;
1378 	/*
1379 	 * The mem_handle cannot be de-allocated by any other operation
1380 	 * now, so no need to hold mh_mutex.
1381 	 */
1382 	mutex_exit(&mhp->mh_mutex);
1383 
1384 	delspan_remove(&mhp->mh_transit, 0, 0);
1385 	mhp->mh_phys_pages = 0;
1386 	mhp->mh_vm_pages = 0;
1387 	mhp->mh_hold_todo = 0;
1388 	mhp->mh_delete_complete = NULL;
1389 	mhp->mh_delete_complete_arg = NULL;
1390 	mhp->mh_cancel = 0;
1391 
1392 	mutex_enter(&mhp->mh_mutex);
1393 	ASSERT(mhp->mh_state == MHND_RELEASE);
1394 	mhp->mh_state = MHND_FREE;
1395 
1396 	kphysm_free_mem_handle(mhp);
1397 
1398 	return (KPHYSM_OK);
1399 }
1400 
1401 /*
1402  * This cancel function can only be called with the thread running.
1403  */
1404 int
1405 kphysm_del_cancel(memhandle_t handle)
1406 {
1407 	struct mem_handle *mhp;
1408 
1409 	mhp = kphysm_lookup_mem_handle(handle);
1410 	if (mhp == NULL) {
1411 		return (KPHYSM_EHANDLE);
1412 	}
1413 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1414 		mutex_exit(&mhp->mh_mutex);
1415 		return (KPHYSM_ENOTRUNNING);
1416 	}
1417 	/*
1418 	 * Set the cancel flag and wake the delete thread up.
1419 	 * The thread may be waiting on I/O, so the effect of the cancel
1420 	 * may be delayed.
1421 	 */
1422 	if (mhp->mh_cancel == 0) {
1423 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1424 		cv_signal(&mhp->mh_cv);
1425 	}
1426 	mutex_exit(&mhp->mh_mutex);
1427 	return (KPHYSM_OK);
1428 }
1429 
1430 int
1431 kphysm_del_status(
1432 	memhandle_t handle,
1433 	memdelstat_t *mdstp)
1434 {
1435 	struct mem_handle *mhp;
1436 
1437 	mhp = kphysm_lookup_mem_handle(handle);
1438 	if (mhp == NULL) {
1439 		return (KPHYSM_EHANDLE);
1440 	}
1441 	/*
1442 	 * Calling kphysm_del_status() is allowed before the delete
1443 	 * is started to allow for status display.
1444 	 */
1445 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1446 	    mhp->mh_state != MHND_RUNNING) {
1447 		mutex_exit(&mhp->mh_mutex);
1448 		return (KPHYSM_ENOTRUNNING);
1449 	}
1450 	mdstp->phys_pages = mhp->mh_phys_pages;
1451 	mdstp->managed = mhp->mh_vm_pages;
1452 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1453 	mutex_exit(&mhp->mh_mutex);
1454 	return (KPHYSM_OK);
1455 }
1456 
1457 static int mem_delete_additional_pages = 100;
1458 
1459 static int
1460 can_remove_pgs(pgcnt_t npgs)
1461 {
1462 	/*
1463 	 * If all pageable pages were paged out, freemem would
1464 	 * equal availrmem.  There is a minimum requirement for
1465 	 * availrmem.
1466 	 */
1467 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1468 	    < npgs)
1469 		return (0);
1470 	/* TODO: check swap space, etc. */
1471 	return (1);
1472 }
1473 
1474 static int
1475 get_availrmem(pgcnt_t npgs)
1476 {
1477 	int ret;
1478 
1479 	mutex_enter(&freemem_lock);
1480 	ret = can_remove_pgs(npgs);
1481 	if (ret != 0)
1482 		availrmem -= npgs;
1483 	mutex_exit(&freemem_lock);
1484 	return (ret);
1485 }
1486 
1487 static void
1488 put_availrmem(pgcnt_t npgs)
1489 {
1490 	mutex_enter(&freemem_lock);
1491 	availrmem += npgs;
1492 	mutex_exit(&freemem_lock);
1493 }
1494 
1495 #define	FREEMEM_INCR	100
1496 static pgcnt_t freemem_incr = FREEMEM_INCR;
1497 #define	DEL_FREE_WAIT_FRAC	4
1498 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1499 
1500 #define	DEL_BUSY_WAIT_FRAC	20
1501 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1502 
1503 static void kphysm_del_cleanup(struct mem_handle *);
1504 
1505 static void page_delete_collect(page_t *, struct mem_handle *);
1506 
1507 static pgcnt_t
1508 delthr_get_freemem(struct mem_handle *mhp)
1509 {
1510 	pgcnt_t free_get;
1511 	int ret;
1512 
1513 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1514 
1515 	MDSTAT_INCR(mhp, need_free);
1516 	/*
1517 	 * Get up to freemem_incr pages.
1518 	 */
1519 	free_get = freemem_incr;
1520 	if (free_get > mhp->mh_hold_todo)
1521 		free_get = mhp->mh_hold_todo;
1522 	/*
1523 	 * Take free_get pages away from freemem,
1524 	 * waiting if necessary.
1525 	 */
1526 
1527 	while (!mhp->mh_cancel) {
1528 		mutex_exit(&mhp->mh_mutex);
1529 		MDSTAT_INCR(mhp, free_loop);
1530 		/*
1531 		 * Duplicate test from page_create_throttle()
1532 		 * but don't override with !PG_WAIT.
1533 		 */
1534 		if (freemem < (free_get + throttlefree)) {
1535 			MDSTAT_INCR(mhp, free_low);
1536 			ret = 0;
1537 		} else {
1538 			ret = page_create_wait(free_get, 0);
1539 			if (ret == 0) {
1540 				/* EMPTY */
1541 				MDSTAT_INCR(mhp, free_failed);
1542 			}
1543 		}
1544 		if (ret != 0) {
1545 			mutex_enter(&mhp->mh_mutex);
1546 			return (free_get);
1547 		}
1548 
1549 		/*
1550 		 * Put pressure on pageout.
1551 		 */
1552 		page_needfree(free_get);
1553 		cv_signal(&proc_pageout->p_cv);
1554 
1555 		mutex_enter(&mhp->mh_mutex);
1556 		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
1557 		    (lbolt + DEL_FREE_WAIT_TICKS));
1558 		mutex_exit(&mhp->mh_mutex);
1559 		page_needfree(-(spgcnt_t)free_get);
1560 
1561 		mutex_enter(&mhp->mh_mutex);
1562 	}
1563 	return (0);
1564 }
1565 
1566 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1567 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1568 /*
1569  * This function is run as a helper thread for delete_memory_thread.
1570  * It is needed in order to force kaio cleanup, so that pages used in kaio
1571  * will be unlocked and subsequently relocated by delete_memory_thread.
1572  * The address of the delete_memory_threads's mem_handle is passed in to
1573  * this thread function, and is used to set the mh_aio_cleanup_done member
1574  * prior to calling thread_exit().
1575  */
1576 static void
1577 dr_aio_cleanup_thread(caddr_t amhp)
1578 {
1579 	proc_t *procp;
1580 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1581 	int cleaned;
1582 	int n = 0;
1583 	struct mem_handle *mhp;
1584 	volatile uint_t *pcancel;
1585 
1586 	mhp = (struct mem_handle *)amhp;
1587 	ASSERT(mhp != NULL);
1588 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1589 	if (modload("sys", "kaio") == -1) {
1590 		mhp->mh_aio_cleanup_done = 1;
1591 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1592 		thread_exit();
1593 	}
1594 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1595 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1596 	if (aio_cleanup_dr_delete_memory == NULL) {
1597 		mhp->mh_aio_cleanup_done = 1;
1598 		cmn_err(CE_WARN,
1599 	    "aio_cleanup_dr_delete_memory not found in kaio");
1600 		thread_exit();
1601 	}
1602 	do {
1603 		cleaned = 0;
1604 		mutex_enter(&pidlock);
1605 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1606 		    procp = procp->p_next) {
1607 			mutex_enter(&procp->p_lock);
1608 			if (procp->p_aio != NULL) {
1609 				/* cleanup proc's outstanding kaio */
1610 				cleaned +=
1611 				    (*aio_cleanup_dr_delete_memory)(procp);
1612 			}
1613 			mutex_exit(&procp->p_lock);
1614 		}
1615 		mutex_exit(&pidlock);
1616 		if ((*pcancel == 0) &&
1617 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1618 			/* delay a bit before retrying all procs again */
1619 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1620 			n = 0;
1621 		}
1622 	} while (*pcancel == 0);
1623 	mhp->mh_aio_cleanup_done = 1;
1624 	thread_exit();
1625 }
1626 
1627 static void
1628 delete_memory_thread(caddr_t amhp)
1629 {
1630 	struct mem_handle *mhp;
1631 	struct memdelspan *mdsp;
1632 	callb_cpr_t cprinfo;
1633 	page_t *pp_targ;
1634 	spgcnt_t freemem_left;
1635 	void (*del_complete_funcp)(void *, int error);
1636 	void *del_complete_arg;
1637 	int comp_code;
1638 	int ret;
1639 	int first_scan;
1640 	uint_t szc;
1641 #ifdef MEM_DEL_STATS
1642 	uint64_t start_total, ntick_total;
1643 	uint64_t start_pgrp, ntick_pgrp;
1644 #endif /* MEM_DEL_STATS */
1645 
1646 	mhp = (struct mem_handle *)amhp;
1647 
1648 #ifdef MEM_DEL_STATS
1649 	start_total = ddi_get_lbolt();
1650 #endif /* MEM_DEL_STATS */
1651 
1652 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1653 	    callb_generic_cpr, "memdel");
1654 
1655 	mutex_enter(&mhp->mh_mutex);
1656 	ASSERT(mhp->mh_state == MHND_STARTING);
1657 
1658 	mhp->mh_state = MHND_RUNNING;
1659 	mhp->mh_thread_id = curthread;
1660 
1661 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1662 	mutex_exit(&mhp->mh_mutex);
1663 
1664 	/* Allocate the remap pages now, if necessary. */
1665 	memseg_remap_init();
1666 
1667 	/*
1668 	 * Subtract from availrmem now if possible as availrmem
1669 	 * may not be available by the end of the delete.
1670 	 */
1671 	if (!get_availrmem(mhp->mh_vm_pages)) {
1672 		comp_code = KPHYSM_ENOTVIABLE;
1673 		mutex_enter(&mhp->mh_mutex);
1674 		goto early_exit;
1675 	}
1676 
1677 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1678 
1679 	mutex_enter(&mhp->mh_mutex);
1680 
1681 	if (ret != 0) {
1682 		mhp->mh_cancel = KPHYSM_EREFUSED;
1683 		goto refused;
1684 	}
1685 
1686 	transit_list_collect(mhp, 1);
1687 
1688 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1689 	    mdsp = mdsp->mds_next) {
1690 		ASSERT(mdsp->mds_bitmap == NULL);
1691 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1692 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1693 		    KM_SLEEP);
1694 	}
1695 
1696 	first_scan = 1;
1697 	freemem_left = 0;
1698 	/*
1699 	 * Start dr_aio_cleanup_thread, which periodically iterates
1700 	 * through the process list and invokes aio cleanup.  This
1701 	 * is needed in order to avoid a deadly embrace between the
1702 	 * delete_memory_thread (waiting on writer lock for page, with the
1703 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1704 	 * reader lock on the same page that is wanted by the
1705 	 * delete_memory_thread), and threads waiting for kaio completion
1706 	 * (blocked on spt_amp->lock).
1707 	 */
1708 	mhp->mh_dr_aio_cleanup_cancel = 0;
1709 	mhp->mh_aio_cleanup_done = 0;
1710 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1711 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1712 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1713 		pgcnt_t collected;
1714 
1715 		MDSTAT_INCR(mhp, nloop);
1716 		collected = 0;
1717 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1718 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1719 			pfn_t pfn, p_end;
1720 
1721 			if (first_scan) {
1722 				mem_node_pre_del_slice(mdsp->mds_base,
1723 				    mdsp->mds_base + mdsp->mds_npgs - 1);
1724 			}
1725 
1726 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1727 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1728 			    (mhp->mh_cancel == 0); pfn++) {
1729 				page_t *pp, *tpp, *tpp_targ;
1730 				pgcnt_t bit;
1731 				struct vnode *vp;
1732 				u_offset_t offset;
1733 				int mod, result;
1734 				spgcnt_t pgcnt;
1735 
1736 				bit = pfn - mdsp->mds_base;
1737 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1738 				    (1 << (bit % NBPBMW))) != 0) {
1739 					MDSTAT_INCR(mhp, already_done);
1740 					continue;
1741 				}
1742 				if (freemem_left == 0) {
1743 					freemem_left += delthr_get_freemem(mhp);
1744 					if (freemem_left == 0)
1745 						break;
1746 				}
1747 
1748 				/*
1749 				 * Release mh_mutex - some of this
1750 				 * stuff takes some time (eg PUTPAGE).
1751 				 */
1752 
1753 				mutex_exit(&mhp->mh_mutex);
1754 				MDSTAT_INCR(mhp, ncheck);
1755 
1756 				pp = page_numtopp_nolock(pfn);
1757 				if (pp == NULL) {
1758 					/*
1759 					 * Not covered by a page_t - will
1760 					 * be dealt with elsewhere.
1761 					 */
1762 					MDSTAT_INCR(mhp, nopaget);
1763 					mutex_enter(&mhp->mh_mutex);
1764 					mdsp->mds_bitmap[bit / NBPBMW] |=
1765 					    (1 << (bit % NBPBMW));
1766 					continue;
1767 				}
1768 
1769 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1770 				    SE_EXCL_WANTED | SE_RETIRED)) {
1771 					/*
1772 					 * Page in use elsewhere.  Skip it.
1773 					 */
1774 					MDSTAT_INCR(mhp, lockfail);
1775 					mutex_enter(&mhp->mh_mutex);
1776 					continue;
1777 				}
1778 				/*
1779 				 * See if the cage expanded into the delete.
1780 				 * This can happen as we have to allow the
1781 				 * cage to expand.
1782 				 */
1783 				if (PP_ISNORELOC(pp)) {
1784 					page_unlock(pp);
1785 					mutex_enter(&mhp->mh_mutex);
1786 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1787 					break;
1788 				}
1789 				if (PP_RETIRED(pp)) {
1790 					/*
1791 					 * Page has been retired and is
1792 					 * not part of the cage so we
1793 					 * can now do the accounting for
1794 					 * it.
1795 					 */
1796 					MDSTAT_INCR(mhp, retired);
1797 					mutex_enter(&mhp->mh_mutex);
1798 					mdsp->mds_bitmap[bit / NBPBMW]
1799 					    |= (1 << (bit % NBPBMW));
1800 					mdsp->mds_bitmap_retired[bit /
1801 					    NBPBMW] |=
1802 					    (1 << (bit % NBPBMW));
1803 					mhp->mh_hold_todo--;
1804 					continue;
1805 				}
1806 				ASSERT(freemem_left != 0);
1807 				if (PP_ISFREE(pp)) {
1808 					/*
1809 					 * Like page_reclaim() only 'freemem'
1810 					 * processing is already done.
1811 					 */
1812 					MDSTAT_INCR(mhp, nfree);
1813 				free_page_collect:
1814 					if (PP_ISAGED(pp)) {
1815 						page_list_sub(pp,
1816 						    PG_FREE_LIST);
1817 					} else {
1818 						page_list_sub(pp,
1819 						    PG_CACHE_LIST);
1820 					}
1821 					PP_CLRFREE(pp);
1822 					PP_CLRAGED(pp);
1823 					collected++;
1824 					mutex_enter(&mhp->mh_mutex);
1825 					page_delete_collect(pp, mhp);
1826 					mdsp->mds_bitmap[bit / NBPBMW] |=
1827 					    (1 << (bit % NBPBMW));
1828 					freemem_left--;
1829 					continue;
1830 				}
1831 				ASSERT(pp->p_vnode != NULL);
1832 				if (first_scan) {
1833 					MDSTAT_INCR(mhp, first_notfree);
1834 					page_unlock(pp);
1835 					mutex_enter(&mhp->mh_mutex);
1836 					continue;
1837 				}
1838 				/*
1839 				 * Keep stats on pages encountered that
1840 				 * are marked for retirement.
1841 				 */
1842 				if (PP_TOXIC(pp)) {
1843 					MDSTAT_INCR(mhp, toxic);
1844 				} else if (PP_PR_REQ(pp)) {
1845 					MDSTAT_INCR(mhp, failing);
1846 				}
1847 				/*
1848 				 * In certain cases below, special exceptions
1849 				 * are made for pages that are toxic.  This
1850 				 * is because the current meaning of toxic
1851 				 * is that an uncorrectable error has been
1852 				 * previously associated with the page.
1853 				 */
1854 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1855 					if (!PP_TOXIC(pp)) {
1856 						/*
1857 						 * Must relocate locked in
1858 						 * memory pages.
1859 						 */
1860 #ifdef MEM_DEL_STATS
1861 						start_pgrp = ddi_get_lbolt();
1862 #endif /* MEM_DEL_STATS */
1863 						/*
1864 						 * Lock all constituent pages
1865 						 * of a large page to ensure
1866 						 * that p_szc won't change.
1867 						 */
1868 						if (!group_page_trylock(pp,
1869 						    SE_EXCL)) {
1870 							MDSTAT_INCR(mhp,
1871 							    gptllckfail);
1872 							page_unlock(pp);
1873 							mutex_enter(
1874 							    &mhp->mh_mutex);
1875 							continue;
1876 						}
1877 						MDSTAT_INCR(mhp, npplocked);
1878 						pp_targ =
1879 						    page_get_replacement_page(
1880 						    pp, NULL, 0);
1881 						if (pp_targ != NULL) {
1882 #ifdef MEM_DEL_STATS
1883 							ntick_pgrp =
1884 							    (uint64_t)
1885 							    ddi_get_lbolt() -
1886 							    start_pgrp;
1887 #endif /* MEM_DEL_STATS */
1888 							MDSTAT_PGRP(mhp,
1889 							    ntick_pgrp);
1890 							MDSTAT_INCR(mhp,
1891 							    nlockreloc);
1892 							goto reloc;
1893 						}
1894 						group_page_unlock(pp);
1895 						page_unlock(pp);
1896 #ifdef MEM_DEL_STATS
1897 						ntick_pgrp =
1898 						    (uint64_t)ddi_get_lbolt() -
1899 						    start_pgrp;
1900 #endif /* MEM_DEL_STATS */
1901 						MDSTAT_PGRP(mhp, ntick_pgrp);
1902 						MDSTAT_INCR(mhp, nnorepl);
1903 						mutex_enter(&mhp->mh_mutex);
1904 						continue;
1905 					} else {
1906 						/*
1907 						 * Cannot do anything about
1908 						 * this page because it is
1909 						 * toxic.
1910 						 */
1911 						MDSTAT_INCR(mhp, npplkdtoxic);
1912 						page_unlock(pp);
1913 						mutex_enter(&mhp->mh_mutex);
1914 						continue;
1915 					}
1916 				}
1917 				/*
1918 				 * Unload the mappings and check if mod bit
1919 				 * is set.
1920 				 */
1921 				ASSERT(!PP_ISKAS(pp));
1922 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1923 				mod = hat_ismod(pp);
1924 
1925 #ifdef MEM_DEL_STATS
1926 				start_pgrp = ddi_get_lbolt();
1927 #endif /* MEM_DEL_STATS */
1928 				if (mod && !PP_TOXIC(pp)) {
1929 					/*
1930 					 * Lock all constituent pages
1931 					 * of a large page to ensure
1932 					 * that p_szc won't change.
1933 					 */
1934 					if (!group_page_trylock(pp, SE_EXCL)) {
1935 						MDSTAT_INCR(mhp, gptlmodfail);
1936 						page_unlock(pp);
1937 						mutex_enter(&mhp->mh_mutex);
1938 						continue;
1939 					}
1940 					pp_targ = page_get_replacement_page(pp,
1941 					    NULL, 0);
1942 					if (pp_targ != NULL) {
1943 						MDSTAT_INCR(mhp, nmodreloc);
1944 #ifdef MEM_DEL_STATS
1945 						ntick_pgrp =
1946 						    (uint64_t)ddi_get_lbolt() -
1947 						    start_pgrp;
1948 #endif /* MEM_DEL_STATS */
1949 						MDSTAT_PGRP(mhp, ntick_pgrp);
1950 						goto reloc;
1951 					}
1952 					group_page_unlock(pp);
1953 				}
1954 
1955 				if (!page_try_demote_pages(pp)) {
1956 					MDSTAT_INCR(mhp, demotefail);
1957 					page_unlock(pp);
1958 #ifdef MEM_DEL_STATS
1959 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1960 					    start_pgrp;
1961 #endif /* MEM_DEL_STATS */
1962 					MDSTAT_PGRP(mhp, ntick_pgrp);
1963 					mutex_enter(&mhp->mh_mutex);
1964 					continue;
1965 				}
1966 
1967 				/*
1968 				 * Regular 'page-out'.
1969 				 */
1970 				if (!mod) {
1971 					MDSTAT_INCR(mhp, ndestroy);
1972 					page_destroy(pp, 1);
1973 					/*
1974 					 * page_destroy was called with
1975 					 * dontfree. As long as p_lckcnt
1976 					 * and p_cowcnt are both zero, the
1977 					 * only additional action of
1978 					 * page_destroy with !dontfree is to
1979 					 * call page_free, so we can collect
1980 					 * the page here.
1981 					 */
1982 					collected++;
1983 #ifdef MEM_DEL_STATS
1984 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1985 					    start_pgrp;
1986 #endif /* MEM_DEL_STATS */
1987 					MDSTAT_PGRP(mhp, ntick_pgrp);
1988 					mutex_enter(&mhp->mh_mutex);
1989 					page_delete_collect(pp, mhp);
1990 					mdsp->mds_bitmap[bit / NBPBMW] |=
1991 					    (1 << (bit % NBPBMW));
1992 					continue;
1993 				}
1994 				/*
1995 				 * The page is toxic and the mod bit is
1996 				 * set, we cannot do anything here to deal
1997 				 * with it.
1998 				 */
1999 				if (PP_TOXIC(pp)) {
2000 					page_unlock(pp);
2001 #ifdef MEM_DEL_STATS
2002 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2003 					    start_pgrp;
2004 #endif /* MEM_DEL_STATS */
2005 					MDSTAT_PGRP(mhp, ntick_pgrp);
2006 					MDSTAT_INCR(mhp, modtoxic);
2007 					mutex_enter(&mhp->mh_mutex);
2008 					continue;
2009 				}
2010 				MDSTAT_INCR(mhp, nputpage);
2011 				vp = pp->p_vnode;
2012 				offset = pp->p_offset;
2013 				VN_HOLD(vp);
2014 				page_unlock(pp);
2015 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2016 				    B_INVAL|B_FORCE, kcred, NULL);
2017 				VN_RELE(vp);
2018 #ifdef MEM_DEL_STATS
2019 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2020 				    start_pgrp;
2021 #endif /* MEM_DEL_STATS */
2022 				MDSTAT_PGRP(mhp, ntick_pgrp);
2023 				/*
2024 				 * Try to get the page back immediately
2025 				 * so that it can be collected.
2026 				 */
2027 				pp = page_numtopp_nolock(pfn);
2028 				if (pp == NULL) {
2029 					MDSTAT_INCR(mhp, nnoreclaim);
2030 					/*
2031 					 * This should not happen as this
2032 					 * thread is deleting the page.
2033 					 * If this code is generalized, this
2034 					 * becomes a reality.
2035 					 */
2036 #ifdef DEBUG
2037 					cmn_err(CE_WARN,
2038 					    "delete_memory_thread(0x%p) "
2039 					    "pfn 0x%lx has no page_t",
2040 					    (void *)mhp, pfn);
2041 #endif /* DEBUG */
2042 					mutex_enter(&mhp->mh_mutex);
2043 					continue;
2044 				}
2045 				if (page_try_reclaim_lock(pp, SE_EXCL,
2046 				    SE_EXCL_WANTED | SE_RETIRED)) {
2047 					if (PP_ISFREE(pp)) {
2048 						goto free_page_collect;
2049 					}
2050 					page_unlock(pp);
2051 				}
2052 				MDSTAT_INCR(mhp, nnoreclaim);
2053 				mutex_enter(&mhp->mh_mutex);
2054 				continue;
2055 
2056 			reloc:
2057 				/*
2058 				 * Got some freemem and a target
2059 				 * page, so move the data to avoid
2060 				 * I/O and lock problems.
2061 				 */
2062 				ASSERT(!page_iolock_assert(pp));
2063 				MDSTAT_INCR(mhp, nreloc);
2064 				/*
2065 				 * page_relocate() will return pgcnt: the
2066 				 * number of consecutive pages relocated.
2067 				 * If it is successful, pp will be a
2068 				 * linked list of the page structs that
2069 				 * were relocated. If page_relocate() is
2070 				 * unsuccessful, pp will be unmodified.
2071 				 */
2072 #ifdef MEM_DEL_STATS
2073 				start_pgrp = ddi_get_lbolt();
2074 #endif /* MEM_DEL_STATS */
2075 				result = page_relocate(&pp, &pp_targ, 0, 0,
2076 				    &pgcnt, NULL);
2077 #ifdef MEM_DEL_STATS
2078 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2079 				    start_pgrp;
2080 #endif /* MEM_DEL_STATS */
2081 				MDSTAT_PGRP(mhp, ntick_pgrp);
2082 				if (result != 0) {
2083 					MDSTAT_INCR(mhp, nrelocfail);
2084 					/*
2085 					 * We did not succeed. We need
2086 					 * to give the pp_targ pages back.
2087 					 * page_free(pp_targ, 1) without
2088 					 * the freemem accounting.
2089 					 */
2090 					group_page_unlock(pp);
2091 					page_free_replacement_page(pp_targ);
2092 					page_unlock(pp);
2093 					mutex_enter(&mhp->mh_mutex);
2094 					continue;
2095 				}
2096 
2097 				/*
2098 				 * We will then collect pgcnt pages.
2099 				 */
2100 				ASSERT(pgcnt > 0);
2101 				mutex_enter(&mhp->mh_mutex);
2102 				/*
2103 				 * We need to make sure freemem_left is
2104 				 * large enough.
2105 				 */
2106 				while ((freemem_left < pgcnt) &&
2107 				    (!mhp->mh_cancel)) {
2108 					freemem_left +=
2109 					    delthr_get_freemem(mhp);
2110 				}
2111 
2112 				/*
2113 				 * Do not proceed if mh_cancel is set.
2114 				 */
2115 				if (mhp->mh_cancel) {
2116 					while (pp_targ != NULL) {
2117 						/*
2118 						 * Unlink and unlock each page.
2119 						 */
2120 						tpp_targ = pp_targ;
2121 						page_sub(&pp_targ, tpp_targ);
2122 						page_unlock(tpp_targ);
2123 					}
2124 					/*
2125 					 * We need to give the pp pages back.
2126 					 * page_free(pp, 1) without the
2127 					 * freemem accounting.
2128 					 */
2129 					page_free_replacement_page(pp);
2130 					break;
2131 				}
2132 
2133 				/* Now remove pgcnt from freemem_left */
2134 				freemem_left -= pgcnt;
2135 				ASSERT(freemem_left >= 0);
2136 				szc = pp->p_szc;
2137 				while (pp != NULL) {
2138 					/*
2139 					 * pp and pp_targ were passed back as
2140 					 * a linked list of pages.
2141 					 * Unlink and unlock each page.
2142 					 */
2143 					tpp_targ = pp_targ;
2144 					page_sub(&pp_targ, tpp_targ);
2145 					page_unlock(tpp_targ);
2146 					/*
2147 					 * The original page is now free
2148 					 * so remove it from the linked
2149 					 * list and collect it.
2150 					 */
2151 					tpp = pp;
2152 					page_sub(&pp, tpp);
2153 					pfn = page_pptonum(tpp);
2154 					collected++;
2155 					ASSERT(PAGE_EXCL(tpp));
2156 					ASSERT(tpp->p_vnode == NULL);
2157 					ASSERT(!hat_page_is_mapped(tpp));
2158 					ASSERT(tpp->p_szc == szc);
2159 					tpp->p_szc = 0;
2160 					page_delete_collect(tpp, mhp);
2161 					bit = pfn - mdsp->mds_base;
2162 					mdsp->mds_bitmap[bit / NBPBMW] |=
2163 					    (1 << (bit % NBPBMW));
2164 				}
2165 				ASSERT(pp_targ == NULL);
2166 			}
2167 		}
2168 		first_scan = 0;
2169 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2170 		    (collected == 0)) {
2171 			/*
2172 			 * This code is needed as we cannot wait
2173 			 * for a page to be locked OR the delete to
2174 			 * be cancelled.  Also, we must delay so
2175 			 * that other threads get a chance to run
2176 			 * on our cpu, otherwise page locks may be
2177 			 * held indefinitely by those threads.
2178 			 */
2179 			MDSTAT_INCR(mhp, ndelay);
2180 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2181 			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
2182 			    (lbolt + DEL_BUSY_WAIT_TICKS));
2183 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2184 		}
2185 	}
2186 	/* stop the dr aio cleanup thread */
2187 	mhp->mh_dr_aio_cleanup_cancel = 1;
2188 	transit_list_collect(mhp, 0);
2189 	if (freemem_left != 0) {
2190 		/* Return any surplus. */
2191 		page_create_putback(freemem_left);
2192 		freemem_left = 0;
2193 	}
2194 #ifdef MEM_DEL_STATS
2195 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2196 #endif /* MEM_DEL_STATS */
2197 	MDSTAT_TOTAL(mhp, ntick_total);
2198 	MDSTAT_PRINT(mhp);
2199 
2200 	/*
2201 	 * If the memory delete was cancelled, exclusive-wanted bits must
2202 	 * be cleared. If there are retired pages being deleted, they need
2203 	 * to be unretired.
2204 	 */
2205 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2206 	    mdsp = mdsp->mds_next) {
2207 		pfn_t pfn, p_end;
2208 
2209 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2210 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2211 			page_t *pp;
2212 			pgcnt_t bit;
2213 
2214 			bit = pfn - mdsp->mds_base;
2215 			if (mhp->mh_cancel) {
2216 				pp = page_numtopp_nolock(pfn);
2217 				if (pp != NULL) {
2218 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2219 					    (1 << (bit % NBPBMW))) == 0) {
2220 						page_lock_clr_exclwanted(pp);
2221 					}
2222 				}
2223 			} else {
2224 				pp = NULL;
2225 			}
2226 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2227 			    (1 << (bit % NBPBMW))) != 0) {
2228 				/* do we already have pp? */
2229 				if (pp == NULL) {
2230 					pp = page_numtopp_nolock(pfn);
2231 				}
2232 				ASSERT(pp != NULL);
2233 				ASSERT(PP_RETIRED(pp));
2234 				if (mhp->mh_cancel != 0) {
2235 					page_unlock(pp);
2236 					/*
2237 					 * To satisfy ASSERT below in
2238 					 * cancel code.
2239 					 */
2240 					mhp->mh_hold_todo++;
2241 				} else {
2242 					(void) page_unretire_pp(pp,
2243 					    PR_UNR_CLEAN);
2244 				}
2245 			}
2246 		}
2247 	}
2248 	/*
2249 	 * Free retired page bitmap and collected page bitmap
2250 	 */
2251 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2252 	    mdsp = mdsp->mds_next) {
2253 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2254 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2255 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2256 		ASSERT(mdsp->mds_bitmap != NULL);
2257 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2258 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2259 	}
2260 
2261 	/* wait for our dr aio cancel thread to exit */
2262 	while (!(mhp->mh_aio_cleanup_done)) {
2263 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2264 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2265 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2266 	}
2267 refused:
2268 	if (mhp->mh_cancel != 0) {
2269 		page_t *pp;
2270 
2271 		comp_code = mhp->mh_cancel;
2272 		/*
2273 		 * Go through list of deleted pages (mh_deleted) freeing
2274 		 * them.
2275 		 */
2276 		while ((pp = mhp->mh_deleted) != NULL) {
2277 			mhp->mh_deleted = pp->p_next;
2278 			mhp->mh_hold_todo++;
2279 			mutex_exit(&mhp->mh_mutex);
2280 			/* Restore p_next. */
2281 			pp->p_next = pp->p_prev;
2282 			if (PP_ISFREE(pp)) {
2283 				cmn_err(CE_PANIC,
2284 				    "page %p is free",
2285 				    (void *)pp);
2286 			}
2287 			page_free(pp, 1);
2288 			mutex_enter(&mhp->mh_mutex);
2289 		}
2290 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2291 
2292 		mutex_exit(&mhp->mh_mutex);
2293 		put_availrmem(mhp->mh_vm_pages);
2294 		mutex_enter(&mhp->mh_mutex);
2295 
2296 		goto t_exit;
2297 	}
2298 
2299 	/*
2300 	 * All the pages are no longer in use and are exclusively locked.
2301 	 */
2302 
2303 	mhp->mh_deleted = NULL;
2304 
2305 	kphysm_del_cleanup(mhp);
2306 
2307 	/*
2308 	 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so
2309 	 * that the mem_node_config[] will remain intact for the cleanup.
2310 	 */
2311 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2312 	    mdsp = mdsp->mds_next) {
2313 		mem_node_post_del_slice(mdsp->mds_base,
2314 		    mdsp->mds_base + mdsp->mds_npgs - 1, 0);
2315 	}
2316 
2317 	comp_code = KPHYSM_OK;
2318 
2319 t_exit:
2320 	mutex_exit(&mhp->mh_mutex);
2321 	kphysm_setup_post_del(mhp->mh_vm_pages,
2322 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2323 	mutex_enter(&mhp->mh_mutex);
2324 
2325 early_exit:
2326 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2327 	mhp->mh_state = MHND_DONE;
2328 	del_complete_funcp = mhp->mh_delete_complete;
2329 	del_complete_arg = mhp->mh_delete_complete_arg;
2330 	CALLB_CPR_EXIT(&cprinfo);
2331 	(*del_complete_funcp)(del_complete_arg, comp_code);
2332 	thread_exit();
2333 	/*NOTREACHED*/
2334 }
2335 
2336 /*
2337  * Start the delete of the memory from the system.
2338  */
2339 int
2340 kphysm_del_start(
2341 	memhandle_t handle,
2342 	void (*complete)(void *, int),
2343 	void *complete_arg)
2344 {
2345 	struct mem_handle *mhp;
2346 
2347 	mhp = kphysm_lookup_mem_handle(handle);
2348 	if (mhp == NULL) {
2349 		return (KPHYSM_EHANDLE);
2350 	}
2351 	switch (mhp->mh_state) {
2352 	case MHND_FREE:
2353 		ASSERT(mhp->mh_state != MHND_FREE);
2354 		mutex_exit(&mhp->mh_mutex);
2355 		return (KPHYSM_EHANDLE);
2356 	case MHND_INIT:
2357 		break;
2358 	case MHND_STARTING:
2359 	case MHND_RUNNING:
2360 		mutex_exit(&mhp->mh_mutex);
2361 		return (KPHYSM_ESEQUENCE);
2362 	case MHND_DONE:
2363 		mutex_exit(&mhp->mh_mutex);
2364 		return (KPHYSM_ESEQUENCE);
2365 	case MHND_RELEASE:
2366 		mutex_exit(&mhp->mh_mutex);
2367 		return (KPHYSM_ESEQUENCE);
2368 	default:
2369 #ifdef DEBUG
2370 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2371 		    (void *)mhp, mhp->mh_state);
2372 #endif /* DEBUG */
2373 		mutex_exit(&mhp->mh_mutex);
2374 		return (KPHYSM_EHANDLE);
2375 	}
2376 
2377 	if (mhp->mh_transit.trl_spans == NULL) {
2378 		mutex_exit(&mhp->mh_mutex);
2379 		return (KPHYSM_ENOWORK);
2380 	}
2381 
2382 	ASSERT(complete != NULL);
2383 	mhp->mh_delete_complete = complete;
2384 	mhp->mh_delete_complete_arg = complete_arg;
2385 	mhp->mh_state = MHND_STARTING;
2386 	/*
2387 	 * Release the mutex in case thread_create sleeps.
2388 	 */
2389 	mutex_exit(&mhp->mh_mutex);
2390 
2391 	/*
2392 	 * The "obvious" process for this thread is pageout (proc_pageout)
2393 	 * but this gives the thread too much power over freemem
2394 	 * which results in freemem starvation.
2395 	 */
2396 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2397 	    TS_RUN, maxclsyspri - 1);
2398 
2399 	return (KPHYSM_OK);
2400 }
2401 
2402 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
2403 static caddr_t pp_dummy;
2404 static pgcnt_t pp_dummy_npages;
2405 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2406 
2407 static void
2408 memseg_remap_init_pages(page_t *pages, page_t *epages)
2409 {
2410 	page_t *pp;
2411 
2412 	for (pp = pages; pp < epages; pp++) {
2413 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2414 		pp->p_offset = (u_offset_t)-1;
2415 		page_iolock_init(pp);
2416 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2417 			continue;
2418 		page_lock_delete(pp);
2419 	}
2420 }
2421 
2422 void
2423 memseg_remap_init()
2424 {
2425 	mutex_enter(&pp_dummy_lock);
2426 	if (pp_dummy == NULL) {
2427 		uint_t dpages;
2428 		int i;
2429 
2430 		/*
2431 		 * dpages starts off as the size of the structure and
2432 		 * ends up as the minimum number of pages that will
2433 		 * hold a whole number of page_t structures.
2434 		 */
2435 		dpages = sizeof (page_t);
2436 		ASSERT(dpages != 0);
2437 		ASSERT(dpages <= MMU_PAGESIZE);
2438 
2439 		while ((dpages & 1) == 0)
2440 			dpages >>= 1;
2441 
2442 		pp_dummy_npages = dpages;
2443 		/*
2444 		 * Allocate pp_dummy pages directly from static_arena,
2445 		 * since these are whole page allocations and are
2446 		 * referenced by physical address.  This also has the
2447 		 * nice fringe benefit of hiding the memory from
2448 		 * ::findleaks since it doesn't deal well with allocated
2449 		 * kernel heap memory that doesn't have any mappings.
2450 		 */
2451 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2452 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2453 		bzero(pp_dummy, ptob(pp_dummy_npages));
2454 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2455 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2456 		    pp_dummy_npages, KM_SLEEP);
2457 		for (i = 0; i < pp_dummy_npages; i++) {
2458 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2459 			    &pp_dummy[MMU_PAGESIZE * i]);
2460 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2461 		}
2462 		/*
2463 		 * Initialize the page_t's to a known 'deleted' state
2464 		 * that matches the state of deleted pages.
2465 		 */
2466 		memseg_remap_init_pages((page_t *)pp_dummy,
2467 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2468 		/* Remove kmem mappings for the pages for safety. */
2469 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2470 		    HAT_UNLOAD_UNLOCK);
2471 		/* Leave pp_dummy pointer set as flag that init is done. */
2472 	}
2473 	mutex_exit(&pp_dummy_lock);
2474 }
2475 
2476 static void
2477 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
2478 {
2479 	ASSERT(pp_dummy != NULL);
2480 
2481 	while (metapgs != 0) {
2482 		pgcnt_t n;
2483 		int i;
2484 
2485 		n = pp_dummy_npages;
2486 		if (n > metapgs)
2487 			n = metapgs;
2488 		for (i = 0; i < n; i++) {
2489 			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
2490 			    PROT_READ,
2491 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2492 			    HAT_LOAD_REMAP);
2493 			pp += ptob(1);
2494 		}
2495 		metapgs -= n;
2496 	}
2497 }
2498 
2499 /*
2500  * Transition all the deleted pages to the deleted state so that
2501  * page_lock will not wait. The page_lock_delete call will
2502  * also wake up any waiters.
2503  */
2504 static void
2505 memseg_lock_delete_all(struct memseg *seg)
2506 {
2507 	page_t *pp;
2508 
2509 	for (pp = seg->pages; pp < seg->epages; pp++) {
2510 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2511 		page_lock_delete(pp);
2512 	}
2513 }
2514 
2515 static void
2516 kphysm_del_cleanup(struct mem_handle *mhp)
2517 {
2518 	struct memdelspan	*mdsp;
2519 	struct memseg		*seg;
2520 	struct memseg   	**segpp;
2521 	struct memseg		*seglist;
2522 	pfn_t			p_end;
2523 	uint64_t		avmem;
2524 	pgcnt_t			avpgs;
2525 	pgcnt_t			npgs;
2526 
2527 	avpgs = mhp->mh_vm_pages;
2528 
2529 	memsegs_lock(1);
2530 
2531 	/*
2532 	 * remove from main segment list.
2533 	 */
2534 	npgs = 0;
2535 	seglist = NULL;
2536 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2537 	    mdsp = mdsp->mds_next) {
2538 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2539 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2540 			if (seg->pages_base >= p_end ||
2541 			    seg->pages_end <= mdsp->mds_base) {
2542 				/* Span and memseg don't overlap. */
2543 				segpp = &((*segpp)->next);
2544 				continue;
2545 			}
2546 			ASSERT(seg->pages_base >= mdsp->mds_base);
2547 			ASSERT(seg->pages_end <= p_end);
2548 
2549 			PLCNT_MODIFY_MAX(seg->pages_base,
2550 			    seg->pages_base - seg->pages_end);
2551 
2552 			/* Hide the memseg from future scans. */
2553 			hat_kpm_delmem_mseg_update(seg, segpp);
2554 			*segpp = seg->next;
2555 			membar_producer();	/* TODO: Needed? */
2556 			npgs += MSEG_NPAGES(seg);
2557 
2558 			/*
2559 			 * Leave the deleted segment's next pointer intact
2560 			 * in case a memsegs scanning loop is walking this
2561 			 * segment concurrently.
2562 			 */
2563 			seg->lnext = seglist;
2564 			seglist = seg;
2565 		}
2566 	}
2567 
2568 	build_pfn_hash();
2569 
2570 	ASSERT(npgs < total_pages);
2571 	total_pages -= npgs;
2572 
2573 	/*
2574 	 * Recalculate the paging parameters now total_pages has changed.
2575 	 * This will also cause the clock hands to be reset before next use.
2576 	 */
2577 	setupclock(1);
2578 
2579 	memsegs_unlock(1);
2580 
2581 	mutex_exit(&mhp->mh_mutex);
2582 
2583 	while ((seg = seglist) != NULL) {
2584 		pfn_t mseg_start;
2585 		pfn_t mseg_base, mseg_end;
2586 		pgcnt_t mseg_npgs;
2587 		page_t *pp;
2588 		pgcnt_t metapgs;
2589 		int dynamic;
2590 		int mlret;
2591 
2592 		seglist = seg->lnext;
2593 
2594 		/*
2595 		 * Put the page_t's into the deleted state to stop
2596 		 * cv_wait()s on the pages. When we remap, the dummy
2597 		 * page_t's will be in the same state.
2598 		 */
2599 		memseg_lock_delete_all(seg);
2600 		/*
2601 		 * Collect up information based on pages_base and pages_end
2602 		 * early so that we can flag early that the memseg has been
2603 		 * deleted by setting pages_end == pages_base.
2604 		 */
2605 		mseg_base = seg->pages_base;
2606 		mseg_end = seg->pages_end;
2607 		mseg_npgs = MSEG_NPAGES(seg);
2608 		dynamic = memseg_is_dynamic(seg, &mseg_start);
2609 
2610 		seg->pages_end = seg->pages_base;
2611 
2612 		if (dynamic) {
2613 			pp = seg->pages;
2614 			metapgs = mseg_base - mseg_start;
2615 			ASSERT(metapgs != 0);
2616 
2617 			/* Remap the meta data to our special dummy area. */
2618 			memseg_remap_to_dummy((caddr_t)pp, metapgs);
2619 
2620 			mutex_enter(&memseg_lists_lock);
2621 			seg->lnext = memseg_va_avail;
2622 			memseg_va_avail = seg;
2623 			mutex_exit(&memseg_lists_lock);
2624 		} else {
2625 			/*
2626 			 * Set for clean-up below.
2627 			 */
2628 			mseg_start = seg->pages_base;
2629 			/*
2630 			 * For memory whose page_ts were allocated
2631 			 * at boot, we need to find a new use for
2632 			 * the page_t memory.
2633 			 * For the moment, just leak it.
2634 			 * (It is held in the memseg_delete_junk list.)
2635 			 */
2636 
2637 			mutex_enter(&memseg_lists_lock);
2638 			seg->lnext = memseg_delete_junk;
2639 			memseg_delete_junk = seg;
2640 			mutex_exit(&memseg_lists_lock);
2641 		}
2642 
2643 		/* Must not use seg now as it could be re-used. */
2644 
2645 		memlist_write_lock();
2646 
2647 		mlret = memlist_delete_span(
2648 		    (uint64_t)(mseg_base) << PAGESHIFT,
2649 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
2650 		    &phys_avail);
2651 		ASSERT(mlret == MEML_SPANOP_OK);
2652 
2653 		mlret = memlist_delete_span(
2654 		    (uint64_t)(mseg_start) << PAGESHIFT,
2655 		    (uint64_t)(mseg_end - mseg_start) <<
2656 		    PAGESHIFT,
2657 		    &phys_install);
2658 		ASSERT(mlret == MEML_SPANOP_OK);
2659 		phys_install_has_changed();
2660 
2661 		memlist_write_unlock();
2662 	}
2663 
2664 	memlist_read_lock();
2665 	installed_top_size(phys_install, &physmax, &physinstalled);
2666 	memlist_read_unlock();
2667 
2668 	mutex_enter(&freemem_lock);
2669 	maxmem -= avpgs;
2670 	physmem -= avpgs;
2671 	/* availrmem is adjusted during the delete. */
2672 	availrmem_initial -= avpgs;
2673 
2674 	mutex_exit(&freemem_lock);
2675 
2676 	dump_resize();
2677 
2678 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2679 	    "(0x%" PRIx64 ")\n",
2680 	    physinstalled << (PAGESHIFT - 10),
2681 	    (uint64_t)physinstalled << PAGESHIFT);
2682 
2683 	avmem = (uint64_t)freemem << PAGESHIFT;
2684 	cmn_err(CE_CONT, "?kphysm_delete: "
2685 	    "avail mem = %" PRId64 "\n", avmem);
2686 
2687 	/*
2688 	 * Update lgroup generation number on single lgroup systems
2689 	 */
2690 	if (nlgrps == 1)
2691 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2692 
2693 	/* Successfully deleted system memory */
2694 	mutex_enter(&mhp->mh_mutex);
2695 }
2696 
2697 static uint_t mdel_nullvp_waiter;
2698 
2699 static void
2700 page_delete_collect(
2701 	page_t *pp,
2702 	struct mem_handle *mhp)
2703 {
2704 	if (pp->p_vnode) {
2705 		page_hashout(pp, (kmutex_t *)NULL);
2706 		/* do not do PP_SETAGED(pp); */
2707 	} else {
2708 		kmutex_t *sep;
2709 
2710 		sep = page_se_mutex(pp);
2711 		mutex_enter(sep);
2712 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2713 			mdel_nullvp_waiter++;
2714 			cv_broadcast(&pp->p_cv);
2715 		}
2716 		mutex_exit(sep);
2717 	}
2718 	ASSERT(pp->p_next == pp->p_prev);
2719 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2720 	pp->p_next = mhp->mh_deleted;
2721 	mhp->mh_deleted = pp;
2722 	ASSERT(mhp->mh_hold_todo != 0);
2723 	mhp->mh_hold_todo--;
2724 }
2725 
2726 static void
2727 transit_list_collect(struct mem_handle *mhp, int v)
2728 {
2729 	struct transit_list_head *trh;
2730 
2731 	trh = &transit_list_head;
2732 	mutex_enter(&trh->trh_lock);
2733 	mhp->mh_transit.trl_collect = v;
2734 	mutex_exit(&trh->trh_lock);
2735 }
2736 
2737 static void
2738 transit_list_insert(struct transit_list *tlp)
2739 {
2740 	struct transit_list_head *trh;
2741 
2742 	trh = &transit_list_head;
2743 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2744 	tlp->trl_next = trh->trh_head;
2745 	trh->trh_head = tlp;
2746 }
2747 
2748 static void
2749 transit_list_remove(struct transit_list *tlp)
2750 {
2751 	struct transit_list_head *trh;
2752 	struct transit_list **tlpp;
2753 
2754 	trh = &transit_list_head;
2755 	tlpp = &trh->trh_head;
2756 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2757 	while (*tlpp != NULL && *tlpp != tlp)
2758 		tlpp = &(*tlpp)->trl_next;
2759 	ASSERT(*tlpp != NULL);
2760 	if (*tlpp == tlp)
2761 		*tlpp = tlp->trl_next;
2762 	tlp->trl_next = NULL;
2763 }
2764 
2765 static struct transit_list *
2766 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2767 {
2768 	struct transit_list *tlp;
2769 
2770 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2771 		struct memdelspan *mdsp;
2772 
2773 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2774 		    mdsp = mdsp->mds_next) {
2775 			if (pfnum >= mdsp->mds_base &&
2776 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2777 				return (tlp);
2778 			}
2779 		}
2780 	}
2781 	return (NULL);
2782 }
2783 
2784 int
2785 pfn_is_being_deleted(pfn_t pfnum)
2786 {
2787 	struct transit_list_head *trh;
2788 	struct transit_list *tlp;
2789 	int ret;
2790 
2791 	trh = &transit_list_head;
2792 	if (trh->trh_head == NULL)
2793 		return (0);
2794 
2795 	mutex_enter(&trh->trh_lock);
2796 	tlp = pfnum_to_transit_list(trh, pfnum);
2797 	ret = (tlp != NULL && tlp->trl_collect);
2798 	mutex_exit(&trh->trh_lock);
2799 
2800 	return (ret);
2801 }
2802 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the statistics gathered for the delete described by mhp.
 * Nothing is printed unless mem_del_stat_print is set.  The header line
 * shows the base and page count of the first span only.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

/* Print one counter, labelled with its own field name. */
#define	DELSTAT_PRINT(field)	\
	printf("\t%8u " #field "\n", mhp->mh_delstat.field)

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	DELSTAT_PRINT(nloop);
	DELSTAT_PRINT(need_free);
	DELSTAT_PRINT(free_loop);
	DELSTAT_PRINT(free_low);
	DELSTAT_PRINT(free_failed);
	DELSTAT_PRINT(ncheck);
	DELSTAT_PRINT(nopaget);
	DELSTAT_PRINT(lockfail);
	DELSTAT_PRINT(nfree);
	DELSTAT_PRINT(nreloc);
	DELSTAT_PRINT(nrelocfail);
	DELSTAT_PRINT(already_done);
	DELSTAT_PRINT(first_notfree);
	DELSTAT_PRINT(npplocked);
	DELSTAT_PRINT(nlockreloc);
	DELSTAT_PRINT(nnorepl);
	DELSTAT_PRINT(nmodreloc);
	DELSTAT_PRINT(ndestroy);
	DELSTAT_PRINT(nputpage);
	DELSTAT_PRINT(nnoreclaim);
	DELSTAT_PRINT(ndelay);
	DELSTAT_PRINT(demotefail);
	DELSTAT_PRINT(retired);
	DELSTAT_PRINT(toxic);
	DELSTAT_PRINT(failing);
	DELSTAT_PRINT(modtoxic);
	DELSTAT_PRINT(npplkdtoxic);
	DELSTAT_PRINT(gptlmodfail);
	DELSTAT_PRINT(gptllckfail);

#undef	DELSTAT_PRINT

	/* Tick totals are also shown converted to minutes and seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2856 
2857 struct mem_callback {
2858 	kphysm_setup_vector_t	*vec;
2859 	void			*arg;
2860 };
2861 
2862 #define	NMEMCALLBACKS		100
2863 
2864 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2865 static uint_t nmemcallbacks;
2866 static krwlock_t mem_callback_rwlock;
2867 
2868 int
2869 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2870 {
2871 	uint_t i, found;
2872 
2873 	/*
2874 	 * This test will become more complicated when the version must
2875 	 * change.
2876 	 */
2877 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2878 		return (EINVAL);
2879 
2880 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2881 	    vec->post_del == NULL)
2882 		return (EINVAL);
2883 
2884 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2885 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2886 		if (mem_callbacks[i].vec == NULL && found == 0)
2887 			found = i + 1;
2888 		if (mem_callbacks[i].vec == vec &&
2889 		    mem_callbacks[i].arg == arg) {
2890 #ifdef DEBUG
2891 			/* Catch this in DEBUG kernels. */
2892 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2893 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2894 			    (void *)vec, arg, (void *)caller());
2895 #endif /* DEBUG */
2896 			rw_exit(&mem_callback_rwlock);
2897 			return (EEXIST);
2898 		}
2899 	}
2900 	if (found != 0) {
2901 		i = found - 1;
2902 	} else {
2903 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
2904 		if (nmemcallbacks == NMEMCALLBACKS) {
2905 			rw_exit(&mem_callback_rwlock);
2906 			return (ENOMEM);
2907 		}
2908 		i = nmemcallbacks++;
2909 	}
2910 	mem_callbacks[i].vec = vec;
2911 	mem_callbacks[i].arg = arg;
2912 	rw_exit(&mem_callback_rwlock);
2913 	return (0);
2914 }
2915 
2916 void
2917 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
2918 {
2919 	uint_t i;
2920 
2921 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2922 	for (i = 0; i < nmemcallbacks; i++) {
2923 		if (mem_callbacks[i].vec == vec &&
2924 		    mem_callbacks[i].arg == arg) {
2925 			mem_callbacks[i].vec = NULL;
2926 			mem_callbacks[i].arg = NULL;
2927 			if (i == (nmemcallbacks - 1))
2928 				nmemcallbacks--;
2929 			break;
2930 		}
2931 	}
2932 	rw_exit(&mem_callback_rwlock);
2933 }
2934 
2935 static void
2936 kphysm_setup_post_add(pgcnt_t delta_pages)
2937 {
2938 	uint_t i;
2939 
2940 	rw_enter(&mem_callback_rwlock, RW_READER);
2941 	for (i = 0; i < nmemcallbacks; i++) {
2942 		if (mem_callbacks[i].vec != NULL) {
2943 			(*mem_callbacks[i].vec->post_add)
2944 			    (mem_callbacks[i].arg, delta_pages);
2945 		}
2946 	}
2947 	rw_exit(&mem_callback_rwlock);
2948 }
2949 
2950 /*
2951  * Note the locking between pre_del and post_del: The reader lock is held
2952  * between the two calls to stop the set of functions from changing.
2953  */
2954 
2955 static int
2956 kphysm_setup_pre_del(pgcnt_t delta_pages)
2957 {
2958 	uint_t i;
2959 	int ret;
2960 	int aret;
2961 
2962 	ret = 0;
2963 	rw_enter(&mem_callback_rwlock, RW_READER);
2964 	for (i = 0; i < nmemcallbacks; i++) {
2965 		if (mem_callbacks[i].vec != NULL) {
2966 			aret = (*mem_callbacks[i].vec->pre_del)
2967 			    (mem_callbacks[i].arg, delta_pages);
2968 			ret |= aret;
2969 		}
2970 	}
2971 
2972 	return (ret);
2973 }
2974 
2975 static void
2976 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
2977 {
2978 	uint_t i;
2979 
2980 	for (i = 0; i < nmemcallbacks; i++) {
2981 		if (mem_callbacks[i].vec != NULL) {
2982 			(*mem_callbacks[i].vec->post_del)
2983 			    (mem_callbacks[i].arg, delta_pages, cancelled);
2984 		}
2985 	}
2986 	rw_exit(&mem_callback_rwlock);
2987 }
2988 
2989 static int
2990 kphysm_split_memseg(
2991 	pfn_t base,
2992 	pgcnt_t npgs)
2993 {
2994 	struct memseg *seg;
2995 	struct memseg **segpp;
2996 	pgcnt_t size_low, size_high;
2997 	struct memseg *seg_low, *seg_mid, *seg_high;
2998 
2999 	/*
3000 	 * Lock the memsegs list against other updates now
3001 	 */
3002 	memsegs_lock(1);
3003 
3004 	/*
3005 	 * Find boot time memseg that wholly covers this area.
3006 	 */
3007 
3008 	/* First find the memseg with page 'base' in it. */
3009 	for (segpp = &memsegs; (seg = *segpp) != NULL;
3010 	    segpp = &((*segpp)->next)) {
3011 		if (base >= seg->pages_base && base < seg->pages_end)
3012 			break;
3013 	}
3014 	if (seg == NULL) {
3015 		memsegs_unlock(1);
3016 		return (0);
3017 	}
3018 	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
3019 		memsegs_unlock(1);
3020 		return (0);
3021 	}
3022 	if ((base + npgs) > seg->pages_end) {
3023 		memsegs_unlock(1);
3024 		return (0);
3025 	}
3026 
3027 	/*
3028 	 * Work out the size of the two segments that will
3029 	 * surround the new segment, one for low address
3030 	 * and one for high.
3031 	 */
3032 	ASSERT(base >= seg->pages_base);
3033 	size_low = base - seg->pages_base;
3034 	ASSERT(seg->pages_end >= (base + npgs));
3035 	size_high = seg->pages_end - (base + npgs);
3036 
3037 	/*
3038 	 * Sanity check.
3039 	 */
3040 	if ((size_low + size_high) == 0) {
3041 		memsegs_unlock(1);
3042 		return (0);
3043 	}
3044 
3045 	/*
3046 	 * Allocate the new structures. The old memseg will not be freed
3047 	 * as there may be a reference to it.
3048 	 */
3049 	seg_low = NULL;
3050 	seg_high = NULL;
3051 
3052 	if (size_low != 0) {
3053 		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3054 		bzero(seg_low, sizeof (struct memseg));
3055 	}
3056 
3057 	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3058 	bzero(seg_mid, sizeof (struct memseg));
3059 
3060 	if (size_high != 0) {
3061 		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3062 		bzero(seg_high, sizeof (struct memseg));
3063 	}
3064 
3065 	/*
3066 	 * All allocation done now.
3067 	 */
3068 	if (size_low != 0) {
3069 		seg_low->pages = seg->pages;
3070 		seg_low->epages = seg_low->pages + size_low;
3071 		seg_low->pages_base = seg->pages_base;
3072 		seg_low->pages_end = seg_low->pages_base + size_low;
3073 		seg_low->next = seg_mid;
3074 	}
3075 	if (size_high != 0) {
3076 		seg_high->pages = seg->epages - size_high;
3077 		seg_high->epages = seg_high->pages + size_high;
3078 		seg_high->pages_base = seg->pages_end - size_high;
3079 		seg_high->pages_end = seg_high->pages_base + size_high;
3080 		seg_high->next = seg->next;
3081 	}
3082 
3083 	seg_mid->pages = seg->pages + size_low;
3084 	seg_mid->pages_base = seg->pages_base + size_low;
3085 	seg_mid->epages = seg->epages - size_high;
3086 	seg_mid->pages_end = seg->pages_end - size_high;
3087 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3088 
3089 	/*
3090 	 * Update hat_kpm specific info of all involved memsegs and
3091 	 * allow hat_kpm specific global chain updates.
3092 	 */
3093 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3094 
3095 	/*
3096 	 * At this point we have two equivalent memseg sub-chains,
3097 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
3098 	 * the same place in the global chain. By re-writing the pointer
3099 	 * in the previous element we switch atomically from using the old
3100 	 * (seg) to the new.
3101 	 */
3102 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
3103 
3104 	membar_enter();
3105 
3106 	build_pfn_hash();
3107 	memsegs_unlock(1);
3108 
3109 	/*
3110 	 * We leave the old segment, 'seg', intact as there may be
3111 	 * references to it. Also, as the value of total_pages has not
3112 	 * changed and the memsegs list is effectively the same when
3113 	 * accessed via the old or the new pointer, we do not have to
3114 	 * cause pageout_scanner() to re-evaluate its hand pointers.
3115 	 *
3116 	 * We currently do not re-use or reclaim the page_t memory.
3117 	 * If we do, then this may have to change.
3118 	 */
3119 
3120 	mutex_enter(&memseg_lists_lock);
3121 	seg->lnext = memseg_edit_junk;
3122 	memseg_edit_junk = seg;
3123 	mutex_exit(&memseg_lists_lock);
3124 
3125 	return (1);
3126 }
3127 
3128 /*
3129  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3130  * structure using physical addresses. Therefore a kmem_cache is
3131  * used with KMC_NOHASH to avoid page crossings within a memseg
3132  * structure. KMC_NOHASH requires that no external (outside of
3133  * slab) information is allowed. This, in turn, implies that the
3134  * cache's slabsize must be exactly a single page, since per-slab
3135  * information (e.g. the freelist for the slab) is kept at the
3136  * end of the slab, where it is easy to locate. Should be changed
3137  * when a more obvious kmem_cache interface/flag will become
3138  * available.
3139  */
3140 void
3141 mem_config_init()
3142 {
3143 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3144 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3145 }
3146