xref: /titanic_50/usr/src/uts/common/os/mem_config.c (revision a1e9eea083a8f257157edb8a1efb5bbd300eb4bf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cmn_err.h>
28 #include <sys/vmem.h>
29 #include <sys/kmem.h>
30 #include <sys/systm.h>
31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
32 #include <sys/errno.h>
33 #include <sys/memnode.h>
34 #include <sys/memlist.h>
35 #include <sys/memlist_impl.h>
36 #include <sys/tuneable.h>
37 #include <sys/proc.h>
38 #include <sys/disp.h>
39 #include <sys/debug.h>
40 #include <sys/vm.h>
41 #include <sys/callb.h>
42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
44 #include <sys/dumphdr.h>	/* for dump_resize() */
45 #include <sys/atomic.h>		/* for use in stats collection */
46 #include <sys/rwlock.h>
47 #include <sys/cpuvar.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kpm.h>
50 #include <vm/page.h>
51 #include <vm/vm_dep.h>
52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
53 #include <sys/sunddi.h>
54 #include <sys/mem_config.h>
55 #include <sys/mem_cage.h>
56 #include <sys/lgrp.h>
57 #include <sys/ddi.h>
58 #include <sys/modctl.h>
59 
60 extern struct memlist *phys_avail;
61 
62 extern void mem_node_add(pfn_t, pfn_t);
63 extern void mem_node_del(pfn_t, pfn_t);
64 
65 extern uint_t page_ctrs_adjust(int);
66 static void kphysm_setup_post_add(pgcnt_t);
67 static int kphysm_setup_pre_del(pgcnt_t);
68 static void kphysm_setup_post_del(pgcnt_t, int);
69 
70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
71 
72 static int delspan_reserve(pfn_t, pgcnt_t);
73 static void delspan_unreserve(pfn_t, pgcnt_t);
74 
75 static kmutex_t memseg_lists_lock;
76 static struct memseg *memseg_va_avail;
77 static struct memseg *memseg_delete_junk;
78 static struct memseg *memseg_edit_junk;
79 void memseg_remap_init(void);
80 static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
82 static struct memseg *memseg_reuse(pgcnt_t);
83 
84 static struct kmem_cache *memseg_cache;
85 
86 /*
87  * Add a chunk of memory to the system.  page_t's for this memory
88  * are allocated in the first few pages of the chunk.
89  * base: starting PAGESIZE page of new memory.
90  * npgs: length in PAGESIZE pages.
91  *
92  * Adding mem this way doesn't increase the size of the hash tables;
93  * growing them would be too hard.  This should be OK, but adding memory
94  * dynamically most likely means more hash misses, since the tables will
95  * be smaller than they otherwise would be.
96  */
97 int
98 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
99 {
100 	page_t		*pp;
101 	page_t		*opp, *oepp;
102 	struct memseg	*seg;
103 	uint64_t	avmem;
104 	pfn_t		pfn;
105 	pfn_t		pt_base = base;
106 	pgcnt_t		tpgs = npgs;
107 	pgcnt_t		metapgs;
108 	int		exhausted;
109 	pfn_t		pnum;
110 	int		mnode;
111 	caddr_t		vaddr;
112 	int		reuse;
113 	int		mlret;
114 	void		*mapva;
115 	pgcnt_t		nkpmpgs = 0;
116 	offset_t	kpm_pages_off;
117 
118 	cmn_err(CE_CONT,
119 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
120 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
121 
122 	/*
123 	 * Add this span in the delete list to prevent interactions.
124 	 */
125 	if (!delspan_reserve(base, npgs)) {
126 		return (KPHYSM_ESPAN);
127 	}
128 	/*
129 	 * Check to see if any of the memory span has been added
130 	 * by trying an add to the installed memory list. This
131 	 * forms the interlocking process for add.
132 	 */
133 
134 	memlist_write_lock();
135 
136 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
137 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
138 
139 	if (mlret == MEML_SPANOP_OK)
140 		installed_top_size(phys_install, &physmax, &physinstalled);
141 
142 	memlist_write_unlock();
143 
144 	if (mlret != MEML_SPANOP_OK) {
145 		if (mlret == MEML_SPANOP_EALLOC) {
146 			delspan_unreserve(pt_base, tpgs);
147 			return (KPHYSM_ERESOURCE);
148 		} else
149 		if (mlret == MEML_SPANOP_ESPAN) {
150 			delspan_unreserve(pt_base, tpgs);
151 			return (KPHYSM_ESPAN);
152 		} else {
153 			delspan_unreserve(pt_base, tpgs);
154 			return (KPHYSM_ERESOURCE);
155 		}
156 	}
157 
158 	/*
159 	 * We store the page_t's for this new memory in the first
160 	 * few pages of the chunk. Here, we go and get'em ...
161 	 */
162 
163 	/*
164 	 * The expression after the '-' gives the number of pages
165 	 * that will fit in the new memory based on a requirement
166 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
167 	 */
168 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
169 	    (PAGESIZE + sizeof (page_t)));
170 
171 	npgs -= metapgs;
172 	base += metapgs;
173 
174 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
175 
176 	exhausted = (metapgs == 0 || npgs == 0);
177 
178 	if (kpm_enable && !exhausted) {
179 		pgcnt_t start, end, nkpmpgs_prelim;
180 		size_t	ptsz;
181 
182 		/*
183 		 * A viable kpm large page mapping must not overlap two
184 		 * dynamic memsegs. Therefore the total size is checked
185 		 * to be at least kpm_pgsz and also whether start and end
186 		 * points are at least kpm_pgsz aligned.
187 		 */
188 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
189 		    pmodkpmp(base + npgs)) {
190 
191 			kphysm_addmem_error_undospan(pt_base, tpgs);
192 
193 			/*
194 			 * There is no specific error code for violating
195 			 * kpm granularity constraints.
196 			 */
197 			return (KPHYSM_ENOTVIABLE);
198 		}
199 
200 		start = kpmptop(ptokpmp(base));
201 		end = kpmptop(ptokpmp(base + npgs));
202 		nkpmpgs_prelim = ptokpmp(end - start);
203 		ptsz = npgs * sizeof (page_t);
204 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
205 		exhausted = (tpgs <= metapgs);
206 		if (!exhausted) {
207 			npgs = tpgs - metapgs;
208 			base = pt_base + metapgs;
209 
210 			/* final nkpmpgs */
211 			start = kpmptop(ptokpmp(base));
212 			nkpmpgs = ptokpmp(end - start);
213 			kpm_pages_off = ptsz +
214 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
215 		}
216 	}
217 
218 	/*
219 	 * Is memory area supplied too small?
220 	 */
221 	if (exhausted) {
222 		kphysm_addmem_error_undospan(pt_base, tpgs);
223 
224 		/*
225 		 * There is no specific error code for 'too small'.
226 		 */
227 		return (KPHYSM_ERESOURCE);
228 	}
229 
230 	/*
231 	 * We may re-use a previously allocated VA space for the page_ts
232 	 * eventually, but we need to initialize and lock the pages first.
233 	 */
234 
235 	/*
236 	 * Get an address in the kernel address map, map
237 	 * the page_t pages and see if we can touch them.
238 	 */
239 
240 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
241 	if (mapva == NULL) {
242 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
243 		    " Can't allocate VA for page_ts");
244 
245 		kphysm_addmem_error_undospan(pt_base, tpgs);
246 
247 		return (KPHYSM_ERESOURCE);
248 	}
249 	pp = mapva;
250 
251 	if (physmax < (pt_base + tpgs))
252 		physmax = (pt_base + tpgs);
253 
254 	/*
255 	 * In the remapping code we map one page at a time so we must do
256 	 * the same here to match mapping sizes.
257 	 */
258 	pfn = pt_base;
259 	vaddr = (caddr_t)pp;
260 	for (pnum = 0; pnum < metapgs; pnum++) {
261 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
262 		    PROT_READ | PROT_WRITE,
263 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
264 		pfn++;
265 		vaddr += ptob(1);
266 	}
267 
268 	if (ddi_peek32((dev_info_t *)NULL,
269 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
270 
271 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
272 		    " Can't access pp array at 0x%p [phys 0x%lx]",
273 		    (void *)pp, pt_base);
274 
275 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
276 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
277 
278 		vmem_free(heap_arena, mapva, ptob(metapgs));
279 
280 		kphysm_addmem_error_undospan(pt_base, tpgs);
281 
282 		return (KPHYSM_EFAULT);
283 	}
284 
285 	/*
286 	 * Add this memory slice to its memory node translation.
287 	 *
288 	 * Note that right now, each node may have only one slice;
289 	 * this may change with COD or in larger SSM systems with
290 	 * nested latency groups, so we must not assume that the
291 	 * node does not yet exist.
292 	 *
293 	 * Also, using pt_base (page table base address)
294 	 * and tpgs (total number of pages) to mimic the case when a
295 	 * memory board is already installed in a system at boot
296 	 * time.  This will ensure the entire address range is
297 	 * specified in order to have proper deletion.
298 	 */
299 	pnum = pt_base + tpgs - 1;
300 	mem_node_add_slice(pt_base, pnum);
301 
302 	/*
303 	 * Allocate or resize page counters as necessary to accommodate
304 	 * the increase in memory pages.
305 	 */
306 	mnode = PFN_2_MEM_NODE(pnum);
307 	if (page_ctrs_adjust(mnode) != 0) {
308 
309 		mem_node_pre_del_slice(pt_base, pnum);
310 		mem_node_post_del_slice(pt_base, pnum, 0);
311 
312 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
313 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
314 
315 		vmem_free(heap_arena, mapva, ptob(metapgs));
316 
317 		kphysm_addmem_error_undospan(pt_base, tpgs);
318 
319 		return (KPHYSM_ERESOURCE);
320 	}
321 
322 	/*
323 	 * Update the phys_avail memory list.
324 	 * The phys_install list was done at the start.
325 	 */
326 
327 	memlist_write_lock();
328 
329 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
330 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
331 	ASSERT(mlret == MEML_SPANOP_OK);
332 
333 	memlist_write_unlock();
334 
335 	/* See if we can find a memseg to re-use. */
336 	seg = memseg_reuse(metapgs);
337 
338 	reuse = (seg != NULL);
339 
340 	/*
341 	 * Initialize the memseg structure representing this memory
342 	 * and add it to the existing list of memsegs. Do some basic
343 	 * initialization and add the memory to the system.
344 	 * In order to prevent lock deadlocks, the add_physmem()
345 	 * code is repeated here, but split into several stages.
346 	 */
347 	if (seg == NULL) {
348 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
349 		bzero(seg, sizeof (struct memseg));
350 		seg->msegflags = MEMSEG_DYNAMIC;
351 		seg->pages = pp;
352 	} else {
353 		/*EMPTY*/
354 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
355 	}
356 
357 	seg->epages = seg->pages + npgs;
358 	seg->pages_base = base;
359 	seg->pages_end = base + npgs;
360 
361 	/*
362 	 * Initialize metadata. The page_ts are set to locked state
363 	 * ready to be freed.
364 	 */
365 	bzero((caddr_t)pp, ptob(metapgs));
366 
367 	pfn = seg->pages_base;
368 	/* Save the original pp base in case we reuse a memseg. */
369 	opp = pp;
370 	oepp = opp + npgs;
371 	for (pp = opp; pp < oepp; pp++) {
372 		pp->p_pagenum = pfn;
373 		pfn++;
374 		page_iolock_init(pp);
375 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
376 			continue;
377 		pp->p_offset = (u_offset_t)-1;
378 	}
379 
380 	if (reuse) {
381 		/* Remap our page_ts to the re-used memseg VA space. */
382 		pfn = pt_base;
383 		vaddr = (caddr_t)seg->pages;
384 		for (pnum = 0; pnum < metapgs; pnum++) {
385 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
386 			    PROT_READ | PROT_WRITE,
387 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
388 			pfn++;
389 			vaddr += ptob(1);
390 		}
391 
392 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
393 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
394 
395 		vmem_free(heap_arena, mapva, ptob(metapgs));
396 	}
397 
398 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
399 
400 	memsegs_lock(1);
401 
402 	/*
403 	 * The new memseg is inserted at the beginning of the list.
404 	 * Not only does this save searching for the tail, but in the
405 	 * case of a re-used memseg, it solves the problem of what
406 	 * happens if some process has still got a pointer to the
407 	 * memseg and follows the next pointer to continue traversing
408 	 * the memsegs list.
409 	 */
410 
411 	hat_kpm_addmem_mseg_insert(seg);
412 
413 	seg->next = memsegs;
414 	membar_producer();
415 
416 	hat_kpm_addmem_memsegs_update(seg);
417 
418 	memsegs = seg;
419 
420 	build_pfn_hash();
421 
422 	total_pages += npgs;
423 
424 	/*
425 	 * Recalculate the paging parameters now total_pages has changed.
426 	 * This will also cause the clock hands to be reset before next use.
427 	 */
428 	setupclock(1);
429 
430 	memsegs_unlock(1);
431 
432 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
433 
434 	/*
435 	 * Free the pages outside the lock to avoid locking loops.
436 	 */
437 	for (pp = seg->pages; pp < seg->epages; pp++) {
438 		page_free(pp, 1);
439 	}
440 
441 	/*
442 	 * Now that we've updated the appropriate memory lists we
443 	 * need to reset a number of globals, since we've increased memory.
444 	 * Several have already been updated for us as noted above. The
445 	 * globals we're interested in at this point are:
446 	 *   physmax - highest page frame number.
447 	 *   physinstalled - number of pages currently installed (done earlier)
448 	 *   maxmem - max free pages in the system
449 	 *   physmem - physical memory pages available
450 	 *   availrmem - real memory available
451 	 */
452 
453 	mutex_enter(&freemem_lock);
454 	maxmem += npgs;
455 	physmem += npgs;
456 	availrmem += npgs;
457 	availrmem_initial += npgs;
458 
459 	mutex_exit(&freemem_lock);
460 
461 	dump_resize();
462 
463 	page_freelist_coalesce_all(mnode);
464 
465 	kphysm_setup_post_add(npgs);
466 
467 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
468 	    "(0x%" PRIx64 ")\n",
469 	    physinstalled << (PAGESHIFT - 10),
470 	    (uint64_t)physinstalled << PAGESHIFT);
471 
472 	avmem = (uint64_t)freemem << PAGESHIFT;
473 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
474 	    "avail mem = %" PRId64 "\n", avmem);
475 
476 	/*
477 	 * Update lgroup generation number on single lgroup systems
478 	 */
479 	if (nlgrps == 1)
480 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
481 
482 	delspan_unreserve(pt_base, tpgs);
483 	return (KPHYSM_OK);		/* Successfully added system memory */
484 
485 }
486 
487 /*
488  * There are various error conditions in kphysm_add_memory_dynamic()
489  * which require a rollback of already changed global state.
490  */
491 static void
492 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
493 {
494 	int mlret;
495 
496 	/* Unreserve memory span. */
497 	memlist_write_lock();
498 
499 	mlret = memlist_delete_span(
500 	    (uint64_t)(pt_base) << PAGESHIFT,
501 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
502 
503 	ASSERT(mlret == MEML_SPANOP_OK);
504 	phys_install_has_changed();
505 	installed_top_size(phys_install, &physmax, &physinstalled);
506 
507 	memlist_write_unlock();
508 	delspan_unreserve(pt_base, tpgs);
509 }
510 
511 /*
512  * Only return an available memseg of exactly the right size.
513  * When the meta data area has it's own virtual address space
514  * we will need to manage this more carefully and do best fit
515  * allocations, possibly splitting an available area.
516  */
517 static struct memseg *
518 memseg_reuse(pgcnt_t metapgs)
519 {
520 	struct memseg **segpp, *seg;
521 
522 	mutex_enter(&memseg_lists_lock);
523 
524 	segpp = &memseg_va_avail;
525 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
526 		caddr_t end;
527 
528 		if (kpm_enable)
529 			end = hat_kpm_mseg_reuse(seg);
530 		else
531 			end = (caddr_t)seg->epages;
532 
533 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
534 			*segpp = seg->lnext;
535 			seg->lnext = NULL;
536 			break;
537 		}
538 	}
539 	mutex_exit(&memseg_lists_lock);
540 
541 	return (seg);
542 }
543 
544 static uint_t handle_gen;
545 
546 struct memdelspan {
547 	struct memdelspan *mds_next;
548 	pfn_t		mds_base;
549 	pgcnt_t		mds_npgs;
550 	uint_t		*mds_bitmap;
551 	uint_t		*mds_bitmap_retired;
552 };
553 
554 #define	NBPBMW		(sizeof (uint_t) * NBBY)
555 #define	MDS_BITMAPBYTES(MDSP) \
556 	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
557 
558 struct transit_list {
559 	struct transit_list	*trl_next;
560 	struct memdelspan	*trl_spans;
561 	int			trl_collect;
562 };
563 
564 struct transit_list_head {
565 	kmutex_t		trh_lock;
566 	struct transit_list	*trh_head;
567 };
568 
569 static struct transit_list_head transit_list_head;
570 
571 struct mem_handle;
572 static void transit_list_collect(struct mem_handle *, int);
573 static void transit_list_insert(struct transit_list *);
574 static void transit_list_remove(struct transit_list *);
575 
/* Delete statistics are collected only in DEBUG kernels. */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;

/* Per-handle counters; embedded in struct mem_handle as mh_delstat. */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 *
 * Each expansion is fully parenthesized so that it behaves as a single
 * expression wherever it is used.
 */
#define	MDSTAT_INCR(MHP, FLD)	((MHP)->mh_delstat.FLD++)
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
/* Without MEM_DEL_STATS the statistics macros expand to nothing. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
630 
631 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
632 	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
633 
634 /*
635  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
636  * The mutex may not be required for other fields, dependent on mh_state.
637  */
638 struct mem_handle {
639 	kmutex_t	mh_mutex;
640 	struct mem_handle *mh_next;
641 	memhandle_t	mh_exthandle;
642 	mhnd_state_t	mh_state;
643 	struct transit_list mh_transit;
644 	pgcnt_t		mh_phys_pages;
645 	pgcnt_t		mh_vm_pages;
646 	pgcnt_t		mh_hold_todo;
647 	void		(*mh_delete_complete)(void *, int error);
648 	void		*mh_delete_complete_arg;
649 	volatile uint_t mh_cancel;
650 	volatile uint_t mh_dr_aio_cleanup_cancel;
651 	volatile uint_t mh_aio_cleanup_done;
652 	kcondvar_t	mh_cv;
653 	kthread_id_t	mh_thread_id;
654 	page_t		*mh_deleted;	/* link through p_next */
655 #ifdef MEM_DEL_STATS
656 	struct mem_del_stat mh_delstat;
657 #endif /* MEM_DEL_STATS */
658 };
659 
660 static struct mem_handle *mem_handle_head;
661 static kmutex_t mem_handle_list_mutex;
662 
663 static struct mem_handle *
664 kphysm_allocate_mem_handle()
665 {
666 	struct mem_handle *mhp;
667 
668 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
669 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
670 	mutex_enter(&mem_handle_list_mutex);
671 	mutex_enter(&mhp->mh_mutex);
672 	/* handle_gen is protected by list mutex. */
673 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
674 	mhp->mh_next = mem_handle_head;
675 	mem_handle_head = mhp;
676 	mutex_exit(&mem_handle_list_mutex);
677 
678 	return (mhp);
679 }
680 
681 static void
682 kphysm_free_mem_handle(struct mem_handle *mhp)
683 {
684 	struct mem_handle **mhpp;
685 
686 	ASSERT(mutex_owned(&mhp->mh_mutex));
687 	ASSERT(mhp->mh_state == MHND_FREE);
688 	/*
689 	 * Exit the mutex to preserve locking order. This is OK
690 	 * here as once in the FREE state, the handle cannot
691 	 * be found by a lookup.
692 	 */
693 	mutex_exit(&mhp->mh_mutex);
694 
695 	mutex_enter(&mem_handle_list_mutex);
696 	mhpp = &mem_handle_head;
697 	while (*mhpp != NULL && *mhpp != mhp)
698 		mhpp = &(*mhpp)->mh_next;
699 	ASSERT(*mhpp == mhp);
700 	/*
701 	 * No need to lock the handle (mh_mutex) as only
702 	 * mh_next changing and this is the only thread that
703 	 * can be referncing mhp.
704 	 */
705 	*mhpp = mhp->mh_next;
706 	mutex_exit(&mem_handle_list_mutex);
707 
708 	mutex_destroy(&mhp->mh_mutex);
709 	kmem_free(mhp, sizeof (struct mem_handle));
710 }
711 
712 /*
713  * This function finds the internal mem_handle corresponding to an
714  * external handle and returns it with the mh_mutex held.
715  */
716 static struct mem_handle *
717 kphysm_lookup_mem_handle(memhandle_t handle)
718 {
719 	struct mem_handle *mhp;
720 
721 	mutex_enter(&mem_handle_list_mutex);
722 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
723 		if (mhp->mh_exthandle == handle) {
724 			mutex_enter(&mhp->mh_mutex);
725 			/*
726 			 * The state of the handle could have been changed
727 			 * by kphysm_del_release() while waiting for mh_mutex.
728 			 */
729 			if (mhp->mh_state == MHND_FREE) {
730 				mutex_exit(&mhp->mh_mutex);
731 				continue;
732 			}
733 			break;
734 		}
735 	}
736 	mutex_exit(&mem_handle_list_mutex);
737 	return (mhp);
738 }
739 
740 int
741 kphysm_del_gethandle(memhandle_t *xmhp)
742 {
743 	struct mem_handle *mhp;
744 
745 	mhp = kphysm_allocate_mem_handle();
746 	/*
747 	 * The handle is allocated using KM_SLEEP, so cannot fail.
748 	 * If the implementation is changed, the correct error to return
749 	 * here would be KPHYSM_ENOHANDLES.
750 	 */
751 	ASSERT(mhp->mh_state == MHND_FREE);
752 	mhp->mh_state = MHND_INIT;
753 	*xmhp = mhp->mh_exthandle;
754 	mutex_exit(&mhp->mh_mutex);
755 	return (KPHYSM_OK);
756 }
757 
758 static int
759 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
760 {
761 	pfn_t e1, e2;
762 
763 	e1 = b1 + l1;
764 	e2 = b2 + l2;
765 
766 	return (!(b2 >= e1 || b1 >= e2));
767 }
768 
769 static int can_remove_pgs(pgcnt_t);
770 
771 static struct memdelspan *
772 span_to_install(pfn_t base, pgcnt_t npgs)
773 {
774 	struct memdelspan *mdsp;
775 	struct memdelspan *mdsp_new;
776 	uint64_t address, size, thislen;
777 	struct memlist *mlp;
778 
779 	mdsp_new = NULL;
780 
781 	address = (uint64_t)base << PAGESHIFT;
782 	size = (uint64_t)npgs << PAGESHIFT;
783 	while (size != 0) {
784 		memlist_read_lock();
785 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
786 			if (address >= (mlp->address + mlp->size))
787 				continue;
788 			if ((address + size) > mlp->address)
789 				break;
790 		}
791 		if (mlp == NULL) {
792 			address += size;
793 			size = 0;
794 			thislen = 0;
795 		} else {
796 			if (address < mlp->address) {
797 				size -= (mlp->address - address);
798 				address = mlp->address;
799 			}
800 			ASSERT(address >= mlp->address);
801 			if ((address + size) > (mlp->address + mlp->size)) {
802 				thislen = mlp->size - (address - mlp->address);
803 			} else {
804 				thislen = size;
805 			}
806 		}
807 		memlist_read_unlock();
808 		/* TODO: phys_install could change now */
809 		if (thislen == 0)
810 			continue;
811 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
812 		mdsp->mds_base = btop(address);
813 		mdsp->mds_npgs = btop(thislen);
814 		mdsp->mds_next = mdsp_new;
815 		mdsp_new = mdsp;
816 		address += thislen;
817 		size -= thislen;
818 	}
819 	return (mdsp_new);
820 }
821 
822 static void
823 free_delspans(struct memdelspan *mdsp)
824 {
825 	struct memdelspan *amdsp;
826 
827 	while ((amdsp = mdsp) != NULL) {
828 		mdsp = amdsp->mds_next;
829 		kmem_free(amdsp, sizeof (struct memdelspan));
830 	}
831 }
832 
833 /*
834  * Concatenate lists. No list ordering is required.
835  */
836 
837 static void
838 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
839 {
840 	while (*mdspp != NULL)
841 		mdspp = &(*mdspp)->mds_next;
842 
843 	*mdspp = mdsp;
844 }
845 
846 /*
847  * Given a new list of delspans, check there is no overlap with
848  * all existing span activity (add or delete) and then concatenate
849  * the new spans to the given list.
850  * Return 1 for OK, 0 if overlapping.
851  */
852 static int
853 delspan_insert(
854 	struct transit_list *my_tlp,
855 	struct memdelspan *mdsp_new)
856 {
857 	struct transit_list_head *trh;
858 	struct transit_list *tlp;
859 	int ret;
860 
861 	trh = &transit_list_head;
862 
863 	ASSERT(my_tlp != NULL);
864 	ASSERT(mdsp_new != NULL);
865 
866 	ret = 1;
867 	mutex_enter(&trh->trh_lock);
868 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
869 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
870 		struct memdelspan *mdsp;
871 
872 		for (mdsp = tlp->trl_spans; mdsp != NULL;
873 		    mdsp = mdsp->mds_next) {
874 			struct memdelspan *nmdsp;
875 
876 			for (nmdsp = mdsp_new; nmdsp != NULL;
877 			    nmdsp = nmdsp->mds_next) {
878 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
879 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
880 					ret = 0;
881 					goto done;
882 				}
883 			}
884 		}
885 	}
886 done:
887 	if (ret != 0) {
888 		if (my_tlp->trl_spans == NULL)
889 			transit_list_insert(my_tlp);
890 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
891 	}
892 	mutex_exit(&trh->trh_lock);
893 	return (ret);
894 }
895 
896 static void
897 delspan_remove(
898 	struct transit_list *my_tlp,
899 	pfn_t base,
900 	pgcnt_t npgs)
901 {
902 	struct transit_list_head *trh;
903 	struct memdelspan *mdsp;
904 
905 	trh = &transit_list_head;
906 
907 	ASSERT(my_tlp != NULL);
908 
909 	mutex_enter(&trh->trh_lock);
910 	if ((mdsp = my_tlp->trl_spans) != NULL) {
911 		if (npgs == 0) {
912 			my_tlp->trl_spans = NULL;
913 			free_delspans(mdsp);
914 			transit_list_remove(my_tlp);
915 		} else {
916 			struct memdelspan **prv;
917 
918 			prv = &my_tlp->trl_spans;
919 			while (mdsp != NULL) {
920 				pfn_t p_end;
921 
922 				p_end = mdsp->mds_base + mdsp->mds_npgs;
923 				if (mdsp->mds_base >= base &&
924 				    p_end <= (base + npgs)) {
925 					*prv = mdsp->mds_next;
926 					mdsp->mds_next = NULL;
927 					free_delspans(mdsp);
928 				} else {
929 					prv = &mdsp->mds_next;
930 				}
931 				mdsp = *prv;
932 			}
933 			if (my_tlp->trl_spans == NULL)
934 				transit_list_remove(my_tlp);
935 		}
936 	}
937 	mutex_exit(&trh->trh_lock);
938 }
939 
940 /*
941  * Reserve interface for add to stop delete before add finished.
942  * This list is only accessed through the delspan_insert/remove
943  * functions and so is fully protected by the mutex in struct transit_list.
944  */
945 
946 static struct transit_list reserve_transit;
947 
948 static int
949 delspan_reserve(pfn_t base, pgcnt_t npgs)
950 {
951 	struct memdelspan *mdsp;
952 	int ret;
953 
954 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
955 	mdsp->mds_base = base;
956 	mdsp->mds_npgs = npgs;
957 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
958 		free_delspans(mdsp);
959 	}
960 	return (ret);
961 }
962 
963 static void
964 delspan_unreserve(pfn_t base, pgcnt_t npgs)
965 {
966 	delspan_remove(&reserve_transit, base, npgs);
967 }
968 
969 /*
970  * Return whether memseg was created by kphysm_add_memory_dynamic().
971  * If this is the case and startp non zero, return also the start pfn
972  * of the meta data via startp.
973  */
974 static int
975 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
976 {
977 	pfn_t		pt_start;
978 
979 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
980 		return (0);
981 
982 	/* Meta data is required to be at the beginning */
983 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
984 
985 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
986 	if (startp != NULL)
987 		*startp = pt_start;
988 
989 	return (1);
990 }
991 
992 int
993 kphysm_del_span(
994 	memhandle_t handle,
995 	pfn_t base,
996 	pgcnt_t npgs)
997 {
998 	struct mem_handle *mhp;
999 	struct memseg *seg;
1000 	struct memdelspan *mdsp;
1001 	struct memdelspan *mdsp_new;
1002 	pgcnt_t phys_pages, vm_pages;
1003 	pfn_t p_end;
1004 	page_t *pp;
1005 	int ret;
1006 
1007 	mhp = kphysm_lookup_mem_handle(handle);
1008 	if (mhp == NULL) {
1009 		return (KPHYSM_EHANDLE);
1010 	}
1011 	if (mhp->mh_state != MHND_INIT) {
1012 		mutex_exit(&mhp->mh_mutex);
1013 		return (KPHYSM_ESEQUENCE);
1014 	}
1015 
1016 	/*
1017 	 * Intersect the span with the installed memory list (phys_install).
1018 	 */
1019 	mdsp_new = span_to_install(base, npgs);
1020 	if (mdsp_new == NULL) {
1021 		/*
1022 		 * No physical memory in this range. Is this an
1023 		 * error? If an attempt to start the delete is made
1024 		 * for OK returns from del_span such as this, start will
1025 		 * return an error.
1026 		 * Could return KPHYSM_ENOWORK.
1027 		 */
1028 		/*
1029 		 * It is assumed that there are no error returns
1030 		 * from span_to_install() due to kmem_alloc failure.
1031 		 */
1032 		mutex_exit(&mhp->mh_mutex);
1033 		return (KPHYSM_OK);
1034 	}
1035 	/*
1036 	 * Does this span overlap an existing span?
1037 	 */
1038 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1039 		/*
1040 		 * Differentiate between already on list for this handle
1041 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1042 		 */
1043 		ret = KPHYSM_EBUSY;
1044 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1045 		    mdsp = mdsp->mds_next) {
1046 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1047 			    base, npgs)) {
1048 				ret = KPHYSM_EDUP;
1049 				break;
1050 			}
1051 		}
1052 		mutex_exit(&mhp->mh_mutex);
1053 		free_delspans(mdsp_new);
1054 		return (ret);
1055 	}
1056 	/*
1057 	 * At this point the spans in mdsp_new have been inserted into the
1058 	 * list of spans for this handle and thereby to the global list of
1059 	 * spans being processed. Each of these spans must now be checked
1060 	 * for relocatability. As a side-effect segments in the memseg list
1061 	 * may be split.
1062 	 *
1063 	 * Note that mdsp_new can no longer be used as it is now part of
1064 	 * a larger list. Select elements of this larger list based
1065 	 * on base and npgs.
1066 	 */
1067 restart:
1068 	phys_pages = 0;
1069 	vm_pages = 0;
1070 	ret = KPHYSM_OK;
1071 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1072 	    mdsp = mdsp->mds_next) {
1073 		pgcnt_t pages_checked;
1074 
1075 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1076 			continue;
1077 		}
1078 		p_end = mdsp->mds_base + mdsp->mds_npgs;
1079 		/*
1080 		 * The pages_checked count is a hack. All pages should be
1081 		 * checked for relocatability. Those not covered by memsegs
1082 		 * should be tested with arch_kphysm_del_span_ok().
1083 		 */
1084 		pages_checked = 0;
1085 		for (seg = memsegs; seg; seg = seg->next) {
1086 			pfn_t mseg_start;
1087 
1088 			if (seg->pages_base >= p_end ||
1089 			    seg->pages_end <= mdsp->mds_base) {
1090 				/* Span and memseg don't overlap. */
1091 				continue;
1092 			}
1093 			/* Check that segment is suitable for delete. */
1094 			if (memseg_is_dynamic(seg, &mseg_start)) {
1095 				/*
1096 				 * Can only delete whole added segments
1097 				 * for the moment.
1098 				 * Check that this is completely within the
1099 				 * span.
1100 				 */
1101 				if (mseg_start < mdsp->mds_base ||
1102 				    seg->pages_end > p_end) {
1103 					ret = KPHYSM_EBUSY;
1104 					break;
1105 				}
1106 				pages_checked += seg->pages_end - mseg_start;
1107 			} else {
1108 				/*
1109 				 * Set mseg_start for accounting below.
1110 				 */
1111 				mseg_start = seg->pages_base;
1112 				/*
1113 				 * If this segment is larger than the span,
1114 				 * try to split it. After the split, it
1115 				 * is necessary to restart.
1116 				 */
1117 				if (seg->pages_base < mdsp->mds_base ||
1118 				    seg->pages_end > p_end) {
1119 					pfn_t abase;
1120 					pgcnt_t anpgs;
1121 					int s_ret;
1122 
1123 					/* Split required.  */
1124 					if (mdsp->mds_base < seg->pages_base)
1125 						abase = seg->pages_base;
1126 					else
1127 						abase = mdsp->mds_base;
1128 					if (p_end > seg->pages_end)
1129 						anpgs = seg->pages_end - abase;
1130 					else
1131 						anpgs = p_end - abase;
1132 					s_ret = kphysm_split_memseg(abase,
1133 					    anpgs);
1134 					if (s_ret == 0) {
1135 						/* Split failed. */
1136 						ret = KPHYSM_ERESOURCE;
1137 						break;
1138 					}
1139 					goto restart;
1140 				}
1141 				pages_checked +=
1142 				    seg->pages_end - seg->pages_base;
1143 			}
1144 			/*
1145 			 * The memseg is wholly within the delete span.
1146 			 * The individual pages can now be checked.
1147 			 */
1148 			/* Cage test. */
1149 			for (pp = seg->pages; pp < seg->epages; pp++) {
1150 				if (PP_ISNORELOC(pp)) {
1151 					ret = KPHYSM_ENONRELOC;
1152 					break;
1153 				}
1154 			}
1155 			if (ret != KPHYSM_OK) {
1156 				break;
1157 			}
1158 			phys_pages += (seg->pages_end - mseg_start);
1159 			vm_pages += MSEG_NPAGES(seg);
1160 		}
1161 		if (ret != KPHYSM_OK)
1162 			break;
1163 		if (pages_checked != mdsp->mds_npgs) {
1164 			ret = KPHYSM_ENONRELOC;
1165 			break;
1166 		}
1167 	}
1168 
1169 	if (ret == KPHYSM_OK) {
1170 		mhp->mh_phys_pages += phys_pages;
1171 		mhp->mh_vm_pages += vm_pages;
1172 	} else {
1173 		/*
1174 		 * Keep holding the mh_mutex to prevent it going away.
1175 		 */
1176 		delspan_remove(&mhp->mh_transit, base, npgs);
1177 	}
1178 	mutex_exit(&mhp->mh_mutex);
1179 	return (ret);
1180 }
1181 
1182 int
1183 kphysm_del_span_query(
1184 	pfn_t base,
1185 	pgcnt_t npgs,
1186 	memquery_t *mqp)
1187 {
1188 	struct memdelspan *mdsp;
1189 	struct memdelspan *mdsp_new;
1190 	int done_first_nonreloc;
1191 
1192 	mqp->phys_pages = 0;
1193 	mqp->managed = 0;
1194 	mqp->nonrelocatable = 0;
1195 	mqp->first_nonrelocatable = 0;
1196 	mqp->last_nonrelocatable = 0;
1197 
1198 	mdsp_new = span_to_install(base, npgs);
1199 	/*
1200 	 * It is OK to proceed here if mdsp_new == NULL.
1201 	 */
1202 	done_first_nonreloc = 0;
1203 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1204 		pfn_t sbase;
1205 		pgcnt_t snpgs;
1206 
1207 		mqp->phys_pages += mdsp->mds_npgs;
1208 		sbase = mdsp->mds_base;
1209 		snpgs = mdsp->mds_npgs;
1210 		while (snpgs != 0) {
1211 			struct memseg *lseg, *seg;
1212 			pfn_t p_end;
1213 			page_t *pp;
1214 			pfn_t mseg_start;
1215 
1216 			p_end = sbase + snpgs;
1217 			/*
1218 			 * Find the lowest addressed memseg that starts
1219 			 * after sbase and account for it.
1220 			 * This is to catch dynamic memsegs whose start
1221 			 * is hidden.
1222 			 */
1223 			seg = NULL;
1224 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1225 				if ((lseg->pages_base >= sbase) ||
1226 				    (lseg->pages_base < p_end &&
1227 				    lseg->pages_end > sbase)) {
1228 					if (seg == NULL ||
1229 					    seg->pages_base > lseg->pages_base)
1230 						seg = lseg;
1231 				}
1232 			}
1233 			if (seg != NULL) {
1234 				if (!memseg_is_dynamic(seg, &mseg_start)) {
1235 					mseg_start = seg->pages_base;
1236 				}
1237 				/*
1238 				 * Now have the full extent of the memseg so
1239 				 * do the range check.
1240 				 */
1241 				if (mseg_start >= p_end ||
1242 				    seg->pages_end <= sbase) {
1243 					/* Span does not overlap memseg. */
1244 					seg = NULL;
1245 				}
1246 			}
1247 			/*
1248 			 * Account for gap either before the segment if
1249 			 * there is one or to the end of the span.
1250 			 */
1251 			if (seg == NULL || mseg_start > sbase) {
1252 				pfn_t a_end;
1253 
1254 				a_end = (seg == NULL) ? p_end : mseg_start;
1255 				/*
1256 				 * Check with arch layer for relocatability.
1257 				 */
1258 				if (arch_kphysm_del_span_ok(sbase,
1259 				    (a_end - sbase))) {
1260 					/*
1261 					 * No non-relocatble pages in this
1262 					 * area, avoid the fine-grained
1263 					 * test.
1264 					 */
1265 					snpgs -= (a_end - sbase);
1266 					sbase = a_end;
1267 				}
1268 				while (sbase < a_end) {
1269 					if (!arch_kphysm_del_span_ok(sbase,
1270 					    1)) {
1271 						mqp->nonrelocatable++;
1272 						if (!done_first_nonreloc) {
1273 							mqp->
1274 							    first_nonrelocatable
1275 							    = sbase;
1276 							done_first_nonreloc = 1;
1277 						}
1278 						mqp->last_nonrelocatable =
1279 						    sbase;
1280 					}
1281 					sbase++;
1282 					snpgs--;
1283 				}
1284 			}
1285 			if (seg != NULL) {
1286 				ASSERT(mseg_start <= sbase);
1287 				if (seg->pages_base != mseg_start &&
1288 				    seg->pages_base > sbase) {
1289 					pgcnt_t skip_pgs;
1290 
1291 					/*
1292 					 * Skip the page_t area of a
1293 					 * dynamic memseg.
1294 					 */
1295 					skip_pgs = seg->pages_base - sbase;
1296 					if (snpgs <= skip_pgs) {
1297 						sbase += snpgs;
1298 						snpgs = 0;
1299 						continue;
1300 					}
1301 					snpgs -= skip_pgs;
1302 					sbase += skip_pgs;
1303 				}
1304 				ASSERT(snpgs != 0);
1305 				ASSERT(seg->pages_base <= sbase);
1306 				/*
1307 				 * The individual pages can now be checked.
1308 				 */
1309 				for (pp = seg->pages +
1310 				    (sbase - seg->pages_base);
1311 				    snpgs != 0 && pp < seg->epages; pp++) {
1312 					mqp->managed++;
1313 					if (PP_ISNORELOC(pp)) {
1314 						mqp->nonrelocatable++;
1315 						if (!done_first_nonreloc) {
1316 							mqp->
1317 							    first_nonrelocatable
1318 							    = sbase;
1319 							done_first_nonreloc = 1;
1320 						}
1321 						mqp->last_nonrelocatable =
1322 						    sbase;
1323 					}
1324 					sbase++;
1325 					snpgs--;
1326 				}
1327 			}
1328 		}
1329 	}
1330 
1331 	free_delspans(mdsp_new);
1332 
1333 	return (KPHYSM_OK);
1334 }
1335 
1336 /*
1337  * This release function can be called at any stage as follows:
1338  *	_gethandle only called
1339  *	_span(s) only called
1340  *	_start called but failed
1341  *	delete thread exited
1342  */
1343 int
1344 kphysm_del_release(memhandle_t handle)
1345 {
1346 	struct mem_handle *mhp;
1347 
1348 	mhp = kphysm_lookup_mem_handle(handle);
1349 	if (mhp == NULL) {
1350 		return (KPHYSM_EHANDLE);
1351 	}
1352 	switch (mhp->mh_state) {
1353 	case MHND_STARTING:
1354 	case MHND_RUNNING:
1355 		mutex_exit(&mhp->mh_mutex);
1356 		return (KPHYSM_ENOTFINISHED);
1357 	case MHND_FREE:
1358 		ASSERT(mhp->mh_state != MHND_FREE);
1359 		mutex_exit(&mhp->mh_mutex);
1360 		return (KPHYSM_EHANDLE);
1361 	case MHND_INIT:
1362 		break;
1363 	case MHND_DONE:
1364 		break;
1365 	case MHND_RELEASE:
1366 		mutex_exit(&mhp->mh_mutex);
1367 		return (KPHYSM_ESEQUENCE);
1368 	default:
1369 #ifdef DEBUG
1370 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1371 		    (void *)mhp, mhp->mh_state);
1372 #endif /* DEBUG */
1373 		mutex_exit(&mhp->mh_mutex);
1374 		return (KPHYSM_EHANDLE);
1375 	}
1376 	/*
1377 	 * Set state so that we can wait if necessary.
1378 	 * Also this means that we have read/write access to all
1379 	 * fields except mh_exthandle and mh_state.
1380 	 */
1381 	mhp->mh_state = MHND_RELEASE;
1382 	/*
1383 	 * The mem_handle cannot be de-allocated by any other operation
1384 	 * now, so no need to hold mh_mutex.
1385 	 */
1386 	mutex_exit(&mhp->mh_mutex);
1387 
1388 	delspan_remove(&mhp->mh_transit, 0, 0);
1389 	mhp->mh_phys_pages = 0;
1390 	mhp->mh_vm_pages = 0;
1391 	mhp->mh_hold_todo = 0;
1392 	mhp->mh_delete_complete = NULL;
1393 	mhp->mh_delete_complete_arg = NULL;
1394 	mhp->mh_cancel = 0;
1395 
1396 	mutex_enter(&mhp->mh_mutex);
1397 	ASSERT(mhp->mh_state == MHND_RELEASE);
1398 	mhp->mh_state = MHND_FREE;
1399 
1400 	kphysm_free_mem_handle(mhp);
1401 
1402 	return (KPHYSM_OK);
1403 }
1404 
1405 /*
1406  * This cancel function can only be called with the thread running.
1407  */
1408 int
1409 kphysm_del_cancel(memhandle_t handle)
1410 {
1411 	struct mem_handle *mhp;
1412 
1413 	mhp = kphysm_lookup_mem_handle(handle);
1414 	if (mhp == NULL) {
1415 		return (KPHYSM_EHANDLE);
1416 	}
1417 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1418 		mutex_exit(&mhp->mh_mutex);
1419 		return (KPHYSM_ENOTRUNNING);
1420 	}
1421 	/*
1422 	 * Set the cancel flag and wake the delete thread up.
1423 	 * The thread may be waiting on I/O, so the effect of the cancel
1424 	 * may be delayed.
1425 	 */
1426 	if (mhp->mh_cancel == 0) {
1427 		mhp->mh_cancel = KPHYSM_ECANCELLED;
1428 		cv_signal(&mhp->mh_cv);
1429 	}
1430 	mutex_exit(&mhp->mh_mutex);
1431 	return (KPHYSM_OK);
1432 }
1433 
1434 int
1435 kphysm_del_status(
1436 	memhandle_t handle,
1437 	memdelstat_t *mdstp)
1438 {
1439 	struct mem_handle *mhp;
1440 
1441 	mhp = kphysm_lookup_mem_handle(handle);
1442 	if (mhp == NULL) {
1443 		return (KPHYSM_EHANDLE);
1444 	}
1445 	/*
1446 	 * Calling kphysm_del_status() is allowed before the delete
1447 	 * is started to allow for status display.
1448 	 */
1449 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1450 	    mhp->mh_state != MHND_RUNNING) {
1451 		mutex_exit(&mhp->mh_mutex);
1452 		return (KPHYSM_ENOTRUNNING);
1453 	}
1454 	mdstp->phys_pages = mhp->mh_phys_pages;
1455 	mdstp->managed = mhp->mh_vm_pages;
1456 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1457 	mutex_exit(&mhp->mh_mutex);
1458 	return (KPHYSM_OK);
1459 }
1460 
1461 static int mem_delete_additional_pages = 100;
1462 
1463 static int
1464 can_remove_pgs(pgcnt_t npgs)
1465 {
1466 	/*
1467 	 * If all pageable pages were paged out, freemem would
1468 	 * equal availrmem.  There is a minimum requirement for
1469 	 * availrmem.
1470 	 */
1471 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1472 	    < npgs)
1473 		return (0);
1474 	/* TODO: check swap space, etc. */
1475 	return (1);
1476 }
1477 
1478 static int
1479 get_availrmem(pgcnt_t npgs)
1480 {
1481 	int ret;
1482 
1483 	mutex_enter(&freemem_lock);
1484 	ret = can_remove_pgs(npgs);
1485 	if (ret != 0)
1486 		availrmem -= npgs;
1487 	mutex_exit(&freemem_lock);
1488 	return (ret);
1489 }
1490 
1491 static void
1492 put_availrmem(pgcnt_t npgs)
1493 {
1494 	mutex_enter(&freemem_lock);
1495 	availrmem += npgs;
1496 	mutex_exit(&freemem_lock);
1497 }
1498 
1499 #define	FREEMEM_INCR	100
1500 static pgcnt_t freemem_incr = FREEMEM_INCR;
1501 #define	DEL_FREE_WAIT_FRAC	4
1502 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1503 
1504 #define	DEL_BUSY_WAIT_FRAC	20
1505 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1506 
1507 static void kphysm_del_cleanup(struct mem_handle *);
1508 
1509 static void page_delete_collect(page_t *, struct mem_handle *);
1510 
1511 static pgcnt_t
1512 delthr_get_freemem(struct mem_handle *mhp)
1513 {
1514 	pgcnt_t free_get;
1515 	int ret;
1516 
1517 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1518 
1519 	MDSTAT_INCR(mhp, need_free);
1520 	/*
1521 	 * Get up to freemem_incr pages.
1522 	 */
1523 	free_get = freemem_incr;
1524 	if (free_get > mhp->mh_hold_todo)
1525 		free_get = mhp->mh_hold_todo;
1526 	/*
1527 	 * Take free_get pages away from freemem,
1528 	 * waiting if necessary.
1529 	 */
1530 
1531 	while (!mhp->mh_cancel) {
1532 		mutex_exit(&mhp->mh_mutex);
1533 		MDSTAT_INCR(mhp, free_loop);
1534 		/*
1535 		 * Duplicate test from page_create_throttle()
1536 		 * but don't override with !PG_WAIT.
1537 		 */
1538 		if (freemem < (free_get + throttlefree)) {
1539 			MDSTAT_INCR(mhp, free_low);
1540 			ret = 0;
1541 		} else {
1542 			ret = page_create_wait(free_get, 0);
1543 			if (ret == 0) {
1544 				/* EMPTY */
1545 				MDSTAT_INCR(mhp, free_failed);
1546 			}
1547 		}
1548 		if (ret != 0) {
1549 			mutex_enter(&mhp->mh_mutex);
1550 			return (free_get);
1551 		}
1552 
1553 		/*
1554 		 * Put pressure on pageout.
1555 		 */
1556 		page_needfree(free_get);
1557 		cv_signal(&proc_pageout->p_cv);
1558 
1559 		mutex_enter(&mhp->mh_mutex);
1560 		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
1561 		    (lbolt + DEL_FREE_WAIT_TICKS));
1562 		mutex_exit(&mhp->mh_mutex);
1563 		page_needfree(-(spgcnt_t)free_get);
1564 
1565 		mutex_enter(&mhp->mh_mutex);
1566 	}
1567 	return (0);
1568 }
1569 
1570 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
1571 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
1572 /*
1573  * This function is run as a helper thread for delete_memory_thread.
1574  * It is needed in order to force kaio cleanup, so that pages used in kaio
1575  * will be unlocked and subsequently relocated by delete_memory_thread.
1576  * The address of the delete_memory_threads's mem_handle is passed in to
1577  * this thread function, and is used to set the mh_aio_cleanup_done member
1578  * prior to calling thread_exit().
1579  */
1580 static void
1581 dr_aio_cleanup_thread(caddr_t amhp)
1582 {
1583 	proc_t *procp;
1584 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
1585 	int cleaned;
1586 	int n = 0;
1587 	struct mem_handle *mhp;
1588 	volatile uint_t *pcancel;
1589 
1590 	mhp = (struct mem_handle *)amhp;
1591 	ASSERT(mhp != NULL);
1592 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1593 	if (modload("sys", "kaio") == -1) {
1594 		mhp->mh_aio_cleanup_done = 1;
1595 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1596 		thread_exit();
1597 	}
1598 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1599 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1600 	if (aio_cleanup_dr_delete_memory == NULL) {
1601 		mhp->mh_aio_cleanup_done = 1;
1602 		cmn_err(CE_WARN,
1603 	    "aio_cleanup_dr_delete_memory not found in kaio");
1604 		thread_exit();
1605 	}
1606 	do {
1607 		cleaned = 0;
1608 		mutex_enter(&pidlock);
1609 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
1610 		    procp = procp->p_next) {
1611 			mutex_enter(&procp->p_lock);
1612 			if (procp->p_aio != NULL) {
1613 				/* cleanup proc's outstanding kaio */
1614 				cleaned +=
1615 				    (*aio_cleanup_dr_delete_memory)(procp);
1616 			}
1617 			mutex_exit(&procp->p_lock);
1618 		}
1619 		mutex_exit(&pidlock);
1620 		if ((*pcancel == 0) &&
1621 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1622 			/* delay a bit before retrying all procs again */
1623 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1624 			n = 0;
1625 		}
1626 	} while (*pcancel == 0);
1627 	mhp->mh_aio_cleanup_done = 1;
1628 	thread_exit();
1629 }
1630 
1631 static void
1632 delete_memory_thread(caddr_t amhp)
1633 {
1634 	struct mem_handle *mhp;
1635 	struct memdelspan *mdsp;
1636 	callb_cpr_t cprinfo;
1637 	page_t *pp_targ;
1638 	spgcnt_t freemem_left;
1639 	void (*del_complete_funcp)(void *, int error);
1640 	void *del_complete_arg;
1641 	int comp_code;
1642 	int ret;
1643 	int first_scan;
1644 	uint_t szc;
1645 #ifdef MEM_DEL_STATS
1646 	uint64_t start_total, ntick_total;
1647 	uint64_t start_pgrp, ntick_pgrp;
1648 #endif /* MEM_DEL_STATS */
1649 
1650 	mhp = (struct mem_handle *)amhp;
1651 
1652 #ifdef MEM_DEL_STATS
1653 	start_total = ddi_get_lbolt();
1654 #endif /* MEM_DEL_STATS */
1655 
1656 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1657 	    callb_generic_cpr, "memdel");
1658 
1659 	mutex_enter(&mhp->mh_mutex);
1660 	ASSERT(mhp->mh_state == MHND_STARTING);
1661 
1662 	mhp->mh_state = MHND_RUNNING;
1663 	mhp->mh_thread_id = curthread;
1664 
1665 	mhp->mh_hold_todo = mhp->mh_vm_pages;
1666 	mutex_exit(&mhp->mh_mutex);
1667 
1668 	/* Allocate the remap pages now, if necessary. */
1669 	memseg_remap_init();
1670 
1671 	/*
1672 	 * Subtract from availrmem now if possible as availrmem
1673 	 * may not be available by the end of the delete.
1674 	 */
1675 	if (!get_availrmem(mhp->mh_vm_pages)) {
1676 		comp_code = KPHYSM_ENOTVIABLE;
1677 		mutex_enter(&mhp->mh_mutex);
1678 		goto early_exit;
1679 	}
1680 
1681 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1682 
1683 	mutex_enter(&mhp->mh_mutex);
1684 
1685 	if (ret != 0) {
1686 		mhp->mh_cancel = KPHYSM_EREFUSED;
1687 		goto refused;
1688 	}
1689 
1690 	transit_list_collect(mhp, 1);
1691 
1692 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1693 	    mdsp = mdsp->mds_next) {
1694 		ASSERT(mdsp->mds_bitmap == NULL);
1695 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1696 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1697 		    KM_SLEEP);
1698 	}
1699 
1700 	first_scan = 1;
1701 	freemem_left = 0;
1702 	/*
1703 	 * Start dr_aio_cleanup_thread, which periodically iterates
1704 	 * through the process list and invokes aio cleanup.  This
1705 	 * is needed in order to avoid a deadly embrace between the
1706 	 * delete_memory_thread (waiting on writer lock for page, with the
1707 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
1708 	 * reader lock on the same page that is wanted by the
1709 	 * delete_memory_thread), and threads waiting for kaio completion
1710 	 * (blocked on spt_amp->lock).
1711 	 */
1712 	mhp->mh_dr_aio_cleanup_cancel = 0;
1713 	mhp->mh_aio_cleanup_done = 0;
1714 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1715 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1716 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1717 		pgcnt_t collected;
1718 
1719 		MDSTAT_INCR(mhp, nloop);
1720 		collected = 0;
1721 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1722 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1723 			pfn_t pfn, p_end;
1724 
1725 			if (first_scan) {
1726 				mem_node_pre_del_slice(mdsp->mds_base,
1727 				    mdsp->mds_base + mdsp->mds_npgs - 1);
1728 			}
1729 
1730 			p_end = mdsp->mds_base + mdsp->mds_npgs;
1731 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
1732 			    (mhp->mh_cancel == 0); pfn++) {
1733 				page_t *pp, *tpp, *tpp_targ;
1734 				pgcnt_t bit;
1735 				struct vnode *vp;
1736 				u_offset_t offset;
1737 				int mod, result;
1738 				spgcnt_t pgcnt;
1739 
1740 				bit = pfn - mdsp->mds_base;
1741 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
1742 				    (1 << (bit % NBPBMW))) != 0) {
1743 					MDSTAT_INCR(mhp, already_done);
1744 					continue;
1745 				}
1746 				if (freemem_left == 0) {
1747 					freemem_left += delthr_get_freemem(mhp);
1748 					if (freemem_left == 0)
1749 						break;
1750 				}
1751 
1752 				/*
1753 				 * Release mh_mutex - some of this
1754 				 * stuff takes some time (eg PUTPAGE).
1755 				 */
1756 
1757 				mutex_exit(&mhp->mh_mutex);
1758 				MDSTAT_INCR(mhp, ncheck);
1759 
1760 				pp = page_numtopp_nolock(pfn);
1761 				if (pp == NULL) {
1762 					/*
1763 					 * Not covered by a page_t - will
1764 					 * be dealt with elsewhere.
1765 					 */
1766 					MDSTAT_INCR(mhp, nopaget);
1767 					mutex_enter(&mhp->mh_mutex);
1768 					mdsp->mds_bitmap[bit / NBPBMW] |=
1769 					    (1 << (bit % NBPBMW));
1770 					continue;
1771 				}
1772 
1773 				if (!page_try_reclaim_lock(pp, SE_EXCL,
1774 				    SE_EXCL_WANTED | SE_RETIRED)) {
1775 					/*
1776 					 * Page in use elsewhere.  Skip it.
1777 					 */
1778 					MDSTAT_INCR(mhp, lockfail);
1779 					mutex_enter(&mhp->mh_mutex);
1780 					continue;
1781 				}
1782 				/*
1783 				 * See if the cage expanded into the delete.
1784 				 * This can happen as we have to allow the
1785 				 * cage to expand.
1786 				 */
1787 				if (PP_ISNORELOC(pp)) {
1788 					page_unlock(pp);
1789 					mutex_enter(&mhp->mh_mutex);
1790 					mhp->mh_cancel = KPHYSM_ENONRELOC;
1791 					break;
1792 				}
1793 				if (PP_RETIRED(pp)) {
1794 					/*
1795 					 * Page has been retired and is
1796 					 * not part of the cage so we
1797 					 * can now do the accounting for
1798 					 * it.
1799 					 */
1800 					MDSTAT_INCR(mhp, retired);
1801 					mutex_enter(&mhp->mh_mutex);
1802 					mdsp->mds_bitmap[bit / NBPBMW]
1803 					    |= (1 << (bit % NBPBMW));
1804 					mdsp->mds_bitmap_retired[bit /
1805 					    NBPBMW] |=
1806 					    (1 << (bit % NBPBMW));
1807 					mhp->mh_hold_todo--;
1808 					continue;
1809 				}
1810 				ASSERT(freemem_left != 0);
1811 				if (PP_ISFREE(pp)) {
1812 					/*
1813 					 * Like page_reclaim() only 'freemem'
1814 					 * processing is already done.
1815 					 */
1816 					MDSTAT_INCR(mhp, nfree);
1817 				free_page_collect:
1818 					if (PP_ISAGED(pp)) {
1819 						page_list_sub(pp,
1820 						    PG_FREE_LIST);
1821 					} else {
1822 						page_list_sub(pp,
1823 						    PG_CACHE_LIST);
1824 					}
1825 					PP_CLRFREE(pp);
1826 					PP_CLRAGED(pp);
1827 					collected++;
1828 					mutex_enter(&mhp->mh_mutex);
1829 					page_delete_collect(pp, mhp);
1830 					mdsp->mds_bitmap[bit / NBPBMW] |=
1831 					    (1 << (bit % NBPBMW));
1832 					freemem_left--;
1833 					continue;
1834 				}
1835 				ASSERT(pp->p_vnode != NULL);
1836 				if (first_scan) {
1837 					MDSTAT_INCR(mhp, first_notfree);
1838 					page_unlock(pp);
1839 					mutex_enter(&mhp->mh_mutex);
1840 					continue;
1841 				}
1842 				/*
1843 				 * Keep stats on pages encountered that
1844 				 * are marked for retirement.
1845 				 */
1846 				if (PP_TOXIC(pp)) {
1847 					MDSTAT_INCR(mhp, toxic);
1848 				} else if (PP_PR_REQ(pp)) {
1849 					MDSTAT_INCR(mhp, failing);
1850 				}
1851 				/*
1852 				 * In certain cases below, special exceptions
1853 				 * are made for pages that are toxic.  This
1854 				 * is because the current meaning of toxic
1855 				 * is that an uncorrectable error has been
1856 				 * previously associated with the page.
1857 				 */
1858 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1859 					if (!PP_TOXIC(pp)) {
1860 						/*
1861 						 * Must relocate locked in
1862 						 * memory pages.
1863 						 */
1864 #ifdef MEM_DEL_STATS
1865 						start_pgrp = ddi_get_lbolt();
1866 #endif /* MEM_DEL_STATS */
1867 						/*
1868 						 * Lock all constituent pages
1869 						 * of a large page to ensure
1870 						 * that p_szc won't change.
1871 						 */
1872 						if (!group_page_trylock(pp,
1873 						    SE_EXCL)) {
1874 							MDSTAT_INCR(mhp,
1875 							    gptllckfail);
1876 							page_unlock(pp);
1877 							mutex_enter(
1878 							    &mhp->mh_mutex);
1879 							continue;
1880 						}
1881 						MDSTAT_INCR(mhp, npplocked);
1882 						pp_targ =
1883 						    page_get_replacement_page(
1884 						    pp, NULL, 0);
1885 						if (pp_targ != NULL) {
1886 #ifdef MEM_DEL_STATS
1887 							ntick_pgrp =
1888 							    (uint64_t)
1889 							    ddi_get_lbolt() -
1890 							    start_pgrp;
1891 #endif /* MEM_DEL_STATS */
1892 							MDSTAT_PGRP(mhp,
1893 							    ntick_pgrp);
1894 							MDSTAT_INCR(mhp,
1895 							    nlockreloc);
1896 							goto reloc;
1897 						}
1898 						group_page_unlock(pp);
1899 						page_unlock(pp);
1900 #ifdef MEM_DEL_STATS
1901 						ntick_pgrp =
1902 						    (uint64_t)ddi_get_lbolt() -
1903 						    start_pgrp;
1904 #endif /* MEM_DEL_STATS */
1905 						MDSTAT_PGRP(mhp, ntick_pgrp);
1906 						MDSTAT_INCR(mhp, nnorepl);
1907 						mutex_enter(&mhp->mh_mutex);
1908 						continue;
1909 					} else {
1910 						/*
1911 						 * Cannot do anything about
1912 						 * this page because it is
1913 						 * toxic.
1914 						 */
1915 						MDSTAT_INCR(mhp, npplkdtoxic);
1916 						page_unlock(pp);
1917 						mutex_enter(&mhp->mh_mutex);
1918 						continue;
1919 					}
1920 				}
1921 				/*
1922 				 * Unload the mappings and check if mod bit
1923 				 * is set.
1924 				 */
1925 				ASSERT(!PP_ISKAS(pp));
1926 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1927 				mod = hat_ismod(pp);
1928 
1929 #ifdef MEM_DEL_STATS
1930 				start_pgrp = ddi_get_lbolt();
1931 #endif /* MEM_DEL_STATS */
1932 				if (mod && !PP_TOXIC(pp)) {
1933 					/*
1934 					 * Lock all constituent pages
1935 					 * of a large page to ensure
1936 					 * that p_szc won't change.
1937 					 */
1938 					if (!group_page_trylock(pp, SE_EXCL)) {
1939 						MDSTAT_INCR(mhp, gptlmodfail);
1940 						page_unlock(pp);
1941 						mutex_enter(&mhp->mh_mutex);
1942 						continue;
1943 					}
1944 					pp_targ = page_get_replacement_page(pp,
1945 					    NULL, 0);
1946 					if (pp_targ != NULL) {
1947 						MDSTAT_INCR(mhp, nmodreloc);
1948 #ifdef MEM_DEL_STATS
1949 						ntick_pgrp =
1950 						    (uint64_t)ddi_get_lbolt() -
1951 						    start_pgrp;
1952 #endif /* MEM_DEL_STATS */
1953 						MDSTAT_PGRP(mhp, ntick_pgrp);
1954 						goto reloc;
1955 					}
1956 					group_page_unlock(pp);
1957 				}
1958 
1959 				if (!page_try_demote_pages(pp)) {
1960 					MDSTAT_INCR(mhp, demotefail);
1961 					page_unlock(pp);
1962 #ifdef MEM_DEL_STATS
1963 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1964 					    start_pgrp;
1965 #endif /* MEM_DEL_STATS */
1966 					MDSTAT_PGRP(mhp, ntick_pgrp);
1967 					mutex_enter(&mhp->mh_mutex);
1968 					continue;
1969 				}
1970 
1971 				/*
1972 				 * Regular 'page-out'.
1973 				 */
1974 				if (!mod) {
1975 					MDSTAT_INCR(mhp, ndestroy);
1976 					page_destroy(pp, 1);
1977 					/*
1978 					 * page_destroy was called with
1979 					 * dontfree. As long as p_lckcnt
1980 					 * and p_cowcnt are both zero, the
1981 					 * only additional action of
1982 					 * page_destroy with !dontfree is to
1983 					 * call page_free, so we can collect
1984 					 * the page here.
1985 					 */
1986 					collected++;
1987 #ifdef MEM_DEL_STATS
1988 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
1989 					    start_pgrp;
1990 #endif /* MEM_DEL_STATS */
1991 					MDSTAT_PGRP(mhp, ntick_pgrp);
1992 					mutex_enter(&mhp->mh_mutex);
1993 					page_delete_collect(pp, mhp);
1994 					mdsp->mds_bitmap[bit / NBPBMW] |=
1995 					    (1 << (bit % NBPBMW));
1996 					continue;
1997 				}
1998 				/*
1999 				 * The page is toxic and the mod bit is
2000 				 * set, we cannot do anything here to deal
2001 				 * with it.
2002 				 */
2003 				if (PP_TOXIC(pp)) {
2004 					page_unlock(pp);
2005 #ifdef MEM_DEL_STATS
2006 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2007 					    start_pgrp;
2008 #endif /* MEM_DEL_STATS */
2009 					MDSTAT_PGRP(mhp, ntick_pgrp);
2010 					MDSTAT_INCR(mhp, modtoxic);
2011 					mutex_enter(&mhp->mh_mutex);
2012 					continue;
2013 				}
2014 				MDSTAT_INCR(mhp, nputpage);
2015 				vp = pp->p_vnode;
2016 				offset = pp->p_offset;
2017 				VN_HOLD(vp);
2018 				page_unlock(pp);
2019 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2020 				    B_INVAL|B_FORCE, kcred, NULL);
2021 				VN_RELE(vp);
2022 #ifdef MEM_DEL_STATS
2023 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2024 				    start_pgrp;
2025 #endif /* MEM_DEL_STATS */
2026 				MDSTAT_PGRP(mhp, ntick_pgrp);
2027 				/*
2028 				 * Try to get the page back immediately
2029 				 * so that it can be collected.
2030 				 */
2031 				pp = page_numtopp_nolock(pfn);
2032 				if (pp == NULL) {
2033 					MDSTAT_INCR(mhp, nnoreclaim);
2034 					/*
2035 					 * This should not happen as this
2036 					 * thread is deleting the page.
2037 					 * If this code is generalized, this
2038 					 * becomes a reality.
2039 					 */
2040 #ifdef DEBUG
2041 					cmn_err(CE_WARN,
2042 					    "delete_memory_thread(0x%p) "
2043 					    "pfn 0x%lx has no page_t",
2044 					    (void *)mhp, pfn);
2045 #endif /* DEBUG */
2046 					mutex_enter(&mhp->mh_mutex);
2047 					continue;
2048 				}
2049 				if (page_try_reclaim_lock(pp, SE_EXCL,
2050 				    SE_EXCL_WANTED | SE_RETIRED)) {
2051 					if (PP_ISFREE(pp)) {
2052 						goto free_page_collect;
2053 					}
2054 					page_unlock(pp);
2055 				}
2056 				MDSTAT_INCR(mhp, nnoreclaim);
2057 				mutex_enter(&mhp->mh_mutex);
2058 				continue;
2059 
2060 			reloc:
2061 				/*
2062 				 * Got some freemem and a target
2063 				 * page, so move the data to avoid
2064 				 * I/O and lock problems.
2065 				 */
2066 				ASSERT(!page_iolock_assert(pp));
2067 				MDSTAT_INCR(mhp, nreloc);
2068 				/*
2069 				 * page_relocate() will return pgcnt: the
2070 				 * number of consecutive pages relocated.
2071 				 * If it is successful, pp will be a
2072 				 * linked list of the page structs that
2073 				 * were relocated. If page_relocate() is
2074 				 * unsuccessful, pp will be unmodified.
2075 				 */
2076 #ifdef MEM_DEL_STATS
2077 				start_pgrp = ddi_get_lbolt();
2078 #endif /* MEM_DEL_STATS */
2079 				result = page_relocate(&pp, &pp_targ, 0, 0,
2080 				    &pgcnt, NULL);
2081 #ifdef MEM_DEL_STATS
2082 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2083 				    start_pgrp;
2084 #endif /* MEM_DEL_STATS */
2085 				MDSTAT_PGRP(mhp, ntick_pgrp);
2086 				if (result != 0) {
2087 					MDSTAT_INCR(mhp, nrelocfail);
2088 					/*
2089 					 * We did not succeed. We need
2090 					 * to give the pp_targ pages back.
2091 					 * page_free(pp_targ, 1) without
2092 					 * the freemem accounting.
2093 					 */
2094 					group_page_unlock(pp);
2095 					page_free_replacement_page(pp_targ);
2096 					page_unlock(pp);
2097 					mutex_enter(&mhp->mh_mutex);
2098 					continue;
2099 				}
2100 
2101 				/*
2102 				 * We will then collect pgcnt pages.
2103 				 */
2104 				ASSERT(pgcnt > 0);
2105 				mutex_enter(&mhp->mh_mutex);
2106 				/*
2107 				 * We need to make sure freemem_left is
2108 				 * large enough.
2109 				 */
2110 				while ((freemem_left < pgcnt) &&
2111 				    (!mhp->mh_cancel)) {
2112 					freemem_left +=
2113 					    delthr_get_freemem(mhp);
2114 				}
2115 
2116 				/*
2117 				 * Do not proceed if mh_cancel is set.
2118 				 */
2119 				if (mhp->mh_cancel) {
2120 					while (pp_targ != NULL) {
2121 						/*
2122 						 * Unlink and unlock each page.
2123 						 */
2124 						tpp_targ = pp_targ;
2125 						page_sub(&pp_targ, tpp_targ);
2126 						page_unlock(tpp_targ);
2127 					}
2128 					/*
2129 					 * We need to give the pp pages back.
2130 					 * page_free(pp, 1) without the
2131 					 * freemem accounting.
2132 					 */
2133 					page_free_replacement_page(pp);
2134 					break;
2135 				}
2136 
2137 				/* Now remove pgcnt from freemem_left */
2138 				freemem_left -= pgcnt;
2139 				ASSERT(freemem_left >= 0);
2140 				szc = pp->p_szc;
2141 				while (pp != NULL) {
2142 					/*
2143 					 * pp and pp_targ were passed back as
2144 					 * a linked list of pages.
2145 					 * Unlink and unlock each page.
2146 					 */
2147 					tpp_targ = pp_targ;
2148 					page_sub(&pp_targ, tpp_targ);
2149 					page_unlock(tpp_targ);
2150 					/*
2151 					 * The original page is now free
2152 					 * so remove it from the linked
2153 					 * list and collect it.
2154 					 */
2155 					tpp = pp;
2156 					page_sub(&pp, tpp);
2157 					pfn = page_pptonum(tpp);
2158 					collected++;
2159 					ASSERT(PAGE_EXCL(tpp));
2160 					ASSERT(tpp->p_vnode == NULL);
2161 					ASSERT(!hat_page_is_mapped(tpp));
2162 					ASSERT(tpp->p_szc == szc);
2163 					tpp->p_szc = 0;
2164 					page_delete_collect(tpp, mhp);
2165 					bit = pfn - mdsp->mds_base;
2166 					mdsp->mds_bitmap[bit / NBPBMW] |=
2167 					    (1 << (bit % NBPBMW));
2168 				}
2169 				ASSERT(pp_targ == NULL);
2170 			}
2171 		}
2172 		first_scan = 0;
2173 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2174 		    (collected == 0)) {
2175 			/*
2176 			 * This code is needed as we cannot wait
2177 			 * for a page to be locked OR the delete to
2178 			 * be cancelled.  Also, we must delay so
2179 			 * that other threads get a chance to run
2180 			 * on our cpu, otherwise page locks may be
2181 			 * held indefinitely by those threads.
2182 			 */
2183 			MDSTAT_INCR(mhp, ndelay);
2184 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2185 			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
2186 			    (lbolt + DEL_BUSY_WAIT_TICKS));
2187 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2188 		}
2189 	}
2190 	/* stop the dr aio cleanup thread */
2191 	mhp->mh_dr_aio_cleanup_cancel = 1;
2192 	transit_list_collect(mhp, 0);
2193 	if (freemem_left != 0) {
2194 		/* Return any surplus. */
2195 		page_create_putback(freemem_left);
2196 		freemem_left = 0;
2197 	}
2198 #ifdef MEM_DEL_STATS
2199 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2200 #endif /* MEM_DEL_STATS */
2201 	MDSTAT_TOTAL(mhp, ntick_total);
2202 	MDSTAT_PRINT(mhp);
2203 
2204 	/*
2205 	 * If the memory delete was cancelled, exclusive-wanted bits must
2206 	 * be cleared. If there are retired pages being deleted, they need
2207 	 * to be unretired.
2208 	 */
2209 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2210 	    mdsp = mdsp->mds_next) {
2211 		pfn_t pfn, p_end;
2212 
2213 		p_end = mdsp->mds_base + mdsp->mds_npgs;
2214 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2215 			page_t *pp;
2216 			pgcnt_t bit;
2217 
2218 			bit = pfn - mdsp->mds_base;
2219 			if (mhp->mh_cancel) {
2220 				pp = page_numtopp_nolock(pfn);
2221 				if (pp != NULL) {
2222 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
2223 					    (1 << (bit % NBPBMW))) == 0) {
2224 						page_lock_clr_exclwanted(pp);
2225 					}
2226 				}
2227 			} else {
2228 				pp = NULL;
2229 			}
2230 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2231 			    (1 << (bit % NBPBMW))) != 0) {
2232 				/* do we already have pp? */
2233 				if (pp == NULL) {
2234 					pp = page_numtopp_nolock(pfn);
2235 				}
2236 				ASSERT(pp != NULL);
2237 				ASSERT(PP_RETIRED(pp));
2238 				if (mhp->mh_cancel != 0) {
2239 					page_unlock(pp);
2240 					/*
2241 					 * To satisfy ASSERT below in
2242 					 * cancel code.
2243 					 */
2244 					mhp->mh_hold_todo++;
2245 				} else {
2246 					(void) page_unretire_pp(pp,
2247 					    PR_UNR_CLEAN);
2248 				}
2249 			}
2250 		}
2251 	}
2252 	/*
2253 	 * Free retired page bitmap and collected page bitmap
2254 	 */
2255 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2256 	    mdsp = mdsp->mds_next) {
2257 		ASSERT(mdsp->mds_bitmap_retired != NULL);
2258 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2259 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
2260 		ASSERT(mdsp->mds_bitmap != NULL);
2261 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2262 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
2263 	}
2264 
2265 	/* wait for our dr aio cancel thread to exit */
2266 	while (!(mhp->mh_aio_cleanup_done)) {
2267 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2268 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2269 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2270 	}
2271 refused:
2272 	if (mhp->mh_cancel != 0) {
2273 		page_t *pp;
2274 
2275 		comp_code = mhp->mh_cancel;
2276 		/*
2277 		 * Go through list of deleted pages (mh_deleted) freeing
2278 		 * them.
2279 		 */
2280 		while ((pp = mhp->mh_deleted) != NULL) {
2281 			mhp->mh_deleted = pp->p_next;
2282 			mhp->mh_hold_todo++;
2283 			mutex_exit(&mhp->mh_mutex);
2284 			/* Restore p_next. */
2285 			pp->p_next = pp->p_prev;
2286 			if (PP_ISFREE(pp)) {
2287 				cmn_err(CE_PANIC,
2288 				    "page %p is free",
2289 				    (void *)pp);
2290 			}
2291 			page_free(pp, 1);
2292 			mutex_enter(&mhp->mh_mutex);
2293 		}
2294 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2295 
2296 		mutex_exit(&mhp->mh_mutex);
2297 		put_availrmem(mhp->mh_vm_pages);
2298 		mutex_enter(&mhp->mh_mutex);
2299 
2300 		goto t_exit;
2301 	}
2302 
2303 	/*
2304 	 * All the pages are no longer in use and are exclusively locked.
2305 	 */
2306 
2307 	mhp->mh_deleted = NULL;
2308 
2309 	kphysm_del_cleanup(mhp);
2310 
2311 	/*
2312 	 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so
2313 	 * that the mem_node_config[] will remain intact for the cleanup.
2314 	 */
2315 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2316 	    mdsp = mdsp->mds_next) {
2317 		mem_node_post_del_slice(mdsp->mds_base,
2318 		    mdsp->mds_base + mdsp->mds_npgs - 1, 0);
2319 	}
2320 
2321 	comp_code = KPHYSM_OK;
2322 
2323 t_exit:
2324 	mutex_exit(&mhp->mh_mutex);
2325 	kphysm_setup_post_del(mhp->mh_vm_pages,
2326 	    (comp_code == KPHYSM_OK) ? 0 : 1);
2327 	mutex_enter(&mhp->mh_mutex);
2328 
2329 early_exit:
2330 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2331 	mhp->mh_state = MHND_DONE;
2332 	del_complete_funcp = mhp->mh_delete_complete;
2333 	del_complete_arg = mhp->mh_delete_complete_arg;
2334 	CALLB_CPR_EXIT(&cprinfo);
2335 	(*del_complete_funcp)(del_complete_arg, comp_code);
2336 	thread_exit();
2337 	/*NOTREACHED*/
2338 }
2339 
2340 /*
2341  * Start the delete of the memory from the system.
2342  */
2343 int
2344 kphysm_del_start(
2345 	memhandle_t handle,
2346 	void (*complete)(void *, int),
2347 	void *complete_arg)
2348 {
2349 	struct mem_handle *mhp;
2350 
2351 	mhp = kphysm_lookup_mem_handle(handle);
2352 	if (mhp == NULL) {
2353 		return (KPHYSM_EHANDLE);
2354 	}
2355 	switch (mhp->mh_state) {
2356 	case MHND_FREE:
2357 		ASSERT(mhp->mh_state != MHND_FREE);
2358 		mutex_exit(&mhp->mh_mutex);
2359 		return (KPHYSM_EHANDLE);
2360 	case MHND_INIT:
2361 		break;
2362 	case MHND_STARTING:
2363 	case MHND_RUNNING:
2364 		mutex_exit(&mhp->mh_mutex);
2365 		return (KPHYSM_ESEQUENCE);
2366 	case MHND_DONE:
2367 		mutex_exit(&mhp->mh_mutex);
2368 		return (KPHYSM_ESEQUENCE);
2369 	case MHND_RELEASE:
2370 		mutex_exit(&mhp->mh_mutex);
2371 		return (KPHYSM_ESEQUENCE);
2372 	default:
2373 #ifdef DEBUG
2374 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2375 		    (void *)mhp, mhp->mh_state);
2376 #endif /* DEBUG */
2377 		mutex_exit(&mhp->mh_mutex);
2378 		return (KPHYSM_EHANDLE);
2379 	}
2380 
2381 	if (mhp->mh_transit.trl_spans == NULL) {
2382 		mutex_exit(&mhp->mh_mutex);
2383 		return (KPHYSM_ENOWORK);
2384 	}
2385 
2386 	ASSERT(complete != NULL);
2387 	mhp->mh_delete_complete = complete;
2388 	mhp->mh_delete_complete_arg = complete_arg;
2389 	mhp->mh_state = MHND_STARTING;
2390 	/*
2391 	 * Release the mutex in case thread_create sleeps.
2392 	 */
2393 	mutex_exit(&mhp->mh_mutex);
2394 
2395 	/*
2396 	 * The "obvious" process for this thread is pageout (proc_pageout)
2397 	 * but this gives the thread too much power over freemem
2398 	 * which results in freemem starvation.
2399 	 */
2400 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2401 	    TS_RUN, maxclsyspri - 1);
2402 
2403 	return (KPHYSM_OK);
2404 }
2405 
/*
 * State for the shared dummy page_t area that memseg_remap_init() sets up
 * and memseg_remap_to_dummy() maps over the page_t metadata of deleted
 * dynamic memsegs.
 */
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
/* Base address of the dummy area; non-NULL doubles as the "init done" flag. */
static caddr_t pp_dummy;
/* Number of pages in the dummy area. */
static pgcnt_t pp_dummy_npages;
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
2410 
2411 static void
2412 memseg_remap_init_pages(page_t *pages, page_t *epages)
2413 {
2414 	page_t *pp;
2415 
2416 	for (pp = pages; pp < epages; pp++) {
2417 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2418 		pp->p_offset = (u_offset_t)-1;
2419 		page_iolock_init(pp);
2420 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2421 			continue;
2422 		page_lock_delete(pp);
2423 	}
2424 }
2425 
2426 void
2427 memseg_remap_init()
2428 {
2429 	mutex_enter(&pp_dummy_lock);
2430 	if (pp_dummy == NULL) {
2431 		uint_t dpages;
2432 		int i;
2433 
2434 		/*
2435 		 * dpages starts off as the size of the structure and
2436 		 * ends up as the minimum number of pages that will
2437 		 * hold a whole number of page_t structures.
2438 		 */
2439 		dpages = sizeof (page_t);
2440 		ASSERT(dpages != 0);
2441 		ASSERT(dpages <= MMU_PAGESIZE);
2442 
2443 		while ((dpages & 1) == 0)
2444 			dpages >>= 1;
2445 
2446 		pp_dummy_npages = dpages;
2447 		/*
2448 		 * Allocate pp_dummy pages directly from static_arena,
2449 		 * since these are whole page allocations and are
2450 		 * referenced by physical address.  This also has the
2451 		 * nice fringe benefit of hiding the memory from
2452 		 * ::findleaks since it doesn't deal well with allocated
2453 		 * kernel heap memory that doesn't have any mappings.
2454 		 */
2455 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2456 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2457 		bzero(pp_dummy, ptob(pp_dummy_npages));
2458 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2459 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2460 		    pp_dummy_npages, KM_SLEEP);
2461 		for (i = 0; i < pp_dummy_npages; i++) {
2462 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2463 			    &pp_dummy[MMU_PAGESIZE * i]);
2464 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2465 		}
2466 		/*
2467 		 * Initialize the page_t's to a known 'deleted' state
2468 		 * that matches the state of deleted pages.
2469 		 */
2470 		memseg_remap_init_pages((page_t *)pp_dummy,
2471 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2472 		/* Remove kmem mappings for the pages for safety. */
2473 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2474 		    HAT_UNLOAD_UNLOCK);
2475 		/* Leave pp_dummy pointer set as flag that init is done. */
2476 	}
2477 	mutex_exit(&pp_dummy_lock);
2478 }
2479 
2480 static void
2481 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
2482 {
2483 	ASSERT(pp_dummy != NULL);
2484 
2485 	while (metapgs != 0) {
2486 		pgcnt_t n;
2487 		int i;
2488 
2489 		n = pp_dummy_npages;
2490 		if (n > metapgs)
2491 			n = metapgs;
2492 		for (i = 0; i < n; i++) {
2493 			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
2494 			    PROT_READ,
2495 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
2496 			    HAT_LOAD_REMAP);
2497 			pp += ptob(1);
2498 		}
2499 		metapgs -= n;
2500 	}
2501 }
2502 
2503 /*
2504  * Transition all the deleted pages to the deleted state so that
2505  * page_lock will not wait. The page_lock_delete call will
2506  * also wake up any waiters.
2507  */
2508 static void
2509 memseg_lock_delete_all(struct memseg *seg)
2510 {
2511 	page_t *pp;
2512 
2513 	for (pp = seg->pages; pp < seg->epages; pp++) {
2514 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
2515 		page_lock_delete(pp);
2516 	}
2517 }
2518 
/*
 * Final bookkeeping for a successful delete: take the memsegs covered by
 * the handle's spans out of the global memsegs list, retire their page_t
 * metadata, remove the ranges from phys_avail/phys_install and shrink the
 * global memory counters.
 *
 * Called with mhp->mh_mutex held. The mutex is dropped once the memsegs
 * have been unlinked and is re-acquired just before returning.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan	*mdsp;
	struct memseg		*seg;
	struct memseg   	**segpp;
	struct memseg		*seglist;
	pfn_t			p_end;
	uint64_t		avmem;
	pgcnt_t			avpgs;
	pgcnt_t			npgs;

	/* Sampled up front; used for the counter adjustments at the end. */
	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * remove from main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * segpp only advances for memsegs that are kept; after an
		 * unlink it already refers to the following memseg.
		 */
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			/* An overlapping memseg must lie wholly in the span. */
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			/*
			 * NOTE(review): the delta is base - end, presumably
			 * meant as a negative page count -- confirm against
			 * the PLCNT_MODIFY_MAX definition.
			 */
			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			/* Chain the unlinked memsegs privately through lnext. */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	/* The rest runs without the handle mutex; re-taken before return. */
	mutex_exit(&mhp->mh_mutex);

	/* Retire each unlinked memseg in turn. */
	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		page_t *pp;
		pgcnt_t metapgs;
		int dynamic;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect up information based on pages_base and pages_end
		 * early so that we can flag early that the memseg has been
		 * deleted by setting pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		dynamic = memseg_is_dynamic(seg, &mseg_start);

		seg->pages_end = seg->pages_base;

		if (dynamic) {
			pp = seg->pages;
			/* Pages from mseg_start up to mseg_base hold metadata. */
			metapgs = mseg_base - mseg_start;
			ASSERT(metapgs != 0);

			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy((caddr_t)pp, metapgs);

			/* Park the memseg on the memseg_va_avail list. */
			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * Set for clean-up below.
			 */
			mseg_start = seg->pages_base;
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		/* phys_avail loses only the pages proper ... */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		/* ... while phys_install loses everything from mseg_start. */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	/* Refresh physmax and physinstalled from the updated phys_install. */
	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	mutex_enter(&mhp->mh_mutex);
}
2700 
2701 static uint_t mdel_nullvp_waiter;
2702 
2703 static void
2704 page_delete_collect(
2705 	page_t *pp,
2706 	struct mem_handle *mhp)
2707 {
2708 	if (pp->p_vnode) {
2709 		page_hashout(pp, (kmutex_t *)NULL);
2710 		/* do not do PP_SETAGED(pp); */
2711 	} else {
2712 		kmutex_t *sep;
2713 
2714 		sep = page_se_mutex(pp);
2715 		mutex_enter(sep);
2716 		if (CV_HAS_WAITERS(&pp->p_cv)) {
2717 			mdel_nullvp_waiter++;
2718 			cv_broadcast(&pp->p_cv);
2719 		}
2720 		mutex_exit(sep);
2721 	}
2722 	ASSERT(pp->p_next == pp->p_prev);
2723 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
2724 	pp->p_next = mhp->mh_deleted;
2725 	mhp->mh_deleted = pp;
2726 	ASSERT(mhp->mh_hold_todo != 0);
2727 	mhp->mh_hold_todo--;
2728 }
2729 
2730 static void
2731 transit_list_collect(struct mem_handle *mhp, int v)
2732 {
2733 	struct transit_list_head *trh;
2734 
2735 	trh = &transit_list_head;
2736 	mutex_enter(&trh->trh_lock);
2737 	mhp->mh_transit.trl_collect = v;
2738 	mutex_exit(&trh->trh_lock);
2739 }
2740 
2741 static void
2742 transit_list_insert(struct transit_list *tlp)
2743 {
2744 	struct transit_list_head *trh;
2745 
2746 	trh = &transit_list_head;
2747 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2748 	tlp->trl_next = trh->trh_head;
2749 	trh->trh_head = tlp;
2750 }
2751 
2752 static void
2753 transit_list_remove(struct transit_list *tlp)
2754 {
2755 	struct transit_list_head *trh;
2756 	struct transit_list **tlpp;
2757 
2758 	trh = &transit_list_head;
2759 	tlpp = &trh->trh_head;
2760 	ASSERT(MUTEX_HELD(&trh->trh_lock));
2761 	while (*tlpp != NULL && *tlpp != tlp)
2762 		tlpp = &(*tlpp)->trl_next;
2763 	ASSERT(*tlpp != NULL);
2764 	if (*tlpp == tlp)
2765 		*tlpp = tlp->trl_next;
2766 	tlp->trl_next = NULL;
2767 }
2768 
2769 static struct transit_list *
2770 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2771 {
2772 	struct transit_list *tlp;
2773 
2774 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2775 		struct memdelspan *mdsp;
2776 
2777 		for (mdsp = tlp->trl_spans; mdsp != NULL;
2778 		    mdsp = mdsp->mds_next) {
2779 			if (pfnum >= mdsp->mds_base &&
2780 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2781 				return (tlp);
2782 			}
2783 		}
2784 	}
2785 	return (NULL);
2786 }
2787 
2788 int
2789 pfn_is_being_deleted(pfn_t pfnum)
2790 {
2791 	struct transit_list_head *trh;
2792 	struct transit_list *tlp;
2793 	int ret;
2794 
2795 	trh = &transit_list_head;
2796 	if (trh->trh_head == NULL)
2797 		return (0);
2798 
2799 	mutex_enter(&trh->trh_lock);
2800 	tlp = pfnum_to_transit_list(trh, pfnum);
2801 	ret = (tlp != NULL && tlp->trl_collect);
2802 	mutex_exit(&trh->trh_lock);
2803 
2804 	return (ret);
2805 }
2806 
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat. Nothing is
 * printed unless mem_del_stat_print is non-zero. The two tick totals
 * are also shown converted to minutes and seconds using hz.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	/* The header line identifies the delete by its first span. */
	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	secs = mhp->mh_delstat.nticks_total / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2860 
/*
 * One registered set of memory configuration callbacks: the client's
 * vector of entry points and the argument handed back on every call.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;	/* NULL marks an unused slot */
	void			*arg;	/* first argument to each callback */
};

/* Capacity of the mem_callbacks[] table. */
#define	NMEMCALLBACKS		100

/* Registration table; only slots below nmemcallbacks are ever scanned. */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
/* Number of leading slots of mem_callbacks[] that may be in use. */
static uint_t nmemcallbacks;
/*
 * Taken as writer to register/unregister and as reader while the
 * callbacks are being invoked.
 */
static krwlock_t mem_callback_rwlock;
2871 
2872 int
2873 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2874 {
2875 	uint_t i, found;
2876 
2877 	/*
2878 	 * This test will become more complicated when the version must
2879 	 * change.
2880 	 */
2881 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2882 		return (EINVAL);
2883 
2884 	if (vec->post_add == NULL || vec->pre_del == NULL ||
2885 	    vec->post_del == NULL)
2886 		return (EINVAL);
2887 
2888 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2889 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
2890 		if (mem_callbacks[i].vec == NULL && found == 0)
2891 			found = i + 1;
2892 		if (mem_callbacks[i].vec == vec &&
2893 		    mem_callbacks[i].arg == arg) {
2894 #ifdef DEBUG
2895 			/* Catch this in DEBUG kernels. */
2896 			cmn_err(CE_WARN, "kphysm_setup_func_register"
2897 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
2898 			    (void *)vec, arg, (void *)caller());
2899 #endif /* DEBUG */
2900 			rw_exit(&mem_callback_rwlock);
2901 			return (EEXIST);
2902 		}
2903 	}
2904 	if (found != 0) {
2905 		i = found - 1;
2906 	} else {
2907 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
2908 		if (nmemcallbacks == NMEMCALLBACKS) {
2909 			rw_exit(&mem_callback_rwlock);
2910 			return (ENOMEM);
2911 		}
2912 		i = nmemcallbacks++;
2913 	}
2914 	mem_callbacks[i].vec = vec;
2915 	mem_callbacks[i].arg = arg;
2916 	rw_exit(&mem_callback_rwlock);
2917 	return (0);
2918 }
2919 
2920 void
2921 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
2922 {
2923 	uint_t i;
2924 
2925 	rw_enter(&mem_callback_rwlock, RW_WRITER);
2926 	for (i = 0; i < nmemcallbacks; i++) {
2927 		if (mem_callbacks[i].vec == vec &&
2928 		    mem_callbacks[i].arg == arg) {
2929 			mem_callbacks[i].vec = NULL;
2930 			mem_callbacks[i].arg = NULL;
2931 			if (i == (nmemcallbacks - 1))
2932 				nmemcallbacks--;
2933 			break;
2934 		}
2935 	}
2936 	rw_exit(&mem_callback_rwlock);
2937 }
2938 
2939 static void
2940 kphysm_setup_post_add(pgcnt_t delta_pages)
2941 {
2942 	uint_t i;
2943 
2944 	rw_enter(&mem_callback_rwlock, RW_READER);
2945 	for (i = 0; i < nmemcallbacks; i++) {
2946 		if (mem_callbacks[i].vec != NULL) {
2947 			(*mem_callbacks[i].vec->post_add)
2948 			    (mem_callbacks[i].arg, delta_pages);
2949 		}
2950 	}
2951 	rw_exit(&mem_callback_rwlock);
2952 }
2953 
2954 /*
2955  * Note the locking between pre_del and post_del: The reader lock is held
2956  * between the two calls to stop the set of functions from changing.
2957  */
2958 
2959 static int
2960 kphysm_setup_pre_del(pgcnt_t delta_pages)
2961 {
2962 	uint_t i;
2963 	int ret;
2964 	int aret;
2965 
2966 	ret = 0;
2967 	rw_enter(&mem_callback_rwlock, RW_READER);
2968 	for (i = 0; i < nmemcallbacks; i++) {
2969 		if (mem_callbacks[i].vec != NULL) {
2970 			aret = (*mem_callbacks[i].vec->pre_del)
2971 			    (mem_callbacks[i].arg, delta_pages);
2972 			ret |= aret;
2973 		}
2974 	}
2975 
2976 	return (ret);
2977 }
2978 
2979 static void
2980 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
2981 {
2982 	uint_t i;
2983 
2984 	for (i = 0; i < nmemcallbacks; i++) {
2985 		if (mem_callbacks[i].vec != NULL) {
2986 			(*mem_callbacks[i].vec->post_del)
2987 			    (mem_callbacks[i].arg, delta_pages, cancelled);
2988 		}
2989 	}
2990 	rw_exit(&mem_callback_rwlock);
2991 }
2992 
/*
 * Split the boot-time memseg containing [base, base + npgs) so that the
 * range ends up described by a memseg of its own, with separate memsegs
 * for whatever lies below and above it.
 *
 * Returns 1 when the split was made. Returns 0, leaving the memsegs list
 * unchanged, when no memseg contains 'base', the memseg is dynamic, the
 * range runs past the end of the memseg, or the range already covers the
 * whole memseg.
 */
static int
kphysm_split_memseg(
	pfn_t base,
	pgcnt_t npgs)
{
	struct memseg *seg;
	struct memseg **segpp;
	pgcnt_t size_low, size_high;
	struct memseg *seg_low, *seg_mid, *seg_high;

	/*
	 * Lock the memsegs list against other updates now
	 */
	memsegs_lock(1);

	/*
	 * Find boot time memseg that wholly covers this area.
	 */

	/* First find the memseg with page 'base' in it. */
	for (segpp = &memsegs; (seg = *segpp) != NULL;
	    segpp = &((*segpp)->next)) {
		if (base >= seg->pages_base && base < seg->pages_end)
			break;
	}
	if (seg == NULL) {
		memsegs_unlock(1);
		return (0);
	}
	/* Only boot-time (non-dynamic) memsegs are split. */
	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
		memsegs_unlock(1);
		return (0);
	}
	/* The whole range must fit inside this one memseg. */
	if ((base + npgs) > seg->pages_end) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Work out the size of the two segments that will
	 * surround the new segment, one for low address
	 * and one for high.
	 */
	ASSERT(base >= seg->pages_base);
	size_low = base - seg->pages_base;
	ASSERT(seg->pages_end >= (base + npgs));
	size_high = seg->pages_end - (base + npgs);

	/*
	 * Sanity check.
	 */
	/* Nothing on either side: the range is the memseg, no split needed. */
	if ((size_low + size_high) == 0) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Allocate the new structures. The old memseg will not be freed
	 * as there may be a reference to it.
	 */
	seg_low = NULL;
	seg_high = NULL;

	if (size_low != 0) {
		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_low, sizeof (struct memseg));
	}

	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
	bzero(seg_mid, sizeof (struct memseg));

	if (size_high != 0) {
		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_high, sizeof (struct memseg));
	}

	/*
	 * All allocation done now.
	 */
	/* The new memsegs share the original page_t array; none is copied. */
	if (size_low != 0) {
		seg_low->pages = seg->pages;
		seg_low->epages = seg_low->pages + size_low;
		seg_low->pages_base = seg->pages_base;
		seg_low->pages_end = seg_low->pages_base + size_low;
		seg_low->next = seg_mid;
	}
	if (size_high != 0) {
		seg_high->pages = seg->epages - size_high;
		seg_high->epages = seg_high->pages + size_high;
		seg_high->pages_base = seg->pages_end - size_high;
		seg_high->pages_end = seg_high->pages_base + size_high;
		seg_high->next = seg->next;
	}

	seg_mid->pages = seg->pages + size_low;
	seg_mid->pages_base = seg->pages_base + size_low;
	seg_mid->epages = seg->epages - size_high;
	seg_mid->pages_end = seg->pages_end - size_high;
	/* Chain on to seg_high if there is one, else to the old successor. */
	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;

	/*
	 * Update hat_kpm specific info of all involved memsegs and
	 * allow hat_kpm specific global chain updates.
	 */
	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

	/*
	 * At this point we have two equivalent memseg sub-chains,
	 * seg and seg_low/seg_mid/seg_high, which both chain on to
	 * the same place in the global chain. By re-writing the pointer
	 * in the previous element we switch atomically from using the old
	 * (seg) to the new.
	 */
	*segpp = (seg_low != NULL) ? seg_low : seg_mid;

	membar_enter();

	build_pfn_hash();
	memsegs_unlock(1);

	/*
	 * We leave the old segment, 'seg', intact as there may be
	 * references to it. Also, as the value of total_pages has not
	 * changed and the memsegs list is effectively the same when
	 * accessed via the old or the new pointer, we do not have to
	 * cause pageout_scanner() to re-evaluate its hand pointers.
	 *
	 * We currently do not re-use or reclaim the page_t memory.
	 * If we do, then this may have to change.
	 */

	/* Keep the superseded memseg on the memseg_edit_junk list. */
	mutex_enter(&memseg_lists_lock);
	seg->lnext = memseg_edit_junk;
	memseg_edit_junk = seg;
	mutex_exit(&memseg_lists_lock);

	return (1);
}
3131 
3132 /*
3133  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3134  * structure using physical addresses. Therefore a kmem_cache is
3135  * used with KMC_NOHASH to avoid page crossings within a memseg
3136  * structure. KMC_NOHASH requires that no external (outside of
3137  * slab) information is allowed. This, in turn, implies that the
3138  * cache's slabsize must be exactly a single page, since per-slab
3139  * information (e.g. the freelist for the slab) is kept at the
3140  * end of the slab, where it is easy to locate. Should be changed
3141  * when a more obvious kmem_cache interface/flag will become
3142  * available.
3143  */
3144 void
3145 mem_config_init()
3146 {
3147 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3148 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3149 }
3150