xref: /titanic_52/usr/src/uts/common/vm/seg_kp.c (revision 922d2c76afbee21520ffa2088c4e60dcb80d3945)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * segkp is a segment driver that administers the allocation and deallocation
38  * of pageable variable size chunks of kernel virtual address space. Each
39  * allocated resource is page-aligned.
40  *
41  * The user may specify whether the resource should be initialized to 0,
42  * include a redzone, or locked in memory.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/thread.h>
48 #include <sys/param.h>
49 #include <sys/errno.h>
50 #include <sys/sysmacros.h>
51 #include <sys/systm.h>
52 #include <sys/buf.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cmn_err.h>
56 #include <sys/swap.h>
57 #include <sys/tuneable.h>
58 #include <sys/kmem.h>
59 #include <sys/vmem.h>
60 #include <sys/cred.h>
61 #include <sys/dumphdr.h>
62 #include <sys/debug.h>
63 #include <sys/vtrace.h>
64 #include <sys/stack.h>
65 #include <sys/atomic.h>
66 #include <sys/archsystm.h>
67 #include <sys/lgrp.h>
68 
69 #include <vm/as.h>
70 #include <vm/seg.h>
71 #include <vm/seg_kp.h>
72 #include <vm/seg_kmem.h>
73 #include <vm/anon.h>
74 #include <vm/page.h>
75 #include <vm/hat.h>
76 #include <sys/bitmap.h>
77 
78 /*
79  * Private seg op routines
80  */
81 static void	segkp_badop(void);
82 static void	segkp_dump(struct seg *seg);
83 static int	segkp_checkprot(struct seg *seg, caddr_t addr, size_t len,
84 			uint_t prot);
85 static int	segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
86 static int	segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
87 			struct page ***page, enum lock_type type,
88 			enum seg_rw rw);
89 static void	segkp_insert(struct seg *seg, struct segkp_data *kpd);
90 static void	segkp_delete(struct seg *seg, struct segkp_data *kpd);
91 static caddr_t	segkp_get_internal(struct seg *seg, size_t len, uint_t flags,
92 			struct segkp_data **tkpd, struct anon_map *amp);
93 static void	segkp_release_internal(struct seg *seg,
94 			struct segkp_data *kpd, size_t len);
95 static int	segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr,
96 			size_t len, struct segkp_data *kpd, uint_t flags);
97 static int	segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr,
98 			size_t len, struct segkp_data *kpd, uint_t flags);
99 static struct	segkp_data *segkp_find(struct seg *seg, caddr_t vaddr);
100 static int	segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
101 static lgrp_mem_policy_info_t	*segkp_getpolicy(struct seg *seg,
102     caddr_t addr);
103 static int	segkp_capable(struct seg *seg, segcapability_t capability);
104 
105 /*
106  * Lock used to protect the hash table(s) and caches.
107  */
108 static kmutex_t	segkp_lock;
109 
110 /*
111  * The segkp caches
112  */
113 static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE];
114 
115 #define	SEGKP_BADOP(t)	(t(*)())segkp_badop
116 
117 /*
118  * When there are fewer than red_minavail bytes left on the stack,
119  * segkp_map_red() will map in the redzone (if called).  5000 seems
120  * to work reasonably well...
121  */
122 long		red_minavail = 5000;
123 
124 /*
125  * will be set to 1 for 32 bit x86 systems only, in startup.c
126  */
127 int	segkp_fromheap = 0;
128 ulong_t *segkp_bitmap;
129 
130 /*
131  * If segkp_map_red() is called with the redzone already mapped and
132  * with less than RED_DEEP_THRESHOLD bytes available on the stack,
133  * then the stack situation has become quite serious;  if much more stack
134  * is consumed, we have the potential of scrogging the next thread/LWP
135  * structure.  To help debug the "can't happen" panics which may
136  * result from this condition, we record lbolt and the calling thread
137  * in red_deep_lbolt and red_deep_thread respectively.
138  */
139 #define	RED_DEEP_THRESHOLD	2000
140 
141 clock_t		red_deep_lbolt;
142 kthread_t	*red_deep_thread;
143 
144 uint32_t	red_nmapped;
145 uint32_t	red_closest = UINT_MAX;
146 uint32_t	red_ndoubles;
147 
148 pgcnt_t anon_segkp_pages_locked;	/* See vm/anon.h */
149 pgcnt_t anon_segkp_pages_resv;		/* anon reserved by seg_kp */
150 
151 static struct	seg_ops segkp_ops = {
152 	SEGKP_BADOP(int),		/* dup */
153 	SEGKP_BADOP(int),		/* unmap */
154 	SEGKP_BADOP(void),		/* free */
155 	segkp_fault,
156 	SEGKP_BADOP(faultcode_t),	/* faulta */
157 	SEGKP_BADOP(int),		/* setprot */
158 	segkp_checkprot,
159 	segkp_kluster,
160 	SEGKP_BADOP(size_t),		/* swapout */
161 	SEGKP_BADOP(int),		/* sync */
162 	SEGKP_BADOP(size_t),		/* incore */
163 	SEGKP_BADOP(int),		/* lockop */
164 	SEGKP_BADOP(int),		/* getprot */
165 	SEGKP_BADOP(u_offset_t),		/* getoffset */
166 	SEGKP_BADOP(int),		/* gettype */
167 	SEGKP_BADOP(int),		/* getvp */
168 	SEGKP_BADOP(int),		/* advise */
169 	segkp_dump,			/* dump */
170 	segkp_pagelock,			/* pagelock */
171 	SEGKP_BADOP(int),		/* setpgsz */
172 	segkp_getmemid,			/* getmemid */
173 	segkp_getpolicy,		/* getpolicy */
174 	segkp_capable,			/* capable */
175 };
176 
177 
/*
 * Stand-in for every seg_ops entry that segkp does not implement (see
 * SEGKP_BADOP).  Calling it is always an error, so it panics.
 */
static void
segkp_badop(void)
{
	panic("segkp_badop");
	/*NOTREACHED*/
}
184 
185 static void segkpinit_mem_config(struct seg *);
186 
187 static uint32_t segkp_indel;
188 
189 /*
190  * Allocate the segment specific private data struct and fill it in
191  * with the per kp segment mutex, anon ptr. array and hash table.
192  */
193 int
194 segkp_create(struct seg *seg)
195 {
196 	struct segkp_segdata *kpsd;
197 	size_t	np;
198 
199 	ASSERT(seg != NULL && seg->s_as == &kas);
200 	ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock));
201 
202 	if (seg->s_size & PAGEOFFSET) {
203 		panic("Bad segkp size");
204 		/*NOTREACHED*/
205 	}
206 
207 	kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP);
208 
209 	/*
210 	 * Allocate the virtual memory for segkp and initialize it
211 	 */
212 	if (segkp_fromheap) {
213 		np = btop(kvseg.s_size);
214 		segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP);
215 		kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE,
216 		    vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP);
217 	} else {
218 		segkp_bitmap = NULL;
219 		np = btop(seg->s_size);
220 		kpsd->kpsd_arena = vmem_create("segkp", seg->s_base,
221 		    seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE,
222 		    VM_SLEEP);
223 	}
224 
225 	kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE);
226 
227 	kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *),
228 	    KM_SLEEP);
229 	seg->s_data = (void *)kpsd;
230 	seg->s_ops = &segkp_ops;
231 	segkpinit_mem_config(seg);
232 	return (0);
233 }
234 
235 
236 /*
237  * Find a free 'freelist' and initialize it with the appropriate attributes
238  */
239 void *
240 segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags)
241 {
242 	int i;
243 
244 	if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED))
245 		return ((void *)-1);
246 
247 	mutex_enter(&segkp_lock);
248 	for (i = 0; i < SEGKP_MAX_CACHE; i++) {
249 		if (segkp_cache[i].kpf_inuse)
250 			continue;
251 		segkp_cache[i].kpf_inuse = 1;
252 		segkp_cache[i].kpf_max = maxsize;
253 		segkp_cache[i].kpf_flags = flags;
254 		segkp_cache[i].kpf_seg = seg;
255 		segkp_cache[i].kpf_len = len;
256 		mutex_exit(&segkp_lock);
257 		return ((void *)(uintptr_t)i);
258 	}
259 	mutex_exit(&segkp_lock);
260 	return ((void *)-1);
261 }
262 
263 /*
264  * Free all the cache resources.
265  */
266 void
267 segkp_cache_free(void)
268 {
269 	struct segkp_data *kpd;
270 	struct seg *seg;
271 	int i;
272 
273 	mutex_enter(&segkp_lock);
274 	for (i = 0; i < SEGKP_MAX_CACHE; i++) {
275 		if (!segkp_cache[i].kpf_inuse)
276 			continue;
277 		/*
278 		 * Disconnect the freelist and process each element
279 		 */
280 		kpd = segkp_cache[i].kpf_list;
281 		seg = segkp_cache[i].kpf_seg;
282 		segkp_cache[i].kpf_list = NULL;
283 		segkp_cache[i].kpf_count = 0;
284 		mutex_exit(&segkp_lock);
285 
286 		while (kpd != NULL) {
287 			struct segkp_data *next;
288 
289 			next = kpd->kp_next;
290 			segkp_release_internal(seg, kpd, kpd->kp_len);
291 			kpd = next;
292 		}
293 		mutex_enter(&segkp_lock);
294 	}
295 	mutex_exit(&segkp_lock);
296 }
297 
298 /*
299  * There are 2 entries into segkp_get_internal. The first includes a cookie
300  * used to access a pool of cached segkp resources. The second does not
301  * use the cache.
302  */
303 caddr_t
304 segkp_get(struct seg *seg, size_t len, uint_t flags)
305 {
306 	struct segkp_data *kpd = NULL;
307 
308 	if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
309 		kpd->kp_cookie = -1;
310 		return (stom(kpd->kp_base, flags));
311 	}
312 	return (NULL);
313 }
314 
315 /*
316  * Return a 'cached' segkp address
317  */
318 caddr_t
319 segkp_cache_get(void *cookie)
320 {
321 	struct segkp_cache *freelist = NULL;
322 	struct segkp_data *kpd = NULL;
323 	int index = (int)(uintptr_t)cookie;
324 	struct seg *seg;
325 	size_t len;
326 	uint_t flags;
327 
328 	if (index < 0 || index >= SEGKP_MAX_CACHE)
329 		return (NULL);
330 	freelist = &segkp_cache[index];
331 
332 	mutex_enter(&segkp_lock);
333 	seg = freelist->kpf_seg;
334 	flags = freelist->kpf_flags;
335 	if (freelist->kpf_list != NULL) {
336 		kpd = freelist->kpf_list;
337 		freelist->kpf_list = kpd->kp_next;
338 		freelist->kpf_count--;
339 		mutex_exit(&segkp_lock);
340 		kpd->kp_next = NULL;
341 		segkp_insert(seg, kpd);
342 		return (stom(kpd->kp_base, flags));
343 	}
344 	len = freelist->kpf_len;
345 	mutex_exit(&segkp_lock);
346 	if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
347 		kpd->kp_cookie = index;
348 		return (stom(kpd->kp_base, flags));
349 	}
350 	return (NULL);
351 }
352 
353 caddr_t
354 segkp_get_withanonmap(
355 	struct seg *seg,
356 	size_t len,
357 	uint_t flags,
358 	struct anon_map *amp)
359 {
360 	struct segkp_data *kpd = NULL;
361 
362 	ASSERT(amp != NULL);
363 	flags |= KPD_HASAMP;
364 	if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) {
365 		kpd->kp_cookie = -1;
366 		return (stom(kpd->kp_base, flags));
367 	}
368 	return (NULL);
369 }
370 
371 /*
372  * This does the real work of segkp allocation.
373  * Return to client base addr. len must be page-aligned. A null value is
374  * returned if there are no more vm resources (e.g. pages, swap). The len
375  * and base recorded in the private data structure include the redzone
376  * and the redzone length (if applicable). If the user requests a redzone
377  * either the first or last page is left unmapped depending whether stacks
378  * grow to low or high memory.
379  *
380  * The client may also specify a no-wait flag. If that is set then the
381  * request will choose a non-blocking path when requesting resources.
382  * The default is make the client wait.
383  */
384 static caddr_t
385 segkp_get_internal(
386 	struct seg *seg,
387 	size_t len,
388 	uint_t flags,
389 	struct segkp_data **tkpd,
390 	struct anon_map *amp)
391 {
392 	struct segkp_segdata	*kpsd = (struct segkp_segdata *)seg->s_data;
393 	struct segkp_data	*kpd;
394 	caddr_t vbase = NULL;	/* always first virtual, may not be mapped */
395 	pgcnt_t np = 0;		/* number of pages in the resource */
396 	pgcnt_t segkpindex;
397 	long i;
398 	caddr_t va;
399 	pgcnt_t pages = 0;
400 	ulong_t anon_idx = 0;
401 	int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
402 	caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base;
403 
404 	if (len & PAGEOFFSET) {
405 		panic("segkp_get: len is not page-aligned");
406 		/*NOTREACHED*/
407 	}
408 
409 	ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL));
410 
411 	/* Only allow KPD_NO_ANON if we are going to lock it down */
412 	if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON)
413 		return (NULL);
414 
415 	if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL)
416 		return (NULL);
417 	/*
418 	 * Fix up the len to reflect the REDZONE if applicable
419 	 */
420 	if (flags & KPD_HASREDZONE)
421 		len += PAGESIZE;
422 	np = btop(len);
423 
424 	vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT);
425 	if (vbase == NULL) {
426 		kmem_free(kpd, sizeof (struct segkp_data));
427 		return (NULL);
428 	}
429 
430 	/* If locking, reserve physical memory */
431 	if (flags & KPD_LOCKED) {
432 		pages = btop(SEGKP_MAPLEN(len, flags));
433 		if (page_resv(pages, kmflag) == 0) {
434 			vmem_free(SEGKP_VMEM(seg), vbase, len);
435 			kmem_free(kpd, sizeof (struct segkp_data));
436 			return (NULL);
437 		}
438 		if ((flags & KPD_NO_ANON) == 0)
439 			atomic_add_long(&anon_segkp_pages_locked, pages);
440 	}
441 
442 	/*
443 	 * Reserve sufficient swap space for this vm resource.  We'll
444 	 * actually allocate it in the loop below, but reserving it
445 	 * here allows us to back out more gracefully than if we
446 	 * had an allocation failure in the body of the loop.
447 	 *
448 	 * Note that we don't need swap space for the red zone page.
449 	 */
450 	if (amp != NULL) {
451 		/*
452 		 * The swap reservation has been done, if required, and the
453 		 * anon_hdr is separate.
454 		 */
455 		anon_idx = 0;
456 		kpd->kp_anon_idx = anon_idx;
457 		kpd->kp_anon = amp->ahp;
458 
459 		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
460 		    kpd, vbase, len, flags, 1);
461 
462 	} else if ((flags & KPD_NO_ANON) == 0) {
463 		if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
464 			if (flags & KPD_LOCKED) {
465 				atomic_add_long(&anon_segkp_pages_locked,
466 				    -pages);
467 				page_unresv(pages);
468 			}
469 			vmem_free(SEGKP_VMEM(seg), vbase, len);
470 			kmem_free(kpd, sizeof (struct segkp_data));
471 			return (NULL);
472 		}
473 		atomic_add_long(&anon_segkp_pages_resv,
474 		    btop(SEGKP_MAPLEN(len, flags)));
475 		anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
476 		kpd->kp_anon_idx = anon_idx;
477 		kpd->kp_anon = kpsd->kpsd_anon;
478 
479 		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
480 		    kpd, vbase, len, flags, 1);
481 	} else {
482 		kpd->kp_anon = NULL;
483 		kpd->kp_anon_idx = 0;
484 	}
485 
486 	/*
487 	 * Allocate page and anon resources for the virtual address range
488 	 * except the redzone
489 	 */
490 	if (segkp_fromheap)
491 		segkpindex = btop((uintptr_t)(vbase - kvseg.s_base));
492 	for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) {
493 		page_t		*pl[2];
494 		struct vnode	*vp;
495 		anoff_t		off;
496 		int		err;
497 		page_t		*pp = NULL;
498 
499 		/*
500 		 * Mark this page to be a segkp page in the bitmap.
501 		 */
502 		if (segkp_fromheap) {
503 			BT_ATOMIC_SET(segkp_bitmap, segkpindex);
504 			segkpindex++;
505 		}
506 
507 		/*
508 		 * If this page is the red zone page, we don't need swap
509 		 * space for it.  Note that we skip over the code that
510 		 * establishes MMU mappings, so that the page remains
511 		 * invalid.
512 		 */
513 		if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i)
514 			continue;
515 
516 		if (kpd->kp_anon != NULL) {
517 			struct anon *ap;
518 
519 			ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i)
520 			    == NULL);
521 			/*
522 			 * Determine the "vp" and "off" of the anon slot.
523 			 */
524 			ap = anon_alloc(NULL, 0);
525 			if (amp != NULL)
526 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
527 			(void) anon_set_ptr(kpd->kp_anon, anon_idx + i,
528 			    ap, ANON_SLEEP);
529 			if (amp != NULL)
530 				ANON_LOCK_EXIT(&amp->a_rwlock);
531 			swap_xlate(ap, &vp, &off);
532 
533 			/*
534 			 * Create a page with the specified identity.  The
535 			 * page is returned with the "shared" lock held.
536 			 */
537 			err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
538 			    NULL, pl, PAGESIZE, seg, va, S_CREATE,
539 			    kcred, NULL);
540 			if (err) {
541 				/*
542 				 * XXX - This should not fail.
543 				 */
544 				panic("segkp_get: no pages");
545 				/*NOTREACHED*/
546 			}
547 			pp = pl[0];
548 		} else {
549 			ASSERT(page_exists(&kvp,
550 			    (u_offset_t)(uintptr_t)va) == NULL);
551 
552 			if ((pp = page_create_va(&kvp,
553 			    (u_offset_t)(uintptr_t)va, PAGESIZE,
554 			    (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL |
555 			    PG_NORELOC, seg, va)) == NULL) {
556 				/*
557 				 * Legitimize resource; then destroy it.
558 				 * Easier than trying to unwind here.
559 				 */
560 				kpd->kp_flags = flags;
561 				kpd->kp_base = vbase;
562 				kpd->kp_len = len;
563 				segkp_release_internal(seg, kpd, va - vbase);
564 				return (NULL);
565 			}
566 			page_io_unlock(pp);
567 		}
568 
569 		if (flags & KPD_ZERO)
570 			pagezero(pp, 0, PAGESIZE);
571 
572 		/*
573 		 * Load and lock an MMU translation for the page.
574 		 */
575 		hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE),
576 		    ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD));
577 
578 		/*
579 		 * Now, release lock on the page.
580 		 */
581 		if (flags & KPD_LOCKED)
582 			page_downgrade(pp);
583 		else
584 			page_unlock(pp);
585 	}
586 
587 	kpd->kp_flags = flags;
588 	kpd->kp_base = vbase;
589 	kpd->kp_len = len;
590 	segkp_insert(seg, kpd);
591 	*tkpd = kpd;
592 	return (stom(kpd->kp_base, flags));
593 }
594 
595 /*
596  * Release the resource to cache if the pool(designate by the cookie)
597  * has less than the maximum allowable. If inserted in cache,
598  * segkp_delete insures element is taken off of active list.
599  */
600 void
601 segkp_release(struct seg *seg, caddr_t vaddr)
602 {
603 	struct segkp_cache *freelist;
604 	struct segkp_data *kpd = NULL;
605 
606 	if ((kpd = segkp_find(seg, vaddr)) == NULL) {
607 		panic("segkp_release: null kpd");
608 		/*NOTREACHED*/
609 	}
610 
611 	if (kpd->kp_cookie != -1) {
612 		freelist = &segkp_cache[kpd->kp_cookie];
613 		mutex_enter(&segkp_lock);
614 		if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) {
615 			segkp_delete(seg, kpd);
616 			kpd->kp_next = freelist->kpf_list;
617 			freelist->kpf_list = kpd;
618 			freelist->kpf_count++;
619 			mutex_exit(&segkp_lock);
620 			return;
621 		} else {
622 			mutex_exit(&segkp_lock);
623 			kpd->kp_cookie = -1;
624 		}
625 	}
626 	segkp_release_internal(seg, kpd, kpd->kp_len);
627 }
628 
629 /*
630  * Free the entire resource. segkp_unlock gets called with the start of the
631  * mapped portion of the resource. The length is the size of the mapped
632  * portion
633  */
634 static void
635 segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
636 {
637 	caddr_t		va;
638 	long		i;
639 	long		redzone;
640 	size_t		np;
641 	page_t		*pp;
642 	struct vnode 	*vp;
643 	anoff_t		off;
644 	struct anon	*ap;
645 	pgcnt_t		segkpindex;
646 
647 	ASSERT(kpd != NULL);
648 	ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1);
649 	np = btop(len);
650 
651 	/* Remove from active hash list */
652 	if (kpd->kp_cookie == -1) {
653 		mutex_enter(&segkp_lock);
654 		segkp_delete(seg, kpd);
655 		mutex_exit(&segkp_lock);
656 	}
657 
658 	/*
659 	 * Precompute redzone page index.
660 	 */
661 	redzone = -1;
662 	if (kpd->kp_flags & KPD_HASREDZONE)
663 		redzone = KPD_REDZONE(kpd);
664 
665 
666 	va = kpd->kp_base;
667 
668 	hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT),
669 	    ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
670 	/*
671 	 * Free up those anon resources that are quiescent.
672 	 */
673 	if (segkp_fromheap)
674 		segkpindex = btop((uintptr_t)(va - kvseg.s_base));
675 	for (i = 0; i < np; i++, va += PAGESIZE) {
676 
677 		/*
678 		 * Clear the bit for this page from the bitmap.
679 		 */
680 		if (segkp_fromheap) {
681 			BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex);
682 			segkpindex++;
683 		}
684 
685 		if (i == redzone)
686 			continue;
687 		if (kpd->kp_anon) {
688 			/*
689 			 * Free up anon resources and destroy the
690 			 * associated pages.
691 			 *
692 			 * Release the lock if there is one. Have to get the
693 			 * page to do this, unfortunately.
694 			 */
695 			if (kpd->kp_flags & KPD_LOCKED) {
696 				ap = anon_get_ptr(kpd->kp_anon,
697 				    kpd->kp_anon_idx + i);
698 				swap_xlate(ap, &vp, &off);
699 				/* Find the shared-locked page. */
700 				pp = page_find(vp, (u_offset_t)off);
701 				if (pp == NULL) {
702 					panic("segkp_release: "
703 					    "kp_anon: no page to unlock ");
704 					/*NOTREACHED*/
705 				}
706 				page_unlock(pp);
707 			}
708 			if ((kpd->kp_flags & KPD_HASAMP) == 0) {
709 				anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
710 				    PAGESIZE);
711 				anon_unresv_zone(PAGESIZE, NULL);
712 				atomic_add_long(&anon_segkp_pages_resv,
713 				    -1);
714 			}
715 			TRACE_5(TR_FAC_VM,
716 			    TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
717 			    kpd, va, PAGESIZE, 0, 0);
718 		} else {
719 			if (kpd->kp_flags & KPD_LOCKED) {
720 				pp = page_find(&kvp, (u_offset_t)(uintptr_t)va);
721 				if (pp == NULL) {
722 					panic("segkp_release: "
723 					    "no page to unlock");
724 					/*NOTREACHED*/
725 				}
726 				/*
727 				 * We should just upgrade the lock here
728 				 * but there is no upgrade that waits.
729 				 */
730 				page_unlock(pp);
731 			}
732 			pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va,
733 			    SE_EXCL);
734 			if (pp != NULL)
735 				page_destroy(pp, 0);
736 		}
737 	}
738 
739 	/* If locked, release physical memory reservation */
740 	if (kpd->kp_flags & KPD_LOCKED) {
741 		pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
742 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
743 			atomic_add_long(&anon_segkp_pages_locked, -pages);
744 		page_unresv(pages);
745 	}
746 
747 	vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len);
748 	kmem_free(kpd, sizeof (struct segkp_data));
749 }
750 
751 /*
752  * segkp_map_red() will check the current frame pointer against the
753  * stack base.  If the amount of stack remaining is questionable
754  * (less than red_minavail), then segkp_map_red() will map in the redzone
755  * and return 1.  Otherwise, it will return 0.  segkp_map_red() can
756  * _only_ be called when:
757  *
758  *   - it is safe to sleep on page_create_va().
759  *   - the caller is non-swappable.
760  *
761  * It is up to the caller to remember whether segkp_map_red() successfully
762  * mapped the redzone, and, if so, to call segkp_unmap_red() at a later
763  * time.  Note that the caller must _remain_ non-swappable until after
764  * calling segkp_unmap_red().
765  *
766  * Currently, this routine is only called from pagefault() (which necessarily
767  * satisfies the above conditions).
768  */
769 #if defined(STACK_GROWTH_DOWN)
770 int
771 segkp_map_red(void)
772 {
773 	uintptr_t fp = STACK_BIAS + (uintptr_t)getfp();
774 #ifndef _LP64
775 	caddr_t stkbase;
776 #endif
777 
778 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
779 
780 	/*
781 	 * Optimize for the common case where we simply return.
782 	 */
783 	if ((curthread->t_red_pp == NULL) &&
784 	    (fp - (uintptr_t)curthread->t_stkbase >= red_minavail))
785 		return (0);
786 
787 #if defined(_LP64)
788 	/*
789 	 * XXX	We probably need something better than this.
790 	 */
791 	panic("kernel stack overflow");
792 	/*NOTREACHED*/
793 #else /* _LP64 */
794 	if (curthread->t_red_pp == NULL) {
795 		page_t *red_pp;
796 		struct seg kseg;
797 
798 		caddr_t red_va = (caddr_t)
799 		    (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) -
800 		    PAGESIZE);
801 
802 		ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) ==
803 		    NULL);
804 
805 		/*
806 		 * Allocate the physical for the red page.
807 		 */
808 		/*
809 		 * No PG_NORELOC here to avoid waits. Unlikely to get
810 		 * a relocate happening in the short time the page exists
811 		 * and it will be OK anyway.
812 		 */
813 
814 		kseg.s_as = &kas;
815 		red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va,
816 		    PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va);
817 		ASSERT(red_pp != NULL);
818 
819 		/*
820 		 * So we now have a page to jam into the redzone...
821 		 */
822 		page_io_unlock(red_pp);
823 
824 		hat_memload(kas.a_hat, red_va, red_pp,
825 		    (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK);
826 		page_downgrade(red_pp);
827 
828 		/*
829 		 * The page is left SE_SHARED locked so we can hold on to
830 		 * the page_t pointer.
831 		 */
832 		curthread->t_red_pp = red_pp;
833 
834 		atomic_add_32(&red_nmapped, 1);
835 		while (fp - (uintptr_t)curthread->t_stkbase < red_closest) {
836 			(void) cas32(&red_closest, red_closest,
837 			    (uint32_t)(fp - (uintptr_t)curthread->t_stkbase));
838 		}
839 		return (1);
840 	}
841 
842 	stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase &
843 	    (uintptr_t)PAGEMASK) - PAGESIZE);
844 
845 	atomic_add_32(&red_ndoubles, 1);
846 
847 	if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) {
848 		/*
849 		 * Oh boy.  We're already deep within the mapped-in
850 		 * redzone page, and the caller is trying to prepare
851 		 * for a deep stack run.  We're running without a
852 		 * redzone right now:  if the caller plows off the
853 		 * end of the stack, it'll plow another thread or
854 		 * LWP structure.  That situation could result in
855 		 * a very hard-to-debug panic, so, in the spirit of
856 		 * recording the name of one's killer in one's own
857 		 * blood, we're going to record lbolt and the calling
858 		 * thread.
859 		 */
860 		red_deep_lbolt = lbolt;
861 		red_deep_thread = curthread;
862 	}
863 
864 	/*
865 	 * If this is a DEBUG kernel, and we've run too deep for comfort, toss.
866 	 */
867 	ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD);
868 	return (0);
869 #endif /* _LP64 */
870 }
871 
872 void
873 segkp_unmap_red(void)
874 {
875 	page_t *pp;
876 	caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase &
877 	    (uintptr_t)PAGEMASK) - PAGESIZE);
878 
879 	ASSERT(curthread->t_red_pp != NULL);
880 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
881 
882 	/*
883 	 * Because we locked the mapping down, we can't simply rely
884 	 * on page_destroy() to clean everything up;  we need to call
885 	 * hat_unload() to explicitly unlock the mapping resources.
886 	 */
887 	hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK);
888 
889 	pp = curthread->t_red_pp;
890 
891 	ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va));
892 
893 	/*
894 	 * Need to upgrade the SE_SHARED lock to SE_EXCL.
895 	 */
896 	if (!page_tryupgrade(pp)) {
897 		/*
898 		 * As there is now wait for upgrade, release the
899 		 * SE_SHARED lock and wait for SE_EXCL.
900 		 */
901 		page_unlock(pp);
902 		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL);
903 		/* pp may be NULL here, hence the test below */
904 	}
905 
906 	/*
907 	 * Destroy the page, with dontfree set to zero (i.e. free it).
908 	 */
909 	if (pp != NULL)
910 		page_destroy(pp, 0);
911 	curthread->t_red_pp = NULL;
912 }
913 #else
914 #error Red stacks only supported with downwards stack growth.
915 #endif
916 
917 /*
918  * Handle a fault on an address corresponding to one of the
919  * resources in the segkp segment.
920  */
921 faultcode_t
922 segkp_fault(
923 	struct hat	*hat,
924 	struct seg	*seg,
925 	caddr_t		vaddr,
926 	size_t		len,
927 	enum fault_type	type,
928 	enum seg_rw rw)
929 {
930 	struct segkp_data	*kpd = NULL;
931 	int			err;
932 
933 	ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock));
934 
935 	/*
936 	 * Sanity checks.
937 	 */
938 	if (type == F_PROT) {
939 		panic("segkp_fault: unexpected F_PROT fault");
940 		/*NOTREACHED*/
941 	}
942 
943 	if ((kpd = segkp_find(seg, vaddr)) == NULL)
944 		return (FC_NOMAP);
945 
946 	mutex_enter(&kpd->kp_lock);
947 
948 	if (type == F_SOFTLOCK) {
949 		ASSERT(!(kpd->kp_flags & KPD_LOCKED));
950 		/*
951 		 * The F_SOFTLOCK case has more stringent
952 		 * range requirements: the given range must exactly coincide
953 		 * with the resource's mapped portion. Note reference to
954 		 * redzone is handled since vaddr would not equal base
955 		 */
956 		if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
957 		    len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
958 			mutex_exit(&kpd->kp_lock);
959 			return (FC_MAKE_ERR(EFAULT));
960 		}
961 
962 		if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) {
963 			mutex_exit(&kpd->kp_lock);
964 			return (FC_MAKE_ERR(err));
965 		}
966 		kpd->kp_flags |= KPD_LOCKED;
967 		mutex_exit(&kpd->kp_lock);
968 		return (0);
969 	}
970 
971 	if (type == F_INVAL) {
972 		ASSERT(!(kpd->kp_flags & KPD_NO_ANON));
973 
974 		/*
975 		 * Check if we touched the redzone. Somewhat optimistic
976 		 * here if we are touching the redzone of our own stack
977 		 * since we wouldn't have a stack to get this far...
978 		 */
979 		if ((kpd->kp_flags & KPD_HASREDZONE) &&
980 		    btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd))
981 			panic("segkp_fault: accessing redzone");
982 
983 		/*
984 		 * This fault may occur while the page is being F_SOFTLOCK'ed.
985 		 * Return since a 2nd segkp_load is unnecessary and also would
986 		 * result in the page being locked twice and eventually
987 		 * hang the thread_reaper thread.
988 		 */
989 		if (kpd->kp_flags & KPD_LOCKED) {
990 			mutex_exit(&kpd->kp_lock);
991 			return (0);
992 		}
993 
994 		err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags);
995 		mutex_exit(&kpd->kp_lock);
996 		return (err ? FC_MAKE_ERR(err) : 0);
997 	}
998 
999 	if (type == F_SOFTUNLOCK) {
1000 		uint_t	flags;
1001 
1002 		/*
1003 		 * Make sure the addr is LOCKED and it has anon backing
1004 		 * before unlocking
1005 		 */
1006 		if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) {
1007 			panic("segkp_fault: bad unlock");
1008 			/*NOTREACHED*/
1009 		}
1010 
1011 		if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
1012 		    len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
1013 			panic("segkp_fault: bad range");
1014 			/*NOTREACHED*/
1015 		}
1016 
1017 		if (rw == S_WRITE)
1018 			flags = kpd->kp_flags | KPD_WRITEDIRTY;
1019 		else
1020 			flags = kpd->kp_flags;
1021 		err = segkp_unlock(hat, seg, vaddr, len, kpd, flags);
1022 		kpd->kp_flags &= ~KPD_LOCKED;
1023 		mutex_exit(&kpd->kp_lock);
1024 		return (err ? FC_MAKE_ERR(err) : 0);
1025 	}
1026 	mutex_exit(&kpd->kp_lock);
1027 	panic("segkp_fault: bogus fault type: %d\n", type);
1028 	/*NOTREACHED*/
1029 }
1030 
1031 /*
1032  * Check that the given protections suffice over the range specified by
1033  * vaddr and len.  For this segment type, the only issue is whether or
1034  * not the range lies completely within the mapped part of an allocated
1035  * resource.
1036  */
1037 /* ARGSUSED */
1038 static int
1039 segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot)
1040 {
1041 	struct segkp_data *kpd = NULL;
1042 	caddr_t mbase;
1043 	size_t mlen;
1044 
1045 	if ((kpd = segkp_find(seg, vaddr)) == NULL)
1046 		return (EACCES);
1047 
1048 	mutex_enter(&kpd->kp_lock);
1049 	mbase = stom(kpd->kp_base, kpd->kp_flags);
1050 	mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags);
1051 	if (len > mlen || vaddr < mbase ||
1052 	    ((vaddr + len) > (mbase + mlen))) {
1053 		mutex_exit(&kpd->kp_lock);
1054 		return (EACCES);
1055 	}
1056 	mutex_exit(&kpd->kp_lock);
1057 	return (0);
1058 }
1059 
1060 
1061 /*
1062  * Check to see if it makes sense to do kluster/read ahead to
1063  * addr + delta relative to the mapping at addr.  We assume here
1064  * that delta is a signed PAGESIZE'd multiple (which can be negative).
1065  *
1066  * For seg_u we always "approve" of this action from our standpoint.
1067  */
1068 /*ARGSUSED*/
1069 static int
1070 segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
1071 {
1072 	return (0);
1073 }
1074 
1075 /*
1076  * Load and possibly lock intra-slot resources in the range given by
1077  * vaddr and len.
1078  */
1079 static int
1080 segkp_load(
1081 	struct hat *hat,
1082 	struct seg *seg,
1083 	caddr_t vaddr,
1084 	size_t len,
1085 	struct segkp_data *kpd,
1086 	uint_t flags)
1087 {
1088 	caddr_t va;
1089 	caddr_t vlim;
1090 	ulong_t i;
1091 	uint_t lock;
1092 
1093 	ASSERT(MUTEX_HELD(&kpd->kp_lock));
1094 
1095 	len = P2ROUNDUP(len, PAGESIZE);
1096 
1097 	/* If locking, reserve physical memory */
1098 	if (flags & KPD_LOCKED) {
1099 		pgcnt_t pages = btop(len);
1100 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
1101 			atomic_add_long(&anon_segkp_pages_locked, pages);
1102 		(void) page_resv(pages, KM_SLEEP);
1103 	}
1104 
1105 	/*
1106 	 * Loop through the pages in the given range.
1107 	 */
1108 	va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
1109 	vaddr = va;
1110 	vlim = va + len;
1111 	lock = flags & KPD_LOCKED;
1112 	i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
1113 	for (; va < vlim; va += PAGESIZE, i++) {
1114 		page_t		*pl[2];	/* second element NULL terminator */
1115 		struct vnode    *vp;
1116 		anoff_t		off;
1117 		int		err;
1118 		struct anon	*ap;
1119 
1120 		/*
1121 		 * Summon the page.  If it's not resident, arrange
1122 		 * for synchronous i/o to pull it in.
1123 		 */
1124 		ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
1125 		swap_xlate(ap, &vp, &off);
1126 
1127 		/*
1128 		 * The returned page list will have exactly one entry,
1129 		 * which is returned to us already kept.
1130 		 */
1131 		err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL,
1132 		    pl, PAGESIZE, seg, va, S_READ, kcred, NULL);
1133 
1134 		if (err) {
1135 			/*
1136 			 * Back out of what we've done so far.
1137 			 */
1138 			(void) segkp_unlock(hat, seg, vaddr,
1139 			    (va - vaddr), kpd, flags);
1140 			return (err);
1141 		}
1142 
1143 		/*
1144 		 * Load an MMU translation for the page.
1145 		 */
1146 		hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE),
1147 		    lock ? HAT_LOAD_LOCK : HAT_LOAD);
1148 
1149 		if (!lock) {
1150 			/*
1151 			 * Now, release "shared" lock on the page.
1152 			 */
1153 			page_unlock(pl[0]);
1154 		}
1155 	}
1156 	return (0);
1157 }
1158 
1159 /*
1160  * At the very least unload the mmu-translations and unlock the range if locked
1161  * Can be called with the following flag value KPD_WRITEDIRTY which specifies
1162  * any dirty pages should be written to disk.
1163  */
1164 static int
1165 segkp_unlock(
1166 	struct hat *hat,
1167 	struct seg *seg,
1168 	caddr_t vaddr,
1169 	size_t len,
1170 	struct segkp_data *kpd,
1171 	uint_t flags)
1172 {
1173 	caddr_t va;
1174 	caddr_t vlim;
1175 	ulong_t i;
1176 	struct page *pp;
1177 	struct vnode *vp;
1178 	anoff_t off;
1179 	struct anon *ap;
1180 
1181 #ifdef lint
1182 	seg = seg;
1183 #endif /* lint */
1184 
1185 	ASSERT(MUTEX_HELD(&kpd->kp_lock));
1186 
1187 	/*
1188 	 * Loop through the pages in the given range. It is assumed
1189 	 * segkp_unlock is called with page aligned base
1190 	 */
1191 	va = vaddr;
1192 	vlim = va + len;
1193 	i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
1194 	hat_unload(hat, va, len,
1195 	    ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
1196 	for (; va < vlim; va += PAGESIZE, i++) {
1197 		/*
1198 		 * Find the page associated with this part of the
1199 		 * slot, tracking it down through its associated swap
1200 		 * space.
1201 		 */
1202 		ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
1203 		swap_xlate(ap, &vp, &off);
1204 
1205 		if (flags & KPD_LOCKED) {
1206 			if ((pp = page_find(vp, off)) == NULL) {
1207 				if (flags & KPD_LOCKED) {
1208 					panic("segkp_softunlock: missing page");
1209 					/*NOTREACHED*/
1210 				}
1211 			}
1212 		} else {
1213 			/*
1214 			 * Nothing to do if the slot is not locked and the
1215 			 * page doesn't exist.
1216 			 */
1217 			if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL)
1218 				continue;
1219 		}
1220 
1221 		/*
1222 		 * If the page doesn't have any translations, is
1223 		 * dirty and not being shared, then push it out
1224 		 * asynchronously and avoid waiting for the
1225 		 * pageout daemon to do it for us.
1226 		 *
1227 		 * XXX - Do we really need to get the "exclusive"
1228 		 * lock via an upgrade?
1229 		 */
1230 		if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) &&
1231 		    hat_ismod(pp) && page_tryupgrade(pp)) {
1232 			/*
1233 			 * Hold the vnode before releasing the page lock to
1234 			 * prevent it from being freed and re-used by some
1235 			 * other thread.
1236 			 */
1237 			VN_HOLD(vp);
1238 			page_unlock(pp);
1239 
1240 			/*
1241 			 * Want most powerful credentials we can get so
1242 			 * use kcred.
1243 			 */
1244 			(void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
1245 			    B_ASYNC | B_FREE, kcred, NULL);
1246 			VN_RELE(vp);
1247 		} else {
1248 			page_unlock(pp);
1249 		}
1250 	}
1251 
1252 	/* If unlocking, release physical memory */
1253 	if (flags & KPD_LOCKED) {
1254 		pgcnt_t pages = btopr(len);
1255 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
1256 			atomic_add_long(&anon_segkp_pages_locked, -pages);
1257 		page_unresv(pages);
1258 	}
1259 	return (0);
1260 }
1261 
1262 /*
1263  * Insert the kpd in the hash table.
1264  */
1265 static void
1266 segkp_insert(struct seg *seg, struct segkp_data *kpd)
1267 {
1268 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1269 	int index;
1270 
1271 	/*
1272 	 * Insert the kpd based on the address that will be returned
1273 	 * via segkp_release.
1274 	 */
1275 	index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
1276 	mutex_enter(&segkp_lock);
1277 	kpd->kp_next = kpsd->kpsd_hash[index];
1278 	kpsd->kpsd_hash[index] = kpd;
1279 	mutex_exit(&segkp_lock);
1280 }
1281 
1282 /*
1283  * Remove kpd from the hash table.
1284  */
1285 static void
1286 segkp_delete(struct seg *seg, struct segkp_data *kpd)
1287 {
1288 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1289 	struct segkp_data **kpp;
1290 	int index;
1291 
1292 	ASSERT(MUTEX_HELD(&segkp_lock));
1293 
1294 	index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
1295 	for (kpp = &kpsd->kpsd_hash[index];
1296 	    *kpp != NULL; kpp = &((*kpp)->kp_next)) {
1297 		if (*kpp == kpd) {
1298 			*kpp = kpd->kp_next;
1299 			return;
1300 		}
1301 	}
1302 	panic("segkp_delete: unable to find element to delete");
1303 	/*NOTREACHED*/
1304 }
1305 
1306 /*
1307  * Find the kpd associated with a vaddr.
1308  *
1309  * Most of the callers of segkp_find will pass the vaddr that
1310  * hashes to the desired index, but there are cases where
1311  * this is not true in which case we have to (potentially) scan
1312  * the whole table looking for it. This should be very rare
1313  * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the
1314  * middle of the segkp_data region).
1315  */
1316 static struct segkp_data *
1317 segkp_find(struct seg *seg, caddr_t vaddr)
1318 {
1319 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1320 	struct segkp_data *kpd;
1321 	int	i;
1322 	int	stop;
1323 
1324 	i = stop = SEGKP_HASH(vaddr);
1325 	mutex_enter(&segkp_lock);
1326 	do {
1327 		for (kpd = kpsd->kpsd_hash[i]; kpd != NULL;
1328 						kpd = kpd->kp_next) {
1329 			if (vaddr >= kpd->kp_base &&
1330 			    vaddr < kpd->kp_base + kpd->kp_len) {
1331 				mutex_exit(&segkp_lock);
1332 				return (kpd);
1333 			}
1334 		}
1335 		if (--i < 0)
1336 			i = SEGKP_HASHSZ - 1;	/* Wrap */
1337 	} while (i != stop);
1338 	mutex_exit(&segkp_lock);
1339 	return (NULL);		/* Not found */
1340 }
1341 
1342 /*
1343  * returns size of swappable area.
1344  */
1345 size_t
1346 swapsize(caddr_t v)
1347 {
1348 	struct segkp_data *kpd;
1349 
1350 	if ((kpd = segkp_find(segkp, v)) != NULL)
1351 		return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
1352 	else
1353 		return (NULL);
1354 }
1355 
1356 /*
1357  * Dump out all the active segkp pages
1358  */
1359 static void
1360 segkp_dump(struct seg *seg)
1361 {
1362 	int i;
1363 	struct segkp_data *kpd;
1364 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1365 
1366 	for (i = 0; i < SEGKP_HASHSZ; i++) {
1367 		for (kpd = kpsd->kpsd_hash[i];
1368 		    kpd != NULL; kpd = kpd->kp_next) {
1369 			pfn_t pfn;
1370 			caddr_t addr;
1371 			caddr_t eaddr;
1372 
1373 			addr = kpd->kp_base;
1374 			eaddr = addr + kpd->kp_len;
1375 			while (addr < eaddr) {
1376 				ASSERT(seg->s_as == &kas);
1377 				pfn = hat_getpfnum(seg->s_as->a_hat, addr);
1378 				if (pfn != PFN_INVALID)
1379 					dump_addpage(seg->s_as, addr, pfn);
1380 				addr += PAGESIZE;
1381 				dump_timeleft = dump_timeout;
1382 			}
1383 		}
1384 	}
1385 }
1386 
1387 /*ARGSUSED*/
1388 static int
1389 segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
1390     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1391 {
1392 	return (ENOTSUP);
1393 }
1394 
1395 /*ARGSUSED*/
1396 static int
1397 segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
1398 {
1399 	return (ENODEV);
1400 }
1401 
1402 /*ARGSUSED*/
1403 static lgrp_mem_policy_info_t	*
1404 segkp_getpolicy(struct seg *seg, caddr_t addr)
1405 {
1406 	return (NULL);
1407 }
1408 
1409 /*ARGSUSED*/
1410 static int
1411 segkp_capable(struct seg *seg, segcapability_t capability)
1412 {
1413 	return (0);
1414 }
1415 
1416 #include <sys/mem_config.h>
1417 
1418 /*ARGSUSED*/
1419 static void
1420 segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages)
1421 {}
1422 
1423 /*
1424  * During memory delete, turn off caches so that pages are not held.
1425  * A better solution may be to unlock the pages while they are
1426  * in the cache so that they may be collected naturally.
1427  */
1428 
1429 /*ARGSUSED*/
1430 static int
1431 segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages)
1432 {
1433 	atomic_add_32(&segkp_indel, 1);
1434 	segkp_cache_free();
1435 	return (0);
1436 }
1437 
1438 /*ARGSUSED*/
1439 static void
1440 segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
1441 {
1442 	atomic_add_32(&segkp_indel, -1);
1443 }
1444 
1445 static kphysm_setup_vector_t segkp_mem_config_vec = {
1446 	KPHYSM_SETUP_VECTOR_VERSION,
1447 	segkp_mem_config_post_add,
1448 	segkp_mem_config_pre_del,
1449 	segkp_mem_config_post_del,
1450 };
1451 
1452 static void
1453 segkpinit_mem_config(struct seg *seg)
1454 {
1455 	int ret;
1456 
1457 	ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg);
1458 	ASSERT(ret == 0);
1459 }
1460