xref: /titanic_52/usr/src/uts/common/vm/vm_seg.c (revision 5bb86dd8f405a48942aaaab3ca1f410ed7e6db4d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 /*
42  * VM - segment management.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/inttypes.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kmem.h>
51 #include <sys/vmsystm.h>
52 #include <sys/debug.h>
53 #include <sys/cmn_err.h>
54 #include <sys/callb.h>
55 #include <sys/mem_config.h>
56 #include <sys/mman.h>
57 
58 #include <vm/hat.h>
59 #include <vm/as.h>
60 #include <vm/seg.h>
61 #include <vm/seg_kmem.h>
62 #include <vm/seg_spt.h>
63 #include <vm/seg_vn.h>
/*
 * kstats for segment advise: two named counters, one for MADV_FREE hits
 * and one for MADV_FREE misses.  seg_init() publishes this storage as a
 * virtual named kstat.
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

/*
 * The same storage viewed as a flat array of kstat_named_t, together with
 * the number of entries in that array.
 */
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
74 
/*
 * Debug tracing for the pagelock cache.
 *
 * Enable the PDEBUG define below (or build under lint) to make pdebug a
 * run-time variable; otherwise pdebug is the constant 0 and none of the
 * PPRINT*() calls print anything.
 */
/* #define	PDEBUG */
#if defined(PDEBUG) || defined(lint) || defined(__lint)
int pdebug = 0;
#else
#define	pdebug		0
#endif	/* PDEBUG */

/*
 * PPRINTF expands to a complete if/else whose else arm is the printf()
 * call.  Because the inner if already owns an else, a PPRINT*() used as
 * the body of an unbraced if cannot swallow the else that follows it
 * (a bare "if (pdebug) printf" would).
 */
#define	PPRINTF				if (!pdebug) { } else printf
#define	PPRINT(x)			PPRINTF(x)
#define	PPRINT1(x, a)			PPRINTF(x, a)
#define	PPRINT2(x, a, b)		PPRINTF(x, a, b)
#define	PPRINT3(x, a, b, c)		PPRINTF(x, a, b, c)
#define	PPRINT4(x, a, b, c, d)		PPRINTF(x, a, b, c, d)
#define	PPRINT5(x, a, b, c, d, e)	PPRINTF(x, a, b, c, d, e)

/*
 * Pagelock cache hash parameters.  P_HASHMASK is p_hashsize - 1, so it
 * only selects every bucket when p_hashsize is a power of two.
 * P_BASESHIFT is the number of low address bits p_hash() discards.
 */
#define	P_HASHMASK		(p_hashsize - 1)
#define	P_BASESHIFT		6
92 
/*
 * entry in the segment page cache
 *
 * Each entry records one cached pagelock: the range (p_seg, p_addr, p_len)
 * locked for p_rw, the shadow page list and the reclaim callback that were
 * passed to seg_pinsert().  Entries are linked on a circular doubly-linked
 * list whose head is a struct seg_phash bucket.
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int		p_active;	/* active count */
	int		p_ref;		/* ref bit */
	size_t		p_len;		/* segment length */
	caddr_t		p_addr;		/* base address */
	struct seg 	*p_seg;		/* segment */
	struct page	**p_pp;		/* pp shadow list */
	enum seg_rw	p_rw;		/* rw */
	uint_t		p_flags;	/* bit flags */
	int		(*p_callback)(struct seg *, caddr_t, size_t,
			    struct page **, enum seg_rw);
};

/*
 * hash bucket header
 *
 * The list code casts a bucket pointer to (struct seg_pcache *) and uses
 * it as the list sentinel, so p_hnext and p_hprev must stay the first two
 * members here, in the same order as in struct seg_pcache.
 */
struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int p_qlen;			/* Q length */
	kmutex_t p_hmutex;		/* protects hash bucket */
};
117 
/*
 * Pagelock cache tunables and state.
 */
static int seg_preap_time = 20;	/* reclaim every 20 secs */
static int seg_pmaxqlen = 5;	/* max Q length in hash list */
static int seg_ppcount = 5;	/* max # of purges per reclaim interval */
static int seg_plazy = 1;	/* if 1, pages are cached after pageunlock */
static pgcnt_t seg_pwindow;	/* max # of pages that can be cached */
static pgcnt_t seg_plocked;	/* # of pages which are cached by pagelock */
static pgcnt_t seg_plocked_window; /* # pages from window */
/* NOTE(review): not referenced by any code visible in this file. */
int seg_preapahead;

/*
 * Incremented by seg_p_disable() and seg_ppurge_seg(), decremented by
 * seg_p_enable() and seg_ppurge_seg(); always updated under seg_pcache.
 */
static uint_t seg_pdisable = 0;	/* if not 0, caching temporarily disabled */

static int seg_pupdate_active = 1;	/* background reclaim thread */
/*
 * If left at 0, seg_pasync_thread() derives it from seg_preap_time;
 * otherwise the preset value is taken as seconds and scaled by hz.
 */
static clock_t seg_preap_interval;	/* reap interval in ticks */

/*
 * In the code in this file seg_pcache is held while seg_pdisable is
 * updated and while seg_pinit() sets up the hash table; the per-bucket
 * lists are protected by each bucket's p_hmutex.
 */
static kmutex_t seg_pcache;	/* protects the whole pagelock cache */
/* seg_pmem covers updates of seg_plocked and seg_plocked_window. */
static kmutex_t seg_pmem;	/* protects window counter */
static ksema_t seg_pasync_sem;	/* sema for reclaim thread */
static struct seg_phash *p_hashtab;
/* number of buckets in p_hashtab; seg_pinit() picks a value if still 0 */
static int p_hashsize = 0;
137 
/*
 * Map a segment pointer to a hash bucket index: drop the low P_BASESHIFT
 * bits of the address, then mask with P_HASHMASK.
 */
#define	p_hash(seg) \
	(((uintptr_t)(seg) >> P_BASESHIFT) & P_HASHMASK)

/*
 * Evaluates to 1 when cache entry 'pcp' describes exactly the range
 * (seg, addr, len) locked for 'rw', and to 0 otherwise.
 */
#define	p_match(pcp, seg, addr, len, rw) \
	((((pcp)->p_seg != (seg)) || \
	((pcp)->p_addr != (addr)) || \
	((pcp)->p_rw != (rw)) || \
	((pcp)->p_len != (len))) ? 0 : 1)

/*
 * Same as p_match(), but the entry's shadow list must also be 'pp'.
 */
#define	p_match_pp(pcp, seg, addr, len, pp, rw) \
	((((pcp)->p_seg != (seg)) || \
	((pcp)->p_addr != (addr)) || \
	((pcp)->p_pp != (pp)) || \
	((pcp)->p_rw != (rw)) || \
	((pcp)->p_len != (len))) ? 0 : 1)

154 
155 
156 /*
157  * lookup an address range in pagelock cache. Return shadow list
158  * and bump up active count.
159  */
160 struct page **
161 seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
162 {
163 	struct seg_pcache *pcp;
164 	struct seg_phash *hp;
165 
166 	/*
167 	 * Skip pagelock cache, while DR is in progress or
168 	 * seg_pcache is off.
169 	 */
170 	if (seg_pdisable || seg_plazy == 0) {
171 		return (NULL);
172 	}
173 
174 	hp = &p_hashtab[p_hash(seg)];
175 	mutex_enter(&hp->p_hmutex);
176 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
177 	    pcp = pcp->p_hnext) {
178 		if (p_match(pcp, seg, addr, len, rw)) {
179 			pcp->p_active++;
180 			mutex_exit(&hp->p_hmutex);
181 
182 			PPRINT5("seg_plookup hit: seg %p, addr %p, "
183 			    "len %lx, count %d, pplist %p \n",
184 			    (void *)seg, (void *)addr, len, pcp->p_active,
185 			    (void *)pcp->p_pp);
186 
187 			return (pcp->p_pp);
188 		}
189 	}
190 	mutex_exit(&hp->p_hmutex);
191 
192 	PPRINT("seg_plookup miss:\n");
193 
194 	return (NULL);
195 }
196 
197 /*
198  * mark address range inactive. If the cache is off or the address
199  * range is not in the cache we call the segment driver to reclaim
200  * the pages. Otherwise just decrement active count and set ref bit.
201  */
202 void
203 seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
204     enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
205     struct page **, enum seg_rw))
206 {
207 	struct seg_pcache *pcp;
208 	struct seg_phash *hp;
209 
210 	if (seg_plazy == 0) {
211 		(void) (*callback)(seg, addr, len, pp, rw);
212 		return;
213 	}
214 	hp = &p_hashtab[p_hash(seg)];
215 	mutex_enter(&hp->p_hmutex);
216 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
217 	    pcp = pcp->p_hnext) {
218 		if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
219 			pcp->p_active--;
220 			ASSERT(pcp->p_active >= 0);
221 			if (pcp->p_active == 0 && seg_pdisable) {
222 				int npages;
223 
224 				ASSERT(callback == pcp->p_callback);
225 				/* free the entry */
226 				hp->p_qlen--;
227 				pcp->p_hprev->p_hnext = pcp->p_hnext;
228 				pcp->p_hnext->p_hprev = pcp->p_hprev;
229 				mutex_exit(&hp->p_hmutex);
230 				npages = pcp->p_len >> PAGESHIFT;
231 				mutex_enter(&seg_pmem);
232 				seg_plocked -= npages;
233 				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
234 					seg_plocked_window -= npages;
235 				}
236 				mutex_exit(&seg_pmem);
237 				kmem_free(pcp, sizeof (struct seg_pcache));
238 				goto out;
239 			}
240 			pcp->p_ref = 1;
241 			mutex_exit(&hp->p_hmutex);
242 			return;
243 		}
244 	}
245 	mutex_exit(&hp->p_hmutex);
246 out:
247 	(void) (*callback)(seg, addr, len, pp, rw);
248 }
249 
250 /*
251  * The seg_pinsert_check() is used by segment drivers to predict whether
252  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
253  */
254 
255 int
256 seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
257 {
258 	struct seg_phash *hp;
259 
260 	if (seg_plazy == 0) {
261 		return (SEGP_FAIL);
262 	}
263 	if (seg_pdisable != 0) {
264 		return (SEGP_FAIL);
265 	}
266 	ASSERT((len & PAGEOFFSET) == 0);
267 	hp = &p_hashtab[p_hash(seg)];
268 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
269 		return (SEGP_FAIL);
270 	}
271 	/*
272 	 * If the SEGP_FORCE_WIRED flag is set,
273 	 * we skip the check for seg_pwindow.
274 	 */
275 	if ((flags & SEGP_FORCE_WIRED) == 0) {
276 		pgcnt_t npages;
277 
278 		npages = len >> PAGESHIFT;
279 		if ((seg_plocked_window + npages) > seg_pwindow) {
280 			return (SEGP_FAIL);
281 		}
282 	}
283 	return (SEGP_SUCCESS);
284 }
285 
286 
287 /*
288  * insert address range with shadow list into pagelock cache. If
289  * the cache is off or caching is temporarily disabled or the allowed
290  * 'window' is exceeded - return SEGP_FAIL. Otherwise return
291  * SEGP_SUCCESS.
292  */
293 int
294 seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
295     enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
296     size_t, struct page **, enum seg_rw))
297 {
298 	struct seg_pcache *pcp;
299 	struct seg_phash *hp;
300 	pgcnt_t npages;
301 
302 	if (seg_plazy == 0) {
303 		return (SEGP_FAIL);
304 	}
305 	if (seg_pdisable != 0) {
306 		return (SEGP_FAIL);
307 	}
308 	ASSERT((len & PAGEOFFSET) == 0);
309 	hp = &p_hashtab[p_hash(seg)];
310 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
311 		return (SEGP_FAIL);
312 	}
313 	npages = len >> PAGESHIFT;
314 	mutex_enter(&seg_pmem);
315 	/*
316 	 * If the SEGP_FORCE_WIRED flag is set,
317 	 * we skip the check for seg_pwindow.
318 	 */
319 	if ((flags & SEGP_FORCE_WIRED) == 0) {
320 		seg_plocked_window += npages;
321 		if (seg_plocked_window > seg_pwindow) {
322 			seg_plocked_window -= npages;
323 			mutex_exit(&seg_pmem);
324 			return (SEGP_FAIL);
325 		}
326 	}
327 	seg_plocked += npages;
328 	mutex_exit(&seg_pmem);
329 
330 	pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
331 	pcp->p_seg = seg;
332 	pcp->p_addr = addr;
333 	pcp->p_len = len;
334 	pcp->p_pp = pp;
335 	pcp->p_rw = rw;
336 	pcp->p_callback = callback;
337 	pcp->p_active = 1;
338 	pcp->p_flags = flags;
339 
340 	PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
341 	    (void *)seg, (void *)addr, len, (void *)pp);
342 
343 	hp = &p_hashtab[p_hash(seg)];
344 	mutex_enter(&hp->p_hmutex);
345 	hp->p_qlen++;
346 	pcp->p_hnext = hp->p_hnext;
347 	pcp->p_hprev = (struct seg_pcache *)hp;
348 	hp->p_hnext->p_hprev = pcp;
349 	hp->p_hnext = pcp;
350 	mutex_exit(&hp->p_hmutex);
351 	return (SEGP_SUCCESS);
352 }
353 
/*
 * purge all entries from the pagelock cache if not active
 * and not recently used. Drop all locks and call through
 * the address space into the segment driver to reclaim
 * the pages. This makes sure we get the address space
 * and segment driver locking right.
 *
 * 'force' is 0 when called from seg_pasync_thread() and 1 when called
 * from seg_p_disable().  With 'force' set, the ref bit and the
 * SEGP_ASYNC_FLUSH flag are ignored and the purge is not throttled;
 * active entries are still skipped.
 */
static void
seg_ppurge_all(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	int purge_count = 0;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;

		/*
		 * While 'force' is set, seg_pasync_thread is not
		 * throttled.  This is to speedup flushing of seg_pcache
		 * in preparation for DR.
		 *
		 * In normal case, when 'force' is not set, we throttle
		 * seg_pasync_thread so that we don't spend all the
		 * time in purging the cache.
		 */
		while ((pcp != (struct seg_pcache *)hp) &&
		    (force || (purge_count <= seg_ppcount))) {

			/*
			 * purge entries which are not active and
			 * have not been used recently and
			 * have the SEGP_ASYNC_FLUSH flag.
			 *
			 * In the 'force' case, we ignore the
			 * SEGP_ASYNC_FLUSH flag.
			 */
			if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
				pcp->p_ref = 1;
			if (force)
				pcp->p_ref = 0;
			if (!pcp->p_ref && !pcp->p_active) {
				struct as *as = pcp->p_seg->s_as;

				/*
				 * try to get the readers lock on the address
				 * space before taking out the cache element.
				 * This ensures as_pagereclaim() can actually
				 * call through the address space and free
				 * the pages. If we don't get the lock, just
				 * skip this entry. The pages will be reclaimed
				 * by the segment driver at unmap time.
				 *
				 * On success the lock stays held until the
				 * delayed callback loop below releases it.
				 */
				if (AS_LOCK_TRYENTER(as, &as->a_lock,
				    RW_READER)) {
					hp->p_qlen--;
					pcp->p_hprev->p_hnext = pcp->p_hnext;
					pcp->p_hnext->p_hprev = pcp->p_hprev;
					/*
					 * p_hprev is reused as the link of
					 * the private delcallb_list; p_hnext
					 * is left alone so the scan can still
					 * advance from this entry.
					 */
					pcp->p_hprev = delcallb_list;
					delcallb_list = pcp;
					purge_count++;
				}
			} else {
				/* age the entry: clear its ref bit */
				pcp->p_ref = 0;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		/* throttled: stop scanning further buckets as well */
		if (!force && purge_count > seg_ppcount)
			break;
	}

	/*
	 * run the delayed callback list. We don't want to hold the
	 * cache lock during a call through the address space.
	 */
	while (delcallb_list != NULL) {
		struct as *as;

		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		as = pcp->p_seg->s_as;

		PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
		    "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
		    pcp->p_len, (void *)pcp->p_pp);

		as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
		    pcp->p_len, pcp->p_rw);
		/* drop the reader lock taken by AS_LOCK_TRYENTER() above */
		AS_LOCK_EXIT(as, &as->a_lock);
		npages += pcp->p_len >> PAGESHIFT;
		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
			npages_window += pcp->p_len >> PAGESHIFT;
		}
		kmem_free(pcp, sizeof (struct seg_pcache));
	}
	/* account for everything purged with one update of the counters */
	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}
465 
466 /*
467  * Remove cached pages for segment(s) entries from hashtable.
468  * The segments are identified by a given clients callback
469  * function.
470  * This is useful for multiple seg's cached on behalf of
471  * dummy segment (ISM/DISM) with common callback function.
472  * The clients callback function may return status indicating
473  * that the last seg's entry has been purged. In such a case
474  * the seg_ppurge_seg() stops searching hashtable and exits.
475  * Otherwise all hashtable entries are scanned.
476  */
477 void
478 seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
479     struct page **, enum seg_rw))
480 {
481 	struct seg_pcache *pcp, *npcp;
482 	struct seg_phash *hp;
483 	pgcnt_t npages = 0;
484 	pgcnt_t npages_window = 0;
485 	int	done = 0;
486 
487 	/*
488 	 * if the cache if off or empty, return
489 	 */
490 	if (seg_plazy == 0 || seg_plocked == 0) {
491 		return;
492 	}
493 	mutex_enter(&seg_pcache);
494 	seg_pdisable++;
495 	mutex_exit(&seg_pcache);
496 
497 	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
498 
499 		mutex_enter(&hp->p_hmutex);
500 		pcp = hp->p_hnext;
501 		while (pcp != (struct seg_pcache *)hp) {
502 
503 			/*
504 			 * purge entries which are not active
505 			 */
506 			npcp = pcp->p_hnext;
507 			if (!pcp->p_active && pcp->p_callback == callback) {
508 				hp->p_qlen--;
509 				pcp->p_hprev->p_hnext = pcp->p_hnext;
510 				pcp->p_hnext->p_hprev = pcp->p_hprev;
511 
512 				if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
513 				    pcp->p_len, pcp->p_pp, pcp->p_rw)) {
514 					done = 1;
515 				}
516 
517 				npages += pcp->p_len >> PAGESHIFT;
518 				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
519 					npages_window +=
520 					    pcp->p_len >> PAGESHIFT;
521 				}
522 				kmem_free(pcp, sizeof (struct seg_pcache));
523 			}
524 			pcp = npcp;
525 			if (done)
526 				break;
527 		}
528 		mutex_exit(&hp->p_hmutex);
529 		if (done)
530 			break;
531 	}
532 
533 	mutex_enter(&seg_pcache);
534 	seg_pdisable--;
535 	mutex_exit(&seg_pcache);
536 
537 	mutex_enter(&seg_pmem);
538 	seg_plocked -= npages;
539 	seg_plocked_window -= npages_window;
540 	mutex_exit(&seg_pmem);
541 }
542 
543 /*
544  * purge all entries for a given segment. Since we
545  * callback into the segment driver directly for page
546  * reclaim the caller needs to hold the right locks.
547  */
548 void
549 seg_ppurge(struct seg *seg)
550 {
551 	struct seg_pcache *delcallb_list = NULL;
552 	struct seg_pcache *pcp;
553 	struct seg_phash *hp;
554 	pgcnt_t npages = 0;
555 	pgcnt_t npages_window = 0;
556 
557 	if (seg_plazy == 0) {
558 		return;
559 	}
560 	hp = &p_hashtab[p_hash(seg)];
561 	mutex_enter(&hp->p_hmutex);
562 	pcp = hp->p_hnext;
563 	while (pcp != (struct seg_pcache *)hp) {
564 		if (pcp->p_seg == seg) {
565 			if (pcp->p_active) {
566 				break;
567 			}
568 			hp->p_qlen--;
569 			pcp->p_hprev->p_hnext = pcp->p_hnext;
570 			pcp->p_hnext->p_hprev = pcp->p_hprev;
571 			pcp->p_hprev = delcallb_list;
572 			delcallb_list = pcp;
573 		}
574 		pcp = pcp->p_hnext;
575 	}
576 	mutex_exit(&hp->p_hmutex);
577 	while (delcallb_list != NULL) {
578 		pcp = delcallb_list;
579 		delcallb_list = pcp->p_hprev;
580 
581 		PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
582 		    "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
583 		    pcp->p_len, (void *)pcp->p_pp);
584 
585 		ASSERT(seg == pcp->p_seg);
586 		(void) (*pcp->p_callback)(seg, pcp->p_addr,
587 		    pcp->p_len, pcp->p_pp, pcp->p_rw);
588 		npages += pcp->p_len >> PAGESHIFT;
589 		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
590 			npages_window += pcp->p_len >> PAGESHIFT;
591 		}
592 		kmem_free(pcp, sizeof (struct seg_pcache));
593 	}
594 	mutex_enter(&seg_pmem);
595 	seg_plocked -= npages;
596 	seg_plocked_window -= npages_window;
597 	mutex_exit(&seg_pmem);
598 }
599 
600 static void seg_pinit_mem_config(void);
601 
602 /*
603  * setup the pagelock cache
604  */
605 static void
606 seg_pinit(void)
607 {
608 	struct seg_phash *hp;
609 	int i;
610 	uint_t physmegs;
611 
612 	sema_init(&seg_pasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
613 
614 	mutex_enter(&seg_pcache);
615 	if (p_hashtab == NULL) {
616 		physmegs = physmem >> (20 - PAGESHIFT);
617 
618 		/* If p_hashsize was not set in /etc/system ... */
619 		if (p_hashsize == 0) {
620 			/*
621 			 * Choose p_hashsize based on physmem.
622 			 */
623 			if (physmegs < 64) {
624 				p_hashsize = 64;
625 			} else if (physmegs < 1024) {
626 				p_hashsize = 1024;
627 			} else if (physmegs < 10 * 1024) {
628 				p_hashsize = 8192;
629 			} else if (physmegs < 20 * 1024) {
630 				p_hashsize = 2 * 8192;
631 				seg_pmaxqlen = 16;
632 			} else {
633 				p_hashsize = 128 * 1024;
634 				seg_pmaxqlen = 128;
635 			}
636 		}
637 
638 		p_hashtab = kmem_zalloc(p_hashsize * sizeof (struct seg_phash),
639 		    KM_SLEEP);
640 		for (i = 0; i < p_hashsize; i++) {
641 			hp = (struct seg_phash *)&p_hashtab[i];
642 			hp->p_hnext = (struct seg_pcache *)hp;
643 			hp->p_hprev = (struct seg_pcache *)hp;
644 			mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
645 		}
646 		if (seg_pwindow == 0) {
647 			if (physmegs < 24) {
648 				/* don't use cache */
649 				seg_plazy = 0;
650 			} else if (physmegs < 64) {
651 				seg_pwindow = physmem >> 5; /* 3% of memory */
652 			} else if (physmegs < 10 * 1024) {
653 				seg_pwindow = physmem >> 3; /* 12% of memory */
654 			} else {
655 				seg_pwindow = physmem >> 1;
656 			}
657 		}
658 	}
659 	mutex_exit(&seg_pcache);
660 
661 	seg_pinit_mem_config();
662 }
663 
664 /*
665  * called by pageout if memory is low
666  */
667 void
668 seg_preap(void)
669 {
670 	/*
671 	 * if the cache if off or empty, return
672 	 */
673 	if (seg_plocked == 0 || seg_plazy == 0) {
674 		return;
675 	}
676 	sema_v(&seg_pasync_sem);
677 }
678 
679 static void seg_pupdate(void *);
680 
681 /*
682  * run as a backgroud thread and reclaim pagelock
683  * pages which have not been used recently
684  */
685 void
686 seg_pasync_thread(void)
687 {
688 	callb_cpr_t cpr_info;
689 	kmutex_t pasync_lock;	/* just for CPR stuff */
690 
691 	mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);
692 
693 	CALLB_CPR_INIT(&cpr_info, &pasync_lock, callb_generic_cpr,
694 	    "seg_pasync");
695 
696 	if (seg_preap_interval == 0) {
697 		seg_preap_interval = seg_preap_time * hz;
698 	} else {
699 		seg_preap_interval *= hz;
700 	}
701 	if (seg_plazy && seg_pupdate_active) {
702 		(void) timeout(seg_pupdate, NULL, seg_preap_interval);
703 	}
704 
705 	for (;;) {
706 		mutex_enter(&pasync_lock);
707 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
708 		mutex_exit(&pasync_lock);
709 		sema_p(&seg_pasync_sem);
710 		mutex_enter(&pasync_lock);
711 		CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
712 		mutex_exit(&pasync_lock);
713 
714 		seg_ppurge_all(0);
715 	}
716 }
717 
718 static void
719 seg_pupdate(void *dummy)
720 {
721 	sema_v(&seg_pasync_sem);
722 
723 	if (seg_plazy && seg_pupdate_active) {
724 		(void) timeout(seg_pupdate, dummy, seg_preap_interval);
725 	}
726 }
727 
728 static struct kmem_cache *seg_cache;
729 
730 /*
731  * Initialize segment management data structures.
732  */
733 void
734 seg_init(void)
735 {
736 	kstat_t *ksp;
737 
738 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 0,
739 	    NULL, NULL, NULL, NULL, NULL, 0);
740 
741 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
742 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
743 	if (ksp) {
744 		ksp->ks_data = (void *)segadvstat_ptr;
745 		kstat_install(ksp);
746 	}
747 
748 	seg_pinit();
749 }
750 
751 /*
752  * Allocate a segment to cover [base, base+size]
753  * and attach it to the specified address space.
754  */
755 struct seg *
756 seg_alloc(struct as *as, caddr_t base, size_t size)
757 {
758 	struct seg *new;
759 	caddr_t segbase;
760 	size_t segsize;
761 
762 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
763 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
764 	    (uintptr_t)segbase;
765 
766 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
767 		return ((struct seg *)NULL);	/* bad virtual addr range */
768 
769 	if (as != &kas &&
770 	    valid_usr_range(segbase, segsize, 0, as,
771 	    as->a_userlimit) != RANGE_OKAY)
772 		return ((struct seg *)NULL);	/* bad virtual addr range */
773 
774 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
775 	new->s_ops = NULL;
776 	new->s_data = NULL;
777 	new->s_szc = 0;
778 	new->s_flags = 0;
779 	if (seg_attach(as, segbase, segsize, new) < 0) {
780 		kmem_cache_free(seg_cache, new);
781 		return ((struct seg *)NULL);
782 	}
783 	/* caller must fill in ops, data */
784 	return (new);
785 }
786 
787 /*
788  * Attach a segment to the address space.  Used by seg_alloc()
789  * and for kernel startup to attach to static segments.
790  */
791 int
792 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
793 {
794 	seg->s_as = as;
795 	seg->s_base = base;
796 	seg->s_size = size;
797 
798 	/*
799 	 * as_addseg() will add the segment at the appropraite point
800 	 * in the list. It will return -1 if there is overlap with
801 	 * an already existing segment.
802 	 */
803 	return (as_addseg(as, seg));
804 }
805 
806 /*
807  * Unmap a segment and free it from its associated address space.
808  * This should be called by anybody who's finished with a whole segment's
809  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
810  * responsibility of the segment driver to unlink the the segment
811  * from the address space, and to free public and private data structures
812  * associated with the segment.  (This is typically done by a call to
813  * seg_free()).
814  */
815 void
816 seg_unmap(struct seg *seg)
817 {
818 #ifdef DEBUG
819 	int ret;
820 #endif /* DEBUG */
821 
822 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
823 
824 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
825 	ASSERT(seg->s_data != NULL);
826 
827 	/* Unmap the whole mapping */
828 #ifdef DEBUG
829 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
830 	ASSERT(ret == 0);
831 #else
832 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
833 #endif /* DEBUG */
834 }
835 
836 /*
837  * Free the segment from its associated as. This should only be called
838  * if a mapping to the segment has not yet been established (e.g., if
839  * an error occurs in the middle of doing an as_map when the segment
840  * has already been partially set up) or if it has already been deleted
841  * (e.g., from a segment driver unmap routine if the unmap applies to the
842  * entire segment). If the mapping is currently set up then seg_unmap() should
843  * be called instead.
844  */
845 void
846 seg_free(struct seg *seg)
847 {
848 	register struct as *as = seg->s_as;
849 	struct seg *tseg = as_removeseg(as, seg);
850 
851 	ASSERT(tseg == seg);
852 
853 	/*
854 	 * If the segment private data field is NULL,
855 	 * then segment driver is not attached yet.
856 	 */
857 	if (seg->s_data != NULL)
858 		SEGOP_FREE(seg);
859 
860 	kmem_cache_free(seg_cache, seg);
861 }
862 
863 /*ARGSUSED*/
864 static void
865 seg_p_mem_config_post_add(
866 	void *arg,
867 	pgcnt_t delta_pages)
868 {
869 	/* Nothing to do. */
870 }
871 
872 void
873 seg_p_enable(void)
874 {
875 	mutex_enter(&seg_pcache);
876 	ASSERT(seg_pdisable != 0);
877 	seg_pdisable--;
878 	mutex_exit(&seg_pcache);
879 }
880 
881 /*
882  * seg_p_disable - disables seg_pcache, and then attempts to empty the
883  * cache.
884  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
885  * SEGP_FAIL if the cache could not be emptied.
886  */
887 int
888 seg_p_disable(void)
889 {
890 	pgcnt_t	old_plocked;
891 	int stall_count = 0;
892 
893 	mutex_enter(&seg_pcache);
894 	seg_pdisable++;
895 	ASSERT(seg_pdisable != 0);
896 	mutex_exit(&seg_pcache);
897 
898 	/*
899 	 * Attempt to empty the cache. Terminate if seg_plocked does not
900 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
901 	 */
902 	while (seg_plocked != 0) {
903 		old_plocked = seg_plocked;
904 		seg_ppurge_all(1);
905 		if (seg_plocked == old_plocked) {
906 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
907 				return (SEGP_FAIL);
908 			}
909 		} else
910 			stall_count = 0;
911 		if (seg_plocked != 0)
912 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
913 	}
914 	return (SEGP_SUCCESS);
915 }
916 
917 /*
918  * Attempt to purge seg_pcache.  May need to return before this has
919  * completed to allow other pre_del callbacks to unlock pages. This is
920  * ok because:
921  *	1) The seg_pdisable flag has been set so at least we won't
922  *	cache anymore locks and the locks we couldn't purge
923  *	will not be held if they do get released by a subsequent
924  *	pre-delete callback.
925  *
926  *	2) The rest of the memory delete thread processing does not
927  *	depend on the changes made in this pre-delete callback. No
928  *	panics will result, the worst that will happen is that the
929  *	DR code will timeout and cancel the delete.
930  */
931 /*ARGSUSED*/
932 static int
933 seg_p_mem_config_pre_del(
934 	void *arg,
935 	pgcnt_t delta_pages)
936 {
937 	if (seg_p_disable() != SEGP_SUCCESS)
938 		cmn_err(CE_NOTE,
939 		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
940 	return (0);
941 }
942 
943 /*ARGSUSED*/
944 static void
945 seg_p_mem_config_post_del(
946 	void *arg,
947 	pgcnt_t delta_pages,
948 	int cancelled)
949 {
950 	seg_p_enable();
951 }
952 
/*
 * Memory-config callback vector for the pagelock cache, registered by
 * seg_pinit_mem_config(): version, post-add, pre-delete and post-delete
 * handlers, in that order.
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};
959 
960 static void
961 seg_pinit_mem_config(void)
962 {
963 	int ret;
964 
965 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
966 	/*
967 	 * Want to catch this in the debug kernel. At run time, if the
968 	 * callbacks don't get run all will be OK as the disable just makes
969 	 * it more likely that the pages can be collected.
970 	 */
971 	ASSERT(ret == 0);
972 }
973 
974 extern struct seg_ops segvn_ops;
975 extern struct seg_ops segspt_shmops;
976 
977 /*
978  * Verify that segment is not a shared anonymous segment which reserves
979  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
980  * from one zone to another if any segments are shared.  This is because the
981  * last process to exit will credit the swap reservation.  This could lead
982  * to the swap being reserved by one zone, and credited to another.
983  */
984 boolean_t
985 seg_can_change_zones(struct seg *seg)
986 {
987 	struct segvn_data *svd;
988 
989 	if (seg->s_ops == &segspt_shmops)
990 		return (B_FALSE);
991 
992 	if (seg->s_ops == &segvn_ops) {
993 		svd = (struct segvn_data *)seg->s_data;
994 		if (svd->type == MAP_SHARED &&
995 		    svd->amp != NULL &&
996 		    svd->amp->swresv > 0)
997 		return (B_FALSE);
998 	}
999 	return (B_TRUE);
1000 }
1001 
1002 /*
1003  * Return swap reserved by a segment backing a private mapping.
1004  */
1005 size_t
1006 seg_swresv(struct seg *seg)
1007 {
1008 	struct segvn_data *svd;
1009 	size_t swap = 0;
1010 
1011 	if (seg->s_ops == &segvn_ops) {
1012 		svd = (struct segvn_data *)seg->s_data;
1013 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1014 			swap = svd->swresv;
1015 	}
1016 	return (swap);
1017 }
1018