1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39 /*
40 * VM - paged vnode.
41 *
42 * This file supplies vm support for the vnode operations that deal with pages.
43 */
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/time.h>
50 #include <sys/buf.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/vmsystm.h>
54 #include <sys/mman.h>
55 #include <sys/vfs.h>
56 #include <sys/cred.h>
57 #include <sys/user.h>
58 #include <sys/kmem.h>
59 #include <sys/cmn_err.h>
60 #include <sys/debug.h>
61 #include <sys/cpuvar.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/rm.h>
69 #include <vm/pvn.h>
70 #include <vm/page.h>
71 #include <vm/seg_map.h>
72 #include <vm/seg_kmem.h>
73 #include <sys/fs/swapnode.h>
74
/*
 * When non-zero, pvn_read_kluster() does not kluster and reports at most
 * one page worth of data for the requested offset.
 */
int pvn_nofodklust = 0;
/*
 * When non-zero, pvn_write_kluster() does not kluster and returns only
 * the page handed in by the caller.
 */
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

/*
 * kmem cache of page_t-sized marker entries.  It is created in pvn_init()
 * and pvn_vplist_dirty() allocates its two list-walk markers from it.
 */
static struct kmem_cache *marker_cache = NULL;
83
84 /*
85 * Find the largest contiguous block which contains `addr' for file offset
86 * `offset' in it while living within the file system block sizes (`vp_off'
87 * and `vp_len') and the address space limits for which no pages currently
88 * exist and which map to consecutive file offsets.
89 */
90 page_t *
pvn_read_kluster(struct vnode * vp,u_offset_t off,struct seg * seg,caddr_t addr,u_offset_t * offp,size_t * lenp,u_offset_t vp_off,size_t vp_len,int isra)91 pvn_read_kluster(
92 struct vnode *vp,
93 u_offset_t off,
94 struct seg *seg,
95 caddr_t addr,
96 u_offset_t *offp, /* return values */
97 size_t *lenp, /* return values */
98 u_offset_t vp_off,
99 size_t vp_len,
100 int isra)
101 {
102 ssize_t deltaf, deltab;
103 page_t *pp;
104 page_t *plist = NULL;
105 spgcnt_t pagesavail;
106 u_offset_t vp_end;
107
108 ASSERT(off >= vp_off && off < vp_off + vp_len);
109
110 /*
111 * We only want to do klustering/read ahead if there
112 * is more than minfree pages currently available.
113 */
114 pagesavail = freemem - minfree;
115
116 if (pagesavail <= 0)
117 if (isra)
118 return ((page_t *)NULL); /* ra case - give up */
119 else
120 pagesavail = 1; /* must return a page */
121
122 /* We calculate in pages instead of bytes due to 32-bit overflows */
123 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
124 /*
125 * Don't have enough free memory for the
126 * max request, try sizing down vp request.
127 */
128 deltab = (ssize_t)(off - vp_off);
129 vp_len -= deltab;
130 vp_off += deltab;
131 if (pagesavail < btopr(vp_len)) {
132 /*
133 * Still not enough memory, just settle for
134 * pagesavail which is at least 1.
135 */
136 vp_len = ptob(pagesavail);
137 }
138 }
139
140 vp_end = vp_off + vp_len;
141 ASSERT(off >= vp_off && off < vp_end);
142
143 if (isra && SEGOP_KLUSTER(seg, addr, 0))
144 return ((page_t *)NULL); /* segment driver says no */
145
146 if ((plist = page_create_va(vp, off,
147 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
148 return ((page_t *)NULL);
149
150 if (vp_len <= PAGESIZE || pvn_nofodklust) {
151 *offp = off;
152 *lenp = MIN(vp_len, PAGESIZE);
153 } else {
154 /*
155 * Scan back from front by incrementing "deltab" and
156 * comparing "off" with "vp_off + deltab" to avoid
157 * "signed" versus "unsigned" conversion problems.
158 */
159 for (deltab = PAGESIZE; off >= vp_off + deltab;
160 deltab += PAGESIZE) {
161 /*
162 * Call back to the segment driver to verify that
163 * the klustering/read ahead operation makes sense.
164 */
165 if (SEGOP_KLUSTER(seg, addr, -deltab))
166 break; /* page not eligible */
167 if ((pp = page_create_va(vp, off - deltab,
168 PAGESIZE, PG_EXCL, seg, addr - deltab))
169 == NULL)
170 break; /* already have the page */
171 /*
172 * Add page to front of page list.
173 */
174 page_add(&plist, pp);
175 }
176 deltab -= PAGESIZE;
177
178 /* scan forward from front */
179 for (deltaf = PAGESIZE; off + deltaf < vp_end;
180 deltaf += PAGESIZE) {
181 /*
182 * Call back to the segment driver to verify that
183 * the klustering/read ahead operation makes sense.
184 */
185 if (SEGOP_KLUSTER(seg, addr, deltaf))
186 break; /* page not file extension */
187 if ((pp = page_create_va(vp, off + deltaf,
188 PAGESIZE, PG_EXCL, seg, addr + deltaf))
189 == NULL)
190 break; /* already have page */
191
192 /*
193 * Add page to end of page list.
194 */
195 page_add(&plist, pp);
196 plist = plist->p_next;
197 }
198 *offp = off = off - deltab;
199 *lenp = deltab + deltaf;
200 ASSERT(off >= vp_off);
201
202 /*
203 * If we ended up getting more than was actually
204 * requested, retract the returned length to only
205 * reflect what was requested. This might happen
206 * if we were allowed to kluster pages across a
207 * span of (say) 5 frags, and frag size is less
208 * than PAGESIZE. We need a whole number of
209 * pages to contain those frags, but the returned
210 * size should only allow the returned range to
211 * extend as far as the end of the frags.
212 */
213 if ((vp_off + vp_len) < (off + *lenp)) {
214 ASSERT(vp_end > off);
215 *lenp = vp_end - off;
216 }
217 }
218 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
219 "pvn_read_kluster:seg %p addr %x isra %x",
220 seg, addr, isra);
221 return (plist);
222 }
223
224 /*
225 * Handle pages for this vnode on either side of the page "pp"
226 * which has been locked by the caller. This routine will also
227 * do klustering in the range [vp_off, vp_off + vp_len] up
228 * until a page which is not found. The offset and length
229 * of pages included is returned in "*offp" and "*lenp".
230 *
231 * Returns a list of dirty locked pages all ready to be
232 * written back.
233 */
234 page_t *
pvn_write_kluster(struct vnode * vp,page_t * pp,u_offset_t * offp,size_t * lenp,u_offset_t vp_off,size_t vp_len,int flags)235 pvn_write_kluster(
236 struct vnode *vp,
237 page_t *pp,
238 u_offset_t *offp, /* return values */
239 size_t *lenp, /* return values */
240 u_offset_t vp_off,
241 size_t vp_len,
242 int flags)
243 {
244 u_offset_t off;
245 page_t *dirty;
246 size_t deltab, deltaf;
247 se_t se;
248 u_offset_t vp_end;
249
250 off = pp->p_offset;
251
252 /*
253 * Kustering should not be done if we are invalidating
254 * pages since we could destroy pages that belong to
255 * some other process if this is a swap vnode.
256 */
257 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
258 *offp = off;
259 *lenp = PAGESIZE;
260 return (pp);
261 }
262
263 if (flags & (B_FREE | B_INVAL))
264 se = SE_EXCL;
265 else
266 se = SE_SHARED;
267
268 dirty = pp;
269 /*
270 * Scan backwards looking for pages to kluster by incrementing
271 * "deltab" and comparing "off" with "vp_off + deltab" to
272 * avoid "signed" versus "unsigned" conversion problems.
273 */
274 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
275 pp = page_lookup_nowait(vp, off - deltab, se);
276 if (pp == NULL)
277 break; /* page not found */
278 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
279 break;
280 page_add(&dirty, pp);
281 }
282 deltab -= PAGESIZE;
283
284 vp_end = vp_off + vp_len;
285 /* now scan forwards looking for pages to kluster */
286 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
287 pp = page_lookup_nowait(vp, off + deltaf, se);
288 if (pp == NULL)
289 break; /* page not found */
290 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
291 break;
292 page_add(&dirty, pp);
293 dirty = dirty->p_next;
294 }
295
296 *offp = off - deltab;
297 *lenp = deltab + deltaf;
298 return (dirty);
299 }
300
301 /*
302 * Generic entry point used to release the "shared/exclusive" lock
303 * and the "p_iolock" on pages after i/o is complete.
304 */
305 void
pvn_io_done(page_t * plist)306 pvn_io_done(page_t *plist)
307 {
308 page_t *pp;
309
310 while (plist != NULL) {
311 pp = plist;
312 page_sub(&plist, pp);
313 page_io_unlock(pp);
314 page_unlock(pp);
315 }
316 }
317
318 /*
319 * Entry point to be used by file system getpage subr's and
320 * other such routines which either want to unlock pages (B_ASYNC
321 * request) or destroy a list of pages if an error occurred.
322 */
323 void
pvn_read_done(page_t * plist,int flags)324 pvn_read_done(page_t *plist, int flags)
325 {
326 page_t *pp;
327
328 while (plist != NULL) {
329 pp = plist;
330 page_sub(&plist, pp);
331 page_io_unlock(pp);
332 if (flags & B_ERROR) {
333 /*LINTED: constant in conditional context*/
334 VN_DISPOSE(pp, B_INVAL, 0, kcred);
335 } else {
336 (void) page_release(pp, 0);
337 }
338 }
339 }
340
/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 *
 * pvn_write_done() adds B_FREE to its flags when write_free is non-zero,
 * the write did not fail (B_ERROR clear) and freemem is below
 * lotsfree + pages_before_pager.
 */
int write_free = 1;
pgcnt_t pages_before_pager = 200;	/* LMXXX */
348
/*
 * Routine to be called when page-out's complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be re-tried later.
 *
 *	plist	list of pages that were written; every page must hold both
 *		its page lock and its i/o lock (asserted below)
 *	flags	must not contain B_READ.  B_ERROR, B_INVAL, B_FORCE, B_FREE
 *		and B_DONTNEED select how each page is disposed of.
 *
 * Per-page outcome:
 *	B_ERROR with B_INVAL|B_FORCE	page is destroyed via VN_DISPOSE
 *	B_ERROR otherwise		mod bit set again, locks released
 *	B_INVAL				page is destroyed via VN_DISPOSE
 *	B_FREE or page not mapped	page is freed if the lock can be
 *					upgraded and it is neither locked
 *					down nor referenced/modified
 *	anything else			locks released
 *
 * The per-CPU vm statistics are updated once all pages have been handled.
 */

void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;		/* pages handed to VN_DISPOSE(B_FREE) */
	int pgrec = 0;		/* pages kept because ref/mod was set */
	int pgout = 0;		/* set to 1 once any page counts as paged out */
	int pgpgout = 0;	/* pages counted as paged out */
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/*
		 * Kernel probe support.  Note that vp is latched from the
		 * first page only and is also what the VMODSORT handling
		 * below operates on.
		 */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			/* p_vpnext == pp means pp is the only page listed */
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE is set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					/* ref or mod is set: keep the page */
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	/* Fold the counts gathered above into this CPU's vm statistics. */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}
574
/*
 * Decide whether the locked page "pp" has to be written back.
 *
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked: exclusively when
 * B_INVAL or B_FREE is set, shared otherwise (asserted below).  Returns 1
 * if the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 *	If B_DELWRI is set, which implies that this request is
	 *	due to a klustering operation.
	 *
	 *	If this is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 *	that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			/* i/o lock is busy: give the page up, no waiting */
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);


	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}
693
694
695 /*ARGSUSED*/
696 static int
marker_constructor(void * buf,void * cdrarg,int kmflags)697 marker_constructor(void *buf, void *cdrarg, int kmflags)
698 {
699 page_t *mark = buf;
700 bzero(mark, sizeof (page_t));
701 mark->p_hash = PVN_VPLIST_HASH_TAG;
702 return (0);
703 }
704
705 void
pvn_init()706 pvn_init()
707 {
708 if (pvn_vmodsort_disable == 0)
709 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
710 marker_cache = kmem_cache_create("marker_cache",
711 sizeof (page_t), 0, marker_constructor,
712 NULL, NULL, NULL, NULL, 0);
713 }
714
715
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 *
 *	vp		vnode whose v_pages list is walked; must not be VCHR
 *	off		pages with p_offset below this are skipped
 *	putapage	called for every page for which pvn_getdirty()
 *			returns non-zero
 *	flags		see above; also passed to pvn_getdirty() and putapage
 *	cred		passed through to putapage
 *
 * Returns 0 if the vnode has no pages, EAGAIN if B_ASYNC is set and
 * another walk already holds VVMLOCK on this vnode, and otherwise the
 * first non-zero value returned by putapage (0 if there was none).
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	/* Unlocked fast path; v_pages is checked again under v_lock below. */
	if (vp->v_pages == NULL)
		return (0);


	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);


	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	/* Nothing to walk; the markers were never linked in. */
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 * (The test is an exact match: B_ASYNC with no other flag.)
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		/*
		 * pp is the page just ahead of the marker; where_to_move
		 * is the link through which the marker is re-inserted once
		 * pp has been passed.
		 */
		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page. Since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, If we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			/* remember only the first failure */
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	/* Unlink both markers before the list mutex is released. */
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
994
995 /*
996 * Walk the vp->v_pages list, for every page call the callback function
997 * pointed by *page_check. If page_check returns non-zero, then mark the
998 * page as modified and if VMODSORT is set, move it to the end of v_pages
999 * list. Moving makes sense only if we have at least two pages - this also
1000 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1001 * if there was just one page.
1002 */
1003 void
pvn_vplist_setdirty(vnode_t * vp,int (* page_check)(page_t *))1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1005 {
1006 page_t *pp, *next, *end;
1007 kmutex_t *vphm;
1008 int shuffle;
1009
1010 vphm = page_vnode_mutex(vp);
1011 mutex_enter(vphm);
1012
1013 if (vp->v_pages == NULL) {
1014 mutex_exit(vphm);
1015 return;
1016 }
1017
1018 end = vp->v_pages->p_vpprev;
1019 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1020 pp = vp->v_pages;
1021
1022 for (;;) {
1023 next = pp->p_vpnext;
1024 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1025 /*
1026 * hat_setmod_only() in contrast to hat_setmod() does
1027 * not shuffle the pages and does not grab the mutex
1028 * page_vnode_mutex. Exactly what we need.
1029 */
1030 hat_setmod_only(pp);
1031 if (shuffle) {
1032 page_vpsub(&vp->v_pages, pp);
1033 ASSERT(vp->v_pages != NULL);
1034 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1035 pp);
1036 }
1037 }
1038 /* Stop if we have just processed the last page. */
1039 if (pp == end)
1040 break;
1041 pp = next;
1042 }
1043
1044 mutex_exit(vphm);
1045 }
1046
1047 /*
1048 * Zero out zbytes worth of data. Caller should be aware that this
1049 * routine may enter back into the fs layer (xxx_getpage). Locks
1050 * that the xxx_getpage routine may need should not be held while
1051 * calling this.
1052 */
1053 void
pvn_vpzero(struct vnode * vp,u_offset_t vplen,size_t zbytes)1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1055 {
1056 caddr_t addr;
1057
1058 ASSERT(vp->v_type != VCHR);
1059
1060 if (vp->v_pages == NULL)
1061 return;
1062
1063 /*
1064 * zbytes may be zero but there still may be some portion of
1065 * a page which needs clearing (since zbytes is a function
1066 * of filesystem block size, not pagesize.)
1067 */
1068 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1069 return;
1070
1071 /*
1072 * We get the last page and handle the partial
1073 * zeroing via kernel mappings. This will make the page
1074 * dirty so that we know that when this page is written
1075 * back, the zeroed information will go out with it. If
1076 * the page is not currently in memory, then the kzero
1077 * operation will cause it to be brought it. We use kzero
1078 * instead of bzero so that if the page cannot be read in
1079 * for any reason, the system will not panic. We need
1080 * to zero out a minimum of the fs given zbytes, but we
1081 * might also have to do more to get the entire last page.
1082 */
1083
1084 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1085 panic("pvn_vptrunc zbytes");
1086 addr = segmap_getmapflt(segkmap, vp, vplen,
1087 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1088 (void) kzero(addr + (vplen & MAXBOFFSET),
1089 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1090 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1091 }
1092
1093 /*
1094 * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095 * calling the getpage helper for each.
1096 */
1097 int
pvn_getpages(int (* getpage)(vnode_t *,u_offset_t,size_t,uint_t *,page_t * [],size_t,struct seg *,caddr_t,enum seg_rw,cred_t *),struct vnode * vp,u_offset_t off,size_t len,uint_t * protp,page_t * pl[],size_t plsz,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cred)1098 pvn_getpages(
1099 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1100 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1101 struct vnode *vp,
1102 u_offset_t off,
1103 size_t len,
1104 uint_t *protp,
1105 page_t *pl[],
1106 size_t plsz,
1107 struct seg *seg,
1108 caddr_t addr,
1109 enum seg_rw rw,
1110 struct cred *cred)
1111 {
1112 page_t **ppp;
1113 u_offset_t o, eoff;
1114 size_t sz, xlen;
1115 int err;
1116
1117 /* ensure that we have enough space */
1118 ASSERT(pl == NULL || plsz >= len);
1119
1120 /*
1121 * Loop one page at a time and let getapage function fill
1122 * in the next page in array. We only allow one page to be
1123 * returned at a time (except for the last page) so that we
1124 * don't have any problems with duplicates and other such
1125 * painful problems. This is a very simple minded algorithm,
1126 * but it does the job correctly. We hope that the cost of a
1127 * getapage call for a resident page that we might have been
1128 * able to get from an earlier call doesn't cost too much.
1129 */
1130 ppp = pl;
1131 sz = (pl != NULL) ? PAGESIZE : 0;
1132 eoff = off + len;
1133 xlen = len;
1134 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135 xlen -= PAGESIZE) {
1136 if (o + PAGESIZE >= eoff && pl != NULL) {
1137 /*
1138 * Last time through - allow the all of
1139 * what's left of the pl[] array to be used.
1140 */
1141 sz = plsz - (o - off);
1142 }
1143 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144 rw, cred);
1145 if (err) {
1146 /*
1147 * Release any pages we already got.
1148 */
1149 if (o > off && pl != NULL) {
1150 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151 (void) page_release(*ppp, 1);
1152 }
1153 break;
1154 }
1155 if (pl != NULL)
1156 ppp++;
1157 }
1158 return (err);
1159 }
1160
1161 /*
1162 * Initialize the page list array.
1163 */
1164 /*ARGSUSED*/
1165 void
pvn_plist_init(page_t * pp,page_t * pl[],size_t plsz,u_offset_t off,size_t io_len,enum seg_rw rw)1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167 u_offset_t off, size_t io_len, enum seg_rw rw)
1168 {
1169 ssize_t sz;
1170 page_t *ppcur, **ppp;
1171
1172 /*
1173 * Set up to load plsz worth
1174 * starting at the needed page.
1175 */
1176 while (pp != NULL && pp->p_offset != off) {
1177 /*
1178 * Remove page from the i/o list,
1179 * release the i/o and the page lock.
1180 */
1181 ppcur = pp;
1182 page_sub(&pp, ppcur);
1183 page_io_unlock(ppcur);
1184 (void) page_release(ppcur, 1);
1185 }
1186
1187 if (pp == NULL) {
1188 pl[0] = NULL;
1189 return;
1190 }
1191
1192 sz = plsz;
1193
1194 /*
1195 * Initialize the page list array.
1196 */
1197 ppp = pl;
1198 do {
1199 ppcur = pp;
1200 *ppp++ = ppcur;
1201 page_sub(&pp, ppcur);
1202 page_io_unlock(ppcur);
1203 if (rw != S_CREATE)
1204 page_downgrade(ppcur);
1205 sz -= PAGESIZE;
1206 } while (sz > 0 && pp != NULL);
1207 *ppp = NULL; /* terminate list */
1208
1209 /*
1210 * Now free the remaining pages that weren't
1211 * loaded in the page list.
1212 */
1213 while (pp != NULL) {
1214 ppcur = pp;
1215 page_sub(&pp, ppcur);
1216 page_io_unlock(ppcur);
1217 (void) page_release(ppcur, 1);
1218 }
1219 }
1220