1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39 /*
40 * VM - paged vnode.
41 *
42 * This file supplies vm support for the vnode operations that deal with pages.
43 */
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/time.h>
50 #include <sys/buf.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/vmsystm.h>
54 #include <sys/mman.h>
55 #include <sys/vfs.h>
56 #include <sys/cred.h>
57 #include <sys/user.h>
58 #include <sys/kmem.h>
59 #include <sys/cmn_err.h>
60 #include <sys/debug.h>
61 #include <sys/cpuvar.h>
62 #include <sys/vtrace.h>
63
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/rm.h>
68 #include <vm/pvn.h>
69 #include <vm/page.h>
70 #include <vm/seg_map.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/fs/swapnode.h>
73
74 int pvn_nofodklust = 0;
75 int pvn_write_noklust = 0;
76
77 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
78 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
79 /* support for vmodsort for testing */
80
81 static struct kmem_cache *marker_cache = NULL;
82
83 /*
84 * Find the largest contiguous block which contains `addr' for file offset
85 * `offset' in it while living within the file system block sizes (`vp_off'
86 * and `vp_len') and the address space limits for which no pages currently
87 * exist and which map to consecutive file offsets.
88 */
89 page_t *
pvn_read_kluster(struct vnode * vp,u_offset_t off,struct seg * seg,caddr_t addr,u_offset_t * offp,size_t * lenp,u_offset_t vp_off,size_t vp_len,int isra)90 pvn_read_kluster(
91 struct vnode *vp,
92 u_offset_t off,
93 struct seg *seg,
94 caddr_t addr,
95 u_offset_t *offp, /* return values */
96 size_t *lenp, /* return values */
97 u_offset_t vp_off,
98 size_t vp_len,
99 int isra)
100 {
101 ssize_t deltaf, deltab;
102 page_t *pp;
103 page_t *plist = NULL;
104 spgcnt_t pagesavail;
105 u_offset_t vp_end;
106
107 ASSERT(off >= vp_off && off < vp_off + vp_len);
108
109 /*
110 * We only want to do klustering/read ahead if there
111 * is more than minfree pages currently available.
112 */
113 pagesavail = freemem - minfree;
114
115 if (pagesavail <= 0)
116 if (isra)
117 return ((page_t *)NULL); /* ra case - give up */
118 else
119 pagesavail = 1; /* must return a page */
120
121 /* We calculate in pages instead of bytes due to 32-bit overflows */
122 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
123 /*
124 * Don't have enough free memory for the
125 * max request, try sizing down vp request.
126 */
127 deltab = (ssize_t)(off - vp_off);
128 vp_len -= deltab;
129 vp_off += deltab;
130 if (pagesavail < btopr(vp_len)) {
131 /*
132 * Still not enough memory, just settle for
133 * pagesavail which is at least 1.
134 */
135 vp_len = ptob(pagesavail);
136 }
137 }
138
139 vp_end = vp_off + vp_len;
140 ASSERT(off >= vp_off && off < vp_end);
141
142 if (isra && SEGOP_KLUSTER(seg, addr, 0))
143 return ((page_t *)NULL); /* segment driver says no */
144
145 if ((plist = page_create_va(vp, off,
146 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
147 return ((page_t *)NULL);
148
149 if (vp_len <= PAGESIZE || pvn_nofodklust) {
150 *offp = off;
151 *lenp = MIN(vp_len, PAGESIZE);
152 } else {
153 /*
154 * Scan back from front by incrementing "deltab" and
155 * comparing "off" with "vp_off + deltab" to avoid
156 * "signed" versus "unsigned" conversion problems.
157 */
158 for (deltab = PAGESIZE; off >= vp_off + deltab;
159 deltab += PAGESIZE) {
160 /*
161 * Call back to the segment driver to verify that
162 * the klustering/read ahead operation makes sense.
163 */
164 if (SEGOP_KLUSTER(seg, addr, -deltab))
165 break; /* page not eligible */
166 if ((pp = page_create_va(vp, off - deltab,
167 PAGESIZE, PG_EXCL, seg, addr - deltab))
168 == NULL)
169 break; /* already have the page */
170 /*
171 * Add page to front of page list.
172 */
173 page_add(&plist, pp);
174 }
175 deltab -= PAGESIZE;
176
177 /* scan forward from front */
178 for (deltaf = PAGESIZE; off + deltaf < vp_end;
179 deltaf += PAGESIZE) {
180 /*
181 * Call back to the segment driver to verify that
182 * the klustering/read ahead operation makes sense.
183 */
184 if (SEGOP_KLUSTER(seg, addr, deltaf))
185 break; /* page not file extension */
186 if ((pp = page_create_va(vp, off + deltaf,
187 PAGESIZE, PG_EXCL, seg, addr + deltaf))
188 == NULL)
189 break; /* already have page */
190
191 /*
192 * Add page to end of page list.
193 */
194 page_add(&plist, pp);
195 plist = plist->p_next;
196 }
197 *offp = off = off - deltab;
198 *lenp = deltab + deltaf;
199 ASSERT(off >= vp_off);
200
201 /*
202 * If we ended up getting more than was actually
203 * requested, retract the returned length to only
204 * reflect what was requested. This might happen
205 * if we were allowed to kluster pages across a
206 * span of (say) 5 frags, and frag size is less
207 * than PAGESIZE. We need a whole number of
208 * pages to contain those frags, but the returned
209 * size should only allow the returned range to
210 * extend as far as the end of the frags.
211 */
212 if ((vp_off + vp_len) < (off + *lenp)) {
213 ASSERT(vp_end > off);
214 *lenp = vp_end - off;
215 }
216 }
217 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
218 "pvn_read_kluster:seg %p addr %x isra %x",
219 seg, addr, isra);
220 return (plist);
221 }
222
223 /*
224 * Handle pages for this vnode on either side of the page "pp"
225 * which has been locked by the caller. This routine will also
226 * do klustering in the range [vp_off, vp_off + vp_len] up
227 * until a page which is not found. The offset and length
228 * of pages included is returned in "*offp" and "*lenp".
229 *
230 * Returns a list of dirty locked pages all ready to be
231 * written back.
232 */
233 page_t *
pvn_write_kluster(struct vnode * vp,page_t * pp,u_offset_t * offp,size_t * lenp,u_offset_t vp_off,size_t vp_len,int flags)234 pvn_write_kluster(
235 struct vnode *vp,
236 page_t *pp,
237 u_offset_t *offp, /* return values */
238 size_t *lenp, /* return values */
239 u_offset_t vp_off,
240 size_t vp_len,
241 int flags)
242 {
243 u_offset_t off;
244 page_t *dirty;
245 size_t deltab, deltaf;
246 se_t se;
247 u_offset_t vp_end;
248
249 off = pp->p_offset;
250
251 /*
252 * Kustering should not be done if we are invalidating
253 * pages since we could destroy pages that belong to
254 * some other process if this is a swap vnode.
255 */
256 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
257 *offp = off;
258 *lenp = PAGESIZE;
259 return (pp);
260 }
261
262 if (flags & (B_FREE | B_INVAL))
263 se = SE_EXCL;
264 else
265 se = SE_SHARED;
266
267 dirty = pp;
268 /*
269 * Scan backwards looking for pages to kluster by incrementing
270 * "deltab" and comparing "off" with "vp_off + deltab" to
271 * avoid "signed" versus "unsigned" conversion problems.
272 */
273 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
274 pp = page_lookup_nowait(vp, off - deltab, se);
275 if (pp == NULL)
276 break; /* page not found */
277 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
278 break;
279 page_add(&dirty, pp);
280 }
281 deltab -= PAGESIZE;
282
283 vp_end = vp_off + vp_len;
284 /* now scan forwards looking for pages to kluster */
285 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
286 pp = page_lookup_nowait(vp, off + deltaf, se);
287 if (pp == NULL)
288 break; /* page not found */
289 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
290 break;
291 page_add(&dirty, pp);
292 dirty = dirty->p_next;
293 }
294
295 *offp = off - deltab;
296 *lenp = deltab + deltaf;
297 return (dirty);
298 }
299
300 /*
301 * Generic entry point used to release the "shared/exclusive" lock
302 * and the "p_iolock" on pages after i/o is complete.
303 */
304 void
pvn_io_done(page_t * plist)305 pvn_io_done(page_t *plist)
306 {
307 page_t *pp;
308
309 while (plist != NULL) {
310 pp = plist;
311 page_sub(&plist, pp);
312 page_io_unlock(pp);
313 page_unlock(pp);
314 }
315 }
316
317 /*
318 * Entry point to be used by file system getpage subr's and
319 * other such routines which either want to unlock pages (B_ASYNC
320 * request) or destroy a list of pages if an error occurred.
321 */
322 void
pvn_read_done(page_t * plist,int flags)323 pvn_read_done(page_t *plist, int flags)
324 {
325 page_t *pp;
326
327 while (plist != NULL) {
328 pp = plist;
329 page_sub(&plist, pp);
330 page_io_unlock(pp);
331 if (flags & B_ERROR) {
332 /*LINTED: constant in conditional context*/
333 VN_DISPOSE(pp, B_INVAL, 0, kcred);
334 } else {
335 (void) page_release(pp, 0);
336 }
337 }
338 }
339
340 /*
341 * Automagic pageout.
342 * When memory gets tight, start freeing pages popping out of the
343 * write queue.
344 */
345 int write_free = 1;
346 pgcnt_t pages_before_pager = 200; /* LMXXX */
347
348 /*
349 * Routine to be called when page-out's complete.
350 * The caller, typically VOP_PUTPAGE, has to explicity call this routine
351 * after waiting for i/o to complete (biowait) to free the list of
352 * pages associated with the buffer. These pages must be locked
353 * before i/o is initiated.
354 *
355 * If a write error occurs, the pages are marked as modified
356 * so the write will be re-tried later.
357 */
358
359 void
pvn_write_done(page_t * plist,int flags)360 pvn_write_done(page_t *plist, int flags)
361 {
362 int dfree = 0;
363 int pgrec = 0;
364 int pgout = 0;
365 int pgpgout = 0;
366 int anonpgout = 0;
367 int anonfree = 0;
368 int fspgout = 0;
369 int fsfree = 0;
370 int execpgout = 0;
371 int execfree = 0;
372 page_t *pp;
373 struct cpu *cpup;
374 struct vnode *vp = NULL; /* for probe */
375 uint_t ppattr;
376 kmutex_t *vphm = NULL;
377
378 ASSERT((flags & B_READ) == 0);
379
380 /*
381 * If we are about to start paging anyway, start freeing pages.
382 */
383 if (write_free && freemem < lotsfree + pages_before_pager &&
384 (flags & B_ERROR) == 0) {
385 flags |= B_FREE;
386 }
387
388 /*
389 * Handle each page involved in the i/o operation.
390 */
391 while (plist != NULL) {
392 pp = plist;
393 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
394 page_sub(&plist, pp);
395
396 /* Kernel probe support */
397 if (vp == NULL)
398 vp = pp->p_vnode;
399
400 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
401 /*
402 * Move page to the top of the v_page list.
403 * Skip pages modified during IO.
404 */
405 vphm = page_vnode_mutex(vp);
406 mutex_enter(vphm);
407 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
408 page_vpsub(&vp->v_pages, pp);
409 page_vpadd(&vp->v_pages, pp);
410 }
411 mutex_exit(vphm);
412 }
413
414 if (flags & B_ERROR) {
415 /*
416 * Write operation failed. We don't want
417 * to destroy (or free) the page unless B_FORCE
418 * is set. We set the mod bit again and release
419 * all locks on the page so that it will get written
420 * back again later when things are hopefully
421 * better again.
422 * If B_INVAL and B_FORCE is set we really have
423 * to destroy the page.
424 */
425 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
426 page_io_unlock(pp);
427 /*LINTED: constant in conditional context*/
428 VN_DISPOSE(pp, B_INVAL, 0, kcred);
429 } else {
430 hat_setmod_only(pp);
431 page_io_unlock(pp);
432 page_unlock(pp);
433 }
434 } else if (flags & B_INVAL) {
435 /*
436 * XXX - Failed writes with B_INVAL set are
437 * not handled appropriately.
438 */
439 page_io_unlock(pp);
440 /*LINTED: constant in conditional context*/
441 VN_DISPOSE(pp, B_INVAL, 0, kcred);
442 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
443 /*
444 * Update statistics for pages being paged out
445 */
446 if (pp->p_vnode) {
447 if (IS_SWAPFSVP(pp->p_vnode)) {
448 anonpgout++;
449 } else {
450 if (pp->p_vnode->v_flag & VVMEXEC) {
451 execpgout++;
452 } else {
453 fspgout++;
454 }
455 }
456 }
457 page_io_unlock(pp);
458 pgout = 1;
459 pgpgout++;
460 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
461 "page_ws_out:pp %p", pp);
462
463 /*
464 * The page_struct_lock need not be acquired to
465 * examine "p_lckcnt" and "p_cowcnt" since we'll
466 * have an "exclusive" lock if the upgrade succeeds.
467 */
468 if (page_tryupgrade(pp) &&
469 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
470 /*
471 * Check if someone has reclaimed the
472 * page. If ref and mod are not set, no
473 * one is using it so we can free it.
474 * The rest of the system is careful
475 * to use the NOSYNC flag to unload
476 * translations set up for i/o w/o
477 * affecting ref and mod bits.
478 *
479 * Obtain a copy of the real hardware
480 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
481 * to avoid having to flush the cache.
482 */
483 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
484 HAT_SYNC_STOPON_MOD);
485 ck_refmod:
486 if (!(ppattr & (P_REF | P_MOD))) {
487 if (hat_page_is_mapped(pp)) {
488 /*
489 * Doesn't look like the page
490 * was modified so now we
491 * really have to unload the
492 * translations. Meanwhile
493 * another CPU could've
494 * modified it so we have to
495 * check again. We don't loop
496 * forever here because now
497 * the translations are gone
498 * and no one can get a new one
499 * since we have the "exclusive"
500 * lock on the page.
501 */
502 (void) hat_pageunload(pp,
503 HAT_FORCE_PGUNLOAD);
504 ppattr = hat_page_getattr(pp,
505 P_REF | P_MOD);
506 goto ck_refmod;
507 }
508 /*
509 * Update statistics for pages being
510 * freed
511 */
512 if (pp->p_vnode) {
513 if (IS_SWAPFSVP(pp->p_vnode)) {
514 anonfree++;
515 } else {
516 if (pp->p_vnode->v_flag
517 & VVMEXEC) {
518 execfree++;
519 } else {
520 fsfree++;
521 }
522 }
523 }
524 /*LINTED: constant in conditional ctx*/
525 VN_DISPOSE(pp, B_FREE,
526 (flags & B_DONTNEED), kcred);
527 dfree++;
528 } else {
529 page_unlock(pp);
530 pgrec++;
531 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
532 "page_ws_free:pp %p", pp);
533 }
534 } else {
535 /*
536 * Page is either `locked' in memory
537 * or was reclaimed and now has a
538 * "shared" lock, so release it.
539 */
540 page_unlock(pp);
541 }
542 } else {
543 /*
544 * Neither B_FREE nor B_INVAL nor B_ERROR.
545 * Just release locks.
546 */
547 page_io_unlock(pp);
548 page_unlock(pp);
549 }
550 }
551
552 CPU_STATS_ENTER_K();
553 cpup = CPU; /* get cpup now that CPU cannot change */
554 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
555 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
556 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
557 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
558 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
559 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
560 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
561 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
562 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
563 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
564 CPU_STATS_EXIT_K();
565 }
566
567 /*
568 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
569 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
570 * operation and is only to be considered if it doesn't involve any
571 * waiting here. B_TRUNC indicates that the file is being truncated
572 * and so no i/o needs to be done. B_FORCE indicates that the page
573 * must be destroyed so don't try wrting it out.
574 *
575 * The caller must ensure that the page is locked. Returns 1, if
576 * the page should be written back (the "iolock" is held in this
577 * case), or 0 if the page has been dealt with or has been
578 * unlocked.
579 */
580 int
pvn_getdirty(page_t * pp,int flags)581 pvn_getdirty(page_t *pp, int flags)
582 {
583 ASSERT((flags & (B_INVAL | B_FREE)) ?
584 PAGE_EXCL(pp) : PAGE_SHARED(pp));
585 ASSERT(PP_ISFREE(pp) == 0);
586
587 /*
588 * If trying to invalidate or free a logically `locked' page,
589 * forget it. Don't need page_struct_lock to check p_lckcnt and
590 * p_cowcnt as the page is exclusively locked.
591 */
592 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
593 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
594 page_unlock(pp);
595 return (0);
596 }
597
598 /*
599 * Now acquire the i/o lock so we can add it to the dirty
600 * list (if necessary). We avoid blocking on the i/o lock
601 * in the following cases:
602 *
603 * If B_DELWRI is set, which implies that this request is
604 * due to a klustering operartion.
605 *
606 * If this is an async (B_ASYNC) operation and we are not doing
607 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
608 * that the the page is written out].
609 */
610 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
611 if (!page_io_trylock(pp)) {
612 page_unlock(pp);
613 return (0);
614 }
615 } else {
616 page_io_lock(pp);
617 }
618
619 /*
620 * If we want to free or invalidate the page then
621 * we need to unload it so that anyone who wants
622 * it will have to take a minor fault to get it.
623 * Otherwise, we're just writing the page back so we
624 * need to sync up the hardwre and software mod bit to
625 * detect any future modifications. We clear the
626 * software mod bit when we put the page on the dirty
627 * list.
628 */
629 if (flags & (B_INVAL | B_FREE)) {
630 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
631 } else {
632 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
633 }
634
635 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
636 /*
637 * Don't need to add it to the
638 * list after all.
639 */
640 page_io_unlock(pp);
641 if (flags & B_INVAL) {
642 /*LINTED: constant in conditional context*/
643 VN_DISPOSE(pp, B_INVAL, 0, kcred);
644 } else if (flags & B_FREE) {
645 /*LINTED: constant in conditional context*/
646 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
647 } else {
648 /*
649 * This is advisory path for the callers
650 * of VOP_PUTPAGE() who prefer freeing the
651 * page _only_ if no one else is accessing it.
652 * E.g. segmap_release()
653 *
654 * The above hat_ismod() check is useless because:
655 * (1) we may not be holding SE_EXCL lock;
656 * (2) we've not unloaded _all_ translations
657 *
658 * Let page_release() do the heavy-lifting.
659 */
660 (void) page_release(pp, 1);
661 }
662 return (0);
663 }
664
665 /*
666 * Page is dirty, get it ready for the write back
667 * and add page to the dirty list.
668 */
669 hat_clrrefmod(pp);
670
671 /*
672 * If we're going to free the page when we're done
673 * then we can let others try to use it starting now.
674 * We'll detect the fact that they used it when the
675 * i/o is done and avoid freeing the page.
676 */
677 if (flags & B_FREE)
678 page_downgrade(pp);
679
680
681 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
682
683 return (1);
684 }
685
686
687 /*ARGSUSED*/
688 static int
marker_constructor(void * buf,void * cdrarg,int kmflags)689 marker_constructor(void *buf, void *cdrarg, int kmflags)
690 {
691 page_t *mark = buf;
692 bzero(mark, sizeof (page_t));
693 mark->p_hash = PVN_VPLIST_HASH_TAG;
694 return (0);
695 }
696
697 void
pvn_init()698 pvn_init()
699 {
700 if (pvn_vmodsort_disable == 0)
701 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
702 marker_cache = kmem_cache_create("marker_cache",
703 sizeof (page_t), 0, marker_constructor,
704 NULL, NULL, NULL, NULL, 0);
705 }
706
707
708 /*
709 * Process a vnode's page list for all pages whose offset is >= off.
710 * Pages are to either be free'd, invalidated, or written back to disk.
711 *
712 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
713 * is specified, otherwise they are "shared" locked.
714 *
715 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
716 *
717 * Special marker page_t's are inserted in the list in order
718 * to keep track of where we are in the list when locks are dropped.
719 *
720 * Note the list is circular and insertions can happen only at the
721 * head and tail of the list. The algorithm ensures visiting all pages
722 * on the list in the following way:
723 *
724 * Drop two marker pages at the end of the list.
725 *
726 * Move one marker page backwards towards the start of the list until
727 * it is at the list head, processing the pages passed along the way.
728 *
729 * Due to race conditions when the vphm mutex is dropped, additional pages
730 * can be added to either end of the list, so we'll continue to move
731 * the marker and process pages until it is up against the end marker.
732 *
733 * There is one special exit condition. If we are processing a VMODSORT
734 * vnode and only writing back modified pages, we can stop as soon as
735 * we run into an unmodified page. This makes fsync(3) operations fast.
736 */
737 int
pvn_vplist_dirty(vnode_t * vp,u_offset_t off,int (* putapage)(vnode_t *,page_t *,u_offset_t *,size_t *,int,cred_t *),int flags,cred_t * cred)738 pvn_vplist_dirty(
739 vnode_t *vp,
740 u_offset_t off,
741 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
742 size_t *, int, cred_t *),
743 int flags,
744 cred_t *cred)
745 {
746 page_t *pp;
747 page_t *mark; /* marker page that moves toward head */
748 page_t *end; /* marker page at end of list */
749 int err = 0;
750 int error;
751 kmutex_t *vphm;
752 se_t se;
753 page_t **where_to_move;
754
755 ASSERT(vp->v_type != VCHR);
756
757 if (vp->v_pages == NULL)
758 return (0);
759
760
761 /*
762 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
763 *
764 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
765 * from getting blocked while flushing pages to a dead NFS server.
766 */
767 mutex_enter(&vp->v_lock);
768 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
769 mutex_exit(&vp->v_lock);
770 return (EAGAIN);
771 }
772
773 while (vp->v_flag & VVMLOCK)
774 cv_wait(&vp->v_cv, &vp->v_lock);
775
776 if (vp->v_pages == NULL) {
777 mutex_exit(&vp->v_lock);
778 return (0);
779 }
780
781 vp->v_flag |= VVMLOCK;
782 mutex_exit(&vp->v_lock);
783
784
785 /*
786 * Set up the marker pages used to walk the list
787 */
788 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
789 end->p_vnode = vp;
790 end->p_offset = (u_offset_t)-2;
791 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
792 mark->p_vnode = vp;
793 mark->p_offset = (u_offset_t)-1;
794
795 /*
796 * Grab the lock protecting the vnode's page list
797 * note that this lock is dropped at times in the loop.
798 */
799 vphm = page_vnode_mutex(vp);
800 mutex_enter(vphm);
801 if (vp->v_pages == NULL)
802 goto leave;
803
804 /*
805 * insert the markers and loop through the list of pages
806 */
807 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
808 page_vpadd(&mark->p_vpnext, end);
809 for (;;) {
810
811 /*
812 * If only doing an async write back, then we can
813 * stop as soon as we get to start of the list.
814 */
815 if (flags == B_ASYNC && vp->v_pages == mark)
816 break;
817
818 /*
819 * otherwise stop when we've gone through all the pages
820 */
821 if (mark->p_vpprev == end)
822 break;
823
824 pp = mark->p_vpprev;
825 if (vp->v_pages == pp)
826 where_to_move = &vp->v_pages;
827 else
828 where_to_move = &pp->p_vpprev->p_vpnext;
829
830 ASSERT(pp->p_vnode == vp);
831
832 /*
833 * If just flushing dirty pages to disk and this vnode
834 * is using a sorted list of pages, we can stop processing
835 * as soon as we find an unmodified page. Since all the
836 * modified pages are visited first.
837 */
838 if (IS_VMODSORT(vp) &&
839 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
840 if (!hat_ismod(pp) && !page_io_locked(pp)) {
841 #ifdef DEBUG
842 /*
843 * For debug kernels examine what should be
844 * all the remaining clean pages, asserting
845 * that they are not modified.
846 */
847 page_t *chk = pp;
848 int attr;
849
850 page_vpsub(&vp->v_pages, mark);
851 page_vpadd(where_to_move, mark);
852 do {
853 chk = chk->p_vpprev;
854 ASSERT(chk != end);
855 if (chk == mark)
856 continue;
857 attr = hat_page_getattr(chk, P_MOD |
858 P_REF);
859 if ((attr & P_MOD) == 0)
860 continue;
861 panic("v_pages list not all clean: "
862 "page_t*=%p vnode=%p off=%lx "
863 "attr=0x%x last clean page_t*=%p\n",
864 (void *)chk, (void *)chk->p_vnode,
865 (long)chk->p_offset, attr,
866 (void *)pp);
867 } while (chk != vp->v_pages);
868 #endif
869 break;
870 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
871 /*
872 * Couldn't get io lock, wait until IO is done.
873 * Block only for sync IO since we don't want
874 * to block async IO.
875 */
876 mutex_exit(vphm);
877 page_io_wait(pp);
878 mutex_enter(vphm);
879 continue;
880 }
881 }
882
883 /*
884 * Skip this page if the offset is out of the desired range.
885 * Just move the marker and continue.
886 */
887 if (pp->p_offset < off) {
888 page_vpsub(&vp->v_pages, mark);
889 page_vpadd(where_to_move, mark);
890 continue;
891 }
892
893 /*
894 * If we are supposed to invalidate or free this
895 * page, then we need an exclusive lock.
896 */
897 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
898
899 /*
900 * We must acquire the page lock for all synchronous
901 * operations (invalidate, free and write).
902 */
903 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
904 /*
905 * If the page_lock() drops the mutex
906 * we must retry the loop.
907 */
908 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
909 continue;
910
911 /*
912 * It's ok to move the marker page now.
913 */
914 page_vpsub(&vp->v_pages, mark);
915 page_vpadd(where_to_move, mark);
916 } else {
917
918 /*
919 * update the marker page for all remaining cases
920 */
921 page_vpsub(&vp->v_pages, mark);
922 page_vpadd(where_to_move, mark);
923
924 /*
925 * For write backs, If we can't lock the page, it's
926 * invalid or in the process of being destroyed. Skip
927 * it, assuming someone else is writing it.
928 */
929 if (!page_trylock(pp, se))
930 continue;
931 }
932
933 ASSERT(pp->p_vnode == vp);
934
935 /*
936 * Successfully locked the page, now figure out what to
937 * do with it. Free pages are easily dealt with, invalidate
938 * if desired or just go on to the next page.
939 */
940 if (PP_ISFREE(pp)) {
941 if ((flags & B_INVAL) == 0) {
942 page_unlock(pp);
943 continue;
944 }
945
946 /*
947 * Invalidate (destroy) the page.
948 */
949 mutex_exit(vphm);
950 page_destroy_free(pp);
951 mutex_enter(vphm);
952 continue;
953 }
954
955 /*
956 * pvn_getdirty() figures out what do do with a dirty page.
957 * If the page is dirty, the putapage() routine will write it
958 * and will kluster any other adjacent dirty pages it can.
959 *
960 * pvn_getdirty() and `(*putapage)' unlock the page.
961 */
962 mutex_exit(vphm);
963 if (pvn_getdirty(pp, flags)) {
964 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
965 if (!err)
966 err = error;
967 }
968 mutex_enter(vphm);
969 }
970 page_vpsub(&vp->v_pages, mark);
971 page_vpsub(&vp->v_pages, end);
972
973 leave:
974 /*
975 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
976 */
977 mutex_exit(vphm);
978 kmem_cache_free(marker_cache, mark);
979 kmem_cache_free(marker_cache, end);
980 mutex_enter(&vp->v_lock);
981 vp->v_flag &= ~VVMLOCK;
982 cv_broadcast(&vp->v_cv);
983 mutex_exit(&vp->v_lock);
984 return (err);
985 }
986
987 /*
988 * Walk the vp->v_pages list, for every page call the callback function
989 * pointed by *page_check. If page_check returns non-zero, then mark the
990 * page as modified and if VMODSORT is set, move it to the end of v_pages
991 * list. Moving makes sense only if we have at least two pages - this also
992 * avoids having v_pages temporarily being NULL after calling page_vpsub()
993 * if there was just one page.
994 */
995 void
pvn_vplist_setdirty(vnode_t * vp,int (* page_check)(page_t *))996 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
997 {
998 page_t *pp, *next, *end;
999 kmutex_t *vphm;
1000 int shuffle;
1001
1002 vphm = page_vnode_mutex(vp);
1003 mutex_enter(vphm);
1004
1005 if (vp->v_pages == NULL) {
1006 mutex_exit(vphm);
1007 return;
1008 }
1009
1010 end = vp->v_pages->p_vpprev;
1011 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1012 pp = vp->v_pages;
1013
1014 for (;;) {
1015 next = pp->p_vpnext;
1016 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1017 /*
1018 * hat_setmod_only() in contrast to hat_setmod() does
1019 * not shuffle the pages and does not grab the mutex
1020 * page_vnode_mutex. Exactly what we need.
1021 */
1022 hat_setmod_only(pp);
1023 if (shuffle) {
1024 page_vpsub(&vp->v_pages, pp);
1025 ASSERT(vp->v_pages != NULL);
1026 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1027 pp);
1028 }
1029 }
1030 /* Stop if we have just processed the last page. */
1031 if (pp == end)
1032 break;
1033 pp = next;
1034 }
1035
1036 mutex_exit(vphm);
1037 }
1038
1039 /*
1040 * Zero out zbytes worth of data. Caller should be aware that this
1041 * routine may enter back into the fs layer (xxx_getpage). Locks
1042 * that the xxx_getpage routine may need should not be held while
1043 * calling this.
1044 */
1045 void
pvn_vpzero(struct vnode * vp,u_offset_t vplen,size_t zbytes)1046 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1047 {
1048 caddr_t addr;
1049
1050 ASSERT(vp->v_type != VCHR);
1051
1052 if (vp->v_pages == NULL)
1053 return;
1054
1055 /*
1056 * zbytes may be zero but there still may be some portion of
1057 * a page which needs clearing (since zbytes is a function
1058 * of filesystem block size, not pagesize.)
1059 */
1060 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1061 return;
1062
1063 /*
1064 * We get the last page and handle the partial
1065 * zeroing via kernel mappings. This will make the page
1066 * dirty so that we know that when this page is written
1067 * back, the zeroed information will go out with it. If
1068 * the page is not currently in memory, then the kzero
1069 * operation will cause it to be brought it. We use kzero
1070 * instead of bzero so that if the page cannot be read in
1071 * for any reason, the system will not panic. We need
1072 * to zero out a minimum of the fs given zbytes, but we
1073 * might also have to do more to get the entire last page.
1074 */
1075
1076 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1077 panic("pvn_vptrunc zbytes");
1078 addr = segmap_getmapflt(segkmap, vp, vplen,
1079 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1080 (void) kzero(addr + (vplen & MAXBOFFSET),
1081 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1082 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1083 }
1084
1085 /*
1086 * Handles common work of the VOP_GETPAGE routines by iterating page by page
1087 * calling the getpage helper for each.
1088 */
1089 int
pvn_getpages(int (* getpage)(vnode_t *,u_offset_t,size_t,uint_t *,page_t * [],size_t,struct seg *,caddr_t,enum seg_rw,cred_t *),struct vnode * vp,u_offset_t off,size_t len,uint_t * protp,page_t * pl[],size_t plsz,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cred)1090 pvn_getpages(
1091 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1092 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1093 struct vnode *vp,
1094 u_offset_t off,
1095 size_t len,
1096 uint_t *protp,
1097 page_t *pl[],
1098 size_t plsz,
1099 struct seg *seg,
1100 caddr_t addr,
1101 enum seg_rw rw,
1102 struct cred *cred)
1103 {
1104 page_t **ppp;
1105 u_offset_t o, eoff;
1106 size_t sz, xlen;
1107 int err;
1108
1109 /* ensure that we have enough space */
1110 ASSERT(pl == NULL || plsz >= len);
1111
1112 /*
1113 * Loop one page at a time and let getapage function fill
1114 * in the next page in array. We only allow one page to be
1115 * returned at a time (except for the last page) so that we
1116 * don't have any problems with duplicates and other such
1117 * painful problems. This is a very simple minded algorithm,
1118 * but it does the job correctly. We hope that the cost of a
1119 * getapage call for a resident page that we might have been
1120 * able to get from an earlier call doesn't cost too much.
1121 */
1122 ppp = pl;
1123 sz = (pl != NULL) ? PAGESIZE : 0;
1124 eoff = off + len;
1125 xlen = len;
1126 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1127 xlen -= PAGESIZE) {
1128 if (o + PAGESIZE >= eoff && pl != NULL) {
1129 /*
1130 * Last time through - allow the all of
1131 * what's left of the pl[] array to be used.
1132 */
1133 sz = plsz - (o - off);
1134 }
1135 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1136 rw, cred);
1137 if (err) {
1138 /*
1139 * Release any pages we already got.
1140 */
1141 if (o > off && pl != NULL) {
1142 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1143 (void) page_release(*ppp, 1);
1144 }
1145 break;
1146 }
1147 if (pl != NULL)
1148 ppp++;
1149 }
1150 return (err);
1151 }
1152
1153 /*
1154 * Initialize the page list array.
1155 */
1156 /*ARGSUSED*/
1157 void
pvn_plist_init(page_t * pp,page_t * pl[],size_t plsz,u_offset_t off,size_t io_len,enum seg_rw rw)1158 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1159 u_offset_t off, size_t io_len, enum seg_rw rw)
1160 {
1161 ssize_t sz;
1162 page_t *ppcur, **ppp;
1163
1164 /*
1165 * Set up to load plsz worth
1166 * starting at the needed page.
1167 */
1168 while (pp != NULL && pp->p_offset != off) {
1169 /*
1170 * Remove page from the i/o list,
1171 * release the i/o and the page lock.
1172 */
1173 ppcur = pp;
1174 page_sub(&pp, ppcur);
1175 page_io_unlock(ppcur);
1176 (void) page_release(ppcur, 1);
1177 }
1178
1179 if (pp == NULL) {
1180 pl[0] = NULL;
1181 return;
1182 }
1183
1184 sz = plsz;
1185
1186 /*
1187 * Initialize the page list array.
1188 */
1189 ppp = pl;
1190 do {
1191 ppcur = pp;
1192 *ppp++ = ppcur;
1193 page_sub(&pp, ppcur);
1194 page_io_unlock(ppcur);
1195 if (rw != S_CREATE)
1196 page_downgrade(ppcur);
1197 sz -= PAGESIZE;
1198 } while (sz > 0 && pp != NULL);
1199 *ppp = NULL; /* terminate list */
1200
1201 /*
1202 * Now free the remaining pages that weren't
1203 * loaded in the page list.
1204 */
1205 while (pp != NULL) {
1206 ppcur = pp;
1207 page_sub(&pp, ppcur);
1208 page_io_unlock(ppcur);
1209 (void) page_release(ppcur, 1);
1210 }
1211 }
1212