1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Portions of this source code were derived from Berkeley 4.3 BSD
32 * under license from the Regents of the University of California.
33 */
34
35 #include <sys/types.h>
36 #include <sys/t_lock.h>
37 #include <sys/param.h>
38 #include <sys/time.h>
39 #include <sys/systm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/resource.h>
42 #include <sys/signal.h>
43 #include <sys/cred.h>
44 #include <sys/user.h>
45 #include <sys/buf.h>
46 #include <sys/vfs.h>
47 #include <sys/vnode.h>
48 #include <sys/proc.h>
49 #include <sys/disp.h>
50 #include <sys/file.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/kmem.h>
54 #include <sys/uio.h>
55 #include <sys/dnlc.h>
56 #include <sys/conf.h>
57 #include <sys/mman.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/filio.h>
63 #include <sys/atomic.h>
64
65 #include <sys/fssnap_if.h>
66 #include <sys/fs/ufs_fs.h>
67 #include <sys/fs/ufs_lockfs.h>
68 #include <sys/fs/ufs_filio.h>
69 #include <sys/fs/ufs_inode.h>
70 #include <sys/fs/ufs_fsdir.h>
71 #include <sys/fs/ufs_quota.h>
72 #include <sys/fs/ufs_trans.h>
73 #include <sys/fs/ufs_panic.h>
74 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
75 #include <sys/errno.h>
76
77 #include <sys/filio.h> /* _FIOIO */
78
79 #include <vm/hat.h>
80 #include <vm/page.h>
81 #include <vm/pvn.h>
82 #include <vm/as.h>
83 #include <vm/seg.h>
84 #include <vm/seg_map.h>
85 #include <vm/seg_vn.h>
86 #include <vm/seg_kmem.h>
87 #include <vm/rm.h>
88 #include <sys/swap.h>
89 #include <sys/epm.h>
90
91 #include <fs/fs_subr.h>
92
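/*
 * Buffer of zeroes used by directio_hole() to satisfy direct reads
 * of holes; allocated in ufs_directio_init().
 */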
93 static void *ufs_directio_zero_buf;
94 static int ufs_directio_zero_len = 8192;
95
96 int ufs_directio_enabled = 1; /* feature is enabled */
97
98 /*
99 * for kstats reader
100 */
101 struct ufs_directio_kstats {
102 kstat_named_t logical_reads; /* direct read calls */
103 kstat_named_t phys_reads; /* physical read requests issued */
104 kstat_named_t hole_reads; /* reads satisfied from the zero buffer */
105 kstat_named_t nread; /* bytes read directly (incl. holes) */
106 kstat_named_t logical_writes; /* direct write calls */
107 kstat_named_t phys_writes; /* physical write requests issued */
108 kstat_named_t nwritten; /* bytes written directly */
109 kstat_named_t nflushes; /* page cache flushes done for direct IO */
110 } ufs_directio_kstats = {
111 { "logical_reads", KSTAT_DATA_UINT64 },
112 { "phys_reads", KSTAT_DATA_UINT64 },
113 { "hole_reads", KSTAT_DATA_UINT64 },
114 { "nread", KSTAT_DATA_UINT64 },
115 { "logical_writes", KSTAT_DATA_UINT64 },
116 { "phys_writes", KSTAT_DATA_UINT64 },
117 { "nwritten", KSTAT_DATA_UINT64 },
118 { "nflushes", KSTAT_DATA_UINT64 },
119 };
120
121 kstat_t *ufs_directio_kstatsp;
122
123 /*
124 * use kmem_cache_create for direct-physio buffers. This has shown
125 * a better cache distribution compared to buffers on the
126 * stack. It also avoids semaphore construction/destruction
127 * per request.
128 */
129 struct directio_buf {
130 struct directio_buf *next; /* next request in this IO's list */
131 char *addr; /* base address of this chunk */
132 size_t nbytes; /* length of this chunk */
133 struct buf buf; /* buf passed to bdev_strategy() */
134 };
135 static struct kmem_cache *directio_buf_cache;
136
137
138 /* ARGSUSED */
139 static int
140 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
141 {
142 bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
143 return (0);
144 }
145
146 /* ARGSUSED */
147 static void
148 directio_buf_destructor(void *dbp, void *cdrarg)
149 {
150 biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
151 }
152
153 void
154 directio_bufs_init(void)
155 {
156 directio_buf_cache = kmem_cache_create("directio_buf_cache",
157 sizeof (struct directio_buf), 0,
158 directio_buf_constructor, directio_buf_destructor,
159 NULL, NULL, NULL, 0);
160 }
161
162 void
163 ufs_directio_init(void)
164 {
165 /*
166 * kstats
167 */
168 ufs_directio_kstatsp = kstat_create("ufs", 0,
169 "directio", "ufs", KSTAT_TYPE_NAMED,
170 sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
171 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
172 if (ufs_directio_kstatsp) {
173 ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
174 kstat_install(ufs_directio_kstatsp);
175 }
176 /*
177 * kzero is broken so we have to use a private buf of zeroes
178 */
179 ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
180 directio_bufs_init();
181 }
182
183 /*
184 * Wait for the first direct IO operation to finish
185 */
186 static int
187 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
188 {
189 buf_t *bp;
190 int error;
191
192 /*
193 * Wait for IO to finish
194 */
195 bp = &dbp->buf;
196 error = biowait(bp);
197
198 /*
199 * bytes_iop will be used to figure out a resid
200 * for the caller. The resid is approximated by reporting
201 * the bytes following the first failed IO as the residual.
202 *
203 * I am cautious about using b_resid because I
204 * am not sure how well the disk drivers maintain it.
205 */
206 if (error)
207 if (bp->b_resid)
208 *bytes_iop = bp->b_bcount - bp->b_resid;
209 else
210 *bytes_iop = 0;
211 else
212 *bytes_iop += bp->b_bcount;
213 /*
214 * Release direct IO resources
215 */
216 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
217 kmem_cache_free(directio_buf_cache, dbp);
218 return (error);
219 }
220
221 /*
222 * Wait for all of the direct IO operations to finish
223 */
224
225 static int
226 directio_wait(struct directio_buf *tail, long *bytes_iop)
227 {
228 int error = 0, newerror;
229 struct directio_buf *dbp;
230
231 /*
232 * The linked list of directio buf structures is maintained
233 * in reverse order (tail->last request->penultimate request->...)
234 */
235 while ((dbp = tail) != NULL) {
236 tail = dbp->next;
237 newerror = directio_wait_one(dbp, bytes_iop);
238 if (error == 0)
239 error = newerror;
240 }
241 return (error);
242 }
243 /*
244 * Initiate direct IO request
245 */
246 static void
247 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
248 offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
249 struct directio_buf **tailp, page_t **pplist)
250 {
251 buf_t *bp;
252 struct directio_buf *dbp;
253
254 /*
255 * Allocate a directio buf header
256 * Note - list is maintained in reverse order.
257 * directio_wait_one() depends on this fact when
258 * adjusting the ``bytes_iop'' param. bytes_iop
259 * is used to compute a residual in the case of error.
260 */
261 dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
262 dbp->next = *tailp;
263 *tailp = dbp;
264
265 /*
266 * Initialize buf header
267 */
268 dbp->addr = addr;
269 dbp->nbytes = nbytes;
270 bp = &dbp->buf;
271 bp->b_edev = ip->i_dev;
272 bp->b_lblkno = btodt(offset); /* offset is in bytes; convert to DEV_BSIZE blocks */
273 bp->b_bcount = nbytes;
274 bp->b_un.b_addr = addr;
275 bp->b_proc = procp;
276 bp->b_file = ip->i_vnode;
277
278 /*
279 * Note that S_WRITE implies B_READ and vice versa: a read(2)
280 * will B_READ data from the filesystem and S_WRITE it into
281 * the user's buffer; a write(2) will S_READ data from the
282 * user's buffer and B_WRITE it to the filesystem.
283 */
284 if (rw == S_WRITE) {
285 bp->b_flags = B_BUSY | B_PHYS | B_READ;
286 ufs_directio_kstats.phys_reads.value.ui64++;
287 ufs_directio_kstats.nread.value.ui64 += nbytes;
288 } else {
289 bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
290 ufs_directio_kstats.phys_writes.value.ui64++;
291 ufs_directio_kstats.nwritten.value.ui64 += nbytes;
292 }
293 bp->b_shadow = pplist;
294 if (pplist != NULL)
295 bp->b_flags |= B_SHADOW;
296
297 /*
298 * Issue I/O request.
299 */
300 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
301 if (ufsvfsp->vfs_snapshot)
302 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
303 else
304 (void) bdev_strategy(bp);
305
306 if (rw == S_WRITE)
307 lwp_stat_update(LWP_STAT_OUBLK, 1);
308 else
309 lwp_stat_update(LWP_STAT_INBLK, 1);
310
311 }
312
313 uint32_t ufs_shared_writes; /* writes done w/ lock shared */
314 uint32_t ufs_cur_writes; /* # concurrent writes */
315 uint32_t ufs_maxcur_writes; /* high water concurrent writes */
316 uint32_t ufs_posix_hits; /* writes done /w lock excl. */
317
318 /*
319 * Force POSIX synchronous data integrity on all writes for testing.
320 */
321 uint32_t ufs_force_posix_sdi = 0;
322
323 /*
324 * Direct Write
325 */
326
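/*
 * Returns 0 with *statusp set to DIRECTIO_FAILURE when the request
 * cannot (or should not) be written directly, so the caller can fall
 * back to the normal buffered path. Once *statusp is set to
 * DIRECTIO_SUCCESS, the return value is the error status of the
 * direct write itself.
 */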
327 int
328 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
329 cred_t *cr, int *statusp)
330 {
331 long resid, bytes_written;
332 u_offset_t size, uoff;
333 uio_t *uio = arg_uio;
334 rlim64_t limit = uio->uio_llimit;
335 int on, n, error, newerror, len, has_holes;
336 daddr_t bn;
337 size_t nbytes;
338 struct fs *fs;
339 vnode_t *vp;
340 iovec_t *iov;
341 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
342 struct proc *procp;
343 struct as *as;
344 struct directio_buf *tail;
345 int exclusive, ncur, bmap_peek;
346 uio_t copy_uio;
347 iovec_t copy_iov;
348 char *copy_base;
349 long copy_resid;
350
351 /*
352 * assume that directio isn't possible (normal case)
353 */
354 *statusp = DIRECTIO_FAILURE;
355
356 /*
357 * Don't go direct
358 */
359 if (ufs_directio_enabled == 0)
360 return (0);
361
362 /*
363 * mapped file; nevermind
364 */
365 if (ip->i_mapcnt)
366 return (0);
367
368 /*
369 * CAN WE DO DIRECT IO?
370 */
371 uoff = uio->uio_loffset;
372 resid = uio->uio_resid;
373
374 /*
375 * beyond limit
376 */
377 if (uoff + resid > limit)
378 return (0);
379
380 /*
381 * must be sector aligned
382 */
383 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
384 return (0);
385
386 /*
387 * SHOULD WE DO DIRECT IO?
388 */
389 size = ip->i_size;
390 has_holes = -1;
391
392 /*
393 * only on regular files; no metadata
394 */
395 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
396 return (0);
397
398 /*
399 * Synchronous, allocating writes run very slowly in Direct-Mode
400 * XXX - can be fixed with bmap_write changes for large writes!!!
401 * XXX - can be fixed for updates to "almost-full" files
402 * XXX - WARNING - system hangs if bmap_write() has to
403 * allocate lots of pages since pageout
404 * suspends on locked inode
405 */
406 if (!rewrite && (ip->i_flag & ISYNC)) {
407 if ((uoff + resid) > size)
408 return (0);
409 has_holes = bmap_has_holes(ip);
410 if (has_holes)
411 return (0);
412 }
413
414 /*
415 * Each iovec must be short aligned and sector aligned. If
416 * one is not, then kmem_alloc a new buffer and copy all of
417 * the smaller buffers into the new buffer. This new
418 * buffer will be short aligned and sector aligned.
419 */
420 iov = uio->uio_iov;
421 nbytes = uio->uio_iovcnt;
422 while (nbytes--) {
423 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
424 (intptr_t)(iov->iov_base) & 1) {
425 copy_resid = uio->uio_resid;
426 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
427 if (copy_base == NULL)
428 return (0);
429 copy_iov.iov_base = copy_base;
430 copy_iov.iov_len = copy_resid;
431 copy_uio.uio_iov = &copy_iov;
432 copy_uio.uio_iovcnt = 1;
433 copy_uio.uio_segflg = UIO_SYSSPACE;
434 copy_uio.uio_extflg = UIO_COPY_DEFAULT;
435 copy_uio.uio_loffset = uio->uio_loffset;
436 copy_uio.uio_resid = uio->uio_resid;
437 copy_uio.uio_llimit = uio->uio_llimit;
438 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
439 if (error) {
440 kmem_free(copy_base, copy_resid);
441 return (0);
442 }
443 uio = &copy_uio;
444 break;
445 }
446 iov++;
447 }
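/*
 * If a bounce buffer was set up above, uio now points at copy_uio;
 * arg_uio's resid is fixed up at the end and the bounce buffer is
 * freed before returning.
 */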
448
449 /*
450 * From here on down, all error exits must go to errout and
451 * not simply return a 0.
452 */
453
454 /*
455 * DIRECTIO
456 */
457
458 fs = ip->i_fs;
459
460 /*
461 * POSIX check. If attempting a concurrent re-write, make sure
462 * that this will be a single request to the driver to meet
463 * POSIX synchronous data integrity requirements.
464 */
465 bmap_peek = 0;
466 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
467 int upgrade = 0;
468
469 /* check easy conditions first */
470 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
471 upgrade = 1;
472 } else {
473 /* now look for contiguous allocation */
474 len = (ssize_t)blkroundup(fs, resid);
475 error = bmap_read(ip, uoff, &bn, &len);
476 if (error || bn == UFS_HOLE || len == 0)
477 goto errout;
478 /* save a call to bmap_read later */
479 bmap_peek = 1;
480 if (len < resid)
481 upgrade = 1;
482 }
483 if (upgrade) {
484 rw_exit(&ip->i_contents);
485 rw_enter(&ip->i_contents, RW_WRITER);
486 ufs_posix_hits++;
487 }
488 }
489
490
491 /*
492 * allocate space
493 */
494
495 /*
496 * If attempting a re-write, there is no allocation to do.
497 * bmap_write would trip an ASSERT if i_contents is held shared.
498 */
499 if (rewrite)
500 goto skip_alloc;
501
502 do {
503 on = (int)blkoff(fs, uoff);
504 n = (int)MIN(fs->fs_bsize - on, resid);
505 if ((uoff + n) > ip->i_size) {
506 error = bmap_write(ip, uoff, (int)(on + n),
507 (int)(uoff & (offset_t)MAXBOFFSET) == 0,
508 NULL, cr);
509 /* Caller is responsible for updating i_seq if needed */
510 if (error)
511 break;
512 ip->i_size = uoff + n;
513 ip->i_flag |= IATTCHG;
514 } else if (n == MAXBSIZE) {
515 error = bmap_write(ip, uoff, (int)(on + n),
516 BI_ALLOC_ONLY, NULL, cr);
517 /* Caller is responsible for updating i_seq if needed */
518 } else {
519 if (has_holes < 0)
520 has_holes = bmap_has_holes(ip);
521 if (has_holes) {
522 uint_t blk_size;
523 u_offset_t offset;
524
525 offset = uoff & (offset_t)fs->fs_bmask;
526 blk_size = (int)blksize(fs, ip,
527 (daddr_t)lblkno(fs, offset));
528 error = bmap_write(ip, uoff, blk_size,
529 BI_NORMAL, NULL, cr);
530 /*
531 * Caller is responsible for updating
532 * i_seq if needed
533 */
534 } else
535 error = 0;
536 }
537 if (error)
538 break;
539 uoff += n;
540 resid -= n;
541 /*
542 * if file has grown larger than 2GB, set flag
543 * in superblock if not already set
544 */
545 if ((ip->i_size > MAXOFF32_T) &&
546 !(fs->fs_flags & FSLARGEFILES)) {
547 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
548 mutex_enter(&ufsvfsp->vfs_lock);
549 fs->fs_flags |= FSLARGEFILES;
550 ufs_sbwrite(ufsvfsp);
551 mutex_exit(&ufsvfsp->vfs_lock);
552 }
553 } while (resid);
554
555 if (error) {
556 /*
557 * restore original state
558 */
559 if (resid) {
560 if (size == ip->i_size)
561 goto errout;
562 (void) ufs_itrunc(ip, size, 0, cr);
563 }
564 /*
565 * try non-directio path
566 */
567 goto errout;
568 }
569 skip_alloc:
570
571 /*
572 * get rid of cached pages
573 */
574 vp = ITOV(ip);
575 exclusive = rw_write_held(&ip->i_contents);
576 if (vn_has_cached_data(vp)) {
577 if (!exclusive) {
578 /*
579 * Still holding i_rwlock, so no allocations
580 * can happen after dropping contents.
581 */
582 rw_exit(&ip->i_contents);
583 rw_enter(&ip->i_contents, RW_WRITER);
584 }
585 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
586 B_INVAL, cr, NULL);
587 if (vn_has_cached_data(vp))
588 goto errout;
589 if (!exclusive)
590 rw_downgrade(&ip->i_contents);
591 ufs_directio_kstats.nflushes.value.ui64++;
592 }
593
594 /*
595 * Direct Writes
596 */
597
598 if (!exclusive) {
599 ufs_shared_writes++;
600 ncur = atomic_inc_32_nv(&ufs_cur_writes);
601 if (ncur > ufs_maxcur_writes)
602 ufs_maxcur_writes = ncur;
603 }
604
605 /*
606 * proc and as are for VM operations in directio_start()
607 */
608 if (uio->uio_segflg == UIO_USERSPACE) {
609 procp = ttoproc(curthread);
610 as = procp->p_as;
611 } else {
612 procp = NULL;
613 as = &kas;
614 }
615 *statusp = DIRECTIO_SUCCESS;
616 error = 0;
617 newerror = 0;
618 resid = uio->uio_resid;
619 bytes_written = 0;
620 ufs_directio_kstats.logical_writes.value.ui64++;
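/*
 * Main write loop: lock down up to vfs_ioclustsz bytes of the
 * caller's buffer at a time, issue one asynchronous request per
 * contiguous extent within that range, then wait for the requests
 * to finish and unlock the pages.
 */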
621 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
622 size_t pglck_len, pglck_size;
623 caddr_t pglck_base;
624 page_t **pplist, **spplist;
625
626 tail = NULL;
627
628 /*
629 * Adjust number of bytes
630 */
631 iov = uio->uio_iov;
632 pglck_len = (size_t)MIN(iov->iov_len, resid);
633 pglck_base = iov->iov_base;
634 if (pglck_len == 0) {
635 uio->uio_iov++;
636 uio->uio_iovcnt--;
637 continue;
638 }
639
640 /*
641 * Try to lock down the largest chunk of pages possible.
642 */
643 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
644 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
645
646 if (error)
647 break;
648
649 pglck_size = pglck_len;
650 while (pglck_len) {
651
652 nbytes = pglck_len;
653 uoff = uio->uio_loffset;
654
655 if (!bmap_peek) {
656
657 /*
658 * Re-adjust number of bytes to contiguous
659 * range. May have already called bmap_read
660 * in the case of a concurrent rewrite.
661 */
662 len = (ssize_t)blkroundup(fs, nbytes);
663 error = bmap_read(ip, uoff, &bn, &len);
664 if (error)
665 break;
666 if (bn == UFS_HOLE || len == 0)
667 break;
668 }
669 nbytes = (size_t)MIN(nbytes, len);
670 bmap_peek = 0;
671
672 /*
673 * Get the pagelist pointer for this offset to be
674 * passed to directio_start.
675 */
676
677 if (pplist != NULL)
678 spplist = pplist +
679 btop((uintptr_t)iov->iov_base -
680 ((uintptr_t)pglck_base & PAGEMASK));
681 else
682 spplist = NULL;
683
684 /*
685 * Kick off the direct write requests
686 */
687 directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
688 iov->iov_base, S_READ, procp, &tail, spplist);
689
690 /*
691 * Adjust pointers and counters
692 */
693 iov->iov_len -= nbytes;
694 iov->iov_base += nbytes;
695 uio->uio_loffset += nbytes;
696 resid -= nbytes;
697 pglck_len -= nbytes;
698 }
699
700 /*
701 * Wait for outstanding requests
702 */
703 newerror = directio_wait(tail, &bytes_written);
704
705 /*
706 * Release VM resources
707 */
708 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
709
710 }
711
712 if (!exclusive) {
713 atomic_dec_32(&ufs_cur_writes);
714 /*
715 * If this write was done shared, readers may
716 * have pulled in unmodified pages. Get rid of
717 * these potentially stale pages.
718 */
719 if (vn_has_cached_data(vp)) {
720 rw_exit(&ip->i_contents);
721 rw_enter(&ip->i_contents, RW_WRITER);
722 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
723 B_INVAL, cr, NULL);
724 ufs_directio_kstats.nflushes.value.ui64++;
725 rw_downgrade(&ip->i_contents);
726 }
727 }
728
729 /*
730 * If error, adjust resid to begin at the first
731 * un-writable byte.
732 */
733 if (error == 0)
734 error = newerror;
735 if (error)
736 resid = uio->uio_resid - bytes_written;
737 arg_uio->uio_resid = resid;
738
739 if (!rewrite) {
740 ip->i_flag |= IUPD | ICHG;
741 /* Caller will update i_seq */
742 TRANS_INODE(ip->i_ufsvfs, ip);
743 }
744 /*
745 * If there is a residual, adjust the EOF if necessary.
746 */
747 if (resid) {
748 if (size != ip->i_size) {
749 if (uio->uio_loffset > size)
750 size = uio->uio_loffset;
751 (void) ufs_itrunc(ip, size, 0, cr);
752 }
753 }
754
755 if (uio == &copy_uio)
756 kmem_free(copy_base, copy_resid);
757
758 return (error);
759
760 errout:
761 if (uio == &copy_uio)
762 kmem_free(copy_base, copy_resid);
763
764 return (0);
765 }
766 /*
767 * Direct read of a hole
768 */
769 static int
770 directio_hole(struct uio *uio, size_t nbytes)
771 {
772 int error = 0, nzero;
773 uio_t phys_uio;
774 iovec_t phys_iov;
775
776 ufs_directio_kstats.hole_reads.value.ui64++;
777 ufs_directio_kstats.nread.value.ui64 += nbytes;
778
779 phys_iov.iov_base = uio->uio_iov->iov_base;
780 phys_iov.iov_len = nbytes;
781
782 phys_uio.uio_iov = &phys_iov;
783 phys_uio.uio_iovcnt = 1;
784 phys_uio.uio_resid = phys_iov.iov_len;
785 phys_uio.uio_segflg = uio->uio_segflg;
786 phys_uio.uio_extflg = uio->uio_extflg;
787 while (error == 0 && phys_uio.uio_resid) {
788 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
789 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
790 &phys_uio);
791 }
792 return (error);
793 }
794
795 /*
796 * Direct Read
797 */
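/*
 * As with the write path, *statusp is set to DIRECTIO_FAILURE when the
 * read cannot be done directly and the caller should use the normal
 * cached path; otherwise the return value is the error status of the
 * direct read.
 */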
798 int
799 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
800 {
801 ssize_t resid, bytes_read;
802 u_offset_t size, uoff;
803 int error, newerror, len;
804 size_t nbytes;
805 struct fs *fs;
806 vnode_t *vp;
807 daddr_t bn;
808 iovec_t *iov;
809 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
810 struct proc *procp;
811 struct as *as;
812 struct directio_buf *tail;
813
814 /*
815 * assume that directio isn't possible (normal case)
816 */
817 *statusp = DIRECTIO_FAILURE;
818
819 /*
820 * Don't go direct
821 */
822 if (ufs_directio_enabled == 0)
823 return (0);
824
825 /*
826 * mapped file; nevermind
827 */
828 if (ip->i_mapcnt)
829 return (0);
830
831 /*
832 * CAN WE DO DIRECT IO?
833 */
834 /*
835 * must be sector aligned
836 */
837 uoff = uio->uio_loffset;
838 resid = uio->uio_resid;
839 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
840 return (0);
841 /*
842 * must be short aligned and sector aligned
843 */
844 iov = uio->uio_iov;
845 nbytes = uio->uio_iovcnt;
846 while (nbytes--) {
847 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
848 return (0);
849 if ((intptr_t)(iov++->iov_base) & 1)
850 return (0);
851 }
852
853 /*
854 * DIRECTIO
855 */
856 fs = ip->i_fs;
857
858 /*
859 * don't read past EOF
860 */
861 size = ip->i_size;
862
863 /*
864 * The file offset is past EOF so bail out here; we don't want
865 * to update uio_resid and make it look like we read something.
866 * We say that direct I/O was a success to avoid having rdip()
867 * go through the same "read past EOF logic".
868 */
869 if (uoff >= size) {
870 *statusp = DIRECTIO_SUCCESS;
871 return (0);
872 }
873
874 /*
875 * The read would extend past EOF so make it smaller.
876 */
877 if ((uoff + resid) > size) {
878 resid = size - uoff;
879 /*
880 * recheck sector alignment
881 */
882 if (resid & (DEV_BSIZE - 1))
883 return (0);
884 }
885
886 /*
887 * At this point, we know there is some real work to do.
888 */
889 ASSERT(resid);
890
891 /*
892 * get rid of cached pages
893 */
894 vp = ITOV(ip);
895 if (vn_has_cached_data(vp)) {
896 rw_exit(&ip->i_contents);
897 rw_enter(&ip->i_contents, RW_WRITER);
898 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
899 B_INVAL, cr, NULL);
900 if (vn_has_cached_data(vp))
901 return (0);
902 rw_downgrade(&ip->i_contents);
903 ufs_directio_kstats.nflushes.value.ui64++;
904 }
905 /*
906 * Direct Reads
907 */
908
909 /*
910 * proc and as are for VM operations in directio_start()
911 */
912 if (uio->uio_segflg == UIO_USERSPACE) {
913 procp = ttoproc(curthread);
914 as = procp->p_as;
915 } else {
916 procp = NULL;
917 as = &kas;
918 }
919
920 *statusp = DIRECTIO_SUCCESS;
921 error = 0;
922 newerror = 0;
923 bytes_read = 0;
924 ufs_directio_kstats.logical_reads.value.ui64++;
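/*
 * Main read loop: same structure as the write path, except that holes
 * are satisfied from ufs_directio_zero_buf via directio_hole() rather
 * than being sent to the device.
 */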
925 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
926 size_t pglck_len, pglck_size;
927 caddr_t pglck_base;
928 page_t **pplist, **spplist;
929
930 tail = NULL;
931
932 /*
933 * Adjust number of bytes
934 */
935 iov = uio->uio_iov;
936 pglck_len = (size_t)MIN(iov->iov_len, resid);
937 pglck_base = iov->iov_base;
938 if (pglck_len == 0) {
939 uio->uio_iov++;
940 uio->uio_iovcnt--;
941 continue;
942 }
943
944 /*
945 * Try to lock down the largest chunk of pages possible.
946 */
947 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
948 error = as_pagelock(as, &pplist, pglck_base,
949 pglck_len, S_WRITE);
950
951 if (error)
952 break;
953
954 pglck_size = pglck_len;
955 while (pglck_len) {
956
957 nbytes = pglck_len;
958 uoff = uio->uio_loffset;
959
960 /*
961 * Re-adjust number of bytes to contiguous range
962 */
963 len = (ssize_t)blkroundup(fs, nbytes);
964 error = bmap_read(ip, uoff, &bn, &len);
965 if (error)
966 break;
967
968 if (bn == UFS_HOLE) {
969 nbytes = (size_t)MIN(fs->fs_bsize -
970 (long)blkoff(fs, uoff), nbytes);
971 error = directio_hole(uio, nbytes);
972 /*
973 * Hole reads are not added to the list
974 * processed by directio_wait() below so
975 * account for bytes read here.
976 */
977 if (!error)
978 bytes_read += nbytes;
979 } else {
980 nbytes = (size_t)MIN(nbytes, len);
981
982 /*
983 * Get the pagelist pointer for this offset
984 * to be passed to directio_start.
985 */
986 if (pplist != NULL)
987 spplist = pplist +
988 btop((uintptr_t)iov->iov_base -
989 ((uintptr_t)pglck_base & PAGEMASK));
990 else
991 spplist = NULL;
992
993 /*
994 * Kick off the direct read requests
995 */
996 directio_start(ufsvfsp, ip, nbytes,
997 ldbtob(bn), iov->iov_base,
998 S_WRITE, procp, &tail, spplist);
999 }
1000
1001 if (error)
1002 break;
1003
1004 /*
1005 * Adjust pointers and counters
1006 */
1007 iov->iov_len -= nbytes;
1008 iov->iov_base += nbytes;
1009 uio->uio_loffset += nbytes;
1010 resid -= nbytes;
1011 pglck_len -= nbytes;
1012 }
1013
1014 /*
1015 * Wait for outstanding requests
1016 */
1017 newerror = directio_wait(tail, &bytes_read);
1018 /*
1019 * Release VM resources
1020 */
1021 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1022
1023 }
1024
1025 /*
1026 * If error, adjust resid to begin at the first
1027 * un-read byte.
1028 */
1029 if (error == 0)
1030 error = newerror;
1031 uio->uio_resid -= bytes_read;
1032 return (error);
1033 }
1034