xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_directio.c (revision 9b664393d4fdda96221e6ea9ea95790d3c15be70)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/t_lock.h>
37 #include <sys/param.h>
38 #include <sys/time.h>
39 #include <sys/systm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/resource.h>
42 #include <sys/signal.h>
43 #include <sys/cred.h>
44 #include <sys/user.h>
45 #include <sys/buf.h>
46 #include <sys/vfs.h>
47 #include <sys/vnode.h>
48 #include <sys/proc.h>
49 #include <sys/disp.h>
50 #include <sys/file.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/kmem.h>
54 #include <sys/uio.h>
55 #include <sys/dnlc.h>
56 #include <sys/conf.h>
57 #include <sys/mman.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/filio.h>
63 #include <sys/atomic.h>
64 
65 #include <sys/fssnap_if.h>
66 #include <sys/fs/ufs_fs.h>
67 #include <sys/fs/ufs_lockfs.h>
68 #include <sys/fs/ufs_filio.h>
69 #include <sys/fs/ufs_inode.h>
70 #include <sys/fs/ufs_fsdir.h>
71 #include <sys/fs/ufs_quota.h>
72 #include <sys/fs/ufs_trans.h>
73 #include <sys/fs/ufs_panic.h>
74 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
75 #include <sys/errno.h>
76 
77 #include <sys/filio.h>		/* _FIOIO */
78 
79 #include <vm/hat.h>
80 #include <vm/page.h>
81 #include <vm/pvn.h>
82 #include <vm/as.h>
83 #include <vm/seg.h>
84 #include <vm/seg_map.h>
85 #include <vm/seg_vn.h>
86 #include <vm/seg_kmem.h>
87 #include <vm/rm.h>
88 #include <sys/swap.h>
89 #include <sys/epm.h>
90 
91 #include <fs/fs_subr.h>
92 
/*
 * Source of zeroes used to satisfy direct reads of holes (directio_hole()).
 * The buffer is ufs_directio_zero_len bytes long and is allocated by
 * ufs_directio_init().
 */
static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8 * 1024;

/* Master switch: when zero, both direct I/O entry points decline to run. */
int	ufs_directio_enabled = 1;
97 
98 /*
99  * for kstats reader
100  */
101 struct ufs_directio_kstats {
102 	kstat_named_t	logical_reads;
103 	kstat_named_t	phys_reads;
104 	kstat_named_t	hole_reads;
105 	kstat_named_t	nread;
106 	kstat_named_t	logical_writes;
107 	kstat_named_t	phys_writes;
108 	kstat_named_t	nwritten;
109 	kstat_named_t	nflushes;
110 } ufs_directio_kstats = {
111 	{ "logical_reads",	KSTAT_DATA_UINT64 },
112 	{ "phys_reads",		KSTAT_DATA_UINT64 },
113 	{ "hole_reads",		KSTAT_DATA_UINT64 },
114 	{ "nread",		KSTAT_DATA_UINT64 },
115 	{ "logical_writes",	KSTAT_DATA_UINT64 },
116 	{ "phys_writes",	KSTAT_DATA_UINT64 },
117 	{ "nwritten",		KSTAT_DATA_UINT64 },
118 	{ "nflushes",		KSTAT_DATA_UINT64 },
119 };
120 
121 kstat_t	*ufs_directio_kstatsp;
122 
123 /*
124  * use kmem_cache_create for direct-physio buffers. This has shown
125  * a better cache distribution compared to buffers on the
126  * stack. It also avoids semaphore construction/deconstruction
127  * per request
128  */
129 struct directio_buf {
130 	struct directio_buf	*next;
131 	char		*addr;
132 	size_t		nbytes;
133 	struct buf	buf;
134 };
135 static struct kmem_cache *directio_buf_cache;
136 
137 
138 /* ARGSUSED */
139 static int
140 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
141 {
142 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
143 	return (0);
144 }
145 
146 /* ARGSUSED */
147 static void
148 directio_buf_destructor(void *dbp, void *cdrarg)
149 {
150 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
151 }
152 
153 void
154 directio_bufs_init(void)
155 {
156 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
157 	    sizeof (struct directio_buf), 0,
158 	    directio_buf_constructor, directio_buf_destructor,
159 	    NULL, NULL, NULL, 0);
160 }
161 
162 void
163 ufs_directio_init(void)
164 {
165 	/*
166 	 * kstats
167 	 */
168 	ufs_directio_kstatsp = kstat_create("ufs", 0,
169 	    "directio", "ufs", KSTAT_TYPE_NAMED,
170 	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
171 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
172 	if (ufs_directio_kstatsp) {
173 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
174 		kstat_install(ufs_directio_kstatsp);
175 	}
176 	/*
177 	 * kzero is broken so we have to use a private buf of zeroes
178 	 */
179 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
180 	directio_bufs_init();
181 }
182 
183 /*
184  * Wait for the first direct IO operation to finish
185  */
186 static int
187 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
188 {
189 	buf_t	*bp;
190 	int	error;
191 
192 	/*
193 	 * Wait for IO to finish
194 	 */
195 	bp = &dbp->buf;
196 	error = biowait(bp);
197 
198 	/*
199 	 * bytes_io will be used to figure out a resid
200 	 * for the caller. The resid is approximated by reporting
201 	 * the bytes following the first failed IO as the residual.
202 	 *
203 	 * I am cautious about using b_resid because I
204 	 * am not sure how well the disk drivers maintain it.
205 	 */
206 	if (error)
207 		if (bp->b_resid)
208 			*bytes_iop = bp->b_bcount - bp->b_resid;
209 		else
210 			*bytes_iop = 0;
211 	else
212 		*bytes_iop += bp->b_bcount;
213 	/*
214 	 * Release direct IO resources
215 	 */
216 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
217 	kmem_cache_free(directio_buf_cache, dbp);
218 	return (error);
219 }
220 
221 /*
222  * Wait for all of the direct IO operations to finish
223  */
224 
225 static int
226 directio_wait(struct directio_buf *tail, long *bytes_iop)
227 {
228 	int	error = 0, newerror;
229 	struct directio_buf	*dbp;
230 
231 	/*
232 	 * The linked list of directio buf structures is maintained
233 	 * in reverse order (tail->last request->penultimate request->...)
234 	 */
235 	while ((dbp = tail) != NULL) {
236 		tail = dbp->next;
237 		newerror = directio_wait_one(dbp, bytes_iop);
238 		if (error == 0)
239 			error = newerror;
240 	}
241 	return (error);
242 }
243 /*
244  * Initiate direct IO request
245  */
246 static void
247 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
248     offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
249     struct directio_buf **tailp, page_t **pplist)
250 {
251 	buf_t *bp;
252 	struct directio_buf *dbp;
253 
254 	/*
255 	 * Allocate a directio buf header
256 	 *   Note - list is maintained in reverse order.
257 	 *   directio_wait_one() depends on this fact when
258 	 *   adjusting the ``bytes_io'' param. bytes_io
259 	 *   is used to compute a residual in the case of error.
260 	 */
261 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
262 	dbp->next = *tailp;
263 	*tailp = dbp;
264 
265 	/*
266 	 * Initialize buf header
267 	 */
268 	dbp->addr = addr;
269 	dbp->nbytes = nbytes;
270 	bp = &dbp->buf;
271 	bp->b_edev = ip->i_dev;
272 	bp->b_lblkno = btodt(offset);
273 	bp->b_bcount = nbytes;
274 	bp->b_un.b_addr = addr;
275 	bp->b_proc = procp;
276 	bp->b_file = ip->i_vnode;
277 
278 	/*
279 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
280 	 * will B_READ data from the filesystem and S_WRITE it into
281 	 * the user's buffer; a write(2) will S_READ data from the
282 	 * user's buffer and B_WRITE it to the filesystem.
283 	 */
284 	if (rw == S_WRITE) {
285 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
286 		ufs_directio_kstats.phys_reads.value.ui64++;
287 		ufs_directio_kstats.nread.value.ui64 += nbytes;
288 	} else {
289 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
290 		ufs_directio_kstats.phys_writes.value.ui64++;
291 		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
292 	}
293 	bp->b_shadow = pplist;
294 	if (pplist != NULL)
295 		bp->b_flags |= B_SHADOW;
296 
297 	/*
298 	 * Issue I/O request.
299 	 */
300 	ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
301 	if (ufsvfsp->vfs_snapshot)
302 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
303 	else
304 		(void) bdev_strategy(bp);
305 
306 	if (rw == S_WRITE)
307 		lwp_stat_update(LWP_STAT_OUBLK, 1);
308 	else
309 		lwp_stat_update(LWP_STAT_INBLK, 1);
310 
311 }
312 
/*
 * Direct write statistics, maintained by ufs_directio_write().
 */
uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;
322 
323 /*
324  * Direct Write
325  */
326 
/*
 * Attempt to carry out a write by issuing physical I/O directly from the
 * buffers described by arg_uio, bypassing the page cache.
 *
 *	ip	- inode being written
 *	arg_uio	- describes the caller's buffers, offset and length; its
 *		  uio_resid is set to the residual once direct I/O has been
 *		  attempted
 *	ioflag	- only FDSYNC is examined here
 *	rewrite	- non-zero means no block allocation is attempted (the
 *		  allocation loop is skipped)
 *	cr	- credentials passed to bmap_write(), ufs_itrunc() and
 *		  VOP_PUTPAGE()
 *	statusp	- set to DIRECTIO_FAILURE when the request is declined and
 *		  to DIRECTIO_SUCCESS once direct I/O has been attempted
 *
 * Returns 0 whenever the request is declined; otherwise returns the first
 * error from as_pagelock(), bmap_read() or the I/O itself (0 if none).
 *
 * Locking: i_contents is released and re-acquired with rw_exit()/rw_enter()
 * below, so the caller presumably enters holding it - confirm against the
 * callers.  Some paths (the POSIX upgrade, and errout after an invalidation
 * that leaves cached pages behind) return with it held as writer.
 */
327 int
328 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
329     cred_t *cr, int *statusp)
330 {
331 	long		resid, bytes_written;
332 	u_offset_t	size, uoff;
333 	uio_t		*uio = arg_uio;
334 	rlim64_t	limit = uio->uio_llimit;
335 	int		on, n, error, newerror, len, has_holes;
336 	daddr_t		bn;
337 	size_t		nbytes;
338 	struct fs	*fs;
339 	vnode_t		*vp;
340 	iovec_t		*iov;
341 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
342 	struct proc	*procp;
343 	struct as	*as;
344 	struct directio_buf	*tail;
345 	int		exclusive, ncur, bmap_peek;
346 	uio_t		copy_uio;
347 	iovec_t		copy_iov;
348 	char		*copy_base;
349 	long		copy_resid;
350 
351 	/*
352 	 * assume that directio isn't possible (normal case)
353 	 */
354 	*statusp = DIRECTIO_FAILURE;
355 
356 	/*
357 	 * Don't go direct
358 	 */
359 	if (ufs_directio_enabled == 0)
360 		return (0);
361 
362 	/*
363 	 * mapped file; nevermind
364 	 */
365 	if (ip->i_mapcnt)
366 		return (0);
367 
368 	/*
369 	 * CAN WE DO DIRECT IO?
370 	 */
371 	uoff = uio->uio_loffset;
372 	resid = uio->uio_resid;
373 
374 	/*
375 	 * beyond limit
376 	 */
377 	if (uoff + resid > limit)
378 		return (0);
379 
380 	/*
381 	 * must be sector aligned
382 	 */
383 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
384 		return (0);
385 
386 	/*
387 	 * SHOULD WE DO DIRECT IO?
388 	 */
	/*
	 * `size' remembers the file size on entry so that a failed or short
	 * write can be truncated back.  has_holes is tri-state: -1 means
	 * bmap_has_holes() has not been asked yet.
	 */
389 	size = ip->i_size;
390 	has_holes = -1;
391 
392 	/*
393 	 * only on regular files; no metadata
394 	 */
395 	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
396 		return (0);
397 
398 	/*
399 	 * Synchronous, allocating writes run very slow in Direct-Mode
400 	 *	XXX - can be fixed with bmap_write changes for large writes!!!
401 	 *	XXX - can be fixed for updates to "almost-full" files
402 	 *	XXX - WARNING - system hangs if bmap_write() has to
403 	 *			allocate lots of pages since pageout
404 	 *			suspends on locked inode
405 	 */
406 	if (!rewrite && (ip->i_flag & ISYNC)) {
407 		if ((uoff + resid) > size)
408 			return (0);
409 		has_holes = bmap_has_holes(ip);
410 		if (has_holes)
411 			return (0);
412 	}
413 
414 	/*
415 	 * Each iovec must be short aligned and sector aligned.  If
416 	 * one is not, then kmem_alloc a new buffer and copy all of
417 	 * the smaller buffers into the new buffer.  This new
418 	 * buffer will be short aligned and sector aligned.
419 	 */
420 	iov = uio->uio_iov;
421 	nbytes = uio->uio_iovcnt;
422 	while (nbytes--) {
423 		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
424 		    (intptr_t)(iov->iov_base) & 1) {
425 			copy_resid = uio->uio_resid;
			/*
			 * KM_NOSLEEP: rather than wait for memory, decline
			 * the request.
			 */
426 			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
427 			if (copy_base == NULL)
428 				return (0);
			/*
			 * Describe the bounce buffer with a private,
			 * single-iovec, kernel-space uio that mirrors the
			 * caller's offset, length and limit.
			 */
429 			copy_iov.iov_base = copy_base;
430 			copy_iov.iov_len = copy_resid;
431 			copy_uio.uio_iov = &copy_iov;
432 			copy_uio.uio_iovcnt = 1;
433 			copy_uio.uio_segflg = UIO_SYSSPACE;
434 			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
435 			copy_uio.uio_loffset = uio->uio_loffset;
436 			copy_uio.uio_resid = uio->uio_resid;
437 			copy_uio.uio_llimit = uio->uio_llimit;
			/*
			 * Gather all of the caller's data into the bounce
			 * buffer.
			 * NOTE(review): this operates on the caller's uio;
			 * presumably it is consumed by uiomove() - confirm
			 * that callers cope if a later errout declines the
			 * request.
			 */
438 			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
439 			if (error) {
440 				kmem_free(copy_base, copy_resid);
441 				return (0);
442 			}
			/*
			 * From here on `uio' is the private copy; the test
			 * uio == &copy_uio is what tells the exit paths to
			 * free the bounce buffer.
			 */
443 			uio = &copy_uio;
444 			break;
445 		}
446 		iov++;
447 	}
448 
449 	/*
450 	 * From here on down, all error exits must go to errout and
451 	 * not simply return a 0.
452 	 */
453 
454 	/*
455 	 * DIRECTIO
456 	 */
457 
458 	fs = ip->i_fs;
459 
460 	/*
461 	 * POSIX check. If attempting a concurrent re-write, make sure
462 	 * that this will be a single request to the driver to meet
463 	 * POSIX synchronous data integrity requirements.
464 	 */
465 	bmap_peek = 0;
466 	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
467 		int upgrade = 0;
468 
469 		/* check easy conditions first */
470 		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
471 			upgrade = 1;
472 		} else {
473 			/* now look for contiguous allocation */
474 			len = (ssize_t)blkroundup(fs, resid);
475 			error = bmap_read(ip, uoff, &bn, &len);
476 			if (error || bn == UFS_HOLE || len == 0)
477 				goto errout;
478 			/* save a call to bmap_read later */
479 			bmap_peek = 1;
480 			if (len < resid)
481 				upgrade = 1;
482 		}
		/*
		 * The request cannot go out as a single I/O, so trade the
		 * current hold on i_contents for a writer hold.  It is not
		 * downgraded again in this function; `exclusive' below
		 * picks it up through rw_write_held().
		 */
483 		if (upgrade) {
484 			rw_exit(&ip->i_contents);
485 			rw_enter(&ip->i_contents, RW_WRITER);
486 			ufs_posix_hits++;
487 		}
488 	}
489 
490 
491 	/*
492 	 * allocate space
493 	 */
494 
495 	/*
496 	 * If attempting a re-write, there is no allocation to do.
497 	 * bmap_write would trip an ASSERT if i_contents is held shared.
498 	 */
499 	if (rewrite)
500 		goto skip_alloc;
501 
	/*
	 * Walk the request one file system block at a time.  `on' is the
	 * offset of uoff within its block and `n' the number of bytes of
	 * the request that fall in that block.  uoff and resid are consumed
	 * here and re-derived from the uio before the I/O loop.
	 */
502 	do {
503 		on = (int)blkoff(fs, uoff);
504 		n = (int)MIN(fs->fs_bsize - on, resid);
505 		if ((uoff + n) > ip->i_size) {
506 			error = bmap_write(ip, uoff, (int)(on + n),
507 			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
508 			    NULL, cr);
509 			/* Caller is responsible for updating i_seq if needed */
510 			if (error)
511 				break;
512 			ip->i_size = uoff + n;
513 			ip->i_flag |= IATTCHG;
514 		} else if (n == MAXBSIZE) {
515 			error = bmap_write(ip, uoff, (int)(on + n),
516 			    BI_ALLOC_ONLY, NULL, cr);
517 			/* Caller is responsible for updating i_seq if needed */
518 		} else {
519 			if (has_holes < 0)
520 				has_holes = bmap_has_holes(ip);
521 			if (has_holes) {
522 				uint_t	blk_size;
523 				u_offset_t offset;
524 
525 				offset = uoff & (offset_t)fs->fs_bmask;
526 				blk_size = (int)blksize(fs, ip,
527 				    (daddr_t)lblkno(fs, offset));
528 				error = bmap_write(ip, uoff, blk_size,
529 				    BI_NORMAL, NULL, cr);
530 				/*
531 				 * Caller is responsible for updating
532 				 * i_seq if needed
533 				 */
534 			} else
535 				error = 0;
536 		}
537 		if (error)
538 			break;
539 		uoff += n;
540 		resid -= n;
541 		/*
542 		 * if file has grown larger than 2GB, set flag
543 		 * in superblock if not already set
544 		 */
545 		if ((ip->i_size > MAXOFF32_T) &&
546 		    !(fs->fs_flags & FSLARGEFILES)) {
547 			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
548 			mutex_enter(&ufsvfsp->vfs_lock);
549 			fs->fs_flags |= FSLARGEFILES;
550 			ufs_sbwrite(ufsvfsp);
551 			mutex_exit(&ufsvfsp->vfs_lock);
552 		}
553 	} while (resid);
554 
555 	if (error) {
556 		/*
557 		 * restore original state
558 		 */
		/*
		 * Only truncate when the loop actually grew the file; the
		 * result of ufs_itrunc() is deliberately ignored.
		 */
559 		if (resid) {
560 			if (size == ip->i_size)
561 				goto errout;
562 			(void) ufs_itrunc(ip, size, 0, cr);
563 		}
564 		/*
565 		 * try non-directio path
566 		 */
567 		goto errout;
568 	}
569 skip_alloc:
570 
571 	/*
572 	 * get rid of cached pages
573 	 */
574 	vp = ITOV(ip);
575 	exclusive = rw_write_held(&ip->i_contents);
576 	if (vn_has_cached_data(vp)) {
577 		if (!exclusive) {
578 			/*
579 			 * Still holding i_rwlock, so no allocations
580 			 * can happen after dropping contents.
581 			 */
582 			rw_exit(&ip->i_contents);
583 			rw_enter(&ip->i_contents, RW_WRITER);
584 		}
585 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
586 		    B_INVAL, cr, NULL);
		/*
		 * Pages are still cached: decline.  Note that i_contents
		 * remains held as writer on this path.
		 */
587 		if (vn_has_cached_data(vp))
588 			goto errout;
589 		if (!exclusive)
590 			rw_downgrade(&ip->i_contents);
591 		ufs_directio_kstats.nflushes.value.ui64++;
592 	}
593 
594 	/*
595 	 * Direct Writes
596 	 */
597 
	/*
	 * Shared-write statistics.  ufs_cur_writes is adjusted atomically;
	 * the high-water mark is updated with a plain compare and store.
	 */
598 	if (!exclusive) {
599 		ufs_shared_writes++;
600 		ncur = atomic_inc_32_nv(&ufs_cur_writes);
601 		if (ncur > ufs_maxcur_writes)
602 			ufs_maxcur_writes = ncur;
603 	}
604 
605 	/*
606 	 * proc and as are for VM operations in directio_start()
607 	 */
608 	if (uio->uio_segflg == UIO_USERSPACE) {
609 		procp = ttoproc(curthread);
610 		as = procp->p_as;
611 	} else {
612 		procp = NULL;
613 		as = &kas;
614 	}
	/*
	 * Committed: from here on the request is reported as handled by
	 * direct I/O, whatever the outcome.
	 */
615 	*statusp = DIRECTIO_SUCCESS;
616 	error = 0;
617 	newerror = 0;
618 	resid = uio->uio_resid;
619 	bytes_written = 0;
620 	ufs_directio_kstats.logical_writes.value.ui64++;
	/*
	 * Each pass locks down one chunk of the current iovec (at most
	 * vfs_ioclustsz bytes), issues one request per contiguous run of
	 * disk blocks within it, waits for them all, then unlocks.
	 * `error' carries setup failures, `newerror' I/O failures.
	 */
621 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
622 		size_t pglck_len, pglck_size;
623 		caddr_t pglck_base;
624 		page_t **pplist, **spplist;
625 
626 		tail = NULL;
627 
628 		/*
629 		 * Adjust number of bytes
630 		 */
631 		iov = uio->uio_iov;
632 		pglck_len = (size_t)MIN(iov->iov_len, resid);
633 		pglck_base = iov->iov_base;
634 		if (pglck_len == 0) {
635 			uio->uio_iov++;
636 			uio->uio_iovcnt--;
637 			continue;
638 		}
639 
640 		/*
641 		 * Try to lock down the largest chunk of pages possible.
642 		 */
643 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
644 		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
645 
646 		if (error)
647 			break;
648 
		/*
		 * pglck_len is consumed by the inner loop; pglck_size keeps
		 * the locked length for as_pageunlock().
		 */
649 		pglck_size = pglck_len;
650 		while (pglck_len) {
651 
652 			nbytes = pglck_len;
653 			uoff = uio->uio_loffset;
654 
655 			if (!bmap_peek) {
656 
657 				/*
658 				 * Re-adjust number of bytes to contiguous
659 				 * range. May have already called bmap_read
660 				 * in the case of a concurrent rewrite.
661 				 */
662 				len = (ssize_t)blkroundup(fs, nbytes);
663 				error = bmap_read(ip, uoff, &bn, &len);
664 				if (error)
665 					break;
				/*
				 * NOTE(review): this leaves error == 0, so
				 * the outer loop would come back to the same
				 * offset.  Presumably unreachable because
				 * the blocks were allocated above (or the
				 * caller vouched for a rewrite) - confirm.
				 */
666 				if (bn == UFS_HOLE || len == 0)
667 					break;
668 			}
669 			nbytes = (size_t)MIN(nbytes, len);
670 			bmap_peek = 0;
671 
672 			/*
673 			 * Get the pagelist pointer for this offset to be
674 			 * passed to directio_start.
675 			 */
676 
			/*
			 * Index into pplist by the page distance between
			 * the current buffer address and the page holding
			 * the start of the locked range.
			 */
677 			if (pplist != NULL)
678 				spplist = pplist +
679 				    btop((uintptr_t)iov->iov_base -
680 				    ((uintptr_t)pglck_base & PAGEMASK));
681 			else
682 				spplist = NULL;
683 
684 			/*
685 			 * Kick off the direct write requests
686 			 */
687 			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
688 			    iov->iov_base, S_READ, procp, &tail, spplist);
689 
690 			/*
691 			 * Adjust pointers and counters
692 			 */
693 			iov->iov_len -= nbytes;
694 			iov->iov_base += nbytes;
695 			uio->uio_loffset += nbytes;
696 			resid -= nbytes;
697 			pglck_len -= nbytes;
698 		}
699 
700 		/*
701 		 * Wait for outstanding requests
702 		 */
703 		newerror = directio_wait(tail, &bytes_written);
704 
705 		/*
706 		 * Release VM resources
707 		 */
708 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
709 
710 	}
711 
712 	if (!exclusive) {
713 		atomic_dec_32(&ufs_cur_writes);
714 		/*
715 		 * If this write was done shared, readers may
716 		 * have pulled in unmodified pages. Get rid of
717 		 * these potentially stale pages.
718 		 */
719 		if (vn_has_cached_data(vp)) {
720 			rw_exit(&ip->i_contents);
721 			rw_enter(&ip->i_contents, RW_WRITER);
722 			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
723 			    B_INVAL, cr, NULL);
724 			ufs_directio_kstats.nflushes.value.ui64++;
725 			rw_downgrade(&ip->i_contents);
726 		}
727 	}
728 
729 	/*
730 	 * If error, adjust resid to begin at the first
731 	 * un-writable byte.
732 	 */
	/*
	 * uio->uio_resid was not reduced by the loop above (only the local
	 * resid was), so it still holds the full request length here.
	 */
733 	if (error == 0)
734 		error = newerror;
735 	if (error)
736 		resid = uio->uio_resid - bytes_written;
737 	arg_uio->uio_resid = resid;
738 
739 	if (!rewrite) {
740 		ip->i_flag |= IUPD | ICHG;
741 		/* Caller will update i_seq */
742 		TRANS_INODE(ip->i_ufsvfs, ip);
743 	}
744 	/*
745 	 * If there is a residual; adjust the EOF if necessary
746 	 */
	/*
	 * The file was grown during allocation but not everything was
	 * written: truncate to whichever is larger, the size on entry or
	 * the offset reached by the I/O loop.
	 */
747 	if (resid) {
748 		if (size != ip->i_size) {
749 			if (uio->uio_loffset > size)
750 				size = uio->uio_loffset;
751 			(void) ufs_itrunc(ip, size, 0, cr);
752 		}
753 	}
754 
755 	if (uio == &copy_uio)
756 		kmem_free(copy_base, copy_resid);
757 
758 	return (error);
759 
	/*
	 * Declined after the point of no simple return: release the bounce
	 * buffer, if one was allocated, and report 0 with *statusp still
	 * DIRECTIO_FAILURE.
	 */
760 errout:
761 	if (uio == &copy_uio)
762 		kmem_free(copy_base, copy_resid);
763 
764 	return (0);
765 }
766 /*
767  * Direct read of a hole
768  */
769 static int
770 directio_hole(struct uio *uio, size_t nbytes)
771 {
772 	int		error = 0, nzero;
773 	uio_t		phys_uio;
774 	iovec_t		phys_iov;
775 
776 	ufs_directio_kstats.hole_reads.value.ui64++;
777 	ufs_directio_kstats.nread.value.ui64 += nbytes;
778 
779 	phys_iov.iov_base = uio->uio_iov->iov_base;
780 	phys_iov.iov_len = nbytes;
781 
782 	phys_uio.uio_iov = &phys_iov;
783 	phys_uio.uio_iovcnt = 1;
784 	phys_uio.uio_resid = phys_iov.iov_len;
785 	phys_uio.uio_segflg = uio->uio_segflg;
786 	phys_uio.uio_extflg = uio->uio_extflg;
787 	while (error == 0 && phys_uio.uio_resid) {
788 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
789 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
790 		    &phys_uio);
791 	}
792 	return (error);
793 }
794 
795 /*
796  * Direct Read
797  */
/*
 * Attempt to satisfy a read with physical I/O straight into the buffers
 * described by uio, bypassing the page cache.
 *
 *	ip	- inode being read
 *	uio	- caller's request; its iovecs and uio_loffset advance as
 *		  data is transferred and uio_resid is reduced by the
 *		  number of bytes read
 *	cr	- credentials passed to VOP_PUTPAGE()
 *	statusp	- DIRECTIO_FAILURE when the request is declined,
 *		  DIRECTIO_SUCCESS when it was handled here (including a
 *		  read starting at or beyond EOF, which transfers nothing)
 *
 * Returns 0 when the request is declined; otherwise the first error from
 * as_pagelock(), bmap_read(), directio_hole() or the I/O itself (0 if
 * none).
 *
 * Locking: i_contents is exchanged for a writer hold around page
 * invalidation, so the caller presumably enters holding it - confirm
 * against the callers.  If pages remain cached after the invalidation the
 * function returns with the writer hold still in place.
 */
798 int
799 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
800 {
801 	ssize_t		resid, bytes_read;
802 	u_offset_t	size, uoff;
803 	int		error, newerror, len;
804 	size_t		nbytes;
805 	struct fs	*fs;
806 	vnode_t		*vp;
807 	daddr_t		bn;
808 	iovec_t		*iov;
809 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
810 	struct proc	*procp;
811 	struct as	*as;
812 	struct directio_buf	*tail;
813 
814 	/*
815 	 * assume that directio isn't possible (normal case)
816 	 */
817 	*statusp = DIRECTIO_FAILURE;
818 
819 	/*
820 	 * Don't go direct
821 	 */
822 	if (ufs_directio_enabled == 0)
823 		return (0);
824 
825 	/*
826 	 * mapped file; nevermind
827 	 */
828 	if (ip->i_mapcnt)
829 		return (0);
830 
831 	/*
832 	 * CAN WE DO DIRECT IO?
833 	 */
834 	/*
835 	 * must be sector aligned
836 	 */
837 	uoff = uio->uio_loffset;
838 	resid = uio->uio_resid;
839 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
840 		return (0);
841 	/*
842 	 * must be short aligned and sector aligned
843 	 */
	/*
	 * Unlike the write path there is no bounce buffer here: a single
	 * iovec with an unsuitable length or address declines the request.
	 */
844 	iov = uio->uio_iov;
845 	nbytes = uio->uio_iovcnt;
846 	while (nbytes--) {
847 		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
848 			return (0);
849 		if ((intptr_t)(iov++->iov_base) & 1)
850 			return (0);
851 	}
852 
853 	/*
854 	 * DIRECTIO
855 	 */
856 	fs = ip->i_fs;
857 
858 	/*
859 	 * don't read past EOF
860 	 */
861 	size = ip->i_size;
862 
863 	/*
864 	 * The file offset is past EOF so bail out here; we don't want
865 	 * to update uio_resid and make it look like we read something.
866 	 * We say that direct I/O was a success to avoid having rdip()
867 	 * go through the same "read past EOF logic".
868 	 */
869 	if (uoff >= size) {
870 		*statusp = DIRECTIO_SUCCESS;
871 		return (0);
872 	}
873 
874 	/*
875 	 * The read would extend past EOF so make it smaller.
876 	 */
877 	if ((uoff + resid) > size) {
878 		resid = size - uoff;
879 		/*
880 		 * recheck sector alignment
881 		 */
882 		if (resid & (DEV_BSIZE - 1))
883 			return (0);
884 	}
885 
886 	/*
887 	 * At this point, we know there is some real work to do.
888 	 */
889 	ASSERT(resid);
890 
891 	/*
892 	 * get rid of cached pages
893 	 */
894 	vp = ITOV(ip);
895 	if (vn_has_cached_data(vp)) {
896 		rw_exit(&ip->i_contents);
897 		rw_enter(&ip->i_contents, RW_WRITER);
898 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
899 		    B_INVAL, cr, NULL);
		/*
		 * Pages are still cached: decline.  i_contents remains held
		 * as writer on this path.
		 */
900 		if (vn_has_cached_data(vp))
901 			return (0);
902 		rw_downgrade(&ip->i_contents);
903 		ufs_directio_kstats.nflushes.value.ui64++;
904 	}
905 	/*
906 	 * Direct Reads
907 	 */
908 
909 	/*
910 	 * proc and as are for VM operations in directio_start()
911 	 */
912 	if (uio->uio_segflg == UIO_USERSPACE) {
913 		procp = ttoproc(curthread);
914 		as = procp->p_as;
915 	} else {
916 		procp = NULL;
917 		as = &kas;
918 	}
919 
	/*
	 * Committed: from here on the request is reported as handled by
	 * direct I/O, whatever the outcome.
	 */
920 	*statusp = DIRECTIO_SUCCESS;
921 	error = 0;
922 	newerror = 0;
923 	bytes_read = 0;
924 	ufs_directio_kstats.logical_reads.value.ui64++;
	/*
	 * Each pass locks down one chunk of the current iovec (at most
	 * vfs_ioclustsz bytes), then covers it with device requests for
	 * allocated ranges and zero-fills for holes, waits for the device
	 * requests and unlocks.  `error' carries setup and hole-copy
	 * failures, `newerror' device I/O failures.
	 */
925 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
926 		size_t pglck_len, pglck_size;
927 		caddr_t pglck_base;
928 		page_t **pplist, **spplist;
929 
930 		tail = NULL;
931 
932 		/*
933 		 * Adjust number of bytes
934 		 */
935 		iov = uio->uio_iov;
936 		pglck_len = (size_t)MIN(iov->iov_len, resid);
937 		pglck_base = iov->iov_base;
938 		if (pglck_len == 0) {
939 			uio->uio_iov++;
940 			uio->uio_iovcnt--;
941 			continue;
942 		}
943 
944 		/*
945 		 * Try to lock down the largest chunk of pages possible.
946 		 */
947 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
948 		error = as_pagelock(as, &pplist, pglck_base,
949 		    pglck_len, S_WRITE);
950 
951 		if (error)
952 			break;
953 
		/*
		 * pglck_len is consumed by the inner loop; pglck_size keeps
		 * the locked length for as_pageunlock().
		 */
954 		pglck_size = pglck_len;
955 		while (pglck_len) {
956 
957 			nbytes = pglck_len;
958 			uoff = uio->uio_loffset;
959 
960 			/*
961 			 * Re-adjust number of bytes to contiguous range
962 			 */
963 			len = (ssize_t)blkroundup(fs, nbytes);
964 			error = bmap_read(ip, uoff, &bn, &len);
965 			if (error)
966 				break;
967 
			/*
			 * A hole is zero-filled synchronously, at most up to
			 * the end of the current file system block.
			 */
968 			if (bn == UFS_HOLE) {
969 				nbytes = (size_t)MIN(fs->fs_bsize -
970 				    (long)blkoff(fs, uoff), nbytes);
971 				error = directio_hole(uio, nbytes);
972 				/*
973 				 * Hole reads are not added to the list
974 				 * processed by directio_wait() below so
975 				 * account for bytes read here.
976 				 */
977 				if (!error)
978 					bytes_read += nbytes;
979 			} else {
980 				nbytes = (size_t)MIN(nbytes, len);
981 
982 				/*
983 				 * Get the pagelist pointer for this offset
984 				 * to be passed to directio_start.
985 				 */
986 				if (pplist != NULL)
987 					spplist = pplist +
988 					    btop((uintptr_t)iov->iov_base -
989 					    ((uintptr_t)pglck_base & PAGEMASK));
990 				else
991 					spplist = NULL;
992 
993 				/*
994 				 * Kick off the direct read requests
995 				 */
996 				directio_start(ufsvfsp, ip, nbytes,
997 				    ldbtob(bn), iov->iov_base,
998 				    S_WRITE, procp, &tail, spplist);
999 			}
1000 
1001 			if (error)
1002 				break;
1003 
1004 			/*
1005 			 * Adjust pointers and counters
1006 			 */
1007 			iov->iov_len -= nbytes;
1008 			iov->iov_base += nbytes;
1009 			uio->uio_loffset += nbytes;
1010 			resid -= nbytes;
1011 			pglck_len -= nbytes;
1012 		}
1013 
1014 		/*
1015 		 * Wait for outstanding requests
1016 		 */
1017 		newerror = directio_wait(tail, &bytes_read);
1018 		/*
1019 		 * Release VM resources
1020 		 */
1021 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1022 
1023 	}
1024 
1025 	/*
1026 	 * If error, adjust resid to begin at the first
1027 	 * un-read byte.
1028 	 */
	/*
	 * uio_resid is reduced by bytes_read whether or not an error
	 * occurred; on error bytes_read is the approximation produced by
	 * directio_wait_one().
	 */
1029 	if (error == 0)
1030 		error = newerror;
1031 	uio->uio_resid -= bytes_read;
1032 	return (error);
1033 }
1034