xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_directio.c (revision 2d6b5ea734bb47d251c82670646fde46af15fd69)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #include <sys/types.h>
35 #include <sys/t_lock.h>
36 #include <sys/param.h>
37 #include <sys/time.h>
38 #include <sys/systm.h>
39 #include <sys/sysmacros.h>
40 #include <sys/resource.h>
41 #include <sys/signal.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/buf.h>
45 #include <sys/vfs.h>
46 #include <sys/vnode.h>
47 #include <sys/proc.h>
48 #include <sys/disp.h>
49 #include <sys/file.h>
50 #include <sys/fcntl.h>
51 #include <sys/flock.h>
52 #include <sys/kmem.h>
53 #include <sys/uio.h>
54 #include <sys/dnlc.h>
55 #include <sys/conf.h>
56 #include <sys/mman.h>
57 #include <sys/pathname.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/cmn_err.h>
61 #include <sys/filio.h>
62 #include <sys/atomic.h>
63 
64 #include <sys/fssnap_if.h>
65 #include <sys/fs/ufs_fs.h>
66 #include <sys/fs/ufs_lockfs.h>
67 #include <sys/fs/ufs_filio.h>
68 #include <sys/fs/ufs_inode.h>
69 #include <sys/fs/ufs_fsdir.h>
70 #include <sys/fs/ufs_quota.h>
71 #include <sys/fs/ufs_trans.h>
72 #include <sys/fs/ufs_panic.h>
73 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
74 #include <sys/errno.h>
75 
76 #include <sys/filio.h>		/* _FIOIO */
77 
78 #include <vm/hat.h>
79 #include <vm/page.h>
80 #include <vm/pvn.h>
81 #include <vm/as.h>
82 #include <vm/seg.h>
83 #include <vm/seg_map.h>
84 #include <vm/seg_vn.h>
85 #include <vm/seg_kmem.h>
86 #include <vm/rm.h>
87 #include <sys/swap.h>
88 #include <sys/epm.h>
89 
90 #include <fs/fs_subr.h>
91 
/*
 * Zero-filled scratch buffer, and its length in bytes, handed to uiomove()
 * by directio_hole() when a direct read lands on a hole.  The buffer is
 * allocated by ufs_directio_init().
 */
static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len = 8192;

/*
 * Master switch for the feature: when this is zero both
 * ufs_directio_read() and ufs_directio_write() decline to go direct.
 */
int	ufs_directio_enabled = 1;
96 
97 /*
98  * for kstats reader
99  */
100 struct ufs_directio_kstats {
101 	kstat_named_t	logical_reads;
102 	kstat_named_t	phys_reads;
103 	kstat_named_t	hole_reads;
104 	kstat_named_t	nread;
105 	kstat_named_t	logical_writes;
106 	kstat_named_t	phys_writes;
107 	kstat_named_t	nwritten;
108 	kstat_named_t	nflushes;
109 } ufs_directio_kstats = {
110 	{ "logical_reads",	KSTAT_DATA_UINT64 },
111 	{ "phys_reads",		KSTAT_DATA_UINT64 },
112 	{ "hole_reads",		KSTAT_DATA_UINT64 },
113 	{ "nread",		KSTAT_DATA_UINT64 },
114 	{ "logical_writes",	KSTAT_DATA_UINT64 },
115 	{ "phys_writes",	KSTAT_DATA_UINT64 },
116 	{ "nwritten",		KSTAT_DATA_UINT64 },
117 	{ "nflushes",		KSTAT_DATA_UINT64 },
118 };
119 
120 kstat_t	*ufs_directio_kstatsp;
121 
122 /*
123  * use kmem_cache_create for direct-physio buffers. This has shown
124  * a better cache distribution compared to buffers on the
125  * stack. It also avoids semaphore construction/deconstruction
126  * per request
127  */
128 struct directio_buf {
129 	struct directio_buf	*next;
130 	char		*addr;
131 	size_t		nbytes;
132 	struct buf	buf;
133 };
134 static struct kmem_cache *directio_buf_cache;
135 
136 
137 /* ARGSUSED */
138 static int
139 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
140 {
141 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
142 	return (0);
143 }
144 
145 /* ARGSUSED */
146 static void
147 directio_buf_destructor(void *dbp, void *cdrarg)
148 {
149 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
150 }
151 
152 void
153 directio_bufs_init(void)
154 {
155 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
156 	    sizeof (struct directio_buf), 0,
157 	    directio_buf_constructor, directio_buf_destructor,
158 	    NULL, NULL, NULL, 0);
159 }
160 
161 void
162 ufs_directio_init(void)
163 {
164 	/*
165 	 * kstats
166 	 */
167 	ufs_directio_kstatsp = kstat_create("ufs", 0,
168 	    "directio", "ufs", KSTAT_TYPE_NAMED,
169 	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
170 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
171 	if (ufs_directio_kstatsp) {
172 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
173 		kstat_install(ufs_directio_kstatsp);
174 	}
175 	/*
176 	 * kzero is broken so we have to use a private buf of zeroes
177 	 */
178 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
179 	directio_bufs_init();
180 }
181 
182 /*
183  * Wait for the first direct IO operation to finish
184  */
185 static int
186 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
187 {
188 	buf_t	*bp;
189 	int	error;
190 
191 	/*
192 	 * Wait for IO to finish
193 	 */
194 	bp = &dbp->buf;
195 	error = biowait(bp);
196 
197 	/*
198 	 * bytes_io will be used to figure out a resid
199 	 * for the caller. The resid is approximated by reporting
200 	 * the bytes following the first failed IO as the residual.
201 	 *
202 	 * I am cautious about using b_resid because I
203 	 * am not sure how well the disk drivers maintain it.
204 	 */
205 	if (error)
206 		if (bp->b_resid)
207 			*bytes_iop = bp->b_bcount - bp->b_resid;
208 		else
209 			*bytes_iop = 0;
210 	else
211 		*bytes_iop += bp->b_bcount;
212 	/*
213 	 * Release direct IO resources
214 	 */
215 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
216 	kmem_cache_free(directio_buf_cache, dbp);
217 	return (error);
218 }
219 
220 /*
221  * Wait for all of the direct IO operations to finish
222  */
223 
224 uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */
225 
226 static int
227 directio_wait(struct directio_buf *tail, long *bytes_iop)
228 {
229 	int	error = 0, newerror;
230 	struct directio_buf	*dbp;
231 	uint_t	kpri_req_save;
232 
233 	/*
234 	 * The linked list of directio buf structures is maintained
235 	 * in reverse order (tail->last request->penultimate request->...)
236 	 */
237 	/*
238 	 * This is the k_pri_req hack. Large numbers of threads
239 	 * sleeping with kernel priority will cause scheduler thrashing
240 	 * on an MP machine. This can be seen running Oracle using
241 	 * directio to ufs files. Sleep at normal priority here to
242 	 * more closely mimic physio to a device partition. This
243 	 * workaround is disabled by default as a niced thread could
244 	 * be starved from running while holding i_rwlock and i_contents.
245 	 */
246 	if (ufs_directio_drop_kpri) {
247 		kpri_req_save = curthread->t_kpri_req;
248 		curthread->t_kpri_req = 0;
249 	}
250 	while ((dbp = tail) != NULL) {
251 		tail = dbp->next;
252 		newerror = directio_wait_one(dbp, bytes_iop);
253 		if (error == 0)
254 			error = newerror;
255 	}
256 	if (ufs_directio_drop_kpri)
257 		curthread->t_kpri_req = kpri_req_save;
258 	return (error);
259 }
260 /*
261  * Initiate direct IO request
262  */
263 static void
264 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
265 	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
266 	struct directio_buf **tailp, page_t **pplist)
267 {
268 	buf_t *bp;
269 	struct directio_buf *dbp;
270 
271 	/*
272 	 * Allocate a directio buf header
273 	 *   Note - list is maintained in reverse order.
274 	 *   directio_wait_one() depends on this fact when
275 	 *   adjusting the ``bytes_io'' param. bytes_io
276 	 *   is used to compute a residual in the case of error.
277 	 */
278 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
279 	dbp->next = *tailp;
280 	*tailp = dbp;
281 
282 	/*
283 	 * Initialize buf header
284 	 */
285 	dbp->addr = addr;
286 	dbp->nbytes = nbytes;
287 	bp = &dbp->buf;
288 	bp->b_edev = ip->i_dev;
289 	bp->b_lblkno = btodt(offset);
290 	bp->b_bcount = nbytes;
291 	bp->b_un.b_addr = addr;
292 	bp->b_proc = procp;
293 	bp->b_file = ip->i_vnode;
294 
295 	/*
296 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
297 	 * will B_READ data from the filesystem and S_WRITE it into
298 	 * the user's buffer; a write(2) will S_READ data from the
299 	 * user's buffer and B_WRITE it to the filesystem.
300 	 */
301 	if (rw == S_WRITE) {
302 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
303 		ufs_directio_kstats.phys_reads.value.ui64++;
304 		ufs_directio_kstats.nread.value.ui64 += nbytes;
305 	} else {
306 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
307 		ufs_directio_kstats.phys_writes.value.ui64++;
308 		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
309 	}
310 	bp->b_shadow = pplist;
311 	if (pplist != NULL)
312 		bp->b_flags |= B_SHADOW;
313 
314 	/*
315 	 * Issue I/O request.
316 	 */
317 	ufsvfsp->vfs_iotstamp = lbolt;
318 	if (ufsvfsp->vfs_snapshot)
319 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
320 	else
321 		(void) bdev_strategy(bp);
322 
323 	if (rw == S_WRITE)
324 		lwp_stat_update(LWP_STAT_OUBLK, 1);
325 	else
326 		lwp_stat_update(LWP_STAT_INBLK, 1);
327 
328 }
329 
/*
 * Counters maintained by ufs_directio_write().
 */
uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Testing aid: when non-zero, every rewrite is put through the POSIX
 * synchronous data integrity check, whether or not FDSYNC was requested.
 */
uint32_t	ufs_force_posix_sdi = 0;
339 
340 /*
341  * Direct Write
342  */
343 
344 int
345 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
346 	cred_t *cr, int *statusp)
347 {
348 	long		resid, bytes_written;
349 	u_offset_t	size, uoff;
350 	uio_t		*uio = arg_uio;
351 	rlim64_t	limit = uio->uio_llimit;
352 	int		on, n, error, newerror, len, has_holes;
353 	daddr_t		bn;
354 	size_t		nbytes;
355 	struct fs	*fs;
356 	vnode_t		*vp;
357 	iovec_t		*iov;
358 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
359 	struct proc	*procp;
360 	struct as	*as;
361 	struct directio_buf	*tail;
362 	int		exclusive, ncur, bmap_peek;
363 	uio_t		copy_uio;
364 	iovec_t		copy_iov;
365 	char		*copy_base;
366 	long		copy_resid;
367 
368 	/*
369 	 * assume that directio isn't possible (normal case)
370 	 */
371 	*statusp = DIRECTIO_FAILURE;
372 
373 	/*
374 	 * Don't go direct
375 	 */
376 	if (ufs_directio_enabled == 0)
377 		return (0);
378 
379 	/*
380 	 * mapped file; nevermind
381 	 */
382 	if (ip->i_mapcnt)
383 		return (0);
384 
385 	/*
386 	 * CAN WE DO DIRECT IO?
387 	 */
388 	uoff = uio->uio_loffset;
389 	resid = uio->uio_resid;
390 
391 	/*
392 	 * beyond limit
393 	 */
394 	if (uoff + resid > limit)
395 		return (0);
396 
397 	/*
398 	 * must be sector aligned
399 	 */
400 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
401 		return (0);
402 
403 	/*
404 	 * SHOULD WE DO DIRECT IO?
405 	 */
406 	size = ip->i_size;
407 	has_holes = -1;
408 
409 	/*
410 	 * only on regular files; no metadata
411 	 */
412 	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
413 		return (0);
414 
415 	/*
416 	 * Synchronous, allocating writes run very slow in Direct-Mode
417 	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
418 	 *	XXX - can be fixed for updates to "almost-full" files
419 	 *	XXX - WARNING - system hangs if bmap_write() has to
420 	 * 			allocate lots of pages since pageout
421 	 * 			suspends on locked inode
422 	 */
423 	if (!rewrite && (ip->i_flag & ISYNC)) {
424 		if ((uoff + resid) > size)
425 			return (0);
426 		has_holes = bmap_has_holes(ip);
427 		if (has_holes)
428 			return (0);
429 	}
430 
431 	/*
432 	 * Each iovec must be short aligned and sector aligned.  If
433 	 * one is not, then kmem_alloc a new buffer and copy all of
434 	 * the smaller buffers into the new buffer.  This new
435 	 * buffer will be short aligned and sector aligned.
436 	 */
437 	iov = uio->uio_iov;
438 	nbytes = uio->uio_iovcnt;
439 	while (nbytes--) {
440 		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
441 		    (intptr_t)(iov->iov_base) & 1) {
442 			copy_resid = uio->uio_resid;
443 			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
444 			if (copy_base == NULL)
445 				return (0);
446 			copy_iov.iov_base = copy_base;
447 			copy_iov.iov_len = copy_resid;
448 			copy_uio.uio_iov = &copy_iov;
449 			copy_uio.uio_iovcnt = 1;
450 			copy_uio.uio_segflg = UIO_SYSSPACE;
451 			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
452 			copy_uio.uio_loffset = uio->uio_loffset;
453 			copy_uio.uio_resid = uio->uio_resid;
454 			copy_uio.uio_llimit = uio->uio_llimit;
455 			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
456 			if (error) {
457 				kmem_free(copy_base, copy_resid);
458 				return (0);
459 			}
460 			uio = &copy_uio;
461 			break;
462 		}
463 		iov++;
464 	}
465 
466 	/*
467 	 * From here on down, all error exits must go to errout and
468 	 * not simply return a 0.
469 	 */
470 
471 	/*
472 	 * DIRECTIO
473 	 */
474 
475 	fs = ip->i_fs;
476 
477 	/*
478 	 * POSIX check. If attempting a concurrent re-write, make sure
479 	 * that this will be a single request to the driver to meet
480 	 * POSIX synchronous data integrity requirements.
481 	 */
482 	bmap_peek = 0;
483 	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
484 		int upgrade = 0;
485 
486 		/* check easy conditions first */
487 		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
488 			upgrade = 1;
489 		} else {
490 			/* now look for contiguous allocation */
491 			len = (ssize_t)blkroundup(fs, resid);
492 			error = bmap_read(ip, uoff, &bn, &len);
493 			if (error || bn == UFS_HOLE || len == 0)
494 				goto errout;
495 			/* save a call to bmap_read later */
496 			bmap_peek = 1;
497 			if (len < resid)
498 				upgrade = 1;
499 		}
500 		if (upgrade) {
501 			rw_exit(&ip->i_contents);
502 			rw_enter(&ip->i_contents, RW_WRITER);
503 			ufs_posix_hits++;
504 		}
505 	}
506 
507 
508 	/*
509 	 * allocate space
510 	 */
511 
512 	/*
513 	 * If attempting a re-write, there is no allocation to do.
514 	 * bmap_write would trip an ASSERT if i_contents is held shared.
515 	 */
516 	if (rewrite)
517 		goto skip_alloc;
518 
519 	do {
520 		on = (int)blkoff(fs, uoff);
521 		n = (int)MIN(fs->fs_bsize - on, resid);
522 		if ((uoff + n) > ip->i_size) {
523 			error = bmap_write(ip, uoff, (int)(on + n),
524 			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
525 			    NULL, cr);
526 			/* Caller is responsible for updating i_seq if needed */
527 			if (error)
528 				break;
529 			ip->i_size = uoff + n;
530 			ip->i_flag |= IATTCHG;
531 		} else if (n == MAXBSIZE) {
532 			error = bmap_write(ip, uoff, (int)(on + n),
533 			    BI_ALLOC_ONLY, NULL, cr);
534 			/* Caller is responsible for updating i_seq if needed */
535 		} else {
536 			if (has_holes < 0)
537 				has_holes = bmap_has_holes(ip);
538 			if (has_holes) {
539 				uint_t	blk_size;
540 				u_offset_t offset;
541 
542 				offset = uoff & (offset_t)fs->fs_bmask;
543 				blk_size = (int)blksize(fs, ip,
544 				    (daddr_t)lblkno(fs, offset));
545 				error = bmap_write(ip, uoff, blk_size,
546 				    BI_NORMAL, NULL, cr);
547 				/*
548 				 * Caller is responsible for updating
549 				 * i_seq if needed
550 				 */
551 			} else
552 				error = 0;
553 		}
554 		if (error)
555 			break;
556 		uoff += n;
557 		resid -= n;
558 		/*
559 		 * if file has grown larger than 2GB, set flag
560 		 * in superblock if not already set
561 		 */
562 		if ((ip->i_size > MAXOFF32_T) &&
563 		    !(fs->fs_flags & FSLARGEFILES)) {
564 			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
565 			mutex_enter(&ufsvfsp->vfs_lock);
566 			fs->fs_flags |= FSLARGEFILES;
567 			ufs_sbwrite(ufsvfsp);
568 			mutex_exit(&ufsvfsp->vfs_lock);
569 		}
570 	} while (resid);
571 
572 	if (error) {
573 		/*
574 		 * restore original state
575 		 */
576 		if (resid) {
577 			if (size == ip->i_size)
578 				goto errout;
579 			(void) ufs_itrunc(ip, size, 0, cr);
580 		}
581 		/*
582 		 * try non-directio path
583 		 */
584 		goto errout;
585 	}
586 skip_alloc:
587 
588 	/*
589 	 * get rid of cached pages
590 	 */
591 	vp = ITOV(ip);
592 	exclusive = rw_write_held(&ip->i_contents);
593 	if (vn_has_cached_data(vp)) {
594 		if (!exclusive) {
595 			/*
596 			 * Still holding i_rwlock, so no allocations
597 			 * can happen after dropping contents.
598 			 */
599 			rw_exit(&ip->i_contents);
600 			rw_enter(&ip->i_contents, RW_WRITER);
601 		}
602 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
603 		    B_INVAL, cr, NULL);
604 		if (vn_has_cached_data(vp))
605 			goto errout;
606 		if (!exclusive)
607 			rw_downgrade(&ip->i_contents);
608 		ufs_directio_kstats.nflushes.value.ui64++;
609 	}
610 
611 	/*
612 	 * Direct Writes
613 	 */
614 
615 	if (!exclusive) {
616 		ufs_shared_writes++;
617 		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
618 		if (ncur > ufs_maxcur_writes)
619 			ufs_maxcur_writes = ncur;
620 	}
621 
622 	/*
623 	 * proc and as are for VM operations in directio_start()
624 	 */
625 	if (uio->uio_segflg == UIO_USERSPACE) {
626 		procp = ttoproc(curthread);
627 		as = procp->p_as;
628 	} else {
629 		procp = NULL;
630 		as = &kas;
631 	}
632 	*statusp = DIRECTIO_SUCCESS;
633 	error = 0;
634 	newerror = 0;
635 	resid = uio->uio_resid;
636 	bytes_written = 0;
637 	ufs_directio_kstats.logical_writes.value.ui64++;
638 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
639 		size_t pglck_len, pglck_size;
640 		caddr_t pglck_base;
641 		page_t **pplist, **spplist;
642 
643 		tail = NULL;
644 
645 		/*
646 		 * Adjust number of bytes
647 		 */
648 		iov = uio->uio_iov;
649 		pglck_len = (size_t)MIN(iov->iov_len, resid);
650 		pglck_base = iov->iov_base;
651 		if (pglck_len == 0) {
652 			uio->uio_iov++;
653 			uio->uio_iovcnt--;
654 			continue;
655 		}
656 
657 		/*
658 		 * Try to Lock down the largest chunck of pages possible.
659 		 */
660 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
661 		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
662 
663 		if (error)
664 			break;
665 
666 		pglck_size = pglck_len;
667 		while (pglck_len) {
668 
669 			nbytes = pglck_len;
670 			uoff = uio->uio_loffset;
671 
672 			if (!bmap_peek) {
673 
674 				/*
675 				 * Re-adjust number of bytes to contiguous
676 				 * range. May have already called bmap_read
677 				 * in the case of a concurrent rewrite.
678 				 */
679 				len = (ssize_t)blkroundup(fs, nbytes);
680 				error = bmap_read(ip, uoff, &bn, &len);
681 				if (error)
682 					break;
683 				if (bn == UFS_HOLE || len == 0)
684 					break;
685 			}
686 			nbytes = (size_t)MIN(nbytes, len);
687 			bmap_peek = 0;
688 
689 			/*
690 			 * Get the pagelist pointer for this offset to be
691 			 * passed to directio_start.
692 			 */
693 
694 			if (pplist != NULL)
695 				spplist = pplist +
696 				    btop((uintptr_t)iov->iov_base -
697 				    ((uintptr_t)pglck_base & PAGEMASK));
698 			else
699 				spplist = NULL;
700 
701 			/*
702 			 * Kick off the direct write requests
703 			 */
704 			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
705 			    iov->iov_base, S_READ, procp, &tail, spplist);
706 
707 			/*
708 			 * Adjust pointers and counters
709 			 */
710 			iov->iov_len -= nbytes;
711 			iov->iov_base += nbytes;
712 			uio->uio_loffset += nbytes;
713 			resid -= nbytes;
714 			pglck_len -= nbytes;
715 		}
716 
717 		/*
718 		 * Wait for outstanding requests
719 		 */
720 		newerror = directio_wait(tail, &bytes_written);
721 
722 		/*
723 		 * Release VM resources
724 		 */
725 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
726 
727 	}
728 
729 	if (!exclusive) {
730 		atomic_add_32(&ufs_cur_writes, -1);
731 		/*
732 		 * If this write was done shared, readers may
733 		 * have pulled in unmodified pages. Get rid of
734 		 * these potentially stale pages.
735 		 */
736 		if (vn_has_cached_data(vp)) {
737 			rw_exit(&ip->i_contents);
738 			rw_enter(&ip->i_contents, RW_WRITER);
739 			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
740 			    B_INVAL, cr, NULL);
741 			ufs_directio_kstats.nflushes.value.ui64++;
742 			rw_downgrade(&ip->i_contents);
743 		}
744 	}
745 
746 	/*
747 	 * If error, adjust resid to begin at the first
748 	 * un-writable byte.
749 	 */
750 	if (error == 0)
751 		error = newerror;
752 	if (error)
753 		resid = uio->uio_resid - bytes_written;
754 	arg_uio->uio_resid = resid;
755 
756 	if (!rewrite) {
757 		ip->i_flag |= IUPD | ICHG;
758 		/* Caller will update i_seq */
759 		TRANS_INODE(ip->i_ufsvfs, ip);
760 	}
761 	/*
762 	 * If there is a residual; adjust the EOF if necessary
763 	 */
764 	if (resid) {
765 		if (size != ip->i_size) {
766 			if (uio->uio_loffset > size)
767 				size = uio->uio_loffset;
768 			(void) ufs_itrunc(ip, size, 0, cr);
769 		}
770 	}
771 
772 	if (uio == &copy_uio)
773 		kmem_free(copy_base, copy_resid);
774 
775 	return (error);
776 
777 errout:
778 	if (uio == &copy_uio)
779 		kmem_free(copy_base, copy_resid);
780 
781 	return (0);
782 }
783 /*
784  * Direct read of a hole
785  */
786 static int
787 directio_hole(struct uio *uio, size_t nbytes)
788 {
789 	int		error = 0, nzero;
790 	uio_t		phys_uio;
791 	iovec_t		phys_iov;
792 
793 	ufs_directio_kstats.hole_reads.value.ui64++;
794 	ufs_directio_kstats.nread.value.ui64 += nbytes;
795 
796 	phys_iov.iov_base = uio->uio_iov->iov_base;
797 	phys_iov.iov_len = nbytes;
798 
799 	phys_uio.uio_iov = &phys_iov;
800 	phys_uio.uio_iovcnt = 1;
801 	phys_uio.uio_resid = phys_iov.iov_len;
802 	phys_uio.uio_segflg = uio->uio_segflg;
803 	phys_uio.uio_extflg = uio->uio_extflg;
804 	while (error == 0 && phys_uio.uio_resid) {
805 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
806 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
807 		    &phys_uio);
808 	}
809 	return (error);
810 }
811 
812 /*
813  * Direct Read
814  */
815 int
816 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
817 {
818 	ssize_t		resid, bytes_read;
819 	u_offset_t	size, uoff;
820 	int		error, newerror, len;
821 	size_t		nbytes;
822 	struct fs	*fs;
823 	vnode_t		*vp;
824 	daddr_t		bn;
825 	iovec_t		*iov;
826 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
827 	struct proc	*procp;
828 	struct as	*as;
829 	struct directio_buf	*tail;
830 
831 	/*
832 	 * assume that directio isn't possible (normal case)
833 	 */
834 	*statusp = DIRECTIO_FAILURE;
835 
836 	/*
837 	 * Don't go direct
838 	 */
839 	if (ufs_directio_enabled == 0)
840 		return (0);
841 
842 	/*
843 	 * mapped file; nevermind
844 	 */
845 	if (ip->i_mapcnt)
846 		return (0);
847 
848 	/*
849 	 * CAN WE DO DIRECT IO?
850 	 */
851 	/*
852 	 * must be sector aligned
853 	 */
854 	uoff = uio->uio_loffset;
855 	resid = uio->uio_resid;
856 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
857 		return (0);
858 	/*
859 	 * must be short aligned and sector aligned
860 	 */
861 	iov = uio->uio_iov;
862 	nbytes = uio->uio_iovcnt;
863 	while (nbytes--) {
864 		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
865 			return (0);
866 		if ((intptr_t)(iov++->iov_base) & 1)
867 			return (0);
868 	}
869 
870 	/*
871 	 * DIRECTIO
872 	 */
873 	fs = ip->i_fs;
874 
875 	/*
876 	 * don't read past EOF
877 	 */
878 	size = ip->i_size;
879 
880 	/*
881 	 * The file offset is past EOF so bail out here; we don't want
882 	 * to update uio_resid and make it look like we read something.
883 	 * We say that direct I/O was a success to avoid having rdip()
884 	 * go through the same "read past EOF logic".
885 	 */
886 	if (uoff >= size) {
887 		*statusp = DIRECTIO_SUCCESS;
888 		return (0);
889 	}
890 
891 	/*
892 	 * The read would extend past EOF so make it smaller.
893 	 */
894 	if ((uoff + resid) > size) {
895 		resid = size - uoff;
896 		/*
897 		 * recheck sector alignment
898 		 */
899 		if (resid & (DEV_BSIZE - 1))
900 			return (0);
901 	}
902 
903 	/*
904 	 * At this point, we know there is some real work to do.
905 	 */
906 	ASSERT(resid);
907 
908 	/*
909 	 * get rid of cached pages
910 	 */
911 	vp = ITOV(ip);
912 	if (vn_has_cached_data(vp)) {
913 		rw_exit(&ip->i_contents);
914 		rw_enter(&ip->i_contents, RW_WRITER);
915 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
916 		    B_INVAL, cr, NULL);
917 		if (vn_has_cached_data(vp))
918 			return (0);
919 		rw_downgrade(&ip->i_contents);
920 		ufs_directio_kstats.nflushes.value.ui64++;
921 	}
922 	/*
923 	 * Direct Reads
924 	 */
925 
926 	/*
927 	 * proc and as are for VM operations in directio_start()
928 	 */
929 	if (uio->uio_segflg == UIO_USERSPACE) {
930 		procp = ttoproc(curthread);
931 		as = procp->p_as;
932 	} else {
933 		procp = NULL;
934 		as = &kas;
935 	}
936 
937 	*statusp = DIRECTIO_SUCCESS;
938 	error = 0;
939 	newerror = 0;
940 	bytes_read = 0;
941 	ufs_directio_kstats.logical_reads.value.ui64++;
942 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
943 		size_t pglck_len, pglck_size;
944 		caddr_t pglck_base;
945 		page_t **pplist, **spplist;
946 
947 		tail = NULL;
948 
949 		/*
950 		 * Adjust number of bytes
951 		 */
952 		iov = uio->uio_iov;
953 		pglck_len = (size_t)MIN(iov->iov_len, resid);
954 		pglck_base = iov->iov_base;
955 		if (pglck_len == 0) {
956 			uio->uio_iov++;
957 			uio->uio_iovcnt--;
958 			continue;
959 		}
960 
961 		/*
962 		 * Try to Lock down the largest chunck of pages possible.
963 		 */
964 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
965 		error = as_pagelock(as, &pplist, pglck_base,
966 		    pglck_len, S_WRITE);
967 
968 		if (error)
969 			break;
970 
971 		pglck_size = pglck_len;
972 		while (pglck_len) {
973 
974 			nbytes = pglck_len;
975 			uoff = uio->uio_loffset;
976 
977 			/*
978 			 * Re-adjust number of bytes to contiguous range
979 			 */
980 			len = (ssize_t)blkroundup(fs, nbytes);
981 			error = bmap_read(ip, uoff, &bn, &len);
982 			if (error)
983 				break;
984 
985 			if (bn == UFS_HOLE) {
986 				nbytes = (size_t)MIN(fs->fs_bsize -
987 				    (long)blkoff(fs, uoff), nbytes);
988 				error = directio_hole(uio, nbytes);
989 				/*
990 				 * Hole reads are not added to the list
991 				 * processed by directio_wait() below so
992 				 * account for bytes read here.
993 				 */
994 				if (!error)
995 					bytes_read += nbytes;
996 			} else {
997 				nbytes = (size_t)MIN(nbytes, len);
998 
999 				/*
1000 				 * Get the pagelist pointer for this offset
1001 				 * to be passed to directio_start.
1002 				 */
1003 				if (pplist != NULL)
1004 					spplist = pplist +
1005 					    btop((uintptr_t)iov->iov_base -
1006 					    ((uintptr_t)pglck_base & PAGEMASK));
1007 				else
1008 					spplist = NULL;
1009 
1010 				/*
1011 				 * Kick off the direct read requests
1012 				 */
1013 				directio_start(ufsvfsp, ip, nbytes,
1014 				    ldbtob(bn), iov->iov_base,
1015 				    S_WRITE, procp, &tail, spplist);
1016 			}
1017 
1018 			if (error)
1019 				break;
1020 
1021 			/*
1022 			 * Adjust pointers and counters
1023 			 */
1024 			iov->iov_len -= nbytes;
1025 			iov->iov_base += nbytes;
1026 			uio->uio_loffset += nbytes;
1027 			resid -= nbytes;
1028 			pglck_len -= nbytes;
1029 		}
1030 
1031 		/*
1032 		 * Wait for outstanding requests
1033 		 */
1034 		newerror = directio_wait(tail, &bytes_read);
1035 		/*
1036 		 * Release VM resources
1037 		 */
1038 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1039 
1040 	}
1041 
1042 	/*
1043 	 * If error, adjust resid to begin at the first
1044 	 * un-read byte.
1045 	 */
1046 	if (error == 0)
1047 		error = newerror;
1048 	uio->uio_resid -= bytes_read;
1049 	return (error);
1050 }
1051