xref: /titanic_44/usr/src/uts/common/fs/ufs/ufs_directio.c (revision 8ba25627aaba4ea5318b8a7588142fdcdc1c765a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/types.h>
38 #include <sys/t_lock.h>
39 #include <sys/param.h>
40 #include <sys/time.h>
41 #include <sys/systm.h>
42 #include <sys/sysmacros.h>
43 #include <sys/resource.h>
44 #include <sys/signal.h>
45 #include <sys/cred.h>
46 #include <sys/user.h>
47 #include <sys/buf.h>
48 #include <sys/vfs.h>
49 #include <sys/vnode.h>
50 #include <sys/proc.h>
51 #include <sys/disp.h>
52 #include <sys/file.h>
53 #include <sys/fcntl.h>
54 #include <sys/flock.h>
55 #include <sys/kmem.h>
56 #include <sys/uio.h>
57 #include <sys/dnlc.h>
58 #include <sys/conf.h>
59 #include <sys/mman.h>
60 #include <sys/pathname.h>
61 #include <sys/debug.h>
62 #include <sys/vmsystm.h>
63 #include <sys/cmn_err.h>
64 #include <sys/vtrace.h>
65 #include <sys/filio.h>
66 #include <sys/atomic.h>
67 
68 #include <sys/fssnap_if.h>
69 #include <sys/fs/ufs_fs.h>
70 #include <sys/fs/ufs_lockfs.h>
71 #include <sys/fs/ufs_filio.h>
72 #include <sys/fs/ufs_inode.h>
73 #include <sys/fs/ufs_fsdir.h>
74 #include <sys/fs/ufs_quota.h>
75 #include <sys/fs/ufs_trans.h>
76 #include <sys/fs/ufs_panic.h>
77 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
78 #include <sys/errno.h>
79 
80 #include <sys/filio.h>		/* _FIOIO */
81 
82 #include <vm/hat.h>
83 #include <vm/page.h>
84 #include <vm/pvn.h>
85 #include <vm/as.h>
86 #include <vm/seg.h>
87 #include <vm/seg_map.h>
88 #include <vm/seg_vn.h>
89 #include <vm/seg_kmem.h>
90 #include <vm/rm.h>
91 #include <sys/swap.h>
92 #include <sys/epm.h>
93 
94 #include <fs/fs_subr.h>
95 
/*
 * Private buffer of zeroes and its length in bytes.  The buffer is
 * allocated by ufs_directio_init() and is the data source that
 * directio_hole() copies out when a direct read lands on a hole.
 */
static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */
100 
/*
 * for kstats reader
 *
 * Named counters published by ufs_directio_init() as the "directio"
 * kstat of module "ufs".  The direct I/O paths in this file update them
 * with plain ++ / += (no atomic operations).
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;	/* direct read requests */
	kstat_named_t	phys_reads;	/* read bufs issued */
	kstat_named_t	hole_reads;	/* reads satisfied from a hole */
	kstat_named_t	nread;		/* bytes read (incl. holes) */
	kstat_named_t	logical_writes;	/* direct write requests */
	kstat_named_t	phys_writes;	/* write bufs issued */
	kstat_named_t	nwritten;	/* bytes written */
	kstat_named_t	nflushes;	/* cached-page invalidations */
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

/* kstat handle returned by kstat_create(); NULL if creation failed */
kstat_t	*ufs_directio_kstatsp;
125 
/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 *
 * One directio_buf describes one outstanding request started by
 * directio_start() and reaped by directio_wait_one().
 */
struct directio_buf {
	struct directio_buf	*next;	/* next (earlier) request on list */
	char		*addr;		/* memory address of the transfer */
	size_t		nbytes;		/* transfer length in bytes */
	struct buf	buf;		/* buf header passed to strategy */
};
/* cache of directio_buf structures; created by directio_bufs_init() */
static struct kmem_cache *directio_buf_cache;
139 
140 
141 /* ARGSUSED */
142 static int
143 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
144 {
145 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
146 	return (0);
147 }
148 
149 /* ARGSUSED */
150 static void
151 directio_buf_destructor(void *dbp, void *cdrarg)
152 {
153 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
154 }
155 
156 void
157 directio_bufs_init(void)
158 {
159 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
160 		sizeof (struct directio_buf), 0,
161 		directio_buf_constructor, directio_buf_destructor,
162 		NULL, NULL, NULL, 0);
163 }
164 
165 void
166 ufs_directio_init(void)
167 {
168 	/*
169 	 * kstats
170 	 */
171 	ufs_directio_kstatsp = kstat_create("ufs", 0,
172 	    "directio", "ufs", KSTAT_TYPE_NAMED,
173 	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
174 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
175 	if (ufs_directio_kstatsp) {
176 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
177 		kstat_install(ufs_directio_kstatsp);
178 	}
179 	/*
180 	 * kzero is broken so we have to use a private buf of zeroes
181 	 */
182 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
183 	directio_bufs_init();
184 }
185 
186 /*
187  * Wait for the first direct IO operation to finish
188  */
189 static int
190 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
191 {
192 	buf_t	*bp;
193 	int	error;
194 
195 	/*
196 	 * Wait for IO to finish
197 	 */
198 	bp = &dbp->buf;
199 	error = biowait(bp);
200 
201 	/*
202 	 * bytes_io will be used to figure out a resid
203 	 * for the caller. The resid is approximated by reporting
204 	 * the bytes following the first failed IO as the residual.
205 	 *
206 	 * I am cautious about using b_resid because I
207 	 * am not sure how well the disk drivers maintain it.
208 	 */
209 	if (error)
210 		if (bp->b_resid)
211 			*bytes_iop = bp->b_bcount - bp->b_resid;
212 		else
213 			*bytes_iop = 0;
214 	else
215 		*bytes_iop += bp->b_bcount;
216 	/*
217 	 * Release direct IO resources
218 	 */
219 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
220 	kmem_cache_free(directio_buf_cache, dbp);
221 	return (error);
222 }
223 
224 /*
225  * Wait for all of the direct IO operations to finish
226  */
227 
228 uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */
229 
230 static int
231 directio_wait(struct directio_buf *tail, long *bytes_iop)
232 {
233 	int	error = 0, newerror;
234 	struct directio_buf	*dbp;
235 	uint_t	kpri_req_save;
236 
237 	/*
238 	 * The linked list of directio buf structures is maintained
239 	 * in reverse order (tail->last request->penultimate request->...)
240 	 */
241 	/*
242 	 * This is the k_pri_req hack. Large numbers of threads
243 	 * sleeping with kernel priority will cause scheduler thrashing
244 	 * on an MP machine. This can be seen running Oracle using
245 	 * directio to ufs files. Sleep at normal priority here to
246 	 * more closely mimic physio to a device partition. This
247 	 * workaround is disabled by default as a niced thread could
248 	 * be starved from running while holding i_rwlock and i_contents.
249 	 */
250 	if (ufs_directio_drop_kpri) {
251 		kpri_req_save = curthread->t_kpri_req;
252 		curthread->t_kpri_req = 0;
253 	}
254 	while ((dbp = tail) != NULL) {
255 		tail = dbp->next;
256 		newerror = directio_wait_one(dbp, bytes_iop);
257 		if (error == 0)
258 			error = newerror;
259 	}
260 	if (ufs_directio_drop_kpri)
261 		curthread->t_kpri_req = kpri_req_save;
262 	return (error);
263 }
264 /*
265  * Initiate direct IO request
266  */
267 static void
268 directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
269 	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
270 	struct directio_buf **tailp, page_t **pplist)
271 {
272 	buf_t *bp;
273 	struct directio_buf *dbp;
274 
275 	/*
276 	 * Allocate a directio buf header
277 	 *   Note - list is maintained in reverse order.
278 	 *   directio_wait_one() depends on this fact when
279 	 *   adjusting the ``bytes_io'' param. bytes_io
280 	 *   is used to compute a residual in the case of error.
281 	 */
282 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
283 	dbp->next = *tailp;
284 	*tailp = dbp;
285 
286 	/*
287 	 * Initialize buf header
288 	 */
289 	dbp->addr = addr;
290 	dbp->nbytes = nbytes;
291 	bp = &dbp->buf;
292 	bp->b_edev = dev;
293 	bp->b_lblkno = btodt(offset);
294 	bp->b_bcount = nbytes;
295 	bp->b_un.b_addr = addr;
296 	bp->b_proc = procp;
297 
298 	/*
299 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
300 	 * will B_READ data from the filesystem and S_WRITE it into
301 	 * the user's buffer; a write(2) will S_READ data from the
302 	 * user's buffer and B_WRITE it to the filesystem.
303 	 */
304 	if (rw == S_WRITE) {
305 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
306 		ufs_directio_kstats.phys_reads.value.ui64++;
307 		ufs_directio_kstats.nread.value.ui64 += nbytes;
308 	} else {
309 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
310 		ufs_directio_kstats.phys_writes.value.ui64++;
311 		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
312 	}
313 	bp->b_shadow = pplist;
314 	if (pplist != NULL)
315 		bp->b_flags |= B_SHADOW;
316 
317 	/*
318 	 * Issue I/O request.
319 	 */
320 	ufsvfsp->vfs_iotstamp = lbolt;
321 	if (ufsvfsp->vfs_snapshot)
322 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
323 	else
324 		(void) bdev_strategy(bp);
325 
326 	if (rw == S_WRITE)
327 		lwp_stat_update(LWP_STAT_OUBLK, 1);
328 	else
329 		lwp_stat_update(LWP_STAT_INBLK, 1);
330 
331 }
332 
/*
 * Statistics on direct writes, maintained by ufs_directio_write().
 * Only ufs_cur_writes is updated atomically; the others use plain
 * stores and increments.
 */
uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done /w lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 * When non-zero, ufs_directio_write() applies its FDSYNC re-write
 * checks to every re-write.
 */
uint32_t	ufs_force_posix_sdi = 0;
342 
/*
 * Direct Write
 *
 * Try to satisfy a write by issuing I/O straight from the caller's
 * buffers to the device, bypassing the page cache.
 *
 *	ip	- inode being written
 *	arg_uio	- caller's uio; its uio_resid is updated once the direct
 *		  path has been taken
 *	ioflag	- only FDSYNC is examined here
 *	rewrite	- non-zero for a re-write; the allocation pass is then
 *		  skipped entirely
 *	cr	- credentials passed to bmap_write(), ufs_itrunc() and
 *		  VOP_PUTPAGE()
 *	statusp	- set to DIRECTIO_SUCCESS once the direct path has been
 *		  taken, DIRECTIO_FAILURE otherwise
 *
 * Returns 0 with *statusp == DIRECTIO_FAILURE when direct I/O was not
 * done.  Otherwise returns the first error from as_pagelock(),
 * bmap_read() or the I/O itself, or 0 on success.
 *
 * ip->i_contents is dropped and re-acquired (as writer) in several
 * places below, so the caller must hold it on entry.
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
	cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	/* bounce uio/iovec/buffer, used only for misaligned iovecs */
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;	/* original EOF, used to undo growth */
	has_holes = -1;		/* -1: bmap_has_holes() not yet asked */

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 * 			allocate lots of pages since pageout
	 * 			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned.  If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer.  This new
	 * buffer will be short aligned and sector aligned.
	 *
	 * nbytes is borrowed here as the iovec countdown.  After the
	 * copy, `uio' refers to copy_uio (a single UIO_SYSSPACE iovec)
	 * while arg_uio remains the caller's uio.
	 *
	 * NOTE(review): the uiomove() below operates on the caller's
	 * uio, so on its failure, and on any later errout, the caller's
	 * uio has presumably already been advanced - confirm that the
	 * non-direct fallback copes with that.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 * (errout frees the bounce buffer when one was allocated.)
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 *
	 * If that cannot be guaranteed, i_contents is re-acquired as
	 * writer instead.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}


	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	/*
	 * Walk the request one file system block at a time (on is the
	 * offset within the block, n the bytes of this request in it),
	 * allocating as needed.  uoff and resid are consumed here and
	 * reloaded from the uio before the I/O loop.
	 */
	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			/* extending the file */
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			/* whole MAXBSIZE block inside the file */
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			/* partial block inside the file: fill any hole */
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 * (truncate back to the original EOF if the file grew)
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 *
	 * `exclusive' records how i_contents is held at this point; when
	 * it is held shared the lock is taken as writer for the
	 * invalidation and downgraded again afterwards.
	 *
	 * NOTE(review): the errout below is taken with i_contents still
	 * held as writer even when it was shared on entry - presumably
	 * the caller's rw_exit() copes; confirm.
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	/* committed to the direct path: errors are now returned as-is */
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes.value.ui64++;
	/*
	 * Outer loop: one locked-down chunk of the current iovec per
	 * pass.  `error' comes from as_pagelock()/bmap_read(), `newerror'
	 * from the I/O itself.
	 */
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			/* this iovec is used up; move to the next one */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to Lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		/* remember the locked length for as_pageunlock() */
		pglck_size = pglck_len;
		/*
		 * Inner loop: one request per contiguous on-disk range
		 * within the locked chunk.
		 */
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				/*
				 * NOTE(review): this break leaves error
				 * at 0 with no progress made; it appears
				 * to rely on the range having been
				 * allocated already - confirm.
				 */
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

			if (pplist != NULL)
				spplist = pplist +
				btop((uintptr_t)iov->iov_base -
					((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
				iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_add_32(&ufs_cur_writes, -1);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
				B_INVAL, cr);
			ufs_directio_kstats.nflushes.value.ui64++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	/* report the residual through the caller's uio, not the copy */
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual; adjust the EOF if necessary
	 * (truncate back to the larger of the original EOF and the
	 * offset actually reached)
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	/*
	 * Direct I/O was not done (*statusp is still DIRECTIO_FAILURE);
	 * free the bounce buffer if one was allocated.
	 */
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}
785 /*
786  * Direct read of a hole
787  */
788 static int
789 directio_hole(struct uio *uio, size_t nbytes)
790 {
791 	int		error = 0, nzero;
792 	uio_t		phys_uio;
793 	iovec_t		phys_iov;
794 
795 	ufs_directio_kstats.hole_reads.value.ui64++;
796 	ufs_directio_kstats.nread.value.ui64 += nbytes;
797 
798 	phys_iov.iov_base = uio->uio_iov->iov_base;
799 	phys_iov.iov_len = nbytes;
800 
801 	phys_uio.uio_iov = &phys_iov;
802 	phys_uio.uio_iovcnt = 1;
803 	phys_uio.uio_resid = phys_iov.iov_len;
804 	phys_uio.uio_segflg = uio->uio_segflg;
805 	phys_uio.uio_extflg = uio->uio_extflg;
806 	while (error == 0 && phys_uio.uio_resid) {
807 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
808 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
809 				&phys_uio);
810 	}
811 	return (error);
812 }
813 
/*
 * Direct Read
 *
 * Try to satisfy a read by issuing I/O straight from the device into
 * the caller's buffers, bypassing the page cache.  Holes are filled
 * with zeroes by directio_hole().
 *
 *	ip	- inode being read
 *	uio	- caller's uio; its iovecs and uio_loffset are advanced
 *		  as requests are started and uio_resid is reduced by
 *		  the number of bytes read
 *	cr	- credentials passed to VOP_PUTPAGE()
 *	statusp	- set to DIRECTIO_SUCCESS when the direct path was
 *		  taken (including a read starting at or past EOF),
 *		  DIRECTIO_FAILURE otherwise
 *
 * Returns 0 with *statusp == DIRECTIO_FAILURE when direct I/O was not
 * done.  Otherwise returns the first error from as_pagelock(),
 * bmap_read(), directio_hole() or the I/O itself, or 0 on success.
 *
 * ip->i_contents may be dropped and re-acquired as writer below, so
 * the caller must hold it on entry.
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 * (nbytes is borrowed here as the iovec countdown; unlike the
	 * write path there is no bounce buffer for misaligned iovecs)
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 *
	 * i_contents is re-acquired as writer for the invalidation and
	 * downgraded again afterwards.
	 *
	 * NOTE(review): the return below leaves i_contents held as
	 * writer - presumably the caller's rw_exit() copes; confirm.
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	/* committed to the direct path: errors are now returned as-is */
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	/*
	 * Outer loop: one locked-down chunk of the current iovec per
	 * pass.  `error' comes from as_pagelock(), bmap_read() and
	 * directio_hole(); `newerror' from the I/O itself.
	 */
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			/* this iovec is used up; move to the next one */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to Lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
							pglck_len, S_WRITE);

		if (error)
			break;

		/* remember the locked length for as_pageunlock() */
		pglck_size = pglck_len;
		/*
		 * Inner loop: one request (or one hole fill) per
		 * contiguous range within the locked chunk.
		 */
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				/* zero-fill at most to the end of the block */
				nbytes = (size_t)MIN(fs->fs_bsize -
						(long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					btop((uintptr_t)iov->iov_base -
					((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip->i_dev, nbytes,
						ldbtob(bn), iov->iov_base,
						S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}
1052