xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_directio.c (revision 30ef842d708d30553d7fbc8348a381664ef62a73)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/types.h>
38 #include <sys/t_lock.h>
39 #include <sys/param.h>
40 #include <sys/time.h>
41 #include <sys/systm.h>
42 #include <sys/sysmacros.h>
43 #include <sys/resource.h>
44 #include <sys/signal.h>
45 #include <sys/cred.h>
46 #include <sys/user.h>
47 #include <sys/buf.h>
48 #include <sys/vfs.h>
49 #include <sys/vnode.h>
50 #include <sys/proc.h>
51 #include <sys/disp.h>
52 #include <sys/file.h>
53 #include <sys/fcntl.h>
54 #include <sys/flock.h>
55 #include <sys/kmem.h>
56 #include <sys/uio.h>
57 #include <sys/dnlc.h>
58 #include <sys/conf.h>
59 #include <sys/mman.h>
60 #include <sys/pathname.h>
61 #include <sys/debug.h>
62 #include <sys/vmsystm.h>
63 #include <sys/cmn_err.h>
64 #include <sys/vtrace.h>
65 #include <sys/filio.h>
66 #include <sys/atomic.h>
67 
68 #include <sys/fssnap_if.h>
69 #include <sys/fs/ufs_fs.h>
70 #include <sys/fs/ufs_lockfs.h>
71 #include <sys/fs/ufs_filio.h>
72 #include <sys/fs/ufs_inode.h>
73 #include <sys/fs/ufs_fsdir.h>
74 #include <sys/fs/ufs_quota.h>
75 #include <sys/fs/ufs_trans.h>
76 #include <sys/fs/ufs_panic.h>
77 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
78 #include <sys/errno.h>
79 
80 #include <sys/filio.h>		/* _FIOIO */
81 
82 #include <vm/hat.h>
83 #include <vm/page.h>
84 #include <vm/pvn.h>
85 #include <vm/as.h>
86 #include <vm/seg.h>
87 #include <vm/seg_map.h>
88 #include <vm/seg_vn.h>
89 #include <vm/seg_kmem.h>
90 #include <vm/rm.h>
91 #include <sys/swap.h>
92 #include <sys/epm.h>
93 
94 #include <fs/fs_subr.h>
95 
/*
 * Pre-zeroed kernel buffer, and its size in bytes.  It is allocated by
 * ufs_directio_init() and used by directio_hole() as the data source when
 * a direct read covers a hole in the file.
 */
static void	*ufs_directio_zero_buf = NULL;
static int	ufs_directio_zero_len = 8192;

/*
 * Global switch for the feature.  When zero, ufs_directio_read() and
 * ufs_directio_write() decline every request.
 */
int		ufs_directio_enabled = 1;
100 
101 /*
102  * for kstats reader
103  */
104 struct ufs_directio_kstats {
105 	uint_t	logical_reads;
106 	uint_t	phys_reads;
107 	uint_t	hole_reads;
108 	uint_t	nread;
109 	uint_t	logical_writes;
110 	uint_t	phys_writes;
111 	uint_t	nwritten;
112 	uint_t	nflushes;
113 } ufs_directio_kstats;
114 
115 kstat_t	*ufs_directio_kstatsp;
116 
117 /*
118  * use kmem_cache_create for direct-physio buffers. This has shown
119  * a better cache distribution compared to buffers on the
120  * stack. It also avoids semaphore construction/deconstruction
121  * per request
122  */
123 struct directio_buf {
124 	struct directio_buf	*next;
125 	char		*addr;
126 	size_t		nbytes;
127 	struct buf	buf;
128 };
129 static struct kmem_cache *directio_buf_cache;
130 
131 
132 /* ARGSUSED */
133 static int
134 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
135 {
136 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
137 	return (0);
138 }
139 
140 /* ARGSUSED */
141 static void
142 directio_buf_destructor(void *dbp, void *cdrarg)
143 {
144 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
145 }
146 
147 void
148 directio_bufs_init(void)
149 {
150 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
151 		sizeof (struct directio_buf), 0,
152 		directio_buf_constructor, directio_buf_destructor,
153 		NULL, NULL, NULL, 0);
154 }
155 
156 void
157 ufs_directio_init(void)
158 {
159 	/*
160 	 * kstats
161 	 */
162 	ufs_directio_kstatsp = kstat_create("ufs directio", 0,
163 			"UFS DirectIO Stats", "ufs directio",
164 			KSTAT_TYPE_RAW, sizeof (ufs_directio_kstats),
165 			KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
166 	if (ufs_directio_kstatsp) {
167 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
168 		kstat_install(ufs_directio_kstatsp);
169 	}
170 	/*
171 	 * kzero is broken so we have to use a private buf of zeroes
172 	 */
173 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
174 	directio_bufs_init();
175 }
176 
177 /*
178  * Wait for the first direct IO operation to finish
179  */
180 static int
181 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
182 {
183 	buf_t	*bp;
184 	int	error;
185 
186 	/*
187 	 * Wait for IO to finish
188 	 */
189 	bp = &dbp->buf;
190 	error = biowait(bp);
191 
192 	/*
193 	 * bytes_io will be used to figure out a resid
194 	 * for the caller. The resid is approximated by reporting
195 	 * the bytes following the first failed IO as the residual.
196 	 *
197 	 * I am cautious about using b_resid because I
198 	 * am not sure how well the disk drivers maintain it.
199 	 */
200 	if (error)
201 		if (bp->b_resid)
202 			*bytes_iop = bp->b_bcount - bp->b_resid;
203 		else
204 			*bytes_iop = 0;
205 	else
206 		*bytes_iop += bp->b_bcount;
207 	/*
208 	 * Release direct IO resources
209 	 */
210 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
211 	kmem_cache_free(directio_buf_cache, dbp);
212 	return (error);
213 }
214 
215 /*
216  * Wait for all of the direct IO operations to finish
217  */
218 
219 uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */
220 
221 static int
222 directio_wait(struct directio_buf *tail, long *bytes_iop)
223 {
224 	int	error = 0, newerror;
225 	struct directio_buf	*dbp;
226 	uint_t	kpri_req_save;
227 
228 	/*
229 	 * The linked list of directio buf structures is maintained
230 	 * in reverse order (tail->last request->penultimate request->...)
231 	 */
232 	/*
233 	 * This is the k_pri_req hack. Large numbers of threads
234 	 * sleeping with kernel priority will cause scheduler thrashing
235 	 * on an MP machine. This can be seen running Oracle using
236 	 * directio to ufs files. Sleep at normal priority here to
237 	 * more closely mimic physio to a device partition. This
238 	 * workaround is disabled by default as a niced thread could
239 	 * be starved from running while holding i_rwlock and i_contents.
240 	 */
241 	if (ufs_directio_drop_kpri) {
242 		kpri_req_save = curthread->t_kpri_req;
243 		curthread->t_kpri_req = 0;
244 	}
245 	while ((dbp = tail) != NULL) {
246 		tail = dbp->next;
247 		newerror = directio_wait_one(dbp, bytes_iop);
248 		if (error == 0)
249 			error = newerror;
250 	}
251 	if (ufs_directio_drop_kpri)
252 		curthread->t_kpri_req = kpri_req_save;
253 	return (error);
254 }
255 /*
256  * Initiate direct IO request
257  */
258 static void
259 directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
260 	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
261 	struct directio_buf **tailp, page_t **pplist)
262 {
263 	buf_t *bp;
264 	struct directio_buf *dbp;
265 
266 	/*
267 	 * Allocate a directio buf header
268 	 *   Note - list is maintained in reverse order.
269 	 *   directio_wait_one() depends on this fact when
270 	 *   adjusting the ``bytes_io'' param. bytes_io
271 	 *   is used to compute a residual in the case of error.
272 	 */
273 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
274 	dbp->next = *tailp;
275 	*tailp = dbp;
276 
277 	/*
278 	 * Initialize buf header
279 	 */
280 	dbp->addr = addr;
281 	dbp->nbytes = nbytes;
282 	bp = &dbp->buf;
283 	bp->b_edev = dev;
284 	bp->b_lblkno = btodt(offset);
285 	bp->b_bcount = nbytes;
286 	bp->b_un.b_addr = addr;
287 	bp->b_proc = procp;
288 
289 	/*
290 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
291 	 * will B_READ data from the filesystem and S_WRITE it into
292 	 * the user's buffer; a write(2) will S_READ data from the
293 	 * user's buffer and B_WRITE it to the filesystem.
294 	 */
295 	if (rw == S_WRITE) {
296 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
297 		ufs_directio_kstats.phys_reads++;
298 		ufs_directio_kstats.nread += nbytes;
299 	} else {
300 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
301 		ufs_directio_kstats.phys_writes++;
302 		ufs_directio_kstats.nwritten += nbytes;
303 	}
304 	bp->b_shadow = pplist;
305 	if (pplist != NULL)
306 		bp->b_flags |= B_SHADOW;
307 
308 	/*
309 	 * Issue I/O request.
310 	 */
311 	ufsvfsp->vfs_iotstamp = lbolt;
312 	if (ufsvfsp->vfs_snapshot)
313 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
314 	else
315 		(void) bdev_strategy(bp);
316 
317 	if (rw == S_WRITE)
318 		lwp_stat_update(LWP_STAT_OUBLK, 1);
319 	else
320 		lwp_stat_update(LWP_STAT_INBLK, 1);
321 
322 }
323 
/*
 * Write-side statistics maintained by ufs_directio_write().
 */
uint32_t	ufs_shared_writes = 0;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes = 0;	/* # concurrent writes */
uint32_t	ufs_maxcur_writes = 0;	/* high water concurrent writes */
uint32_t	ufs_posix_hits = 0;	/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;
333 
334 /*
335  * Direct Write
336  */
337 
338 int
339 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
340 	cred_t *cr, int *statusp)
341 {
342 	long		resid, bytes_written;
343 	u_offset_t	size, uoff;
344 	uio_t		*uio = arg_uio;
345 	rlim64_t	limit = uio->uio_llimit;
346 	int		on, n, error, newerror, len, has_holes;
347 	daddr_t		bn;
348 	size_t		nbytes;
349 	struct fs	*fs;
350 	vnode_t		*vp;
351 	iovec_t		*iov;
352 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
353 	struct proc	*procp;
354 	struct as	*as;
355 	struct directio_buf	*tail;
356 	int		exclusive, ncur, bmap_peek;
357 	uio_t		copy_uio;
358 	iovec_t		copy_iov;
359 	char		*copy_base;
360 	long		copy_resid;
361 
362 	/*
363 	 * assume that directio isn't possible (normal case)
364 	 */
365 	*statusp = DIRECTIO_FAILURE;
366 
367 	/*
368 	 * Don't go direct
369 	 */
370 	if (ufs_directio_enabled == 0)
371 		return (0);
372 
373 	/*
374 	 * mapped file; nevermind
375 	 */
376 	if (ip->i_mapcnt)
377 		return (0);
378 
379 	/*
380 	 * CAN WE DO DIRECT IO?
381 	 */
382 	uoff = uio->uio_loffset;
383 	resid = uio->uio_resid;
384 
385 	/*
386 	 * beyond limit
387 	 */
388 	if (uoff + resid > limit)
389 		return (0);
390 
391 	/*
392 	 * must be sector aligned
393 	 */
394 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
395 		return (0);
396 
397 	/*
398 	 * SHOULD WE DO DIRECT IO?
399 	 */
400 	size = ip->i_size;
401 	has_holes = -1;
402 
403 	/*
404 	 * only on regular files; no metadata
405 	 */
406 	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
407 		return (0);
408 
409 	/*
410 	 * Synchronous, allocating writes run very slow in Direct-Mode
411 	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
412 	 *	XXX - can be fixed for updates to "almost-full" files
413 	 *	XXX - WARNING - system hangs if bmap_write() has to
414 	 * 			allocate lots of pages since pageout
415 	 * 			suspends on locked inode
416 	 */
417 	if (!rewrite && (ip->i_flag & ISYNC)) {
418 		if ((uoff + resid) > size)
419 			return (0);
420 		has_holes = bmap_has_holes(ip);
421 		if (has_holes)
422 			return (0);
423 	}
424 
425 	/*
426 	 * Each iovec must be short aligned and sector aligned.  If
427 	 * one is not, then kmem_alloc a new buffer and copy all of
428 	 * the smaller buffers into the new buffer.  This new
429 	 * buffer will be short aligned and sector aligned.
430 	 */
431 	iov = uio->uio_iov;
432 	nbytes = uio->uio_iovcnt;
433 	while (nbytes--) {
434 		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
435 		    (intptr_t)(iov->iov_base) & 1) {
436 			copy_resid = uio->uio_resid;
437 			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
438 			if (copy_base == NULL)
439 				return (0);
440 			copy_iov.iov_base = copy_base;
441 			copy_iov.iov_len = copy_resid;
442 			copy_uio.uio_iov = &copy_iov;
443 			copy_uio.uio_iovcnt = 1;
444 			copy_uio.uio_segflg = UIO_SYSSPACE;
445 			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
446 			copy_uio.uio_loffset = uio->uio_loffset;
447 			copy_uio.uio_resid = uio->uio_resid;
448 			copy_uio.uio_llimit = uio->uio_llimit;
449 			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
450 			if (error) {
451 				kmem_free(copy_base, copy_resid);
452 				return (0);
453 			}
454 			uio = &copy_uio;
455 			break;
456 		}
457 		iov++;
458 	}
459 
460 	/*
461 	 * From here on down, all error exits must go to errout and
462 	 * not simply return a 0.
463 	 */
464 
465 	/*
466 	 * DIRECTIO
467 	 */
468 
469 	fs = ip->i_fs;
470 
471 	/*
472 	 * POSIX check. If attempting a concurrent re-write, make sure
473 	 * that this will be a single request to the driver to meet
474 	 * POSIX synchronous data integrity requirements.
475 	 */
476 	bmap_peek = 0;
477 	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
478 		int upgrade = 0;
479 
480 		/* check easy conditions first */
481 		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
482 			upgrade = 1;
483 		} else {
484 			/* now look for contiguous allocation */
485 			len = (ssize_t)blkroundup(fs, resid);
486 			error = bmap_read(ip, uoff, &bn, &len);
487 			if (error || bn == UFS_HOLE || len == 0)
488 				goto errout;
489 			/* save a call to bmap_read later */
490 			bmap_peek = 1;
491 			if (len < resid)
492 				upgrade = 1;
493 		}
494 		if (upgrade) {
495 			rw_exit(&ip->i_contents);
496 			rw_enter(&ip->i_contents, RW_WRITER);
497 			ufs_posix_hits++;
498 		}
499 	}
500 
501 
502 	/*
503 	 * allocate space
504 	 */
505 
506 	/*
507 	 * If attempting a re-write, there is no allocation to do.
508 	 * bmap_write would trip an ASSERT if i_contents is held shared.
509 	 */
510 	if (rewrite)
511 		goto skip_alloc;
512 
513 	do {
514 		on = (int)blkoff(fs, uoff);
515 		n = (int)MIN(fs->fs_bsize - on, resid);
516 		if ((uoff + n) > ip->i_size) {
517 			error = bmap_write(ip, uoff, (int)(on + n),
518 			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
519 			    NULL, cr);
520 			/* Caller is responsible for updating i_seq if needed */
521 			if (error)
522 				break;
523 			ip->i_size = uoff + n;
524 			ip->i_flag |= IATTCHG;
525 		} else if (n == MAXBSIZE) {
526 			error = bmap_write(ip, uoff, (int)(on + n),
527 			    BI_ALLOC_ONLY, NULL, cr);
528 			/* Caller is responsible for updating i_seq if needed */
529 		} else {
530 			if (has_holes < 0)
531 				has_holes = bmap_has_holes(ip);
532 			if (has_holes) {
533 				uint_t	blk_size;
534 				u_offset_t offset;
535 
536 				offset = uoff & (offset_t)fs->fs_bmask;
537 				blk_size = (int)blksize(fs, ip,
538 				    (daddr_t)lblkno(fs, offset));
539 				error = bmap_write(ip, uoff, blk_size,
540 				    BI_NORMAL, NULL, cr);
541 				/*
542 				 * Caller is responsible for updating
543 				 * i_seq if needed
544 				 */
545 			} else
546 				error = 0;
547 		}
548 		if (error)
549 			break;
550 		uoff += n;
551 		resid -= n;
552 		/*
553 		 * if file has grown larger than 2GB, set flag
554 		 * in superblock if not already set
555 		 */
556 		if ((ip->i_size > MAXOFF32_T) &&
557 		    !(fs->fs_flags & FSLARGEFILES)) {
558 			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
559 			mutex_enter(&ufsvfsp->vfs_lock);
560 			fs->fs_flags |= FSLARGEFILES;
561 			ufs_sbwrite(ufsvfsp);
562 			mutex_exit(&ufsvfsp->vfs_lock);
563 		}
564 	} while (resid);
565 
566 	if (error) {
567 		/*
568 		 * restore original state
569 		 */
570 		if (resid) {
571 			if (size == ip->i_size)
572 				goto errout;
573 			(void) ufs_itrunc(ip, size, 0, cr);
574 		}
575 		/*
576 		 * try non-directio path
577 		 */
578 		goto errout;
579 	}
580 skip_alloc:
581 
582 	/*
583 	 * get rid of cached pages
584 	 */
585 	vp = ITOV(ip);
586 	exclusive = rw_write_held(&ip->i_contents);
587 	if (vn_has_cached_data(vp)) {
588 		if (!exclusive) {
589 			/*
590 			 * Still holding i_rwlock, so no allocations
591 			 * can happen after dropping contents.
592 			 */
593 			rw_exit(&ip->i_contents);
594 			rw_enter(&ip->i_contents, RW_WRITER);
595 		}
596 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
597 		if (vn_has_cached_data(vp))
598 			goto errout;
599 		if (!exclusive)
600 			rw_downgrade(&ip->i_contents);
601 		ufs_directio_kstats.nflushes++;
602 	}
603 
604 	/*
605 	 * Direct Writes
606 	 */
607 
608 	if (!exclusive) {
609 		ufs_shared_writes++;
610 		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
611 		if (ncur > ufs_maxcur_writes)
612 			ufs_maxcur_writes = ncur;
613 	}
614 
615 	/*
616 	 * proc and as are for VM operations in directio_start()
617 	 */
618 	if (uio->uio_segflg == UIO_USERSPACE) {
619 		procp = ttoproc(curthread);
620 		as = procp->p_as;
621 	} else {
622 		procp = NULL;
623 		as = &kas;
624 	}
625 	*statusp = DIRECTIO_SUCCESS;
626 	error = 0;
627 	newerror = 0;
628 	resid = uio->uio_resid;
629 	bytes_written = 0;
630 	ufs_directio_kstats.logical_writes++;
631 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
632 		size_t pglck_len, pglck_size;
633 		caddr_t pglck_base;
634 		page_t **pplist, **spplist;
635 
636 		tail = NULL;
637 
638 		/*
639 		 * Adjust number of bytes
640 		 */
641 		iov = uio->uio_iov;
642 		pglck_len = (size_t)MIN(iov->iov_len, resid);
643 		pglck_base = iov->iov_base;
644 		if (pglck_len == 0) {
645 			uio->uio_iov++;
646 			uio->uio_iovcnt--;
647 			continue;
648 		}
649 
650 		/*
651 		 * Try to Lock down the largest chunck of pages possible.
652 		 */
653 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
654 		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
655 
656 		if (error)
657 			break;
658 
659 		pglck_size = pglck_len;
660 		while (pglck_len) {
661 
662 			nbytes = pglck_len;
663 			uoff = uio->uio_loffset;
664 
665 			if (!bmap_peek) {
666 
667 				/*
668 				 * Re-adjust number of bytes to contiguous
669 				 * range. May have already called bmap_read
670 				 * in the case of a concurrent rewrite.
671 				 */
672 				len = (ssize_t)blkroundup(fs, nbytes);
673 				error = bmap_read(ip, uoff, &bn, &len);
674 				if (error)
675 					break;
676 				if (bn == UFS_HOLE || len == 0)
677 					break;
678 			}
679 			nbytes = (size_t)MIN(nbytes, len);
680 			bmap_peek = 0;
681 
682 			/*
683 			 * Get the pagelist pointer for this offset to be
684 			 * passed to directio_start.
685 			 */
686 
687 			if (pplist != NULL)
688 				spplist = pplist +
689 				btop((uintptr_t)iov->iov_base -
690 					((uintptr_t)pglck_base & PAGEMASK));
691 			else
692 				spplist = NULL;
693 
694 			/*
695 			 * Kick off the direct write requests
696 			 */
697 			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
698 				iov->iov_base, S_READ, procp, &tail, spplist);
699 
700 			/*
701 			 * Adjust pointers and counters
702 			 */
703 			iov->iov_len -= nbytes;
704 			iov->iov_base += nbytes;
705 			uio->uio_loffset += nbytes;
706 			resid -= nbytes;
707 			pglck_len -= nbytes;
708 		}
709 
710 		/*
711 		 * Wait for outstanding requests
712 		 */
713 		newerror = directio_wait(tail, &bytes_written);
714 
715 		/*
716 		 * Release VM resources
717 		 */
718 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
719 
720 	}
721 
722 	if (!exclusive) {
723 		atomic_add_32(&ufs_cur_writes, -1);
724 		/*
725 		 * If this write was done shared, readers may
726 		 * have pulled in unmodified pages. Get rid of
727 		 * these potentially stale pages.
728 		 */
729 		if (vn_has_cached_data(vp)) {
730 			rw_exit(&ip->i_contents);
731 			rw_enter(&ip->i_contents, RW_WRITER);
732 			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
733 				B_INVAL, cr);
734 			ufs_directio_kstats.nflushes++;
735 			rw_downgrade(&ip->i_contents);
736 		}
737 	}
738 
739 	/*
740 	 * If error, adjust resid to begin at the first
741 	 * un-writable byte.
742 	 */
743 	if (error == 0)
744 		error = newerror;
745 	if (error)
746 		resid = uio->uio_resid - bytes_written;
747 	arg_uio->uio_resid = resid;
748 
749 	if (!rewrite) {
750 		ip->i_flag |= IUPD | ICHG;
751 		/* Caller will update i_seq */
752 		TRANS_INODE(ip->i_ufsvfs, ip);
753 	}
754 	/*
755 	 * If there is a residual; adjust the EOF if necessary
756 	 */
757 	if (resid) {
758 		if (size != ip->i_size) {
759 			if (uio->uio_loffset > size)
760 				size = uio->uio_loffset;
761 			(void) ufs_itrunc(ip, size, 0, cr);
762 		}
763 	}
764 
765 	if (uio == &copy_uio)
766 		kmem_free(copy_base, copy_resid);
767 
768 	return (error);
769 
770 errout:
771 	if (uio == &copy_uio)
772 		kmem_free(copy_base, copy_resid);
773 
774 	return (0);
775 }
776 /*
777  * Direct read of a hole
778  */
779 static int
780 directio_hole(struct uio *uio, size_t nbytes)
781 {
782 	int		error = 0, nzero;
783 	uio_t		phys_uio;
784 	iovec_t		phys_iov;
785 
786 	ufs_directio_kstats.hole_reads++;
787 	ufs_directio_kstats.nread += nbytes;
788 
789 	phys_iov.iov_base = uio->uio_iov->iov_base;
790 	phys_iov.iov_len = nbytes;
791 
792 	phys_uio.uio_iov = &phys_iov;
793 	phys_uio.uio_iovcnt = 1;
794 	phys_uio.uio_resid = phys_iov.iov_len;
795 	phys_uio.uio_segflg = uio->uio_segflg;
796 	phys_uio.uio_extflg = uio->uio_extflg;
797 	while (error == 0 && phys_uio.uio_resid) {
798 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
799 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
800 				&phys_uio);
801 	}
802 	return (error);
803 }
804 
805 /*
806  * Direct Read
807  */
808 int
809 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
810 {
811 	ssize_t		resid, bytes_read;
812 	u_offset_t	size, uoff;
813 	int		error, newerror, len;
814 	size_t		nbytes;
815 	struct fs	*fs;
816 	vnode_t		*vp;
817 	daddr_t		bn;
818 	iovec_t		*iov;
819 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
820 	struct proc	*procp;
821 	struct as	*as;
822 	struct directio_buf	*tail;
823 
824 	/*
825 	 * assume that directio isn't possible (normal case)
826 	 */
827 	*statusp = DIRECTIO_FAILURE;
828 
829 	/*
830 	 * Don't go direct
831 	 */
832 	if (ufs_directio_enabled == 0)
833 		return (0);
834 
835 	/*
836 	 * mapped file; nevermind
837 	 */
838 	if (ip->i_mapcnt)
839 		return (0);
840 
841 	/*
842 	 * CAN WE DO DIRECT IO?
843 	 */
844 	/*
845 	 * must be sector aligned
846 	 */
847 	uoff = uio->uio_loffset;
848 	resid = uio->uio_resid;
849 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
850 		return (0);
851 	/*
852 	 * must be short aligned and sector aligned
853 	 */
854 	iov = uio->uio_iov;
855 	nbytes = uio->uio_iovcnt;
856 	while (nbytes--) {
857 		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
858 			return (0);
859 		if ((intptr_t)(iov++->iov_base) & 1)
860 			return (0);
861 	}
862 
863 	/*
864 	 * DIRECTIO
865 	 */
866 	fs = ip->i_fs;
867 
868 	/*
869 	 * don't read past EOF
870 	 */
871 	size = ip->i_size;
872 
873 	/*
874 	 * The file offset is past EOF so bail out here; we don't want
875 	 * to update uio_resid and make it look like we read something.
876 	 * We say that direct I/O was a success to avoid having rdip()
877 	 * go through the same "read past EOF logic".
878 	 */
879 	if (uoff >= size) {
880 		*statusp = DIRECTIO_SUCCESS;
881 		return (0);
882 	}
883 
884 	/*
885 	 * The read would extend past EOF so make it smaller.
886 	 */
887 	if ((uoff + resid) > size) {
888 		resid = size - uoff;
889 		/*
890 		 * recheck sector alignment
891 		 */
892 		if (resid & (DEV_BSIZE - 1))
893 			return (0);
894 	}
895 
896 	/*
897 	 * At this point, we know there is some real work to do.
898 	 */
899 	ASSERT(resid);
900 
901 	/*
902 	 * get rid of cached pages
903 	 */
904 	vp = ITOV(ip);
905 	if (vn_has_cached_data(vp)) {
906 		rw_exit(&ip->i_contents);
907 		rw_enter(&ip->i_contents, RW_WRITER);
908 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
909 		if (vn_has_cached_data(vp))
910 			return (0);
911 		rw_downgrade(&ip->i_contents);
912 		ufs_directio_kstats.nflushes++;
913 	}
914 	/*
915 	 * Direct Reads
916 	 */
917 
918 	/*
919 	 * proc and as are for VM operations in directio_start()
920 	 */
921 	if (uio->uio_segflg == UIO_USERSPACE) {
922 		procp = ttoproc(curthread);
923 		as = procp->p_as;
924 	} else {
925 		procp = NULL;
926 		as = &kas;
927 	}
928 
929 	*statusp = DIRECTIO_SUCCESS;
930 	error = 0;
931 	newerror = 0;
932 	bytes_read = 0;
933 	ufs_directio_kstats.logical_reads++;
934 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
935 		size_t pglck_len, pglck_size;
936 		caddr_t pglck_base;
937 		page_t **pplist, **spplist;
938 
939 		tail = NULL;
940 
941 		/*
942 		 * Adjust number of bytes
943 		 */
944 		iov = uio->uio_iov;
945 		pglck_len = (size_t)MIN(iov->iov_len, resid);
946 		pglck_base = iov->iov_base;
947 		if (pglck_len == 0) {
948 			uio->uio_iov++;
949 			uio->uio_iovcnt--;
950 			continue;
951 		}
952 
953 		/*
954 		 * Try to Lock down the largest chunck of pages possible.
955 		 */
956 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
957 		error = as_pagelock(as, &pplist, pglck_base,
958 							pglck_len, S_WRITE);
959 
960 		if (error)
961 			break;
962 
963 		pglck_size = pglck_len;
964 		while (pglck_len) {
965 
966 			nbytes = pglck_len;
967 			uoff = uio->uio_loffset;
968 
969 			/*
970 			 * Re-adjust number of bytes to contiguous range
971 			 */
972 			len = (ssize_t)blkroundup(fs, nbytes);
973 			error = bmap_read(ip, uoff, &bn, &len);
974 			if (error)
975 				break;
976 
977 			if (bn == UFS_HOLE) {
978 				nbytes = (size_t)MIN(fs->fs_bsize -
979 						(long)blkoff(fs, uoff), nbytes);
980 				error = directio_hole(uio, nbytes);
981 				/*
982 				 * Hole reads are not added to the list
983 				 * processed by directio_wait() below so
984 				 * account for bytes read here.
985 				 */
986 				if (!error)
987 					bytes_read += nbytes;
988 			} else {
989 				nbytes = (size_t)MIN(nbytes, len);
990 
991 				/*
992 				 * Get the pagelist pointer for this offset
993 				 * to be passed to directio_start.
994 				 */
995 				if (pplist != NULL)
996 					spplist = pplist +
997 					btop((uintptr_t)iov->iov_base -
998 					((uintptr_t)pglck_base & PAGEMASK));
999 				else
1000 					spplist = NULL;
1001 
1002 				/*
1003 				 * Kick off the direct read requests
1004 				 */
1005 				directio_start(ufsvfsp, ip->i_dev, nbytes,
1006 						ldbtob(bn), iov->iov_base,
1007 						S_WRITE, procp, &tail, spplist);
1008 			}
1009 
1010 			if (error)
1011 				break;
1012 
1013 			/*
1014 			 * Adjust pointers and counters
1015 			 */
1016 			iov->iov_len -= nbytes;
1017 			iov->iov_base += nbytes;
1018 			uio->uio_loffset += nbytes;
1019 			resid -= nbytes;
1020 			pglck_len -= nbytes;
1021 		}
1022 
1023 		/*
1024 		 * Wait for outstanding requests
1025 		 */
1026 		newerror = directio_wait(tail, &bytes_read);
1027 		/*
1028 		 * Release VM resources
1029 		 */
1030 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1031 
1032 	}
1033 
1034 	/*
1035 	 * If error, adjust resid to begin at the first
1036 	 * un-read byte.
1037 	 */
1038 	if (error == 0)
1039 		error = newerror;
1040 	uio->uio_resid -= bytes_read;
1041 	return (error);
1042 }
1043