xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_extvnops.c (revision 275c9da86e89f8abf71135cf63d9fc23671b2e60)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/conf.h>
32 #include <sys/fssnap_if.h>
33 #include <sys/fs/ufs_inode.h>
34 #include <sys/fs/ufs_lockfs.h>
35 #include <sys/fs/ufs_log.h>
36 #include <sys/fs/ufs_trans.h>
37 #include <sys/cmn_err.h>
38 #include <vm/pvn.h>
39 #include <vm/seg_map.h>
40 #include <sys/fdbuffer.h>
41 
#ifdef DEBUG
/* Non-zero enables DEBUGF() output. */
int evn_ufs_debug = 0;
/*
 * DEBUGF - conditional trace output for DEBUG kernels.
 *
 * args is a complete, parenthesized cmn_err() argument list, e.g.
 *	DEBUGF((CE_CONT, "?fmt %d\n", val));
 *
 * The expansion is wrapped in do { } while (0) so that the macro followed
 * by a semicolon is exactly one statement and can be used safely as the
 * body of an unbraced if/else.
 */
#define	DEBUGF(args)	\
	do { if (evn_ufs_debug) cmn_err args; /* CONSTCOND */ } while (0)
#else
/* Tracing is compiled out of non-DEBUG kernels. */
#define	DEBUGF(args)
#endif
48 
49 /*
50  * ufs_rdwr_data - supports reading or writing data when
51  * no changes are permitted in file size or space allocation.
52  *
53  * Inputs:
54  * fdb - The mandatory fdbuffer supports
55  *	the read or write operation.
56  * flags - defaults (zero value) to synchronous write
57  *	B_READ - indicates read operation
58  *	B_ASYNC - indicates perform operation asynchronously
59  */
60 /*ARGSUSED*/
61 int
62 ufs_rdwr_data(
63 	vnode_t		*vnodep,
64 	u_offset_t	offset,
65 	size_t		len,
66 	fdbuffer_t	*fdbp,
67 	int		flags,
68 	cred_t		*credp)
69 {
70 	struct inode	*ip = VTOI(vnodep);
71 	struct fs	*fs;
72 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
73 	struct buf	*bp;
74 	krw_t		rwtype = RW_READER;
75 	u_offset_t	offset1 = offset;	/* Initial offset */
76 	size_t		iolen;
77 	int		curlen = 0;
78 	int		pplen;
79 	daddr_t		bn;
80 	int		contig = 0;
81 	int		error = 0;
82 	int		nbytes;			/* Number bytes this IO */
83 	int		offsetn;		/* Start point this IO */
84 	int		iswrite = flags & B_WRITE;
85 	int		io_started = 0;		/* No IO started */
86 	struct ulockfs	*ulp;
87 	uint_t		protp = PROT_ALL;
88 
89 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, !iswrite,
90 	    &protp);
91 	if (error) {
92 		if (flags & B_ASYNC) {
93 			fdb_ioerrdone(fdbp, error);
94 		}
95 		return (error);
96 	}
97 	fs = ufsvfsp->vfs_fs;
98 	iolen = len;
99 
100 	DEBUGF((CE_CONT, "?ufs_rdwr: %s vp: %p pages:%p  off %llx len %lx"
101 	    " isize: %llx fdb: %p\n",
102 	    flags & B_READ ? "READ" : "WRITE", (void *)vnodep,
103 	    (void *)vnodep->v_pages, offset1, iolen, ip->i_size, (void *)fdbp));
104 
105 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
106 	rw_enter(&ip->i_contents, rwtype);
107 
108 	ASSERT(offset1 < ip->i_size);
109 
110 	if ((offset1 + iolen) > ip->i_size) {
111 		iolen = ip->i_size - offset1;
112 	}
113 	while (!error && curlen < iolen) {
114 
115 		contig = 0;
116 
117 		if ((error = bmap_read(ip, offset1, &bn, &contig)) != 0) {
118 			break;
119 		}
120 		ASSERT(!(bn == UFS_HOLE && iswrite));
121 		if (bn == UFS_HOLE) {
122 			/*
123 			 * If the above assertion is true,
124 			 * then the following if statement can never be true.
125 			 */
126 			if (iswrite && (rwtype == RW_READER)) {
127 				rwtype = RW_WRITER;
128 				if (!rw_tryupgrade(&ip->i_contents)) {
129 					rw_exit(&ip->i_contents);
130 					rw_enter(&ip->i_contents, rwtype);
131 					continue;
132 				}
133 			}
134 			offsetn = blkoff(fs, offset1);
135 			pplen = P2ROUNDUP(len, PAGESIZE);
136 			nbytes = MIN((pplen - curlen),
137 			    (fs->fs_bsize - offsetn));
138 			ASSERT(nbytes > 0);
139 
140 			/*
141 			 * We may be reading or writing.
142 			 */
143 			DEBUGF((CE_CONT, "?ufs_rdwr_data: hole %llx - %lx\n",
144 			    offset1, (iolen - curlen)));
145 
146 			if (iswrite) {
147 				printf("**WARNING: ignoring hole in write\n");
148 				error = ENOSPC;
149 			} else {
150 				fdb_add_hole(fdbp, offset1 - offset, nbytes);
151 			}
152 			offset1 += nbytes;
153 			curlen += nbytes;
154 			continue;
155 
156 		}
157 		ASSERT(contig > 0);
158 		pplen = P2ROUNDUP(len, PAGESIZE);
159 
160 		contig = MIN(contig, len - curlen);
161 		contig = P2ROUNDUP(contig, DEV_BSIZE);
162 
163 		bp = fdb_iosetup(fdbp, offset1 - offset, contig, vnodep, flags);
164 
165 		bp->b_edev = ip->i_dev;
166 		bp->b_dev = cmpdev(ip->i_dev);
167 		bp->b_blkno = bn;
168 		bp->b_file = ip->i_vnode;
169 		bp->b_offset = (offset_t)offset1;
170 
171 		if (ufsvfsp->vfs_snapshot) {
172 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
173 		} else {
174 			(void) bdev_strategy(bp);
175 		}
176 		io_started = 1;
177 
178 		offset1 += contig;
179 		curlen += contig;
180 		if (iswrite)
181 			lwp_stat_update(LWP_STAT_OUBLK, 1);
182 		else
183 			lwp_stat_update(LWP_STAT_INBLK, 1);
184 
185 		if ((flags & B_ASYNC) == 0) {
186 			error = biowait(bp);
187 			fdb_iodone(bp);
188 		}
189 
190 		DEBUGF((CE_CONT, "?loop ufs_rdwr_data.. off %llx len %lx\n",
191 		    offset1, (iolen - curlen)));
192 	}
193 
194 	DEBUGF((CE_CONT, "?ufs_rdwr_data: off %llx len %lx pages: %p ------\n",
195 	    offset1, (iolen - curlen), (void *)vnodep->v_pages));
196 
197 	rw_exit(&ip->i_contents);
198 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
199 
200 	if (flags & B_ASYNC) {
201 		/*
202 		 * Show that no more asynchronous IO will be added
203 		 */
204 		fdb_ioerrdone(fdbp, error);
205 	}
206 	if (ulp) {
207 		ufs_lockfs_end(ulp);
208 	}
209 	if (io_started && flags & B_ASYNC) {
210 		return (0);
211 	} else {
212 		return (error);
213 	}
214 }
215 
216 /*
217  * ufs_alloc_data - supports allocating space and reads or writes
218  * that involve changes to file length or space allocation.
219  *
220  * This function is more expensive, because of the UFS log transaction,
221  * so ufs_rdwr_data() should be used when space or file length changes
222  * will not occur.
223  *
224  * Inputs:
225  * fdb - A null pointer instructs this function to only allocate
226  *	space for the specified offset and length.
227  *	An actual fdbuffer instructs this function to perform
228  *	the read or write operation.
229  * flags - defaults (zero value) to synchronous write
230  *	B_READ - indicates read operation
231  *	B_ASYNC - indicates perform operation asynchronously
232  */
233 int
234 ufs_alloc_data(
235 	vnode_t		*vnodep,
236 	u_offset_t	offset,
237 	size_t		*len,
238 	fdbuffer_t	*fdbp,
239 	int		flags,
240 	cred_t		*credp)
241 {
242 	struct inode	*ip = VTOI(vnodep);
243 	size_t		done_len, io_len;
244 	int		contig;
245 	u_offset_t	uoff, io_off;
246 	int		error = 0;		/* No error occurred */
247 	int		offsetn;		/* Start point this IO */
248 	int		nbytes;			/* Number bytes in this IO */
249 	daddr_t		bn;
250 	struct fs	*fs;
251 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
252 	int		i_size_changed = 0;
253 	u_offset_t	old_i_size;
254 	struct ulockfs	*ulp;
255 	int		trans_size;
256 	int		issync;			/* UFS Log transaction */
257 						/* synchronous when non-zero */
258 
259 	int		io_started = 0;		/* No IO started */
260 	uint_t		protp = PROT_ALL;
261 
262 	ASSERT((flags & B_WRITE) == 0);
263 
264 	/*
265 	 * Obey the lockfs protocol
266 	 */
267 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, 0, &protp);
268 	if (error) {
269 		if ((fdbp != NULL) && (flags & B_ASYNC)) {
270 			fdb_ioerrdone(fdbp, error);
271 		}
272 		return (error);
273 	}
274 	if (ulp) {
275 		/*
276 		 * Try to begin a UFS log transaction
277 		 */
278 		trans_size = TOP_GETPAGE_SIZE(ip);
279 		TRANS_TRY_BEGIN_CSYNC(ufsvfsp, issync, TOP_GETPAGE,
280 		    trans_size, error);
281 		if (error == EWOULDBLOCK) {
282 			ufs_lockfs_end(ulp);
283 			if ((fdbp != NULL) && (flags & B_ASYNC)) {
284 				fdb_ioerrdone(fdbp, EDEADLK);
285 			}
286 			return (EDEADLK);
287 		}
288 	}
289 
290 	uoff = offset;
291 	io_off = offset;
292 	io_len = *len;
293 	done_len = 0;
294 
295 	DEBUGF((CE_CONT, "?ufs_alloc: off %llx len %lx size %llx fdb: %p\n",
296 	    uoff, (io_len - done_len), ip->i_size, (void *)fdbp));
297 
298 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
299 	rw_enter(&ip->i_contents, RW_WRITER);
300 
301 	ASSERT((ip->i_mode & IFMT) == IFREG);
302 
303 	fs = ip->i_fs;
304 
305 	while (error == 0 && done_len < io_len) {
306 		uoff = (u_offset_t)(io_off + done_len);
307 		offsetn = (int)blkoff(fs, uoff);
308 		nbytes = (int)MIN(fs->fs_bsize - offsetn, io_len - done_len);
309 
310 		DEBUGF((CE_CONT, "?ufs_alloc_data: offset: %llx len %x\n",
311 		    uoff, nbytes));
312 
313 		if (uoff + nbytes > ip->i_size) {
314 			/*
315 			 * We are extending the length of the file.
316 			 * bmap is used so that we are sure that
317 			 * if we need to allocate new blocks, that it
318 			 * is done here before we up the file size.
319 			 */
320 			DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n",
321 			    ip->i_size, uoff + nbytes));
322 
323 			error = bmap_write(ip, uoff, (offsetn + nbytes),
324 			    BI_ALLOC_ONLY, NULL, credp);
325 			if (ip->i_flag & (ICHG|IUPD))
326 				ip->i_seq++;
327 			if (error) {
328 				DEBUGF((CE_CONT, "?ufs_alloc_data: grow "
329 				    "failed err: %d\n", error));
330 				break;
331 			}
332 			if (fdbp != NULL) {
333 				if (uoff >= ip->i_size) {
334 					/*
335 					 * Desired offset is past end of bytes
336 					 * in file, so we have a hole.
337 					 */
338 					fdb_add_hole(fdbp, uoff - offset,
339 					    nbytes);
340 				} else {
341 					int contig;
342 					buf_t *bp;
343 
344 					error = bmap_read(ip, uoff, &bn,
345 					    &contig);
346 					if (error) {
347 						break;
348 					}
349 
350 					contig = ip->i_size - uoff;
351 					contig = P2ROUNDUP(contig, DEV_BSIZE);
352 
353 					bp = fdb_iosetup(fdbp, uoff - offset,
354 					    contig, vnodep, flags);
355 
356 					bp->b_edev = ip->i_dev;
357 					bp->b_dev = cmpdev(ip->i_dev);
358 					bp->b_blkno = bn;
359 					bp->b_file = ip->i_vnode;
360 					bp->b_offset = (offset_t)uoff;
361 
362 					if (ufsvfsp->vfs_snapshot) {
363 						fssnap_strategy(
364 						    &ufsvfsp->vfs_snapshot, bp);
365 					} else {
366 						(void) bdev_strategy(bp);
367 					}
368 					io_started = 1;
369 
370 					lwp_stat_update(LWP_STAT_OUBLK, 1);
371 
372 					if ((flags & B_ASYNC) == 0) {
373 						error = biowait(bp);
374 						fdb_iodone(bp);
375 						if (error) {
376 							break;
377 						}
378 					}
379 					if (contig > (ip->i_size - uoff)) {
380 						contig -= ip->i_size - uoff;
381 
382 						fdb_add_hole(fdbp,
383 						    ip->i_size - offset,
384 						    contig);
385 					}
386 				}
387 			}
388 
389 			i_size_changed = 1;
390 			old_i_size = ip->i_size;
391 			UFS_SET_ISIZE(uoff + nbytes, ip);
392 			TRANS_INODE(ip->i_ufsvfs, ip);
393 			/*
394 			 * file has grown larger than 2GB. Set flag
395 			 * in superblock to indicate this, if it
396 			 * is not already set.
397 			 */
398 			if ((ip->i_size > MAXOFF32_T) &&
399 			    !(fs->fs_flags & FSLARGEFILES)) {
400 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
401 				mutex_enter(&ufsvfsp->vfs_lock);
402 				fs->fs_flags |= FSLARGEFILES;
403 				ufs_sbwrite(ufsvfsp);
404 				mutex_exit(&ufsvfsp->vfs_lock);
405 			}
406 		} else {
407 			/*
408 			 * The file length is not being extended.
409 			 */
410 			error = bmap_read(ip, uoff, &bn, &contig);
411 			if (error) {
412 				DEBUGF((CE_CONT, "?ufs_alloc_data: "
413 				    "bmap_read err: %d\n", error));
414 				break;
415 			}
416 
417 			if (bn != UFS_HOLE) {
418 				/*
419 				 * Did not map a hole in the file
420 				 */
421 				int	contig = P2ROUNDUP(nbytes, DEV_BSIZE);
422 				buf_t	*bp;
423 
424 				if (fdbp != NULL) {
425 					bp = fdb_iosetup(fdbp, uoff - offset,
426 					    contig, vnodep, flags);
427 
428 					bp->b_edev = ip->i_dev;
429 					bp->b_dev = cmpdev(ip->i_dev);
430 					bp->b_blkno = bn;
431 					bp->b_file = ip->i_vnode;
432 					bp->b_offset = (offset_t)uoff;
433 
434 					if (ufsvfsp->vfs_snapshot) {
435 						fssnap_strategy(
436 						    &ufsvfsp->vfs_snapshot, bp);
437 					} else {
438 						(void) bdev_strategy(bp);
439 					}
440 					io_started = 1;
441 
442 					lwp_stat_update(LWP_STAT_OUBLK, 1);
443 
444 					if ((flags & B_ASYNC) == 0) {
445 						error = biowait(bp);
446 						fdb_iodone(bp);
447 						if (error) {
448 							break;
449 						}
450 					}
451 				}
452 			} else {
453 				/*
454 				 * We read a hole in the file.
455 				 * We have to allocate blocks for the hole.
456 				 */
457 				error = bmap_write(ip, uoff, (offsetn + nbytes),
458 				    BI_ALLOC_ONLY, NULL, credp);
459 				if (ip->i_flag & (ICHG|IUPD))
460 					ip->i_seq++;
461 				if (error) {
462 					DEBUGF((CE_CONT, "?ufs_alloc_data: fill"
463 					    " hole failed error: %d\n", error));
464 					break;
465 				}
466 				if (fdbp != NULL) {
467 					fdb_add_hole(fdbp, uoff - offset,
468 					    nbytes);
469 				}
470 			}
471 		}
472 		done_len += nbytes;
473 	}
474 
475 	if (error) {
476 		if (i_size_changed) {
477 			/*
478 			 * Allocation of the blocks for the file failed.
479 			 * So truncate the file size back to its original size.
480 			 */
481 			(void) ufs_itrunc(ip, old_i_size, 0, credp);
482 		}
483 	}
484 
485 	DEBUGF((CE_CONT, "?ufs_alloc: uoff %llx len %lx\n",
486 	    uoff, (io_len - done_len)));
487 
488 	if ((offset + *len) < (NDADDR * fs->fs_bsize)) {
489 		*len = (size_t)(roundup(offset + *len, fs->fs_fsize) - offset);
490 	} else {
491 		*len = (size_t)(roundup(offset + *len, fs->fs_bsize) - offset);
492 	}
493 
494 	/*
495 	 * Flush cached pages.
496 	 *
497 	 * XXX - There should be no pages involved, since the I/O was performed
498 	 * through the device strategy routine and the page cache was bypassed.
499 	 * However, testing has demonstrated that this VOP_PUTPAGE is
500 	 * necessary. Without this, data might not always be read back as it
501 	 * was written.
502 	 *
503 	 */
504 	(void) VOP_PUTPAGE(vnodep, 0, 0, B_INVAL, credp, NULL);
505 
506 	rw_exit(&ip->i_contents);
507 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
508 
509 	if ((fdbp != NULL) && (flags & B_ASYNC)) {
510 		/*
511 		 * Show that no more asynchronous IO will be added
512 		 */
513 		fdb_ioerrdone(fdbp, error);
514 	}
515 	if (ulp) {
516 		/*
517 		 * End the UFS Log transaction
518 		 */
519 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_GETPAGE,
520 		    trans_size);
521 		ufs_lockfs_end(ulp);
522 	}
523 	if (io_started && (flags & B_ASYNC)) {
524 		return (0);
525 	} else {
526 		return (error);
527 	}
528 }
529