xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_bmap.c (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 #include <sys/types.h>
44 #include <sys/t_lock.h>
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/signal.h>
48 #include <sys/user.h>
49 #include <sys/vnode.h>
50 #include <sys/buf.h>
51 #include <sys/disp.h>
52 #include <sys/proc.h>
53 #include <sys/conf.h>
54 #include <sys/fs/ufs_inode.h>
55 #include <sys/fs/ufs_fs.h>
56 #include <sys/fs/ufs_quota.h>
57 #include <sys/fs/ufs_trans.h>
58 #include <sys/fs/ufs_bio.h>
59 #include <vm/seg.h>
60 #include <sys/errno.h>
61 #include <sys/sysmacros.h>
62 #include <sys/vfs.h>
63 #include <sys/cmn_err.h>
64 #include <sys/debug.h>
65 #include <sys/kmem.h>
66 
67 /*
68  * This structure is used to track blocks as we allocate them, so that
69  * we can free them if we encounter an error during allocation.  We
70  * keep track of five pieces of information for each allocated block:
71  *   - The number of the newly allocated block
72  *   - The size of the block (lets us deal with fragments if we want)
73  *   - The number of the block containing a pointer to it; or whether
74  *     the pointer is in the inode
75  *   - The offset within the block (or inode) containing a pointer to it.
76  *   - A flag indicating the usage of the block.  (Logging needs to know
77  *     this to avoid overwriting a data block if it was previously used
78  *     for metadata.)
79  */
80 
/*
 * Records where the pointer to a newly allocated block currently lives.
 * ufs_undo_allocation() switches on this value to decide which pointer
 * (if any) has to be cleared before the block itself is freed.
 */
enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};
87 
/*
 * One entry of the undo table that bmap_write() fills in as it allocates
 * blocks.  An entry starts out with owner == ufs_no_owner and is updated
 * once the new block has actually been linked into the inode or into an
 * indirect block.
 */
struct ufs_allocated_block {
	daddr_t this_block;	    /* Number of this block */
	off_t block_size;	    /* Size of this block, in bytes */
	enum ufs_owner_type owner;  /* Who points to this block? */
	daddr_t owner_block;	    /* Number of the owning block */
				    /* (bmap_write() sets this only for */
				    /* ufs_indirect_block owners) */
	uint_t owner_offset;	    /* Offset within that block or inode */
	int usage_flags;	    /* Usage flags, as expected by free() */
};
96 
97 
/* Count the contiguous fs blocks that start at *sbp (defined below). */
static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		int maxtrans);

/* Back out the allocations recorded in table[] after an error. */
static void ufs_undo_allocation(inode_t *ip, int block_count,
	struct ufs_allocated_block table[], int inode_sector_adjust);
103 
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * Arguments:
 *	fs	 - struct fs pointer of the file system
 *	lbn	 - logical block number being mapped
 *	boff	 - byte offset within that block
 *	bnp	 - out: disk block number, or UFS_HOLE
 *	lenp	 - in/out: passed to findextent(); on a successful mapping
 *		   receives the usable length in bytes.  Left untouched
 *		   when the result is UFS_HOLE.
 *	size	 - file size, only consulted when chkfrag is nonzero
 *	tblp	 - address of the table entry holding the block pointer
 *	n	 - number of valid table entries starting at tblp
 *	chkfrag	 - nonzero to clip the length at the fragment-rounded size
 *	maxtrans - maximum transfer size, passed through to findextent()
 *
 * Every parameter is parenthesized at its point of use and the internal
 * temporaries carry a _dx_ prefix so that they cannot capture identifiers
 * appearing in the caller's argument expressions.  bnp, lenp and boff may
 * still be evaluated more than once, so arguments must be free of side
 * effects.  The expansion is a plain brace block, as it always has been:
 * do not use it as the unbraced body of an if that has an else.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	daddr32_t *_dx_dp = (tblp);					\
	int _dx_chkfrag = (chkfrag); /* for lint. sigh */		\
									\
	if (*_dx_dp == 0) {						\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		int _dx_len;						\
									\
		_dx_len = findextent((fs), _dx_dp, (int)(n), (lenp),	\
		    (maxtrans)) << (fs)->fs_bshift;			\
		if (_dx_chkfrag) {					\
			u_offset_t _dx_tmp;				\
									\
			_dx_tmp = fragroundup((fs), (size)) -		\
			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
			_dx_len = (int)MIN(_dx_tmp, _dx_len);		\
		}							\
		_dx_len -= (boff);					\
		if (_dx_len <= 0) {					\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb((fs), *_dx_dp) + btodb(boff);	\
			*(lenp) = _dx_len;				\
		}							\
	}								\
}
143 
/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * at least VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
166 
167 /*
 * bmap_read() and bmap_write() define the structure of file system
 * storage by mapping a logical offset in a file to a physical block
 * number on the device.  They should be called with a locked inode; the
 * write lock is needed when allocation is to be done (bmap_write).  Note
 * this strangeness: bmap_write is always called from getpage(), not
 * putpage(), since getpage() is where all the allocation is done.
 *
 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
176  *
177  * NOTICE: the block number returned is the disk block number, not the
178  * file system block number.  All the worries about block offsets and
179  * page/block sizes are hidden inside of bmap.  Well, not quite,
180  * unfortunately.  It's impossible to find one place to hide all this
181  * mess.  There are 3 cases:
182  *
183  * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
186  *	the kluster routines may be out of memory, we don't always get all
187  *	the pages we wanted.  If we called bmap first, to find out how much
188  *	to kluster, we handed in the block aligned offset.  If we didn't get
189  *	all the pages, we have to chop off the amount we didn't get from the
190  *	amount handed back by bmap.
191  *
192  * PAGESIZE == bsize
193  *	Life is quite pleasant here, no extra work needed, mainly because we
194  *	(probably?) won't kluster backwards, just forwards.
195  *
196  * PAGESIZE > bsize
197  *	This one has a different set of problems, specifically, we may have to
198  *	do N reads to fill one page.  Let us hope that Sun will stay with small
199  *	pages.
200  *
201  * Returns 0 on success, or a non-zero errno if an error occurs.
202  *
203  * TODO
204  *	LMXXX - add a bmap cache.  This could be a couple of extents in the
205  *	inode.  Two is nice for PAGESIZE > bsize.
206  */
207 
208 int
209 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
210 {
211 	daddr_t lbn;
212 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
213 	struct	fs *fs = ufsvfsp->vfs_fs;
214 	struct	buf *bp;
215 	int	i, j, boff;
216 	int	shft;			/* we maintain sh = 1 << shft */
217 	daddr_t	ob, nb, tbn;
218 	daddr32_t *bap;
219 	int	nindirshift, nindiroffset;
220 
221 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
222 	lbn = (daddr_t)lblkno(fs, off);
223 	boff = (int)blkoff(fs, off);
224 	if (lbn < 0)
225 		return (EFBIG);
226 
227 	/*
228 	 * The first NDADDR blocks are direct blocks.
229 	 */
230 	if (lbn < NDADDR) {
231 		DOEXTENT(fs, lbn, boff, bnp, lenp,
232 		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
233 			ufsvfsp->vfs_iotransz);
234 		return (0);
235 	}
236 
237 	nindirshift = ufsvfsp->vfs_nindirshift;
238 	nindiroffset = ufsvfsp->vfs_nindiroffset;
239 	/*
240 	 * Determine how many levels of indirection.
241 	 */
242 	shft = 0;				/* sh = 1 */
243 	tbn = lbn - NDADDR;
244 	for (j = NIADDR; j > 0; j--) {
245 		longlong_t	sh;
246 
247 		shft += nindirshift;		/* sh *= nindir */
248 		sh = 1LL << shft;
249 		if (tbn < sh)
250 			break;
251 		tbn -= sh;
252 	}
253 	if (j == 0)
254 		return (EFBIG);
255 
256 	/*
257 	 * Fetch the first indirect block.
258 	 */
259 	nb = ip->i_ib[NIADDR - j];
260 	if (nb == 0) {
261 		*bnp = UFS_HOLE;
262 		return (0);
263 	}
264 
265 	/*
266 	 * Fetch through the indirect blocks.
267 	 */
268 	for (; j <= NIADDR; j++) {
269 		ob = nb;
270 		bp = UFS_BREAD(ufsvfsp,
271 				ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
272 		if (bp->b_flags & B_ERROR) {
273 			brelse(bp);
274 			return (EIO);
275 		}
276 		bap = bp->b_un.b_daddr;
277 
278 		ASSERT(!ufs_indir_badblock(ip, bap));
279 
280 		shft -= nindirshift;		/* sh / nindir */
281 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
282 		nb = bap[i];
283 		if (nb == 0) {
284 			*bnp = UFS_HOLE;
285 			brelse(bp);
286 			return (0);
287 		}
288 		if (j != NIADDR)
289 			brelse(bp);
290 	}
291 	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
292 	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
293 		0, ufsvfsp->vfs_iotransz);
294 	brelse(bp);
295 	return (0);
296 }
297 
/*
 * See the comment above bmap_read for general notes.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_only is set, bmap will not create
 * any in-core pages that correspond to the new disk allocation.
 * Otherwise, the in-core pages will be created and initialized as
 * needed.
 *
 * Arguments:
 *	ip	   - inode of the file; i_contents must be held as writer
 *	off	   - byte offset in the file whose block is to be mapped
 *	size	   - number of bytes of that block which must be backed
 *	alloc_only - see above; forced to 0 for directories, attribute
 *		     directories, the quota inode and ISYNC inodes
 *	cr	   - credentials handed to the allocator
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.  EFBIG
 * is returned when off maps to a negative logical block, when the block
 * lies beyond NIADDR levels of indirection, or when the allocation would
 * push i_blocks past INT_MAX; other values come from fbread(), alloc(),
 * realloccg() or geterror().
 */

int
bmap_write(
	struct inode	*ip,
	u_offset_t	off,
	int		size,
	int		alloc_only,
	struct cred	*cr)
{
	struct	fs *fs;
	struct	buf *bp;
	int	i;
	struct	buf *nbp;
	int	j;
	int	shft;				/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, pref, lbn, llbn, tbn;
	daddr32_t *bap;
	struct	vnode *vp = ITOV(ip);
	long	bsize = VBSIZE(vp);
	long	osize, nsize;
	int	issync, metaflag, isdirquota;
	int	err;
	dev_t	dev;
	struct	fbuf *fbp;
	int	nindirshift;
	int	nindiroffset;
	struct	ufsvfs	*ufsvfsp;
	int	added_sectors;		/* sectors added to this inode */
	int	alloced_blocks;		/* fs blocks newly allocated */
	/* the ASSERTs below keep alloced_blocks within this table */
	struct  ufs_allocated_block undo_table[NIADDR+1];
	int	verylargefile = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	ufsvfsp = ip->i_ufsvfs;
	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
	/* lbn: logical block that contains off */
	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);
	/* overflow of i_blocks is only checked for very large files */
	if (ip->i_blocks >= VERYLARGEFILESIZE)
		verylargefile = 1;
	/* llbn: last logical block covered by i_size (0 for an empty file) */
	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
	/*
	 * metaflag is the usage flag passed to free(); isdirquota is set
	 * for directories, attribute directories and the quota inode.
	 */
	metaflag = isdirquota = 0;
	if (((ip->i_mode & IFMT) == IFDIR) ||
	    ((ip->i_mode & IFMT) == IFATTRDIR))
		isdirquota = metaflag = I_DIR;
	else if ((ip->i_mode & IFMT) == IFSHAD)
		metaflag = I_SHAD;
	else if (ip->i_ufsvfs->vfs_qinod == ip)
		isdirquota = metaflag = I_QUOTA;

	issync = ((ip->i_flag & ISYNC) != 0);

	if (isdirquota || issync) {
		alloc_only = 0;		/* make sure */
	}

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
		osize = blksize(fs, ip, llbn);
		if (osize < bsize && osize > 0) {
			/*
			 * Check to see if doing this will make the file too
			 * big.  Only check if we are dealing with a very
			 * large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks +
				    btodb(bsize - osize)) > INT_MAX) {
					return (EFBIG);
				}
			}
			/*
			 * Make sure we have all needed pages setup correctly.
			 *
			 * We pass S_OTHER to fbread here because we want
			 * an exclusive lock on the page in question
			 * (see ufs_getpage). I/O to the old block location
			 * may still be in progress and we are about to free
			 * the old block. We don't want anyone else to get
			 * a hold of the old block once we free it until
			 * the I/O is complete.
			 */
			err = fbread(ITOV(ip),
				    ((offset_t)llbn << fs->fs_bshift),
					(uint_t)bsize, S_OTHER, &fbp);
			if (err)
				return (err);
			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
					&nb, cr);
			if (err) {
				if (fbp)
					fbrelse(fbp, S_OTHER);
				return (err);
			}
			ASSERT(!ufs_badblock(ip, nb));

			/*
			 * Update the inode before releasing the
			 * lock on the page. If we released the page
			 * lock first, the data could be written to its
			 * old address and then destroyed.
			 */
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
			ip->i_db[llbn] = nb;
			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
			    ip);
			ip->i_blocks += btodb(bsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */
			/*
			 * Don't check metaflag here, directories won't do this
			 *
			 */
			if (issync) {
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else {
				ASSERT(fbp);
				fbrelse(fbp, S_WRITE);
			}

			/* the fragment moved: give back its old location */
			if (nb != ob) {
				(void) free(ip, ob, (off_t)osize, metaflag);
			}
		}
	}

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		nb = ip->i_db[lbn];
		if (nb == 0 ||
		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
			if (nb != 0) {
				/* consider need to reallocate a frag */
				osize = fragroundup(fs, blkoff(fs, ip->i_size));
				nsize = fragroundup(fs, size);
				if (nsize <= osize)
					goto gotit;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				/*
				 * need to allocate a block or frag
				 */
				ob = nb;
				pref = blkpref(ip, lbn, (int)lbn,
								&ip->i_db[0]);
				err = realloccg(ip, ob, pref, (int)osize,
						(int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));

			} else {
				/*
				 * need to allocate a block or frag
				 */
				osize = 0;
				if (ip->i_size <
				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
					nsize = fragroundup(fs, size);
				else
					nsize = bsize;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
				err = alloc(ip, pref, (int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));
				/* ob == nb: nothing old to free below */
				ob = nb;
			}

			/*
			 * Read old/create new zero pages
			 */
			fbp = NULL;
			if (osize == 0) {
				/*
				 * mmap S_WRITE faults always enter here
				 */
				if (!alloc_only || P2ROUNDUP_TYPED(size,
				    PAGESIZE, u_offset_t) < nsize) {
					/* fbzero doesn't cause a pagefault */
					fbzero(ITOV(ip),
					    ((offset_t)lbn << fs->fs_bshift),
					    (uint_t)nsize, &fbp);
				}
			} else {
				err = fbread(vp,
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)nsize, S_OTHER, &fbp);
				if (err) {
					/*
					 * Back out the reallocation: free
					 * the whole new fragment if it moved,
					 * otherwise only the part that was
					 * added, and return the quota charge
					 * for the growth.
					 */
					if (nb != ob) {
						(void) free(ip, nb,
						    (off_t)nsize, metaflag);
					} else {
						(void) free(ip,
						    ob + numfrags(fs, osize),
						    (off_t)(nsize - osize),
						    metaflag);
					}
					ASSERT(nsize >= osize);
					(void) chkdq(ip,
						-(long)btodb(nsize - osize),
						0, cr, (char **)NULL,
						(size_t *)NULL);
					return (err);
				}
			}
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
			ip->i_db[lbn] = nb;
			ip->i_blocks += btodb(nsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			/*
			 * Write directory and shadow blocks synchronously so
			 * that they never appear with garbage in them on the
			 * disk.
			 *
			 */
			if (isdirquota && (ip->i_size ||
			    TRANS_ISTRANS(ufsvfsp))) {
			/*
			 * XXX may not be necessary with harpy trans
			 * bug id 1130055
			 */
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else if (fbp) {
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob)
				(void) free(ip, ob, (off_t)osize, metaflag);
		}
gotit:
		return (0);
	}

	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */

	/*
	 * Determine how many levels of indirection.
	 */
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	pref = 0;
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}

	/* the block lies beyond what NIADDR levels can address */
	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	dev = ip->i_dev;
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		/*
		 * Check to see if doing this will make the
		 * file too big.  Only check if we are dealing
		 * with a very large file.
		 */
		if (verylargefile == 1) {
			if (((unsigned)ip->i_blocks + btodb(bsize))
			    > INT_MAX) {
				return (EFBIG);
			}
		}
		/*
		 * Need to allocate an indirect block.
		 */
		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
		err = alloc(ip, pref, (int)bsize, &nb, cr);
		if (err)
			return (err);
		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
		ASSERT(!ufs_badblock(ip, nb));

		/*
		 * Keep track of this allocation so we can undo it if we
		 * get an error later.
		 */

		ASSERT(alloced_blocks <= NIADDR);

		undo_table[alloced_blocks].this_block = nb;
		undo_table[alloced_blocks].block_size = bsize;
		undo_table[alloced_blocks].owner = ufs_no_owner;
		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

		alloced_blocks++;

		/*
		 * Write zero block synchronously so that
		 * indirect blocks never point at garbage.
		 */
		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

		clrbuf(bp);
		/* XXX Maybe special-case this? */
		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
		UFS_BWRITE2(ufsvfsp, bp);
		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		brelse(bp);

		ip->i_ib[NIADDR - j] = nb;
		added_sectors += btodb(bsize);
		ip->i_blocks += btodb(bsize);
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		/* Caller is responsible for updating i_seq */

		/*
		 * Update the 'undo table' now that we've linked this block
		 * to an inode.
		 */

		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;

		/*
		 * In the ISYNC case, wrip will notice that the block
		 * count on the inode has changed and will be sure to
		 * ufs_iupdat the inode at the end of wrip.
		 */
	}

	/*
	 * Fetch through the indirect blocks.
	 *
	 * On each pass ob is the indirect block being read and nb the
	 * pointer found in it.  While j < NIADDR that pointer refers to
	 * another indirect block; on the pass with j == NIADDR it refers
	 * to the data block itself.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);

		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			/*
			 * Return any partial allocations.
			 *
			 * It is possible that we have not yet made any
			 * allocations at this point (if this is the first
			 * pass through the loop and we didn't have to
			 * allocate the first indirect block, above).
			 * In this case, alloced_blocks and added_sectors will
			 * be zero, and ufs_undo_allocation will do nothing.
			 */
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh /= nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		if (nb == 0) {
			/*
			 * Check to see if doing this will make the
			 * file too big.  Only check if we are dealing
			 * with a very large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks + btodb(bsize))
				    > INT_MAX) {
					brelse(bp);
					ufs_undo_allocation(ip, alloced_blocks,
					    undo_table, added_sectors);
					return (EFBIG);
				}
			}
			/* pick a preference only if none was chosen yet */
			if (pref == 0) {
				if (j < NIADDR) {
					/* Indirect block */
					pref = blkpref(ip, lbn, 0,
						(daddr32_t *)0);
				} else {
					/* Data block */
					pref = blkpref(ip, lbn, i, &bap[0]);
				}
			}

			/*
			 * release "bp" buf to avoid deadlock (re-bread later)
			 */
			brelse(bp);

			err = alloc(ip, pref, (int)bsize, &nb, cr);
			if (err) {
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip, alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}

			ASSERT(!ufs_badblock(ip, nb));

			ASSERT(alloced_blocks <= NIADDR);

			/*
			 * Record the new block for undo; it has no owner
			 * until bap[i] is updated further down.
			 */
			undo_table[alloced_blocks].this_block = nb;
			undo_table[alloced_blocks].block_size = bsize;
			undo_table[alloced_blocks].owner = ufs_no_owner;
			undo_table[alloced_blocks].usage_flags = metaflag |
			    ((j < NIADDR) ? I_IBLK : 0);

			alloced_blocks++;

			if (j < NIADDR) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
				/*
				 * Write synchronously so indirect
				 * blocks never point at garbage.
				 */
				nbp = UFS_GETBLK(
					ufsvfsp, dev, fsbtodb(fs, nb), bsize);

				clrbuf(nbp);
				/* XXX Maybe special-case this? */
				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
				UFS_BWRITE2(ufsvfsp, nbp);
				if (nbp->b_flags & B_ERROR) {
					err = geterror(nbp);
					brelse(nbp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(nbp);
			} else if (!alloc_only || P2ROUNDUP_TYPED(size,
			    PAGESIZE, u_offset_t) < bsize) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
				fbzero(ITOV(ip),
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)bsize, &fbp);

				/*
				 * Cases which we need to do a synchronous
				 * write of the zeroed data pages:
				 *
				 * 1) If we are writing a directory then we
				 * want to write synchronously so blocks in
				 * directories never contain garbage.
				 *
				 * 2) If we are filling in a hole and the
				 * indirect block is going to be synchronously
				 * written back below we need to make sure
				 * that the zeroes are written here before
				 * the indirect block is updated so that if
				 * we crash before the real data is pushed
				 * we will not end up with random data in
				 * the middle of the file.
				 *
				 * 3) If the size of the request rounded up
				 * to the system page size is smaller than
				 * the file system block size, we want to
				 * write out all the pages now so that
				 * they are not aborted before they actually
				 * make it to ufs_putpage since the length
				 * of the inode will not include the pages.
				 */

				if (isdirquota || (issync &&
				    lbn < llbn))
					(void) ufs_fbiwrite(fbp, ip, nb,
						fs->fs_fsize);
				else
					fbrelse(fbp, S_WRITE);
			}

			/*
			 * re-acquire "bp" buf
			 */
			bp = UFS_BREAD(ufsvfsp,
					ip->i_dev, fsbtodb(fs, ob), bsize);
			if (bp->b_flags & B_ERROR) {
				err = geterror(bp);
				brelse(bp);
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip,
				    alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}
			/* link the new block into its parent indirect block */
			bap = bp->b_un.b_daddr;
			bap[i] = nb;
			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
			added_sectors += btodb(bsize);
			ip->i_blocks += btodb(bsize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			undo_table[alloced_blocks-1].owner =
			    ufs_indirect_block;
			undo_table[alloced_blocks-1].owner_block = ob;
			undo_table[alloced_blocks-1].owner_offset = i;

			/*
			 * ISYNC inodes get the updated indirect block
			 * written out now; otherwise the write is left
			 * to bdrwrite().
			 */
			if (issync) {
				UFS_BWRITE2(ufsvfsp, bp);
				if (bp->b_flags & B_ERROR) {
					err = geterror(bp);
					brelse(bp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(bp);
			} else {
				bdrwrite(bp);
			}
		} else {
			brelse(bp);
		}
	}
	return (0);
}
886 
887 /*
888  * Return 1 if inode has unmapped blocks (UFS holes).
889  */
890 int
891 bmap_has_holes(struct inode *ip)
892 {
893 	struct fs *fs = ip->i_fs;
894 	uint_t	dblks; 			/* # of data blocks */
895 	uint_t	mblks;			/* # of data + metadata blocks */
896 	int	nindirshift;
897 	int	nindiroffset;
898 	uint_t	cnt;
899 	int	n, j, shft;
900 	uint_t nindirblks;
901 
902 	int	fsbshift = fs->fs_bshift;
903 	int	fsboffset = (1 << fsbshift) - 1;
904 
905 	dblks = (ip->i_size + fsboffset) >> fsbshift;
906 	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
907 
908 	/*
909 	 * File has only direct blocks.
910 	 */
911 	if (dblks <= NDADDR)
912 		return (mblks < dblks);
913 
914 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
915 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
916 	nindirblks = nindiroffset + 1;
917 
918 	dblks -= NDADDR;
919 	shft = 0;
920 	/*
921 	 * Determine how many levels of indirection.
922 	 */
923 	for (j = NIADDR; j > 0; j--) {
924 		longlong_t	sh;
925 
926 		shft += nindirshift;	/* sh *= nindir */
927 		sh = 1LL << shft;
928 		if (dblks <= sh)
929 			break;
930 		dblks -= sh;
931 	}
932 	/* LINTED: warning: logical expression always true: op "||" */
933 	ASSERT(NIADDR <= 3);
934 	ASSERT(j <= NIADDR);
935 	if (j == NIADDR)	/* single level indirection */
936 		cnt = NDADDR + 1 + dblks;
937 	else if (j == NIADDR-1) /* double indirection */
938 		cnt = NDADDR + 1 + nindirblks +
939 			1 + (dblks + nindiroffset)/nindirblks + dblks;
940 	else if (j == NIADDR-2) { /* triple indirection */
941 		n = (dblks + nindiroffset)/nindirblks;
942 		cnt = NDADDR + 1 + nindirblks +
943 			1 + nindirblks + nindirblks*nindirblks +
944 			1 + (n + nindiroffset)/nindirblks + n + dblks;
945 	}
946 
947 	return (mblks < cnt);
948 }
949 
950 /*
951  * find some contig blocks starting at *sbp and going for min(n, max_contig)
952  * return the number of blocks (not frags) found.
953  * The array passed in must be at least [0..n-1].
954  */
955 static int
956 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
957 {
958 	register daddr_t bn, nextbn;
959 	register daddr32_t *bp;
960 	register int diff;
961 	int maxtransblk;
962 
963 	if (n <= 0)
964 		return (0);
965 	bn = *sbp;
966 	if (bn == 0)
967 		return (0);
968 	diff = fs->fs_frag;
969 	if (*lenp) {
970 		n = MIN(n, lblkno(fs, *lenp));
971 	} else {
972 		/*
973 		 * If the user has set the value for maxcontig lower than
974 		 * the drive transfer size, then assume they want this
975 		 * to be the maximum value for the size of the data transfer.
976 		 */
977 		maxtransblk = maxtransfer >> DEV_BSHIFT;
978 		if (fs->fs_maxcontig < maxtransblk) {
979 			n = MIN(n, fs->fs_maxcontig);
980 		} else {
981 			n = MIN(n, maxtransblk);
982 		}
983 	}
984 	bp = sbp;
985 	while (--n > 0) {
986 		nextbn = *(bp + 1);
987 		if (nextbn == 0 || bn + diff != nextbn)
988 			break;
989 		bn = nextbn;
990 		bp++;
991 	}
992 	return ((int)(bp - sbp) + 1);
993 }
994 
995 /*
996  * Free any blocks which had been successfully allocated.  Always called
997  * as a result of an error, so we don't bother returning an error code
998  * from here.
999  *
1000  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1001  * Thus it is safe to call this as part of error handling, whether or not
1002  * any blocks have been allocated.
1003  *
1004  * The ufs_inode_direct case is currently unused.
1005  */
1006 
1007 static void
1008 ufs_undo_allocation(
1009 	inode_t *ip,
1010 	int block_count,
1011 	struct ufs_allocated_block table[],
1012 	int inode_sector_adjust)
1013 {
1014 	int i;
1015 	int inode_changed;
1016 	int error_updating_pointers;
1017 	struct ufsvfs *ufsvfsp;
1018 
1019 	inode_changed = 0;
1020 	error_updating_pointers = 0;
1021 
1022 	ufsvfsp = ip->i_ufsvfs;
1023 
1024 	/*
1025 	 * Update pointers on disk before freeing blocks.  If we fail,
1026 	 * some blocks may remain busy; but they will be reclaimed by
1027 	 * an fsck.  (This is better than letting a block wind up with
1028 	 * two owners if we successfully freed it but could not remove
1029 	 * the pointer to it.)
1030 	 */
1031 
1032 	for (i = 0; i < block_count; i++) {
1033 		switch (table[i].owner) {
1034 		case ufs_no_owner:
1035 			/* Nothing to do here, nobody points to us */
1036 			break;
1037 		case ufs_inode_direct:
1038 			ASSERT(table[i].owner_offset < NDADDR);
1039 			ip->i_db[table[i].owner_offset] = 0;
1040 			inode_changed = 1;
1041 			break;
1042 		case ufs_inode_indirect:
1043 			ASSERT(table[i].owner_offset < NIADDR);
1044 			ip->i_ib[table[i].owner_offset] = 0;
1045 			inode_changed = 1;
1046 			break;
1047 		case ufs_indirect_block: {
1048 			buf_t *bp;
1049 			daddr32_t *block_data;
1050 
1051 			/* Read/modify/log/write. */
1052 
1053 			ASSERT(table[i].owner_offset <
1054 			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1055 
1056 			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1057 			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1058 			    VBSIZE(ITOV(ip)));
1059 
1060 			if (bp->b_flags & B_ERROR) {
1061 				/* Couldn't read this block; give up. */
1062 				error_updating_pointers = 1;
1063 				brelse(bp);
1064 				break;		/* out of SWITCH */
1065 			}
1066 
1067 			block_data = bp->b_un.b_daddr;
1068 			block_data[table[i].owner_offset] = 0;
1069 
1070 			/* Write a log entry which includes the zero. */
1071 			/* It might be possible to optimize this by using */
1072 			/* TRANS_BUF directly and zeroing only the four */
1073 			/* bytes involved, but an attempt to do that led */
1074 			/* to panics in the logging code.  The attempt was */
1075 			/* TRANS_BUF(ufsvfsp,				  */
1076 			/*    table[i].owner_offset * sizeof (daddr32_t), */
1077 			/*    sizeof (daddr32_t),			  */
1078 			/*    bp,					  */
1079 			/*    DT_ABZERO);				  */
1080 
1081 			TRANS_BUF_ITEM_128(ufsvfsp,
1082 			    block_data[table[i].owner_offset],
1083 			    block_data, bp, DT_AB);
1084 
1085 			/* Now we can write the buffer itself. */
1086 
1087 			UFS_BWRITE2(ufsvfsp, bp);
1088 
1089 			if (bp->b_flags & B_ERROR) {
1090 				error_updating_pointers = 1;
1091 			}
1092 
1093 			brelse(bp);
1094 			break;
1095 		}
1096 		default:
1097 			(void) ufs_fault(ITOV(ip),
1098 			    "ufs_undo_allocation failure\n");
1099 			break;
1100 		}
1101 	}
1102 
1103 	/*
1104 	 * If the inode changed, or if we need to update its block count,
1105 	 * then do that now.  We update the inode synchronously on disk
1106 	 * to ensure that it won't transiently point at a block we've
1107 	 * freed (only necessary if we're not logging).
1108 	 *
1109 	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
1110 	 * it is fixed, we should verify that we successfully updated the
1111 	 * inode before freeing blocks below.
1112 	 */
1113 
1114 	if (inode_changed || (inode_sector_adjust != 0)) {
1115 		ip->i_blocks -= inode_sector_adjust;
1116 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1117 		TRANS_INODE(ufsvfsp, ip);
1118 		ip->i_flag |= IUPD | ICHG | IATTCHG;
1119 		ip->i_seq++;
1120 		if (!TRANS_ISTRANS(ufsvfsp))
1121 			ufs_iupdat(ip, I_SYNC);
1122 	}
1123 
1124 	/*
1125 	 * Now we go through and actually free the blocks, but only if we
1126 	 * successfully removed the pointers to them.
1127 	 */
1128 
1129 	if (!error_updating_pointers) {
1130 		for (i = 0; i < block_count; i++) {
1131 			free(ip, table[i].this_block, table[i].block_size,
1132 			    table[i].usage_flags);
1133 		}
1134 	}
1135 }
1136 
1137 /*
1138  * Find the next hole or data block in file starting at *off
1139  * Return found offset in *off, which can be less than the
1140  * starting offset if not block aligned.
1141  * This code is based on bmap_read().
1142  * Errors: ENXIO for end of file
1143  *         EIO for block read error.
1144  */
1145 int
1146 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
1147 {
1148 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1149 	struct fs *fs = ufsvfsp->vfs_fs;
1150 	buf_t *bp[NIADDR];
1151 	int i, j;
1152 	int shft;			/* we maintain sh = 1 << shft */
1153 	int nindirshift, nindiroffset;
1154 	daddr_t	ob, nb, tbn, lbn, skip;
1155 	daddr32_t *bap;
1156 	u_offset_t isz = (offset_t)ip->i_size;
1157 	int32_t bs = fs->fs_bsize; /* file system block size */
1158 	int32_t nindir = fs->fs_nindir;
1159 	dev_t dev;
1160 	int error = 0;
1161 	daddr_t limits[NIADDR];
1162 
1163 	ASSERT(*off < isz);
1164 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1165 	lbn = (daddr_t)lblkno(fs, *off);
1166 	ASSERT(lbn >= 0);
1167 
1168 	for (i = 0; i < NIADDR; i++)
1169 		bp[i] = NULL;
1170 
1171 	/*
1172 	 * The first NDADDR blocks are direct blocks.
1173 	 */
1174 	if (lbn < NDADDR) {
1175 		for (; lbn < NDADDR; lbn++) {
1176 			if ((hole && (ip->i_db[lbn] == 0)) ||
1177 			    (!hole && (ip->i_db[lbn] != 0))) {
1178 				goto out;
1179 			}
1180 		}
1181 		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1182 			goto out;
1183 	}
1184 
1185 	nindir = fs->fs_nindir;
1186 	nindirshift = ufsvfsp->vfs_nindirshift;
1187 	nindiroffset = ufsvfsp->vfs_nindiroffset;
1188 	dev = ip->i_dev;
1189 
1190 	/* Set up limits array */
1191 	for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
1192 		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1193 
1194 loop:
1195 	/*
1196 	 * Determine how many levels of indirection.
1197 	 */
1198 	shft = 0;				/* sh = 1 */
1199 	tbn = lbn - NDADDR;
1200 	for (j = NIADDR; j > 0; j--) {
1201 		longlong_t sh;
1202 
1203 		shft += nindirshift;		/* sh *= nindir */
1204 		sh = 1LL << shft;
1205 		if (tbn < sh)
1206 			break;
1207 		tbn -= sh;
1208 	}
1209 	if (j == 0) {
1210 		/* must have passed end of file */
1211 		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
1212 		goto out;
1213 	}
1214 
1215 	/*
1216 	 * Fetch the first indirect block.
1217 	 */
1218 	nb = ip->i_ib[NIADDR - j];
1219 	if (nb == 0) {
1220 		if (hole) {
1221 			lbn = limits[NIADDR - j];
1222 			goto out;
1223 		} else {
1224 			lbn = limits[NIADDR - j + 1];
1225 			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1226 				goto out;
1227 			goto loop;
1228 		}
1229 	}
1230 
1231 	/*
1232 	 * Fetch through the indirect blocks.
1233 	 */
1234 	for (; ((j <= NIADDR) && (nb != 0)); j++) {
1235 		ob = nb;
1236 		/*
1237 		 * if there's a different block at this level then release
1238 		 * the old one and in with the new.
1239 		 */
1240 		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1241 			if (bp[j-1] != NULL)
1242 				brelse(bp[j-1]);
1243 			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1244 			if (bp[j-1]->b_flags & B_ERROR) {
1245 				error = EIO;
1246 				goto out;
1247 			}
1248 		}
1249 		bap = bp[j-1]->b_un.b_daddr;
1250 
1251 		shft -= nindirshift;		/* sh / nindir */
1252 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1253 		nb = bap[i];
1254 		skip = 1LL << (nindirshift * (NIADDR - j));
1255 	}
1256 
1257 	/*
1258 	 * Scan through the blocks in this array.
1259 	 */
1260 	for (; i < nindir; i++, lbn += skip) {
1261 		if (hole && (bap[i] == 0))
1262 			goto out;
1263 		if (!hole && (bap[i] != 0)) {
1264 			if (skip == 1) {
1265 				/* we're at the lowest level */
1266 				goto out;
1267 			} else {
1268 				goto loop;
1269 			}
1270 		}
1271 	}
1272 	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
1273 		goto loop;
1274 out:
1275 	for (i = 0; i < NIADDR; i++) {
1276 		if (bp[i])
1277 			brelse(bp[i]);
1278 	}
1279 	if (error == 0) {
1280 		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
1281 			error = ENXIO;
1282 		} else {
1283 			/* success */
1284 			*off = (u_offset_t)lbn << fs->fs_bshift;
1285 		}
1286 	}
1287 	return (error);
1288 }
1289