/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/disp.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_bio.h>
#include <vm/seg.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kmem.h>

/*
 * This structure is used to track blocks as we allocate them, so that
 * we can free them if we encounter an error during allocation.  We
 * keep track of five pieces of information for each allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */

enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};

struct ufs_allocated_block {
	daddr_t	this_block;	    /* Number of this block */
	off_t	block_size;	    /* Size of this block, in bytes */
	enum ufs_owner_type owner;  /* Who points to this block? */
	daddr_t	owner_block;	    /* Number of the owning block */
	uint_t	owner_offset;	    /* Offset within that block or inode */
	int	usage_flags;	    /* Usage flags, as expected by free() */
};
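
/*
 * Illustrative example (not compiled in): after bmap_write() allocates
 * a new indirect block nb and links it into indirect block ob at index
 * i, the corresponding entry reads:
 *
 *	undo_table[n].this_block = nb
 *	undo_table[n].block_size = bsize
 *	undo_table[n].owner = ufs_indirect_block
 *	undo_table[n].owner_block = ob
 *	undo_table[n].owner_offset = i
 *	undo_table[n].usage_flags = metaflag | I_IBLK
 *
 * On error, ufs_undo_allocation() first zeroes the pointer at offset i
 * in block ob and only then returns block nb to the free list.
 */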

static int	findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		int maxtrans);

static void	ufs_undo_allocation(inode_t *ip, int block_count,
		struct ufs_allocated_block table[], int inode_sector_adjust);

/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp);				\
	register int _chkfrag = chkfrag; /* for lint. sigh */		\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		register int len;					\
									\
		len = findextent(fs, dp, (int)(n), lenp, maxtrans) <<	\
		    (fs)->fs_bshift;					\
		if (_chkfrag) {						\
			register u_offset_t tmp;			\
									\
			tmp = fragroundup((fs), size) -			\
			    (((u_offset_t)lbn) << fs->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb(fs, *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}

/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
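
/*
 * Sanity check of the constant above: a terabyte contains 2^31 =
 * 0x80000000 512-byte blocks, a gigabyte contains 2^21 = 0x00200000,
 * and 0x80000000 - 0x00200000 = 0x7FE00000.
 */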

/*
 * bmap_read() and bmap_write() define the structure of file system
 * storage by mapping a logical offset in a file to a physical block
 * number on the device.  bmap_write() should be called with a locked
 * inode when allocation is to be done.  Note this strangeness:
 * bmap_write() is always called from getpage(), not putpage(), since
 * getpage() is where all the allocation is done.
 *
 * S_READ, S_OTHER -> bmap_read(); S_WRITE -> bmap_write().
 *
 * NOTICE: the block number returned is the disk block number, not the
 * file system block number.  All the worries about block offsets and
 * page/block sizes are hidden inside of bmap.  Well, not quite,
 * unfortunately.  It's impossible to find one place to hide all this
 * mess.  There are 3 cases:
 *
 * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
 *	the kluster routines may be out of memory, we don't always get all
 *	the pages we wanted.  If we called bmap first, to find out how much
 *	to kluster, we handed in the block aligned offset.  If we didn't get
 *	all the pages, we have to chop off the amount we didn't get from the
 *	amount handed back by bmap.
 *
 * PAGESIZE == bsize
 *	Life is quite pleasant here, no extra work needed, mainly because we
 *	(probably?) won't kluster backwards, just forwards.
 *
 * PAGESIZE > bsize
 *	This one has a different set of problems, specifically, we may have to
 *	do N reads to fill one page.  Let us hope that Sun will stay with small
 *	pages.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 *
 * TODO
 *	LMXXX - add a bmap cache.  This could be a couple of extents in the
 *	inode.  Two is nice for PAGESIZE > bsize.
 */

int
bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
{
	daddr_t lbn;
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct	fs *fs = ufsvfsp->vfs_fs;
	struct	buf *bp;
	int	i, j, boff;
	int	shft;			/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int	nindirshift, nindiroffset;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, off);
	boff = (int)blkoff(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		DOEXTENT(fs, lbn, boff, bnp, lenp,
		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
		    ufsvfsp->vfs_iotransz);
		return (0);
	}

	nindirshift = ufsvfsp->vfs_nindirshift;
	nindiroffset = ufsvfsp->vfs_nindiroffset;
	/*
	 * Determine how many levels of indirection.
	 */
	shft = 0;			/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}
	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		*bnp = UFS_HOLE;
		return (0);
	}

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;

		ASSERT(!ufs_indir_badblock(ip, bap));

		shft -= nindirshift;	/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		if (nb == 0) {
			*bnp = UFS_HOLE;
			brelse(bp);
			return (0);
		}
		if (j != NIADDR)
			brelse(bp);
	}
	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
	    0, ufsvfsp->vfs_iotransz);
	brelse(bp);
	return (0);
}
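
/*
 * Worked example of the indirection walk above (illustrative, assuming
 * an 8 KB block size): each indirect block then holds
 * 8192 / sizeof (daddr32_t) = 2048 entries, so vfs_nindirshift = 11 and
 * vfs_nindiroffset = 2047.  With the usual NDADDR = 12 direct blocks,
 * logical block 5000 is mapped as follows:
 *
 *	tbn = 5000 - NDADDR = 4988
 *	level 1 covers 2048 blocks; 4988 >= 2048, so tbn -= 2048 -> 2940
 *	level 2 covers 2048^2 blocks; 2940 < 2048^2, so the loop stops
 *	with j selecting the double indirect block (shft == 22)
 *
 * Walking down, i = (2940 >> 11) & 2047 = 1 in the double indirect
 * block, then i = 2940 & 2047 = 892 in the single indirect block.
 */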

/*
 * See bmap_read() for general notes.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_only is set, bmap_write() will not
 * create any in-core pages that correspond to the new disk allocation.
 * Otherwise, the in-core pages will be created and initialized as
 * needed.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 */

int
bmap_write(
	struct inode	*ip,
	u_offset_t	off,
	int		size,
	int		alloc_only,
	struct cred	*cr)
{
	struct	fs *fs;
	struct	buf *bp;
	int	i;
	struct	buf *nbp;
	int	j;
	int	shft;			/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, pref, lbn, llbn, tbn;
	daddr32_t *bap;
	struct	vnode *vp = ITOV(ip);
	long	bsize = VBSIZE(vp);
	long	osize, nsize;
	int	issync, metaflag, isdirquota;
	int	err;
	dev_t	dev;
	struct	fbuf *fbp;
	int	nindirshift;
	int	nindiroffset;
	struct	ufsvfs *ufsvfsp;
	int	added_sectors;		/* sectors added to this inode */
	int	alloced_blocks;		/* fs blocks newly allocated */
	struct	ufs_allocated_block undo_table[NIADDR+1];
	int	verylargefile = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	ufsvfsp = ip->i_ufsvfs;
	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);
	if (ip->i_blocks >= VERYLARGEFILESIZE)
		verylargefile = 1;
	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
	metaflag = isdirquota = 0;
	if (((ip->i_mode & IFMT) == IFDIR) ||
	    ((ip->i_mode & IFMT) == IFATTRDIR))
		isdirquota = metaflag = I_DIR;
	else if ((ip->i_mode & IFMT) == IFSHAD)
		metaflag = I_SHAD;
	else if (ip->i_ufsvfs->vfs_qinod == ip)
		isdirquota = metaflag = I_QUOTA;

	issync = ((ip->i_flag & ISYNC) != 0);

	if (isdirquota || issync) {
		alloc_only = 0;		/* make sure */
	}

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
		osize = blksize(fs, ip, llbn);
		if (osize < bsize && osize > 0) {
			/*
			 * Check to see if doing this will make the file too
			 * big.  Only check if we are dealing with a very
			 * large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks +
				    btodb(bsize - osize)) > INT_MAX) {
					return (EFBIG);
				}
			}
			/*
			 * Make sure we have all needed pages setup correctly.
			 *
			 * We pass S_OTHER to fbread here because we want
			 * an exclusive lock on the page in question
			 * (see ufs_getpage).  I/O to the old block location
			 * may still be in progress and we are about to free
			 * the old block.  We don't want anyone else to get
			 * a hold of the old block once we free it until
			 * the I/O is complete.
			 */
			err = fbread(ITOV(ip),
			    ((offset_t)llbn << fs->fs_bshift),
			    (uint_t)bsize, S_OTHER, &fbp);
			if (err)
				return (err);
			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
			    &nb, cr);
			if (err) {
				if (fbp)
					fbrelse(fbp, S_OTHER);
				return (err);
			}
			ASSERT(!ufs_badblock(ip, nb));

			/*
			 * Update the inode before releasing the
			 * lock on the page.  If we released the page
			 * lock first, the data could be written to its
			 * old address and then destroyed.
			 */
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
			ip->i_db[llbn] = nb;
			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
			    ip);
			ip->i_blocks += btodb(bsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */
			/*
			 * Don't check metaflag here, directories won't do this
			 */
			if (issync) {
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else {
				ASSERT(fbp);
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob) {
				(void) free(ip, ob, (off_t)osize, metaflag);
			}
		}
	}
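
	/*
	 * Illustrative numbers for the fragment extension above (assuming
	 * an 8 KB/1 KB file system, fs_bsize = 8192 and fs_fsize = 1024):
	 * if the file currently ends in a 3 KB fragment (osize = 3072)
	 * and the coming write starts past the end of that block,
	 * realloccg() grows the fragment to a full 8 KB block and
	 * i_blocks rises by btodb(8192 - 3072) = 10 512-byte sectors.
	 */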

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		nb = ip->i_db[lbn];
		if (nb == 0 ||
		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
			if (nb != 0) {
				/* consider need to reallocate a frag */
				osize = fragroundup(fs, blkoff(fs, ip->i_size));
				nsize = fragroundup(fs, size);
				if (nsize <= osize)
					goto gotit;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				/*
				 * need to allocate a block or frag
				 */
				ob = nb;
				pref = blkpref(ip, lbn, (int)lbn,
				    &ip->i_db[0]);
				err = realloccg(ip, ob, pref, (int)osize,
				    (int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));

			} else {
				/*
				 * need to allocate a block or frag
				 */
				osize = 0;
				if (ip->i_size <
				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
					nsize = fragroundup(fs, size);
				else
					nsize = bsize;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
				err = alloc(ip, pref, (int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));
				ob = nb;
			}

			/*
			 * Read old/create new zero pages
			 */
			fbp = NULL;
			if (osize == 0) {
				/*
				 * mmap S_WRITE faults always enter here
				 */
				if (!alloc_only || P2ROUNDUP_TYPED(size,
				    PAGESIZE, u_offset_t) < nsize) {
					/* fbzero doesn't cause a pagefault */
					fbzero(ITOV(ip),
					    ((offset_t)lbn << fs->fs_bshift),
					    (uint_t)nsize, &fbp);
				}
			} else {
				err = fbread(vp,
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)nsize, S_OTHER, &fbp);
				if (err) {
					if (nb != ob) {
						(void) free(ip, nb,
						    (off_t)nsize, metaflag);
					} else {
						(void) free(ip,
						    ob + numfrags(fs, osize),
						    (off_t)(nsize - osize),
						    metaflag);
					}
					ASSERT(nsize >= osize);
					(void) chkdq(ip,
					    -(long)btodb(nsize - osize),
					    0, cr, (char **)NULL,
					    (size_t *)NULL);
					return (err);
				}
			}
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
			ip->i_db[lbn] = nb;
			ip->i_blocks += btodb(nsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			/*
			 * Write directory and shadow blocks synchronously so
			 * that they never appear with garbage in them on the
			 * disk.
			 */
			if (isdirquota && (ip->i_size ||
			    TRANS_ISTRANS(ufsvfsp))) {
				/*
				 * XXX may not be necessary with harpy trans
				 * bug id 1130055
				 */
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else if (fbp) {
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob)
				(void) free(ip, ob, (off_t)osize, metaflag);
		}
gotit:
		return (0);
	}
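
	/*
	 * Example of the "nsize <= osize" early exit above (illustrative
	 * numbers, 1 KB fragments): with i_size = 3000, the last block
	 * already holds fragroundup(fs, blkoff(fs, 3000)) = 3072 bytes.
	 * A request with size = 2500 needs only fragroundup(fs, 2500) =
	 * 2560 bytes, so the existing fragment suffices and no new
	 * allocation is done.
	 */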

	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */

	/*
	 * Determine how many levels of indirection.
	 */
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	pref = 0;
	shft = 0;			/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}

	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	dev = ip->i_dev;
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		/*
		 * Check to see if doing this will make the
		 * file too big.  Only check if we are dealing
		 * with a very large file.
		 */
		if (verylargefile == 1) {
			if (((unsigned)ip->i_blocks + btodb(bsize))
			    > INT_MAX) {
				return (EFBIG);
			}
		}
		/*
		 * Need to allocate an indirect block.
		 */
		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
		err = alloc(ip, pref, (int)bsize, &nb, cr);
		if (err)
			return (err);
		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
		ASSERT(!ufs_badblock(ip, nb));

		/*
		 * Keep track of this allocation so we can undo it if we
		 * get an error later.
		 */

		ASSERT(alloced_blocks <= NIADDR);

		undo_table[alloced_blocks].this_block = nb;
		undo_table[alloced_blocks].block_size = bsize;
		undo_table[alloced_blocks].owner = ufs_no_owner;
		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

		alloced_blocks++;

		/*
		 * Write zero block synchronously so that
		 * indirect blocks never point at garbage.
		 */
		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

		clrbuf(bp);
		/* XXX Maybe special-case this? */
		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
		UFS_BWRITE2(ufsvfsp, bp);
		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		brelse(bp);

		ip->i_ib[NIADDR - j] = nb;
		added_sectors += btodb(bsize);
		ip->i_blocks += btodb(bsize);
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		/* Caller is responsible for updating i_seq */

		/*
		 * Update the 'undo table' now that we've linked this block
		 * to an inode.
		 */

		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;

		/*
		 * In the ISYNC case, wrip will notice that the block
		 * count on the inode has changed and will be sure to
		 * ufs_iupdat the inode at the end of wrip.
		 */
	}
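
	/*
	 * At this point (illustrative): if a fresh top-level indirect
	 * block was just allocated, undo_table[0] records it with
	 * owner == ufs_inode_indirect and owner_offset == NIADDR - j,
	 * so an error below can unlink it from i_ib[] before freeing it.
	 */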

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);

		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			/*
			 * Return any partial allocations.
			 *
			 * It is possible that we have not yet made any
			 * allocations at this point (if this is the first
			 * pass through the loop and we didn't have to
			 * allocate the first indirect block, above).
			 * In this case, alloced_blocks and added_sectors will
			 * be zero, and ufs_undo_allocation will do nothing.
			 */
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;	/* sh /= nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		if (nb == 0) {
			/*
			 * Check to see if doing this will make the
			 * file too big.  Only check if we are dealing
			 * with a very large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks + btodb(bsize))
				    > INT_MAX) {
					brelse(bp);
					ufs_undo_allocation(ip, alloced_blocks,
					    undo_table, added_sectors);
					return (EFBIG);
				}
			}
			if (pref == 0) {
				if (j < NIADDR) {
					/* Indirect block */
					pref = blkpref(ip, lbn, 0,
					    (daddr32_t *)0);
				} else {
					/* Data block */
					pref = blkpref(ip, lbn, i, &bap[0]);
				}
			}

			/*
			 * release "bp" buf to avoid deadlock (re-bread later)
			 */
			brelse(bp);

			err = alloc(ip, pref, (int)bsize, &nb, cr);
			if (err) {
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip, alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}

			ASSERT(!ufs_badblock(ip, nb));

			ASSERT(alloced_blocks <= NIADDR);

			undo_table[alloced_blocks].this_block = nb;
			undo_table[alloced_blocks].block_size = bsize;
			undo_table[alloced_blocks].owner = ufs_no_owner;
			undo_table[alloced_blocks].usage_flags = metaflag |
			    ((j < NIADDR) ? I_IBLK : 0);

			alloced_blocks++;

			if (j < NIADDR) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
				/*
				 * Write synchronously so indirect
				 * blocks never point at garbage.
				 */
				nbp = UFS_GETBLK(
				    ufsvfsp, dev, fsbtodb(fs, nb), bsize);

				clrbuf(nbp);
				/* XXX Maybe special-case this? */
				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
				UFS_BWRITE2(ufsvfsp, nbp);
				if (nbp->b_flags & B_ERROR) {
					err = geterror(nbp);
					brelse(nbp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(nbp);
			} else if (!alloc_only || P2ROUNDUP_TYPED(size,
			    PAGESIZE, u_offset_t) < bsize) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
				fbzero(ITOV(ip),
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)bsize, &fbp);

				/*
				 * Cases which we need to do a synchronous
				 * write of the zeroed data pages:
				 *
				 * 1) If we are writing a directory then we
				 * want to write synchronously so blocks in
				 * directories never contain garbage.
				 *
				 * 2) If we are filling in a hole and the
				 * indirect block is going to be synchronously
				 * written back below we need to make sure
				 * that the zeroes are written here before
				 * the indirect block is updated so that if
				 * we crash before the real data is pushed
				 * we will not end up with random data in
				 * the middle of the file.
				 *
				 * 3) If the size of the request rounded up
				 * to the system page size is smaller than
				 * the file system block size, we want to
				 * write out all the pages now so that
				 * they are not aborted before they actually
				 * make it to ufs_putpage since the length
				 * of the inode will not include the pages.
				 */

				if (isdirquota || (issync &&
				    lbn < llbn))
					(void) ufs_fbiwrite(fbp, ip, nb,
					    fs->fs_fsize);
				else
					fbrelse(fbp, S_WRITE);
			}

			/*
			 * re-acquire "bp" buf
			 */
			bp = UFS_BREAD(ufsvfsp,
			    ip->i_dev, fsbtodb(fs, ob), bsize);
			if (bp->b_flags & B_ERROR) {
				err = geterror(bp);
				brelse(bp);
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip,
				    alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}
			bap = bp->b_un.b_daddr;
			bap[i] = nb;
			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
			added_sectors += btodb(bsize);
			ip->i_blocks += btodb(bsize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			undo_table[alloced_blocks-1].owner =
			    ufs_indirect_block;
			undo_table[alloced_blocks-1].owner_block = ob;
			undo_table[alloced_blocks-1].owner_offset = i;

			if (issync) {
				UFS_BWRITE2(ufsvfsp, bp);
				if (bp->b_flags & B_ERROR) {
					err = geterror(bp);
					brelse(bp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(bp);
			} else {
				bdrwrite(bp);
			}
		} else {
			brelse(bp);
		}
	}
	return (0);
}
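
/*
 * Typical use (illustrative sketch, not part of this file): the UFS
 * getpage path maps a file offset before issuing disk I/O, roughly
 *
 *	err = bmap_read(ip, off, &bn, &len);
 *	if (err == 0 && bn != UFS_HOLE)
 *		... read "len" bytes starting at disk block "bn" ...
 *
 * and calls bmap_write() instead (with i_contents held as a writer,
 * as the ASSERT above requires) when the fault is for S_WRITE and
 * blocks may need to be allocated.
 */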

/*
 * Return 1 if inode has unmapped blocks (UFS holes).
 */
int
bmap_has_holes(struct inode *ip)
{
	struct fs *fs = ip->i_fs;
	uint_t	dblks;			/* # of data blocks */
	uint_t	mblks;			/* # of data + metadata blocks */
	int	nindirshift;
	int	nindiroffset;
	uint_t	cnt;
	int	n, j, shft;
	uint_t	nindirblks;

	int	fsbshift = fs->fs_bshift;
	int	fsboffset = (1 << fsbshift) - 1;

	dblks = (ip->i_size + fsboffset) >> fsbshift;
	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;

	/*
	 * File has only direct blocks.
	 */
	if (dblks <= NDADDR)
		return (mblks < dblks);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	nindirblks = nindiroffset + 1;

	dblks -= NDADDR;
	shft = 0;
	/*
	 * Determine how many levels of indirection.
	 */
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (dblks <= sh)
			break;
		dblks -= sh;
	}
	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(NIADDR <= 3);
	ASSERT(j <= NIADDR);
	if (j == NIADDR)		/* single level indirection */
		cnt = NDADDR + 1 + dblks;
	else if (j == NIADDR-1)		/* double indirection */
		cnt = NDADDR + 1 + nindirblks +
		    1 + (dblks + nindiroffset)/nindirblks + dblks;
	else if (j == NIADDR-2) {	/* triple indirection */
		n = (dblks + nindiroffset)/nindirblks;
		cnt = NDADDR + 1 + nindirblks +
		    1 + nindirblks + nindirblks*nindirblks +
		    1 + (n + nindiroffset)/nindirblks + n + dblks;
	}

	return (mblks < cnt);
}

/*
 * Find some contiguous blocks starting at *sbp, going for at most
 * min(n, max_contig), and return the number of blocks (not frags) found.
 * The array passed in must be at least [0..n-1].
 */
static int
findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
{
	register daddr_t bn, nextbn;
	register daddr32_t *bp;
	register int diff;
	int maxtransblk;

	if (n <= 0)
		return (0);
	bn = *sbp;
	if (bn == 0)
		return (0);
	diff = fs->fs_frag;
	if (*lenp) {
		n = MIN(n, lblkno(fs, *lenp));
	} else {
		/*
		 * If the user has set the value for maxcontig lower than
		 * the drive transfer size, then assume they want this
		 * to be the maximum value for the size of the data transfer.
		 */
		maxtransblk = maxtransfer >> DEV_BSHIFT;
		if (fs->fs_maxcontig < maxtransblk) {
			n = MIN(n, fs->fs_maxcontig);
		} else {
			n = MIN(n, maxtransblk);
		}
	}
	bp = sbp;
	while (--n > 0) {
		nextbn = *(bp + 1);
		if (nextbn == 0 || bn + diff != nextbn)
			break;
		bn = nextbn;
		bp++;
	}
	return ((int)(bp - sbp) + 1);
}
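
/*
 * Illustrative findextent() walk (assuming fs_frag = 8, i.e. 8 frags
 * per block): successive entries of a physically contiguous run differ
 * by exactly fs_frag, so a table of {136, 144, 152, 400, ...} yields an
 * extent of 3 blocks; the jump from 152 to 400 ends the run.
 */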

/*
 * Free any blocks which had been successfully allocated.  Always called
 * as a result of an error, so we don't bother returning an error code
 * from here.
 *
 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
 * Thus it is safe to call this as part of error handling, whether or not
 * any blocks have been allocated.
 *
 * The ufs_inode_direct case is currently unused.
 */

static void
ufs_undo_allocation(
	inode_t *ip,
	int block_count,
	struct ufs_allocated_block table[],
	int inode_sector_adjust)
{
	int i;
	int inode_changed;
	int error_updating_pointers;
	struct ufsvfs *ufsvfsp;

	inode_changed = 0;
	error_updating_pointers = 0;

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Update pointers on disk before freeing blocks.  If we fail,
	 * some blocks may remain busy; but they will be reclaimed by
	 * an fsck.  (This is better than letting a block wind up with
	 * two owners if we successfully freed it but could not remove
	 * the pointer to it.)
	 */

	for (i = 0; i < block_count; i++) {
		switch (table[i].owner) {
		case ufs_no_owner:
			/* Nothing to do here, nobody points to us */
			break;
		case ufs_inode_direct:
			ASSERT(table[i].owner_offset < NDADDR);
			ip->i_db[table[i].owner_offset] = 0;
			inode_changed = 1;
			break;
		case ufs_inode_indirect:
			ASSERT(table[i].owner_offset < NIADDR);
			ip->i_ib[table[i].owner_offset] = 0;
			inode_changed = 1;
			break;
		case ufs_indirect_block: {
			buf_t *bp;
			daddr32_t *block_data;

			/* Read/modify/log/write. */

			ASSERT(table[i].owner_offset <
			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));

			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
			    VBSIZE(ITOV(ip)));

			if (bp->b_flags & B_ERROR) {
				/* Couldn't read this block; give up. */
				error_updating_pointers = 1;
				brelse(bp);
				break;	/* out of SWITCH */
			}

			block_data = bp->b_un.b_daddr;
			block_data[table[i].owner_offset] = 0;

			/*
			 * Write a log entry which includes the zero.
			 * It might be possible to optimize this by using
			 * TRANS_BUF directly and zeroing only the four
			 * bytes involved, but an attempt to do that led
			 * to panics in the logging code.  The attempt was
			 *	TRANS_BUF(ufsvfsp,
			 *	    table[i].owner_offset * sizeof (daddr32_t),
			 *	    sizeof (daddr32_t),
			 *	    bp,
			 *	    DT_ABZERO);
			 */

			TRANS_BUF_ITEM_128(ufsvfsp,
			    block_data[table[i].owner_offset],
			    block_data, bp, DT_AB);

			/* Now we can write the buffer itself. */

			UFS_BWRITE2(ufsvfsp, bp);

			if (bp->b_flags & B_ERROR) {
				error_updating_pointers = 1;
			}

			brelse(bp);
			break;
		}
		default:
			(void) ufs_fault(ITOV(ip),
			    "ufs_undo_allocation failure\n");
			break;
		}
	}

	/*
	 * If the inode changed, or if we need to update its block count,
	 * then do that now.  We update the inode synchronously on disk
	 * to ensure that it won't transiently point at a block we've
	 * freed (only necessary if we're not logging).
	 *
	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
	 * it is fixed, we should verify that we successfully updated the
	 * inode before freeing blocks below.
	 */

	if (inode_changed || (inode_sector_adjust != 0)) {
		ip->i_blocks -= inode_sector_adjust;
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		ip->i_seq++;
		if (!TRANS_ISTRANS(ufsvfsp))
			ufs_iupdat(ip, I_SYNC);
	}

	/*
	 * Now we go through and actually free the blocks, but only if we
	 * successfully removed the pointers to them.
	 */

	if (!error_updating_pointers) {
		for (i = 0; i < block_count; i++) {
			free(ip, table[i].this_block, table[i].block_size,
			    table[i].usage_flags);
		}
	}
}
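
/*
 * Note (illustrative): if bmap_write() fails after building a double
 * indirect chain, the table might hold three entries: an indirect block
 * owned by the inode (ufs_inode_indirect), a second-level indirect
 * block owned by the first (ufs_indirect_block), and a data block owned
 * by the second.  The loop above zeroes each owning pointer before any
 * of the blocks is handed back to free().
 */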

/*
 * Find the next hole or data block in file starting at *off.
 * Return found offset in *off, which can be less than the
 * starting offset if not block aligned.
 * This code is based on bmap_read().
 * Errors: ENXIO for end of file
 *	   EIO for block read error.
 */
int
bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
{
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	buf_t *bp[NIADDR];
	int i, j;
	int shft;			/* we maintain sh = 1 << shft */
	int nindirshift, nindiroffset;
	daddr_t	ob, nb, tbn, lbn, skip;
	daddr32_t *bap;
	u_offset_t isz = (offset_t)ip->i_size;
	int32_t bs = fs->fs_bsize;	/* file system block size */
	int32_t nindir = fs->fs_nindir;
	dev_t dev;
	int error = 0;
	daddr_t limits[NIADDR];

	ASSERT(*off < isz);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, *off);
	ASSERT(lbn >= 0);

	for (i = 0; i < NIADDR; i++)
		bp[i] = NULL;

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		for (; lbn < NDADDR; lbn++) {
			if ((hole && (ip->i_db[lbn] == 0)) ||
			    (!hole && (ip->i_db[lbn] != 0))) {
				goto out;
			}
		}
		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
			goto out;
	}

	nindir = fs->fs_nindir;
	nindirshift = ufsvfsp->vfs_nindirshift;
	nindiroffset = ufsvfsp->vfs_nindiroffset;
	dev = ip->i_dev;

	/* Set up limits array */
	for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));

loop:
	/*
	 * Determine how many levels of indirection.
	 */
	shft = 0;			/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}
	if (j == 0) {
		/* must have passed end of file */
		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
		goto out;
	}

	/*
	 * Fetch the first indirect block.
	 */
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		if (hole) {
			lbn = limits[NIADDR - j];
			goto out;
		} else {
			lbn = limits[NIADDR - j + 1];
			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
				goto out;
			goto loop;
		}
	}

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; ((j <= NIADDR) && (nb != 0)); j++) {
		ob = nb;
		/*
		 * If there's a different block at this level then release
		 * the old one and read in the new one.
		 */
		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
			if (bp[j-1] != NULL)
				brelse(bp[j-1]);
			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
			if (bp[j-1]->b_flags & B_ERROR) {
				error = EIO;
				goto out;
			}
		}
		bap = bp[j-1]->b_un.b_daddr;

		shft -= nindirshift;	/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		skip = 1LL << (nindirshift * (NIADDR - j));
	}
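
	/*
	 * At this point (illustrative, nindirshift = 11): if the walk
	 * stopped at the lowest level, skip == 1 and lbn advances one
	 * block per bap[] entry in the scan below; if it stopped at an
	 * unallocated second-level entry, skip == 2048, since each such
	 * entry stands for a whole indirect block's worth of file blocks.
	 */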

	/*
	 * Scan through the blocks in this array.
	 */
	for (; i < nindir; i++, lbn += skip) {
		if (hole && (bap[i] == 0))
			goto out;
		if (!hole && (bap[i] != 0)) {
			if (skip == 1) {
				/* we're at the lowest level */
				goto out;
			} else {
				goto loop;
			}
		}
	}
	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
		goto loop;
out:
	for (i = 0; i < NIADDR; i++) {
		if (bp[i])
			brelse(bp[i]);
	}
	if (error == 0) {
		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
			error = ENXIO;
		} else {
			/* success */
			*off = (u_offset_t)lbn << fs->fs_bshift;
		}
	}
	return (error);
}