1 /*- 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Copyright (c) 1982, 1986, 1989, 1993 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. 
Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 
 *
 * @(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 * vp		vnode of the file being extended
 * startoffset	byte offset in the file at which the write begins
 * size		number of bytes being written at startoffset
 * cred		credentials charged for the allocation
 * flags	BA_* / IO_* modifiers (BA_CLRBUF, BA_METAONLY,
 *		BA_UNMAPPED, IO_SYNC, ...)
 * bpp		on success (return 0) receives the locked buffer for the
 *		data block (or the indirect block when BA_METAONLY is set)
 *
 * Returns 0 on success or an errno (EOPNOTSUPP for IO_EXT on UFS1,
 * EFBIG, ENOSPC, or an I/O error) on failure, with *bpp set to NULL.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	/* Path of indirect blocks from the inode down to lbn. */
	struct indir indirs[UFS_NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/*
	 * allociblk[]/lbns[] record every block allocated during this call
	 * (and its logical block number) so that the fail: path can release
	 * and free them if allocation cannot be completed.
	 */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size becomes offset-within-block plus length: bytes needed in lbn. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 has no external attribute block area. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): UFS_LOCK(ump) is taken here and
			 * presumably released inside ffs_realloccg()/
			 * ffs_alloc() on all paths — confirm in ffs_alloc.c.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else if (DOINGASYNC(vp))
				bdwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first UFS_NDADDR blocks are direct blocks
	 */
	if (lbn < UFS_NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			/* Block exists and is fully covered: just read it. */
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment to nsize. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment or full block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip,
			    UFS_NDADDR + indirs[0].in_off, newb, 0,
			    fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode points at it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * Out of space: with soft updates, try once to
			 * reclaim blocks pending deallocation, then retry
			 * the whole indirect walk.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Link the new child into its parent indirect block. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * Data block already exists. If we must preserve its contents
	 * (BA_CLRBUF), read it — with read-ahead in the sequential case;
	 * otherwise just get a buffer that will be overwritten.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is unreachable — confirm intent.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * Parameters and return values are as for ffs_balloc_ufs1() above,
 * except that IO_EXT is supported here: it allocates in the inode's
 * external attribute block area (di_extb[], all direct blocks) and
 * marks those buffers BX_ALTDATA.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	/* Path of indirect blocks from the inode down to lbn. */
	struct indir indirs[UFS_NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	/*
	 * allociblk[]/lbns[] record every block allocated during this call
	 * so that the fail: path can release and free them on error.
	 */
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size becomes offset-within-block plus length: bytes needed in lbn. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= UFS_NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				UFS_LOCK(ump);
				/* -1 - nb: ext blocks use negative lbns. */
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first UFS_NDADDR blocks are direct blocks
	 */
	if (lbn < UFS_NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			/* Block exists and is fully covered: just read it. */
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment to nsize. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment or full block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip,
			    UFS_NDADDR + indirs[0].in_off, newb, 0,
			    fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode points at it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * Out of space: with soft updates, try once to
			 * reclaim blocks pending deallocation, then retry.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Link the new child into its parent indirect block. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is unreachable — confirm intent.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}