/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 * vp		vnode of the file being written
 * startoffset	byte offset in the file where the write begins
 * size		number of bytes to be written at startoffset
 * cred		credentials charged for the newly allocated blocks
 * flags	BA_* allocation flags and IO_* write-behavior flags
 * bpp		on success *bpp is the buffer covering the allocated
 *		block (or the final indirect block for BA_METAONLY)
 *
 * Returns 0 on success with *bpp set.  Returns EOPNOTSUPP for IO_EXT
 * (UFS1 has no external-attribute data area), EFBIG for a negative
 * logical block number, or an error from the underlying allocation or
 * read.  On a partial failure, every block allocated along the way is
 * unwound and freed again (see the "fail:" path at the bottom).
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ip->i_fs;
	ump = ip->i_ump;
	lbn = lblkno(fs, startoffset);
	/* size now covers from the start of the block to the end of write */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): no matching UFS_UNLOCK appears in
			 * this function after UFS_LOCK calls like this one;
			 * the lock appears to be handed to and released by
			 * ffs_realloccg()/ffs_alloc() — confirm in
			 * ffs_alloc.c.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/*
		 * A full-sized block already exists and covers the write:
		 * just read it and map its physical address.
		 */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: a fragment suffices at end of file. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	/*
	 * allociblk[]/lbns[] record every block allocated from here on so
	 * that the "fail:" path can unwind and free them on error.
	 */
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * Out of space: ask softdep to reclaim once, then
			 * retry the whole indirect walk from the top.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		/*
		 * NOTE(review): the UFS2 version passes GB_UNMAPPED here;
		 * this UFS1 path always maps the indirect buffer.
		 */
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * Parameters and return values are as for ffs_balloc_ufs1() above,
 * except that IO_EXT is supported: it allocates in the inode's
 * external-attribute data area (di_extb[]/di_extsize, buffers marked
 * BX_ALTDATA), which consists of direct blocks only; EFBIG is
 * returned for an external block number >= NXADDR or a negative lbn.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ip->i_fs;
	ump = ip->i_ump;
	lbn = lblkno(fs, startoffset);
	/* size now covers from the start of the block to the end of write */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				/*
				 * NOTE(review): as in ffs_balloc_ufs1(), the
				 * UFS lock taken here appears to be released
				 * by ffs_realloccg()/ffs_alloc(); confirm in
				 * ffs_alloc.c.
				 */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/*
		 * A full-sized block already exists and covers the write:
		 * just read it and map its physical address.
		 */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: a fragment suffices at end of file. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	/*
	 * allociblk[]/lbns[] record every block allocated from here on so
	 * that the "fail:" path can unwind and free them on error.
	 */
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * Out of space: ask softdep to reclaim once, then
			 * retry the whole indirect walk from the top.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}