1 /*- 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Copyright (c) 1982, 1986, 1989, 1993 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. 
Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 4. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 
 *
 * @(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 * On success, a buffer for the requested logical block is returned in
 * *bpp and 0 is returned.  Flags observed by this function:
 *	IO_EXT		rejected with EOPNOTSUPP (UFS1 has no external
 *			attribute area);
 *	BA_METAONLY	return the enclosing indirect block instead of
 *			the data block (panics for direct blocks);
 *	BA_CLRBUF	zero (or read in) the returned data buffer;
 *	BA_UNMAPPED	request an unmapped buffer (GB_UNMAPPED);
 *	IO_SYNC		write modified metadata synchronously.
 *
 * NOTE(review): UFS_LOCK(ump) is taken immediately before each
 * ffs_alloc()/ffs_realloccg() call with no matching UFS_UNLOCK() in this
 * function; presumably those routines drop the lock before returning --
 * confirm against ffs_alloc.c.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/*
	 * allociblk[]/lbns[] record every block allocated on this call so
	 * the "fail:" path below can roll all of them back on error.
	 */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size now measures from the start of lbn's block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			    cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else if (DOINGASYNC(vp))
				bdwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by the file. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* Allocate a fragment if this is the last block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode references it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On ENOSPC with soft updates, ask softdep to flush
			 * freed blocks and retry exactly once ("reclaimed"
			 * gates the single retry).
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		/*
		 * NOTE(review): unlike the UFS2 variant below, this getblk()
		 * does not pass GB_UNMAPPED for the new indirect block.
		 */
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * The block already exists: read it in (with read-ahead in the
	 * sequential case) or just map a buffer for it.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	    blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable
			 * after panic().
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * Same contract as ffs_balloc_ufs1() except that IO_EXT is supported:
 * with IO_EXT set, blocks are allocated in the inode's external
 * attribute area (di_extb[]/di_extsize, at most NXADDR direct blocks,
 * addressed with negative logical block numbers and marked BX_ALTDATA).
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	/*
	 * allociblk[]/lbns[] record every block allocated on this call so
	 * the "fail:" path below can roll all of them back on error.
	 */
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size now measures from the start of lbn's block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		/* Block already allocated and fully covered by the area. */
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* Allocate a fragment if this is the last block. */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by the file. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* Allocate a fragment if this is the last block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode references it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On ENOSPC with soft updates, ask softdep to flush
			 * freed blocks and retry exactly once ("reclaimed"
			 * gates the single retry).
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	    blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable
			 * after panic().
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}