1 /*- 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Copyright (c) 1982, 1986, 1989, 1993 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. 
Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 
 *
 * @(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 * Parameters:
 *	vp		the vnode whose file is being extended/filled in
 *	startoffset	byte offset in the file of the requested write
 *	size		number of bytes to be covered (trimmed to one
 *			filesystem block; panics if the span exceeds
 *			fs_bsize after adding the in-block offset)
 *	cred		credentials charged for the allocation (quota)
 *	flags		IO_* / BA_* control flags: IO_SYNC forces
 *			synchronous metadata writes, BA_CLRBUF zeroes or
 *			read-fills the returned data buffer, BA_METAONLY
 *			returns the indirect block instead of the data
 *			block, BA_UNMAPPED allows an unmapped buffer,
 *			BA_SEQMASK/BA_SEQSHIFT carry the read-ahead hint
 *	bpp		on success (*bpp != NULL unless an early error),
 *			the locked buffer for the requested block
 *
 * Returns 0 on success; EOPNOTSUPP for IO_EXT (UFS1 has no external
 * attribute blocks), EFBIG for a negative logical block number, or an
 * errno from the underlying allocation/read, after unwinding any
 * partially-allocated indirect chain.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	/* Path of indirect blocks from the inode down to the data block. */
	struct indir indirs[UFS_NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/*
	 * Record of blocks allocated during this call (and their logical
	 * block numbers) so the fail: path can release them on error.
	 */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	/* Rate-limit state for the "filesystem full" console message. */
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* Widen the request to cover from the start of the block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 inodes have no external attribute area. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): UFS_LOCK is taken here and not
			 * visibly released on this path — presumably
			 * ffs_realloccg drops it; confirm against its
			 * contract.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else if (DOINGASYNC(vp))
				bdwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first UFS_NDADDR blocks are direct blocks
	 */
	if (lbn < UFS_NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by i_size. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment to nsize. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if the file
			 * ends inside this block, else a full block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		/* Remember the new block for the error-unwind path. */
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			/*
			 * Soft updates tracks the dependency, so the
			 * indirect block can be written lazily.
			 */
			softdep_setup_allocdirect(ip,
			    UFS_NDADDR + indirs[0].in_off, newb, 0,
			    fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode points at it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On the first ENOSPC with soft updates, ask
			 * softdep to flush freed blocks and retry once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember the first indirect level we dirtied, so the
		 * unwind code can clear the on-disk pointer on failure.
		 */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * The data block already exists: read it in (with clustered
	 * read-ahead when the access pattern is sequential) or just
	 * instantiate a buffer for it.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* Undo the inode's new top-level indirect pointer. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/* NOTE(review): brelse below is unreachable after
			 * panic; kept to match the original control flow. */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * Same contract as ffs_balloc_ufs1() above, with one addition: UFS2
 * inodes carry an external attribute block area, so IO_EXT is honored
 * here (allocating in di_extb[] and marking buffers BX_ALTDATA) rather
 * than rejected.  Returns 0 with *bpp set on success, EFBIG for a
 * negative lbn or an IO_EXT lbn beyond UFS_NXADDR, or an errno from
 * the underlying allocation/read after unwinding partial allocations.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	/* Path of indirect blocks from the inode down to the data block. */
	struct indir indirs[UFS_NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	/*
	 * Record of blocks allocated during this call (and their logical
	 * block numbers) so the fail: path can release them on error.
	 */
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	/* Rate-limit state for the "filesystem full" console message. */
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* Widen the request to cover from the start of the block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= UFS_NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				UFS_LOCK(ump);
				/* Ext blocks use negative lbns: -1 - nb. */
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		/* Block already allocated and fully covered by extsize. */
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment already big enough. */
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				/* Grow the fragment to nsize. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if the ext
			 * area ends inside this block, else a full block.
			 */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first UFS_NDADDR blocks are direct blocks
	 */
	if (lbn < UFS_NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by i_size. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment already big enough. */
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment to nsize. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if the file
			 * ends inside this block, else a full block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		/* Remember the new block for the error-unwind path. */
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			/*
			 * Soft updates tracks the dependency, so the
			 * indirect block can be written lazily.
			 */
			softdep_setup_allocdirect(ip,
			    UFS_NDADDR + indirs[0].in_off, newb, 0,
			    fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Without soft updates the indirect block must be
			 * on disk before the inode points at it.
			 */
			if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On the first ENOSPC with soft updates, ask
			 * softdep to flush freed blocks and retry once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
			if (nbp->b_bufsize == fs->fs_bsize)
				nbp->b_flags |= B_CLUSTEROK;
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember the first indirect level we dirtied, so the
		 * unwind code can clear the on-disk pointer on failure.
		 */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* Undo the inode's new top-level indirect pointer. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/* NOTE(review): brelse below is unreachable after
			 * panic; kept to match the original control flow. */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}