1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 62 */ 63 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/bio.h> 70 #include <sys/buf.h> 71 #include <sys/lock.h> 72 #include <sys/mount.h> 73 #include <sys/vnode.h> 74 #include <sys/vmmeter.h> 75 76 #include <ufs/ufs/quota.h> 77 #include <ufs/ufs/inode.h> 78 #include <ufs/ufs/ufs_extern.h> 79 #include <ufs/ufs/extattr.h> 80 #include <ufs/ufs/ufsmount.h> 81 82 #include <ufs/ffs/fs.h> 83 #include <ufs/ffs/ffs_extern.h> 84 85 /* 86 * Balloc defines the structure of filesystem storage 87 * by allocating the physical blocks on a device given 88 * the inode and the logical block number in a file. 89 * This is the allocation strategy for UFS1. Below is 90 * the allocation strategy for UFS2. 91 */ 92 int 93 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, 94 struct ucred *cred, int flags, struct buf **bpp) 95 { 96 struct inode *ip; 97 struct ufs1_dinode *dp; 98 ufs_lbn_t lbn, lastlbn; 99 struct fs *fs; 100 ufs1_daddr_t nb; 101 struct buf *bp, *nbp; 102 struct ufsmount *ump; 103 struct indir indirs[UFS_NIADDR + 2]; 104 int deallocated, osize, nsize, num, i, error; 105 ufs2_daddr_t newb; 106 ufs1_daddr_t *bap, pref; 107 ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; 108 ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1]; 109 int unwindidx = -1; 110 int saved_inbdflush; 111 static struct timeval lastfail; 112 static int curfail; 113 int gbflags, reclaimed; 114 115 ip = VTOI(vp); 116 dp = ip->i_din1; 117 fs = ITOFS(ip); 118 ump = ITOUMP(ip); 119 lbn = lblkno(fs, startoffset); 120 size = blkoff(fs, startoffset) + size; 121 reclaimed = 0; 122 if (size > fs->fs_bsize) 123 panic("ffs_balloc_ufs1: blk too big"); 124 *bpp = NULL; 125 if (flags & IO_EXT) 126 return (EOPNOTSUPP); 127 if (lbn < 0) 128 return (EFBIG); 129 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 130 131 if (DOINGSOFTDEP(vp)) 132 softdep_prealloc(vp, MNT_WAIT); 133 /* 134 * If the next write will extend the file into a new block, 135 * and the file is currently composed of a fragment 136 * this fragment has to be extended to be a full block. 137 */ 138 lastlbn = lblkno(fs, ip->i_size); 139 if (lastlbn < UFS_NDADDR && lastlbn < lbn) { 140 nb = lastlbn; 141 osize = blksize(fs, ip, nb); 142 if (osize < fs->fs_bsize && osize > 0) { 143 UFS_LOCK(ump); 144 error = ffs_realloccg(ip, nb, dp->di_db[nb], 145 ffs_blkpref_ufs1(ip, lastlbn, (int)nb, 146 &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, 147 cred, &bp); 148 if (error) 149 return (error); 150 if (DOINGSOFTDEP(vp)) 151 softdep_setup_allocdirect(ip, nb, 152 dbtofsb(fs, bp->b_blkno), dp->di_db[nb], 153 fs->fs_bsize, osize, bp); 154 ip->i_size = smalllblktosize(fs, nb + 1); 155 dp->di_size = ip->i_size; 156 dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); 157 ip->i_flag |= IN_CHANGE | IN_UPDATE; 158 if (flags & IO_SYNC) 159 bwrite(bp); 160 else if (DOINGASYNC(vp)) 161 bdwrite(bp); 162 else 163 bawrite(bp); 164 } 165 } 166 /* 167 * The first UFS_NDADDR blocks are direct blocks 168 */ 169 if (lbn < UFS_NDADDR) { 170 if (flags & BA_METAONLY) 171 panic("ffs_balloc_ufs1: BA_METAONLY for direct block"); 172 nb = dp->di_db[lbn]; 173 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { 174 error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); 175 if (error) { 176 brelse(bp); 177 return (error); 178 } 179 bp->b_blkno = fsbtodb(fs, nb); 180 *bpp = bp; 181 return (0); 182 } 183 if (nb != 0) { 184 /* 185 * Consider need to reallocate a fragment. 186 */ 187 osize = fragroundup(fs, blkoff(fs, ip->i_size)); 188 nsize = fragroundup(fs, size); 189 if (nsize <= osize) { 190 error = bread(vp, lbn, osize, NOCRED, &bp); 191 if (error) { 192 brelse(bp); 193 return (error); 194 } 195 bp->b_blkno = fsbtodb(fs, nb); 196 } else { 197 UFS_LOCK(ump); 198 error = ffs_realloccg(ip, lbn, dp->di_db[lbn], 199 ffs_blkpref_ufs1(ip, lbn, (int)lbn, 200 &dp->di_db[0]), osize, nsize, flags, 201 cred, &bp); 202 if (error) 203 return (error); 204 if (DOINGSOFTDEP(vp)) 205 softdep_setup_allocdirect(ip, lbn, 206 dbtofsb(fs, bp->b_blkno), nb, 207 nsize, osize, bp); 208 } 209 } else { 210 if (ip->i_size < smalllblktosize(fs, lbn + 1)) 211 nsize = fragroundup(fs, size); 212 else 213 nsize = fs->fs_bsize; 214 UFS_LOCK(ump); 215 error = ffs_alloc(ip, lbn, 216 ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), 217 nsize, flags, cred, &newb); 218 if (error) 219 return (error); 220 bp = getblk(vp, lbn, nsize, 0, 0, gbflags); 221 bp->b_blkno = fsbtodb(fs, newb); 222 if (flags & BA_CLRBUF) 223 vfs_bio_clrbuf(bp); 224 if (DOINGSOFTDEP(vp)) 225 softdep_setup_allocdirect(ip, lbn, newb, 0, 226 nsize, 0, bp); 227 } 228 dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); 229 ip->i_flag |= IN_CHANGE | IN_UPDATE; 230 *bpp = bp; 231 return (0); 232 } 233 /* 234 * Determine the number of levels of indirection. 235 */ 236 pref = 0; 237 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 238 return(error); 239 #ifdef INVARIANTS 240 if (num < 1) 241 panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block"); 242 #endif 243 saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); 244 /* 245 * Fetch the first indirect block allocating if necessary. 246 */ 247 --num; 248 nb = dp->di_ib[indirs[0].in_off]; 249 allocib = NULL; 250 allocblk = allociblk; 251 lbns_remfree = lbns; 252 if (nb == 0) { 253 UFS_LOCK(ump); 254 pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1, 255 (ufs1_daddr_t *)0); 256 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 257 flags, cred, &newb)) != 0) { 258 curthread_pflags_restore(saved_inbdflush); 259 return (error); 260 } 261 pref = newb + fs->fs_frag; 262 nb = newb; 263 MPASS(allocblk < allociblk + nitems(allociblk)); 264 MPASS(lbns_remfree < lbns + nitems(lbns)); 265 *allocblk++ = nb; 266 *lbns_remfree++ = indirs[1].in_lbn; 267 bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); 268 bp->b_blkno = fsbtodb(fs, nb); 269 vfs_bio_clrbuf(bp); 270 if (DOINGSOFTDEP(vp)) { 271 softdep_setup_allocdirect(ip, 272 UFS_NDADDR + indirs[0].in_off, newb, 0, 273 fs->fs_bsize, 0, bp); 274 bdwrite(bp); 275 } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { 276 if (bp->b_bufsize == fs->fs_bsize) 277 bp->b_flags |= B_CLUSTEROK; 278 bdwrite(bp); 279 } else { 280 if ((error = bwrite(bp)) != 0) 281 goto fail; 282 } 283 allocib = &dp->di_ib[indirs[0].in_off]; 284 *allocib = nb; 285 ip->i_flag |= IN_CHANGE | IN_UPDATE; 286 } 287 /* 288 * Fetch through the indirect blocks, allocating as necessary. 289 */ 290 retry: 291 for (i = 1;;) { 292 error = bread(vp, 293 indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); 294 if (error) { 295 brelse(bp); 296 goto fail; 297 } 298 bap = (ufs1_daddr_t *)bp->b_data; 299 nb = bap[indirs[i].in_off]; 300 if (i == num) 301 break; 302 i += 1; 303 if (nb != 0) { 304 bqrelse(bp); 305 continue; 306 } 307 UFS_LOCK(ump); 308 /* 309 * If parent indirect has just been allocated, try to cluster 310 * immediately following it. 311 */ 312 if (pref == 0) 313 pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1, 314 (ufs1_daddr_t *)0); 315 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 316 flags | IO_BUFLOCKED, cred, &newb)) != 0) { 317 brelse(bp); 318 if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { 319 UFS_LOCK(ump); 320 softdep_request_cleanup(fs, vp, cred, 321 FLUSH_BLOCKS_WAIT); 322 UFS_UNLOCK(ump); 323 goto retry; 324 } 325 if (ppsratecheck(&lastfail, &curfail, 1)) { 326 ffs_fserr(fs, ip->i_number, "filesystem full"); 327 uprintf("\n%s: write failed, filesystem " 328 "is full\n", fs->fs_fsmnt); 329 } 330 goto fail; 331 } 332 pref = newb + fs->fs_frag; 333 nb = newb; 334 MPASS(allocblk < allociblk + nitems(allociblk)); 335 MPASS(lbns_remfree < lbns + nitems(lbns)); 336 *allocblk++ = nb; 337 *lbns_remfree++ = indirs[i].in_lbn; 338 nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); 339 nbp->b_blkno = fsbtodb(fs, nb); 340 vfs_bio_clrbuf(nbp); 341 if (DOINGSOFTDEP(vp)) { 342 softdep_setup_allocindir_meta(nbp, ip, bp, 343 indirs[i - 1].in_off, nb); 344 bdwrite(nbp); 345 } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { 346 if (nbp->b_bufsize == fs->fs_bsize) 347 nbp->b_flags |= B_CLUSTEROK; 348 bdwrite(nbp); 349 } else { 350 if ((error = bwrite(nbp)) != 0) { 351 brelse(bp); 352 goto fail; 353 } 354 } 355 bap[indirs[i - 1].in_off] = nb; 356 if (allocib == NULL && unwindidx < 0) 357 unwindidx = i - 1; 358 /* 359 * If required, write synchronously, otherwise use 360 * delayed write. 361 */ 362 if (flags & IO_SYNC) { 363 bwrite(bp); 364 } else { 365 if (bp->b_bufsize == fs->fs_bsize) 366 bp->b_flags |= B_CLUSTEROK; 367 bdwrite(bp); 368 } 369 } 370 /* 371 * If asked only for the indirect block, then return it. 372 */ 373 if (flags & BA_METAONLY) { 374 curthread_pflags_restore(saved_inbdflush); 375 *bpp = bp; 376 return (0); 377 } 378 /* 379 * Get the data block, allocating if necessary. 380 */ 381 if (nb == 0) { 382 UFS_LOCK(ump); 383 /* 384 * If allocating metadata at the front of the cylinder 385 * group and parent indirect block has just been allocated, 386 * then cluster next to it if it is the first indirect in 387 * the file. Otherwise it has been allocated in the metadata 388 * area, so we want to find our own place out in the data area. 389 */ 390 if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0)) 391 pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, 392 &bap[0]); 393 error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 394 flags | IO_BUFLOCKED, cred, &newb); 395 if (error) { 396 brelse(bp); 397 if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { 398 UFS_LOCK(ump); 399 softdep_request_cleanup(fs, vp, cred, 400 FLUSH_BLOCKS_WAIT); 401 UFS_UNLOCK(ump); 402 goto retry; 403 } 404 if (ppsratecheck(&lastfail, &curfail, 1)) { 405 ffs_fserr(fs, ip->i_number, "filesystem full"); 406 uprintf("\n%s: write failed, filesystem " 407 "is full\n", fs->fs_fsmnt); 408 } 409 goto fail; 410 } 411 nb = newb; 412 MPASS(allocblk < allociblk + nitems(allociblk)); 413 MPASS(lbns_remfree < lbns + nitems(lbns)); 414 *allocblk++ = nb; 415 *lbns_remfree++ = lbn; 416 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); 417 nbp->b_blkno = fsbtodb(fs, nb); 418 if (flags & BA_CLRBUF) 419 vfs_bio_clrbuf(nbp); 420 if (DOINGSOFTDEP(vp)) 421 softdep_setup_allocindir_page(ip, lbn, bp, 422 indirs[i].in_off, nb, 0, nbp); 423 bap[indirs[i].in_off] = nb; 424 /* 425 * If required, write synchronously, otherwise use 426 * delayed write. 427 */ 428 if (flags & IO_SYNC) { 429 bwrite(bp); 430 } else { 431 if (bp->b_bufsize == fs->fs_bsize) 432 bp->b_flags |= B_CLUSTEROK; 433 bdwrite(bp); 434 } 435 curthread_pflags_restore(saved_inbdflush); 436 *bpp = nbp; 437 return (0); 438 } 439 brelse(bp); 440 if (flags & BA_CLRBUF) { 441 int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; 442 if (seqcount != 0 && 443 (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && 444 !(vm_page_count_severe() || buf_dirty_count_severe())) { 445 error = cluster_read(vp, ip->i_size, lbn, 446 (int)fs->fs_bsize, NOCRED, 447 MAXBSIZE, seqcount, gbflags, &nbp); 448 } else { 449 error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, 450 gbflags, &nbp); 451 } 452 if (error) { 453 brelse(nbp); 454 goto fail; 455 } 456 } else { 457 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); 458 nbp->b_blkno = fsbtodb(fs, nb); 459 } 460 curthread_pflags_restore(saved_inbdflush); 461 *bpp = nbp; 462 return (0); 463 fail: 464 curthread_pflags_restore(saved_inbdflush); 465 /* 466 * If we have failed to allocate any blocks, simply return the error. 467 * This is the usual case and avoids the need to fsync the file. 468 */ 469 if (allocblk == allociblk && allocib == NULL && unwindidx == -1) 470 return (error); 471 /* 472 * If we have failed part way through block allocation, we 473 * have to deallocate any indirect blocks that we have allocated. 474 * We have to fsync the file before we start to get rid of all 475 * of its dependencies so that we do not leave them dangling. 476 * We have to sync it at the end so that the soft updates code 477 * does not find any untracked changes. Although this is really 478 * slow, running out of disk space is not expected to be a common 479 * occurrence. The error return from fsync is ignored as we already 480 * have an error to return to the user. 481 * 482 * XXX Still have to journal the free below 483 */ 484 (void) ffs_syncvnode(vp, MNT_WAIT, 0); 485 for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; 486 blkp < allocblk; blkp++, lbns_remfree++) { 487 /* 488 * We shall not leave the freed blocks on the vnode 489 * buffer object lists. 490 */ 491 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, 492 GB_NOCREAT | GB_UNMAPPED); 493 if (bp != NULL) { 494 KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), 495 ("mismatch1 l %jd %jd b %ju %ju", 496 (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, 497 (uintmax_t)bp->b_blkno, 498 (uintmax_t)fsbtodb(fs, *blkp))); 499 bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; 500 bp->b_flags &= ~(B_ASYNC | B_CACHE); 501 brelse(bp); 502 } 503 deallocated += fs->fs_bsize; 504 } 505 if (allocib != NULL) { 506 *allocib = 0; 507 } else if (unwindidx >= 0) { 508 int r; 509 510 r = bread(vp, indirs[unwindidx].in_lbn, 511 (int)fs->fs_bsize, NOCRED, &bp); 512 if (r) { 513 panic("Could not unwind indirect block, error %d", r); 514 brelse(bp); 515 } else { 516 bap = (ufs1_daddr_t *)bp->b_data; 517 bap[indirs[unwindidx].in_off] = 0; 518 if (flags & IO_SYNC) { 519 bwrite(bp); 520 } else { 521 if (bp->b_bufsize == fs->fs_bsize) 522 bp->b_flags |= B_CLUSTEROK; 523 bdwrite(bp); 524 } 525 } 526 } 527 if (deallocated) { 528 #ifdef QUOTA 529 /* 530 * Restore user's disk quota because allocation failed. 531 */ 532 (void) chkdq(ip, -btodb(deallocated), cred, FORCE); 533 #endif 534 dp->di_blocks -= btodb(deallocated); 535 ip->i_flag |= IN_CHANGE | IN_UPDATE; 536 } 537 (void) ffs_syncvnode(vp, MNT_WAIT, 0); 538 /* 539 * After the buffers are invalidated and on-disk pointers are 540 * cleared, free the blocks. 541 */ 542 for (blkp = allociblk; blkp < allocblk; blkp++) { 543 #ifdef INVARIANTS 544 if (blkp == allociblk) 545 lbns_remfree = lbns; 546 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, 547 GB_NOCREAT | GB_UNMAPPED); 548 if (bp != NULL) { 549 panic("zombie1 %jd %ju %ju", 550 (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, 551 (uintmax_t)fsbtodb(fs, *blkp)); 552 } 553 lbns_remfree++; 554 #endif 555 ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, 556 ip->i_number, vp->v_type, NULL, SINGLETON_KEY); 557 } 558 return (error); 559 } 560 561 /* 562 * Balloc defines the structure of file system storage 563 * by allocating the physical blocks on a device given 564 * the inode and the logical block number in a file. 565 * This is the allocation strategy for UFS2. Above is 566 * the allocation strategy for UFS1. 567 */ 568 int 569 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, 570 struct ucred *cred, int flags, struct buf **bpp) 571 { 572 struct inode *ip; 573 struct ufs2_dinode *dp; 574 ufs_lbn_t lbn, lastlbn; 575 struct fs *fs; 576 struct buf *bp, *nbp; 577 struct ufsmount *ump; 578 struct indir indirs[UFS_NIADDR + 2]; 579 ufs2_daddr_t nb, newb, *bap, pref; 580 ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; 581 ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1]; 582 int deallocated, osize, nsize, num, i, error; 583 int unwindidx = -1; 584 int saved_inbdflush; 585 static struct timeval lastfail; 586 static int curfail; 587 int gbflags, reclaimed; 588 589 ip = VTOI(vp); 590 dp = ip->i_din2; 591 fs = ITOFS(ip); 592 ump = ITOUMP(ip); 593 lbn = lblkno(fs, startoffset); 594 size = blkoff(fs, startoffset) + size; 595 reclaimed = 0; 596 if (size > fs->fs_bsize) 597 panic("ffs_balloc_ufs2: blk too big"); 598 *bpp = NULL; 599 if (lbn < 0) 600 return (EFBIG); 601 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 602 603 if (DOINGSOFTDEP(vp)) 604 softdep_prealloc(vp, MNT_WAIT); 605 606 /* 607 * Check for allocating external data. 608 */ 609 if (flags & IO_EXT) { 610 if (lbn >= UFS_NXADDR) 611 return (EFBIG); 612 /* 613 * If the next write will extend the data into a new block, 614 * and the data is currently composed of a fragment 615 * this fragment has to be extended to be a full block. 616 */ 617 lastlbn = lblkno(fs, dp->di_extsize); 618 if (lastlbn < lbn) { 619 nb = lastlbn; 620 osize = sblksize(fs, dp->di_extsize, nb); 621 if (osize < fs->fs_bsize && osize > 0) { 622 UFS_LOCK(ump); 623 error = ffs_realloccg(ip, -1 - nb, 624 dp->di_extb[nb], 625 ffs_blkpref_ufs2(ip, lastlbn, (int)nb, 626 &dp->di_extb[0]), osize, 627 (int)fs->fs_bsize, flags, cred, &bp); 628 if (error) 629 return (error); 630 if (DOINGSOFTDEP(vp)) 631 softdep_setup_allocext(ip, nb, 632 dbtofsb(fs, bp->b_blkno), 633 dp->di_extb[nb], 634 fs->fs_bsize, osize, bp); 635 dp->di_extsize = smalllblktosize(fs, nb + 1); 636 dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); 637 bp->b_xflags |= BX_ALTDATA; 638 ip->i_flag |= IN_CHANGE; 639 if (flags & IO_SYNC) 640 bwrite(bp); 641 else 642 bawrite(bp); 643 } 644 } 645 /* 646 * All blocks are direct blocks 647 */ 648 if (flags & BA_METAONLY) 649 panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); 650 nb = dp->di_extb[lbn]; 651 if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { 652 error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, 653 gbflags, &bp); 654 if (error) { 655 brelse(bp); 656 return (error); 657 } 658 bp->b_blkno = fsbtodb(fs, nb); 659 bp->b_xflags |= BX_ALTDATA; 660 *bpp = bp; 661 return (0); 662 } 663 if (nb != 0) { 664 /* 665 * Consider need to reallocate a fragment. 666 */ 667 osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); 668 nsize = fragroundup(fs, size); 669 if (nsize <= osize) { 670 error = bread_gb(vp, -1 - lbn, osize, NOCRED, 671 gbflags, &bp); 672 if (error) { 673 brelse(bp); 674 return (error); 675 } 676 bp->b_blkno = fsbtodb(fs, nb); 677 bp->b_xflags |= BX_ALTDATA; 678 } else { 679 UFS_LOCK(ump); 680 error = ffs_realloccg(ip, -1 - lbn, 681 dp->di_extb[lbn], 682 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 683 &dp->di_extb[0]), osize, nsize, flags, 684 cred, &bp); 685 if (error) 686 return (error); 687 bp->b_xflags |= BX_ALTDATA; 688 if (DOINGSOFTDEP(vp)) 689 softdep_setup_allocext(ip, lbn, 690 dbtofsb(fs, bp->b_blkno), nb, 691 nsize, osize, bp); 692 } 693 } else { 694 if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) 695 nsize = fragroundup(fs, size); 696 else 697 nsize = fs->fs_bsize; 698 UFS_LOCK(ump); 699 error = ffs_alloc(ip, lbn, 700 ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), 701 nsize, flags, cred, &newb); 702 if (error) 703 return (error); 704 bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); 705 bp->b_blkno = fsbtodb(fs, newb); 706 bp->b_xflags |= BX_ALTDATA; 707 if (flags & BA_CLRBUF) 708 vfs_bio_clrbuf(bp); 709 if (DOINGSOFTDEP(vp)) 710 softdep_setup_allocext(ip, lbn, newb, 0, 711 nsize, 0, bp); 712 } 713 dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); 714 ip->i_flag |= IN_CHANGE; 715 *bpp = bp; 716 return (0); 717 } 718 /* 719 * If the next write will extend the file into a new block, 720 * and the file is currently composed of a fragment 721 * this fragment has to be extended to be a full block. 722 */ 723 lastlbn = lblkno(fs, ip->i_size); 724 if (lastlbn < UFS_NDADDR && lastlbn < lbn) { 725 nb = lastlbn; 726 osize = blksize(fs, ip, nb); 727 if (osize < fs->fs_bsize && osize > 0) { 728 UFS_LOCK(ump); 729 error = ffs_realloccg(ip, nb, dp->di_db[nb], 730 ffs_blkpref_ufs2(ip, lastlbn, (int)nb, 731 &dp->di_db[0]), osize, (int)fs->fs_bsize, 732 flags, cred, &bp); 733 if (error) 734 return (error); 735 if (DOINGSOFTDEP(vp)) 736 softdep_setup_allocdirect(ip, nb, 737 dbtofsb(fs, bp->b_blkno), 738 dp->di_db[nb], 739 fs->fs_bsize, osize, bp); 740 ip->i_size = smalllblktosize(fs, nb + 1); 741 dp->di_size = ip->i_size; 742 dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); 743 ip->i_flag |= IN_CHANGE | IN_UPDATE; 744 if (flags & IO_SYNC) 745 bwrite(bp); 746 else 747 bawrite(bp); 748 } 749 } 750 /* 751 * The first UFS_NDADDR blocks are direct blocks 752 */ 753 if (lbn < UFS_NDADDR) { 754 if (flags & BA_METAONLY) 755 panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); 756 nb = dp->di_db[lbn]; 757 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { 758 error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, 759 gbflags, &bp); 760 if (error) { 761 brelse(bp); 762 return (error); 763 } 764 bp->b_blkno = fsbtodb(fs, nb); 765 *bpp = bp; 766 return (0); 767 } 768 if (nb != 0) { 769 /* 770 * Consider need to reallocate a fragment. 771 */ 772 osize = fragroundup(fs, blkoff(fs, ip->i_size)); 773 nsize = fragroundup(fs, size); 774 if (nsize <= osize) { 775 error = bread_gb(vp, lbn, osize, NOCRED, 776 gbflags, &bp); 777 if (error) { 778 brelse(bp); 779 return (error); 780 } 781 bp->b_blkno = fsbtodb(fs, nb); 782 } else { 783 UFS_LOCK(ump); 784 error = ffs_realloccg(ip, lbn, dp->di_db[lbn], 785 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 786 &dp->di_db[0]), osize, nsize, flags, 787 cred, &bp); 788 if (error) 789 return (error); 790 if (DOINGSOFTDEP(vp)) 791 softdep_setup_allocdirect(ip, lbn, 792 dbtofsb(fs, bp->b_blkno), nb, 793 nsize, osize, bp); 794 } 795 } else { 796 if (ip->i_size < smalllblktosize(fs, lbn + 1)) 797 nsize = fragroundup(fs, size); 798 else 799 nsize = fs->fs_bsize; 800 UFS_LOCK(ump); 801 error = ffs_alloc(ip, lbn, 802 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 803 &dp->di_db[0]), nsize, flags, cred, &newb); 804 if (error) 805 return (error); 806 bp = getblk(vp, lbn, nsize, 0, 0, gbflags); 807 bp->b_blkno = fsbtodb(fs, newb); 808 if (flags & BA_CLRBUF) 809 vfs_bio_clrbuf(bp); 810 if (DOINGSOFTDEP(vp)) 811 softdep_setup_allocdirect(ip, lbn, newb, 0, 812 nsize, 0, bp); 813 } 814 dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); 815 ip->i_flag |= IN_CHANGE | IN_UPDATE; 816 *bpp = bp; 817 return (0); 818 } 819 /* 820 * Determine the number of levels of indirection. 821 */ 822 pref = 0; 823 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 824 return(error); 825 #ifdef INVARIANTS 826 if (num < 1) 827 panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block"); 828 #endif 829 saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); 830 /* 831 * Fetch the first indirect block allocating if necessary. 832 */ 833 --num; 834 nb = dp->di_ib[indirs[0].in_off]; 835 allocib = NULL; 836 allocblk = allociblk; 837 lbns_remfree = lbns; 838 if (nb == 0) { 839 UFS_LOCK(ump); 840 pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1, 841 (ufs2_daddr_t *)0); 842 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 843 flags, cred, &newb)) != 0) { 844 curthread_pflags_restore(saved_inbdflush); 845 return (error); 846 } 847 pref = newb + fs->fs_frag; 848 nb = newb; 849 MPASS(allocblk < allociblk + nitems(allociblk)); 850 MPASS(lbns_remfree < lbns + nitems(lbns)); 851 *allocblk++ = nb; 852 *lbns_remfree++ = indirs[1].in_lbn; 853 bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 854 GB_UNMAPPED); 855 bp->b_blkno = fsbtodb(fs, nb); 856 vfs_bio_clrbuf(bp); 857 if (DOINGSOFTDEP(vp)) { 858 softdep_setup_allocdirect(ip, 859 UFS_NDADDR + indirs[0].in_off, newb, 0, 860 fs->fs_bsize, 0, bp); 861 bdwrite(bp); 862 } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { 863 if (bp->b_bufsize == fs->fs_bsize) 864 bp->b_flags |= B_CLUSTEROK; 865 bdwrite(bp); 866 } else { 867 if ((error = bwrite(bp)) != 0) 868 goto fail; 869 } 870 allocib = &dp->di_ib[indirs[0].in_off]; 871 *allocib = nb; 872 ip->i_flag |= IN_CHANGE | IN_UPDATE; 873 } 874 /* 875 * Fetch through the indirect blocks, allocating as necessary. 876 */ 877 retry: 878 for (i = 1;;) { 879 error = bread(vp, 880 indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); 881 if (error) { 882 brelse(bp); 883 goto fail; 884 } 885 bap = (ufs2_daddr_t *)bp->b_data; 886 nb = bap[indirs[i].in_off]; 887 if (i == num) 888 break; 889 i += 1; 890 if (nb != 0) { 891 bqrelse(bp); 892 continue; 893 } 894 UFS_LOCK(ump); 895 /* 896 * If parent indirect has just been allocated, try to cluster 897 * immediately following it. 898 */ 899 if (pref == 0) 900 pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1, 901 (ufs2_daddr_t *)0); 902 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 903 flags | IO_BUFLOCKED, cred, &newb)) != 0) { 904 brelse(bp); 905 if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { 906 UFS_LOCK(ump); 907 softdep_request_cleanup(fs, vp, cred, 908 FLUSH_BLOCKS_WAIT); 909 UFS_UNLOCK(ump); 910 goto retry; 911 } 912 if (ppsratecheck(&lastfail, &curfail, 1)) { 913 ffs_fserr(fs, ip->i_number, "filesystem full"); 914 uprintf("\n%s: write failed, filesystem " 915 "is full\n", fs->fs_fsmnt); 916 } 917 goto fail; 918 } 919 pref = newb + fs->fs_frag; 920 nb = newb; 921 MPASS(allocblk < allociblk + nitems(allociblk)); 922 MPASS(lbns_remfree < lbns + nitems(lbns)); 923 *allocblk++ = nb; 924 *lbns_remfree++ = indirs[i].in_lbn; 925 nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 926 GB_UNMAPPED); 927 nbp->b_blkno = fsbtodb(fs, nb); 928 vfs_bio_clrbuf(nbp); 929 if (DOINGSOFTDEP(vp)) { 930 softdep_setup_allocindir_meta(nbp, ip, bp, 931 indirs[i - 1].in_off, nb); 932 bdwrite(nbp); 933 } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { 934 if (nbp->b_bufsize == fs->fs_bsize) 935 nbp->b_flags |= B_CLUSTEROK; 936 bdwrite(nbp); 937 } else { 938 if ((error = bwrite(nbp)) != 0) { 939 brelse(bp); 940 goto fail; 941 } 942 } 943 bap[indirs[i - 1].in_off] = nb; 944 if (allocib == NULL && unwindidx < 0) 945 unwindidx = i - 1; 946 /* 947 * If required, write synchronously, otherwise use 948 * delayed write. 949 */ 950 if (flags & IO_SYNC) { 951 bwrite(bp); 952 } else { 953 if (bp->b_bufsize == fs->fs_bsize) 954 bp->b_flags |= B_CLUSTEROK; 955 bdwrite(bp); 956 } 957 } 958 /* 959 * If asked only for the indirect block, then return it. 960 */ 961 if (flags & BA_METAONLY) { 962 curthread_pflags_restore(saved_inbdflush); 963 *bpp = bp; 964 return (0); 965 } 966 /* 967 * Get the data block, allocating if necessary. 968 */ 969 if (nb == 0) { 970 UFS_LOCK(ump); 971 /* 972 * If allocating metadata at the front of the cylinder 973 * group and parent indirect block has just been allocated, 974 * then cluster next to it if it is the first indirect in 975 * the file. Otherwise it has been allocated in the metadata 976 * area, so we want to find our own place out in the data area. 977 */ 978 if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0)) 979 pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, 980 &bap[0]); 981 error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 982 flags | IO_BUFLOCKED, cred, &newb); 983 if (error) { 984 brelse(bp); 985 if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { 986 UFS_LOCK(ump); 987 softdep_request_cleanup(fs, vp, cred, 988 FLUSH_BLOCKS_WAIT); 989 UFS_UNLOCK(ump); 990 goto retry; 991 } 992 if (ppsratecheck(&lastfail, &curfail, 1)) { 993 ffs_fserr(fs, ip->i_number, "filesystem full"); 994 uprintf("\n%s: write failed, filesystem " 995 "is full\n", fs->fs_fsmnt); 996 } 997 goto fail; 998 } 999 nb = newb; 1000 MPASS(allocblk < allociblk + nitems(allociblk)); 1001 MPASS(lbns_remfree < lbns + nitems(lbns)); 1002 *allocblk++ = nb; 1003 *lbns_remfree++ = lbn; 1004 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); 1005 nbp->b_blkno = fsbtodb(fs, nb); 1006 if (flags & BA_CLRBUF) 1007 vfs_bio_clrbuf(nbp); 1008 if (DOINGSOFTDEP(vp)) 1009 softdep_setup_allocindir_page(ip, lbn, bp, 1010 indirs[i].in_off, nb, 0, nbp); 1011 bap[indirs[i].in_off] = nb; 1012 /* 1013 * If required, write synchronously, otherwise use 1014 * delayed write. 1015 */ 1016 if (flags & IO_SYNC) { 1017 bwrite(bp); 1018 } else { 1019 if (bp->b_bufsize == fs->fs_bsize) 1020 bp->b_flags |= B_CLUSTEROK; 1021 bdwrite(bp); 1022 } 1023 curthread_pflags_restore(saved_inbdflush); 1024 *bpp = nbp; 1025 return (0); 1026 } 1027 brelse(bp); 1028 /* 1029 * If requested clear invalid portions of the buffer. If we 1030 * have to do a read-before-write (typical if BA_CLRBUF is set), 1031 * try to do some read-ahead in the sequential case to reduce 1032 * the number of I/O transactions. 1033 */ 1034 if (flags & BA_CLRBUF) { 1035 int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; 1036 if (seqcount != 0 && 1037 (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && 1038 !(vm_page_count_severe() || buf_dirty_count_severe())) { 1039 error = cluster_read(vp, ip->i_size, lbn, 1040 (int)fs->fs_bsize, NOCRED, 1041 MAXBSIZE, seqcount, gbflags, &nbp); 1042 } else { 1043 error = bread_gb(vp, lbn, (int)fs->fs_bsize, 1044 NOCRED, gbflags, &nbp); 1045 } 1046 if (error) { 1047 brelse(nbp); 1048 goto fail; 1049 } 1050 } else { 1051 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); 1052 nbp->b_blkno = fsbtodb(fs, nb); 1053 } 1054 curthread_pflags_restore(saved_inbdflush); 1055 *bpp = nbp; 1056 return (0); 1057 fail: 1058 curthread_pflags_restore(saved_inbdflush); 1059 /* 1060 * If we have failed to allocate any blocks, simply return the error. 1061 * This is the usual case and avoids the need to fsync the file. 1062 */ 1063 if (allocblk == allociblk && allocib == NULL && unwindidx == -1) 1064 return (error); 1065 /* 1066 * If we have failed part way through block allocation, we 1067 * have to deallocate any indirect blocks that we have allocated. 1068 * We have to fsync the file before we start to get rid of all 1069 * of its dependencies so that we do not leave them dangling. 1070 * We have to sync it at the end so that the soft updates code 1071 * does not find any untracked changes. Although this is really 1072 * slow, running out of disk space is not expected to be a common 1073 * occurrence. The error return from fsync is ignored as we already 1074 * have an error to return to the user. 1075 * 1076 * XXX Still have to journal the free below 1077 */ 1078 (void) ffs_syncvnode(vp, MNT_WAIT, 0); 1079 for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; 1080 blkp < allocblk; blkp++, lbns_remfree++) { 1081 /* 1082 * We shall not leave the freed blocks on the vnode 1083 * buffer object lists. 1084 */ 1085 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, 1086 GB_NOCREAT | GB_UNMAPPED); 1087 if (bp != NULL) { 1088 KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), 1089 ("mismatch2 l %jd %jd b %ju %ju", 1090 (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, 1091 (uintmax_t)bp->b_blkno, 1092 (uintmax_t)fsbtodb(fs, *blkp))); 1093 bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; 1094 bp->b_flags &= ~(B_ASYNC | B_CACHE); 1095 brelse(bp); 1096 } 1097 deallocated += fs->fs_bsize; 1098 } 1099 if (allocib != NULL) { 1100 *allocib = 0; 1101 } else if (unwindidx >= 0) { 1102 int r; 1103 1104 r = bread(vp, indirs[unwindidx].in_lbn, 1105 (int)fs->fs_bsize, NOCRED, &bp); 1106 if (r) { 1107 panic("Could not unwind indirect block, error %d", r); 1108 brelse(bp); 1109 } else { 1110 bap = (ufs2_daddr_t *)bp->b_data; 1111 bap[indirs[unwindidx].in_off] = 0; 1112 if (flags & IO_SYNC) { 1113 bwrite(bp); 1114 } else { 1115 if (bp->b_bufsize == fs->fs_bsize) 1116 bp->b_flags |= B_CLUSTEROK; 1117 bdwrite(bp); 1118 } 1119 } 1120 } 1121 if (deallocated) { 1122 #ifdef QUOTA 1123 /* 1124 * Restore user's disk quota because allocation failed. 1125 */ 1126 (void) chkdq(ip, -btodb(deallocated), cred, FORCE); 1127 #endif 1128 dp->di_blocks -= btodb(deallocated); 1129 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1130 } 1131 (void) ffs_syncvnode(vp, MNT_WAIT, 0); 1132 /* 1133 * After the buffers are invalidated and on-disk pointers are 1134 * cleared, free the blocks. 1135 */ 1136 for (blkp = allociblk; blkp < allocblk; blkp++) { 1137 #ifdef INVARIANTS 1138 if (blkp == allociblk) 1139 lbns_remfree = lbns; 1140 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, 1141 GB_NOCREAT | GB_UNMAPPED); 1142 if (bp != NULL) { 1143 panic("zombie2 %jd %ju %ju", 1144 (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, 1145 (uintmax_t)fsbtodb(fs, *blkp)); 1146 } 1147 lbns_remfree++; 1148 #endif 1149 ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, 1150 ip->i_number, vp->v_type, NULL, SINGLETON_KEY); 1151 } 1152 return (error); 1153 } 1154