/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;

/*
 * hlock any file systems w/errored logs
 */
int
ufs_trans_hlock()
{
    struct ufsvfs *ufsvfsp;
    struct lockfs lockfs;
    int error;
    int retry = 0;

    /*
     * find fs's that paniced or have errored logging devices
     */
    mutex_enter(&ufsvfs_mutex);
    for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
        /*
         * not mounted; continue
         */
        if ((ufsvfsp->vfs_vfs == NULL) ||
            (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
            continue;
        /*
         * disallow unmounts (hlock occurs below)
         */
        if (TRANS_ISERROR(ufsvfsp))
            ufsvfsp->vfs_validfs = UT_HLOCKING;
    }
    mutex_exit(&ufsvfs_mutex);

    /*
     * hlock the fs's that paniced or have errored logging devices
     */
again:
    mutex_enter(&ufsvfs_mutex);
    for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
        if (ufsvfsp->vfs_validfs == UT_HLOCKING)
            break;
    mutex_exit(&ufsvfs_mutex);
    if (ufsvfsp == NULL)
        return (retry);
    /*
     * hlock the file system
     */
    (void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
    if (!LOCKFS_IS_ELOCK(&lockfs)) {
        lockfs.lf_lock = LOCKFS_HLOCK;
        lockfs.lf_flags = 0;
        lockfs.lf_comlen = 0;
        lockfs.lf_comment = NULL;
        error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
        /*
         * retry after a while; another app is currently doing lockfs
         */
        if (error == EBUSY || error == EINVAL)
            retry = 1;
    } else {
        if (ufsfx_get_failure_qlen() > 0) {
            if (mutex_tryenter(&ufs_fix.uq_mutex)) {
                ufs_fix.uq_lowat = ufs_fix.uq_ne;
                cv_broadcast(&ufs_fix.uq_cv);
                mutex_exit(&ufs_fix.uq_mutex);
            }
        }
        retry = 1;
    }

    /*
     * allow unmounts
     */
    ufsvfsp->vfs_validfs = UT_MOUNTED;
    goto again;
}

/*ARGSUSED*/
void
ufs_trans_onerror()
{
    mutex_enter(&ufs_hlock.uq_mutex);
    ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
    cv_broadcast(&ufs_hlock.uq_cv);
    mutex_exit(&ufs_hlock.uq_mutex);
}

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
    if (curthread->t_flag & T_DONTBLOCK) {
        sbupdate(vfsp);
        return;
    } else {

        if (panicstr && TRANS_ISTRANS(ufsvfsp))
            return;

        curthread->t_flag |= T_DONTBLOCK;
        TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
        sbupdate(vfsp);
        TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
        curthread->t_flag &= ~T_DONTBLOCK;
    }
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
    struct ufsvfs *ufsvfsp;

    if (curthread->t_flag & T_DONTBLOCK) {
        rw_enter(&ip->i_contents, RW_READER);
        ufs_iupdat(ip, waitfor);
        rw_exit(&ip->i_contents);
        return;
    } else {
        ufsvfsp = ip->i_ufsvfs;

        if (panicstr && TRANS_ISTRANS(ufsvfsp))
            return;

        curthread->t_flag |= T_DONTBLOCK;
        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
        rw_enter(&ip->i_contents, RW_READER);
        ufs_iupdat(ip, waitfor);
        rw_exit(&ip->i_contents);
        TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
        curthread->t_flag &= ~T_DONTBLOCK;
    }
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
    if (curthread->t_flag & T_DONTBLOCK) {
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_sbwrite(ufsvfsp);
        mutex_exit(&ufsvfsp->vfs_lock);
        return;
    } else {

        if (panicstr && TRANS_ISTRANS(ufsvfsp))
            return;

        curthread->t_flag |= T_DONTBLOCK;
        TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_sbwrite(ufsvfsp);
        mutex_exit(&ufsvfsp->vfs_lock);
        TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
        curthread->t_flag &= ~T_DONTBLOCK;
    }
}

/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
    struct fs *fs;

    fs = ufsvfsp->vfs_fs;
    mutex_enter(&ufsvfsp->vfs_lock);
    TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
        ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
        (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
    mutex_exit(&ufsvfsp->vfs_lock);
    return (0);
}

/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
    struct buf *bp;

    bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
    if (bp == NULL)
        return (ENOENT);

    if (bp->b_flags & B_DELWRI) {
        /*
         * Do not use brwrite() here since the buffer is already
         * marked for retry or not by the code that called
         * TRANS_BUF().
         */
        UFS_BWRITE(ufsvfsp, bp);
        return (0);
    }
    /*
     * If we did not find the real buf for this block above then
     * clear the dev so the buf won't be found by mistake
     * for this block later.  We had to allocate at least a 1 byte
     * buffer to keep brelse happy.
     */
    if (bp->b_bufsize == 1) {
        bp->b_dev = (o_dev_t)NODEV;
        bp->b_edev = NODEV;
        bp->b_flags = 0;
    }
    brelse(bp);
    return (ENOENT);
}

/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
    int error;
    struct inode *ip;

    /*
     * Grab the quota lock (if the file system has not been forcibly
     * unmounted).
     */
    if (ufsvfsp)
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

    error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

    if (ufsvfsp)
        rw_exit(&ufsvfsp->vfs_dqrwlock);
    if (error)
        return (ENOENT);

    if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
        rw_enter(&ip->i_contents, RW_READER);
        ufs_iupdat(ip, 1);
        rw_exit(&ip->i_contents);
        VN_RELE(ITOV(ip));
        return (0);
    }
    VN_RELE(ITOV(ip));
    return (ENOENT);
}

#ifdef DEBUG
/*
 * These routines maintain the metadata map (matamap)
 */

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
    /*
     * wrong file system; keep looking
     */
    if (ip->i_ufsvfs != (struct ufsvfs *)arg)
        return (0);

    /*
     * load the metadata map
     */
    rw_enter(&ip->i_contents, RW_WRITER);
    ufs_trans_mata_iget(ip);
    rw_exit(&ip->i_contents);
    return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
    struct fs *fs = ufsvfsp->vfs_fs;
    ino_t ino;
    int i;

    /*
     * put static metadata into matamap
     *   superblock
     *   cylinder groups
     *   inode groups
     *   existing inodes
     */
    TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

    for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
        TRANS_MATAADD(ufsvfsp,
            ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
        TRANS_MATAADD(ufsvfsp,
            ldbtob(fsbtodb(fs, itod(fs, ino))),
            fs->fs_ipg * sizeof (struct dinode));
    }
    (void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
    top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
    TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
        fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
    struct inode *ip,
    daddr_t *fragsp,
    daddr32_t *blkp,
    unsigned int nblk)
{
    int i;
    daddr_t frag;
    ulong_t nb;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;

    for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
        if ((frag = *blkp) != 0) {
            if (*fragsp > fs->fs_frag) {
                nb = fs->fs_bsize;
                *fragsp -= fs->fs_frag;
            } else {
                nb = *fragsp * fs->fs_fsize;
                *fragsp = 0;
            }
            TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
        }
}

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
    struct inode *ip,
    daddr_t *fragsp,
    daddr_t frag,
    int level)
{
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;
    int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
    int i;
    struct buf *bp;
    daddr32_t *blkp;
    o_mode_t ifmt = ip->i_mode & IFMT;

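    /*
     * read the indirect block itself; if the read failed there is
     * nothing to add to the matamap, so just release the buffer
     */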
    bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
    if (bp->b_flags & B_ERROR) {
        brelse(bp);
        return;
    }
    blkp = bp->b_un.b_daddr;

    if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
        (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
        ufs_trans_mata_direct(ip, fragsp, blkp, ne);

    if (level)
        for (i = 0; i < ne && *fragsp; ++i, ++blkp)
            ufs_trans_mata_indir(ip, fragsp, *blkp, level - 1);
    brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
    int i;
    daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
    o_mode_t ifmt = ip->i_mode & IFMT;

    if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
        (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
        ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

    if (frags)
        ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

    for (i = 0; i < NIADDR && frags; ++i)
        if (ip->i_ib[i])
            ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
    top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
    struct ufsvfs *ufsvfsp,
    struct inode *ip,
    daddr_t frag,
    ulong_t nb,
    int indir)
{
    struct fs *fs = ufsvfsp->vfs_fs;
    o_mode_t ifmt = ip->i_mode & IFMT;

    if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
        (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
        TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
    daddr_t bn;
    int contig = 0, error;

    ASSERT(ip);
    ASSERT(RW_WRITE_HELD(&ip->i_contents));
    error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
    if (error || (bn == UFS_HOLE)) {
        cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
            " number error = %d bn = %d\n", error, (int)bn);
        if (error == 0)	/* treat UFS_HOLE as an I/O error */
            error = EIO;
        return (error);
    }
    TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
    return (error);
}

/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
    /*
     * Lock the quota subsystem (ufsvfsp can be NULL
     * if the DQ_ERROR is set).
     */
    if (ufsvfsp)
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    mutex_enter(&dqp->dq_lock);

    /*
     * If this transaction has been cancelled by closedq_scan_inode(),
     * then bail out now.  We don't call dqput() in this case because
     * it has already been done.
     */
    if ((dqp->dq_flags & DQ_TRANS) == 0) {
        mutex_exit(&dqp->dq_lock);
        if (ufsvfsp)
            rw_exit(&ufsvfsp->vfs_dqrwlock);
        return (0);
    }

    if (dqp->dq_flags & DQ_ERROR) {
        /*
         * Paranoia to make sure that there is at least one
         * reference to the dquot struct.  We are done with
         * the dquot (due to an error) so clear logging
         * specific markers.
         */
        ASSERT(dqp->dq_cnt >= 1);
        dqp->dq_flags &= ~DQ_TRANS;
        dqput(dqp);
        mutex_exit(&dqp->dq_lock);
        if (ufsvfsp)
            rw_exit(&ufsvfsp->vfs_dqrwlock);
        return (1);
    }

    if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
        ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
        TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
            dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
        /*
         * Paranoia to make sure that there is at least one
         * reference to the dquot struct.  Clear the
         * modification flag because the operation is now in
         * the log.  Also clear the logging specific markers
         * that were set in ufs_trans_quota().
         */
        ASSERT(dqp->dq_cnt >= 1);
        dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
        dqput(dqp);
    }

    /*
     * At this point, the logging specific flag should be clear,
     * but add paranoia just in case something has gone wrong.
     */
    ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
    mutex_exit(&dqp->dq_lock);
    if (ufsvfsp)
        rw_exit(&ufsvfsp->vfs_dqrwlock);
    return (0);
}

/*
 * ufs_trans_quota takes in a uid, allocates the disk space, places the
 * quota record into the metamap, then declares the delta.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
    struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;

    ASSERT(qip);
    ASSERT(MUTEX_HELD(&dqp->dq_lock));
    ASSERT(dqp->dq_flags & DQ_MOD);
    ASSERT(dqp->dq_mof != 0);
    ASSERT(dqp->dq_mof != UFS_HOLE);

    /*
     * Mark this dquot to indicate that we are starting a logging
     * file system operation for this dquot.  Also increment the
     * reference count so that the dquot does not get reused while
     * it is on the mapentry_t list.  DQ_TRANS is cleared and the
     * reference count is decremented by ufs_trans_push_quota.
     *
     * If the file system is force-unmounted while there is a
     * pending quota transaction, then closedq_scan_inode() will
     * clear the DQ_TRANS flag and decrement the reference count.
     *
     * Since deltamap_add() drops multiple transactions to the
     * same dq_mof and ufs_trans_push_quota() won't get called,
     * we use DQ_TRANS to prevent repeat transactions from
     * incrementing the reference count (or calling TRANS_DELTA()).
     */
    if ((dqp->dq_flags & DQ_TRANS) == 0) {
        dqp->dq_flags |= DQ_TRANS;
        dqp->dq_cnt++;
        TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
            DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
    }
}

void
ufs_trans_dqrele(struct dquot *dqp)
{
    struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;

    curthread->t_flag |= T_DONTBLOCK;
    TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
    rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    dqrele(dqp);
    rw_exit(&ufsvfsp->vfs_dqrwlock);
    TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
    curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
    long ncg, last2blk;
    long niblk = 0;
    u_offset_t writeend, offblk;
    int resv;
    daddr_t nblk, maxfblk;
    long avgbfree;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;
    long fni = NINDIR(fs);
    int bsize = fs->fs_bsize;

    /*
     * Assume that the request will fit in 1 or 2 cg's,
     * resv is the amount of log space to reserve (in bytes).
     */
    resv = SIZECG(ip) * 2 + INODESIZE + 1024;

    /*
     * get max position of write in fs blocks
     */
    writeend = offset + resid;
    maxfblk = lblkno(fs, writeend);
    offblk = lblkno(fs, offset);
    /*
     * request size in fs blocks
     */
    nblk = lblkno(fs, blkroundup(fs, resid));
    /*
     * Adjust for sparse files
     */
    if (trunc)
        nblk = MIN(nblk, ip->i_blocks);

    /*
     * Adjust avgbfree (for testing)
     */
    avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

    /*
     * Calculate maximum number of blocks of triple indirect
     * pointers to write.
     */
    last2blk = NDADDR + fni + fni * fni;
    if (maxfblk > last2blk) {
        long nl2ptr;
        long n3blk;

        if (offblk > last2blk)
            n3blk = maxfblk - offblk;
        else
            n3blk = maxfblk - last2blk;
        niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
        nl2ptr = roundup(niblk, fni) / fni + 1;
        niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
        maxfblk -= n3blk;
    }
    /*
     * calculate maximum number of blocks of double indirect
     * pointers to write.
     */
    if (maxfblk > NDADDR + fni) {
        long n2blk;

        if (offblk > NDADDR + fni)
            n2blk = maxfblk - offblk;
        else
            n2blk = maxfblk - NDADDR + fni;
        niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
        maxfblk -= n2blk;
    }
    /*
     * Add in indirect pointer block write
     */
    if (maxfblk > NDADDR) {
        niblk += 1;
    }
    /*
     * Calculate deltas for indirect pointer writes
     */
    resv += niblk * (fs->fs_bsize + sizeof (struct delta));
    /*
     * maximum number of cg's needed for request
     */
    ncg = nblk / avgbfree;
    if (ncg > fs->fs_ncg)
        ncg = fs->fs_ncg;

    /*
     * maximum amount of log space needed for request
     */
    if (ncg > 2)
        resv += (ncg - 2) * SIZECG(ip);

    return (resv);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
    struct inode *ip,
    u_offset_t length,
    int *resvp,
    u_offset_t *residp)
{
    ulong_t resv;
    u_offset_t size, offset, resid;
    int nchunks, incr;
    int is_sparse = 0;

    /*
     * *resvp is the amount of log space to reserve (in bytes).
     * when nonzero, *residp is the number of bytes to truncate.
     */
    *residp = 0;

    if (length < ip->i_size) {
        size = ip->i_size - length;
    } else {
        resv = SIZECG(ip) * 2 + INODESIZE + 1024;
        /*
         * truncate up, doesn't really use much space,
         * the default above should be sufficient.
         */
        goto done;
    }

    /*
     * There is no need to split sparse file truncation into
     * as many chunks as that of regular files.
     */
    is_sparse = bmap_has_holes(ip);

    offset = length;
    resid = size;
    nchunks = 1;
    incr = 0;

    do {
        resv = ufs_log_amt(ip, offset, resid, 1);
        /*
         * If this is the first iteration, set "incr".
         */
        if (!incr) {
            /*
             * If this request takes too much log space,
             * it will be split into "nchunks". If this split
             * is not enough, linearly increment the nchunks in
             * the next iteration.
             */
            if (resv > ufs_trans_max_resv && !is_sparse) {
                nchunks = MAX(size / ufs_trans_max_resv, 1);
                incr = nchunks;
            } else {
                incr = 1;
            }
        } else
            nchunks += incr;
        resid = size / nchunks;
        offset = length + (nchunks - 1) * resid;
    } while (resv > ufs_trans_max_resv);

    if (nchunks > 1) {
        *residp = resid;
    }
done:
    *resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
    int err, issync, resv;
    u_offset_t resid;
    int do_block = 0;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;

    /*
     * Not logging; just do the trunc
     */
    if (!TRANS_ISTRANS(ufsvfsp)) {
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);
        err = ufs_itrunc(ip, length, flags, cr);
        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);
        return (err);
    }

    /*
     * within the lockfs protocol but *not* part of a transaction
     */
    do_block = curthread->t_flag & T_DONTBLOCK;
    curthread->t_flag |= T_DONTBLOCK;

    /*
     * Trunc the file (in pieces, if necessary)
     */
again:
    ufs_trans_trunc_resv(ip, length, &resv, &resid);
    TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
    rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    rw_enter(&ip->i_contents, RW_WRITER);
    if (resid) {
        /*
         * resid is only set if we have to truncate in chunks
         */
        ASSERT(length + resid < ip->i_size);

        /*
         * Partially trunc file down to desired size (length).
         * Only retain I_FREE on the last partial trunc.
         * Round up size to a block boundary, to ensure the truncate
         * doesn't have to allocate blocks. This is done both for
         * performance and to fix a bug where if the block can't be
         * allocated then the inode delete fails, but the inode
         * is still freed with attached blocks and non-zero size
         * (bug 4348738).
         */
        err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
            flags & ~I_FREE, cr);
        ASSERT(ip->i_size != length);
    } else
        err = ufs_itrunc(ip, length, flags, cr);
    if (!do_block)
        curthread->t_flag &= ~T_DONTBLOCK;
    rw_exit(&ip->i_contents);
    rw_exit(&ufsvfsp->vfs_dqrwlock);
    TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

    if ((err == 0) && resid) {
        ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
        goto again;
    }
    return (err);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
    struct inode *ip,
    struct uio *uio,
    int *resvp,
    int *residp)
{
    ulong_t resv;
    offset_t offset;
    ssize_t resid;
    int nchunks;

    *residp = 0;
    offset = uio->uio_offset;
    resid = MIN(uio->uio_resid, ufs_trans_max_resid);
    resv = ufs_log_amt(ip, offset, resid, 0);
    if (resv <= ufs_trans_max_resv) {
        uio_prefaultpages(resid, uio);
        if (resid != uio->uio_resid)
            *residp = resid;
        *resvp = resv;
        return;
    }

    resid = uio->uio_resid;
    nchunks = 1;
    for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
        offset = uio->uio_offset + (nchunks - 1) * resid) {
        nchunks++;
        resid = uio->uio_resid / nchunks;
    }
    uio_prefaultpages(resid, uio);
    /*
     * If this request takes too much log space, it will be split
     */
    if (nchunks > 1)
        *residp = resid;
    *resvp = resv;
}

/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
    struct inode *ip,
    struct uio *uio,
    int ioflag,
    cred_t *cr,
    int resv,
    long resid)
{
    long realresid;
    int err;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

    /*
     * since the write is too big and would "HOG THE LOG" it needs to
     * be broken up and done in pieces.  NOTE, the caller will
     * issue the EOT after the request has been completed
     */
    realresid = uio->uio_resid;

again:
    /*
     * Perform partial request (uiomove will update uio for us)
     * Request is split up into "resid" size chunks until
     * "realresid" bytes have been transferred.
     */
    uio->uio_resid = MIN(resid, realresid);
    realresid -= uio->uio_resid;
    err = wrip(ip, uio, ioflag, cr);

    /*
     * Error or request is done; caller issues final EOT
     */
    if (err || uio->uio_resid || (realresid == 0)) {
        uio->uio_resid += realresid;
        return (err);
    }

    /*
     * Generate EOT for this part of the request
     */
    rw_exit(&ip->i_contents);
    rw_exit(&ufsvfsp->vfs_dqrwlock);
    if (ioflag & (FSYNC|FDSYNC)) {
        TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
    } else {
        TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
    }

    /*
     * Make sure the input buffer is resident before starting
     * the next transaction.
     */
    uio_prefaultpages(MIN(resid, realresid), uio);

    /*
     * Generate BOT for next part of the request
     */
    if (ioflag & (FSYNC|FDSYNC)) {
        int error;

        TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
        ASSERT(!error);
    } else {
        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
    }
    rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    rw_enter(&ip->i_contents, RW_WRITER);
    /*
     * Error during EOT (probably device error while writing commit rec)
     */
    if (err)
        return (err);
    goto again;
}