/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;

/*
 * hlock any file systems w/errored logs
 */
int
ufs_trans_hlock()
{
	struct ufsvfs *ufsvfsp;
	struct lockfs lockfs;
	int error;
	int retry = 0;

	/*
	 * find fs's that paniced or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that paniced or have errored logging devices
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after awhile; another app currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}

/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		sbupdate(vfsp);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		sbupdate(vfsp);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
	struct ufsvfs *ufsvfsp;

	if (curthread->t_flag & T_DONTBLOCK) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		return;
	} else {
		ufsvfsp = ip->i_ufsvfs;

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
	struct fs *fs;

	fs = ufsvfsp->vfs_fs;
	mutex_enter(&ufsvfsp->vfs_lock);
	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
	mutex_exit(&ufsvfsp->vfs_lock);
	return (0);
}

/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf *bp;

	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later.  We had to allocate at least a 1 byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}

/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
	int error;
	struct inode *ip;

	/*
	 * Grab the quota lock (if the file system has not been forcibly
	 * unmounted).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (error)
		return (ENOENT);

	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, 1);
		rw_exit(&ip->i_contents);
		VN_RELE(ITOV(ip));
		return (0);
	}
	VN_RELE(ITOV(ip));
	return (ENOENT);
}

#ifdef DEBUG
/*
 * These routines maintain the metadata map (matamap)
 */

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * load the metadata map
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ufs_trans_mata_iget(ip);
	rw_exit(&ip->i_contents);
	return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	ino_t ino;
	int i;

	/*
	 * put static metadata into matamap
	 *	superblock
	 *	cylinder groups
	 *	inode groups
	 *	existing inodes
	 */
	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, itod(fs, ino))),
		    fs->fs_ipg * sizeof (struct dinode));
	}
	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int i;
	daddr_t frag;
	ulong_t nb;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
	brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int i;
	daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);

}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
	struct ufsvfs *ufsvfsp,
	struct inode *ip,
	daddr_t frag,
	ulong_t nb,
	int indir)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
	daddr_t	bn;
	int contig = 0, error;

	ASSERT(ip);
	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
	if (error || (bn == UFS_HOLE)) {
		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
		    " number error = %d bn = %d\n", error, (int)bn);
		if (error == 0)	/* treat UFS_HOLE as an I/O error */
			error = EIO;
		return (error);
	}
	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
	return (error);
}

/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if the DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now.  We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  Clear the
		 * modification flag because the operation is now in
		 * the log.  Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}

/*
 * ufs_trans_quota takes in a uid, allocates the disk space, placing the
 * quota record into the metamap, then declares the delta.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{

	struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot.  Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota.
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}

void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long ncg, last2blk;
	long niblk = 0;
	u_offset_t writeend, offblk;
	int resv;
	daddr_t nblk, maxfblk;
	long avgbfree;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	long fni = NINDIR(fs);
	int bsize = fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's,
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.
	 */
	last2blk = NDADDR + fni + fni * fni;
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			n2blk = maxfblk - NDADDR + fni;
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t resv;
	u_offset_t size, offset, resid;
	int nchunks, flag;

	/*
	 * *resvp is the amount of log space to reserve (in bytes).
	 * when nonzero, *residp is the number of bytes to truncate.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncate up, doesn't really use much space,
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
	flag = 0;

	/*
	 * If this request takes too much log space, it will be split into
	 * "nchunks".  If this split is not enough, linearly increment the
	 * nchunks in the next iteration.
	 */
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		if (!flag) {
			nchunks = roundup(resv, ufs_trans_max_resv) /
			    ufs_trans_max_resv;
			flag = 1;
		} else {
			nchunks++;
		}
		resid = size / nchunks;
	}

	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int err, issync, resv;
	u_offset_t resid;
	int do_block = 0;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks. This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	if ((err == 0) && resid) {
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t resv;
	offset_t offset;
	ssize_t resid;
	int nchunks;

	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		uio_prefaultpages(resid, uio);
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	uio_prefaultpages(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}

/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long realresid;
	int err;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	/*
	 * since the write is too big and would "HOG THE LOG" it needs to
	 * be broken up and done in pieces.  NOTE, the caller will
	 * issue the EOT after the request has been completed
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 * Request is split up into "resid" size chunks until
	 * "realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	uio_prefaultpages(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;
		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}