/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;

/*
 * hlock any file systems w/errored logs
 */
int
ufs_trans_hlock()
{
	struct ufsvfs *ufsvfsp;
	struct lockfs lockfs;
	int error;
	int retry = 0;

	/*
	 * find fs's that panicked or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that panicked or have errored logging devices
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after a while; another app is currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}

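/*
 * ufs_trans_onerror() below is the logging error hook: it forces the
 * hlock queue over its low-water mark and broadcasts on ufs_hlock.uq_cv
 * so that the thread sleeping on that queue (assumed here to be the ufs
 * hlock thread, which in turn calls ufs_trans_hlock() above) wakes up
 * and hard-locks the errored file systems.
 */
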
/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		sbupdate(vfsp);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		sbupdate(vfsp);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
	struct ufsvfs *ufsvfsp;

	if (curthread->t_flag & T_DONTBLOCK) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		return;
	} else {
		ufsvfsp = ip->i_ufsvfs;

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
	struct fs *fs;

	fs = ufsvfsp->vfs_fs;
	mutex_enter(&ufsvfsp->vfs_lock);
	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
	mutex_exit(&ufsvfsp->vfs_lock);
	return (0);
}

/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf *bp;

	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later.  We had to allocate at least a 1 byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}

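/*
 * Push an inode delta: look the inode up and, if it still carries any
 * of the modification flags, write it to disk.  Returning ENOENT tells
 * the logging code that there is nothing left to push and the delta
 * can simply be discarded.  (This reading of the return convention is
 * inferred from the push callbacks in this file, not spelled out by
 * the logging code here.)
 */
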
/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
	int error;
	struct inode *ip;

	/*
	 * Grab the quota lock (if the file system has not been forcibly
	 * unmounted).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (error)
		return (ENOENT);

	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, 1);
		rw_exit(&ip->i_contents);
		VN_RELE(ITOV(ip));
		return (0);
	}
	VN_RELE(ITOV(ip));
	return (ENOENT);
}

#ifdef DEBUG
/*
 * These routines maintain the metadata map (matamap)
 */

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * load the metadata map
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ufs_trans_mata_iget(ip);
	rw_exit(&ip->i_contents);
	return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	ino_t ino;
	int i;

	/*
	 * put static metadata into matamap
	 *	superblock
	 *	cylinder groups
	 *	inode groups
	 *	existing inodes
	 */
	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, itod(fs, ino))),
		    fs->fs_ipg * sizeof (struct dinode));
	}
	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int i;
	daddr_t frag;
	ulong_t nb;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}

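/*
 * ufs_trans_mata_direct() above walks a block-pointer array while
 * *fragsp counts down the fragments still attributed to the inode:
 * each nonzero pointer consumes a full block (fs_frag fragments) until
 * fewer than fs_frag remain, at which point the tail is charged at
 * fragment granularity.  ufs_trans_mata_indir() below recurses the
 * same way through the indirect-block tree, one level at a time.
 */
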
/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level - 1);
	brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int i;
	daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
	struct ufsvfs *ufsvfsp,
	struct inode *ip,
	daddr_t frag,
	ulong_t nb,
	int indir)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
	daddr_t bn;
	int contig = 0, error;

	ASSERT(ip);
	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
	if (error || (bn == UFS_HOLE)) {
		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
		    " number error = %d bn = %d\n", error, (int)bn);
		if (error == 0)	/* treat UFS_HOLE as an I/O error */
			error = EIO;
		return (error);
	}
	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
	return (error);
}

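/*
 * Push a quota delta.  The contract, as inferred from the code below
 * and from ufs_trans_quota(): a modified dquot is wired into the log
 * with DQ_TRANS set and an extra reference held; this callback logs
 * the dqblk contents, then clears DQ_TRANS and releases the extra
 * reference.  The DQ_ERROR case drops the same state without logging
 * and returns nonzero.
 */
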
/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if the DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now.  We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  Clear the
		 * modification flag because the operation is now in
		 * the log.  Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}

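/*
 * Sketch of the dquot logging lifecycle implemented by
 * ufs_trans_quota() below and ufs_trans_push_quota() above
 * (a summary of this file's own code, not an external spec):
 *
 *	ufs_trans_quota(dqp)		DQ_TRANS set, dq_cnt++,
 *					TRANS_DELTA() declared
 *	... log rolls or flushes ...
 *	ufs_trans_push_quota(dqp)	dqblk logged via TRANS_LOG(),
 *					DQ_TRANS cleared, dqput()
 *
 * closedq_scan_inode() can cancel the pending transaction in between,
 * in which case the push finds DQ_TRANS clear and does nothing.
 */
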
/*
 * ufs_trans_quota takes a modified dquot, places the quota record
 * into the metamap, then declares the delta.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
	struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot.  Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota.
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}

void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long ncg, last2blk;
	long niblk = 0;
	u_offset_t writeend, offblk;
	int resv;
	daddr_t nblk, maxfblk;
	long avgbfree;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	long fni = NINDIR(fs);
	int bsize = fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's,
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.
	 */
	last2blk = NDADDR + fni + fni * fni;
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			n2blk = maxfblk - NDADDR + fni;
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}

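/*
 * Rough worked example of the reservation above (illustrative
 * assumptions, not taken from this file): with fs_bsize = 8K, a 64K
 * write at offset 0 covers 8 fs blocks, all in the direct range
 * (maxfblk = 8 <= NDADDR), so no indirect pointer blocks are charged
 * (niblk == 0) and, with a healthy avgbfree, ncg <= 2; the reservation
 * is just the base SIZECG(ip) * 2 + INODESIZE + 1024 bytes.
 */
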
/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t resv;
	u_offset_t size, offset, resid;
	int nchunks;

	/*
	 * *resvp is the amount of log space to reserve (in bytes).
	 * when nonzero, *residp is the number of bytes to truncate.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncate up, doesn't really use much space,
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		nchunks++;
		resid = size / nchunks;
	}
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int err, issync, resv;
	u_offset_t resid;
	int do_block = 0;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks.  This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	if ((err == 0) && resid) {
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}

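/*
 * Note on ufs_trans_itrunc() above: each pass of the "again" loop
 * truncates only down to blkroundup(fs, i_size - resid) inside its
 * own transaction, then refreshes vfs_avgbfree and goes around again,
 * so a huge file is freed in several smaller transactions rather than
 * one reservation that could exceed (or hog) the log.
 */
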
/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error terminates the scan, as this is only a best-effort
 * attempt to get the pages resident.
 */
static void
ufs_trans_touch(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;
	uint8_t tmp;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			switch (uio->uio_segflg) {
			case UIO_USERSPACE:
			case UIO_USERISPACE:
				if (fuword8(p, &tmp))
					return;
				break;
			case UIO_SYSSPACE:
				if (kcopy(p, &tmp, 1))
					return;
				break;
			}
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
		case UIO_USERISPACE:
			if (fuword8(p, &tmp))
				return;
			break;
		case UIO_SYSSPACE:
			if (kcopy(p, &tmp, 1))
				return;
			break;
		}
		iov++;
	}
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t resv;
	offset_t offset;
	ssize_t resid;
	int nchunks;

	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		ufs_trans_touch(resid, uio);
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	ufs_trans_touch(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}

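/*
 * ufs_trans_write() below assumes the caller has already opened a
 * TOP_WRITE transaction and holds vfs_dqrwlock and i_contents; between
 * chunks it drops those locks, ends the transaction (EOT), re-touches
 * the next chunk's pages, and opens a fresh transaction (BOT), so no
 * single transaction ever covers more than "resid" bytes of the write.
 */
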
/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long realresid;
	int err;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	/*
	 * since the write is too big and would "HOG THE LOG" it needs to
	 * be broken up and done in pieces.  NOTE, the caller will
	 * issue the EOT after the request has been completed
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 * Request is split up into "resid" size chunks until
	 * "realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	ufs_trans_touch(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;
		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}
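
/*
 * Minimal usage sketch for the chunked-write path (an assumed caller,
 * modeled on the protocol the routines above expect; not the actual
 * ufs_write() source):
 *
 *	int resv, resid;
 *	ufs_trans_write_resv(ip, uio, &resv, &resid);
 *	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 *	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 *	rw_enter(&ip->i_contents, RW_WRITER);
 *	err = ufs_trans_write(ip, uio, ioflag, cr, resv, resid);
 *	rw_exit(&ip->i_contents);
 *	rw_exit(&ufsvfsp->vfs_dqrwlock);
 *	TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 */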