/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;

/*
 * hlock any file systems w/errored logs
 */
int
ufs_trans_hlock()
{
	struct ufsvfs *ufsvfsp;
	struct lockfs lockfs;
	int error;
	int retry = 0;

	/*
	 * find fs's that paniced or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that paniced or have errored logging devices
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after a while; another app currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}
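
/*
 * ufs_trans_onerror() is called when the log encounters an error; it
 * wakes the thread servicing the ufs_hlock queue, which in turn calls
 * ufs_trans_hlock() above.  A non-zero return from ufs_trans_hlock()
 * means some errored file system could not be hard-locked this pass
 * (for example, another lockfs was in progress), so that thread is
 * expected to try again later.
 */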

/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		sbupdate(vfsp);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		sbupdate(vfsp);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
	struct ufsvfs *ufsvfsp;

	if (curthread->t_flag & T_DONTBLOCK) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		return;
	} else {
		ufsvfsp = ip->i_ufsvfs;

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}
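
/*
 * The three wrappers above share a pattern: if T_DONTBLOCK is already
 * set, the caller is inside a transaction and the update is simply
 * performed; otherwise the update is wrapped in its own async
 * transaction (unless the system is panicking with logging enabled,
 * in which case nothing is written).
 */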

/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
	struct fs *fs;

	fs = ufsvfsp->vfs_fs;
	mutex_enter(&ufsvfsp->vfs_lock);
	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
	mutex_exit(&ufsvfsp->vfs_lock);
	return (0);
}

/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf *bp;

	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later.  We had to allocate at least a 1 byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}

/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
	int error;
	struct inode *ip;

	/*
	 * Grab the quota lock (if the file system has not been forcibly
	 * unmounted).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (error)
		return (ENOENT);

	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, 1);
		rw_exit(&ip->i_contents);
		VN_RELE(ITOV(ip));
		return (0);
	}
	VN_RELE(ITOV(ip));
	return (ENOENT);
}

#ifdef DEBUG
/*
 * These routines maintain the metadata map (matamap)
 */

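/*
 * The matamap records the regions of the device that are expected to
 * hold metadata: the superblock, cylinder groups and inode blocks, plus
 * the blocks of directories, shadow/attrdir inodes, the quota file, and
 * indirect blocks.  It is loaded at mount, extended as metadata is
 * allocated, trimmed when blocks that may have been metadata are freed,
 * and cleared at unmount, giving DEBUG kernels a way to cross-check
 * that deltas refer to known metadata.
 */
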
/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * load the metadata map
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ufs_trans_mata_iget(ip);
	rw_exit(&ip->i_contents);
	return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	ino_t ino;
	int i;

	/*
	 * put static metadata into matamap
	 *	superblock
	 *	cylinder groups
	 *	inode groups
	 *	existing inodes
	 */
	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, itod(fs, ino))),
		    fs->fs_ipg * sizeof (struct dinode));
	}
	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int i;
	daddr_t frag;
	ulong_t nb;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
	brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int i;
	daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
	struct ufsvfs *ufsvfsp,
	struct inode *ip,
	daddr_t frag,
	ulong_t nb,
	int indir)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
	daddr_t bn;
	int contig = 0, error;

	ASSERT(ip);
	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
	if (error || (bn == UFS_HOLE)) {
		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
		    " number error = %d bn = %d\n", error, (int)bn);
		if (error == 0)	/* treat UFS_HOLE as an I/O error */
			error = EIO;
		return (error);
	}
	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
	return (error);
}

/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if the DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now.  We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  Clear the
		 * modification flag because the operation is now in
		 * the log.  Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}

/*
 * ufs_trans_quota takes in a uid, allocates the disk space, placing the
 * quota record into the metamap, then declares the delta.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
	struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot.  Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota.
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}
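
/*
 * ufs_trans_dqrele() below wraps dqrele() in its own async transaction,
 * since dropping the last reference to a modified dquot can push the
 * quota record to disk.
 */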

void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long ncg, last2blk;
	long niblk = 0;
	u_offset_t writeend, offblk;
	int resv;
	daddr_t nblk, maxfblk;
	long avgbfree;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	long fni = NINDIR(fs);
	int bsize = fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's,
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.
	 */
	last2blk = NDADDR + fni + fni * fni;
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			n2blk = maxfblk - NDADDR + fni;
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}

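/*
 * In other words, the reservation for a request is roughly:
 *
 *	two cg's + an inode + 1K of slack
 *	+ one (file system block + struct delta) per indirect block
 *	  that may be written
 *	+ one additional cg for every cg beyond the first two that the
 *	  request's blocks may land in (nblk / avgbfree, capped at fs_ncg)
 */
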
/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t resv;
	u_offset_t size, offset, resid;
	int nchunks;

	/*
	 * *resvp is the amount of log space to reserve (in bytes).
	 * when nonzero, *residp is the number of bytes to truncate.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncate up, doesn't really use much space,
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		nchunks++;
		resid = size / nchunks;
	}
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int err, issync, resv;
	u_offset_t resid;
	int do_block = 0;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks. This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	if ((err == 0) && resid) {
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}

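/*
 * Thus, when ufs_trans_trunc_resv() asks for chunking, ufs_trans_itrunc()
 * works from the current EOF back toward the target length in
 * block-aligned pieces, each under its own transaction, and honors
 * I_FREE only on the final piece.
 */
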
/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error terminates the touching, as this is only a best-effort
 * attempt to get the pages resident.
 */
static void
ufs_trans_touch(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;
	uint8_t tmp;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			switch (uio->uio_segflg) {
			case UIO_USERSPACE:
			case UIO_USERISPACE:
				if (fuword8(p, &tmp))
					return;
				break;
			case UIO_SYSSPACE:
				if (kcopy(p, &tmp, 1))
					return;
				break;
			}
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
		case UIO_USERISPACE:
			if (fuword8(p, &tmp))
				return;
			break;
		case UIO_SYSSPACE:
			if (kcopy(p, &tmp, 1))
				return;
			break;
		}
		iov++;
	}
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t resv;
	offset_t offset;
	ssize_t resid;
	int nchunks;

	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		ufs_trans_touch(resid, uio);
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	ufs_trans_touch(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}

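/*
 * The caller uses the reservation from ufs_trans_write_resv() to open
 * the transaction and then calls ufs_trans_write().  A non-zero resid
 * means the request is written in resid-sized pieces; between pieces,
 * ufs_trans_write() ends the transaction, touches the next piece of the
 * buffer, and begins a new transaction with the same reservation.
 */
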
/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long realresid;
	int err;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	/*
	 * since the write is too big and would "HOG THE LOG" it needs to
	 * be broken up and done in pieces.  NOTE, the caller will
	 * issue the EOT after the request has been completed
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 * Request is split up into "resid" size chunks until
	 * "realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	ufs_trans_touch(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;

		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}