1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/types.h> 37 #include <sys/t_lock.h> 38 #include <sys/ksynch.h> 39 #include <sys/param.h> 40 #include <sys/time.h> 41 #include <sys/systm.h> 42 #include <sys/sysmacros.h> 43 #include <sys/resource.h> 44 #include <sys/signal.h> 45 #include <sys/cred.h> 46 #include <sys/user.h> 47 #include <sys/buf.h> 48 #include <sys/vfs.h> 49 #include <sys/vnode.h> 50 #include <sys/proc.h> 51 #include <sys/disp.h> 52 #include <sys/file.h> 53 #include <sys/fcntl.h> 54 #include <sys/flock.h> 55 #include <sys/atomic.h> 56 #include <sys/kmem.h> 57 #include <sys/uio.h> 58 #include <sys/dnlc.h> 59 #include <sys/conf.h> 60 #include <sys/mman.h> 61 #include <sys/pathname.h> 62 #include <sys/debug.h> 63 #include <sys/vmsystm.h> 64 #include <sys/cmn_err.h> 65 #include <sys/vtrace.h> 66 #include <sys/filio.h> 67 #include <sys/policy.h> 68 69 #include <sys/fs/ufs_fs.h> 70 #include <sys/fs/ufs_lockfs.h> 71 #include <sys/fs/ufs_filio.h> 72 #include <sys/fs/ufs_inode.h> 73 #include <sys/fs/ufs_fsdir.h> 74 #include <sys/fs/ufs_quota.h> 75 #include <sys/fs/ufs_log.h> 76 #include <sys/fs/ufs_snap.h> 77 #include <sys/fs/ufs_trans.h> 78 #include <sys/fs/ufs_panic.h> 79 #include <sys/fs/ufs_bio.h> 80 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 81 #include <sys/errno.h> 82 #include <sys/fssnap_if.h> 83 #include <sys/unistd.h> 84 #include <sys/sunddi.h> 85 86 #include <sys/filio.h> /* _FIOIO */ 87 88 #include <vm/hat.h> 89 #include <vm/page.h> 90 #include <vm/pvn.h> 91 #include <vm/as.h> 92 #include <vm/seg.h> 93 #include <vm/seg_map.h> 94 #include <vm/seg_vn.h> 95 #include <vm/seg_kmem.h> 96 #include <vm/rm.h> 97 #include <sys/swap.h> 98 99 #include <fs/fs_subr.h> 100 101 static struct instats ins; 102 103 static int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 104 static int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *, 105 caddr_t, struct page **, size_t, enum seg_rw, int); 106 static int ufs_open(struct vnode **, int, struct cred *); 107 static int ufs_close(struct vnode *, int, int, offset_t, struct cred *); 108 static int ufs_read(struct vnode *, struct uio *, int, struct cred *, 109 struct caller_context *); 110 static int ufs_write(struct vnode *, struct uio *, int, struct cred *, 111 struct caller_context *); 112 static int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *); 113 static int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *); 114 static int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *, 115 caller_context_t *); 116 static int ufs_access(struct vnode *, int, int, struct cred *); 117 static int ufs_lookup(struct vnode *, char *, struct vnode **, 118 struct pathname *, int, struct vnode *, struct cred *); 119 static int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl, 120 int, struct vnode **, struct cred *, int); 121 static int ufs_remove(struct vnode *, char *, struct cred *); 122 static int ufs_link(struct vnode *, struct vnode *, char *, struct cred *); 123 static int ufs_rename(struct vnode *, char *, struct vnode *, char *, 124 struct cred *); 125 static int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **, 126 struct cred *); 127 static int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *); 128 static int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *); 129 static int ufs_symlink(struct vnode *, char *, struct vattr *, char *, 130 struct cred *); 131 static int ufs_readlink(struct vnode *, struct uio *, struct cred *); 132 static int ufs_fsync(struct vnode *, int, struct cred *); 133 static void ufs_inactive(struct vnode *, struct cred *); 134 static int ufs_fid(struct vnode *, struct fid *); 135 static int ufs_rwlock(struct vnode *, int, caller_context_t *); 136 static void ufs_rwunlock(struct vnode *, int, caller_context_t *); 137 static int ufs_seek(struct vnode *, offset_t, offset_t *); 138 static int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 139 struct flk_callback *, struct cred *); 140 static int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t, 141 cred_t *, caller_context_t *); 142 static int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *, 143 struct page **, size_t, struct seg *, caddr_t, 144 enum seg_rw, struct cred *); 145 static int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *); 146 static int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *); 147 static int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 148 uchar_t, uchar_t, uint_t, struct cred *); 149 static int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 150 uchar_t, uchar_t, uint_t, struct cred *); 151 static int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 152 uint_t, uint_t, uint_t, struct cred *); 153 static int ufs_poll(vnode_t *, short, int, short *, struct pollhead **); 154 static int ufs_dump(vnode_t *, caddr_t, int, int); 155 static int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *); 156 static int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int, 157 struct cred *); 158 static int ufs_dump(vnode_t *, caddr_t, int, int); 159 static int ufs_dumpctl(vnode_t *, int, int *); 160 static daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *, 161 daddr32_t *, int, int); 162 static int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 163 static int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 164 165 /* 166 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions. 167 * 168 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet. 169 */ 170 struct vnodeops *ufs_vnodeops; 171 172 const fs_operation_def_t ufs_vnodeops_template[] = { 173 VOPNAME_OPEN, ufs_open, /* will not be blocked by lockfs */ 174 VOPNAME_CLOSE, ufs_close, /* will not be blocked by lockfs */ 175 VOPNAME_READ, ufs_read, 176 VOPNAME_WRITE, ufs_write, 177 VOPNAME_IOCTL, ufs_ioctl, 178 VOPNAME_GETATTR, ufs_getattr, 179 VOPNAME_SETATTR, ufs_setattr, 180 VOPNAME_ACCESS, ufs_access, 181 VOPNAME_LOOKUP, ufs_lookup, 182 VOPNAME_CREATE, ufs_create, 183 VOPNAME_REMOVE, ufs_remove, 184 VOPNAME_LINK, ufs_link, 185 VOPNAME_RENAME, ufs_rename, 186 VOPNAME_MKDIR, ufs_mkdir, 187 VOPNAME_RMDIR, ufs_rmdir, 188 VOPNAME_READDIR, ufs_readdir, 189 VOPNAME_SYMLINK, ufs_symlink, 190 VOPNAME_READLINK, ufs_readlink, 191 VOPNAME_FSYNC, ufs_fsync, 192 VOPNAME_INACTIVE, (fs_generic_func_p) ufs_inactive, /* not blocked */ 193 VOPNAME_FID, ufs_fid, 194 VOPNAME_RWLOCK, ufs_rwlock, /* not blocked */ 195 VOPNAME_RWUNLOCK, (fs_generic_func_p) ufs_rwunlock, /* not blocked */ 196 VOPNAME_SEEK, ufs_seek, 197 VOPNAME_FRLOCK, ufs_frlock, 198 VOPNAME_SPACE, ufs_space, 199 VOPNAME_GETPAGE, ufs_getpage, 200 VOPNAME_PUTPAGE, ufs_putpage, 201 VOPNAME_MAP, (fs_generic_func_p) ufs_map, 202 VOPNAME_ADDMAP, (fs_generic_func_p) ufs_addmap, /* not blocked */ 203 VOPNAME_DELMAP, ufs_delmap, /* will not be blocked by lockfs */ 204 VOPNAME_POLL, (fs_generic_func_p) ufs_poll, /* not blocked */ 205 VOPNAME_DUMP, ufs_dump, 206 VOPNAME_PATHCONF, ufs_l_pathconf, 207 VOPNAME_PAGEIO, ufs_pageio, 208 VOPNAME_DUMPCTL, ufs_dumpctl, 209 VOPNAME_GETSECATTR, ufs_getsecattr, 210 VOPNAME_SETSECATTR, ufs_setsecattr, 211 VOPNAME_VNEVENT, fs_vnevent_support, 212 NULL, NULL 213 }; 214 215 #define MAX_BACKFILE_COUNT 9999 216 217 /* 218 * Created by ufs_dumpctl() to store a file's disk block info into memory. 219 * Used by ufs_dump() to dump data to disk directly. 220 */ 221 struct dump { 222 struct inode *ip; /* the file we contain */ 223 daddr_t fsbs; /* number of blocks stored */ 224 struct timeval32 time; /* time stamp for the struct */ 225 daddr32_t dblk[1]; /* place holder for block info */ 226 }; 227 228 static struct dump *dump_info = NULL; 229 230 /* 231 * Previously there was no special action required for ordinary files. 232 * (Devices are handled through the device file system.) 233 * Now we support Large Files and Large File API requires open to 234 * fail if file is large. 235 * We could take care to prevent data corruption 236 * by doing an atomic check of size and truncate if file is opened with 237 * FTRUNC flag set but traditionally this is being done by the vfs/vnode 238 * layers. So taking care of truncation here is a change in the existing 239 * semantics of VOP_OPEN and therefore we chose not to implement any thing 240 * here. The check for the size of the file > 2GB is being done at the 241 * vfs layer in routine vn_open(). 242 */ 243 244 /* ARGSUSED */ 245 static int 246 ufs_open(struct vnode **vpp, int flag, struct cred *cr) 247 { 248 TRACE_1(TR_FAC_UFS, TR_UFS_OPEN, "ufs_open:vpp %p", vpp); 249 return (0); 250 } 251 252 /*ARGSUSED*/ 253 static int 254 ufs_close(struct vnode *vp, int flag, int count, offset_t offset, 255 struct cred *cr) 256 { 257 TRACE_1(TR_FAC_UFS, TR_UFS_CLOSE, "ufs_close:vp %p", vp); 258 259 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 260 cleanshares(vp, ttoproc(curthread)->p_pid); 261 262 /* 263 * Push partially filled cluster at last close. 264 * ``last close'' is approximated because the dnlc 265 * may have a hold on the vnode. 266 * Checking for VBAD here will also act as a forced umount check. 267 */ 268 if (vp->v_count <= 2 && vp->v_type != VBAD) { 269 struct inode *ip = VTOI(vp); 270 if (ip->i_delaylen) { 271 ins.in_poc.value.ul++; 272 (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen, 273 B_ASYNC | B_FREE, cr); 274 ip->i_delaylen = 0; 275 } 276 } 277 278 return (0); 279 } 280 281 /*ARGSUSED*/ 282 static int 283 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 284 struct caller_context *ct) 285 { 286 struct inode *ip = VTOI(vp); 287 struct ufsvfs *ufsvfsp; 288 struct ulockfs *ulp = NULL; 289 int error = 0; 290 int intrans = 0; 291 292 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 293 TRACE_3(TR_FAC_UFS, TR_UFS_READ_START, 294 "ufs_read_start:vp %p uiop %p ioflag %x", 295 vp, uiop, ioflag); 296 297 /* 298 * Mandatory locking needs to be done before ufs_lockfs_begin() 299 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep. 300 */ 301 if (MANDLOCK(vp, ip->i_mode)) { 302 /* 303 * ufs_getattr ends up being called by chklock 304 */ 305 error = chklock(vp, FREAD, uiop->uio_loffset, 306 uiop->uio_resid, uiop->uio_fmode, ct); 307 if (error) 308 goto out; 309 } 310 311 ufsvfsp = ip->i_ufsvfs; 312 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK); 313 if (error) 314 goto out; 315 316 /* 317 * In the case that a directory is opened for reading as a file 318 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set. 319 * The locking order had to be changed to avoid a deadlock with 320 * an update taking place on that directory at the same time. 321 */ 322 if ((ip->i_mode & IFMT) == IFDIR) { 323 324 rw_enter(&ip->i_contents, RW_READER); 325 error = rdip(ip, uiop, ioflag, cr); 326 rw_exit(&ip->i_contents); 327 328 if (error) { 329 if (ulp) 330 ufs_lockfs_end(ulp); 331 goto out; 332 } 333 334 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 335 TRANS_ISTRANS(ufsvfsp)) { 336 rw_exit(&ip->i_rwlock); 337 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 338 error); 339 ASSERT(!error); 340 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 341 TOP_READ_SIZE); 342 rw_enter(&ip->i_rwlock, RW_READER); 343 } 344 } else { 345 /* 346 * Only transact reads to files opened for sync-read and 347 * sync-write on a file system that is not write locked. 348 * 349 * The ``not write locked'' check prevents problems with 350 * enabling/disabling logging on a busy file system. E.g., 351 * logging exists at the beginning of the read but does not 352 * at the end. 353 * 354 */ 355 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 356 TRANS_ISTRANS(ufsvfsp)) { 357 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 358 error); 359 ASSERT(!error); 360 intrans = 1; 361 } 362 363 rw_enter(&ip->i_contents, RW_READER); 364 error = rdip(ip, uiop, ioflag, cr); 365 rw_exit(&ip->i_contents); 366 367 if (intrans) { 368 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 369 TOP_READ_SIZE); 370 } 371 } 372 373 if (ulp) { 374 ufs_lockfs_end(ulp); 375 } 376 out: 377 378 TRACE_2(TR_FAC_UFS, TR_UFS_READ_END, 379 "ufs_read_end:vp %p error %d", vp, error); 380 return (error); 381 } 382 383 extern int ufs_HW; /* high water mark */ 384 extern int ufs_LW; /* low water mark */ 385 int ufs_WRITES = 1; /* XXX - enable/disable */ 386 int ufs_throttles = 0; /* throttling count */ 387 int ufs_allow_shared_writes = 1; /* directio shared writes */ 388 389 static int 390 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag) 391 { 392 int shared_write; 393 394 /* 395 * If the FDSYNC flag is set then ignore the global 396 * ufs_allow_shared_writes in this case. 397 */ 398 shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes; 399 400 /* 401 * Filter to determine if this request is suitable as a 402 * concurrent rewrite. This write must not allocate blocks 403 * by extending the file or filling in holes. No use trying 404 * through FSYNC descriptors as the inode will be synchronously 405 * updated after the write. The uio structure has not yet been 406 * checked for sanity, so assume nothing. 407 */ 408 return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) && 409 (uiop->uio_loffset >= (offset_t)0) && 410 (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) && 411 ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) && 412 !(ioflag & FSYNC) && !bmap_has_holes(ip) && 413 shared_write); 414 } 415 416 /*ARGSUSED*/ 417 static int 418 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr, 419 caller_context_t *ct) 420 { 421 struct inode *ip = VTOI(vp); 422 struct ufsvfs *ufsvfsp; 423 struct ulockfs *ulp; 424 int retry = 1; 425 int error, resv, resid = 0; 426 int directio_status; 427 int exclusive; 428 int rewriteflg; 429 long start_resid = uiop->uio_resid; 430 431 TRACE_3(TR_FAC_UFS, TR_UFS_WRITE_START, 432 "ufs_write_start:vp %p uiop %p ioflag %x", 433 vp, uiop, ioflag); 434 435 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 436 437 retry_mandlock: 438 /* 439 * Mandatory locking needs to be done before ufs_lockfs_begin() 440 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep. 441 * Check for forced unmounts normally done in ufs_lockfs_begin(). 442 */ 443 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 444 error = EIO; 445 goto out; 446 } 447 if (MANDLOCK(vp, ip->i_mode)) { 448 449 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 450 451 /* 452 * ufs_getattr ends up being called by chklock 453 */ 454 error = chklock(vp, FWRITE, uiop->uio_loffset, 455 uiop->uio_resid, uiop->uio_fmode, ct); 456 if (error) 457 goto out; 458 } 459 460 /* i_rwlock can change in chklock */ 461 exclusive = rw_write_held(&ip->i_rwlock); 462 rewriteflg = ufs_check_rewrite(ip, uiop, ioflag); 463 464 /* 465 * Check for fast-path special case of directio re-writes. 466 */ 467 if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) && 468 !exclusive && rewriteflg) { 469 470 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 471 if (error) 472 goto out; 473 474 rw_enter(&ip->i_contents, RW_READER); 475 error = ufs_directio_write(ip, uiop, ioflag, 1, cr, 476 &directio_status); 477 if (directio_status == DIRECTIO_SUCCESS) { 478 uint_t i_flag_save; 479 480 if (start_resid != uiop->uio_resid) 481 error = 0; 482 /* 483 * Special treatment of access times for re-writes. 484 * If IMOD is not already set, then convert it 485 * to IMODACC for this operation. This defers 486 * entering a delta into the log until the inode 487 * is flushed. This mimics what is done for read 488 * operations and inode access time. 489 */ 490 mutex_enter(&ip->i_tlock); 491 i_flag_save = ip->i_flag; 492 ip->i_flag |= IUPD | ICHG; 493 ip->i_seq++; 494 ITIMES_NOLOCK(ip); 495 if ((i_flag_save & IMOD) == 0) { 496 ip->i_flag &= ~IMOD; 497 ip->i_flag |= IMODACC; 498 } 499 mutex_exit(&ip->i_tlock); 500 rw_exit(&ip->i_contents); 501 if (ulp) 502 ufs_lockfs_end(ulp); 503 goto out; 504 } 505 rw_exit(&ip->i_contents); 506 if (ulp) 507 ufs_lockfs_end(ulp); 508 } 509 510 if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) { 511 rw_exit(&ip->i_rwlock); 512 rw_enter(&ip->i_rwlock, RW_WRITER); 513 /* 514 * Mandatory locking could have been enabled 515 * after dropping the i_rwlock. 516 */ 517 if (MANDLOCK(vp, ip->i_mode)) 518 goto retry_mandlock; 519 } 520 521 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 522 if (error) 523 goto out; 524 525 /* 526 * Amount of log space needed for this write 527 */ 528 if (!rewriteflg || !(ioflag & FDSYNC)) 529 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid); 530 531 /* 532 * Throttle writes. 533 */ 534 if (ufs_WRITES && (ip->i_writes > ufs_HW)) { 535 mutex_enter(&ip->i_tlock); 536 while (ip->i_writes > ufs_HW) { 537 ufs_throttles++; 538 cv_wait(&ip->i_wrcv, &ip->i_tlock); 539 } 540 mutex_exit(&ip->i_tlock); 541 } 542 543 /* 544 * Enter Transaction 545 * 546 * If the write is a rewrite there is no need to open a transaction 547 * if the FDSYNC flag is set and not the FSYNC. In this case just 548 * set the IMODACC flag to modify do the update at a later time 549 * thus avoiding the overhead of the logging transaction that is 550 * not required. 551 */ 552 if (ioflag & (FSYNC|FDSYNC)) { 553 if (ulp) { 554 if (rewriteflg) { 555 uint_t i_flag_save; 556 557 rw_enter(&ip->i_contents, RW_READER); 558 mutex_enter(&ip->i_tlock); 559 i_flag_save = ip->i_flag; 560 ip->i_flag |= IUPD | ICHG; 561 ip->i_seq++; 562 ITIMES_NOLOCK(ip); 563 if ((i_flag_save & IMOD) == 0) { 564 ip->i_flag &= ~IMOD; 565 ip->i_flag |= IMODACC; 566 } 567 mutex_exit(&ip->i_tlock); 568 rw_exit(&ip->i_contents); 569 } else { 570 int terr = 0; 571 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, 572 terr); 573 ASSERT(!terr); 574 } 575 } 576 } else { 577 if (ulp) 578 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv); 579 } 580 581 /* 582 * Write the file 583 */ 584 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 585 rw_enter(&ip->i_contents, RW_WRITER); 586 if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) { 587 /* 588 * In append mode start at end of file. 589 */ 590 uiop->uio_loffset = ip->i_size; 591 } 592 593 /* 594 * Mild optimisation, don't call ufs_trans_write() unless we have to 595 * Also, suppress file system full messages if we will retry. 596 */ 597 if (retry) 598 ip->i_flag |= IQUIET; 599 if (resid) { 600 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid); 601 } else { 602 error = wrip(ip, uiop, ioflag, cr); 603 } 604 ip->i_flag &= ~IQUIET; 605 606 rw_exit(&ip->i_contents); 607 rw_exit(&ufsvfsp->vfs_dqrwlock); 608 609 /* 610 * Leave Transaction 611 */ 612 if (ulp) { 613 if (ioflag & (FSYNC|FDSYNC)) { 614 if (!rewriteflg) { 615 int terr = 0; 616 617 TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC, 618 resv); 619 if (error == 0) 620 error = terr; 621 } 622 } else { 623 TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv); 624 } 625 ufs_lockfs_end(ulp); 626 } 627 out: 628 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 629 /* 630 * Any blocks tied up in pending deletes? 631 */ 632 ufs_delete_drain_wait(ufsvfsp, 1); 633 retry = 0; 634 goto retry_mandlock; 635 } 636 637 if (error == ENOSPC && (start_resid != uiop->uio_resid)) 638 error = 0; 639 640 TRACE_2(TR_FAC_UFS, TR_UFS_WRITE_END, 641 "ufs_write_end:vp %p error %d", vp, error); 642 return (error); 643 } 644 645 /* 646 * Don't cache write blocks to files with the sticky bit set. 647 * Used to keep swap files from blowing the page cache on a server. 648 */ 649 int stickyhack = 1; 650 651 /* 652 * Free behind hacks. The pager is busted. 653 * XXX - need to pass the information down to writedone() in a flag like B_SEQ 654 * or B_FREE_IF_TIGHT_ON_MEMORY. 655 */ 656 int freebehind = 1; 657 int smallfile = 0; 658 u_offset_t smallfile64 = 32 * 1024; 659 660 /* 661 * While we should, in most cases, cache the pages for write, we 662 * may also want to cache the pages for read as long as they are 663 * frequently re-usable. 664 * 665 * If cache_read_ahead = 1, the pages for read will go to the tail 666 * of the cache list when they are released, otherwise go to the head. 667 */ 668 int cache_read_ahead = 0; 669 670 /* 671 * Freebehind exists so that as we read large files sequentially we 672 * don't consume most of memory with pages from a few files. It takes 673 * longer to re-read from disk multiple small files as it does reading 674 * one large one sequentially. As system memory grows customers need 675 * to retain bigger chunks of files in memory. The advent of the 676 * cachelist opens up of the possibility freeing pages to the head or 677 * tail of the list. 678 * 679 * Not freeing a page is a bet that the page will be read again before 680 * it's segmap slot is needed for something else. If we loose the bet, 681 * it means some other thread is burdened with the page free we did 682 * not do. If we win we save a free and reclaim. 683 * 684 * Freeing it at the tail vs the head of cachelist is a bet that the 685 * page will survive until the next read. It's also saying that this 686 * page is more likely to be re-used than a page freed some time ago 687 * and never reclaimed. 688 * 689 * Freebehind maintains a range of file offset [smallfile1; smallfile2] 690 * 691 * 0 < offset < smallfile1 : pages are not freed. 692 * smallfile1 < offset < smallfile2 : pages freed to tail of cachelist. 693 * smallfile2 < offset : pages freed to head of cachelist. 694 * 695 * The range is computed at most once per second and depends on 696 * freemem and ncpus_online. Both parameters are bounded to be 697 * >= smallfile && >= smallfile64. 698 * 699 * smallfile1 = (free memory / ncpu) / 1000 700 * smallfile2 = (free memory / ncpu) / 10 701 * 702 * A few examples values: 703 * 704 * Free Mem (in Bytes) [smallfile1; smallfile2] [smallfile1; smallfile2] 705 * ncpus_online = 4 ncpus_online = 64 706 * ------------------ ----------------------- ----------------------- 707 * 1G [256K; 25M] [32K; 1.5M] 708 * 10G [2.5M; 250M] [156K; 15M] 709 * 100G [25M; 2.5G] [1.5M; 150M] 710 * 711 */ 712 713 #define SMALLFILE1_D 1000 714 #define SMALLFILE2_D 10 715 static u_offset_t smallfile1 = 32 * 1024; 716 static u_offset_t smallfile2 = 32 * 1024; 717 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */ 718 uint_t smallfile1_d = SMALLFILE1_D; 719 uint_t smallfile2_d = SMALLFILE2_D; 720 721 /* 722 * wrip does the real work of write requests for ufs. 723 */ 724 int 725 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr) 726 { 727 rlim64_t limit = uio->uio_llimit; 728 u_offset_t off; 729 u_offset_t old_i_size; 730 struct fs *fs; 731 struct vnode *vp; 732 struct ufsvfs *ufsvfsp; 733 caddr_t base; 734 long start_resid = uio->uio_resid; /* save starting resid */ 735 long premove_resid; /* resid before uiomove() */ 736 uint_t flags; 737 int newpage; 738 int iupdat_flag, directio_status; 739 int n, on, mapon; 740 int error, pagecreate; 741 int do_dqrwlock; /* drop/reacquire vfs_dqrwlock */ 742 int32_t iblocks; 743 int new_iblocks; 744 745 /* 746 * ip->i_size is incremented before the uiomove 747 * is done on a write. If the move fails (bad user 748 * address) reset ip->i_size. 749 * The better way would be to increment ip->i_size 750 * only if the uiomove succeeds. 751 */ 752 int i_size_changed = 0; 753 o_mode_t type; 754 int i_seq_needed = 0; 755 756 vp = ITOV(ip); 757 758 /* 759 * check for forced unmount - should not happen as 760 * the request passed the lockfs checks. 761 */ 762 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 763 return (EIO); 764 765 fs = ip->i_fs; 766 767 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 768 "ufs_wrip_start:vp %p", vp); 769 770 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 771 772 /* check for valid filetype */ 773 type = ip->i_mode & IFMT; 774 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 775 (type != IFLNK) && (type != IFSHAD)) { 776 return (EIO); 777 } 778 779 /* 780 * the actual limit of UFS file size 781 * is UFS_MAXOFFSET_T 782 */ 783 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 784 limit = MAXOFFSET_T; 785 786 if (uio->uio_loffset >= limit) { 787 proc_t *p = ttoproc(curthread); 788 789 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 790 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 791 792 mutex_enter(&p->p_lock); 793 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 794 p, RCA_UNSAFE_SIGINFO); 795 mutex_exit(&p->p_lock); 796 return (EFBIG); 797 } 798 799 /* 800 * if largefiles are disallowed, the limit is 801 * the pre-largefiles value of 2GB 802 */ 803 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 804 limit = MIN(UFS_MAXOFFSET_T, limit); 805 else 806 limit = MIN(MAXOFF32_T, limit); 807 808 if (uio->uio_loffset < (offset_t)0) { 809 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 810 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 811 return (EINVAL); 812 } 813 if (uio->uio_resid == 0) { 814 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 815 "ufs_wrip_end:vp %p error %d", vp, 0); 816 return (0); 817 } 818 819 if (uio->uio_loffset >= limit) 820 return (EFBIG); 821 822 ip->i_flag |= INOACC; /* don't update ref time in getpage */ 823 824 if (ioflag & (FSYNC|FDSYNC)) { 825 ip->i_flag |= ISYNC; 826 iupdat_flag = 1; 827 } 828 /* 829 * Try to go direct 830 */ 831 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 832 uio->uio_llimit = limit; 833 error = ufs_directio_write(ip, uio, ioflag, 0, cr, 834 &directio_status); 835 /* 836 * If ufs_directio wrote to the file or set the flags, 837 * we need to update i_seq, but it may be deferred. 838 */ 839 if (start_resid != uio->uio_resid || 840 (ip->i_flag & (ICHG|IUPD))) { 841 i_seq_needed = 1; 842 ip->i_flag |= ISEQ; 843 } 844 if (directio_status == DIRECTIO_SUCCESS) 845 goto out; 846 } 847 848 /* 849 * Behavior with respect to dropping/reacquiring vfs_dqrwlock: 850 * 851 * o shadow inodes: vfs_dqrwlock is not held at all 852 * o quota updates: vfs_dqrwlock is read or write held 853 * o other updates: vfs_dqrwlock is read held 854 * 855 * The first case is the only one where we do not hold 856 * vfs_dqrwlock at all while entering wrip(). 857 * We must make sure not to downgrade/drop vfs_dqrwlock if we 858 * have it as writer, i.e. if we are updating the quota inode. 859 * There is no potential deadlock scenario in this case as 860 * ufs_getpage() takes care of this and avoids reacquiring 861 * vfs_dqrwlock in that case. 862 * 863 * This check is done here since the above conditions do not change 864 * and we possibly loop below, so save a few cycles. 865 */ 866 if ((type == IFSHAD) || 867 (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) { 868 do_dqrwlock = 0; 869 } else { 870 do_dqrwlock = 1; 871 } 872 873 /* 874 * Large Files: We cast MAXBMASK to offset_t 875 * inorder to mask out the higher bits. Since offset_t 876 * is a signed value, the high order bit set in MAXBMASK 877 * value makes it do the right thing by having all bits 1 878 * in the higher word. May be removed for _SOLARIS64_. 879 */ 880 881 fs = ip->i_fs; 882 do { 883 u_offset_t uoff = uio->uio_loffset; 884 off = uoff & (offset_t)MAXBMASK; 885 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 886 on = (int)blkoff(fs, uoff); 887 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid); 888 new_iblocks = 1; 889 890 if (type == IFREG && uoff + n >= limit) { 891 if (uoff >= limit) { 892 error = EFBIG; 893 goto out; 894 } 895 /* 896 * since uoff + n >= limit, 897 * therefore n >= limit - uoff, and n is an int 898 * so it is safe to cast it to an int 899 */ 900 n = (int)(limit - (rlim64_t)uoff); 901 } 902 if (uoff + n > ip->i_size) { 903 /* 904 * We are extending the length of the file. 905 * bmap is used so that we are sure that 906 * if we need to allocate new blocks, that it 907 * is done here before we up the file size. 908 */ 909 error = bmap_write(ip, uoff, (int)(on + n), 910 mapon == 0, NULL, cr); 911 /* 912 * bmap_write never drops i_contents so if 913 * the flags are set it changed the file. 914 */ 915 if (ip->i_flag & (ICHG|IUPD)) { 916 i_seq_needed = 1; 917 ip->i_flag |= ISEQ; 918 } 919 if (error) 920 break; 921 /* 922 * There is a window of vulnerability here. 923 * The sequence of operations: allocate file 924 * system blocks, uiomove the data into pages, 925 * and then update the size of the file in the 926 * inode, must happen atomically. However, due 927 * to current locking constraints, this can not 928 * be done. 929 */ 930 ASSERT(ip->i_writer == NULL); 931 ip->i_writer = curthread; 932 i_size_changed = 1; 933 /* 934 * If we are writing from the beginning of 935 * the mapping, we can just create the 936 * pages without having to read them. 937 */ 938 pagecreate = (mapon == 0); 939 } else if (n == MAXBSIZE) { 940 /* 941 * Going to do a whole mappings worth, 942 * so we can just create the pages w/o 943 * having to read them in. But before 944 * we do that, we need to make sure any 945 * needed blocks are allocated first. 946 */ 947 iblocks = ip->i_blocks; 948 error = bmap_write(ip, uoff, (int)(on + n), 949 BI_ALLOC_ONLY, NULL, cr); 950 /* 951 * bmap_write never drops i_contents so if 952 * the flags are set it changed the file. 953 */ 954 if (ip->i_flag & (ICHG|IUPD)) { 955 i_seq_needed = 1; 956 ip->i_flag |= ISEQ; 957 } 958 if (error) 959 break; 960 pagecreate = 1; 961 /* 962 * check if the new created page needed the 963 * allocation of new disk blocks. 964 */ 965 if (iblocks == ip->i_blocks) 966 new_iblocks = 0; /* no new blocks allocated */ 967 } else { 968 pagecreate = 0; 969 /* 970 * In sync mode flush the indirect blocks which 971 * may have been allocated and not written on 972 * disk. In above cases bmap_write will allocate 973 * in sync mode. 974 */ 975 if (ioflag & (FSYNC|FDSYNC)) { 976 error = ufs_indirblk_sync(ip, uoff); 977 if (error) 978 break; 979 } 980 } 981 982 /* 983 * At this point we can enter ufs_getpage() in one 984 * of two ways: 985 * 1) segmap_getmapflt() calls ufs_getpage() when the 986 * forcefault parameter is true (pagecreate == 0) 987 * 2) uiomove() causes a page fault. 988 * 989 * We have to drop the contents lock to prevent the VM 990 * system from trying to reaquire it in ufs_getpage() 991 * should the uiomove cause a pagefault. 992 * 993 * We have to drop the reader vfs_dqrwlock here as well. 994 */ 995 rw_exit(&ip->i_contents); 996 if (do_dqrwlock) { 997 ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 998 ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock))); 999 rw_exit(&ufsvfsp->vfs_dqrwlock); 1000 } 1001 1002 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1003 (uint_t)n, !pagecreate, S_WRITE); 1004 1005 /* 1006 * segmap_pagecreate() returns 1 if it calls 1007 * page_create_va() to allocate any pages. 1008 */ 1009 newpage = 0; 1010 1011 if (pagecreate) 1012 newpage = segmap_pagecreate(segkmap, base, 1013 (size_t)n, 0); 1014 1015 premove_resid = uio->uio_resid; 1016 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 1017 1018 /* 1019 * If "newpage" is set, then a new page was created and it 1020 * does not contain valid data, so it needs to be initialized 1021 * at this point. 1022 * Otherwise the page contains old data, which was overwritten 1023 * partially or as a whole in uiomove. 1024 * If there is only one iovec structure within uio, then 1025 * on error uiomove will not be able to update uio->uio_loffset 1026 * and we would zero the whole page here! 1027 * 1028 * If uiomove fails because of an error, the old valid data 1029 * is kept instead of filling the rest of the page with zero's. 1030 */ 1031 if (newpage && 1032 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 1033 /* 1034 * We created pages w/o initializing them completely, 1035 * thus we need to zero the part that wasn't set up. 1036 * This happens on most EOF write cases and if 1037 * we had some sort of error during the uiomove. 1038 */ 1039 int nzero, nmoved; 1040 1041 nmoved = (int)(uio->uio_loffset - (off + mapon)); 1042 ASSERT(nmoved >= 0 && nmoved <= n); 1043 nzero = roundup(on + n, PAGESIZE) - nmoved; 1044 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 1045 (void) kzero(base + mapon + nmoved, (uint_t)nzero); 1046 } 1047 1048 /* 1049 * Unlock the pages allocated by page_create_va() 1050 * in segmap_pagecreate() 1051 */ 1052 if (newpage) 1053 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 1054 1055 /* 1056 * If the size of the file changed, then update the 1057 * size field in the inode now. This can't be done 1058 * before the call to segmap_pageunlock or there is 1059 * a potential deadlock with callers to ufs_putpage(). 1060 * They will be holding i_contents and trying to lock 1061 * a page, while this thread is holding a page locked 1062 * and trying to acquire i_contents. 1063 */ 1064 if (i_size_changed) { 1065 rw_enter(&ip->i_contents, RW_WRITER); 1066 old_i_size = ip->i_size; 1067 UFS_SET_ISIZE(uoff + n, ip); 1068 TRANS_INODE(ufsvfsp, ip); 1069 /* 1070 * file has grown larger than 2GB. Set flag 1071 * in superblock to indicate this, if it 1072 * is not already set. 1073 */ 1074 if ((ip->i_size > MAXOFF32_T) && 1075 !(fs->fs_flags & FSLARGEFILES)) { 1076 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1077 mutex_enter(&ufsvfsp->vfs_lock); 1078 fs->fs_flags |= FSLARGEFILES; 1079 ufs_sbwrite(ufsvfsp); 1080 mutex_exit(&ufsvfsp->vfs_lock); 1081 } 1082 mutex_enter(&ip->i_tlock); 1083 ip->i_writer = NULL; 1084 cv_broadcast(&ip->i_wrcv); 1085 mutex_exit(&ip->i_tlock); 1086 rw_exit(&ip->i_contents); 1087 } 1088 1089 if (error) { 1090 /* 1091 * If we failed on a write, we may have already 1092 * allocated file blocks as well as pages. It's 1093 * hard to undo the block allocation, but we must 1094 * be sure to invalidate any pages that may have 1095 * been allocated. 1096 * 1097 * If the page was created without initialization 1098 * then we must check if it should be possible 1099 * to destroy the new page and to keep the old data 1100 * on the disk. 1101 * 1102 * It is possible to destroy the page without 1103 * having to write back its contents only when 1104 * - the size of the file keeps unchanged 1105 * - bmap_write() did not allocate new disk blocks 1106 * it is possible to create big files using "seek" and 1107 * write to the end of the file. A "write" to a 1108 * position before the end of the file would not 1109 * change the size of the file but it would allocate 1110 * new disk blocks. 1111 * - uiomove intended to overwrite the whole page. 1112 * - a new page was created (newpage == 1). 1113 */ 1114 1115 if (i_size_changed == 0 && new_iblocks == 0 && 1116 newpage) { 1117 1118 /* unwind what uiomove eventually last did */ 1119 uio->uio_resid = premove_resid; 1120 1121 /* 1122 * destroy the page, do not write ambiguous 1123 * data to the disk. 1124 */ 1125 flags = SM_DESTROY; 1126 } else { 1127 /* 1128 * write the page back to the disk, if dirty, 1129 * and remove the page from the cache. 1130 */ 1131 flags = SM_INVAL; 1132 } 1133 (void) segmap_release(segkmap, base, flags); 1134 } else { 1135 flags = 0; 1136 /* 1137 * Force write back for synchronous write cases. 1138 */ 1139 if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) { 1140 /* 1141 * If the sticky bit is set but the 1142 * execute bit is not set, we do a 1143 * synchronous write back and free 1144 * the page when done. We set up swap 1145 * files to be handled this way to 1146 * prevent servers from keeping around 1147 * the client's swap pages too long. 1148 * XXX - there ought to be a better way. 1149 */ 1150 if (IS_SWAPVP(vp)) { 1151 flags = SM_WRITE | SM_FREE | 1152 SM_DONTNEED; 1153 iupdat_flag = 0; 1154 } else { 1155 flags = SM_WRITE; 1156 } 1157 } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 1158 /* 1159 * Have written a whole block. 1160 * Start an asynchronous write and 1161 * mark the buffer to indicate that 1162 * it won't be needed again soon. 1163 */ 1164 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 1165 } 1166 error = segmap_release(segkmap, base, flags); 1167 /* 1168 * If the operation failed and is synchronous, 1169 * then we need to unwind what uiomove() last 1170 * did so we can potentially return an error to 1171 * the caller. If this write operation was 1172 * done in two pieces and the first succeeded, 1173 * then we won't return an error for the second 1174 * piece that failed. However, we only want to 1175 * return a resid value that reflects what was 1176 * really done. 1177 * 1178 * Failures for non-synchronous operations can 1179 * be ignored since the page subsystem will 1180 * retry the operation until it succeeds or the 1181 * file system is unmounted. 1182 */ 1183 if (error) { 1184 if ((ioflag & (FSYNC | FDSYNC)) || 1185 type == IFDIR) { 1186 uio->uio_resid = premove_resid; 1187 } else { 1188 error = 0; 1189 } 1190 } 1191 } 1192 1193 /* 1194 * Re-acquire contents lock. 1195 * If it was dropped, reacquire reader vfs_dqrwlock as well. 1196 */ 1197 if (do_dqrwlock) 1198 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1199 rw_enter(&ip->i_contents, RW_WRITER); 1200 1201 /* 1202 * If the uiomove() failed or if a synchronous 1203 * page push failed, fix up i_size. 1204 */ 1205 if (error) { 1206 if (i_size_changed) { 1207 /* 1208 * The uiomove failed, and we 1209 * allocated blocks,so get rid 1210 * of them. 1211 */ 1212 (void) ufs_itrunc(ip, old_i_size, 0, cr); 1213 } 1214 } else { 1215 /* 1216 * XXX - Can this be out of the loop? 1217 */ 1218 ip->i_flag |= IUPD | ICHG; 1219 /* 1220 * Only do one increase of i_seq for multiple 1221 * pieces. Because we drop locks, record 1222 * the fact that we changed the timestamp and 1223 * are deferring the increase in case another thread 1224 * pushes our timestamp update. 1225 */ 1226 i_seq_needed = 1; 1227 ip->i_flag |= ISEQ; 1228 if (i_size_changed) 1229 ip->i_flag |= IATTCHG; 1230 if ((ip->i_mode & (IEXEC | (IEXEC >> 3) | 1231 (IEXEC >> 6))) != 0 && 1232 (ip->i_mode & (ISUID | ISGID)) != 0 && 1233 secpolicy_vnode_setid_retain(cr, 1234 (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) { 1235 /* 1236 * Clear Set-UID & Set-GID bits on 1237 * successful write if not privileged 1238 * and at least one of the execute bits 1239 * is set. If we always clear Set-GID, 1240 * mandatory file and record locking is 1241 * unuseable. 1242 */ 1243 ip->i_mode &= ~(ISUID | ISGID); 1244 } 1245 } 1246 /* 1247 * In the case the FDSYNC flag is set and this is a 1248 * "rewrite" we won't log a delta. 1249 * The FSYNC flag overrides all cases. 1250 */ 1251 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) { 1252 TRANS_INODE(ufsvfsp, ip); 1253 } 1254 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1255 1256 out: 1257 /* 1258 * Make sure i_seq is increased at least once per write 1259 */ 1260 if (i_seq_needed) { 1261 ip->i_seq++; 1262 ip->i_flag &= ~ISEQ; /* no longer deferred */ 1263 } 1264 1265 /* 1266 * Inode is updated according to this table - 1267 * 1268 * FSYNC FDSYNC(posix.4) 1269 * -------------------------- 1270 * always@ IATTCHG|IBDWRITE 1271 * 1272 * @ - If we are doing synchronous write the only time we should 1273 * not be sync'ing the ip here is if we have the stickyhack 1274 * activated, the file is marked with the sticky bit and 1275 * no exec bit, the file length has not been changed and 1276 * no new blocks have been allocated during this write. 1277 */ 1278 1279 if ((ip->i_flag & ISYNC) != 0) { 1280 /* 1281 * we have eliminated nosync 1282 */ 1283 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 1284 ((ioflag & FSYNC) && iupdat_flag)) { 1285 ufs_iupdat(ip, 1); 1286 } 1287 } 1288 1289 /* 1290 * If we've already done a partial-write, terminate 1291 * the write but return no error unless the error is ENOSPC 1292 * because the caller can detect this and free resources and 1293 * try again. 1294 */ 1295 if ((start_resid != uio->uio_resid) && (error != ENOSPC)) 1296 error = 0; 1297 1298 ip->i_flag &= ~(INOACC | ISYNC); 1299 ITIMES_NOLOCK(ip); 1300 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1301 "ufs_wrip_end:vp %p error %d", vp, error); 1302 return (error); 1303 } 1304 1305 /* 1306 * rdip does the real work of read requests for ufs. 1307 */ 1308 int 1309 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr) 1310 { 1311 u_offset_t off; 1312 caddr_t base; 1313 struct fs *fs; 1314 struct ufsvfs *ufsvfsp; 1315 struct vnode *vp; 1316 long oresid = uio->uio_resid; 1317 u_offset_t n, on, mapon; 1318 int error = 0; 1319 int doupdate = 1; 1320 uint_t flags; 1321 int dofree, directio_status; 1322 krw_t rwtype; 1323 o_mode_t type; 1324 1325 vp = ITOV(ip); 1326 1327 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 1328 "ufs_rdip_start:vp %p", vp); 1329 1330 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 1331 1332 ufsvfsp = ip->i_ufsvfs; 1333 1334 if (ufsvfsp == NULL) 1335 return (EIO); 1336 1337 fs = ufsvfsp->vfs_fs; 1338 1339 /* check for valid filetype */ 1340 type = ip->i_mode & IFMT; 1341 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 1342 (type != IFLNK) && (type != IFSHAD)) { 1343 return (EIO); 1344 } 1345 1346 if (uio->uio_loffset > UFS_MAXOFFSET_T) { 1347 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1348 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1349 error = 0; 1350 goto out; 1351 } 1352 if (uio->uio_loffset < (offset_t)0) { 1353 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1354 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1355 return (EINVAL); 1356 } 1357 if (uio->uio_resid == 0) { 1358 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1359 "ufs_rdip_end:vp %p error %d", vp, 0); 1360 return (0); 1361 } 1362 1363 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) && 1364 (!ufsvfsp->vfs_noatime)) { 1365 mutex_enter(&ip->i_tlock); 1366 ip->i_flag |= IACC; 1367 mutex_exit(&ip->i_tlock); 1368 } 1369 /* 1370 * Try to go direct 1371 */ 1372 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 1373 error = ufs_directio_read(ip, uio, cr, &directio_status); 1374 if (directio_status == DIRECTIO_SUCCESS) 1375 goto out; 1376 } 1377 1378 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 1379 1380 do { 1381 offset_t diff; 1382 u_offset_t uoff = uio->uio_loffset; 1383 off = uoff & (offset_t)MAXBMASK; 1384 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET); 1385 on = (u_offset_t)blkoff(fs, uoff); 1386 n = MIN((u_offset_t)fs->fs_bsize - on, 1387 (u_offset_t)uio->uio_resid); 1388 1389 diff = ip->i_size - uoff; 1390 1391 if (diff <= (offset_t)0) { 1392 error = 0; 1393 goto out; 1394 } 1395 if (diff < (offset_t)n) 1396 n = (int)diff; 1397 1398 /* 1399 * We update smallfile2 and smallfile1 at most every second. 1400 */ 1401 if (lbolt >= smallfile_update) { 1402 uint64_t percpufreeb; 1403 if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D; 1404 if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D; 1405 percpufreeb = ptob((uint64_t)freemem) / ncpus_online; 1406 smallfile1 = percpufreeb / smallfile1_d; 1407 smallfile2 = percpufreeb / smallfile2_d; 1408 smallfile1 = MAX(smallfile1, smallfile); 1409 smallfile1 = MAX(smallfile1, smallfile64); 1410 smallfile2 = MAX(smallfile1, smallfile2); 1411 smallfile_update = lbolt + hz; 1412 } 1413 1414 dofree = freebehind && 1415 ip->i_nextr == (off & PAGEMASK) && off > smallfile1; 1416 1417 /* 1418 * At this point we can enter ufs_getpage() in one of two 1419 * ways: 1420 * 1) segmap_getmapflt() calls ufs_getpage() when the 1421 * forcefault parameter is true (value of 1 is passed) 1422 * 2) uiomove() causes a page fault. 1423 * 1424 * We cannot hold onto an i_contents reader lock without 1425 * risking deadlock in ufs_getpage() so drop a reader lock. 1426 * The ufs_getpage() dolock logic already allows for a 1427 * thread holding i_contents as writer to work properly 1428 * so we keep a writer lock. 1429 */ 1430 if (rwtype == RW_READER) 1431 rw_exit(&ip->i_contents); 1432 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1433 (uint_t)n, 1, S_READ); 1434 1435 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 1436 1437 flags = 0; 1438 if (!error) { 1439 /* 1440 * If reading sequential we won't need this 1441 * buffer again soon. For offsets in range 1442 * [smallfile1, smallfile2] release the pages 1443 * at the tail of the cache list, larger 1444 * offsets are released at the head. 1445 */ 1446 if (dofree) { 1447 flags = SM_FREE | SM_ASYNC; 1448 if ((cache_read_ahead == 0) && 1449 (off > smallfile2)) 1450 flags |= SM_DONTNEED; 1451 } 1452 /* 1453 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 1454 * we want to make sure that the page which has 1455 * been read, is written on disk if it is dirty. 1456 * And corresponding indirect blocks should also 1457 * be flushed out. 1458 */ 1459 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 1460 flags &= ~SM_ASYNC; 1461 flags |= SM_WRITE; 1462 } 1463 error = segmap_release(segkmap, base, flags); 1464 } else 1465 (void) segmap_release(segkmap, base, flags); 1466 1467 if (rwtype == RW_READER) 1468 rw_enter(&ip->i_contents, rwtype); 1469 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1470 out: 1471 /* 1472 * Inode is updated according to this table if FRSYNC is set. 1473 * 1474 * FSYNC FDSYNC(posix.4) 1475 * -------------------------- 1476 * always IATTCHG|IBDWRITE 1477 */ 1478 /* 1479 * The inode is not updated if we're logging and the inode is a 1480 * directory with FRSYNC, FSYNC and FDSYNC flags set. 1481 */ 1482 if (ioflag & FRSYNC) { 1483 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) { 1484 doupdate = 0; 1485 } 1486 if (doupdate) { 1487 if ((ioflag & FSYNC) || 1488 ((ioflag & FDSYNC) && 1489 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 1490 ufs_iupdat(ip, 1); 1491 } 1492 } 1493 } 1494 /* 1495 * If we've already done a partial read, terminate 1496 * the read but return no error. 1497 */ 1498 if (oresid != uio->uio_resid) 1499 error = 0; 1500 ITIMES(ip); 1501 1502 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1503 "ufs_rdip_end:vp %p error %d", vp, error); 1504 return (error); 1505 } 1506 1507 /* ARGSUSED */ 1508 static int 1509 ufs_ioctl( 1510 struct vnode *vp, 1511 int cmd, 1512 intptr_t arg, 1513 int flag, 1514 struct cred *cr, 1515 int *rvalp) 1516 { 1517 struct lockfs lockfs, lockfs_out; 1518 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 1519 char *comment, *original_comment; 1520 struct fs *fs; 1521 struct ulockfs *ulp; 1522 offset_t off; 1523 extern int maxphys; 1524 int error; 1525 int issync; 1526 int trans_size; 1527 1528 1529 /* 1530 * forcibly unmounted 1531 */ 1532 if (ufsvfsp == NULL) { 1533 return (EIO); 1534 } 1535 1536 fs = ufsvfsp->vfs_fs; 1537 1538 if (cmd == Q_QUOTACTL) { 1539 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK); 1540 if (error) 1541 return (error); 1542 1543 if (ulp) { 1544 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, 1545 TOP_SETQUOTA_SIZE(fs)); 1546 } 1547 1548 error = quotactl(vp, arg, flag, cr); 1549 1550 if (ulp) { 1551 TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, 1552 TOP_SETQUOTA_SIZE(fs)); 1553 ufs_lockfs_end(ulp); 1554 } 1555 return (error); 1556 } 1557 1558 switch (cmd) { 1559 case _FIOLFS: 1560 /* 1561 * file system locking 1562 */ 1563 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1564 return (EPERM); 1565 1566 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1567 if (copyin((caddr_t)arg, &lockfs, 1568 sizeof (struct lockfs))) 1569 return (EFAULT); 1570 } 1571 #ifdef _SYSCALL32_IMPL 1572 else { 1573 struct lockfs32 lockfs32; 1574 /* Translate ILP32 lockfs to LP64 lockfs */ 1575 if (copyin((caddr_t)arg, &lockfs32, 1576 sizeof (struct lockfs32))) 1577 return (EFAULT); 1578 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1579 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1580 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1581 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1582 lockfs.lf_comment = 1583 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1584 } 1585 #endif /* _SYSCALL32_IMPL */ 1586 1587 if (lockfs.lf_comlen) { 1588 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) 1589 return (ENAMETOOLONG); 1590 comment = kmem_alloc(lockfs.lf_comlen, 1591 KM_SLEEP); 1592 if (copyin(lockfs.lf_comment, comment, 1593 lockfs.lf_comlen)) { 1594 kmem_free(comment, lockfs.lf_comlen); 1595 return (EFAULT); 1596 } 1597 original_comment = lockfs.lf_comment; 1598 lockfs.lf_comment = comment; 1599 } 1600 if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) { 1601 lockfs.lf_comment = original_comment; 1602 1603 if ((flag & DATAMODEL_MASK) == 1604 DATAMODEL_NATIVE) { 1605 (void) copyout(&lockfs, (caddr_t)arg, 1606 sizeof (struct lockfs)); 1607 } 1608 #ifdef _SYSCALL32_IMPL 1609 else { 1610 struct lockfs32 lockfs32; 1611 /* Translate LP64 to ILP32 lockfs */ 1612 lockfs32.lf_lock = 1613 (uint32_t)lockfs.lf_lock; 1614 lockfs32.lf_flags = 1615 (uint32_t)lockfs.lf_flags; 1616 lockfs32.lf_key = 1617 (uint32_t)lockfs.lf_key; 1618 lockfs32.lf_comlen = 1619 (uint32_t)lockfs.lf_comlen; 1620 lockfs32.lf_comment = 1621 (uint32_t)(uintptr_t)lockfs.lf_comment; 1622 (void) copyout(&lockfs32, (caddr_t)arg, 1623 sizeof (struct lockfs32)); 1624 } 1625 #endif /* _SYSCALL32_IMPL */ 1626 1627 } else { 1628 if (lockfs.lf_comlen) 1629 kmem_free(comment, lockfs.lf_comlen); 1630 } 1631 return (error); 1632 1633 case _FIOLFSS: 1634 /* 1635 * get file system locking status 1636 */ 1637 1638 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1639 if (copyin((caddr_t)arg, &lockfs, 1640 sizeof (struct lockfs))) 1641 return (EFAULT); 1642 } 1643 #ifdef _SYSCALL32_IMPL 1644 else { 1645 struct lockfs32 lockfs32; 1646 /* Translate ILP32 lockfs to LP64 lockfs */ 1647 if (copyin((caddr_t)arg, &lockfs32, 1648 sizeof (struct lockfs32))) 1649 return (EFAULT); 1650 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1651 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1652 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1653 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1654 lockfs.lf_comment = 1655 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1656 } 1657 #endif /* _SYSCALL32_IMPL */ 1658 1659 if (error = ufs_fiolfss(vp, &lockfs_out)) 1660 return (error); 1661 lockfs.lf_lock = lockfs_out.lf_lock; 1662 lockfs.lf_key = lockfs_out.lf_key; 1663 lockfs.lf_flags = lockfs_out.lf_flags; 1664 lockfs.lf_comlen = MIN(lockfs.lf_comlen, 1665 lockfs_out.lf_comlen); 1666 1667 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1668 if (copyout(&lockfs, (caddr_t)arg, 1669 sizeof (struct lockfs))) 1670 return (EFAULT); 1671 } 1672 #ifdef _SYSCALL32_IMPL 1673 else { 1674 /* Translate LP64 to ILP32 lockfs */ 1675 struct lockfs32 lockfs32; 1676 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock; 1677 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags; 1678 lockfs32.lf_key = (uint32_t)lockfs.lf_key; 1679 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen; 1680 lockfs32.lf_comment = 1681 (uint32_t)(uintptr_t)lockfs.lf_comment; 1682 if (copyout(&lockfs32, (caddr_t)arg, 1683 sizeof (struct lockfs32))) 1684 return (EFAULT); 1685 } 1686 #endif /* _SYSCALL32_IMPL */ 1687 1688 if (lockfs.lf_comlen && 1689 lockfs.lf_comment && lockfs_out.lf_comment) 1690 if (copyout(lockfs_out.lf_comment, 1691 lockfs.lf_comment, 1692 lockfs.lf_comlen)) 1693 return (EFAULT); 1694 return (0); 1695 1696 case _FIOSATIME: 1697 /* 1698 * set access time 1699 */ 1700 1701 /* 1702 * if mounted w/o atime, return quietly. 1703 * I briefly thought about returning ENOSYS, but 1704 * figured that most apps would consider this fatal 1705 * but the idea is to make this as seamless as poss. 1706 */ 1707 if (ufsvfsp->vfs_noatime) 1708 return (0); 1709 1710 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1711 ULOCKFS_SETATTR_MASK); 1712 if (error) 1713 return (error); 1714 1715 if (ulp) { 1716 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp)); 1717 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 1718 TOP_SETATTR, trans_size); 1719 } 1720 1721 error = ufs_fiosatime(vp, (struct timeval *)arg, 1722 flag, cr); 1723 1724 if (ulp) { 1725 TRANS_END_CSYNC(ufsvfsp, error, issync, 1726 TOP_SETATTR, trans_size); 1727 ufs_lockfs_end(ulp); 1728 } 1729 return (error); 1730 1731 case _FIOSDIO: 1732 /* 1733 * set delayed-io 1734 */ 1735 return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr)); 1736 1737 case _FIOGDIO: 1738 /* 1739 * get delayed-io 1740 */ 1741 return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr)); 1742 1743 case _FIOIO: 1744 /* 1745 * inode open 1746 */ 1747 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1748 ULOCKFS_VGET_MASK); 1749 if (error) 1750 return (error); 1751 1752 error = ufs_fioio(vp, (struct fioio *)arg, flag, cr); 1753 1754 if (ulp) { 1755 ufs_lockfs_end(ulp); 1756 } 1757 return (error); 1758 1759 case _FIOFFS: 1760 /* 1761 * file system flush (push w/invalidate) 1762 */ 1763 if ((caddr_t)arg != NULL) 1764 return (EINVAL); 1765 return (ufs_fioffs(vp, NULL, cr)); 1766 1767 case _FIOISBUSY: 1768 /* 1769 * Contract-private interface for Legato 1770 * Purge this vnode from the DNLC and decide 1771 * if this vnode is busy (*arg == 1) or not 1772 * (*arg == 0) 1773 */ 1774 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1775 return (EPERM); 1776 error = ufs_fioisbusy(vp, (int *)arg, cr); 1777 return (error); 1778 1779 case _FIODIRECTIO: 1780 return (ufs_fiodirectio(vp, (int)arg, cr)); 1781 1782 case _FIOTUNE: 1783 /* 1784 * Tune the file system (aka setting fs attributes) 1785 */ 1786 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1787 ULOCKFS_SETATTR_MASK); 1788 if (error) 1789 return (error); 1790 1791 error = ufs_fiotune(vp, (struct fiotune *)arg, cr); 1792 1793 if (ulp) 1794 ufs_lockfs_end(ulp); 1795 return (error); 1796 1797 case _FIOLOGENABLE: 1798 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1799 return (EPERM); 1800 return (ufs_fiologenable(vp, (void *)arg, cr, flag)); 1801 1802 case _FIOLOGDISABLE: 1803 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1804 return (EPERM); 1805 return (ufs_fiologdisable(vp, (void *)arg, cr, flag)); 1806 1807 case _FIOISLOG: 1808 return (ufs_fioislog(vp, (void *)arg, cr, flag)); 1809 1810 case _FIOSNAPSHOTCREATE_MULTI: 1811 { 1812 struct fiosnapcreate_multi fc, *fcp; 1813 size_t fcm_size; 1814 1815 if (copyin((void *)arg, &fc, sizeof (fc))) 1816 return (EFAULT); 1817 if (fc.backfilecount > MAX_BACKFILE_COUNT) 1818 return (EINVAL); 1819 fcm_size = sizeof (struct fiosnapcreate_multi) + 1820 (fc.backfilecount - 1) * sizeof (int); 1821 fcp = (struct fiosnapcreate_multi *) 1822 kmem_alloc(fcm_size, KM_SLEEP); 1823 if (copyin((void *)arg, fcp, fcm_size)) { 1824 kmem_free(fcp, fcm_size); 1825 return (EFAULT); 1826 } 1827 error = ufs_snap_create(vp, fcp, cr); 1828 if (!error && copyout(fcp, (void *)arg, fcm_size)) 1829 error = EFAULT; 1830 kmem_free(fcp, fcm_size); 1831 return (error); 1832 } 1833 1834 case _FIOSNAPSHOTDELETE: 1835 { 1836 struct fiosnapdelete fc; 1837 1838 if (copyin((void *)arg, &fc, sizeof (fc))) 1839 return (EFAULT); 1840 error = ufs_snap_delete(vp, &fc, cr); 1841 if (!error && copyout(&fc, (void *)arg, sizeof (fc))) 1842 error = EFAULT; 1843 return (error); 1844 } 1845 1846 case _FIOGETSUPERBLOCK: 1847 if (copyout(fs, (void *)arg, SBSIZE)) 1848 return (EFAULT); 1849 return (0); 1850 1851 case _FIOGETMAXPHYS: 1852 if (copyout(&maxphys, (void *)arg, sizeof (maxphys))) 1853 return (EFAULT); 1854 return (0); 1855 1856 /* 1857 * The following 3 ioctls are for TSufs support 1858 * although could potentially be used elsewhere 1859 */ 1860 case _FIO_SET_LUFS_DEBUG: 1861 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1862 return (EPERM); 1863 lufs_debug = (uint32_t)arg; 1864 return (0); 1865 1866 case _FIO_SET_LUFS_ERROR: 1867 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1868 return (EPERM); 1869 TRANS_SETERROR(ufsvfsp); 1870 return (0); 1871 1872 case _FIO_GET_TOP_STATS: 1873 { 1874 fio_lufs_stats_t *ls; 1875 ml_unit_t *ul = ufsvfsp->vfs_log; 1876 1877 ls = kmem_zalloc(sizeof (*ls), KM_SLEEP); 1878 ls->ls_debug = ul->un_debug; /* return debug value */ 1879 /* Copy stucture if statistics are being kept */ 1880 if (ul->un_logmap->mtm_tops) { 1881 ls->ls_topstats = *(ul->un_logmap->mtm_tops); 1882 } 1883 error = 0; 1884 if (copyout(ls, (void *)arg, sizeof (*ls))) 1885 error = EFAULT; 1886 kmem_free(ls, sizeof (*ls)); 1887 return (error); 1888 } 1889 1890 case _FIO_SEEK_DATA: 1891 case _FIO_SEEK_HOLE: 1892 if (ddi_copyin((void *)arg, &off, sizeof (off), flag)) 1893 return (EFAULT); 1894 /* offset paramater is in/out */ 1895 error = ufs_fio_holey(vp, cmd, &off); 1896 if (error) 1897 return (error); 1898 if (ddi_copyout(&off, (void *)arg, sizeof (off), flag)) 1899 return (EFAULT); 1900 return (0); 1901 1902 default: 1903 return (ENOTTY); 1904 } 1905 } 1906 1907 /* ARGSUSED */ 1908 static int 1909 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags, 1910 struct cred *cr) 1911 { 1912 struct inode *ip = VTOI(vp); 1913 struct ufsvfs *ufsvfsp; 1914 int err; 1915 1916 TRACE_2(TR_FAC_UFS, TR_UFS_GETATTR_START, 1917 "ufs_getattr_start:vp %p flags %x", vp, flags); 1918 1919 if (vap->va_mask == AT_SIZE) { 1920 /* 1921 * for performance, if only the size is requested don't bother 1922 * with anything else. 1923 */ 1924 UFS_GET_ISIZE(&vap->va_size, ip); 1925 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, 1926 "ufs_getattr_end:vp %p", vp); 1927 return (0); 1928 } 1929 1930 /* 1931 * inlined lockfs checks 1932 */ 1933 ufsvfsp = ip->i_ufsvfs; 1934 if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) { 1935 err = EIO; 1936 goto out; 1937 } 1938 1939 rw_enter(&ip->i_contents, RW_READER); 1940 /* 1941 * Return all the attributes. This should be refined so 1942 * that it only returns what's asked for. 1943 */ 1944 1945 /* 1946 * Copy from inode table. 1947 */ 1948 vap->va_type = vp->v_type; 1949 vap->va_mode = ip->i_mode & MODEMASK; 1950 /* 1951 * If there is an ACL and there is a mask entry, then do the 1952 * extra work that completes the equivalent of an acltomode(3) 1953 * call. According to POSIX P1003.1e, the acl mask should be 1954 * returned in the group permissions field. 1955 * 1956 * - start with the original permission and mode bits (from above) 1957 * - clear the group owner bits 1958 * - add in the mask bits. 1959 */ 1960 if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) { 1961 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3); 1962 vap->va_mode |= 1963 (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3; 1964 } 1965 vap->va_uid = ip->i_uid; 1966 vap->va_gid = ip->i_gid; 1967 vap->va_fsid = ip->i_dev; 1968 vap->va_nodeid = (ino64_t)ip->i_number; 1969 vap->va_nlink = ip->i_nlink; 1970 vap->va_size = ip->i_size; 1971 if (vp->v_type == VCHR || vp->v_type == VBLK) 1972 vap->va_rdev = ip->i_rdev; 1973 else 1974 vap->va_rdev = 0; /* not a b/c spec. */ 1975 mutex_enter(&ip->i_tlock); 1976 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 1977 vap->va_seq = ip->i_seq; 1978 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 1979 vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000; 1980 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 1981 vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000; 1982 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 1983 vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000; 1984 mutex_exit(&ip->i_tlock); 1985 1986 switch (ip->i_mode & IFMT) { 1987 1988 case IFBLK: 1989 vap->va_blksize = MAXBSIZE; /* was BLKDEV_IOSIZE */ 1990 break; 1991 1992 case IFCHR: 1993 vap->va_blksize = MAXBSIZE; 1994 break; 1995 1996 default: 1997 vap->va_blksize = ip->i_fs->fs_bsize; 1998 break; 1999 } 2000 vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks; 2001 rw_exit(&ip->i_contents); 2002 err = 0; 2003 2004 out: 2005 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, "ufs_getattr_end:vp %p", vp); 2006 2007 return (err); 2008 } 2009 2010 /*ARGSUSED4*/ 2011 static int 2012 ufs_setattr( 2013 struct vnode *vp, 2014 struct vattr *vap, 2015 int flags, 2016 struct cred *cr, 2017 caller_context_t *ct) 2018 { 2019 struct inode *ip = VTOI(vp); 2020 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2021 struct fs *fs; 2022 struct ulockfs *ulp; 2023 char *errmsg1; 2024 char *errmsg2; 2025 long blocks; 2026 long int mask = vap->va_mask; 2027 size_t len1, len2; 2028 int issync; 2029 int trans_size; 2030 int dotrans; 2031 int dorwlock; 2032 int error; 2033 int owner_change; 2034 int dodqlock; 2035 timestruc_t now; 2036 vattr_t oldva; 2037 int retry = 1; 2038 2039 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_START, 2040 "ufs_setattr_start:vp %p flags %x", vp, flags); 2041 2042 /* 2043 * Cannot set these attributes. 2044 */ 2045 if (mask & AT_NOSET) { 2046 error = EINVAL; 2047 goto out; 2048 } 2049 2050 /* 2051 * check for forced unmount 2052 */ 2053 if (ufsvfsp == NULL) 2054 return (EIO); 2055 2056 fs = ufsvfsp->vfs_fs; 2057 if (fs->fs_ronly != 0) 2058 return (EROFS); 2059 2060 again: 2061 errmsg1 = NULL; 2062 errmsg2 = NULL; 2063 dotrans = 0; 2064 dorwlock = 0; 2065 dodqlock = 0; 2066 2067 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK); 2068 if (error) 2069 goto out; 2070 2071 /* 2072 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 2073 * This follows the protocol for read()/write(). 2074 */ 2075 if (vp->v_type != VDIR) { 2076 rw_enter(&ip->i_rwlock, RW_WRITER); 2077 dorwlock = 1; 2078 } 2079 2080 /* 2081 * Truncate file. Must have write permission and not be a directory. 2082 */ 2083 if (mask & AT_SIZE) { 2084 rw_enter(&ip->i_contents, RW_WRITER); 2085 if (vp->v_type == VDIR) { 2086 error = EISDIR; 2087 goto update_inode; 2088 } 2089 if (error = ufs_iaccess(ip, IWRITE, cr)) 2090 goto update_inode; 2091 2092 rw_exit(&ip->i_contents); 2093 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr); 2094 if (error) { 2095 rw_enter(&ip->i_contents, RW_WRITER); 2096 goto update_inode; 2097 } 2098 } 2099 2100 if (ulp) { 2101 trans_size = (int)TOP_SETATTR_SIZE(ip); 2102 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size); 2103 ++dotrans; 2104 } 2105 2106 /* 2107 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 2108 * This follows the protocol established by 2109 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 2110 */ 2111 if (vp->v_type == VDIR) { 2112 rw_enter(&ip->i_rwlock, RW_WRITER); 2113 dorwlock = 1; 2114 } 2115 2116 /* 2117 * Grab quota lock if we are changing the file's owner. 2118 */ 2119 if (mask & AT_UID) { 2120 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2121 dodqlock = 1; 2122 } 2123 rw_enter(&ip->i_contents, RW_WRITER); 2124 2125 oldva.va_mode = ip->i_mode; 2126 oldva.va_uid = ip->i_uid; 2127 oldva.va_gid = ip->i_gid; 2128 2129 vap->va_mask &= ~AT_SIZE; 2130 /* 2131 * ufs_iaccess is "close enough"; that's because it doesn't 2132 * map the defines. 2133 */ 2134 error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2135 ufs_iaccess, ip); 2136 if (error) 2137 goto update_inode; 2138 2139 mask = vap->va_mask; 2140 2141 /* 2142 * Change file access modes. 2143 */ 2144 if (mask & AT_MODE) { 2145 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT); 2146 TRANS_INODE(ufsvfsp, ip); 2147 ip->i_flag |= ICHG; 2148 if (stickyhack) { 2149 mutex_enter(&vp->v_lock); 2150 if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 2151 vp->v_flag |= VSWAPLIKE; 2152 else 2153 vp->v_flag &= ~VSWAPLIKE; 2154 mutex_exit(&vp->v_lock); 2155 } 2156 } 2157 if (mask & (AT_UID|AT_GID)) { 2158 if (mask & AT_UID) { 2159 /* 2160 * Don't change ownership of the quota inode. 2161 */ 2162 if (ufsvfsp->vfs_qinod == ip) { 2163 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED); 2164 error = EINVAL; 2165 goto update_inode; 2166 } 2167 2168 /* 2169 * No real ownership change. 2170 */ 2171 if (ip->i_uid == vap->va_uid) { 2172 blocks = 0; 2173 owner_change = 0; 2174 } 2175 /* 2176 * Remove the blocks and the file, from the old user's 2177 * quota. 2178 */ 2179 else { 2180 blocks = ip->i_blocks; 2181 owner_change = 1; 2182 2183 (void) chkdq(ip, -blocks, /* force */ 1, cr, 2184 (char **)NULL, (size_t *)NULL); 2185 (void) chkiq(ufsvfsp, /* change */ -1, ip, 2186 (uid_t)ip->i_uid, 2187 /* force */ 1, cr, 2188 (char **)NULL, (size_t *)NULL); 2189 dqrele(ip->i_dquot); 2190 } 2191 2192 ip->i_uid = vap->va_uid; 2193 2194 /* 2195 * There is a real ownership change. 2196 */ 2197 if (owner_change) { 2198 /* 2199 * Add the blocks and the file to the new 2200 * user's quota. 2201 */ 2202 ip->i_dquot = getinoquota(ip); 2203 (void) chkdq(ip, blocks, /* force */ 1, cr, 2204 &errmsg1, &len1); 2205 (void) chkiq(ufsvfsp, /* change */ 1, 2206 (struct inode *)NULL, 2207 (uid_t)ip->i_uid, 2208 /* force */ 1, cr, 2209 &errmsg2, &len2); 2210 } 2211 } 2212 if (mask & AT_GID) { 2213 ip->i_gid = vap->va_gid; 2214 } 2215 TRANS_INODE(ufsvfsp, ip); 2216 ip->i_flag |= ICHG; 2217 } 2218 /* 2219 * Change file access or modified times. 2220 */ 2221 if (mask & (AT_ATIME|AT_MTIME)) { 2222 /* Check that the time value is within ufs range */ 2223 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2224 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2225 error = EOVERFLOW; 2226 goto update_inode; 2227 } 2228 2229 /* 2230 * if the "noaccess" mount option is set and only atime 2231 * update is requested, do nothing. No error is returned. 2232 */ 2233 if ((ufsvfsp->vfs_noatime) && 2234 ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME)) 2235 goto skip_atime; 2236 2237 if (mask & AT_ATIME) { 2238 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2239 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2240 ip->i_flag &= ~IACC; 2241 } 2242 if (mask & AT_MTIME) { 2243 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2244 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2245 gethrestime(&now); 2246 if (now.tv_sec > TIME32_MAX) { 2247 /* 2248 * In 2038, ctime sticks forever.. 2249 */ 2250 ip->i_ctime.tv_sec = TIME32_MAX; 2251 ip->i_ctime.tv_usec = 0; 2252 } else { 2253 ip->i_ctime.tv_sec = now.tv_sec; 2254 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2255 } 2256 ip->i_flag &= ~(IUPD|ICHG); 2257 ip->i_flag |= IMODTIME; 2258 } 2259 TRANS_INODE(ufsvfsp, ip); 2260 ip->i_flag |= IMOD; 2261 } 2262 2263 skip_atime: 2264 /* 2265 * The presence of a shadow inode may indicate an ACL, but does 2266 * not imply an ACL. Future FSD types should be handled here too 2267 * and check for the presence of the attribute-specific data 2268 * before referencing it. 2269 */ 2270 if (ip->i_shadow) { 2271 /* 2272 * XXX if ufs_iupdat is changed to sandbagged write fix 2273 * ufs_acl_setattr to push ip to keep acls consistent 2274 * 2275 * Suppress out of inodes messages if we will retry. 2276 */ 2277 if (retry) 2278 ip->i_flag |= IQUIET; 2279 error = ufs_acl_setattr(ip, vap, cr); 2280 ip->i_flag &= ~IQUIET; 2281 } 2282 2283 update_inode: 2284 /* 2285 * Setattr always increases the sequence number 2286 */ 2287 ip->i_seq++; 2288 2289 /* 2290 * if nfsd and not logging; push synchronously 2291 */ 2292 if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) { 2293 ufs_iupdat(ip, 1); 2294 } else { 2295 ITIMES_NOLOCK(ip); 2296 } 2297 2298 rw_exit(&ip->i_contents); 2299 if (dodqlock) { 2300 rw_exit(&ufsvfsp->vfs_dqrwlock); 2301 } 2302 if (dorwlock) 2303 rw_exit(&ip->i_rwlock); 2304 2305 if (ulp) { 2306 if (dotrans) { 2307 int terr = 0; 2308 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR, 2309 trans_size); 2310 if (error == 0) 2311 error = terr; 2312 } 2313 ufs_lockfs_end(ulp); 2314 } 2315 out: 2316 /* 2317 * If out of inodes or blocks, see if we can free something 2318 * up from the delete queue. 2319 */ 2320 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 2321 ufs_delete_drain_wait(ufsvfsp, 1); 2322 retry = 0; 2323 if (errmsg1 != NULL) 2324 kmem_free(errmsg1, len1); 2325 if (errmsg2 != NULL) 2326 kmem_free(errmsg2, len2); 2327 goto again; 2328 } 2329 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_END, 2330 "ufs_setattr_end:vp %p error %d", vp, error); 2331 if (errmsg1 != NULL) { 2332 uprintf(errmsg1); 2333 kmem_free(errmsg1, len1); 2334 } 2335 if (errmsg2 != NULL) { 2336 uprintf(errmsg2); 2337 kmem_free(errmsg2, len2); 2338 } 2339 return (error); 2340 } 2341 2342 /*ARGSUSED*/ 2343 static int 2344 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr) 2345 { 2346 struct inode *ip = VTOI(vp); 2347 int error; 2348 2349 TRACE_3(TR_FAC_UFS, TR_UFS_ACCESS_START, 2350 "ufs_access_start:vp %p mode %x flags %x", vp, mode, flags); 2351 2352 if (ip->i_ufsvfs == NULL) 2353 return (EIO); 2354 2355 rw_enter(&ip->i_contents, RW_READER); 2356 2357 /* 2358 * The ufs_iaccess function wants to be called with 2359 * mode bits expressed as "ufs specific" bits. 2360 * I.e., VWRITE|VREAD|VEXEC do not make sense to 2361 * ufs_iaccess() but IWRITE|IREAD|IEXEC do. 2362 * But since they're the same we just pass the vnode mode 2363 * bit but just verify that assumption at compile time. 2364 */ 2365 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC 2366 #error "ufs_access needs to map Vmodes to Imodes" 2367 #endif 2368 error = ufs_iaccess(ip, mode, cr); 2369 2370 rw_exit(&ip->i_contents); 2371 2372 TRACE_2(TR_FAC_UFS, TR_UFS_ACCESS_END, 2373 "ufs_access_end:vp %p error %d", vp, error); 2374 return (error); 2375 } 2376 2377 /* ARGSUSED */ 2378 static int 2379 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr) 2380 { 2381 struct inode *ip = VTOI(vp); 2382 struct ufsvfs *ufsvfsp; 2383 struct ulockfs *ulp; 2384 int error; 2385 int fastsymlink; 2386 2387 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_START, 2388 "ufs_readlink_start:vp %p uiop %p", uiop, vp); 2389 2390 if (vp->v_type != VLNK) { 2391 error = EINVAL; 2392 goto nolockout; 2393 } 2394 2395 /* 2396 * If the symbolic link is empty there is nothing to read. 2397 * Fast-track these empty symbolic links 2398 */ 2399 if (ip->i_size == 0) { 2400 error = 0; 2401 goto nolockout; 2402 } 2403 2404 ufsvfsp = ip->i_ufsvfs; 2405 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK); 2406 if (error) 2407 goto nolockout; 2408 /* 2409 * The ip->i_rwlock protects the data blocks used for FASTSYMLINK 2410 */ 2411 again: 2412 fastsymlink = 0; 2413 if (ip->i_flag & IFASTSYMLNK) { 2414 rw_enter(&ip->i_rwlock, RW_READER); 2415 rw_enter(&ip->i_contents, RW_READER); 2416 if (ip->i_flag & IFASTSYMLNK) { 2417 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 2418 (ip->i_fs->fs_ronly == 0) && 2419 (!ufsvfsp->vfs_noatime)) { 2420 mutex_enter(&ip->i_tlock); 2421 ip->i_flag |= IACC; 2422 mutex_exit(&ip->i_tlock); 2423 } 2424 error = uiomove((caddr_t)&ip->i_db[1], 2425 MIN(ip->i_size, uiop->uio_resid), 2426 UIO_READ, uiop); 2427 ITIMES(ip); 2428 ++fastsymlink; 2429 } 2430 rw_exit(&ip->i_contents); 2431 rw_exit(&ip->i_rwlock); 2432 } 2433 if (!fastsymlink) { 2434 ssize_t size; /* number of bytes read */ 2435 caddr_t basep; /* pointer to input data */ 2436 ino_t ino; 2437 long igen; 2438 struct uio tuio; /* temp uio struct */ 2439 struct uio *tuiop; 2440 iovec_t tiov; /* temp iovec struct */ 2441 char kbuf[FSL_SIZE]; /* buffer to hold fast symlink */ 2442 int tflag = 0; /* flag to indicate temp vars used */ 2443 2444 ino = ip->i_number; 2445 igen = ip->i_gen; 2446 size = uiop->uio_resid; 2447 basep = uiop->uio_iov->iov_base; 2448 tuiop = uiop; 2449 2450 rw_enter(&ip->i_rwlock, RW_WRITER); 2451 rw_enter(&ip->i_contents, RW_WRITER); 2452 if (ip->i_flag & IFASTSYMLNK) { 2453 rw_exit(&ip->i_contents); 2454 rw_exit(&ip->i_rwlock); 2455 goto again; 2456 } 2457 2458 /* can this be a fast symlink and is it a user buffer? */ 2459 if (ip->i_size <= FSL_SIZE && 2460 (uiop->uio_segflg == UIO_USERSPACE || 2461 uiop->uio_segflg == UIO_USERISPACE)) { 2462 2463 bzero(&tuio, sizeof (struct uio)); 2464 /* 2465 * setup a kernel buffer to read link into. this 2466 * is to fix a race condition where the user buffer 2467 * got corrupted before copying it into the inode. 2468 */ 2469 size = ip->i_size; 2470 tiov.iov_len = size; 2471 tiov.iov_base = kbuf; 2472 tuio.uio_iov = &tiov; 2473 tuio.uio_iovcnt = 1; 2474 tuio.uio_offset = uiop->uio_offset; 2475 tuio.uio_segflg = UIO_SYSSPACE; 2476 tuio.uio_fmode = uiop->uio_fmode; 2477 tuio.uio_extflg = uiop->uio_extflg; 2478 tuio.uio_limit = uiop->uio_limit; 2479 tuio.uio_resid = size; 2480 2481 basep = tuio.uio_iov->iov_base; 2482 tuiop = &tuio; 2483 tflag = 1; 2484 } 2485 2486 error = rdip(ip, tuiop, 0, cr); 2487 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) { 2488 rw_exit(&ip->i_contents); 2489 rw_exit(&ip->i_rwlock); 2490 goto out; 2491 } 2492 2493 if (tflag == 0) 2494 size -= uiop->uio_resid; 2495 2496 if ((tflag == 0 && ip->i_size <= FSL_SIZE && 2497 ip->i_size == size) || (tflag == 1 && 2498 tuio.uio_resid == 0)) { 2499 error = kcopy(basep, &ip->i_db[1], ip->i_size); 2500 if (error == 0) { 2501 ip->i_flag |= IFASTSYMLNK; 2502 /* 2503 * free page 2504 */ 2505 (void) VOP_PUTPAGE(ITOV(ip), 2506 (offset_t)0, PAGESIZE, 2507 (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC), 2508 cr); 2509 } else { 2510 int i; 2511 /* error, clear garbage left behind */ 2512 for (i = 1; i < NDADDR; i++) 2513 ip->i_db[i] = 0; 2514 for (i = 0; i < NIADDR; i++) 2515 ip->i_ib[i] = 0; 2516 } 2517 } 2518 if (tflag == 1) { 2519 /* now, copy it into the user buffer */ 2520 error = uiomove((caddr_t)kbuf, 2521 MIN(size, uiop->uio_resid), 2522 UIO_READ, uiop); 2523 } 2524 rw_exit(&ip->i_contents); 2525 rw_exit(&ip->i_rwlock); 2526 } 2527 out: 2528 if (ulp) { 2529 ufs_lockfs_end(ulp); 2530 } 2531 nolockout: 2532 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_END, 2533 "ufs_readlink_end:vp %p error %d", vp, error); 2534 2535 return (error); 2536 } 2537 2538 /* ARGSUSED */ 2539 static int 2540 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr) 2541 { 2542 struct inode *ip = VTOI(vp); 2543 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2544 struct ulockfs *ulp; 2545 int error; 2546 2547 TRACE_1(TR_FAC_UFS, TR_UFS_FSYNC_START, 2548 "ufs_fsync_start:vp %p", vp); 2549 2550 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK); 2551 if (error) 2552 return (error); 2553 2554 if (TRANS_ISTRANS(ufsvfsp)) { 2555 /* 2556 * First push out any data pages 2557 */ 2558 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2559 (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) { 2560 error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 2561 0, CRED()); 2562 if (error) 2563 goto out; 2564 } 2565 2566 /* 2567 * Delta any delayed inode times updates 2568 * and push inode to log. 2569 * All other inode deltas will have already been delta'd 2570 * and will be pushed during the commit. 2571 */ 2572 if (!(syncflag & FDSYNC) && 2573 ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) { 2574 if (ulp) { 2575 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC, 2576 TOP_SYNCIP_SIZE); 2577 } 2578 rw_enter(&ip->i_contents, RW_READER); 2579 mutex_enter(&ip->i_tlock); 2580 ip->i_flag &= ~IMODTIME; 2581 mutex_exit(&ip->i_tlock); 2582 ufs_iupdat(ip, I_SYNC); 2583 rw_exit(&ip->i_contents); 2584 if (ulp) { 2585 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC, 2586 TOP_SYNCIP_SIZE); 2587 } 2588 } 2589 2590 /* 2591 * Commit the Moby transaction 2592 * 2593 * Deltas have already been made so we just need to 2594 * commit them with a synchronous transaction. 2595 * TRANS_BEGIN_SYNC() will return an error 2596 * if there are no deltas to commit, for an 2597 * empty transaction. 2598 */ 2599 if (ulp) { 2600 TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE, 2601 error); 2602 if (error) { 2603 error = 0; /* commit wasn't needed */ 2604 goto out; 2605 } 2606 TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC, 2607 TOP_COMMIT_SIZE); 2608 } 2609 } else { /* not logging */ 2610 if (!(IS_SWAPVP(vp))) 2611 if (syncflag & FNODSYNC) { 2612 /* Just update the inode only */ 2613 TRANS_IUPDAT(ip, 1); 2614 error = 0; 2615 } else if (syncflag & FDSYNC) 2616 /* Do data-synchronous writes */ 2617 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC); 2618 else 2619 /* Do synchronous writes */ 2620 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC); 2621 2622 rw_enter(&ip->i_contents, RW_WRITER); 2623 if (!error) 2624 error = ufs_sync_indir(ip); 2625 rw_exit(&ip->i_contents); 2626 } 2627 out: 2628 if (ulp) { 2629 ufs_lockfs_end(ulp); 2630 } 2631 TRACE_2(TR_FAC_UFS, TR_UFS_FSYNC_END, 2632 "ufs_fsync_end:vp %p error %d", vp, error); 2633 return (error); 2634 } 2635 2636 /*ARGSUSED*/ 2637 static void 2638 ufs_inactive(struct vnode *vp, struct cred *cr) 2639 { 2640 ufs_iinactive(VTOI(vp)); 2641 } 2642 2643 /* 2644 * Unix file system operations having to do with directory manipulation. 2645 */ 2646 int ufs_lookup_idle_count = 2; /* Number of inodes to idle each time */ 2647 /* ARGSUSED */ 2648 static int 2649 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 2650 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr) 2651 { 2652 struct inode *ip; 2653 struct inode *sip; 2654 struct inode *xip; 2655 struct ufsvfs *ufsvfsp; 2656 struct ulockfs *ulp; 2657 struct vnode *vp; 2658 int error; 2659 2660 TRACE_2(TR_FAC_UFS, TR_UFS_LOOKUP_START, 2661 "ufs_lookup_start:dvp %p name %s", dvp, nm); 2662 2663 2664 /* 2665 * Check flags for type of lookup (regular file or attribute file) 2666 */ 2667 2668 ip = VTOI(dvp); 2669 2670 if (flags & LOOKUP_XATTR) { 2671 2672 /* 2673 * We don't allow recursive attributes... 2674 * Maybe someday we will. 2675 */ 2676 if ((ip->i_cflags & IXATTR)) { 2677 return (EINVAL); 2678 } 2679 2680 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) { 2681 error = ufs_xattr_getattrdir(dvp, &sip, flags, cr); 2682 if (error) { 2683 *vpp = NULL; 2684 goto out; 2685 } 2686 2687 vp = ITOV(sip); 2688 dnlc_update(dvp, XATTR_DIR_NAME, vp); 2689 } 2690 2691 /* 2692 * Check accessibility of directory. 2693 */ 2694 if (vp == DNLC_NO_VNODE) { 2695 VN_RELE(vp); 2696 error = ENOENT; 2697 goto out; 2698 } 2699 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr)) != 0) { 2700 VN_RELE(vp); 2701 goto out; 2702 } 2703 2704 *vpp = vp; 2705 return (0); 2706 } 2707 2708 /* 2709 * Check for a null component, which we should treat as 2710 * looking at dvp from within it's parent, so we don't 2711 * need a call to ufs_iaccess(), as it has already been 2712 * done. 2713 */ 2714 if (nm[0] == 0) { 2715 VN_HOLD(dvp); 2716 error = 0; 2717 *vpp = dvp; 2718 goto out; 2719 } 2720 2721 /* 2722 * Check for "." ie itself. this is a quick check and 2723 * avoids adding "." into the dnlc (which have been seen 2724 * to occupy >10% of the cache). 2725 */ 2726 if ((nm[0] == '.') && (nm[1] == 0)) { 2727 /* 2728 * Don't return without checking accessibility 2729 * of the directory. We only need the lock if 2730 * we are going to return it. 2731 */ 2732 if ((error = ufs_iaccess(ip, IEXEC, cr)) == 0) { 2733 VN_HOLD(dvp); 2734 *vpp = dvp; 2735 } 2736 goto out; 2737 } 2738 2739 /* 2740 * Fast path: Check the directory name lookup cache. 2741 */ 2742 if (vp = dnlc_lookup(dvp, nm)) { 2743 /* 2744 * Check accessibility of directory. 2745 */ 2746 if ((error = ufs_iaccess(ip, IEXEC, cr)) != 0) { 2747 VN_RELE(vp); 2748 goto out; 2749 } 2750 if (vp == DNLC_NO_VNODE) { 2751 VN_RELE(vp); 2752 error = ENOENT; 2753 goto out; 2754 } 2755 xip = VTOI(vp); 2756 ulp = NULL; 2757 goto fastpath; 2758 } 2759 2760 /* 2761 * Keep the idle queue from getting too long by 2762 * idling two inodes before attempting to allocate another. 2763 * This operation must be performed before entering 2764 * lockfs or a transaction. 2765 */ 2766 if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat) 2767 if ((curthread->t_flag & T_DONTBLOCK) == 0) { 2768 ins.in_lidles.value.ul += ufs_lookup_idle_count; 2769 ufs_idle_some(ufs_lookup_idle_count); 2770 } 2771 2772 ufsvfsp = ip->i_ufsvfs; 2773 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK); 2774 if (error) 2775 goto out; 2776 2777 error = ufs_dirlook(ip, nm, &xip, cr, 1); 2778 2779 fastpath: 2780 if (error == 0) { 2781 ip = xip; 2782 *vpp = ITOV(ip); 2783 2784 /* 2785 * If vnode is a device return special vnode instead. 2786 */ 2787 if (IS_DEVVP(*vpp)) { 2788 struct vnode *newvp; 2789 2790 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, 2791 cr); 2792 VN_RELE(*vpp); 2793 if (newvp == NULL) 2794 error = ENOSYS; 2795 else 2796 *vpp = newvp; 2797 } 2798 } 2799 if (ulp) { 2800 ufs_lockfs_end(ulp); 2801 } 2802 2803 out: 2804 TRACE_3(TR_FAC_UFS, TR_UFS_LOOKUP_END, 2805 "ufs_lookup_end:dvp %p name %s error %d", vpp, nm, error); 2806 return (error); 2807 } 2808 2809 static int 2810 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl, 2811 int mode, struct vnode **vpp, struct cred *cr, int flag) 2812 { 2813 struct inode *ip; 2814 struct inode *xip; 2815 struct inode *dip; 2816 struct vnode *xvp; 2817 struct ufsvfs *ufsvfsp; 2818 struct ulockfs *ulp; 2819 int error; 2820 int issync; 2821 int truncflag; 2822 int trans_size; 2823 int noentry; 2824 int defer_dip_seq_update = 0; /* need to defer update of dip->i_seq */ 2825 int retry = 1; 2826 2827 TRACE_1(TR_FAC_UFS, TR_UFS_CREATE_START, 2828 "ufs_create_start:dvp %p", dvp); 2829 2830 again: 2831 ip = VTOI(dvp); 2832 ufsvfsp = ip->i_ufsvfs; 2833 truncflag = 0; 2834 2835 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK); 2836 if (error) 2837 goto out; 2838 2839 if (ulp) { 2840 trans_size = (int)TOP_CREATE_SIZE(ip); 2841 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size); 2842 } 2843 2844 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 2845 vap->va_mode &= ~VSVTX; 2846 2847 if (*name == '\0') { 2848 /* 2849 * Null component name refers to the directory itself. 2850 */ 2851 VN_HOLD(dvp); 2852 /* 2853 * Even though this is an error case, we need to grab the 2854 * quota lock since the error handling code below is common. 2855 */ 2856 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2857 rw_enter(&ip->i_contents, RW_WRITER); 2858 error = EEXIST; 2859 } else { 2860 xip = NULL; 2861 noentry = 0; 2862 rw_enter(&ip->i_rwlock, RW_WRITER); 2863 xvp = dnlc_lookup(dvp, name); 2864 if (xvp == DNLC_NO_VNODE) { 2865 noentry = 1; 2866 VN_RELE(xvp); 2867 xvp = NULL; 2868 } 2869 if (xvp) { 2870 rw_exit(&ip->i_rwlock); 2871 if (error = ufs_iaccess(ip, IEXEC, cr)) { 2872 VN_RELE(xvp); 2873 } else { 2874 error = EEXIST; 2875 xip = VTOI(xvp); 2876 } 2877 } else { 2878 /* 2879 * Suppress file system full message if we will retry 2880 */ 2881 error = ufs_direnter_cm(ip, name, DE_CREATE, 2882 vap, &xip, cr, 2883 (noentry | (retry ? IQUIET : 0))); 2884 rw_exit(&ip->i_rwlock); 2885 } 2886 ip = xip; 2887 if (ip != NULL) { 2888 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2889 rw_enter(&ip->i_contents, RW_WRITER); 2890 } 2891 } 2892 2893 /* 2894 * If the file already exists and this is a non-exclusive create, 2895 * check permissions and allow access for non-directories. 2896 * Read-only create of an existing directory is also allowed. 2897 * We fail an exclusive create of anything which already exists. 2898 */ 2899 if (error == EEXIST) { 2900 dip = VTOI(dvp); 2901 if (excl == NONEXCL) { 2902 if ((((ip->i_mode & IFMT) == IFDIR) || 2903 ((ip->i_mode & IFMT) == IFATTRDIR)) && 2904 (mode & IWRITE)) 2905 error = EISDIR; 2906 else if (mode) 2907 error = ufs_iaccess(ip, mode, cr); 2908 else 2909 error = 0; 2910 } 2911 if (error) { 2912 rw_exit(&ip->i_contents); 2913 rw_exit(&ufsvfsp->vfs_dqrwlock); 2914 VN_RELE(ITOV(ip)); 2915 goto unlock; 2916 } 2917 /* 2918 * If the error EEXIST was set, then i_seq can not 2919 * have been updated. The sequence number interface 2920 * is defined such that a non-error VOP_CREATE must 2921 * increase the dir va_seq it by at least one. If we 2922 * have cleared the error, increase i_seq. Note that 2923 * we are increasing the dir i_seq and in rare cases 2924 * ip may actually be from the dvp, so we already have 2925 * the locks and it will not be subject to truncation. 2926 * In case we have to update i_seq of the parent 2927 * directory dip, we have to defer it till we have 2928 * released our locks on ip due to lock ordering requirements. 2929 */ 2930 if (ip != dip) 2931 defer_dip_seq_update = 1; 2932 else 2933 ip->i_seq++; 2934 2935 if (((ip->i_mode & IFMT) == IFREG) && 2936 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 2937 /* 2938 * Truncate regular files, if requested by caller. 2939 * Grab i_rwlock to make sure no one else is 2940 * currently writing to the file (we promised 2941 * bmap we would do this). 2942 * Must get the locks in the correct order. 2943 */ 2944 if (ip->i_size == 0) { 2945 ip->i_flag |= ICHG | IUPD; 2946 ip->i_seq++; 2947 TRANS_INODE(ufsvfsp, ip); 2948 } else { 2949 /* 2950 * Large Files: Why this check here? 2951 * Though we do it in vn_create() we really 2952 * want to guarantee that we do not destroy 2953 * Large file data by atomically checking 2954 * the size while holding the contents 2955 * lock. 2956 */ 2957 if (flag && !(flag & FOFFMAX) && 2958 ((ip->i_mode & IFMT) == IFREG) && 2959 (ip->i_size > (offset_t)MAXOFF32_T)) { 2960 rw_exit(&ip->i_contents); 2961 rw_exit(&ufsvfsp->vfs_dqrwlock); 2962 error = EOVERFLOW; 2963 goto unlock; 2964 } 2965 if (TRANS_ISTRANS(ufsvfsp)) 2966 truncflag++; 2967 else { 2968 rw_exit(&ip->i_contents); 2969 rw_exit(&ufsvfsp->vfs_dqrwlock); 2970 rw_enter(&ip->i_rwlock, RW_WRITER); 2971 rw_enter(&ufsvfsp->vfs_dqrwlock, 2972 RW_READER); 2973 rw_enter(&ip->i_contents, RW_WRITER); 2974 (void) ufs_itrunc(ip, (u_offset_t)0, 0, 2975 cr); 2976 rw_exit(&ip->i_rwlock); 2977 } 2978 } 2979 } 2980 } 2981 2982 if (error) { 2983 if (ip != NULL) { 2984 rw_exit(&ufsvfsp->vfs_dqrwlock); 2985 rw_exit(&ip->i_contents); 2986 } 2987 goto unlock; 2988 } 2989 2990 *vpp = ITOV(ip); 2991 ITIMES(ip); 2992 rw_exit(&ip->i_contents); 2993 rw_exit(&ufsvfsp->vfs_dqrwlock); 2994 2995 /* 2996 * If vnode is a device return special vnode instead. 2997 */ 2998 if (!error && IS_DEVVP(*vpp)) { 2999 struct vnode *newvp; 3000 3001 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 3002 VN_RELE(*vpp); 3003 if (newvp == NULL) { 3004 error = ENOSYS; 3005 goto unlock; 3006 } 3007 truncflag = 0; 3008 *vpp = newvp; 3009 } 3010 unlock: 3011 3012 /* 3013 * Do the deferred update of the parent directory's sequence 3014 * number now. 3015 */ 3016 if (defer_dip_seq_update == 1) { 3017 rw_enter(&dip->i_contents, RW_READER); 3018 mutex_enter(&dip->i_tlock); 3019 dip->i_seq++; 3020 mutex_exit(&dip->i_tlock); 3021 rw_exit(&dip->i_contents); 3022 } 3023 3024 if (ulp) { 3025 int terr = 0; 3026 3027 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE, 3028 trans_size); 3029 3030 /* 3031 * If we haven't had a more interesting failure 3032 * already, then anything that might've happened 3033 * here should be reported. 3034 */ 3035 if (error == 0) 3036 error = terr; 3037 } 3038 3039 if (!error && truncflag) { 3040 rw_enter(&ip->i_rwlock, RW_WRITER); 3041 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr); 3042 rw_exit(&ip->i_rwlock); 3043 } 3044 3045 if (ulp) 3046 ufs_lockfs_end(ulp); 3047 3048 /* 3049 * If no inodes available, try to free one up out of the 3050 * pending delete queue. 3051 */ 3052 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3053 ufs_delete_drain_wait(ufsvfsp, 1); 3054 retry = 0; 3055 goto again; 3056 } 3057 3058 out: 3059 TRACE_3(TR_FAC_UFS, TR_UFS_CREATE_END, 3060 "ufs_create_end:dvp %p name %s error %d", vpp, name, error); 3061 return (error); 3062 } 3063 3064 extern int ufs_idle_max; 3065 /*ARGSUSED*/ 3066 static int 3067 ufs_remove(struct vnode *vp, char *nm, struct cred *cr) 3068 { 3069 struct inode *ip = VTOI(vp); 3070 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3071 struct ulockfs *ulp; 3072 vnode_t *rmvp = NULL; /* Vnode corresponding to name being removed */ 3073 int error; 3074 int issync; 3075 int trans_size; 3076 3077 TRACE_1(TR_FAC_UFS, TR_UFS_REMOVE_START, 3078 "ufs_remove_start:vp %p", vp); 3079 3080 /* 3081 * don't let the delete queue get too long 3082 */ 3083 if (ufsvfsp == NULL) { 3084 error = EIO; 3085 goto out; 3086 } 3087 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3088 ufs_delete_drain(vp->v_vfsp, 1, 1); 3089 3090 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK); 3091 if (error) 3092 goto out; 3093 3094 if (ulp) 3095 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 3096 trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp))); 3097 3098 rw_enter(&ip->i_rwlock, RW_WRITER); 3099 error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0, 3100 DR_REMOVE, cr, &rmvp); 3101 rw_exit(&ip->i_rwlock); 3102 3103 if (ulp) { 3104 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size); 3105 ufs_lockfs_end(ulp); 3106 } 3107 3108 /* 3109 * This must be called after the remove transaction is closed. 3110 */ 3111 if (rmvp != NULL) { 3112 /* Only send the event if there were no errors */ 3113 if (error == 0) 3114 vnevent_remove(rmvp); 3115 VN_RELE(rmvp); 3116 } 3117 out: 3118 TRACE_3(TR_FAC_UFS, TR_UFS_REMOVE_END, 3119 "ufs_remove_end:vp %p name %s error %d", vp, nm, error); 3120 return (error); 3121 } 3122 3123 /* 3124 * Link a file or a directory. Only privileged processes are allowed to 3125 * make links to directories. 3126 */ 3127 static int 3128 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr) 3129 { 3130 struct inode *sip; 3131 struct inode *tdp = VTOI(tdvp); 3132 struct ufsvfs *ufsvfsp = tdp->i_ufsvfs; 3133 struct ulockfs *ulp; 3134 struct vnode *realvp; 3135 int error; 3136 int issync; 3137 int trans_size; 3138 int isdev; 3139 3140 TRACE_1(TR_FAC_UFS, TR_UFS_LINK_START, 3141 "ufs_link_start:tdvp %p", tdvp); 3142 3143 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK); 3144 if (error) 3145 goto out; 3146 3147 if (ulp) 3148 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK, 3149 trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp))); 3150 3151 if (VOP_REALVP(svp, &realvp) == 0) 3152 svp = realvp; 3153 3154 /* 3155 * Make sure link for extended attributes is valid 3156 * We only support hard linking of attr in ATTRDIR to ATTRDIR 3157 * 3158 * Make certain we don't attempt to look at a device node as 3159 * a ufs inode. 3160 */ 3161 3162 isdev = IS_DEVVP(svp); 3163 if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) && 3164 ((tdp->i_mode & IFMT) == IFATTRDIR)) || 3165 ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) && 3166 ((tdp->i_mode & IFMT) == IFDIR))) { 3167 error = EINVAL; 3168 goto unlock; 3169 } 3170 3171 sip = VTOI(svp); 3172 if ((svp->v_type == VDIR && 3173 secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) || 3174 (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) { 3175 error = EPERM; 3176 goto unlock; 3177 } 3178 rw_enter(&tdp->i_rwlock, RW_WRITER); 3179 error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0, 3180 sip, cr, NULL); 3181 rw_exit(&tdp->i_rwlock); 3182 3183 unlock: 3184 if (ulp) { 3185 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size); 3186 ufs_lockfs_end(ulp); 3187 } 3188 out: 3189 TRACE_2(TR_FAC_UFS, TR_UFS_LINK_END, 3190 "ufs_link_end:tdvp %p error %d", tdvp, error); 3191 return (error); 3192 } 3193 3194 uint64_t ufs_rename_retry_cnt; 3195 uint64_t ufs_rename_upgrade_retry_cnt; 3196 uint64_t ufs_rename_dircheck_retry_cnt; 3197 clock_t ufs_rename_backoff_delay = 1; 3198 3199 /* 3200 * Rename a file or directory. 3201 * We are given the vnode and entry string of the source and the 3202 * vnode and entry string of the place we want to move the source 3203 * to (the target). The essential operation is: 3204 * unlink(target); 3205 * link(source, target); 3206 * unlink(source); 3207 * but "atomically". Can't do full commit without saving state in 3208 * the inode on disk, which isn't feasible at this time. Best we 3209 * can do is always guarantee that the TARGET exists. 3210 */ 3211 3212 /*ARGSUSED*/ 3213 static int 3214 ufs_rename( 3215 struct vnode *sdvp, /* old (source) parent vnode */ 3216 char *snm, /* old (source) entry name */ 3217 struct vnode *tdvp, /* new (target) parent vnode */ 3218 char *tnm, /* new (target) entry name */ 3219 struct cred *cr) 3220 { 3221 struct inode *sip = NULL; /* source inode */ 3222 struct inode *ip = NULL; /* check inode */ 3223 struct inode *sdp; /* old (source) parent inode */ 3224 struct inode *tdp; /* new (target) parent inode */ 3225 struct vnode *tvp = NULL; /* target vnode, if it exists */ 3226 struct vnode *realvp; 3227 struct ufsvfs *ufsvfsp; 3228 struct ulockfs *ulp; 3229 struct ufs_slot slot; 3230 timestruc_t now; 3231 int error; 3232 int issync; 3233 int trans_size; 3234 3235 TRACE_1(TR_FAC_UFS, TR_UFS_RENAME_START, 3236 "ufs_rename_start:sdvp %p", sdvp); 3237 3238 3239 sdp = VTOI(sdvp); 3240 slot.fbp = NULL; 3241 ufsvfsp = sdp->i_ufsvfs; 3242 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); 3243 if (error) 3244 goto out; 3245 3246 if (ulp) 3247 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, 3248 trans_size = (int)TOP_RENAME_SIZE(sdp)); 3249 3250 if (VOP_REALVP(tdvp, &realvp) == 0) 3251 tdvp = realvp; 3252 3253 tdp = VTOI(tdvp); 3254 3255 3256 /* 3257 * We only allow renaming of attributes from ATTRDIR to ATTRDIR. 3258 */ 3259 if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) { 3260 error = EINVAL; 3261 goto unlock; 3262 } 3263 3264 /* 3265 * Look up inode of file we're supposed to rename. 3266 */ 3267 gethrestime(&now); 3268 if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) { 3269 goto unlock; 3270 } 3271 3272 /* 3273 * Lock both the source and target directories (they may be 3274 * the same) to provide the atomicity semantics that was 3275 * previously provided by the per file system vfs_rename_lock 3276 * 3277 * with vfs_rename_lock removed to allow simultaneous renames 3278 * within a file system, ufs_dircheckpath can deadlock while 3279 * traversing back to ensure that source is not a parent directory 3280 * of target parent directory. This is because we get into 3281 * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER. 3282 * If the tdp and sdp of the simultaneous renames happen to be 3283 * in the path of each other, it can lead to a deadlock. This 3284 * can be avoided by getting the locks as RW_READER here and then 3285 * upgrading to RW_WRITER after completing the ufs_dircheckpath. 3286 */ 3287 retry: 3288 rw_enter(&tdp->i_rwlock, RW_READER); 3289 if (tdp != sdp) { 3290 /* 3291 * We're locking 2 peer level locks, so must use tryenter 3292 * on the 2nd to avoid deadlocks that would occur 3293 * if we renamed a->b and b->a concurrently. 3294 */ 3295 if (!rw_tryenter(&sdp->i_rwlock, RW_READER)) { 3296 /* 3297 * Reverse the lock grabs in case we have heavy 3298 * contention on the 2nd lock. 3299 */ 3300 rw_exit(&tdp->i_rwlock); 3301 rw_enter(&sdp->i_rwlock, RW_READER); 3302 if (!rw_tryenter(&tdp->i_rwlock, RW_READER)) { 3303 ufs_rename_retry_cnt++; 3304 rw_exit(&sdp->i_rwlock); 3305 goto retry; 3306 } 3307 } 3308 } 3309 3310 if (sip == tdp) { 3311 error = EINVAL; 3312 goto errout; 3313 } 3314 /* 3315 * Make sure we can delete the source entry. This requires 3316 * write permission on the containing directory. 3317 * Check for sticky directories. 3318 */ 3319 rw_enter(&sdp->i_contents, RW_READER); 3320 rw_enter(&sip->i_contents, RW_READER); 3321 if ((error = ufs_iaccess(sdp, IWRITE, cr)) != 0 || 3322 (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) { 3323 rw_exit(&sip->i_contents); 3324 rw_exit(&sdp->i_contents); 3325 goto errout; 3326 } 3327 3328 /* 3329 * If this is a rename of a directory and the parent is 3330 * different (".." must be changed), then the source 3331 * directory must not be in the directory hierarchy 3332 * above the target, as this would orphan everything 3333 * below the source directory. Also the user must have 3334 * write permission in the source so as to be able to 3335 * change "..". 3336 */ 3337 if ((((sip->i_mode & IFMT) == IFDIR) || 3338 ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) { 3339 ino_t inum; 3340 3341 if ((error = ufs_iaccess(sip, IWRITE, cr))) { 3342 rw_exit(&sip->i_contents); 3343 rw_exit(&sdp->i_contents); 3344 goto errout; 3345 } 3346 inum = sip->i_number; 3347 rw_exit(&sip->i_contents); 3348 rw_exit(&sdp->i_contents); 3349 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) { 3350 /* 3351 * If we got EAGAIN ufs_dircheckpath detected a 3352 * potential deadlock and backed out. We need 3353 * to retry the operation since sdp and tdp have 3354 * to be released to avoid the deadlock. 3355 */ 3356 if (error == EAGAIN) { 3357 rw_exit(&tdp->i_rwlock); 3358 if (tdp != sdp) 3359 rw_exit(&sdp->i_rwlock); 3360 delay(ufs_rename_backoff_delay); 3361 ufs_rename_dircheck_retry_cnt++; 3362 goto retry; 3363 } 3364 goto errout; 3365 } 3366 } else { 3367 rw_exit(&sip->i_contents); 3368 rw_exit(&sdp->i_contents); 3369 } 3370 3371 3372 /* 3373 * Check for renaming '.' or '..' or alias of '.' 3374 */ 3375 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) { 3376 error = EINVAL; 3377 goto errout; 3378 } 3379 3380 /* 3381 * Simultaneous renames can deadlock in ufs_dircheckpath since it 3382 * tries to traverse back the file tree with both tdp and sdp held 3383 * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks 3384 * as RW_READERS till ufs_dircheckpath is done. 3385 * Now that ufs_dircheckpath is done with, we can upgrade the locks 3386 * to RW_WRITER. 3387 */ 3388 if (!rw_tryupgrade(&tdp->i_rwlock)) { 3389 /* 3390 * The upgrade failed. We got to give away the lock 3391 * as to avoid deadlocking with someone else who is 3392 * waiting for writer lock. With the lock gone, we 3393 * cannot be sure the checks done above will hold 3394 * good when we eventually get them back as writer. 3395 * So if we can't upgrade we drop the locks and retry 3396 * everything again. 3397 */ 3398 rw_exit(&tdp->i_rwlock); 3399 if (tdp != sdp) 3400 rw_exit(&sdp->i_rwlock); 3401 delay(ufs_rename_backoff_delay); 3402 ufs_rename_upgrade_retry_cnt++; 3403 goto retry; 3404 } 3405 if (tdp != sdp) { 3406 if (!rw_tryupgrade(&sdp->i_rwlock)) { 3407 /* 3408 * The upgrade failed. We got to give away the lock 3409 * as to avoid deadlocking with someone else who is 3410 * waiting for writer lock. With the lock gone, we 3411 * cannot be sure the checks done above will hold 3412 * good when we eventually get them back as writer. 3413 * So if we can't upgrade we drop the locks and retry 3414 * everything again. 3415 */ 3416 rw_exit(&tdp->i_rwlock); 3417 rw_exit(&sdp->i_rwlock); 3418 delay(ufs_rename_backoff_delay); 3419 ufs_rename_upgrade_retry_cnt++; 3420 goto retry; 3421 } 3422 } 3423 3424 /* 3425 * Now that all the locks are held check to make sure another thread 3426 * didn't slip in and take out the sip. 3427 */ 3428 slot.status = NONE; 3429 if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec || 3430 sip->i_ctime.tv_sec > now.tv_sec) { 3431 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 3432 rw_enter(&sdp->i_contents, RW_WRITER); 3433 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot, 3434 &ip, cr, 0); 3435 rw_exit(&sdp->i_contents); 3436 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock); 3437 if (error) { 3438 goto errout; 3439 } 3440 if (ip == NULL) { 3441 error = ENOENT; 3442 goto errout; 3443 } else { 3444 /* 3445 * If the inode was found need to drop the v_count 3446 * so as not to keep the filesystem from being 3447 * unmounted at a later time. 3448 */ 3449 VN_RELE(ITOV(ip)); 3450 } 3451 3452 /* 3453 * Release the slot.fbp that has the page mapped and 3454 * locked SE_SHARED, and could be used in in 3455 * ufs_direnter_lr() which needs to get the SE_EXCL lock 3456 * on said page. 3457 */ 3458 if (slot.fbp) { 3459 fbrelse(slot.fbp, S_OTHER); 3460 slot.fbp = NULL; 3461 } 3462 } 3463 3464 /* 3465 * Link source to the target. If a target exists, return its 3466 * vnode pointer in tvp. We'll release it after sending the 3467 * vnevent. 3468 */ 3469 if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) { 3470 /* 3471 * ESAME isn't really an error; it indicates that the 3472 * operation should not be done because the source and target 3473 * are the same file, but that no error should be reported. 3474 */ 3475 if (error == ESAME) 3476 error = 0; 3477 goto errout; 3478 } 3479 3480 /* 3481 * Unlink the source. 3482 * Remove the source entry. ufs_dirremove() checks that the entry 3483 * still reflects sip, and returns an error if it doesn't. 3484 * If the entry has changed just forget about it. Release 3485 * the source inode. 3486 */ 3487 if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0, 3488 DR_RENAME, cr, NULL)) == ENOENT) 3489 error = 0; 3490 3491 errout: 3492 if (slot.fbp) 3493 fbrelse(slot.fbp, S_OTHER); 3494 3495 rw_exit(&tdp->i_rwlock); 3496 if (sdp != tdp) { 3497 rw_exit(&sdp->i_rwlock); 3498 } 3499 3500 unlock: 3501 if (ulp) { 3502 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); 3503 ufs_lockfs_end(ulp); 3504 } 3505 3506 /* 3507 * If no errors, send the appropriate events on the source 3508 * and destination (a.k.a, target) vnodes, if they exist. 3509 * This has to be done after the rename transaction has closed. 3510 */ 3511 if (error == 0) { 3512 if (tvp != NULL) 3513 vnevent_rename_dest(tvp); 3514 /* 3515 * Note that if ufs_direnter_lr() returned ESAME then 3516 * this event will still be sent. This isn't expected 3517 * to be a problem for anticipated usage by consumers. 3518 */ 3519 if (sip != NULL) 3520 vnevent_rename_src(ITOV(sip)); 3521 } 3522 3523 if (tvp != NULL) 3524 VN_RELE(tvp); 3525 3526 if (sip != NULL) 3527 VN_RELE(ITOV(sip)); 3528 3529 out: 3530 TRACE_5(TR_FAC_UFS, TR_UFS_RENAME_END, 3531 "ufs_rename_end:sdvp %p snm %s tdvp %p tnm %s error %d", 3532 sdvp, snm, tdvp, tnm, error); 3533 return (error); 3534 } 3535 3536 /*ARGSUSED*/ 3537 static int 3538 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap, 3539 struct vnode **vpp, struct cred *cr) 3540 { 3541 struct inode *ip; 3542 struct inode *xip; 3543 struct ufsvfs *ufsvfsp; 3544 struct ulockfs *ulp; 3545 int error; 3546 int issync; 3547 int trans_size; 3548 int retry = 1; 3549 3550 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 3551 3552 TRACE_1(TR_FAC_UFS, TR_UFS_MKDIR_START, 3553 "ufs_mkdir_start:dvp %p", dvp); 3554 3555 /* 3556 * Can't make directory in attr hidden dir 3557 */ 3558 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3559 return (EINVAL); 3560 3561 again: 3562 ip = VTOI(dvp); 3563 ufsvfsp = ip->i_ufsvfs; 3564 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3565 if (error) 3566 goto out; 3567 if (ulp) 3568 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, 3569 trans_size = (int)TOP_MKDIR_SIZE(ip)); 3570 3571 rw_enter(&ip->i_rwlock, RW_WRITER); 3572 3573 error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr, 3574 (retry ? IQUIET : 0)); 3575 3576 rw_exit(&ip->i_rwlock); 3577 if (error == 0) { 3578 ip = xip; 3579 *vpp = ITOV(ip); 3580 } else if (error == EEXIST) 3581 VN_RELE(ITOV(xip)); 3582 3583 if (ulp) { 3584 int terr = 0; 3585 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size); 3586 ufs_lockfs_end(ulp); 3587 if (error == 0) 3588 error = terr; 3589 } 3590 out: 3591 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3592 ufs_delete_drain_wait(ufsvfsp, 1); 3593 retry = 0; 3594 goto again; 3595 } 3596 3597 TRACE_2(TR_FAC_UFS, TR_UFS_MKDIR_END, 3598 "ufs_mkdir_end:dvp %p error %d", dvp, error); 3599 return (error); 3600 } 3601 3602 /*ARGSUSED*/ 3603 static int 3604 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr) 3605 { 3606 struct inode *ip = VTOI(vp); 3607 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3608 struct ulockfs *ulp; 3609 vnode_t *rmvp = NULL; /* Vnode of removed directory */ 3610 int error; 3611 int issync; 3612 3613 TRACE_1(TR_FAC_UFS, TR_UFS_RMDIR_START, 3614 "ufs_rmdir_start:vp %p", vp); 3615 3616 /* 3617 * don't let the delete queue get too long 3618 */ 3619 if (ufsvfsp == NULL) { 3620 error = EIO; 3621 goto out; 3622 } 3623 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3624 ufs_delete_drain(vp->v_vfsp, 1, 1); 3625 3626 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK); 3627 if (error) 3628 goto out; 3629 3630 if (ulp) 3631 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR, TOP_RMDIR_SIZE); 3632 3633 rw_enter(&ip->i_rwlock, RW_WRITER); 3634 error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr, 3635 &rmvp); 3636 rw_exit(&ip->i_rwlock); 3637 3638 if (ulp) { 3639 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR, 3640 TOP_RMDIR_SIZE); 3641 ufs_lockfs_end(ulp); 3642 } 3643 3644 /* 3645 * This must be done AFTER the rmdir transaction has closed. 3646 */ 3647 if (rmvp != NULL) { 3648 /* Only send the event if there were no errors */ 3649 if (error == 0) 3650 vnevent_rmdir(rmvp); 3651 VN_RELE(rmvp); 3652 } 3653 out: 3654 TRACE_2(TR_FAC_UFS, TR_UFS_RMDIR_END, 3655 "ufs_rmdir_end:vp %p error %d", vp, error); 3656 3657 return (error); 3658 } 3659 3660 /* ARGSUSED */ 3661 static int 3662 ufs_readdir( 3663 struct vnode *vp, 3664 struct uio *uiop, 3665 struct cred *cr, 3666 int *eofp) 3667 { 3668 struct iovec *iovp; 3669 struct inode *ip; 3670 struct direct *idp; 3671 struct dirent64 *odp; 3672 struct fbuf *fbp; 3673 struct ufsvfs *ufsvfsp; 3674 struct ulockfs *ulp; 3675 caddr_t outbuf; 3676 size_t bufsize; 3677 uint_t offset; 3678 uint_t bytes_wanted, total_bytes_wanted; 3679 int incount = 0; 3680 int outcount = 0; 3681 int error; 3682 3683 ip = VTOI(vp); 3684 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 3685 3686 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_START, 3687 "ufs_readdir_start:vp %p uiop %p", vp, uiop); 3688 3689 if (uiop->uio_loffset >= MAXOFF32_T) { 3690 if (eofp) 3691 *eofp = 1; 3692 return (0); 3693 } 3694 3695 /* 3696 * Check if we have been called with a valid iov_len 3697 * and bail out if not, otherwise we may potentially loop 3698 * forever further down. 3699 */ 3700 if (uiop->uio_iov->iov_len <= 0) { 3701 error = EINVAL; 3702 goto out; 3703 } 3704 3705 /* 3706 * Large Files: When we come here we are guaranteed that 3707 * uio_offset can be used safely. The high word is zero. 3708 */ 3709 3710 ufsvfsp = ip->i_ufsvfs; 3711 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK); 3712 if (error) 3713 goto out; 3714 3715 iovp = uiop->uio_iov; 3716 total_bytes_wanted = iovp->iov_len; 3717 3718 /* Large Files: directory files should not be "large" */ 3719 3720 ASSERT(ip->i_size <= MAXOFF32_T); 3721 3722 /* Force offset to be valid (to guard against bogus lseek() values) */ 3723 offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1); 3724 3725 /* Quit if at end of file or link count of zero (posix) */ 3726 if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) { 3727 if (eofp) 3728 *eofp = 1; 3729 error = 0; 3730 goto unlock; 3731 } 3732 3733 /* 3734 * Get space to change directory entries into fs independent format. 3735 * Do fast alloc for the most commonly used-request size (filesystem 3736 * block size). 3737 */ 3738 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) { 3739 bufsize = total_bytes_wanted; 3740 outbuf = kmem_alloc(bufsize, KM_SLEEP); 3741 odp = (struct dirent64 *)outbuf; 3742 } else { 3743 bufsize = total_bytes_wanted; 3744 odp = (struct dirent64 *)iovp->iov_base; 3745 } 3746 3747 nextblk: 3748 bytes_wanted = total_bytes_wanted; 3749 3750 /* Truncate request to file size */ 3751 if (offset + bytes_wanted > (int)ip->i_size) 3752 bytes_wanted = (int)(ip->i_size - offset); 3753 3754 /* Comply with MAXBSIZE boundary restrictions of fbread() */ 3755 if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE) 3756 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET); 3757 3758 /* 3759 * Read in the next chunk. 3760 * We are still holding the i_rwlock. 3761 */ 3762 error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp); 3763 3764 if (error) 3765 goto update_inode; 3766 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) && 3767 (!ufsvfsp->vfs_noatime)) { 3768 ip->i_flag |= IACC; 3769 } 3770 incount = 0; 3771 idp = (struct direct *)fbp->fb_addr; 3772 if (idp->d_ino == 0 && idp->d_reclen == 0 && 3773 idp->d_namlen == 0) { 3774 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, " 3775 "fs = %s\n", 3776 (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt); 3777 fbrelse(fbp, S_OTHER); 3778 error = ENXIO; 3779 goto update_inode; 3780 } 3781 /* Transform to file-system independent format */ 3782 while (incount < bytes_wanted) { 3783 /* 3784 * If the current directory entry is mangled, then skip 3785 * to the next block. It would be nice to set the FSBAD 3786 * flag in the super-block so that a fsck is forced on 3787 * next reboot, but locking is a problem. 3788 */ 3789 if (idp->d_reclen & 0x3) { 3790 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3791 break; 3792 } 3793 3794 /* Skip to requested offset and skip empty entries */ 3795 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) { 3796 ushort_t this_reclen = 3797 DIRENT64_RECLEN(idp->d_namlen); 3798 /* Buffer too small for any entries */ 3799 if (!outcount && this_reclen > bufsize) { 3800 fbrelse(fbp, S_OTHER); 3801 error = EINVAL; 3802 goto update_inode; 3803 } 3804 /* If would overrun the buffer, quit */ 3805 if (outcount + this_reclen > bufsize) { 3806 break; 3807 } 3808 /* Take this entry */ 3809 odp->d_ino = (ino64_t)idp->d_ino; 3810 odp->d_reclen = (ushort_t)this_reclen; 3811 odp->d_off = (offset_t)(offset + idp->d_reclen); 3812 3813 /* use strncpy(9f) to zero out uninitialized bytes */ 3814 3815 ASSERT(strlen(idp->d_name) + 1 <= 3816 DIRENT64_NAMELEN(this_reclen)); 3817 (void) strncpy(odp->d_name, idp->d_name, 3818 DIRENT64_NAMELEN(this_reclen)); 3819 outcount += odp->d_reclen; 3820 odp = (struct dirent64 *)((intptr_t)odp + 3821 odp->d_reclen); 3822 ASSERT(outcount <= bufsize); 3823 } 3824 if (idp->d_reclen) { 3825 incount += idp->d_reclen; 3826 offset += idp->d_reclen; 3827 idp = (struct direct *)((intptr_t)idp + idp->d_reclen); 3828 } else { 3829 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3830 break; 3831 } 3832 } 3833 /* Release the chunk */ 3834 fbrelse(fbp, S_OTHER); 3835 3836 /* Read whole block, but got no entries, read another if not eof */ 3837 3838 /* 3839 * Large Files: casting i_size to int here is not a problem 3840 * because directory sizes are always less than MAXOFF32_T. 3841 * See assertion above. 3842 */ 3843 3844 if (offset < (int)ip->i_size && !outcount) 3845 goto nextblk; 3846 3847 /* Copy out the entry data */ 3848 if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) { 3849 iovp->iov_base += outcount; 3850 iovp->iov_len -= outcount; 3851 uiop->uio_resid -= outcount; 3852 uiop->uio_offset = offset; 3853 } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, 3854 uiop)) == 0) 3855 uiop->uio_offset = offset; 3856 update_inode: 3857 ITIMES(ip); 3858 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) 3859 kmem_free(outbuf, bufsize); 3860 3861 if (eofp && error == 0) 3862 *eofp = (uiop->uio_offset >= (int)ip->i_size); 3863 unlock: 3864 if (ulp) { 3865 ufs_lockfs_end(ulp); 3866 } 3867 out: 3868 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_END, 3869 "ufs_readdir_end:vp %p error %d", vp, error); 3870 return (error); 3871 } 3872 3873 /*ARGSUSED*/ 3874 static int 3875 ufs_symlink( 3876 struct vnode *dvp, /* ptr to parent dir vnode */ 3877 char *linkname, /* name of symbolic link */ 3878 struct vattr *vap, /* attributes */ 3879 char *target, /* target path */ 3880 struct cred *cr) /* user credentials */ 3881 { 3882 struct inode *ip, *dip = VTOI(dvp); 3883 struct ufsvfs *ufsvfsp = dip->i_ufsvfs; 3884 struct ulockfs *ulp; 3885 int error; 3886 int issync; 3887 int trans_size; 3888 int residual; 3889 int ioflag; 3890 int retry = 1; 3891 3892 TRACE_1(TR_FAC_UFS, TR_UFS_SYMLINK_START, 3893 "ufs_symlink_start:dvp %p", dvp); 3894 3895 /* 3896 * No symlinks in attrdirs at this time 3897 */ 3898 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3899 return (EINVAL); 3900 3901 again: 3902 ip = (struct inode *)NULL; 3903 vap->va_type = VLNK; 3904 vap->va_rdev = 0; 3905 3906 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK); 3907 if (error) 3908 goto out; 3909 3910 if (ulp) 3911 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK, 3912 trans_size = (int)TOP_SYMLINK_SIZE(dip)); 3913 3914 /* 3915 * We must create the inode before the directory entry, to avoid 3916 * racing with readlink(). ufs_dirmakeinode requires that we 3917 * hold the quota lock as reader, and directory locks as writer. 3918 */ 3919 3920 rw_enter(&dip->i_rwlock, RW_WRITER); 3921 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3922 rw_enter(&dip->i_contents, RW_WRITER); 3923 3924 /* 3925 * Suppress any out of inodes messages if we will retry on 3926 * ENOSP 3927 */ 3928 if (retry) 3929 dip->i_flag |= IQUIET; 3930 3931 error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr); 3932 3933 dip->i_flag &= ~IQUIET; 3934 3935 rw_exit(&dip->i_contents); 3936 rw_exit(&ufsvfsp->vfs_dqrwlock); 3937 rw_exit(&dip->i_rwlock); 3938 3939 if (error) 3940 goto unlock; 3941 3942 /* 3943 * OK. The inode has been created. Write out the data of the 3944 * symbolic link. Since symbolic links are metadata, and should 3945 * remain consistent across a system crash, we need to force the 3946 * data out synchronously. 3947 * 3948 * (This is a change from the semantics in earlier releases, which 3949 * only created symbolic links synchronously if the semi-documented 3950 * 'syncdir' option was set, or if we were being invoked by the NFS 3951 * server, which requires symbolic links to be created synchronously.) 3952 * 3953 * We need to pass in a pointer for the residual length; otherwise 3954 * ufs_rdwri() will always return EIO if it can't write the data, 3955 * even if the error was really ENOSPC or EDQUOT. 3956 */ 3957 3958 ioflag = FWRITE | FDSYNC; 3959 residual = 0; 3960 3961 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3962 rw_enter(&ip->i_contents, RW_WRITER); 3963 3964 /* 3965 * Suppress file system full messages if we will retry 3966 */ 3967 if (retry) 3968 ip->i_flag |= IQUIET; 3969 3970 error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target), 3971 (offset_t)0, UIO_SYSSPACE, &residual, cr); 3972 3973 ip->i_flag &= ~IQUIET; 3974 3975 if (error) { 3976 rw_exit(&ip->i_contents); 3977 rw_exit(&ufsvfsp->vfs_dqrwlock); 3978 goto remove; 3979 } 3980 3981 /* 3982 * If the link's data is small enough, we can cache it in the inode. 3983 * This is a "fast symbolic link". We don't use the first direct 3984 * block because that's actually used to point at the symbolic link's 3985 * contents on disk; but we know that none of the other direct or 3986 * indirect blocks can be used because symbolic links are restricted 3987 * to be smaller than a file system block. 3988 */ 3989 3990 ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip))); 3991 3992 if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) { 3993 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) { 3994 ip->i_flag |= IFASTSYMLNK; 3995 } else { 3996 int i; 3997 /* error, clear garbage left behind */ 3998 for (i = 1; i < NDADDR; i++) 3999 ip->i_db[i] = 0; 4000 for (i = 0; i < NIADDR; i++) 4001 ip->i_ib[i] = 0; 4002 } 4003 } 4004 4005 rw_exit(&ip->i_contents); 4006 rw_exit(&ufsvfsp->vfs_dqrwlock); 4007 4008 /* 4009 * OK. We've successfully created the symbolic link. All that 4010 * remains is to insert it into the appropriate directory. 4011 */ 4012 4013 rw_enter(&dip->i_rwlock, RW_WRITER); 4014 error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL); 4015 rw_exit(&dip->i_rwlock); 4016 4017 /* 4018 * Fall through into remove-on-error code. We're either done, or we 4019 * need to remove the inode (if we couldn't insert it). 4020 */ 4021 4022 remove: 4023 if (error && (ip != NULL)) { 4024 rw_enter(&ip->i_contents, RW_WRITER); 4025 ip->i_nlink--; 4026 ip->i_flag |= ICHG; 4027 ip->i_seq++; 4028 ufs_setreclaim(ip); 4029 rw_exit(&ip->i_contents); 4030 } 4031 4032 unlock: 4033 if (ip != NULL) 4034 VN_RELE(ITOV(ip)); 4035 4036 if (ulp) { 4037 int terr = 0; 4038 4039 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK, 4040 trans_size); 4041 ufs_lockfs_end(ulp); 4042 if (error == 0) 4043 error = terr; 4044 } 4045 4046 /* 4047 * We may have failed due to lack of an inode or of a block to 4048 * store the target in. Try flushing the delete queue to free 4049 * logically-available things up and try again. 4050 */ 4051 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 4052 ufs_delete_drain_wait(ufsvfsp, 1); 4053 retry = 0; 4054 goto again; 4055 } 4056 4057 out: 4058 TRACE_2(TR_FAC_UFS, TR_UFS_SYMLINK_END, 4059 "ufs_symlink_end:dvp %p error %d", dvp, error); 4060 return (error); 4061 } 4062 4063 /* 4064 * Ufs specific routine used to do ufs io. 4065 */ 4066 int 4067 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base, 4068 ssize_t len, offset_t offset, enum uio_seg seg, int *aresid, 4069 struct cred *cr) 4070 { 4071 struct uio auio; 4072 struct iovec aiov; 4073 int error; 4074 4075 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 4076 4077 bzero((caddr_t)&auio, sizeof (uio_t)); 4078 bzero((caddr_t)&aiov, sizeof (iovec_t)); 4079 4080 aiov.iov_base = base; 4081 aiov.iov_len = len; 4082 auio.uio_iov = &aiov; 4083 auio.uio_iovcnt = 1; 4084 auio.uio_loffset = offset; 4085 auio.uio_segflg = (short)seg; 4086 auio.uio_resid = len; 4087 4088 if (rw == UIO_WRITE) { 4089 auio.uio_fmode = FWRITE; 4090 auio.uio_extflg = UIO_COPY_DEFAULT; 4091 auio.uio_llimit = curproc->p_fsz_ctl; 4092 error = wrip(ip, &auio, ioflag, cr); 4093 } else { 4094 auio.uio_fmode = FREAD; 4095 auio.uio_extflg = UIO_COPY_CACHED; 4096 auio.uio_llimit = MAXOFFSET_T; 4097 error = rdip(ip, &auio, ioflag, cr); 4098 } 4099 4100 if (aresid) { 4101 *aresid = auio.uio_resid; 4102 } else if (auio.uio_resid) { 4103 error = EIO; 4104 } 4105 return (error); 4106 } 4107 4108 static int 4109 ufs_fid(vp, fidp) 4110 struct vnode *vp; 4111 struct fid *fidp; 4112 { 4113 struct ufid *ufid; 4114 struct inode *ip = VTOI(vp); 4115 4116 if (ip->i_ufsvfs == NULL) 4117 return (EIO); 4118 4119 if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) { 4120 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t); 4121 return (ENOSPC); 4122 } 4123 4124 ufid = (struct ufid *)fidp; 4125 bzero((char *)ufid, sizeof (struct ufid)); 4126 ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t); 4127 ufid->ufid_ino = ip->i_number; 4128 ufid->ufid_gen = ip->i_gen; 4129 4130 return (0); 4131 } 4132 4133 /* ARGSUSED2 */ 4134 static int 4135 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4136 { 4137 struct inode *ip = VTOI(vp); 4138 struct ufsvfs *ufsvfsp; 4139 int forcedirectio; 4140 4141 /* 4142 * Read case is easy. 4143 */ 4144 if (!write_lock) { 4145 rw_enter(&ip->i_rwlock, RW_READER); 4146 return (V_WRITELOCK_FALSE); 4147 } 4148 4149 /* 4150 * Caller has requested a writer lock, but that inhibits any 4151 * concurrency in the VOPs that follow. Acquire the lock shared 4152 * and defer exclusive access until it is known to be needed in 4153 * other VOP handlers. Some cases can be determined here. 4154 */ 4155 4156 /* 4157 * If directio is not set, there is no chance of concurrency, 4158 * so just acquire the lock exclusive. Beware of a forced 4159 * unmount before looking at the mount option. 4160 */ 4161 ufsvfsp = ip->i_ufsvfs; 4162 forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0; 4163 if (!(ip->i_flag & IDIRECTIO || forcedirectio) || 4164 !ufs_allow_shared_writes) { 4165 rw_enter(&ip->i_rwlock, RW_WRITER); 4166 return (V_WRITELOCK_TRUE); 4167 } 4168 4169 /* 4170 * Mandatory locking forces acquiring i_rwlock exclusive. 4171 */ 4172 if (MANDLOCK(vp, ip->i_mode)) { 4173 rw_enter(&ip->i_rwlock, RW_WRITER); 4174 return (V_WRITELOCK_TRUE); 4175 } 4176 4177 /* 4178 * Acquire the lock shared in case a concurrent write follows. 4179 * Mandatory locking could have become enabled before the lock 4180 * was acquired. Re-check and upgrade if needed. 4181 */ 4182 rw_enter(&ip->i_rwlock, RW_READER); 4183 if (MANDLOCK(vp, ip->i_mode)) { 4184 rw_exit(&ip->i_rwlock); 4185 rw_enter(&ip->i_rwlock, RW_WRITER); 4186 return (V_WRITELOCK_TRUE); 4187 } 4188 return (V_WRITELOCK_FALSE); 4189 } 4190 4191 /*ARGSUSED*/ 4192 static void 4193 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4194 { 4195 struct inode *ip = VTOI(vp); 4196 4197 rw_exit(&ip->i_rwlock); 4198 } 4199 4200 /* ARGSUSED */ 4201 static int 4202 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp) 4203 { 4204 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4205 } 4206 4207 /* ARGSUSED */ 4208 static int 4209 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4210 offset_t offset, struct flk_callback *flk_cbp, struct cred *cr) 4211 { 4212 struct inode *ip = VTOI(vp); 4213 4214 if (ip->i_ufsvfs == NULL) 4215 return (EIO); 4216 4217 /* 4218 * If file is being mapped, disallow frlock. 4219 * XXX I am not holding tlock while checking i_mapcnt because the 4220 * current locking strategy drops all locks before calling fs_frlock. 4221 * So, mapcnt could change before we enter fs_frlock making is 4222 * meaningless to have held tlock in the first place. 4223 */ 4224 if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode)) 4225 return (EAGAIN); 4226 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 4227 } 4228 4229 /* ARGSUSED */ 4230 static int 4231 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4232 offset_t offset, cred_t *cr, caller_context_t *ct) 4233 { 4234 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 4235 struct ulockfs *ulp; 4236 int error; 4237 4238 if ((error = convoff(vp, bfp, 0, offset)) == 0) { 4239 if (cmd == F_FREESP) { 4240 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4241 ULOCKFS_SPACE_MASK); 4242 if (error) 4243 return (error); 4244 error = ufs_freesp(vp, bfp, flag, cr); 4245 } else if (cmd == F_ALLOCSP) { 4246 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4247 ULOCKFS_FALLOCATE_MASK); 4248 if (error) 4249 return (error); 4250 error = ufs_allocsp(vp, bfp, cr); 4251 } else 4252 return (EINVAL); /* Command not handled here */ 4253 4254 if (ulp) 4255 ufs_lockfs_end(ulp); 4256 4257 } 4258 return (error); 4259 } 4260 4261 /* 4262 * Used to determine if read ahead should be done. Also used to 4263 * to determine when write back occurs. 4264 */ 4265 #define CLUSTSZ(ip) ((ip)->i_ufsvfs->vfs_ioclustsz) 4266 4267 /* 4268 * A faster version of ufs_getpage. 4269 * 4270 * We optimize by inlining the pvn_getpages iterator, eliminating 4271 * calls to bmap_read if file doesn't have UFS holes, and avoiding 4272 * the overhead of page_exists(). 4273 * 4274 * When files has UFS_HOLES and ufs_getpage is called with S_READ, 4275 * we set *protp to PROT_READ to avoid calling bmap_read. This approach 4276 * victimizes performance when a file with UFS holes is faulted 4277 * first in the S_READ mode, and then in the S_WRITE mode. We will get 4278 * two MMU faults in this case. 4279 * 4280 * XXX - the inode fields which control the sequential mode are not 4281 * protected by any mutex. The read ahead will act wild if 4282 * multiple processes will access the file concurrently and 4283 * some of them in sequential mode. One particulary bad case 4284 * is if another thread will change the value of i_nextrio between 4285 * the time this thread tests the i_nextrio value and then reads it 4286 * again to use it as the offset for the read ahead. 4287 */ 4288 static int 4289 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 4290 page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr, 4291 enum seg_rw rw, struct cred *cr) 4292 { 4293 u_offset_t uoff = (u_offset_t)off; /* type conversion */ 4294 u_offset_t pgoff; 4295 u_offset_t eoff; 4296 struct inode *ip = VTOI(vp); 4297 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 4298 struct fs *fs; 4299 struct ulockfs *ulp; 4300 page_t **pl; 4301 caddr_t pgaddr; 4302 krw_t rwtype; 4303 int err; 4304 int has_holes; 4305 int beyond_eof; 4306 int seqmode; 4307 int pgsize = PAGESIZE; 4308 int dolock; 4309 int do_qlock; 4310 int trans_size; 4311 4312 TRACE_1(TR_FAC_UFS, TR_UFS_GETPAGE_START, 4313 "ufs_getpage_start:vp %p", vp); 4314 4315 ASSERT((uoff & PAGEOFFSET) == 0); 4316 4317 if (protp) 4318 *protp = PROT_ALL; 4319 4320 /* 4321 * Obey the lockfs protocol 4322 */ 4323 err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg, 4324 rw == S_READ || rw == S_EXEC, protp); 4325 if (err) 4326 goto out; 4327 4328 fs = ufsvfsp->vfs_fs; 4329 4330 if (ulp && (rw == S_CREATE || rw == S_WRITE) && 4331 !(vp->v_flag & VISSWAP)) { 4332 /* 4333 * Try to start a transaction, will return if blocking is 4334 * expected to occur and the address space is not the 4335 * kernel address space. 4336 */ 4337 trans_size = TOP_GETPAGE_SIZE(ip); 4338 if (seg->s_as != &kas) { 4339 TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, 4340 trans_size, err) 4341 if (err == EWOULDBLOCK) { 4342 /* 4343 * Use EDEADLK here because the VM code 4344 * can normally never see this error. 4345 */ 4346 err = EDEADLK; 4347 ufs_lockfs_end(ulp); 4348 goto out; 4349 } 4350 } else { 4351 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4352 } 4353 } 4354 4355 if (vp->v_flag & VNOMAP) { 4356 err = ENOSYS; 4357 goto unlock; 4358 } 4359 4360 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 4361 4362 rwtype = RW_READER; /* start as a reader */ 4363 dolock = (rw_owner(&ip->i_contents) != curthread); 4364 /* 4365 * If this thread owns the lock, i.e., this thread grabbed it 4366 * as writer somewhere above, then we don't need to grab the 4367 * lock as reader in this routine. 4368 */ 4369 do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread); 4370 4371 retrylock: 4372 if (dolock) { 4373 /* 4374 * Grab the quota lock if we need to call 4375 * bmap_write() below (with i_contents as writer). 4376 */ 4377 if (do_qlock && rwtype == RW_WRITER) 4378 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4379 rw_enter(&ip->i_contents, rwtype); 4380 } 4381 4382 /* 4383 * We may be getting called as a side effect of a bmap using 4384 * fbread() when the blocks might be being allocated and the 4385 * size has not yet been up'ed. In this case we want to be 4386 * able to return zero pages if we get back UFS_HOLE from 4387 * calling bmap for a non write case here. We also might have 4388 * to read some frags from the disk into a page if we are 4389 * extending the number of frags for a given lbn in bmap(). 4390 * Large Files: The read of i_size here is atomic because 4391 * i_contents is held here. If dolock is zero, the lock 4392 * is held in bmap routines. 4393 */ 4394 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 4395 if (beyond_eof && seg != segkmap) { 4396 if (dolock) { 4397 rw_exit(&ip->i_contents); 4398 if (do_qlock && rwtype == RW_WRITER) 4399 rw_exit(&ufsvfsp->vfs_dqrwlock); 4400 } 4401 err = EFAULT; 4402 goto unlock; 4403 } 4404 4405 /* 4406 * Must hold i_contents lock throughout the call to pvn_getpages 4407 * since locked pages are returned from each call to ufs_getapage. 4408 * Must *not* return locked pages and then try for contents lock 4409 * due to lock ordering requirements (inode > page) 4410 */ 4411 4412 has_holes = bmap_has_holes(ip); 4413 4414 if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) { 4415 int blk_size; 4416 u_offset_t offset; 4417 4418 /* 4419 * We must acquire the RW_WRITER lock in order to 4420 * call bmap_write(). 4421 */ 4422 if (dolock && rwtype == RW_READER) { 4423 rwtype = RW_WRITER; 4424 4425 /* 4426 * Grab the quota lock before 4427 * upgrading i_contents, but if we can't grab it 4428 * don't wait here due to lock order: 4429 * vfs_dqrwlock > i_contents. 4430 */ 4431 if (do_qlock && rw_tryenter(&ufsvfsp->vfs_dqrwlock, 4432 RW_READER) == 0) { 4433 rw_exit(&ip->i_contents); 4434 goto retrylock; 4435 } 4436 if (!rw_tryupgrade(&ip->i_contents)) { 4437 rw_exit(&ip->i_contents); 4438 if (do_qlock) 4439 rw_exit(&ufsvfsp->vfs_dqrwlock); 4440 goto retrylock; 4441 } 4442 } 4443 4444 /* 4445 * May be allocating disk blocks for holes here as 4446 * a result of mmap faults. write(2) does the bmap_write 4447 * in rdip/wrip, not here. We are not dealing with frags 4448 * in this case. 4449 */ 4450 /* 4451 * Large Files: We cast fs_bmask field to offset_t 4452 * just as we do for MAXBMASK because uoff is a 64-bit 4453 * data type. fs_bmask will still be a 32-bit type 4454 * as we cannot change any ondisk data structures. 4455 */ 4456 4457 offset = uoff & (offset_t)fs->fs_bmask; 4458 while (offset < uoff + len) { 4459 blk_size = (int)blksize(fs, ip, lblkno(fs, offset)); 4460 err = bmap_write(ip, offset, blk_size, 4461 BI_NORMAL, NULL, cr); 4462 if (ip->i_flag & (ICHG|IUPD)) 4463 ip->i_seq++; 4464 if (err) 4465 goto update_inode; 4466 offset += blk_size; /* XXX - make this contig */ 4467 } 4468 } 4469 4470 /* 4471 * Can be a reader from now on. 4472 */ 4473 if (dolock && rwtype == RW_WRITER) { 4474 rw_downgrade(&ip->i_contents); 4475 /* 4476 * We can release vfs_dqrwlock early so do it, but make 4477 * sure we don't try to release it again at the bottom. 4478 */ 4479 if (do_qlock) { 4480 rw_exit(&ufsvfsp->vfs_dqrwlock); 4481 do_qlock = 0; 4482 } 4483 } 4484 4485 /* 4486 * We remove PROT_WRITE in cases when the file has UFS holes 4487 * because we don't want to call bmap_read() to check each 4488 * page if it is backed with a disk block. 4489 */ 4490 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) 4491 *protp &= ~PROT_WRITE; 4492 4493 err = 0; 4494 4495 /* 4496 * The loop looks up pages in the range [off, off + len). 4497 * For each page, we first check if we should initiate an asynchronous 4498 * read ahead before we call page_lookup (we may sleep in page_lookup 4499 * for a previously initiated disk read). 4500 */ 4501 eoff = (uoff + len); 4502 for (pgoff = uoff, pgaddr = addr, pl = plarr; 4503 pgoff < eoff; /* empty */) { 4504 page_t *pp; 4505 u_offset_t nextrio; 4506 se_t se; 4507 int retval; 4508 4509 se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED); 4510 4511 /* Handle async getpage (faultahead) */ 4512 if (plarr == NULL) { 4513 ip->i_nextrio = pgoff; 4514 (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4515 pgoff += pgsize; 4516 pgaddr += pgsize; 4517 continue; 4518 } 4519 /* 4520 * Check if we should initiate read ahead of next cluster. 4521 * We call page_exists only when we need to confirm that 4522 * we have the current page before we initiate the read ahead. 4523 */ 4524 nextrio = ip->i_nextrio; 4525 if (seqmode && 4526 pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 4527 nextrio < ip->i_size && page_exists(vp, pgoff)) { 4528 retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4529 /* 4530 * We always read ahead the next cluster of data 4531 * starting from i_nextrio. If the page (vp,nextrio) 4532 * is actually in core at this point, the routine 4533 * ufs_getpage_ra() will stop pre-fetching data 4534 * until we read that page in a synchronized manner 4535 * through ufs_getpage_miss(). So, we should increase 4536 * i_nextrio if the page (vp, nextrio) exists. 4537 */ 4538 if ((retval == 0) && page_exists(vp, nextrio)) { 4539 ip->i_nextrio = nextrio + pgsize; 4540 } 4541 } 4542 4543 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 4544 /* 4545 * We found the page in the page cache. 4546 */ 4547 *pl++ = pp; 4548 pgoff += pgsize; 4549 pgaddr += pgsize; 4550 len -= pgsize; 4551 plsz -= pgsize; 4552 } else { 4553 /* 4554 * We have to create the page, or read it from disk. 4555 */ 4556 if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr, 4557 pl, plsz, rw, seqmode)) 4558 goto error; 4559 4560 while (*pl != NULL) { 4561 pl++; 4562 pgoff += pgsize; 4563 pgaddr += pgsize; 4564 len -= pgsize; 4565 plsz -= pgsize; 4566 } 4567 } 4568 } 4569 4570 /* 4571 * Return pages up to plsz if they are in the page cache. 4572 * We cannot return pages if there is a chance that they are 4573 * backed with a UFS hole and rw is S_WRITE or S_CREATE. 4574 */ 4575 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 4576 4577 ASSERT((protp == NULL) || 4578 !(has_holes && (*protp & PROT_WRITE))); 4579 4580 eoff = pgoff + plsz; 4581 while (pgoff < eoff) { 4582 page_t *pp; 4583 4584 if ((pp = page_lookup_nowait(vp, pgoff, 4585 SE_SHARED)) == NULL) 4586 break; 4587 4588 *pl++ = pp; 4589 pgoff += pgsize; 4590 plsz -= pgsize; 4591 } 4592 } 4593 4594 if (plarr) 4595 *pl = NULL; /* Terminate page list */ 4596 ip->i_nextr = pgoff; 4597 4598 error: 4599 if (err && plarr) { 4600 /* 4601 * Release any pages we have locked. 4602 */ 4603 while (pl > &plarr[0]) 4604 page_unlock(*--pl); 4605 4606 plarr[0] = NULL; 4607 } 4608 4609 update_inode: 4610 /* 4611 * If the inode is not already marked for IACC (in rdip() for read) 4612 * and the inode is not marked for no access time update (in wrip() 4613 * for write) then update the inode access time and mod time now. 4614 */ 4615 if ((ip->i_flag & (IACC | INOACC)) == 0) { 4616 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) { 4617 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 4618 (fs->fs_ronly == 0) && 4619 (!ufsvfsp->vfs_noatime)) { 4620 mutex_enter(&ip->i_tlock); 4621 ip->i_flag |= IACC; 4622 ITIMES_NOLOCK(ip); 4623 mutex_exit(&ip->i_tlock); 4624 } 4625 } 4626 } 4627 4628 if (dolock) { 4629 rw_exit(&ip->i_contents); 4630 if (do_qlock && rwtype == RW_WRITER) 4631 rw_exit(&ufsvfsp->vfs_dqrwlock); 4632 } 4633 4634 unlock: 4635 if (ulp) { 4636 if ((rw == S_CREATE || rw == S_WRITE) && 4637 !(vp->v_flag & VISSWAP)) { 4638 TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4639 } 4640 ufs_lockfs_end(ulp); 4641 } 4642 out: 4643 TRACE_2(TR_FAC_UFS, TR_UFS_GETPAGE_END, 4644 "ufs_getpage_end:vp %p error %d", vp, err); 4645 return (err); 4646 } 4647 4648 /* 4649 * ufs_getpage_miss is called when ufs_getpage missed the page in the page 4650 * cache. The page is either read from the disk, or it's created. 4651 * A page is created (without disk read) if rw == S_CREATE, or if 4652 * the page is not backed with a real disk block (UFS hole). 4653 */ 4654 /* ARGSUSED */ 4655 static int 4656 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg, 4657 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq) 4658 { 4659 struct inode *ip = VTOI(vp); 4660 page_t *pp; 4661 daddr_t bn; 4662 size_t io_len; 4663 int crpage = 0; 4664 int err; 4665 int contig; 4666 int bsize = ip->i_fs->fs_bsize; 4667 4668 /* 4669 * Figure out whether the page can be created, or must be 4670 * must be read from the disk. 4671 */ 4672 if (rw == S_CREATE) 4673 crpage = 1; 4674 else { 4675 contig = 0; 4676 if (err = bmap_read(ip, off, &bn, &contig)) 4677 return (err); 4678 4679 crpage = (bn == UFS_HOLE); 4680 4681 /* 4682 * If its also a fallocated block that hasn't been written to 4683 * yet, we will treat it just like a UFS_HOLE and create 4684 * a zero page for it 4685 */ 4686 if (ISFALLOCBLK(ip, bn)) 4687 crpage = 1; 4688 } 4689 4690 if (crpage) { 4691 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, 4692 addr)) == NULL) { 4693 return (ufs_fault(vp, 4694 "ufs_getpage_miss: page_create == NULL")); 4695 } 4696 4697 if (rw != S_CREATE) 4698 pagezero(pp, 0, PAGESIZE); 4699 4700 io_len = PAGESIZE; 4701 } else { 4702 u_offset_t io_off; 4703 uint_t xlen; 4704 struct buf *bp; 4705 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 4706 4707 /* 4708 * If access is not in sequential order, we read from disk 4709 * in bsize units. 4710 * 4711 * We limit the size of the transfer to bsize if we are reading 4712 * from the beginning of the file. Note in this situation we 4713 * will hedge our bets and initiate an async read ahead of 4714 * the second block. 4715 */ 4716 if (!seq || off == 0) 4717 contig = MIN(contig, bsize); 4718 4719 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4720 &io_len, off, contig, 0); 4721 4722 /* 4723 * Some other thread has entered the page. 4724 * ufs_getpage will retry page_lookup. 4725 */ 4726 if (pp == NULL) { 4727 pl[0] = NULL; 4728 return (0); 4729 } 4730 4731 /* 4732 * Zero part of the page which we are not 4733 * going to read from the disk. 4734 */ 4735 xlen = io_len & PAGEOFFSET; 4736 if (xlen != 0) 4737 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4738 4739 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ); 4740 bp->b_edev = ip->i_dev; 4741 bp->b_dev = cmpdev(ip->i_dev); 4742 bp->b_blkno = bn; 4743 bp->b_un.b_addr = (caddr_t)0; 4744 bp->b_file = ip->i_vnode; 4745 bp->b_offset = off; 4746 4747 if (ufsvfsp->vfs_log) { 4748 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4749 } else if (ufsvfsp->vfs_snapshot) { 4750 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4751 } else { 4752 ufsvfsp->vfs_iotstamp = lbolt; 4753 ub.ub_getpages.value.ul++; 4754 (void) bdev_strategy(bp); 4755 lwp_stat_update(LWP_STAT_INBLK, 1); 4756 } 4757 4758 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK); 4759 4760 /* 4761 * If the file access is sequential, initiate read ahead 4762 * of the next cluster. 4763 */ 4764 if (seq && ip->i_nextrio < ip->i_size) 4765 (void) ufs_getpage_ra(vp, off, seg, addr); 4766 err = biowait(bp); 4767 pageio_done(bp); 4768 4769 if (err) { 4770 pvn_read_done(pp, B_ERROR); 4771 return (err); 4772 } 4773 } 4774 4775 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4776 return (0); 4777 } 4778 4779 /* 4780 * Read ahead a cluster from the disk. Returns the length in bytes. 4781 */ 4782 static int 4783 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr) 4784 { 4785 struct inode *ip = VTOI(vp); 4786 page_t *pp; 4787 u_offset_t io_off = ip->i_nextrio; 4788 ufsvfs_t *ufsvfsp; 4789 caddr_t addr2 = addr + (io_off - off); 4790 struct buf *bp; 4791 daddr_t bn; 4792 size_t io_len; 4793 int err; 4794 int contig; 4795 int xlen; 4796 int bsize = ip->i_fs->fs_bsize; 4797 4798 /* 4799 * If the directio advisory is in effect on this file, 4800 * then do not do buffered read ahead. Read ahead makes 4801 * it more difficult on threads using directio as they 4802 * will be forced to flush the pages from this vnode. 4803 */ 4804 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 4805 return (0); 4806 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) 4807 return (0); 4808 4809 /* 4810 * Is this test needed? 4811 */ 4812 if (addr2 >= seg->s_base + seg->s_size) 4813 return (0); 4814 4815 contig = 0; 4816 err = bmap_read(ip, io_off, &bn, &contig); 4817 /* 4818 * If its a UFS_HOLE or a fallocated block, do not perform 4819 * any read ahead's since there probably is nothing to read ahead 4820 */ 4821 if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn)) 4822 return (0); 4823 4824 /* 4825 * Limit the transfer size to bsize if this is the 2nd block. 4826 */ 4827 if (io_off == (u_offset_t)bsize) 4828 contig = MIN(contig, bsize); 4829 4830 if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off, 4831 &io_len, io_off, contig, 1)) == NULL) 4832 return (0); 4833 4834 /* 4835 * Zero part of page which we are not going to read from disk 4836 */ 4837 if ((xlen = (io_len & PAGEOFFSET)) > 0) 4838 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4839 4840 ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK; 4841 4842 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC); 4843 bp->b_edev = ip->i_dev; 4844 bp->b_dev = cmpdev(ip->i_dev); 4845 bp->b_blkno = bn; 4846 bp->b_un.b_addr = (caddr_t)0; 4847 bp->b_file = ip->i_vnode; 4848 bp->b_offset = off; 4849 4850 if (ufsvfsp->vfs_log) { 4851 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4852 } else if (ufsvfsp->vfs_snapshot) { 4853 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4854 } else { 4855 ufsvfsp->vfs_iotstamp = lbolt; 4856 ub.ub_getras.value.ul++; 4857 (void) bdev_strategy(bp); 4858 lwp_stat_update(LWP_STAT_INBLK, 1); 4859 } 4860 4861 return (io_len); 4862 } 4863 4864 int ufs_delay = 1; 4865 /* 4866 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC} 4867 * 4868 * LMXXX - the inode really ought to contain a pointer to one of these 4869 * async args. Stuff gunk in there and just hand the whole mess off. 4870 * This would replace i_delaylen, i_delayoff. 4871 */ 4872 /*ARGSUSED*/ 4873 static int 4874 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 4875 struct cred *cr) 4876 { 4877 struct inode *ip = VTOI(vp); 4878 int err = 0; 4879 4880 if (vp->v_count == 0) { 4881 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0")); 4882 } 4883 4884 TRACE_1(TR_FAC_UFS, TR_UFS_PUTPAGE_START, 4885 "ufs_putpage_start:vp %p", vp); 4886 4887 /* 4888 * XXX - Why should this check be made here? 4889 */ 4890 if (vp->v_flag & VNOMAP) { 4891 err = ENOSYS; 4892 goto errout; 4893 } 4894 4895 if (ip->i_ufsvfs == NULL) { 4896 err = EIO; 4897 goto errout; 4898 } 4899 4900 if (flags & B_ASYNC) { 4901 if (ufs_delay && len && 4902 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 4903 mutex_enter(&ip->i_tlock); 4904 /* 4905 * If nobody stalled, start a new cluster. 4906 */ 4907 if (ip->i_delaylen == 0) { 4908 ip->i_delayoff = off; 4909 ip->i_delaylen = len; 4910 mutex_exit(&ip->i_tlock); 4911 goto errout; 4912 } 4913 /* 4914 * If we have a full cluster or they are not contig, 4915 * then push last cluster and start over. 4916 */ 4917 if (ip->i_delaylen >= CLUSTSZ(ip) || 4918 ip->i_delayoff + ip->i_delaylen != off) { 4919 u_offset_t doff; 4920 size_t dlen; 4921 4922 doff = ip->i_delayoff; 4923 dlen = ip->i_delaylen; 4924 ip->i_delayoff = off; 4925 ip->i_delaylen = len; 4926 mutex_exit(&ip->i_tlock); 4927 err = ufs_putpages(vp, doff, dlen, 4928 flags, cr); 4929 /* LMXXX - flags are new val, not old */ 4930 goto errout; 4931 } 4932 /* 4933 * There is something there, it's not full, and 4934 * it is contig. 4935 */ 4936 ip->i_delaylen += len; 4937 mutex_exit(&ip->i_tlock); 4938 goto errout; 4939 } 4940 /* 4941 * Must have weird flags or we are not clustering. 4942 */ 4943 } 4944 4945 err = ufs_putpages(vp, off, len, flags, cr); 4946 4947 errout: 4948 TRACE_2(TR_FAC_UFS, TR_UFS_PUTPAGE_END, 4949 "ufs_putpage_end:vp %p error %d", vp, err); 4950 return (err); 4951 } 4952 4953 /* 4954 * If len == 0, do from off to EOF. 4955 * 4956 * The normal cases should be len == 0 & off == 0 (entire vp list), 4957 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4958 * (from pageout). 4959 */ 4960 /*ARGSUSED*/ 4961 static int 4962 ufs_putpages( 4963 struct vnode *vp, 4964 offset_t off, 4965 size_t len, 4966 int flags, 4967 struct cred *cr) 4968 { 4969 u_offset_t io_off; 4970 u_offset_t eoff; 4971 struct inode *ip = VTOI(vp); 4972 page_t *pp; 4973 size_t io_len; 4974 int err = 0; 4975 int dolock; 4976 4977 if (vp->v_count == 0) 4978 return (ufs_fault(vp, "ufs_putpages: v_count == 0")); 4979 /* 4980 * Acquire the readers/write inode lock before locking 4981 * any pages in this inode. 4982 * The inode lock is held during i/o. 4983 */ 4984 if (len == 0) { 4985 mutex_enter(&ip->i_tlock); 4986 ip->i_delayoff = ip->i_delaylen = 0; 4987 mutex_exit(&ip->i_tlock); 4988 } 4989 dolock = (rw_owner(&ip->i_contents) != curthread); 4990 if (dolock) { 4991 /* 4992 * Must synchronize this thread and any possible thread 4993 * operating in the window of vulnerability in wrip(). 4994 * It is dangerous to allow both a thread doing a putpage 4995 * and a thread writing, so serialize them. The exception 4996 * is when the thread in wrip() does something which causes 4997 * a putpage operation. Then, the thread must be allowed 4998 * to continue. It may encounter a bmap_read problem in 4999 * ufs_putapage, but that is handled in ufs_putapage. 5000 * Allow async writers to proceed, we don't want to block 5001 * the pageout daemon. 5002 */ 5003 if (ip->i_writer == curthread) 5004 rw_enter(&ip->i_contents, RW_READER); 5005 else { 5006 for (;;) { 5007 rw_enter(&ip->i_contents, RW_READER); 5008 mutex_enter(&ip->i_tlock); 5009 /* 5010 * If there is no thread in the critical 5011 * section of wrip(), then proceed. 5012 * Otherwise, wait until there isn't one. 5013 */ 5014 if (ip->i_writer == NULL) { 5015 mutex_exit(&ip->i_tlock); 5016 break; 5017 } 5018 rw_exit(&ip->i_contents); 5019 /* 5020 * Bounce async writers when we have a writer 5021 * working on this file so we don't deadlock 5022 * the pageout daemon. 5023 */ 5024 if (flags & B_ASYNC) { 5025 mutex_exit(&ip->i_tlock); 5026 return (0); 5027 } 5028 cv_wait(&ip->i_wrcv, &ip->i_tlock); 5029 mutex_exit(&ip->i_tlock); 5030 } 5031 } 5032 } 5033 5034 if (!vn_has_cached_data(vp)) { 5035 if (dolock) 5036 rw_exit(&ip->i_contents); 5037 return (0); 5038 } 5039 5040 if (len == 0) { 5041 /* 5042 * Search the entire vp list for pages >= off. 5043 */ 5044 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage, 5045 flags, cr); 5046 } else { 5047 /* 5048 * Loop over all offsets in the range looking for 5049 * pages to deal with. 5050 */ 5051 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0) 5052 eoff = MIN(off + len, eoff); 5053 else 5054 eoff = off + len; 5055 5056 for (io_off = off; io_off < eoff; io_off += io_len) { 5057 /* 5058 * If we are not invalidating, synchronously 5059 * freeing or writing pages, use the routine 5060 * page_lookup_nowait() to prevent reclaiming 5061 * them from the free list. 5062 */ 5063 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 5064 pp = page_lookup(vp, io_off, 5065 (flags & (B_INVAL | B_FREE)) ? 5066 SE_EXCL : SE_SHARED); 5067 } else { 5068 pp = page_lookup_nowait(vp, io_off, 5069 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 5070 } 5071 5072 if (pp == NULL || pvn_getdirty(pp, flags) == 0) 5073 io_len = PAGESIZE; 5074 else { 5075 u_offset_t *io_offp = &io_off; 5076 5077 err = ufs_putapage(vp, pp, io_offp, &io_len, 5078 flags, cr); 5079 if (err != 0) 5080 break; 5081 /* 5082 * "io_off" and "io_len" are returned as 5083 * the range of pages we actually wrote. 5084 * This allows us to skip ahead more quickly 5085 * since several pages may've been dealt 5086 * with by this iteration of the loop. 5087 */ 5088 } 5089 } 5090 } 5091 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) { 5092 /* 5093 * We have just sync'ed back all the pages on 5094 * the inode, turn off the IMODTIME flag. 5095 */ 5096 mutex_enter(&ip->i_tlock); 5097 ip->i_flag &= ~IMODTIME; 5098 mutex_exit(&ip->i_tlock); 5099 } 5100 if (dolock) 5101 rw_exit(&ip->i_contents); 5102 return (err); 5103 } 5104 5105 static void 5106 ufs_iodone(buf_t *bp) 5107 { 5108 struct inode *ip; 5109 5110 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 5111 5112 bp->b_iodone = NULL; 5113 5114 ip = VTOI(bp->b_pages->p_vnode); 5115 5116 mutex_enter(&ip->i_tlock); 5117 if (ip->i_writes >= ufs_LW) { 5118 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW) 5119 if (ufs_WRITES) 5120 cv_broadcast(&ip->i_wrcv); /* wake all up */ 5121 } else { 5122 ip->i_writes -= bp->b_bcount; 5123 } 5124 5125 mutex_exit(&ip->i_tlock); 5126 iodone(bp); 5127 } 5128 5129 /* 5130 * Write out a single page, possibly klustering adjacent 5131 * dirty pages. The inode lock must be held. 5132 * 5133 * LMXXX - bsize < pagesize not done. 5134 */ 5135 /*ARGSUSED*/ 5136 int 5137 ufs_putapage( 5138 struct vnode *vp, 5139 page_t *pp, 5140 u_offset_t *offp, 5141 size_t *lenp, /* return values */ 5142 int flags, 5143 struct cred *cr) 5144 { 5145 u_offset_t io_off; 5146 u_offset_t off; 5147 struct inode *ip = VTOI(vp); 5148 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 5149 struct fs *fs; 5150 struct buf *bp; 5151 size_t io_len; 5152 daddr_t bn; 5153 int err; 5154 int contig; 5155 5156 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 5157 5158 TRACE_1(TR_FAC_UFS, TR_UFS_PUTAPAGE_START, 5159 "ufs_putapage_start:vp %p", vp); 5160 5161 if (ufsvfsp == NULL) { 5162 err = EIO; 5163 goto out_trace; 5164 } 5165 5166 fs = ip->i_fs; 5167 ASSERT(fs->fs_ronly == 0); 5168 5169 /* 5170 * If the modified time on the inode has not already been 5171 * set elsewhere (e.g. for write/setattr) we set the time now. 5172 * This gives us approximate modified times for mmap'ed files 5173 * which are modified via stores in the user address space. 5174 */ 5175 if ((ip->i_flag & IMODTIME) == 0) { 5176 mutex_enter(&ip->i_tlock); 5177 ip->i_flag |= IUPD; 5178 ip->i_seq++; 5179 ITIMES_NOLOCK(ip); 5180 mutex_exit(&ip->i_tlock); 5181 } 5182 5183 /* 5184 * Align the request to a block boundry (for old file systems), 5185 * and go ask bmap() how contiguous things are for this file. 5186 */ 5187 off = pp->p_offset & (offset_t)fs->fs_bmask; /* block align it */ 5188 contig = 0; 5189 err = bmap_read(ip, off, &bn, &contig); 5190 if (err) 5191 goto out; 5192 if (bn == UFS_HOLE) { /* putpage never allocates */ 5193 /* 5194 * logging device is in error mode; simply return EIO 5195 */ 5196 if (TRANS_ISERROR(ufsvfsp)) { 5197 err = EIO; 5198 goto out; 5199 } 5200 /* 5201 * Oops, the thread in the window in wrip() did some 5202 * sort of operation which caused a putpage in the bad 5203 * range. In this case, just return an error which will 5204 * cause the software modified bit on the page to set 5205 * and the page will get written out again later. 5206 */ 5207 if (ip->i_writer == curthread) { 5208 err = EIO; 5209 goto out; 5210 } 5211 /* 5212 * If the pager is trying to push a page in the bad range 5213 * just tell him to try again later when things are better. 5214 */ 5215 if (flags & B_ASYNC) { 5216 err = EAGAIN; 5217 goto out; 5218 } 5219 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE"); 5220 goto out; 5221 } 5222 5223 /* 5224 * If it is an fallocate'd block, reverse the negativity since 5225 * we are now writing to it 5226 */ 5227 if (ISFALLOCBLK(ip, bn)) { 5228 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn)); 5229 if (err) 5230 goto out; 5231 5232 bn = -bn; 5233 } 5234 5235 /* 5236 * Take the length (of contiguous bytes) passed back from bmap() 5237 * and _try_ and get a set of pages covering that extent. 5238 */ 5239 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags); 5240 5241 /* 5242 * May have run out of memory and not clustered backwards. 5243 * off p_offset 5244 * [ pp - 1 ][ pp ] 5245 * [ block ] 5246 * We told bmap off, so we have to adjust the bn accordingly. 5247 */ 5248 if (io_off > off) { 5249 bn += btod(io_off - off); 5250 contig -= (io_off - off); 5251 } 5252 5253 /* 5254 * bmap was carefull to tell us the right size so use that. 5255 * There might be unallocated frags at the end. 5256 * LMXXX - bzero the end of the page? We must be writing after EOF. 5257 */ 5258 if (io_len > contig) { 5259 ASSERT(io_len - contig < fs->fs_bsize); 5260 io_len -= (io_len - contig); 5261 } 5262 5263 /* 5264 * Handle the case where we are writing the last page after EOF. 5265 * 5266 * XXX - just a patch for i-mt3. 5267 */ 5268 if (io_len == 0) { 5269 ASSERT(pp->p_offset >= (u_offset_t)(roundup(ip->i_size, 5270 PAGESIZE))); 5271 io_len = PAGESIZE; 5272 } 5273 5274 bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags); 5275 5276 ULOCKFS_SET_MOD(ITOUL(ip)); 5277 5278 bp->b_edev = ip->i_dev; 5279 bp->b_dev = cmpdev(ip->i_dev); 5280 bp->b_blkno = bn; 5281 bp->b_un.b_addr = (caddr_t)0; 5282 bp->b_file = ip->i_vnode; 5283 5284 if (TRANS_ISTRANS(ufsvfsp)) { 5285 if ((ip->i_mode & IFMT) == IFSHAD) { 5286 TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD); 5287 } else if (ufsvfsp->vfs_qinod == ip) { 5288 TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR, 5289 0, 0); 5290 } 5291 } 5292 5293 /* write throttle */ 5294 5295 ASSERT(bp->b_iodone == NULL); 5296 bp->b_iodone = (int (*)())ufs_iodone; 5297 mutex_enter(&ip->i_tlock); 5298 ip->i_writes += bp->b_bcount; 5299 mutex_exit(&ip->i_tlock); 5300 5301 if (bp->b_flags & B_ASYNC) { 5302 if (ufsvfsp->vfs_log) { 5303 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5304 } else if (ufsvfsp->vfs_snapshot) { 5305 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5306 } else { 5307 ufsvfsp->vfs_iotstamp = lbolt; 5308 ub.ub_putasyncs.value.ul++; 5309 (void) bdev_strategy(bp); 5310 lwp_stat_update(LWP_STAT_OUBLK, 1); 5311 } 5312 } else { 5313 if (ufsvfsp->vfs_log) { 5314 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5315 } else if (ufsvfsp->vfs_snapshot) { 5316 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5317 } else { 5318 ufsvfsp->vfs_iotstamp = lbolt; 5319 ub.ub_putsyncs.value.ul++; 5320 (void) bdev_strategy(bp); 5321 lwp_stat_update(LWP_STAT_OUBLK, 1); 5322 } 5323 err = biowait(bp); 5324 pageio_done(bp); 5325 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags); 5326 } 5327 5328 pp = NULL; 5329 5330 out: 5331 if (err != 0 && pp != NULL) 5332 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 5333 5334 if (offp) 5335 *offp = io_off; 5336 if (lenp) 5337 *lenp = io_len; 5338 out_trace: 5339 TRACE_2(TR_FAC_UFS, TR_UFS_PUTAPAGE_END, 5340 "ufs_putapage_end:vp %p error %d", vp, err); 5341 return (err); 5342 } 5343 5344 /* ARGSUSED */ 5345 static int 5346 ufs_map(struct vnode *vp, 5347 offset_t off, 5348 struct as *as, 5349 caddr_t *addrp, 5350 size_t len, 5351 uchar_t prot, 5352 uchar_t maxprot, 5353 uint_t flags, 5354 struct cred *cr) 5355 { 5356 struct segvn_crargs vn_a; 5357 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5358 struct ulockfs *ulp; 5359 int error; 5360 5361 TRACE_1(TR_FAC_UFS, TR_UFS_MAP_START, 5362 "ufs_map_start:vp %p", vp); 5363 5364 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK); 5365 if (error) 5366 goto out; 5367 5368 if (vp->v_flag & VNOMAP) { 5369 error = ENOSYS; 5370 goto unlock; 5371 } 5372 5373 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) { 5374 error = ENXIO; 5375 goto unlock; 5376 } 5377 5378 if (vp->v_type != VREG) { 5379 error = ENODEV; 5380 goto unlock; 5381 } 5382 5383 /* 5384 * If file is being locked, disallow mapping. 5385 */ 5386 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) { 5387 error = EAGAIN; 5388 goto unlock; 5389 } 5390 5391 as_rangelock(as); 5392 if ((flags & MAP_FIXED) == 0) { 5393 map_addr(addrp, len, off, 1, flags); 5394 if (*addrp == NULL) { 5395 as_rangeunlock(as); 5396 error = ENOMEM; 5397 goto unlock; 5398 } 5399 } else { 5400 /* 5401 * User specified address - blow away any previous mappings 5402 */ 5403 (void) as_unmap(as, *addrp, len); 5404 } 5405 5406 vn_a.vp = vp; 5407 vn_a.offset = (u_offset_t)off; 5408 vn_a.type = flags & MAP_TYPE; 5409 vn_a.prot = prot; 5410 vn_a.maxprot = maxprot; 5411 vn_a.cred = cr; 5412 vn_a.amp = NULL; 5413 vn_a.flags = flags & ~MAP_TYPE; 5414 vn_a.szc = 0; 5415 vn_a.lgrp_mem_policy_flags = 0; 5416 5417 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5418 as_rangeunlock(as); 5419 5420 unlock: 5421 if (ulp) { 5422 ufs_lockfs_end(ulp); 5423 } 5424 out: 5425 TRACE_2(TR_FAC_UFS, TR_UFS_MAP_END, 5426 "ufs_map_end:vp %p error %d", vp, error); 5427 return (error); 5428 } 5429 5430 /* ARGSUSED */ 5431 static int 5432 ufs_addmap(struct vnode *vp, 5433 offset_t off, 5434 struct as *as, 5435 caddr_t addr, 5436 size_t len, 5437 uchar_t prot, 5438 uchar_t maxprot, 5439 uint_t flags, 5440 struct cred *cr) 5441 { 5442 struct inode *ip = VTOI(vp); 5443 5444 if (vp->v_flag & VNOMAP) { 5445 return (ENOSYS); 5446 } 5447 5448 mutex_enter(&ip->i_tlock); 5449 ip->i_mapcnt += btopr(len); 5450 mutex_exit(&ip->i_tlock); 5451 return (0); 5452 } 5453 5454 /*ARGSUSED*/ 5455 static int 5456 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 5457 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 5458 struct cred *cr) 5459 { 5460 struct inode *ip = VTOI(vp); 5461 5462 if (vp->v_flag & VNOMAP) { 5463 return (ENOSYS); 5464 } 5465 5466 mutex_enter(&ip->i_tlock); 5467 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 5468 ASSERT(ip->i_mapcnt >= 0); 5469 mutex_exit(&ip->i_tlock); 5470 return (0); 5471 } 5472 /* 5473 * Return the answer requested to poll() for non-device files 5474 */ 5475 struct pollhead ufs_pollhd; 5476 5477 /* ARGSUSED */ 5478 int 5479 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp) 5480 { 5481 struct ufsvfs *ufsvfsp; 5482 5483 *revp = 0; 5484 ufsvfsp = VTOI(vp)->i_ufsvfs; 5485 5486 if (!ufsvfsp) { 5487 *revp = POLLHUP; 5488 goto out; 5489 } 5490 5491 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) || 5492 ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) { 5493 *revp |= POLLERR; 5494 5495 } else { 5496 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly && 5497 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5498 *revp |= POLLOUT; 5499 5500 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly && 5501 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5502 *revp |= POLLWRBAND; 5503 5504 if (ev & POLLIN) 5505 *revp |= POLLIN; 5506 5507 if (ev & POLLRDNORM) 5508 *revp |= POLLRDNORM; 5509 5510 if (ev & POLLRDBAND) 5511 *revp |= POLLRDBAND; 5512 } 5513 5514 if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP))) 5515 *revp |= POLLPRI; 5516 out: 5517 *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL; 5518 5519 return (0); 5520 } 5521 5522 /* ARGSUSED */ 5523 static int 5524 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr) 5525 { 5526 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5527 struct ulockfs *ulp = NULL; 5528 struct inode *sip = NULL; 5529 int error; 5530 struct inode *ip = VTOI(vp); 5531 int issync; 5532 5533 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK); 5534 if (error) 5535 return (error); 5536 5537 switch (cmd) { 5538 /* 5539 * Have to handle _PC_NAME_MAX here, because the normal way 5540 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()] 5541 * results in a lock ordering reversal between 5542 * ufs_lockfs_{begin,end}() and 5543 * ufs_thread_{suspend,continue}(). 5544 * 5545 * Keep in sync with ufs_statvfs(). 5546 */ 5547 case _PC_NAME_MAX: 5548 *valp = MAXNAMLEN; 5549 break; 5550 5551 case _PC_FILESIZEBITS: 5552 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 5553 *valp = UFS_FILESIZE_BITS; 5554 else 5555 *valp = 32; 5556 break; 5557 5558 case _PC_XATTR_EXISTS: 5559 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5560 5561 error = ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, 5562 cr); 5563 if (error == 0 && sip != NULL) { 5564 /* Start transaction */ 5565 if (ulp) { 5566 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 5567 TOP_RMDIR, TOP_RMDIR_SIZE); 5568 } 5569 /* 5570 * Is directory empty 5571 */ 5572 rw_enter(&sip->i_rwlock, RW_WRITER); 5573 rw_enter(&sip->i_contents, RW_WRITER); 5574 if (ufs_xattrdirempty(sip, 5575 sip->i_number, CRED())) { 5576 rw_enter(&ip->i_contents, RW_WRITER); 5577 ufs_unhook_shadow(ip, sip); 5578 rw_exit(&ip->i_contents); 5579 5580 *valp = 0; 5581 5582 } else 5583 *valp = 1; 5584 rw_exit(&sip->i_contents); 5585 rw_exit(&sip->i_rwlock); 5586 if (ulp) { 5587 TRANS_END_CSYNC(ufsvfsp, error, issync, 5588 TOP_RMDIR, TOP_RMDIR_SIZE); 5589 } 5590 VN_RELE(ITOV(sip)); 5591 } else if (error == ENOENT) { 5592 *valp = 0; 5593 error = 0; 5594 } 5595 } else { 5596 error = fs_pathconf(vp, cmd, valp, cr); 5597 } 5598 break; 5599 5600 case _PC_ACL_ENABLED: 5601 *valp = _ACL_ACLENT_ENABLED; 5602 break; 5603 5604 case _PC_MIN_HOLE_SIZE: 5605 *valp = (ulong_t)ip->i_fs->fs_bsize; 5606 break; 5607 5608 default: 5609 error = fs_pathconf(vp, cmd, valp, cr); 5610 } 5611 5612 if (ulp != NULL) { 5613 ufs_lockfs_end(ulp); 5614 } 5615 return (error); 5616 } 5617 5618 int ufs_pageio_writes, ufs_pageio_reads; 5619 5620 /*ARGSUSED*/ 5621 static int 5622 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5623 int flags, struct cred *cr) 5624 { 5625 struct inode *ip = VTOI(vp); 5626 struct ufsvfs *ufsvfsp; 5627 page_t *npp = NULL, *opp = NULL, *cpp = pp; 5628 struct buf *bp; 5629 daddr_t bn; 5630 size_t done_len = 0, cur_len = 0; 5631 int err = 0; 5632 int contig = 0; 5633 int dolock; 5634 int vmpss = 0; 5635 struct ulockfs *ulp; 5636 5637 if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp && 5638 vp->v_mpssdata != NULL) { 5639 vmpss = 1; 5640 } 5641 5642 dolock = (rw_owner(&ip->i_contents) != curthread); 5643 /* 5644 * We need a better check. Ideally, we would use another 5645 * vnodeops so that hlocked and forcibly unmounted file 5646 * systems would return EIO where appropriate and w/o the 5647 * need for these checks. 5648 */ 5649 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5650 return (EIO); 5651 5652 /* 5653 * For vmpss (pp can be NULL) case respect the quiesce protocol. 5654 * ul_lock must be taken before locking pages so we can't use it here 5655 * if pp is non NULL because segvn already locked pages 5656 * SE_EXCL. Instead we rely on the fact that a forced umount or 5657 * applying a filesystem lock via ufs_fiolfs() will block in the 5658 * implicit call to ufs_flush() until we unlock the pages after the 5659 * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend 5660 * above 0 until they are done. We have to be careful not to increment 5661 * ul_vnops_cnt here after forceful unmount hlocks the file system. 5662 * 5663 * If pp is NULL use ul_lock to make sure we don't increment 5664 * ul_vnops_cnt after forceful unmount hlocks the file system. 5665 */ 5666 if (vmpss || pp == NULL) { 5667 ulp = &ufsvfsp->vfs_ulockfs; 5668 if (pp == NULL) 5669 mutex_enter(&ulp->ul_lock); 5670 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) { 5671 if (pp == NULL) { 5672 mutex_exit(&ulp->ul_lock); 5673 } 5674 return (vmpss ? EIO : EINVAL); 5675 } 5676 atomic_add_long(&ulp->ul_vnops_cnt, 1); 5677 if (pp == NULL) 5678 mutex_exit(&ulp->ul_lock); 5679 if (ufs_quiesce_pend) { 5680 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5681 cv_broadcast(&ulp->ul_cv); 5682 return (vmpss ? EIO : EINVAL); 5683 } 5684 } 5685 5686 if (dolock) { 5687 /* 5688 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to 5689 * handle a fault against a segment that maps vnode pages with 5690 * large mappings. Segvn creates pages and holds them locked 5691 * SE_EXCL during VOP_PAGEIO() call. In this case we have to 5692 * use rw_tryenter() to avoid a potential deadlock since in 5693 * lock order i_contents needs to be taken first. 5694 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails. 5695 */ 5696 if (!vmpss) { 5697 rw_enter(&ip->i_contents, RW_READER); 5698 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) { 5699 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5700 cv_broadcast(&ulp->ul_cv); 5701 return (EDEADLK); 5702 } 5703 } 5704 5705 /* 5706 * Return an error to segvn because the pagefault request is beyond 5707 * PAGESIZE rounded EOF. 5708 */ 5709 if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) { 5710 if (dolock) 5711 rw_exit(&ip->i_contents); 5712 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5713 cv_broadcast(&ulp->ul_cv); 5714 return (EFAULT); 5715 } 5716 5717 if (pp == NULL) { 5718 if (bmap_has_holes(ip)) { 5719 err = ENOSYS; 5720 } else { 5721 err = EINVAL; 5722 } 5723 if (dolock) 5724 rw_exit(&ip->i_contents); 5725 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5726 cv_broadcast(&ulp->ul_cv); 5727 return (err); 5728 } 5729 5730 /* 5731 * Break the io request into chunks, one for each contiguous 5732 * stretch of disk blocks in the target file. 5733 */ 5734 while (done_len < io_len) { 5735 ASSERT(cpp); 5736 contig = 0; 5737 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len), 5738 &bn, &contig)) 5739 break; 5740 5741 if (bn == UFS_HOLE) { /* No holey swapfiles */ 5742 if (vmpss) { 5743 err = EFAULT; 5744 break; 5745 } 5746 err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE"); 5747 break; 5748 } 5749 5750 cur_len = MIN(io_len - done_len, contig); 5751 /* 5752 * Zero out a page beyond EOF, when the last block of 5753 * a file is a UFS fragment so that ufs_pageio() can be used 5754 * instead of ufs_getpage() to handle faults against 5755 * segvn segments that use large pages. 5756 */ 5757 page_list_break(&cpp, &npp, btopr(cur_len)); 5758 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) { 5759 size_t xlen = cur_len & PAGEOFFSET; 5760 pagezero(cpp->p_prev, xlen, PAGESIZE - xlen); 5761 } 5762 5763 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 5764 ASSERT(bp != NULL); 5765 5766 bp->b_edev = ip->i_dev; 5767 bp->b_dev = cmpdev(ip->i_dev); 5768 bp->b_blkno = bn; 5769 bp->b_un.b_addr = (caddr_t)0; 5770 bp->b_file = ip->i_vnode; 5771 5772 ufsvfsp->vfs_iotstamp = lbolt; 5773 ub.ub_pageios.value.ul++; 5774 if (ufsvfsp->vfs_snapshot) 5775 fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp); 5776 else 5777 (void) bdev_strategy(bp); 5778 5779 if (flags & B_READ) 5780 ufs_pageio_reads++; 5781 else 5782 ufs_pageio_writes++; 5783 if (flags & B_READ) 5784 lwp_stat_update(LWP_STAT_INBLK, 1); 5785 else 5786 lwp_stat_update(LWP_STAT_OUBLK, 1); 5787 /* 5788 * If the request is not B_ASYNC, wait for i/o to complete 5789 * and re-assemble the page list to return to the caller. 5790 * If it is B_ASYNC we leave the page list in pieces and 5791 * cleanup() will dispose of them. 5792 */ 5793 if ((flags & B_ASYNC) == 0) { 5794 err = biowait(bp); 5795 pageio_done(bp); 5796 if (err) 5797 break; 5798 page_list_concat(&opp, &cpp); 5799 } 5800 cpp = npp; 5801 npp = NULL; 5802 if (flags & B_READ) 5803 cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t); 5804 done_len += cur_len; 5805 } 5806 ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len)); 5807 if (err) { 5808 if (flags & B_ASYNC) { 5809 /* Cleanup unprocessed parts of list */ 5810 page_list_concat(&cpp, &npp); 5811 if (flags & B_READ) 5812 pvn_read_done(cpp, B_ERROR); 5813 else 5814 pvn_write_done(cpp, B_ERROR); 5815 } else { 5816 /* Re-assemble list and let caller clean up */ 5817 page_list_concat(&opp, &cpp); 5818 page_list_concat(&opp, &npp); 5819 } 5820 } 5821 5822 if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) && 5823 ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) { 5824 mutex_enter(&ip->i_tlock); 5825 ip->i_flag |= IACC; 5826 ITIMES_NOLOCK(ip); 5827 mutex_exit(&ip->i_tlock); 5828 } 5829 5830 if (dolock) 5831 rw_exit(&ip->i_contents); 5832 if (vmpss && !atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5833 cv_broadcast(&ulp->ul_cv); 5834 return (err); 5835 } 5836 5837 /* 5838 * Called when the kernel is in a frozen state to dump data 5839 * directly to the device. It uses a private dump data structure, 5840 * set up by dump_ctl, to locate the correct disk block to which to dump. 5841 */ 5842 static int 5843 ufs_dump(vnode_t *vp, caddr_t addr, int ldbn, int dblks) 5844 { 5845 u_offset_t file_size; 5846 struct inode *ip = VTOI(vp); 5847 struct fs *fs = ip->i_fs; 5848 daddr_t dbn, lfsbn; 5849 int disk_blks = fs->fs_bsize >> DEV_BSHIFT; 5850 int error = 0; 5851 int ndbs, nfsbs; 5852 5853 /* 5854 * forced unmount case 5855 */ 5856 if (ip->i_ufsvfs == NULL) 5857 return (EIO); 5858 /* 5859 * Validate the inode that it has not been modified since 5860 * the dump structure is allocated. 5861 */ 5862 mutex_enter(&ip->i_tlock); 5863 if ((dump_info == NULL) || 5864 (dump_info->ip != ip) || 5865 (dump_info->time.tv_sec != ip->i_mtime.tv_sec) || 5866 (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) { 5867 mutex_exit(&ip->i_tlock); 5868 return (-1); 5869 } 5870 mutex_exit(&ip->i_tlock); 5871 5872 /* 5873 * See that the file has room for this write 5874 */ 5875 UFS_GET_ISIZE(&file_size, ip); 5876 5877 if (ldbtob((offset_t)(ldbn + dblks)) > file_size) 5878 return (ENOSPC); 5879 5880 /* 5881 * Find the physical disk block numbers from the dump 5882 * private data structure directly and write out the data 5883 * in contiguous block lumps 5884 */ 5885 while (dblks > 0 && !error) { 5886 lfsbn = (daddr_t)lblkno(fs, ldbtob((offset_t)ldbn)); 5887 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks; 5888 nfsbs = 1; 5889 ndbs = disk_blks - ldbn % disk_blks; 5890 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn + 5891 nfsbs]) == dbn + ndbs) { 5892 nfsbs++; 5893 ndbs += disk_blks; 5894 } 5895 if (ndbs > dblks) 5896 ndbs = dblks; 5897 error = bdev_dump(ip->i_dev, addr, dbn, ndbs); 5898 addr += ldbtob((offset_t)ndbs); 5899 dblks -= ndbs; 5900 ldbn += ndbs; 5901 } 5902 return (error); 5903 5904 } 5905 5906 /* 5907 * Prepare the file system before and after the dump operation. 5908 * 5909 * action = DUMP_ALLOC: 5910 * Preparation before dump, allocate dump private data structure 5911 * to hold all the direct and indirect block info for dump. 5912 * 5913 * action = DUMP_FREE: 5914 * Clean up after dump, deallocate the dump private data structure. 5915 * 5916 * action = DUMP_SCAN: 5917 * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space; 5918 * if found, the starting file-relative DEV_BSIZE lbn is written 5919 * to *bklp; that lbn is intended for use with VOP_DUMP() 5920 */ 5921 static int 5922 ufs_dumpctl(vnode_t *vp, int action, int *blkp) 5923 { 5924 struct inode *ip = VTOI(vp); 5925 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 5926 struct fs *fs; 5927 daddr32_t *dblk, *storeblk; 5928 daddr32_t *nextblk, *endblk; 5929 struct buf *bp; 5930 int i, entry, entries; 5931 int n, ncontig; 5932 5933 /* 5934 * check for forced unmount 5935 */ 5936 if (ufsvfsp == NULL) 5937 return (EIO); 5938 5939 if (action == DUMP_ALLOC) { 5940 /* 5941 * alloc and record dump_info 5942 */ 5943 if (dump_info != NULL) 5944 return (EINVAL); 5945 5946 ASSERT(vp->v_type == VREG); 5947 fs = ufsvfsp->vfs_fs; 5948 5949 rw_enter(&ip->i_contents, RW_READER); 5950 5951 if (bmap_has_holes(ip)) { 5952 rw_exit(&ip->i_contents); 5953 return (EFAULT); 5954 } 5955 5956 /* 5957 * calculate and allocate space needed according to i_size 5958 */ 5959 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size)); 5960 if ((dump_info = (struct dump *) 5961 kmem_alloc(sizeof (struct dump) + 5962 (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) { 5963 rw_exit(&ip->i_contents); 5964 return (ENOMEM); 5965 } 5966 5967 /* Start saving the info */ 5968 dump_info->fsbs = entries; 5969 dump_info->ip = ip; 5970 storeblk = &dump_info->dblk[0]; 5971 5972 /* Direct Blocks */ 5973 for (entry = 0; entry < NDADDR && entry < entries; entry++) 5974 *storeblk++ = ip->i_db[entry]; 5975 5976 /* Indirect Blocks */ 5977 for (i = 0; i < NIADDR; i++) { 5978 int error = 0; 5979 5980 bp = UFS_BREAD(ufsvfsp, 5981 ip->i_dev, fsbtodb(fs, ip->i_ib[i]), 5982 fs->fs_bsize); 5983 if (bp->b_flags & B_ERROR) 5984 error = EIO; 5985 else { 5986 dblk = bp->b_un.b_daddr; 5987 if ((storeblk = save_dblks(ip, ufsvfsp, 5988 storeblk, dblk, i, entries)) == NULL) 5989 error = EIO; 5990 } 5991 5992 brelse(bp); 5993 5994 if (error != 0) { 5995 kmem_free(dump_info, sizeof (struct dump) + 5996 (entries - 1) * sizeof (daddr32_t)); 5997 rw_exit(&ip->i_contents); 5998 dump_info = NULL; 5999 return (error); 6000 } 6001 } 6002 /* and time stamp the information */ 6003 mutex_enter(&ip->i_tlock); 6004 dump_info->time = ip->i_mtime; 6005 mutex_exit(&ip->i_tlock); 6006 6007 rw_exit(&ip->i_contents); 6008 } else if (action == DUMP_FREE) { 6009 /* 6010 * free dump_info 6011 */ 6012 if (dump_info == NULL) 6013 return (EINVAL); 6014 entries = dump_info->fsbs - 1; 6015 kmem_free(dump_info, sizeof (struct dump) + 6016 entries * sizeof (daddr32_t)); 6017 dump_info = NULL; 6018 } else if (action == DUMP_SCAN) { 6019 /* 6020 * scan dump_info 6021 */ 6022 if (dump_info == NULL) 6023 return (EINVAL); 6024 6025 dblk = dump_info->dblk; 6026 nextblk = dblk + 1; 6027 endblk = dblk + dump_info->fsbs - 1; 6028 fs = ufsvfsp->vfs_fs; 6029 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT); 6030 6031 /* 6032 * scan dblk[] entries; contig fs space is found when: 6033 * ((current blkno + frags per block) == next blkno) 6034 */ 6035 n = 0; 6036 while (n < ncontig && dblk < endblk) { 6037 if ((*dblk + fs->fs_frag) == *nextblk) 6038 n++; 6039 else 6040 n = 0; 6041 dblk++; 6042 nextblk++; 6043 } 6044 6045 /* 6046 * index is where size bytes of contig space begins; 6047 * conversion from index to the file's DEV_BSIZE lbn 6048 * is equivalent to: (index * fs_bsize) / DEV_BSIZE 6049 */ 6050 if (n == ncontig) { 6051 i = (dblk - dump_info->dblk) - ncontig; 6052 *blkp = i << (fs->fs_bshift - DEV_BSHIFT); 6053 } else 6054 return (EFAULT); 6055 } 6056 return (0); 6057 } 6058 6059 /* 6060 * Recursive helper function for ufs_dumpctl(). It follows the indirect file 6061 * system blocks until it reaches the the disk block addresses, which are 6062 * then stored into the given buffer, storeblk. 6063 */ 6064 static daddr32_t * 6065 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp, daddr32_t *storeblk, 6066 daddr32_t *dblk, int level, int entries) 6067 { 6068 struct fs *fs = ufsvfsp->vfs_fs; 6069 struct buf *bp; 6070 int i; 6071 6072 if (level == 0) { 6073 for (i = 0; i < NINDIR(fs); i++) { 6074 if (storeblk - dump_info->dblk >= entries) 6075 break; 6076 *storeblk++ = dblk[i]; 6077 } 6078 return (storeblk); 6079 } 6080 for (i = 0; i < NINDIR(fs); i++) { 6081 if (storeblk - dump_info->dblk >= entries) 6082 break; 6083 bp = UFS_BREAD(ufsvfsp, 6084 ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize); 6085 if (bp->b_flags & B_ERROR) { 6086 brelse(bp); 6087 return (NULL); 6088 } 6089 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr, 6090 level - 1, entries); 6091 brelse(bp); 6092 6093 if (storeblk == NULL) 6094 return (NULL); 6095 } 6096 return (storeblk); 6097 } 6098 6099 /* ARGSUSED */ 6100 static int 6101 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, 6102 struct cred *cr) 6103 { 6104 struct inode *ip = VTOI(vp); 6105 struct ulockfs *ulp; 6106 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 6107 ulong_t vsa_mask = vsap->vsa_mask; 6108 int err = EINVAL; 6109 6110 TRACE_3(TR_FAC_UFS, TR_UFS_GETSECATTR_START, 6111 "ufs_getsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag); 6112 6113 vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT); 6114 6115 /* 6116 * Only grab locks if needed - they're not needed to check vsa_mask 6117 * or if the mask contains no acl flags. 6118 */ 6119 if (vsa_mask != 0) { 6120 if (err = ufs_lockfs_begin(ufsvfsp, &ulp, 6121 ULOCKFS_GETATTR_MASK)) 6122 return (err); 6123 6124 rw_enter(&ip->i_contents, RW_READER); 6125 err = ufs_acl_get(ip, vsap, flag, cr); 6126 rw_exit(&ip->i_contents); 6127 6128 if (ulp) 6129 ufs_lockfs_end(ulp); 6130 } 6131 TRACE_1(TR_FAC_UFS, TR_UFS_GETSECATTR_END, 6132 "ufs_getsecattr_end:vp %p", vp); 6133 return (err); 6134 } 6135 6136 /* ARGSUSED */ 6137 static int 6138 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr) 6139 { 6140 struct inode *ip = VTOI(vp); 6141 struct ulockfs *ulp = NULL; 6142 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 6143 ulong_t vsa_mask = vsap->vsa_mask; 6144 int err; 6145 int haverwlock = 1; 6146 int trans_size; 6147 int donetrans = 0; 6148 int retry = 1; 6149 6150 6151 TRACE_3(TR_FAC_UFS, TR_UFS_SETSECATTR_START, 6152 "ufs_setsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag); 6153 6154 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 6155 6156 /* Abort now if the request is either empty or invalid. */ 6157 vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT); 6158 if ((vsa_mask == 0) || 6159 ((vsap->vsa_aclentp == NULL) && 6160 (vsap->vsa_dfaclentp == NULL))) { 6161 err = EINVAL; 6162 goto out; 6163 } 6164 6165 /* 6166 * Following convention, if this is a directory then we acquire the 6167 * inode's i_rwlock after starting a UFS logging transaction; 6168 * otherwise, we acquire it beforehand. Since we were called (and 6169 * must therefore return) with the lock held, we will have to drop it, 6170 * and later reacquire it, if operating on a directory. 6171 */ 6172 if (vp->v_type == VDIR) { 6173 rw_exit(&ip->i_rwlock); 6174 haverwlock = 0; 6175 } else { 6176 /* Upgrade the lock if required. */ 6177 if (!rw_write_held(&ip->i_rwlock)) { 6178 rw_exit(&ip->i_rwlock); 6179 rw_enter(&ip->i_rwlock, RW_WRITER); 6180 } 6181 } 6182 6183 again: 6184 ASSERT(!(vp->v_type == VDIR && haverwlock)); 6185 if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) { 6186 ulp = NULL; 6187 retry = 0; 6188 goto out; 6189 } 6190 6191 /* 6192 * Check that the file system supports this operation. Note that 6193 * ufs_lockfs_begin() will have checked that the file system had 6194 * not been forcibly unmounted. 6195 */ 6196 if (ufsvfsp->vfs_fs->fs_ronly) { 6197 err = EROFS; 6198 goto out; 6199 } 6200 if (ufsvfsp->vfs_nosetsec) { 6201 err = ENOSYS; 6202 goto out; 6203 } 6204 6205 if (ulp) { 6206 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR, 6207 trans_size = TOP_SETSECATTR_SIZE(VTOI(vp))); 6208 donetrans = 1; 6209 } 6210 6211 if (vp->v_type == VDIR) { 6212 rw_enter(&ip->i_rwlock, RW_WRITER); 6213 haverwlock = 1; 6214 } 6215 6216 ASSERT(haverwlock); 6217 6218 /* Do the actual work. */ 6219 rw_enter(&ip->i_contents, RW_WRITER); 6220 /* 6221 * Suppress out of inodes messages if we will retry. 6222 */ 6223 if (retry) 6224 ip->i_flag |= IQUIET; 6225 err = ufs_acl_set(ip, vsap, flag, cr); 6226 ip->i_flag &= ~IQUIET; 6227 rw_exit(&ip->i_contents); 6228 6229 out: 6230 if (ulp) { 6231 if (donetrans) { 6232 /* 6233 * top_end_async() can eventually call 6234 * top_end_sync(), which can block. We must 6235 * therefore observe the lock-ordering protocol 6236 * here as well. 6237 */ 6238 if (vp->v_type == VDIR) { 6239 rw_exit(&ip->i_rwlock); 6240 haverwlock = 0; 6241 } 6242 TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size); 6243 } 6244 ufs_lockfs_end(ulp); 6245 } 6246 /* 6247 * If no inodes available, try scaring a logically- 6248 * free one out of the delete queue to someplace 6249 * that we can find it. 6250 */ 6251 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 6252 ufs_delete_drain_wait(ufsvfsp, 1); 6253 retry = 0; 6254 if (vp->v_type == VDIR && haverwlock) { 6255 rw_exit(&ip->i_rwlock); 6256 haverwlock = 0; 6257 } 6258 goto again; 6259 } 6260 /* 6261 * If we need to reacquire the lock then it is safe to do so 6262 * as a reader. This is because ufs_rwunlock(), which will be 6263 * called by our caller after we return, does not differentiate 6264 * between shared and exclusive locks. 6265 */ 6266 if (!haverwlock) { 6267 ASSERT(vp->v_type == VDIR); 6268 rw_enter(&ip->i_rwlock, RW_READER); 6269 } 6270 6271 TRACE_1(TR_FAC_UFS, TR_UFS_SETSECATTR_END, 6272 "ufs_setsecattr_end:vp %p", vp); 6273 return (err); 6274 } 6275