/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/ksynch.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/policy.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_snap.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/fssnap_if.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>

#include <fs/fs_subr.h>

static struct instats ins;

static	int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
static	int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
		caddr_t, struct page **, size_t, enum seg_rw, int);
static	int ufs_open(struct vnode **, int, struct cred *);
static	int ufs_close(struct vnode *, int, int, offset_t, struct cred *);
static	int ufs_read(struct vnode *, struct uio *, int, struct cred *,
		struct caller_context *);
static	int ufs_write(struct vnode *, struct uio *, int, struct cred *,
		struct caller_context *);
static	int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *);
static	int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *);
static	int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
		caller_context_t *);
static	int ufs_access(struct vnode *, int, int, struct cred *);
static	int ufs_lookup(struct vnode *, char *, struct vnode **,
		struct pathname *, int, struct vnode *, struct cred *);
static	int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
		int, struct vnode **, struct cred *, int);
static	int ufs_remove(struct vnode *, char *, struct cred *);
static	int ufs_link(struct vnode *, struct vnode *, char *, struct cred *);
static	int ufs_rename(struct vnode *, char *, struct vnode *, char *,
		struct cred *);
static	int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
		struct cred *);
static	int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *);
static	int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *);
static	int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
		struct cred *);
static	int ufs_readlink(struct vnode *, struct uio *, struct cred *);
static	int ufs_fsync(struct vnode *, int, struct cred *);
static	void ufs_inactive(struct vnode *, struct cred *);
static	int ufs_fid(struct vnode *, struct fid *);
static	int ufs_rwlock(struct vnode *, int, caller_context_t *);
static	void ufs_rwunlock(struct vnode *, int, caller_context_t *);
static	int ufs_seek(struct vnode *, offset_t, offset_t *);
static	int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
		struct flk_callback *, struct cred *);
static	int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
		cred_t *, caller_context_t *);
static	int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
		struct page **, size_t, struct seg *, caddr_t,
		enum seg_rw, struct cred *);
static	int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *);
static	int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
static	int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
		uchar_t, uchar_t, uint_t, struct cred *);
static	int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
		uchar_t, uchar_t, uint_t, struct cred *);
static	int ufs_delmap(struct vnode *, offset_t, struct as *,
		caddr_t, size_t, uint_t, uint_t, uint_t, struct cred *);
static	int ufs_poll(vnode_t *, short, int, short *, struct pollhead **);
static	int ufs_dump(vnode_t *, caddr_t, int, int);
static	int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *);
static	int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
		struct cred *);
static	int ufs_dump(vnode_t *, caddr_t, int, int);
static	int ufs_dumpctl(vnode_t *, int, int *);
static	daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
		daddr32_t *, int, int);
static	int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static	int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *);

extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 *
 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 */
struct vnodeops *ufs_vnodeops;

const fs_operation_def_t ufs_vnodeops_template[] = {
	VOPNAME_OPEN, ufs_open,	/* will not be blocked by lockfs */
	VOPNAME_CLOSE, ufs_close,	/* will not be blocked by lockfs */
	VOPNAME_READ, ufs_read,
	VOPNAME_WRITE, ufs_write,
	VOPNAME_IOCTL, ufs_ioctl,
	VOPNAME_GETATTR, ufs_getattr,
	VOPNAME_SETATTR, ufs_setattr,
	VOPNAME_ACCESS, ufs_access,
	VOPNAME_LOOKUP, ufs_lookup,
	VOPNAME_CREATE, ufs_create,
	VOPNAME_REMOVE, ufs_remove,
	VOPNAME_LINK, ufs_link,
	VOPNAME_RENAME, ufs_rename,
	VOPNAME_MKDIR, ufs_mkdir,
	VOPNAME_RMDIR, ufs_rmdir,
	VOPNAME_READDIR, ufs_readdir,
	VOPNAME_SYMLINK, ufs_symlink,
	VOPNAME_READLINK, ufs_readlink,
	VOPNAME_FSYNC, ufs_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) ufs_inactive,	/* not blocked */
	VOPNAME_FID, ufs_fid,
	VOPNAME_RWLOCK, ufs_rwlock,	/* not blocked */
	VOPNAME_RWUNLOCK, (fs_generic_func_p) ufs_rwunlock,	/* not blocked */
	VOPNAME_SEEK, ufs_seek,
	VOPNAME_FRLOCK, ufs_frlock,
	VOPNAME_SPACE, ufs_space,
	VOPNAME_GETPAGE, ufs_getpage,
	VOPNAME_PUTPAGE, ufs_putpage,
	VOPNAME_MAP, (fs_generic_func_p) ufs_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) ufs_addmap,	/* not blocked */
	VOPNAME_DELMAP, ufs_delmap,	/* will not be blocked by lockfs */
	VOPNAME_POLL, (fs_generic_func_p) ufs_poll,	/* not blocked */
	VOPNAME_DUMP, ufs_dump,
	VOPNAME_PATHCONF, ufs_l_pathconf,
	VOPNAME_PAGEIO, ufs_pageio,
	VOPNAME_DUMPCTL, ufs_dumpctl,
	VOPNAME_GETSECATTR, ufs_getsecattr,
	VOPNAME_SETSECATTR, ufs_setsecattr,
	VOPNAME_VNEVENT, fs_vnevent_support,
	NULL, NULL
};

#define	MAX_BACKFILE_COUNT	9999

/*
 * Created by ufs_dumpctl() to store a file's disk block info into memory.
 * Used by ufs_dump() to dump data to disk directly.
 */
struct dump {
	struct inode	*ip;		/* the file we contain */
	daddr_t		fsbs;		/* number of blocks stored */
	struct timeval32 time;		/* time stamp for the struct */
	daddr32_t	dblk[1];	/* place holder for block info */
};

static struct dump *dump_info = NULL;

/*
 * Previously there was no special action required for ordinary files.
 * (Devices are handled through the device file system.)
 * Now we support Large Files and Large File API requires open to
 * fail if file is large.
237 * We could take care to prevent data corruption 238 * by doing an atomic check of size and truncate if file is opened with 239 * FTRUNC flag set but traditionally this is being done by the vfs/vnode 240 * layers. So taking care of truncation here is a change in the existing 241 * semantics of VOP_OPEN and therefore we chose not to implement any thing 242 * here. The check for the size of the file > 2GB is being done at the 243 * vfs layer in routine vn_open(). 244 */ 245 246 /* ARGSUSED */ 247 static int 248 ufs_open(struct vnode **vpp, int flag, struct cred *cr) 249 { 250 TRACE_1(TR_FAC_UFS, TR_UFS_OPEN, "ufs_open:vpp %p", vpp); 251 return (0); 252 } 253 254 /*ARGSUSED*/ 255 static int 256 ufs_close(struct vnode *vp, int flag, int count, offset_t offset, 257 struct cred *cr) 258 { 259 TRACE_1(TR_FAC_UFS, TR_UFS_CLOSE, "ufs_close:vp %p", vp); 260 261 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 262 cleanshares(vp, ttoproc(curthread)->p_pid); 263 264 /* 265 * Push partially filled cluster at last close. 266 * ``last close'' is approximated because the dnlc 267 * may have a hold on the vnode. 268 * Checking for VBAD here will also act as a forced umount check. 269 */ 270 if (vp->v_count <= 2 && vp->v_type != VBAD) { 271 struct inode *ip = VTOI(vp); 272 if (ip->i_delaylen) { 273 ins.in_poc.value.ul++; 274 (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen, 275 B_ASYNC | B_FREE, cr); 276 ip->i_delaylen = 0; 277 } 278 } 279 280 return (0); 281 } 282 283 /*ARGSUSED*/ 284 static int 285 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 286 struct caller_context *ct) 287 { 288 struct inode *ip = VTOI(vp); 289 struct ufsvfs *ufsvfsp; 290 struct ulockfs *ulp = NULL; 291 int error = 0; 292 int intrans = 0; 293 294 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 295 TRACE_3(TR_FAC_UFS, TR_UFS_READ_START, 296 "ufs_read_start:vp %p uiop %p ioflag %x", 297 vp, uiop, ioflag); 298 299 /* 300 * Mandatory locking needs to be done before ufs_lockfs_begin() 301 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep. 302 */ 303 if (MANDLOCK(vp, ip->i_mode)) { 304 /* 305 * ufs_getattr ends up being called by chklock 306 */ 307 error = chklock(vp, FREAD, uiop->uio_loffset, 308 uiop->uio_resid, uiop->uio_fmode, ct); 309 if (error) 310 goto out; 311 } 312 313 ufsvfsp = ip->i_ufsvfs; 314 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK); 315 if (error) 316 goto out; 317 318 /* 319 * In the case that a directory is opened for reading as a file 320 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set. 321 * The locking order had to be changed to avoid a deadlock with 322 * an update taking place on that directory at the same time. 323 */ 324 if ((ip->i_mode & IFMT) == IFDIR) { 325 326 rw_enter(&ip->i_contents, RW_READER); 327 error = rdip(ip, uiop, ioflag, cr); 328 rw_exit(&ip->i_contents); 329 330 if (error) { 331 if (ulp) 332 ufs_lockfs_end(ulp); 333 goto out; 334 } 335 336 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 337 TRANS_ISTRANS(ufsvfsp)) { 338 rw_exit(&ip->i_rwlock); 339 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 340 error); 341 ASSERT(!error); 342 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 343 TOP_READ_SIZE); 344 rw_enter(&ip->i_rwlock, RW_READER); 345 } 346 } else { 347 /* 348 * Only transact reads to files opened for sync-read and 349 * sync-write on a file system that is not write locked. 350 * 351 * The ``not write locked'' check prevents problems with 352 * enabling/disabling logging on a busy file system. 
E.g., 353 * logging exists at the beginning of the read but does not 354 * at the end. 355 * 356 */ 357 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 358 TRANS_ISTRANS(ufsvfsp)) { 359 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 360 error); 361 ASSERT(!error); 362 intrans = 1; 363 } 364 365 rw_enter(&ip->i_contents, RW_READER); 366 error = rdip(ip, uiop, ioflag, cr); 367 rw_exit(&ip->i_contents); 368 369 if (intrans) { 370 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 371 TOP_READ_SIZE); 372 } 373 } 374 375 if (ulp) { 376 ufs_lockfs_end(ulp); 377 } 378 out: 379 380 TRACE_2(TR_FAC_UFS, TR_UFS_READ_END, 381 "ufs_read_end:vp %p error %d", vp, error); 382 return (error); 383 } 384 385 extern int ufs_HW; /* high water mark */ 386 extern int ufs_LW; /* low water mark */ 387 int ufs_WRITES = 1; /* XXX - enable/disable */ 388 int ufs_throttles = 0; /* throttling count */ 389 int ufs_allow_shared_writes = 1; /* directio shared writes */ 390 391 static int 392 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag) 393 { 394 int shared_write; 395 396 /* 397 * If the FDSYNC flag is set then ignore the global 398 * ufs_allow_shared_writes in this case. 399 */ 400 shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes; 401 402 /* 403 * Filter to determine if this request is suitable as a 404 * concurrent rewrite. This write must not allocate blocks 405 * by extending the file or filling in holes. No use trying 406 * through FSYNC descriptors as the inode will be synchronously 407 * updated after the write. The uio structure has not yet been 408 * checked for sanity, so assume nothing. 409 */ 410 return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) && 411 (uiop->uio_loffset >= (offset_t)0) && 412 (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) && 413 ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) && 414 !(ioflag & FSYNC) && !bmap_has_holes(ip) && 415 shared_write); 416 } 417 418 /*ARGSUSED*/ 419 static int 420 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr, 421 caller_context_t *ct) 422 { 423 struct inode *ip = VTOI(vp); 424 struct ufsvfs *ufsvfsp; 425 struct ulockfs *ulp; 426 int retry = 1; 427 int error, resv, resid = 0; 428 int directio_status; 429 int exclusive; 430 int rewriteflg; 431 long start_resid = uiop->uio_resid; 432 433 TRACE_3(TR_FAC_UFS, TR_UFS_WRITE_START, 434 "ufs_write_start:vp %p uiop %p ioflag %x", 435 vp, uiop, ioflag); 436 437 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 438 439 retry_mandlock: 440 /* 441 * Mandatory locking needs to be done before ufs_lockfs_begin() 442 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep. 443 * Check for forced unmounts normally done in ufs_lockfs_begin(). 444 */ 445 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 446 error = EIO; 447 goto out; 448 } 449 if (MANDLOCK(vp, ip->i_mode)) { 450 451 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 452 453 /* 454 * ufs_getattr ends up being called by chklock 455 */ 456 error = chklock(vp, FWRITE, uiop->uio_loffset, 457 uiop->uio_resid, uiop->uio_fmode, ct); 458 if (error) 459 goto out; 460 } 461 462 /* i_rwlock can change in chklock */ 463 exclusive = rw_write_held(&ip->i_rwlock); 464 rewriteflg = ufs_check_rewrite(ip, uiop, ioflag); 465 466 /* 467 * Check for fast-path special case of directio re-writes. 
468 */ 469 if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) && 470 !exclusive && rewriteflg) { 471 472 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 473 if (error) 474 goto out; 475 476 rw_enter(&ip->i_contents, RW_READER); 477 error = ufs_directio_write(ip, uiop, ioflag, 1, cr, 478 &directio_status); 479 if (directio_status == DIRECTIO_SUCCESS) { 480 uint_t i_flag_save; 481 482 if (start_resid != uiop->uio_resid) 483 error = 0; 484 /* 485 * Special treatment of access times for re-writes. 486 * If IMOD is not already set, then convert it 487 * to IMODACC for this operation. This defers 488 * entering a delta into the log until the inode 489 * is flushed. This mimics what is done for read 490 * operations and inode access time. 491 */ 492 mutex_enter(&ip->i_tlock); 493 i_flag_save = ip->i_flag; 494 ip->i_flag |= IUPD | ICHG; 495 ip->i_seq++; 496 ITIMES_NOLOCK(ip); 497 if ((i_flag_save & IMOD) == 0) { 498 ip->i_flag &= ~IMOD; 499 ip->i_flag |= IMODACC; 500 } 501 mutex_exit(&ip->i_tlock); 502 rw_exit(&ip->i_contents); 503 if (ulp) 504 ufs_lockfs_end(ulp); 505 goto out; 506 } 507 rw_exit(&ip->i_contents); 508 if (ulp) 509 ufs_lockfs_end(ulp); 510 } 511 512 if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) { 513 rw_exit(&ip->i_rwlock); 514 rw_enter(&ip->i_rwlock, RW_WRITER); 515 /* 516 * Mandatory locking could have been enabled 517 * after dropping the i_rwlock. 518 */ 519 if (MANDLOCK(vp, ip->i_mode)) 520 goto retry_mandlock; 521 } 522 523 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 524 if (error) 525 goto out; 526 527 /* 528 * Amount of log space needed for this write 529 */ 530 if (!rewriteflg || !(ioflag & FDSYNC)) 531 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid); 532 533 /* 534 * Throttle writes. 535 */ 536 if (ufs_WRITES && (ip->i_writes > ufs_HW)) { 537 mutex_enter(&ip->i_tlock); 538 while (ip->i_writes > ufs_HW) { 539 ufs_throttles++; 540 cv_wait(&ip->i_wrcv, &ip->i_tlock); 541 } 542 mutex_exit(&ip->i_tlock); 543 } 544 545 /* 546 * Enter Transaction 547 * 548 * If the write is a rewrite there is no need to open a transaction 549 * if the FDSYNC flag is set and not the FSYNC. In this case just 550 * set the IMODACC flag to modify do the update at a later time 551 * thus avoiding the overhead of the logging transaction that is 552 * not required. 553 */ 554 if (ioflag & (FSYNC|FDSYNC)) { 555 if (ulp) { 556 if (rewriteflg) { 557 uint_t i_flag_save; 558 559 rw_enter(&ip->i_contents, RW_READER); 560 mutex_enter(&ip->i_tlock); 561 i_flag_save = ip->i_flag; 562 ip->i_flag |= IUPD | ICHG; 563 ip->i_seq++; 564 ITIMES_NOLOCK(ip); 565 if ((i_flag_save & IMOD) == 0) { 566 ip->i_flag &= ~IMOD; 567 ip->i_flag |= IMODACC; 568 } 569 mutex_exit(&ip->i_tlock); 570 rw_exit(&ip->i_contents); 571 } else { 572 int terr = 0; 573 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, 574 terr); 575 ASSERT(!terr); 576 } 577 } 578 } else { 579 if (ulp) 580 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv); 581 } 582 583 /* 584 * Write the file 585 */ 586 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 587 rw_enter(&ip->i_contents, RW_WRITER); 588 if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) { 589 /* 590 * In append mode start at end of file. 591 */ 592 uiop->uio_loffset = ip->i_size; 593 } 594 595 /* 596 * Mild optimisation, don't call ufs_trans_write() unless we have to 597 * Also, suppress file system full messages if we will retry. 
	 */
	if (retry)
		ip->i_flag |= IQUIET;
	if (resid) {
		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
	} else {
		error = wrip(ip, uiop, ioflag, cr);
	}
	ip->i_flag &= ~IQUIET;

	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/*
	 * Leave Transaction
	 */
	if (ulp) {
		if (ioflag & (FSYNC|FDSYNC)) {
			if (!rewriteflg) {
				int terr = 0;

				TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
				    resv);
				if (error == 0)
					error = terr;
			}
		} else {
			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
		}
		ufs_lockfs_end(ulp);
	}
out:
	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		/*
		 * Any blocks tied up in pending deletes?
		 */
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto retry_mandlock;
	}

	if (error == ENOSPC && (start_resid != uiop->uio_resid))
		error = 0;

	TRACE_2(TR_FAC_UFS, TR_UFS_WRITE_END,
	    "ufs_write_end:vp %p error %d", vp, error);
	return (error);
}

/*
 * Don't cache write blocks to files with the sticky bit set.
 * Used to keep swap files from blowing the page cache on a server.
 */
int stickyhack = 1;

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
int	freebehind = 1;
int	smallfile = 0;
u_offset_t smallfile64 = 32 * 1024;

/*
 * While we should, in most cases, cache the pages for write, we
 * may also want to cache the pages for read as long as they are
 * frequently re-usable.
 *
 * If cache_read_ahead = 1, the pages for read will go to the tail
 * of the cache list when they are released, otherwise go to the head.
 */
int	cache_read_ahead = 0;

/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files.  It takes
 * longer to re-read multiple small files from disk than it does to read
 * one large file sequentially.  As system memory grows, customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else.  If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do.  If we win, we save a free and reclaim.
 *
 * Freeing a page at the tail vs. the head of the cachelist is a bet
 * that the page will survive until the next read.  It's also saying
 * that this page is more likely to be re-used than a page freed some
 * time ago and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]:
 *
 *	0 < offset < smallfile1		: pages are not freed.
 *	smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 *	smallfile2 < offset		: pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
 *
 * smallfile1 = (free memory / ncpu) / 1000
 * smallfile2 = (free memory / ncpu) / 10
 *
 * A few example values:
 *
 *	Free Mem (in Bytes)	[smallfile1; smallfile2]	[smallfile1; smallfile2]
 *				ncpus_online = 4		ncpus_online = 64
 *	------------------	-----------------------		-----------------------
 *	1G			[256K; 25M]			[32K; 1.5M]
 *	10G			[2.5M; 250M]			[156K; 15M]
 *	100G			[25M; 2.5G]			[1.5M; 150M]
 *
 */

#define	SMALLFILE1_D	1000
#define	SMALLFILE2_D	10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* lbolt value of when to recompute */
uint_t smallfile1_d = SMALLFILE1_D;
uint_t smallfile2_d = SMALLFILE2_D;

/*
 * wrip does the real work of write requests for ufs.
 */
int
wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	rlim64_t limit = uio->uio_llimit;
	u_offset_t off;
	u_offset_t old_i_size;
	struct fs *fs;
	struct vnode *vp;
	struct ufsvfs *ufsvfsp;
	caddr_t base;
	long start_resid = uio->uio_resid;	/* save starting resid */
	long premove_resid;			/* resid before uiomove() */
	uint_t flags;
	int newpage;
	int iupdat_flag, directio_status;
	int n, on, mapon;
	int error, pagecreate;
	int do_dqrwlock;		/* drop/reacquire vfs_dqrwlock */
	int32_t iblocks;
	int new_iblocks;

	/*
	 * ip->i_size is incremented before the uiomove
	 * is done on a write.  If the move fails (bad user
	 * address) reset ip->i_size.
	 * The better way would be to increment ip->i_size
	 * only if the uiomove succeeds.
	 */
	int i_size_changed = 0;
	o_mode_t type;
	int i_seq_needed = 0;

	vp = ITOV(ip);

	/*
	 * check for forced unmount - should not happen as
	 * the request passed the lockfs checks.
763 */ 764 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 765 return (EIO); 766 767 fs = ip->i_fs; 768 769 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 770 "ufs_wrip_start:vp %p", vp); 771 772 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 773 774 /* check for valid filetype */ 775 type = ip->i_mode & IFMT; 776 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 777 (type != IFLNK) && (type != IFSHAD)) { 778 return (EIO); 779 } 780 781 /* 782 * the actual limit of UFS file size 783 * is UFS_MAXOFFSET_T 784 */ 785 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 786 limit = MAXOFFSET_T; 787 788 if (uio->uio_loffset >= limit) { 789 proc_t *p = ttoproc(curthread); 790 791 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 792 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 793 794 mutex_enter(&p->p_lock); 795 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 796 p, RCA_UNSAFE_SIGINFO); 797 mutex_exit(&p->p_lock); 798 return (EFBIG); 799 } 800 801 /* 802 * if largefiles are disallowed, the limit is 803 * the pre-largefiles value of 2GB 804 */ 805 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 806 limit = MIN(UFS_MAXOFFSET_T, limit); 807 else 808 limit = MIN(MAXOFF32_T, limit); 809 810 if (uio->uio_loffset < (offset_t)0) { 811 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 812 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 813 return (EINVAL); 814 } 815 if (uio->uio_resid == 0) { 816 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 817 "ufs_wrip_end:vp %p error %d", vp, 0); 818 return (0); 819 } 820 821 if (uio->uio_loffset >= limit) 822 return (EFBIG); 823 824 ip->i_flag |= INOACC; /* don't update ref time in getpage */ 825 826 if (ioflag & (FSYNC|FDSYNC)) { 827 ip->i_flag |= ISYNC; 828 iupdat_flag = 1; 829 } 830 /* 831 * Try to go direct 832 */ 833 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 834 uio->uio_llimit = limit; 835 error = ufs_directio_write(ip, uio, ioflag, 0, cr, 836 &directio_status); 837 /* 838 * If ufs_directio wrote to the file or set the flags, 839 * we need to update i_seq, but it may be deferred. 840 */ 841 if (start_resid != uio->uio_resid || 842 (ip->i_flag & (ICHG|IUPD))) { 843 i_seq_needed = 1; 844 ip->i_flag |= ISEQ; 845 } 846 if (directio_status == DIRECTIO_SUCCESS) 847 goto out; 848 } 849 850 /* 851 * Behavior with respect to dropping/reacquiring vfs_dqrwlock: 852 * 853 * o shadow inodes: vfs_dqrwlock is not held at all 854 * o quota updates: vfs_dqrwlock is read or write held 855 * o other updates: vfs_dqrwlock is read held 856 * 857 * The first case is the only one where we do not hold 858 * vfs_dqrwlock at all while entering wrip(). 859 * We must make sure not to downgrade/drop vfs_dqrwlock if we 860 * have it as writer, i.e. if we are updating the quota inode. 861 * There is no potential deadlock scenario in this case as 862 * ufs_getpage() takes care of this and avoids reacquiring 863 * vfs_dqrwlock in that case. 864 * 865 * This check is done here since the above conditions do not change 866 * and we possibly loop below, so save a few cycles. 867 */ 868 if ((type == IFSHAD) || 869 (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) { 870 do_dqrwlock = 0; 871 } else { 872 do_dqrwlock = 1; 873 } 874 875 /* 876 * Large Files: We cast MAXBMASK to offset_t 877 * inorder to mask out the higher bits. Since offset_t 878 * is a signed value, the high order bit set in MAXBMASK 879 * value makes it do the right thing by having all bits 1 880 * in the higher word. May be removed for _SOLARIS64_. 
881 */ 882 883 fs = ip->i_fs; 884 do { 885 u_offset_t uoff = uio->uio_loffset; 886 off = uoff & (offset_t)MAXBMASK; 887 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 888 on = (int)blkoff(fs, uoff); 889 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid); 890 new_iblocks = 1; 891 892 if (type == IFREG && uoff + n >= limit) { 893 if (uoff >= limit) { 894 error = EFBIG; 895 goto out; 896 } 897 /* 898 * since uoff + n >= limit, 899 * therefore n >= limit - uoff, and n is an int 900 * so it is safe to cast it to an int 901 */ 902 n = (int)(limit - (rlim64_t)uoff); 903 } 904 if (uoff + n > ip->i_size) { 905 /* 906 * We are extending the length of the file. 907 * bmap is used so that we are sure that 908 * if we need to allocate new blocks, that it 909 * is done here before we up the file size. 910 */ 911 error = bmap_write(ip, uoff, (int)(on + n), 912 mapon == 0, NULL, cr); 913 /* 914 * bmap_write never drops i_contents so if 915 * the flags are set it changed the file. 916 */ 917 if (ip->i_flag & (ICHG|IUPD)) { 918 i_seq_needed = 1; 919 ip->i_flag |= ISEQ; 920 } 921 if (error) 922 break; 923 /* 924 * There is a window of vulnerability here. 925 * The sequence of operations: allocate file 926 * system blocks, uiomove the data into pages, 927 * and then update the size of the file in the 928 * inode, must happen atomically. However, due 929 * to current locking constraints, this can not 930 * be done. 931 */ 932 ASSERT(ip->i_writer == NULL); 933 ip->i_writer = curthread; 934 i_size_changed = 1; 935 /* 936 * If we are writing from the beginning of 937 * the mapping, we can just create the 938 * pages without having to read them. 939 */ 940 pagecreate = (mapon == 0); 941 } else if (n == MAXBSIZE) { 942 /* 943 * Going to do a whole mappings worth, 944 * so we can just create the pages w/o 945 * having to read them in. But before 946 * we do that, we need to make sure any 947 * needed blocks are allocated first. 948 */ 949 iblocks = ip->i_blocks; 950 error = bmap_write(ip, uoff, (int)(on + n), 951 BI_ALLOC_ONLY, NULL, cr); 952 /* 953 * bmap_write never drops i_contents so if 954 * the flags are set it changed the file. 955 */ 956 if (ip->i_flag & (ICHG|IUPD)) { 957 i_seq_needed = 1; 958 ip->i_flag |= ISEQ; 959 } 960 if (error) 961 break; 962 pagecreate = 1; 963 /* 964 * check if the new created page needed the 965 * allocation of new disk blocks. 966 */ 967 if (iblocks == ip->i_blocks) 968 new_iblocks = 0; /* no new blocks allocated */ 969 } else { 970 pagecreate = 0; 971 /* 972 * In sync mode flush the indirect blocks which 973 * may have been allocated and not written on 974 * disk. In above cases bmap_write will allocate 975 * in sync mode. 976 */ 977 if (ioflag & (FSYNC|FDSYNC)) { 978 error = ufs_indirblk_sync(ip, uoff); 979 if (error) 980 break; 981 } 982 } 983 984 /* 985 * At this point we can enter ufs_getpage() in one 986 * of two ways: 987 * 1) segmap_getmapflt() calls ufs_getpage() when the 988 * forcefault parameter is true (pagecreate == 0) 989 * 2) uiomove() causes a page fault. 990 * 991 * We have to drop the contents lock to prevent the VM 992 * system from trying to reaquire it in ufs_getpage() 993 * should the uiomove cause a pagefault. 994 * 995 * We have to drop the reader vfs_dqrwlock here as well. 
996 */ 997 rw_exit(&ip->i_contents); 998 if (do_dqrwlock) { 999 ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 1000 ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock))); 1001 rw_exit(&ufsvfsp->vfs_dqrwlock); 1002 } 1003 1004 newpage = 0; 1005 premove_resid = uio->uio_resid; 1006 if (vpm_enable) { 1007 /* 1008 * Copy data. If new pages are created, part of 1009 * the page that is not written will be initizliazed 1010 * with zeros. 1011 */ 1012 error = vpm_data_copy(vp, (off + mapon), (uint_t)n, 1013 uio, !pagecreate, &newpage, 0, S_WRITE); 1014 } else { 1015 1016 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1017 (uint_t)n, !pagecreate, S_WRITE); 1018 1019 /* 1020 * segmap_pagecreate() returns 1 if it calls 1021 * page_create_va() to allocate any pages. 1022 */ 1023 1024 if (pagecreate) 1025 newpage = segmap_pagecreate(segkmap, base, 1026 (size_t)n, 0); 1027 1028 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 1029 } 1030 1031 /* 1032 * If "newpage" is set, then a new page was created and it 1033 * does not contain valid data, so it needs to be initialized 1034 * at this point. 1035 * Otherwise the page contains old data, which was overwritten 1036 * partially or as a whole in uiomove. 1037 * If there is only one iovec structure within uio, then 1038 * on error uiomove will not be able to update uio->uio_loffset 1039 * and we would zero the whole page here! 1040 * 1041 * If uiomove fails because of an error, the old valid data 1042 * is kept instead of filling the rest of the page with zero's. 1043 */ 1044 if (!vpm_enable && newpage && 1045 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 1046 /* 1047 * We created pages w/o initializing them completely, 1048 * thus we need to zero the part that wasn't set up. 1049 * This happens on most EOF write cases and if 1050 * we had some sort of error during the uiomove. 1051 */ 1052 int nzero, nmoved; 1053 1054 nmoved = (int)(uio->uio_loffset - (off + mapon)); 1055 ASSERT(nmoved >= 0 && nmoved <= n); 1056 nzero = roundup(on + n, PAGESIZE) - nmoved; 1057 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 1058 (void) kzero(base + mapon + nmoved, (uint_t)nzero); 1059 } 1060 1061 /* 1062 * Unlock the pages allocated by page_create_va() 1063 * in segmap_pagecreate() 1064 */ 1065 if (!vpm_enable && newpage) 1066 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 1067 1068 /* 1069 * If the size of the file changed, then update the 1070 * size field in the inode now. This can't be done 1071 * before the call to segmap_pageunlock or there is 1072 * a potential deadlock with callers to ufs_putpage(). 1073 * They will be holding i_contents and trying to lock 1074 * a page, while this thread is holding a page locked 1075 * and trying to acquire i_contents. 1076 */ 1077 if (i_size_changed) { 1078 rw_enter(&ip->i_contents, RW_WRITER); 1079 old_i_size = ip->i_size; 1080 UFS_SET_ISIZE(uoff + n, ip); 1081 TRANS_INODE(ufsvfsp, ip); 1082 /* 1083 * file has grown larger than 2GB. Set flag 1084 * in superblock to indicate this, if it 1085 * is not already set. 
1086 */ 1087 if ((ip->i_size > MAXOFF32_T) && 1088 !(fs->fs_flags & FSLARGEFILES)) { 1089 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1090 mutex_enter(&ufsvfsp->vfs_lock); 1091 fs->fs_flags |= FSLARGEFILES; 1092 ufs_sbwrite(ufsvfsp); 1093 mutex_exit(&ufsvfsp->vfs_lock); 1094 } 1095 mutex_enter(&ip->i_tlock); 1096 ip->i_writer = NULL; 1097 cv_broadcast(&ip->i_wrcv); 1098 mutex_exit(&ip->i_tlock); 1099 rw_exit(&ip->i_contents); 1100 } 1101 1102 if (error) { 1103 /* 1104 * If we failed on a write, we may have already 1105 * allocated file blocks as well as pages. It's 1106 * hard to undo the block allocation, but we must 1107 * be sure to invalidate any pages that may have 1108 * been allocated. 1109 * 1110 * If the page was created without initialization 1111 * then we must check if it should be possible 1112 * to destroy the new page and to keep the old data 1113 * on the disk. 1114 * 1115 * It is possible to destroy the page without 1116 * having to write back its contents only when 1117 * - the size of the file keeps unchanged 1118 * - bmap_write() did not allocate new disk blocks 1119 * it is possible to create big files using "seek" and 1120 * write to the end of the file. A "write" to a 1121 * position before the end of the file would not 1122 * change the size of the file but it would allocate 1123 * new disk blocks. 1124 * - uiomove intended to overwrite the whole page. 1125 * - a new page was created (newpage == 1). 1126 */ 1127 1128 if (i_size_changed == 0 && new_iblocks == 0 && 1129 newpage) { 1130 1131 /* unwind what uiomove eventually last did */ 1132 uio->uio_resid = premove_resid; 1133 1134 /* 1135 * destroy the page, do not write ambiguous 1136 * data to the disk. 1137 */ 1138 flags = SM_DESTROY; 1139 } else { 1140 /* 1141 * write the page back to the disk, if dirty, 1142 * and remove the page from the cache. 1143 */ 1144 flags = SM_INVAL; 1145 } 1146 1147 if (vpm_enable) { 1148 /* 1149 * Flush pages. 1150 */ 1151 (void) vpm_sync_pages(vp, off, n, flags); 1152 } else { 1153 (void) segmap_release(segkmap, base, flags); 1154 } 1155 } else { 1156 flags = 0; 1157 /* 1158 * Force write back for synchronous write cases. 1159 */ 1160 if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) { 1161 /* 1162 * If the sticky bit is set but the 1163 * execute bit is not set, we do a 1164 * synchronous write back and free 1165 * the page when done. We set up swap 1166 * files to be handled this way to 1167 * prevent servers from keeping around 1168 * the client's swap pages too long. 1169 * XXX - there ought to be a better way. 1170 */ 1171 if (IS_SWAPVP(vp)) { 1172 flags = SM_WRITE | SM_FREE | 1173 SM_DONTNEED; 1174 iupdat_flag = 0; 1175 } else { 1176 flags = SM_WRITE; 1177 } 1178 } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 1179 /* 1180 * Have written a whole block. 1181 * Start an asynchronous write and 1182 * mark the buffer to indicate that 1183 * it won't be needed again soon. 1184 */ 1185 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 1186 } 1187 if (vpm_enable) { 1188 /* 1189 * Flush pages. 1190 */ 1191 error = vpm_sync_pages(vp, off, n, flags); 1192 } else { 1193 error = segmap_release(segkmap, base, flags); 1194 } 1195 /* 1196 * If the operation failed and is synchronous, 1197 * then we need to unwind what uiomove() last 1198 * did so we can potentially return an error to 1199 * the caller. If this write operation was 1200 * done in two pieces and the first succeeded, 1201 * then we won't return an error for the second 1202 * piece that failed. 
However, we only want to 1203 * return a resid value that reflects what was 1204 * really done. 1205 * 1206 * Failures for non-synchronous operations can 1207 * be ignored since the page subsystem will 1208 * retry the operation until it succeeds or the 1209 * file system is unmounted. 1210 */ 1211 if (error) { 1212 if ((ioflag & (FSYNC | FDSYNC)) || 1213 type == IFDIR) { 1214 uio->uio_resid = premove_resid; 1215 } else { 1216 error = 0; 1217 } 1218 } 1219 } 1220 1221 /* 1222 * Re-acquire contents lock. 1223 * If it was dropped, reacquire reader vfs_dqrwlock as well. 1224 */ 1225 if (do_dqrwlock) 1226 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1227 rw_enter(&ip->i_contents, RW_WRITER); 1228 1229 /* 1230 * If the uiomove() failed or if a synchronous 1231 * page push failed, fix up i_size. 1232 */ 1233 if (error) { 1234 if (i_size_changed) { 1235 /* 1236 * The uiomove failed, and we 1237 * allocated blocks,so get rid 1238 * of them. 1239 */ 1240 (void) ufs_itrunc(ip, old_i_size, 0, cr); 1241 } 1242 } else { 1243 /* 1244 * XXX - Can this be out of the loop? 1245 */ 1246 ip->i_flag |= IUPD | ICHG; 1247 /* 1248 * Only do one increase of i_seq for multiple 1249 * pieces. Because we drop locks, record 1250 * the fact that we changed the timestamp and 1251 * are deferring the increase in case another thread 1252 * pushes our timestamp update. 1253 */ 1254 i_seq_needed = 1; 1255 ip->i_flag |= ISEQ; 1256 if (i_size_changed) 1257 ip->i_flag |= IATTCHG; 1258 if ((ip->i_mode & (IEXEC | (IEXEC >> 3) | 1259 (IEXEC >> 6))) != 0 && 1260 (ip->i_mode & (ISUID | ISGID)) != 0 && 1261 secpolicy_vnode_setid_retain(cr, 1262 (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) { 1263 /* 1264 * Clear Set-UID & Set-GID bits on 1265 * successful write if not privileged 1266 * and at least one of the execute bits 1267 * is set. If we always clear Set-GID, 1268 * mandatory file and record locking is 1269 * unuseable. 1270 */ 1271 ip->i_mode &= ~(ISUID | ISGID); 1272 } 1273 } 1274 /* 1275 * In the case the FDSYNC flag is set and this is a 1276 * "rewrite" we won't log a delta. 1277 * The FSYNC flag overrides all cases. 1278 */ 1279 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) { 1280 TRANS_INODE(ufsvfsp, ip); 1281 } 1282 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1283 1284 out: 1285 /* 1286 * Make sure i_seq is increased at least once per write 1287 */ 1288 if (i_seq_needed) { 1289 ip->i_seq++; 1290 ip->i_flag &= ~ISEQ; /* no longer deferred */ 1291 } 1292 1293 /* 1294 * Inode is updated according to this table - 1295 * 1296 * FSYNC FDSYNC(posix.4) 1297 * -------------------------- 1298 * always@ IATTCHG|IBDWRITE 1299 * 1300 * @ - If we are doing synchronous write the only time we should 1301 * not be sync'ing the ip here is if we have the stickyhack 1302 * activated, the file is marked with the sticky bit and 1303 * no exec bit, the file length has not been changed and 1304 * no new blocks have been allocated during this write. 1305 */ 1306 1307 if ((ip->i_flag & ISYNC) != 0) { 1308 /* 1309 * we have eliminated nosync 1310 */ 1311 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 1312 ((ioflag & FSYNC) && iupdat_flag)) { 1313 ufs_iupdat(ip, 1); 1314 } 1315 } 1316 1317 /* 1318 * If we've already done a partial-write, terminate 1319 * the write but return no error unless the error is ENOSPC 1320 * because the caller can detect this and free resources and 1321 * try again. 
1322 */ 1323 if ((start_resid != uio->uio_resid) && (error != ENOSPC)) 1324 error = 0; 1325 1326 ip->i_flag &= ~(INOACC | ISYNC); 1327 ITIMES_NOLOCK(ip); 1328 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1329 "ufs_wrip_end:vp %p error %d", vp, error); 1330 return (error); 1331 } 1332 1333 /* 1334 * rdip does the real work of read requests for ufs. 1335 */ 1336 int 1337 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr) 1338 { 1339 u_offset_t off; 1340 caddr_t base; 1341 struct fs *fs; 1342 struct ufsvfs *ufsvfsp; 1343 struct vnode *vp; 1344 long oresid = uio->uio_resid; 1345 u_offset_t n, on, mapon; 1346 int error = 0; 1347 int doupdate = 1; 1348 uint_t flags; 1349 int dofree, directio_status; 1350 krw_t rwtype; 1351 o_mode_t type; 1352 1353 vp = ITOV(ip); 1354 1355 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 1356 "ufs_rdip_start:vp %p", vp); 1357 1358 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 1359 1360 ufsvfsp = ip->i_ufsvfs; 1361 1362 if (ufsvfsp == NULL) 1363 return (EIO); 1364 1365 fs = ufsvfsp->vfs_fs; 1366 1367 /* check for valid filetype */ 1368 type = ip->i_mode & IFMT; 1369 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 1370 (type != IFLNK) && (type != IFSHAD)) { 1371 return (EIO); 1372 } 1373 1374 if (uio->uio_loffset > UFS_MAXOFFSET_T) { 1375 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1376 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1377 error = 0; 1378 goto out; 1379 } 1380 if (uio->uio_loffset < (offset_t)0) { 1381 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1382 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1383 return (EINVAL); 1384 } 1385 if (uio->uio_resid == 0) { 1386 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1387 "ufs_rdip_end:vp %p error %d", vp, 0); 1388 return (0); 1389 } 1390 1391 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) && 1392 (!ufsvfsp->vfs_noatime)) { 1393 mutex_enter(&ip->i_tlock); 1394 ip->i_flag |= IACC; 1395 mutex_exit(&ip->i_tlock); 1396 } 1397 /* 1398 * Try to go direct 1399 */ 1400 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 1401 error = ufs_directio_read(ip, uio, cr, &directio_status); 1402 if (directio_status == DIRECTIO_SUCCESS) 1403 goto out; 1404 } 1405 1406 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 1407 1408 do { 1409 offset_t diff; 1410 u_offset_t uoff = uio->uio_loffset; 1411 off = uoff & (offset_t)MAXBMASK; 1412 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET); 1413 on = (u_offset_t)blkoff(fs, uoff); 1414 n = MIN((u_offset_t)fs->fs_bsize - on, 1415 (u_offset_t)uio->uio_resid); 1416 1417 diff = ip->i_size - uoff; 1418 1419 if (diff <= (offset_t)0) { 1420 error = 0; 1421 goto out; 1422 } 1423 if (diff < (offset_t)n) 1424 n = (int)diff; 1425 1426 /* 1427 * We update smallfile2 and smallfile1 at most every second. 
1428 */ 1429 if (lbolt >= smallfile_update) { 1430 uint64_t percpufreeb; 1431 if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D; 1432 if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D; 1433 percpufreeb = ptob((uint64_t)freemem) / ncpus_online; 1434 smallfile1 = percpufreeb / smallfile1_d; 1435 smallfile2 = percpufreeb / smallfile2_d; 1436 smallfile1 = MAX(smallfile1, smallfile); 1437 smallfile1 = MAX(smallfile1, smallfile64); 1438 smallfile2 = MAX(smallfile1, smallfile2); 1439 smallfile_update = lbolt + hz; 1440 } 1441 1442 dofree = freebehind && 1443 ip->i_nextr == (off & PAGEMASK) && off > smallfile1; 1444 1445 /* 1446 * At this point we can enter ufs_getpage() in one of two 1447 * ways: 1448 * 1) segmap_getmapflt() calls ufs_getpage() when the 1449 * forcefault parameter is true (value of 1 is passed) 1450 * 2) uiomove() causes a page fault. 1451 * 1452 * We cannot hold onto an i_contents reader lock without 1453 * risking deadlock in ufs_getpage() so drop a reader lock. 1454 * The ufs_getpage() dolock logic already allows for a 1455 * thread holding i_contents as writer to work properly 1456 * so we keep a writer lock. 1457 */ 1458 if (rwtype == RW_READER) 1459 rw_exit(&ip->i_contents); 1460 1461 if (vpm_enable) { 1462 /* 1463 * Copy data. 1464 */ 1465 error = vpm_data_copy(vp, (off + mapon), (uint_t)n, 1466 uio, 1, NULL, 0, S_READ); 1467 } else { 1468 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1469 (uint_t)n, 1, S_READ); 1470 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 1471 } 1472 1473 flags = 0; 1474 if (!error) { 1475 /* 1476 * If reading sequential we won't need this 1477 * buffer again soon. For offsets in range 1478 * [smallfile1, smallfile2] release the pages 1479 * at the tail of the cache list, larger 1480 * offsets are released at the head. 1481 */ 1482 if (dofree) { 1483 flags = SM_FREE | SM_ASYNC; 1484 if ((cache_read_ahead == 0) && 1485 (off > smallfile2)) 1486 flags |= SM_DONTNEED; 1487 } 1488 /* 1489 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 1490 * we want to make sure that the page which has 1491 * been read, is written on disk if it is dirty. 1492 * And corresponding indirect blocks should also 1493 * be flushed out. 1494 */ 1495 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 1496 flags &= ~SM_ASYNC; 1497 flags |= SM_WRITE; 1498 } 1499 if (vpm_enable) { 1500 error = vpm_sync_pages(vp, off, n, flags); 1501 } else { 1502 error = segmap_release(segkmap, base, flags); 1503 } 1504 } else { 1505 if (vpm_enable) { 1506 (void) vpm_sync_pages(vp, off, n, flags); 1507 } else { 1508 (void) segmap_release(segkmap, base, flags); 1509 } 1510 } 1511 1512 if (rwtype == RW_READER) 1513 rw_enter(&ip->i_contents, rwtype); 1514 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1515 out: 1516 /* 1517 * Inode is updated according to this table if FRSYNC is set. 1518 * 1519 * FSYNC FDSYNC(posix.4) 1520 * -------------------------- 1521 * always IATTCHG|IBDWRITE 1522 */ 1523 /* 1524 * The inode is not updated if we're logging and the inode is a 1525 * directory with FRSYNC, FSYNC and FDSYNC flags set. 1526 */ 1527 if (ioflag & FRSYNC) { 1528 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) { 1529 doupdate = 0; 1530 } 1531 if (doupdate) { 1532 if ((ioflag & FSYNC) || 1533 ((ioflag & FDSYNC) && 1534 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 1535 ufs_iupdat(ip, 1); 1536 } 1537 } 1538 } 1539 /* 1540 * If we've already done a partial read, terminate 1541 * the read but return no error. 
1542 */ 1543 if (oresid != uio->uio_resid) 1544 error = 0; 1545 ITIMES(ip); 1546 1547 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1548 "ufs_rdip_end:vp %p error %d", vp, error); 1549 return (error); 1550 } 1551 1552 /* ARGSUSED */ 1553 static int 1554 ufs_ioctl( 1555 struct vnode *vp, 1556 int cmd, 1557 intptr_t arg, 1558 int flag, 1559 struct cred *cr, 1560 int *rvalp) 1561 { 1562 struct lockfs lockfs, lockfs_out; 1563 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 1564 char *comment, *original_comment; 1565 struct fs *fs; 1566 struct ulockfs *ulp; 1567 offset_t off; 1568 extern int maxphys; 1569 int error; 1570 int issync; 1571 int trans_size; 1572 1573 1574 /* 1575 * forcibly unmounted 1576 */ 1577 if (ufsvfsp == NULL) { 1578 return (EIO); 1579 } 1580 1581 fs = ufsvfsp->vfs_fs; 1582 1583 if (cmd == Q_QUOTACTL) { 1584 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK); 1585 if (error) 1586 return (error); 1587 1588 if (ulp) { 1589 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, 1590 TOP_SETQUOTA_SIZE(fs)); 1591 } 1592 1593 error = quotactl(vp, arg, flag, cr); 1594 1595 if (ulp) { 1596 TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, 1597 TOP_SETQUOTA_SIZE(fs)); 1598 ufs_lockfs_end(ulp); 1599 } 1600 return (error); 1601 } 1602 1603 switch (cmd) { 1604 case _FIOLFS: 1605 /* 1606 * file system locking 1607 */ 1608 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1609 return (EPERM); 1610 1611 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1612 if (copyin((caddr_t)arg, &lockfs, 1613 sizeof (struct lockfs))) 1614 return (EFAULT); 1615 } 1616 #ifdef _SYSCALL32_IMPL 1617 else { 1618 struct lockfs32 lockfs32; 1619 /* Translate ILP32 lockfs to LP64 lockfs */ 1620 if (copyin((caddr_t)arg, &lockfs32, 1621 sizeof (struct lockfs32))) 1622 return (EFAULT); 1623 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1624 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1625 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1626 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1627 lockfs.lf_comment = 1628 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1629 } 1630 #endif /* _SYSCALL32_IMPL */ 1631 1632 if (lockfs.lf_comlen) { 1633 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) 1634 return (ENAMETOOLONG); 1635 comment = kmem_alloc(lockfs.lf_comlen, 1636 KM_SLEEP); 1637 if (copyin(lockfs.lf_comment, comment, 1638 lockfs.lf_comlen)) { 1639 kmem_free(comment, lockfs.lf_comlen); 1640 return (EFAULT); 1641 } 1642 original_comment = lockfs.lf_comment; 1643 lockfs.lf_comment = comment; 1644 } 1645 if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) { 1646 lockfs.lf_comment = original_comment; 1647 1648 if ((flag & DATAMODEL_MASK) == 1649 DATAMODEL_NATIVE) { 1650 (void) copyout(&lockfs, (caddr_t)arg, 1651 sizeof (struct lockfs)); 1652 } 1653 #ifdef _SYSCALL32_IMPL 1654 else { 1655 struct lockfs32 lockfs32; 1656 /* Translate LP64 to ILP32 lockfs */ 1657 lockfs32.lf_lock = 1658 (uint32_t)lockfs.lf_lock; 1659 lockfs32.lf_flags = 1660 (uint32_t)lockfs.lf_flags; 1661 lockfs32.lf_key = 1662 (uint32_t)lockfs.lf_key; 1663 lockfs32.lf_comlen = 1664 (uint32_t)lockfs.lf_comlen; 1665 lockfs32.lf_comment = 1666 (uint32_t)(uintptr_t)lockfs.lf_comment; 1667 (void) copyout(&lockfs32, (caddr_t)arg, 1668 sizeof (struct lockfs32)); 1669 } 1670 #endif /* _SYSCALL32_IMPL */ 1671 1672 } else { 1673 if (lockfs.lf_comlen) 1674 kmem_free(comment, lockfs.lf_comlen); 1675 } 1676 return (error); 1677 1678 case _FIOLFSS: 1679 /* 1680 * get file system locking status 1681 */ 1682 1683 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1684 if (copyin((caddr_t)arg, &lockfs, 1685 sizeof (struct 
lockfs))) 1686 return (EFAULT); 1687 } 1688 #ifdef _SYSCALL32_IMPL 1689 else { 1690 struct lockfs32 lockfs32; 1691 /* Translate ILP32 lockfs to LP64 lockfs */ 1692 if (copyin((caddr_t)arg, &lockfs32, 1693 sizeof (struct lockfs32))) 1694 return (EFAULT); 1695 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1696 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1697 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1698 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1699 lockfs.lf_comment = 1700 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1701 } 1702 #endif /* _SYSCALL32_IMPL */ 1703 1704 if (error = ufs_fiolfss(vp, &lockfs_out)) 1705 return (error); 1706 lockfs.lf_lock = lockfs_out.lf_lock; 1707 lockfs.lf_key = lockfs_out.lf_key; 1708 lockfs.lf_flags = lockfs_out.lf_flags; 1709 lockfs.lf_comlen = MIN(lockfs.lf_comlen, 1710 lockfs_out.lf_comlen); 1711 1712 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1713 if (copyout(&lockfs, (caddr_t)arg, 1714 sizeof (struct lockfs))) 1715 return (EFAULT); 1716 } 1717 #ifdef _SYSCALL32_IMPL 1718 else { 1719 /* Translate LP64 to ILP32 lockfs */ 1720 struct lockfs32 lockfs32; 1721 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock; 1722 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags; 1723 lockfs32.lf_key = (uint32_t)lockfs.lf_key; 1724 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen; 1725 lockfs32.lf_comment = 1726 (uint32_t)(uintptr_t)lockfs.lf_comment; 1727 if (copyout(&lockfs32, (caddr_t)arg, 1728 sizeof (struct lockfs32))) 1729 return (EFAULT); 1730 } 1731 #endif /* _SYSCALL32_IMPL */ 1732 1733 if (lockfs.lf_comlen && 1734 lockfs.lf_comment && lockfs_out.lf_comment) 1735 if (copyout(lockfs_out.lf_comment, 1736 lockfs.lf_comment, 1737 lockfs.lf_comlen)) 1738 return (EFAULT); 1739 return (0); 1740 1741 case _FIOSATIME: 1742 /* 1743 * set access time 1744 */ 1745 1746 /* 1747 * if mounted w/o atime, return quietly. 1748 * I briefly thought about returning ENOSYS, but 1749 * figured that most apps would consider this fatal 1750 * but the idea is to make this as seamless as poss. 
1751 */ 1752 if (ufsvfsp->vfs_noatime) 1753 return (0); 1754 1755 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1756 ULOCKFS_SETATTR_MASK); 1757 if (error) 1758 return (error); 1759 1760 if (ulp) { 1761 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp)); 1762 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 1763 TOP_SETATTR, trans_size); 1764 } 1765 1766 error = ufs_fiosatime(vp, (struct timeval *)arg, 1767 flag, cr); 1768 1769 if (ulp) { 1770 TRANS_END_CSYNC(ufsvfsp, error, issync, 1771 TOP_SETATTR, trans_size); 1772 ufs_lockfs_end(ulp); 1773 } 1774 return (error); 1775 1776 case _FIOSDIO: 1777 /* 1778 * set delayed-io 1779 */ 1780 return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr)); 1781 1782 case _FIOGDIO: 1783 /* 1784 * get delayed-io 1785 */ 1786 return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr)); 1787 1788 case _FIOIO: 1789 /* 1790 * inode open 1791 */ 1792 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1793 ULOCKFS_VGET_MASK); 1794 if (error) 1795 return (error); 1796 1797 error = ufs_fioio(vp, (struct fioio *)arg, flag, cr); 1798 1799 if (ulp) { 1800 ufs_lockfs_end(ulp); 1801 } 1802 return (error); 1803 1804 case _FIOFFS: 1805 /* 1806 * file system flush (push w/invalidate) 1807 */ 1808 if ((caddr_t)arg != NULL) 1809 return (EINVAL); 1810 return (ufs_fioffs(vp, NULL, cr)); 1811 1812 case _FIOISBUSY: 1813 /* 1814 * Contract-private interface for Legato 1815 * Purge this vnode from the DNLC and decide 1816 * if this vnode is busy (*arg == 1) or not 1817 * (*arg == 0) 1818 */ 1819 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1820 return (EPERM); 1821 error = ufs_fioisbusy(vp, (int *)arg, cr); 1822 return (error); 1823 1824 case _FIODIRECTIO: 1825 return (ufs_fiodirectio(vp, (int)arg, cr)); 1826 1827 case _FIOTUNE: 1828 /* 1829 * Tune the file system (aka setting fs attributes) 1830 */ 1831 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1832 ULOCKFS_SETATTR_MASK); 1833 if (error) 1834 return (error); 1835 1836 error = ufs_fiotune(vp, (struct fiotune *)arg, cr); 1837 1838 if (ulp) 1839 ufs_lockfs_end(ulp); 1840 return (error); 1841 1842 case _FIOLOGENABLE: 1843 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1844 return (EPERM); 1845 return (ufs_fiologenable(vp, (void *)arg, cr, flag)); 1846 1847 case _FIOLOGDISABLE: 1848 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1849 return (EPERM); 1850 return (ufs_fiologdisable(vp, (void *)arg, cr, flag)); 1851 1852 case _FIOISLOG: 1853 return (ufs_fioislog(vp, (void *)arg, cr, flag)); 1854 1855 case _FIOSNAPSHOTCREATE_MULTI: 1856 { 1857 struct fiosnapcreate_multi fc, *fcp; 1858 size_t fcm_size; 1859 1860 if (copyin((void *)arg, &fc, sizeof (fc))) 1861 return (EFAULT); 1862 if (fc.backfilecount > MAX_BACKFILE_COUNT) 1863 return (EINVAL); 1864 fcm_size = sizeof (struct fiosnapcreate_multi) + 1865 (fc.backfilecount - 1) * sizeof (int); 1866 fcp = (struct fiosnapcreate_multi *) 1867 kmem_alloc(fcm_size, KM_SLEEP); 1868 if (copyin((void *)arg, fcp, fcm_size)) { 1869 kmem_free(fcp, fcm_size); 1870 return (EFAULT); 1871 } 1872 error = ufs_snap_create(vp, fcp, cr); 1873 if (!error && copyout(fcp, (void *)arg, fcm_size)) 1874 error = EFAULT; 1875 kmem_free(fcp, fcm_size); 1876 return (error); 1877 } 1878 1879 case _FIOSNAPSHOTDELETE: 1880 { 1881 struct fiosnapdelete fc; 1882 1883 if (copyin((void *)arg, &fc, sizeof (fc))) 1884 return (EFAULT); 1885 error = ufs_snap_delete(vp, &fc, cr); 1886 if (!error && copyout(&fc, (void *)arg, sizeof (fc))) 1887 error = EFAULT; 1888 return (error); 1889 } 1890 1891 case _FIOGETSUPERBLOCK: 1892 if (copyout(fs, (void *)arg, SBSIZE)) 1893 return 
(EFAULT); 1894 return (0); 1895 1896 case _FIOGETMAXPHYS: 1897 if (copyout(&maxphys, (void *)arg, sizeof (maxphys))) 1898 return (EFAULT); 1899 return (0); 1900 1901 /* 1902 * The following 3 ioctls are for TSufs support 1903 * although could potentially be used elsewhere 1904 */ 1905 case _FIO_SET_LUFS_DEBUG: 1906 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1907 return (EPERM); 1908 lufs_debug = (uint32_t)arg; 1909 return (0); 1910 1911 case _FIO_SET_LUFS_ERROR: 1912 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1913 return (EPERM); 1914 TRANS_SETERROR(ufsvfsp); 1915 return (0); 1916 1917 case _FIO_GET_TOP_STATS: 1918 { 1919 fio_lufs_stats_t *ls; 1920 ml_unit_t *ul = ufsvfsp->vfs_log; 1921 1922 ls = kmem_zalloc(sizeof (*ls), KM_SLEEP); 1923 ls->ls_debug = ul->un_debug; /* return debug value */ 1924 /* Copy stucture if statistics are being kept */ 1925 if (ul->un_logmap->mtm_tops) { 1926 ls->ls_topstats = *(ul->un_logmap->mtm_tops); 1927 } 1928 error = 0; 1929 if (copyout(ls, (void *)arg, sizeof (*ls))) 1930 error = EFAULT; 1931 kmem_free(ls, sizeof (*ls)); 1932 return (error); 1933 } 1934 1935 case _FIO_SEEK_DATA: 1936 case _FIO_SEEK_HOLE: 1937 if (ddi_copyin((void *)arg, &off, sizeof (off), flag)) 1938 return (EFAULT); 1939 /* offset paramater is in/out */ 1940 error = ufs_fio_holey(vp, cmd, &off); 1941 if (error) 1942 return (error); 1943 if (ddi_copyout(&off, (void *)arg, sizeof (off), flag)) 1944 return (EFAULT); 1945 return (0); 1946 1947 default: 1948 return (ENOTTY); 1949 } 1950 } 1951 1952 /* ARGSUSED */ 1953 static int 1954 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags, 1955 struct cred *cr) 1956 { 1957 struct inode *ip = VTOI(vp); 1958 struct ufsvfs *ufsvfsp; 1959 int err; 1960 1961 TRACE_2(TR_FAC_UFS, TR_UFS_GETATTR_START, 1962 "ufs_getattr_start:vp %p flags %x", vp, flags); 1963 1964 if (vap->va_mask == AT_SIZE) { 1965 /* 1966 * for performance, if only the size is requested don't bother 1967 * with anything else. 1968 */ 1969 UFS_GET_ISIZE(&vap->va_size, ip); 1970 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, 1971 "ufs_getattr_end:vp %p", vp); 1972 return (0); 1973 } 1974 1975 /* 1976 * inlined lockfs checks 1977 */ 1978 ufsvfsp = ip->i_ufsvfs; 1979 if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) { 1980 err = EIO; 1981 goto out; 1982 } 1983 1984 rw_enter(&ip->i_contents, RW_READER); 1985 /* 1986 * Return all the attributes. This should be refined so 1987 * that it only returns what's asked for. 1988 */ 1989 1990 /* 1991 * Copy from inode table. 1992 */ 1993 vap->va_type = vp->v_type; 1994 vap->va_mode = ip->i_mode & MODEMASK; 1995 /* 1996 * If there is an ACL and there is a mask entry, then do the 1997 * extra work that completes the equivalent of an acltomode(3) 1998 * call. According to POSIX P1003.1e, the acl mask should be 1999 * returned in the group permissions field. 2000 * 2001 * - start with the original permission and mode bits (from above) 2002 * - clear the group owner bits 2003 * - add in the mask bits. 
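 * For example (illustrative values only): with an on-disk mode of 0754
 * and an ACL mask entry of rw- (6), the group bits are first cleared
 * and the mask bits are then shifted into the group field:
 *
 *	va_mode = (0754 & ~0070) | (06 << 3);	(yields 0764)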
2004 */ 2005 if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) { 2006 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3); 2007 vap->va_mode |= 2008 (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3; 2009 } 2010 vap->va_uid = ip->i_uid; 2011 vap->va_gid = ip->i_gid; 2012 vap->va_fsid = ip->i_dev; 2013 vap->va_nodeid = (ino64_t)ip->i_number; 2014 vap->va_nlink = ip->i_nlink; 2015 vap->va_size = ip->i_size; 2016 if (vp->v_type == VCHR || vp->v_type == VBLK) 2017 vap->va_rdev = ip->i_rdev; 2018 else 2019 vap->va_rdev = 0; /* not a b/c spec. */ 2020 mutex_enter(&ip->i_tlock); 2021 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 2022 vap->va_seq = ip->i_seq; 2023 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 2024 vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000; 2025 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 2026 vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000; 2027 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 2028 vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000; 2029 mutex_exit(&ip->i_tlock); 2030 2031 switch (ip->i_mode & IFMT) { 2032 2033 case IFBLK: 2034 vap->va_blksize = MAXBSIZE; /* was BLKDEV_IOSIZE */ 2035 break; 2036 2037 case IFCHR: 2038 vap->va_blksize = MAXBSIZE; 2039 break; 2040 2041 default: 2042 vap->va_blksize = ip->i_fs->fs_bsize; 2043 break; 2044 } 2045 vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks; 2046 rw_exit(&ip->i_contents); 2047 err = 0; 2048 2049 out: 2050 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, "ufs_getattr_end:vp %p", vp); 2051 2052 return (err); 2053 } 2054 2055 /*ARGSUSED4*/ 2056 static int 2057 ufs_setattr( 2058 struct vnode *vp, 2059 struct vattr *vap, 2060 int flags, 2061 struct cred *cr, 2062 caller_context_t *ct) 2063 { 2064 struct inode *ip = VTOI(vp); 2065 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2066 struct fs *fs; 2067 struct ulockfs *ulp; 2068 char *errmsg1; 2069 char *errmsg2; 2070 long blocks; 2071 long int mask = vap->va_mask; 2072 size_t len1, len2; 2073 int issync; 2074 int trans_size; 2075 int dotrans; 2076 int dorwlock; 2077 int error; 2078 int owner_change; 2079 int dodqlock; 2080 timestruc_t now; 2081 vattr_t oldva; 2082 int retry = 1; 2083 int indeadlock; 2084 2085 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_START, 2086 "ufs_setattr_start:vp %p flags %x", vp, flags); 2087 2088 /* 2089 * Cannot set these attributes. 2090 */ 2091 if (mask & AT_NOSET) { 2092 error = EINVAL; 2093 goto out; 2094 } 2095 2096 /* 2097 * check for forced unmount 2098 */ 2099 if (ufsvfsp == NULL) 2100 return (EIO); 2101 2102 fs = ufsvfsp->vfs_fs; 2103 if (fs->fs_ronly != 0) 2104 return (EROFS); 2105 2106 again: 2107 errmsg1 = NULL; 2108 errmsg2 = NULL; 2109 dotrans = 0; 2110 dorwlock = 0; 2111 dodqlock = 0; 2112 2113 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK); 2114 if (error) 2115 goto out; 2116 2117 /* 2118 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 2119 * This follows the protocol for read()/write(). 2120 */ 2121 if (vp->v_type != VDIR) { 2122 /* 2123 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to 2124 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 2125 * possible, retries the operation. 2126 */ 2127 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file); 2128 if (indeadlock) { 2129 if (ulp) 2130 ufs_lockfs_end(ulp); 2131 goto again; 2132 } 2133 dorwlock = 1; 2134 } 2135 2136 /* 2137 * Truncate file. Must have write permission and not be a directory. 
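 * A caller asks for truncation by setting AT_SIZE in va_mask and the
 * desired length in va_size; a hypothetical sketch of such a request
 * (identifiers are placeholders, not from this file):
 *
 *	vattr.va_mask = AT_SIZE;
 *	vattr.va_size = 0;
 *	error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL);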
2138 */ 2139 if (mask & AT_SIZE) { 2140 rw_enter(&ip->i_contents, RW_WRITER); 2141 if (vp->v_type == VDIR) { 2142 error = EISDIR; 2143 goto update_inode; 2144 } 2145 if (error = ufs_iaccess(ip, IWRITE, cr)) 2146 goto update_inode; 2147 2148 rw_exit(&ip->i_contents); 2149 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr); 2150 if (error) { 2151 rw_enter(&ip->i_contents, RW_WRITER); 2152 goto update_inode; 2153 } 2154 } 2155 2156 if (ulp) { 2157 trans_size = (int)TOP_SETATTR_SIZE(ip); 2158 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size); 2159 ++dotrans; 2160 } 2161 2162 /* 2163 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 2164 * This follows the protocol established by 2165 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 2166 */ 2167 if (vp->v_type == VDIR) { 2168 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR, 2169 retry_dir); 2170 if (indeadlock) 2171 goto again; 2172 dorwlock = 1; 2173 } 2174 2175 /* 2176 * Grab quota lock if we are changing the file's owner. 2177 */ 2178 if (mask & AT_UID) { 2179 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2180 dodqlock = 1; 2181 } 2182 rw_enter(&ip->i_contents, RW_WRITER); 2183 2184 oldva.va_mode = ip->i_mode; 2185 oldva.va_uid = ip->i_uid; 2186 oldva.va_gid = ip->i_gid; 2187 2188 vap->va_mask &= ~AT_SIZE; 2189 /* 2190 * ufs_iaccess is "close enough"; that's because it doesn't 2191 * map the defines. 2192 */ 2193 error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2194 ufs_iaccess, ip); 2195 if (error) 2196 goto update_inode; 2197 2198 mask = vap->va_mask; 2199 2200 /* 2201 * Change file access modes. 2202 */ 2203 if (mask & AT_MODE) { 2204 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT); 2205 TRANS_INODE(ufsvfsp, ip); 2206 ip->i_flag |= ICHG; 2207 if (stickyhack) { 2208 mutex_enter(&vp->v_lock); 2209 if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 2210 vp->v_flag |= VSWAPLIKE; 2211 else 2212 vp->v_flag &= ~VSWAPLIKE; 2213 mutex_exit(&vp->v_lock); 2214 } 2215 } 2216 if (mask & (AT_UID|AT_GID)) { 2217 if (mask & AT_UID) { 2218 /* 2219 * Don't change ownership of the quota inode. 2220 */ 2221 if (ufsvfsp->vfs_qinod == ip) { 2222 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED); 2223 error = EINVAL; 2224 goto update_inode; 2225 } 2226 2227 /* 2228 * No real ownership change. 2229 */ 2230 if (ip->i_uid == vap->va_uid) { 2231 blocks = 0; 2232 owner_change = 0; 2233 } 2234 /* 2235 * Remove the blocks and the file, from the old user's 2236 * quota. 2237 */ 2238 else { 2239 blocks = ip->i_blocks; 2240 owner_change = 1; 2241 2242 (void) chkdq(ip, -blocks, /* force */ 1, cr, 2243 (char **)NULL, (size_t *)NULL); 2244 (void) chkiq(ufsvfsp, /* change */ -1, ip, 2245 (uid_t)ip->i_uid, 2246 /* force */ 1, cr, 2247 (char **)NULL, (size_t *)NULL); 2248 dqrele(ip->i_dquot); 2249 } 2250 2251 ip->i_uid = vap->va_uid; 2252 2253 /* 2254 * There is a real ownership change. 2255 */ 2256 if (owner_change) { 2257 /* 2258 * Add the blocks and the file to the new 2259 * user's quota. 2260 */ 2261 ip->i_dquot = getinoquota(ip); 2262 (void) chkdq(ip, blocks, /* force */ 1, cr, 2263 &errmsg1, &len1); 2264 (void) chkiq(ufsvfsp, /* change */ 1, 2265 (struct inode *)NULL, 2266 (uid_t)ip->i_uid, 2267 /* force */ 1, cr, 2268 &errmsg2, &len2); 2269 } 2270 } 2271 if (mask & AT_GID) { 2272 ip->i_gid = vap->va_gid; 2273 } 2274 TRANS_INODE(ufsvfsp, ip); 2275 ip->i_flag |= ICHG; 2276 } 2277 /* 2278 * Change file access or modified times. 
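 * The incoming timestamps carry nanoseconds but the on-disk inode only
 * stores microseconds, so tv_nsec is truncated below; e.g. (illustrative)
 * a va_atime.tv_nsec of 123456789 is recorded as i_atime.tv_usec 123456.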
2279 */ 2280 if (mask & (AT_ATIME|AT_MTIME)) { 2281 /* Check that the time value is within ufs range */ 2282 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2283 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2284 error = EOVERFLOW; 2285 goto update_inode; 2286 } 2287 2288 /* 2289 * if the "noaccess" mount option is set and only atime 2290 * update is requested, do nothing. No error is returned. 2291 */ 2292 if ((ufsvfsp->vfs_noatime) && 2293 ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME)) 2294 goto skip_atime; 2295 2296 if (mask & AT_ATIME) { 2297 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2298 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2299 ip->i_flag &= ~IACC; 2300 } 2301 if (mask & AT_MTIME) { 2302 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2303 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2304 gethrestime(&now); 2305 if (now.tv_sec > TIME32_MAX) { 2306 /* 2307 * In 2038, ctime sticks forever.. 2308 */ 2309 ip->i_ctime.tv_sec = TIME32_MAX; 2310 ip->i_ctime.tv_usec = 0; 2311 } else { 2312 ip->i_ctime.tv_sec = now.tv_sec; 2313 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2314 } 2315 ip->i_flag &= ~(IUPD|ICHG); 2316 ip->i_flag |= IMODTIME; 2317 } 2318 TRANS_INODE(ufsvfsp, ip); 2319 ip->i_flag |= IMOD; 2320 } 2321 2322 skip_atime: 2323 /* 2324 * The presence of a shadow inode may indicate an ACL, but does 2325 * not imply an ACL. Future FSD types should be handled here too 2326 * and check for the presence of the attribute-specific data 2327 * before referencing it. 2328 */ 2329 if (ip->i_shadow) { 2330 /* 2331 * XXX if ufs_iupdat is changed to sandbagged write fix 2332 * ufs_acl_setattr to push ip to keep acls consistent 2333 * 2334 * Suppress out of inodes messages if we will retry. 2335 */ 2336 if (retry) 2337 ip->i_flag |= IQUIET; 2338 error = ufs_acl_setattr(ip, vap, cr); 2339 ip->i_flag &= ~IQUIET; 2340 } 2341 2342 update_inode: 2343 /* 2344 * Setattr always increases the sequence number 2345 */ 2346 ip->i_seq++; 2347 2348 /* 2349 * if nfsd and not logging; push synchronously 2350 */ 2351 if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) { 2352 ufs_iupdat(ip, 1); 2353 } else { 2354 ITIMES_NOLOCK(ip); 2355 } 2356 2357 rw_exit(&ip->i_contents); 2358 if (dodqlock) { 2359 rw_exit(&ufsvfsp->vfs_dqrwlock); 2360 } 2361 if (dorwlock) 2362 rw_exit(&ip->i_rwlock); 2363 2364 if (ulp) { 2365 if (dotrans) { 2366 int terr = 0; 2367 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR, 2368 trans_size); 2369 if (error == 0) 2370 error = terr; 2371 } 2372 ufs_lockfs_end(ulp); 2373 } 2374 out: 2375 /* 2376 * If out of inodes or blocks, see if we can free something 2377 * up from the delete queue. 
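 * The retry is attempted only once: ufs_delete_drain_wait() waits for
 * queued deletes to be processed (which may return inodes and blocks to
 * the free lists), 'retry' is cleared, and the operation is restarted
 * from 'again'.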
2378 */ 2379 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 2380 ufs_delete_drain_wait(ufsvfsp, 1); 2381 retry = 0; 2382 if (errmsg1 != NULL) 2383 kmem_free(errmsg1, len1); 2384 if (errmsg2 != NULL) 2385 kmem_free(errmsg2, len2); 2386 goto again; 2387 } 2388 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_END, 2389 "ufs_setattr_end:vp %p error %d", vp, error); 2390 if (errmsg1 != NULL) { 2391 uprintf(errmsg1); 2392 kmem_free(errmsg1, len1); 2393 } 2394 if (errmsg2 != NULL) { 2395 uprintf(errmsg2); 2396 kmem_free(errmsg2, len2); 2397 } 2398 return (error); 2399 } 2400 2401 /*ARGSUSED*/ 2402 static int 2403 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr) 2404 { 2405 struct inode *ip = VTOI(vp); 2406 int error; 2407 2408 TRACE_3(TR_FAC_UFS, TR_UFS_ACCESS_START, 2409 "ufs_access_start:vp %p mode %x flags %x", vp, mode, flags); 2410 2411 if (ip->i_ufsvfs == NULL) 2412 return (EIO); 2413 2414 rw_enter(&ip->i_contents, RW_READER); 2415 2416 /* 2417 * The ufs_iaccess function wants to be called with 2418 * mode bits expressed as "ufs specific" bits. 2419 * I.e., VWRITE|VREAD|VEXEC do not make sense to 2420 * ufs_iaccess() but IWRITE|IREAD|IEXEC do. 2421 * But since they're the same we just pass the vnode mode 2422 * bit but just verify that assumption at compile time. 2423 */ 2424 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC 2425 #error "ufs_access needs to map Vmodes to Imodes" 2426 #endif 2427 error = ufs_iaccess(ip, mode, cr); 2428 2429 rw_exit(&ip->i_contents); 2430 2431 TRACE_2(TR_FAC_UFS, TR_UFS_ACCESS_END, 2432 "ufs_access_end:vp %p error %d", vp, error); 2433 return (error); 2434 } 2435 2436 /* ARGSUSED */ 2437 static int 2438 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr) 2439 { 2440 struct inode *ip = VTOI(vp); 2441 struct ufsvfs *ufsvfsp; 2442 struct ulockfs *ulp; 2443 int error; 2444 int fastsymlink; 2445 2446 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_START, 2447 "ufs_readlink_start:vp %p uiop %p", uiop, vp); 2448 2449 if (vp->v_type != VLNK) { 2450 error = EINVAL; 2451 goto nolockout; 2452 } 2453 2454 /* 2455 * If the symbolic link is empty there is nothing to read. 
2456 * Fast-track these empty symbolic links 2457 */ 2458 if (ip->i_size == 0) { 2459 error = 0; 2460 goto nolockout; 2461 } 2462 2463 ufsvfsp = ip->i_ufsvfs; 2464 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK); 2465 if (error) 2466 goto nolockout; 2467 /* 2468 * The ip->i_rwlock protects the data blocks used for FASTSYMLINK 2469 */ 2470 again: 2471 fastsymlink = 0; 2472 if (ip->i_flag & IFASTSYMLNK) { 2473 rw_enter(&ip->i_rwlock, RW_READER); 2474 rw_enter(&ip->i_contents, RW_READER); 2475 if (ip->i_flag & IFASTSYMLNK) { 2476 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 2477 (ip->i_fs->fs_ronly == 0) && 2478 (!ufsvfsp->vfs_noatime)) { 2479 mutex_enter(&ip->i_tlock); 2480 ip->i_flag |= IACC; 2481 mutex_exit(&ip->i_tlock); 2482 } 2483 error = uiomove((caddr_t)&ip->i_db[1], 2484 MIN(ip->i_size, uiop->uio_resid), 2485 UIO_READ, uiop); 2486 ITIMES(ip); 2487 ++fastsymlink; 2488 } 2489 rw_exit(&ip->i_contents); 2490 rw_exit(&ip->i_rwlock); 2491 } 2492 if (!fastsymlink) { 2493 ssize_t size; /* number of bytes read */ 2494 caddr_t basep; /* pointer to input data */ 2495 ino_t ino; 2496 long igen; 2497 struct uio tuio; /* temp uio struct */ 2498 struct uio *tuiop; 2499 iovec_t tiov; /* temp iovec struct */ 2500 char kbuf[FSL_SIZE]; /* buffer to hold fast symlink */ 2501 int tflag = 0; /* flag to indicate temp vars used */ 2502 2503 ino = ip->i_number; 2504 igen = ip->i_gen; 2505 size = uiop->uio_resid; 2506 basep = uiop->uio_iov->iov_base; 2507 tuiop = uiop; 2508 2509 rw_enter(&ip->i_rwlock, RW_WRITER); 2510 rw_enter(&ip->i_contents, RW_WRITER); 2511 if (ip->i_flag & IFASTSYMLNK) { 2512 rw_exit(&ip->i_contents); 2513 rw_exit(&ip->i_rwlock); 2514 goto again; 2515 } 2516 2517 /* can this be a fast symlink and is it a user buffer? */ 2518 if (ip->i_size <= FSL_SIZE && 2519 (uiop->uio_segflg == UIO_USERSPACE || 2520 uiop->uio_segflg == UIO_USERISPACE)) { 2521 2522 bzero(&tuio, sizeof (struct uio)); 2523 /* 2524 * setup a kernel buffer to read link into. this 2525 * is to fix a race condition where the user buffer 2526 * got corrupted before copying it into the inode. 
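 * The link is therefore read into the on-stack kbuf (at most FSL_SIZE
 * bytes) through a private UIO_SYSSPACE uio; that kernel copy is what
 * gets cached in i_db[] as a fast symlink, and only afterwards is it
 * uiomove()d out to the caller's buffer.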
2527 */ 2528 size = ip->i_size; 2529 tiov.iov_len = size; 2530 tiov.iov_base = kbuf; 2531 tuio.uio_iov = &tiov; 2532 tuio.uio_iovcnt = 1; 2533 tuio.uio_offset = uiop->uio_offset; 2534 tuio.uio_segflg = UIO_SYSSPACE; 2535 tuio.uio_fmode = uiop->uio_fmode; 2536 tuio.uio_extflg = uiop->uio_extflg; 2537 tuio.uio_limit = uiop->uio_limit; 2538 tuio.uio_resid = size; 2539 2540 basep = tuio.uio_iov->iov_base; 2541 tuiop = &tuio; 2542 tflag = 1; 2543 } 2544 2545 error = rdip(ip, tuiop, 0, cr); 2546 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) { 2547 rw_exit(&ip->i_contents); 2548 rw_exit(&ip->i_rwlock); 2549 goto out; 2550 } 2551 2552 if (tflag == 0) 2553 size -= uiop->uio_resid; 2554 2555 if ((tflag == 0 && ip->i_size <= FSL_SIZE && 2556 ip->i_size == size) || (tflag == 1 && 2557 tuio.uio_resid == 0)) { 2558 error = kcopy(basep, &ip->i_db[1], ip->i_size); 2559 if (error == 0) { 2560 ip->i_flag |= IFASTSYMLNK; 2561 /* 2562 * free page 2563 */ 2564 (void) VOP_PUTPAGE(ITOV(ip), 2565 (offset_t)0, PAGESIZE, 2566 (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC), 2567 cr); 2568 } else { 2569 int i; 2570 /* error, clear garbage left behind */ 2571 for (i = 1; i < NDADDR; i++) 2572 ip->i_db[i] = 0; 2573 for (i = 0; i < NIADDR; i++) 2574 ip->i_ib[i] = 0; 2575 } 2576 } 2577 if (tflag == 1) { 2578 /* now, copy it into the user buffer */ 2579 error = uiomove((caddr_t)kbuf, 2580 MIN(size, uiop->uio_resid), 2581 UIO_READ, uiop); 2582 } 2583 rw_exit(&ip->i_contents); 2584 rw_exit(&ip->i_rwlock); 2585 } 2586 out: 2587 if (ulp) { 2588 ufs_lockfs_end(ulp); 2589 } 2590 nolockout: 2591 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_END, 2592 "ufs_readlink_end:vp %p error %d", vp, error); 2593 2594 return (error); 2595 } 2596 2597 /* ARGSUSED */ 2598 static int 2599 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr) 2600 { 2601 struct inode *ip = VTOI(vp); 2602 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2603 struct ulockfs *ulp; 2604 int error; 2605 2606 TRACE_1(TR_FAC_UFS, TR_UFS_FSYNC_START, 2607 "ufs_fsync_start:vp %p", vp); 2608 2609 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK); 2610 if (error) 2611 return (error); 2612 2613 if (TRANS_ISTRANS(ufsvfsp)) { 2614 /* 2615 * First push out any data pages 2616 */ 2617 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2618 (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) { 2619 error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 2620 0, CRED()); 2621 if (error) 2622 goto out; 2623 } 2624 2625 /* 2626 * Delta any delayed inode times updates 2627 * and push inode to log. 2628 * All other inode deltas will have already been delta'd 2629 * and will be pushed during the commit. 2630 */ 2631 if (!(syncflag & FDSYNC) && 2632 ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) { 2633 if (ulp) { 2634 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC, 2635 TOP_SYNCIP_SIZE); 2636 } 2637 rw_enter(&ip->i_contents, RW_READER); 2638 mutex_enter(&ip->i_tlock); 2639 ip->i_flag &= ~IMODTIME; 2640 mutex_exit(&ip->i_tlock); 2641 ufs_iupdat(ip, I_SYNC); 2642 rw_exit(&ip->i_contents); 2643 if (ulp) { 2644 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC, 2645 TOP_SYNCIP_SIZE); 2646 } 2647 } 2648 2649 /* 2650 * Commit the Moby transaction 2651 * 2652 * Deltas have already been made so we just need to 2653 * commit them with a synchronous transaction. 2654 * TRANS_BEGIN_SYNC() will return an error 2655 * if there are no deltas to commit, for an 2656 * empty transaction. 
2657 */ 2658 if (ulp) { 2659 TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE, 2660 error); 2661 if (error) { 2662 error = 0; /* commit wasn't needed */ 2663 goto out; 2664 } 2665 TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC, 2666 TOP_COMMIT_SIZE); 2667 } 2668 } else { /* not logging */ 2669 if (!(IS_SWAPVP(vp))) 2670 if (syncflag & FNODSYNC) { 2671 /* Just update the inode only */ 2672 TRANS_IUPDAT(ip, 1); 2673 error = 0; 2674 } else if (syncflag & FDSYNC) 2675 /* Do data-synchronous writes */ 2676 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC); 2677 else 2678 /* Do synchronous writes */ 2679 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC); 2680 2681 rw_enter(&ip->i_contents, RW_WRITER); 2682 if (!error) 2683 error = ufs_sync_indir(ip); 2684 rw_exit(&ip->i_contents); 2685 } 2686 out: 2687 if (ulp) { 2688 ufs_lockfs_end(ulp); 2689 } 2690 TRACE_2(TR_FAC_UFS, TR_UFS_FSYNC_END, 2691 "ufs_fsync_end:vp %p error %d", vp, error); 2692 return (error); 2693 } 2694 2695 /*ARGSUSED*/ 2696 static void 2697 ufs_inactive(struct vnode *vp, struct cred *cr) 2698 { 2699 ufs_iinactive(VTOI(vp)); 2700 } 2701 2702 /* 2703 * Unix file system operations having to do with directory manipulation. 2704 */ 2705 int ufs_lookup_idle_count = 2; /* Number of inodes to idle each time */ 2706 /* ARGSUSED */ 2707 static int 2708 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 2709 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr) 2710 { 2711 struct inode *ip; 2712 struct inode *sip; 2713 struct inode *xip; 2714 struct ufsvfs *ufsvfsp; 2715 struct ulockfs *ulp; 2716 struct vnode *vp; 2717 int error; 2718 2719 TRACE_2(TR_FAC_UFS, TR_UFS_LOOKUP_START, 2720 "ufs_lookup_start:dvp %p name %s", dvp, nm); 2721 2722 2723 /* 2724 * Check flags for type of lookup (regular file or attribute file) 2725 */ 2726 2727 ip = VTOI(dvp); 2728 2729 if (flags & LOOKUP_XATTR) { 2730 2731 /* 2732 * We don't allow recursive attributes... 2733 * Maybe someday we will. 2734 */ 2735 if ((ip->i_cflags & IXATTR)) { 2736 return (EINVAL); 2737 } 2738 2739 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) { 2740 error = ufs_xattr_getattrdir(dvp, &sip, flags, cr); 2741 if (error) { 2742 *vpp = NULL; 2743 goto out; 2744 } 2745 2746 vp = ITOV(sip); 2747 dnlc_update(dvp, XATTR_DIR_NAME, vp); 2748 } 2749 2750 /* 2751 * Check accessibility of directory. 2752 */ 2753 if (vp == DNLC_NO_VNODE) { 2754 VN_RELE(vp); 2755 error = ENOENT; 2756 goto out; 2757 } 2758 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr)) != 0) { 2759 VN_RELE(vp); 2760 goto out; 2761 } 2762 2763 *vpp = vp; 2764 return (0); 2765 } 2766 2767 /* 2768 * Check for a null component, which we should treat as 2769 * looking at dvp from within it's parent, so we don't 2770 * need a call to ufs_iaccess(), as it has already been 2771 * done. 2772 */ 2773 if (nm[0] == 0) { 2774 VN_HOLD(dvp); 2775 error = 0; 2776 *vpp = dvp; 2777 goto out; 2778 } 2779 2780 /* 2781 * Check for "." ie itself. this is a quick check and 2782 * avoids adding "." into the dnlc (which have been seen 2783 * to occupy >10% of the cache). 2784 */ 2785 if ((nm[0] == '.') && (nm[1] == 0)) { 2786 /* 2787 * Don't return without checking accessibility 2788 * of the directory. We only need the lock if 2789 * we are going to return it. 2790 */ 2791 if ((error = ufs_iaccess(ip, IEXEC, cr)) == 0) { 2792 VN_HOLD(dvp); 2793 *vpp = dvp; 2794 } 2795 goto out; 2796 } 2797 2798 /* 2799 * Fast path: Check the directory name lookup cache. 
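 * dnlc_lookup() returns a held vnode on a hit, the special DNLC_NO_VNODE
 * token for a cached negative entry, or NULL on a miss; all three cases
 * are handled below.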
2800 */ 2801 if (vp = dnlc_lookup(dvp, nm)) { 2802 /* 2803 * Check accessibility of directory. 2804 */ 2805 if ((error = ufs_iaccess(ip, IEXEC, cr)) != 0) { 2806 VN_RELE(vp); 2807 goto out; 2808 } 2809 if (vp == DNLC_NO_VNODE) { 2810 VN_RELE(vp); 2811 error = ENOENT; 2812 goto out; 2813 } 2814 xip = VTOI(vp); 2815 ulp = NULL; 2816 goto fastpath; 2817 } 2818 2819 /* 2820 * Keep the idle queue from getting too long by 2821 * idling two inodes before attempting to allocate another. 2822 * This operation must be performed before entering 2823 * lockfs or a transaction. 2824 */ 2825 if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat) 2826 if ((curthread->t_flag & T_DONTBLOCK) == 0) { 2827 ins.in_lidles.value.ul += ufs_lookup_idle_count; 2828 ufs_idle_some(ufs_lookup_idle_count); 2829 } 2830 2831 retry_lookup: 2832 ufsvfsp = ip->i_ufsvfs; 2833 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK); 2834 if (error) 2835 goto out; 2836 2837 error = ufs_dirlook(ip, nm, &xip, cr, 1); 2838 2839 fastpath: 2840 if (error == 0) { 2841 ip = xip; 2842 *vpp = ITOV(ip); 2843 2844 /* 2845 * If vnode is a device return special vnode instead. 2846 */ 2847 if (IS_DEVVP(*vpp)) { 2848 struct vnode *newvp; 2849 2850 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, 2851 cr); 2852 VN_RELE(*vpp); 2853 if (newvp == NULL) 2854 error = ENOSYS; 2855 else 2856 *vpp = newvp; 2857 } 2858 } 2859 if (ulp) { 2860 ufs_lockfs_end(ulp); 2861 } 2862 2863 if (error == EAGAIN) 2864 goto retry_lookup; 2865 2866 out: 2867 TRACE_3(TR_FAC_UFS, TR_UFS_LOOKUP_END, 2868 "ufs_lookup_end:dvp %p name %s error %d", vpp, nm, error); 2869 return (error); 2870 } 2871 2872 static int 2873 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl, 2874 int mode, struct vnode **vpp, struct cred *cr, int flag) 2875 { 2876 struct inode *ip; 2877 struct inode *xip; 2878 struct inode *dip; 2879 struct vnode *xvp; 2880 struct ufsvfs *ufsvfsp; 2881 struct ulockfs *ulp; 2882 int error; 2883 int issync; 2884 int truncflag; 2885 int trans_size; 2886 int noentry; 2887 int defer_dip_seq_update = 0; /* need to defer update of dip->i_seq */ 2888 int retry = 1; 2889 int indeadlock; 2890 2891 TRACE_1(TR_FAC_UFS, TR_UFS_CREATE_START, 2892 "ufs_create_start:dvp %p", dvp); 2893 2894 again: 2895 ip = VTOI(dvp); 2896 ufsvfsp = ip->i_ufsvfs; 2897 truncflag = 0; 2898 2899 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK); 2900 if (error) 2901 goto out; 2902 2903 if (ulp) { 2904 trans_size = (int)TOP_CREATE_SIZE(ip); 2905 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size); 2906 } 2907 2908 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 2909 vap->va_mode &= ~VSVTX; 2910 2911 if (*name == '\0') { 2912 /* 2913 * Null component name refers to the directory itself. 2914 */ 2915 VN_HOLD(dvp); 2916 /* 2917 * Even though this is an error case, we need to grab the 2918 * quota lock since the error handling code below is common. 2919 */ 2920 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2921 rw_enter(&ip->i_contents, RW_WRITER); 2922 error = EEXIST; 2923 } else { 2924 xip = NULL; 2925 noentry = 0; 2926 /* 2927 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 2928 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 2929 * possible, retries the operation. 
2930 */ 2931 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE, 2932 retry_dir); 2933 if (indeadlock) 2934 goto again; 2935 2936 xvp = dnlc_lookup(dvp, name); 2937 if (xvp == DNLC_NO_VNODE) { 2938 noentry = 1; 2939 VN_RELE(xvp); 2940 xvp = NULL; 2941 } 2942 if (xvp) { 2943 rw_exit(&ip->i_rwlock); 2944 if (error = ufs_iaccess(ip, IEXEC, cr)) { 2945 VN_RELE(xvp); 2946 } else { 2947 error = EEXIST; 2948 xip = VTOI(xvp); 2949 } 2950 } else { 2951 /* 2952 * Suppress file system full message if we will retry 2953 */ 2954 error = ufs_direnter_cm(ip, name, DE_CREATE, 2955 vap, &xip, cr, 2956 (noentry | (retry ? IQUIET : 0))); 2957 if (error == EAGAIN) { 2958 if (ulp) { 2959 TRANS_END_CSYNC(ufsvfsp, error, issync, 2960 TOP_CREATE, trans_size); 2961 ufs_lockfs_end(ulp); 2962 } 2963 goto again; 2964 } 2965 rw_exit(&ip->i_rwlock); 2966 } 2967 ip = xip; 2968 if (ip != NULL) { 2969 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2970 rw_enter(&ip->i_contents, RW_WRITER); 2971 } 2972 } 2973 2974 /* 2975 * If the file already exists and this is a non-exclusive create, 2976 * check permissions and allow access for non-directories. 2977 * Read-only create of an existing directory is also allowed. 2978 * We fail an exclusive create of anything which already exists. 2979 */ 2980 if (error == EEXIST) { 2981 dip = VTOI(dvp); 2982 if (excl == NONEXCL) { 2983 if ((((ip->i_mode & IFMT) == IFDIR) || 2984 ((ip->i_mode & IFMT) == IFATTRDIR)) && 2985 (mode & IWRITE)) 2986 error = EISDIR; 2987 else if (mode) 2988 error = ufs_iaccess(ip, mode, cr); 2989 else 2990 error = 0; 2991 } 2992 if (error) { 2993 rw_exit(&ip->i_contents); 2994 rw_exit(&ufsvfsp->vfs_dqrwlock); 2995 VN_RELE(ITOV(ip)); 2996 goto unlock; 2997 } 2998 /* 2999 * If the error EEXIST was set, then i_seq can not 3000 * have been updated. The sequence number interface 3001 * is defined such that a non-error VOP_CREATE must 3002 * increase the dir va_seq it by at least one. If we 3003 * have cleared the error, increase i_seq. Note that 3004 * we are increasing the dir i_seq and in rare cases 3005 * ip may actually be from the dvp, so we already have 3006 * the locks and it will not be subject to truncation. 3007 * In case we have to update i_seq of the parent 3008 * directory dip, we have to defer it till we have 3009 * released our locks on ip due to lock ordering requirements. 3010 */ 3011 if (ip != dip) 3012 defer_dip_seq_update = 1; 3013 else 3014 ip->i_seq++; 3015 3016 if (((ip->i_mode & IFMT) == IFREG) && 3017 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 3018 /* 3019 * Truncate regular files, if requested by caller. 3020 * Grab i_rwlock to make sure no one else is 3021 * currently writing to the file (we promised 3022 * bmap we would do this). 3023 * Must get the locks in the correct order. 3024 */ 3025 if (ip->i_size == 0) { 3026 ip->i_flag |= ICHG | IUPD; 3027 ip->i_seq++; 3028 TRANS_INODE(ufsvfsp, ip); 3029 } else { 3030 /* 3031 * Large Files: Why this check here? 3032 * Though we do it in vn_create() we really 3033 * want to guarantee that we do not destroy 3034 * Large file data by atomically checking 3035 * the size while holding the contents 3036 * lock. 
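 * For instance, if a process that opened without O_LARGEFILE (so FOFFMAX
 * is clear in 'flag') non-exclusively creates over an existing regular
 * file that has already grown past MAXOFF32_T, the check below fails the
 * call with EOVERFLOW rather than truncating the large file.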
3037 */ 3038 if (flag && !(flag & FOFFMAX) && 3039 ((ip->i_mode & IFMT) == IFREG) && 3040 (ip->i_size > (offset_t)MAXOFF32_T)) { 3041 rw_exit(&ip->i_contents); 3042 rw_exit(&ufsvfsp->vfs_dqrwlock); 3043 error = EOVERFLOW; 3044 goto unlock; 3045 } 3046 if (TRANS_ISTRANS(ufsvfsp)) 3047 truncflag++; 3048 else { 3049 rw_exit(&ip->i_contents); 3050 rw_exit(&ufsvfsp->vfs_dqrwlock); 3051 ufs_tryirwlock_trans(&ip->i_rwlock, 3052 RW_WRITER, TOP_CREATE, 3053 retry_file); 3054 if (indeadlock) { 3055 VN_RELE(ITOV(ip)); 3056 goto again; 3057 } 3058 rw_enter(&ufsvfsp->vfs_dqrwlock, 3059 RW_READER); 3060 rw_enter(&ip->i_contents, RW_WRITER); 3061 (void) ufs_itrunc(ip, (u_offset_t)0, 0, 3062 cr); 3063 rw_exit(&ip->i_rwlock); 3064 } 3065 } 3066 } 3067 } 3068 3069 if (error) { 3070 if (ip != NULL) { 3071 rw_exit(&ufsvfsp->vfs_dqrwlock); 3072 rw_exit(&ip->i_contents); 3073 } 3074 goto unlock; 3075 } 3076 3077 *vpp = ITOV(ip); 3078 ITIMES(ip); 3079 rw_exit(&ip->i_contents); 3080 rw_exit(&ufsvfsp->vfs_dqrwlock); 3081 3082 /* 3083 * If vnode is a device return special vnode instead. 3084 */ 3085 if (!error && IS_DEVVP(*vpp)) { 3086 struct vnode *newvp; 3087 3088 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 3089 VN_RELE(*vpp); 3090 if (newvp == NULL) { 3091 error = ENOSYS; 3092 goto unlock; 3093 } 3094 truncflag = 0; 3095 *vpp = newvp; 3096 } 3097 unlock: 3098 3099 /* 3100 * Do the deferred update of the parent directory's sequence 3101 * number now. 3102 */ 3103 if (defer_dip_seq_update == 1) { 3104 rw_enter(&dip->i_contents, RW_READER); 3105 mutex_enter(&dip->i_tlock); 3106 dip->i_seq++; 3107 mutex_exit(&dip->i_tlock); 3108 rw_exit(&dip->i_contents); 3109 } 3110 3111 if (ulp) { 3112 int terr = 0; 3113 3114 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE, 3115 trans_size); 3116 3117 /* 3118 * If we haven't had a more interesting failure 3119 * already, then anything that might've happened 3120 * here should be reported. 3121 */ 3122 if (error == 0) 3123 error = terr; 3124 } 3125 3126 if (!error && truncflag) { 3127 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc); 3128 if (indeadlock) { 3129 if (ulp) 3130 ufs_lockfs_end(ulp); 3131 VN_RELE(ITOV(ip)); 3132 goto again; 3133 } 3134 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr); 3135 rw_exit(&ip->i_rwlock); 3136 } 3137 3138 if (ulp) 3139 ufs_lockfs_end(ulp); 3140 3141 /* 3142 * If no inodes available, try to free one up out of the 3143 * pending delete queue. 
3144 */ 3145 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3146 ufs_delete_drain_wait(ufsvfsp, 1); 3147 retry = 0; 3148 goto again; 3149 } 3150 3151 out: 3152 TRACE_3(TR_FAC_UFS, TR_UFS_CREATE_END, 3153 "ufs_create_end:dvp %p name %s error %d", vpp, name, error); 3154 return (error); 3155 } 3156 3157 extern int ufs_idle_max; 3158 /*ARGSUSED*/ 3159 static int 3160 ufs_remove(struct vnode *vp, char *nm, struct cred *cr) 3161 { 3162 struct inode *ip = VTOI(vp); 3163 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3164 struct ulockfs *ulp; 3165 vnode_t *rmvp = NULL; /* Vnode corresponding to name being removed */ 3166 int indeadlock; 3167 int error; 3168 int issync; 3169 int trans_size; 3170 3171 TRACE_1(TR_FAC_UFS, TR_UFS_REMOVE_START, 3172 "ufs_remove_start:vp %p", vp); 3173 3174 /* 3175 * don't let the delete queue get too long 3176 */ 3177 if (ufsvfsp == NULL) { 3178 error = EIO; 3179 goto out; 3180 } 3181 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3182 ufs_delete_drain(vp->v_vfsp, 1, 1); 3183 3184 retry_remove: 3185 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK); 3186 if (error) 3187 goto out; 3188 3189 if (ulp) 3190 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 3191 trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp))); 3192 3193 /* 3194 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3195 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3196 * possible, retries the operation. 3197 */ 3198 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry); 3199 if (indeadlock) 3200 goto retry_remove; 3201 error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0, 3202 DR_REMOVE, cr, &rmvp); 3203 rw_exit(&ip->i_rwlock); 3204 3205 if (ulp) { 3206 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size); 3207 ufs_lockfs_end(ulp); 3208 } 3209 3210 /* 3211 * This must be called after the remove transaction is closed. 3212 */ 3213 if (rmvp != NULL) { 3214 /* Only send the event if there were no errors */ 3215 if (error == 0) 3216 vnevent_remove(rmvp); 3217 VN_RELE(rmvp); 3218 } 3219 out: 3220 TRACE_3(TR_FAC_UFS, TR_UFS_REMOVE_END, 3221 "ufs_remove_end:vp %p name %s error %d", vp, nm, error); 3222 return (error); 3223 } 3224 3225 /* 3226 * Link a file or a directory. Only privileged processes are allowed to 3227 * make links to directories. 3228 */ 3229 static int 3230 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr) 3231 { 3232 struct inode *sip; 3233 struct inode *tdp = VTOI(tdvp); 3234 struct ufsvfs *ufsvfsp = tdp->i_ufsvfs; 3235 struct ulockfs *ulp; 3236 struct vnode *realvp; 3237 int error; 3238 int issync; 3239 int trans_size; 3240 int isdev; 3241 int indeadlock; 3242 3243 TRACE_1(TR_FAC_UFS, TR_UFS_LINK_START, 3244 "ufs_link_start:tdvp %p", tdvp); 3245 3246 retry_link: 3247 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK); 3248 if (error) 3249 goto out; 3250 3251 if (ulp) 3252 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK, 3253 trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp))); 3254 3255 if (VOP_REALVP(svp, &realvp) == 0) 3256 svp = realvp; 3257 3258 /* 3259 * Make sure link for extended attributes is valid 3260 * We only support hard linking of attr in ATTRDIR to ATTRDIR 3261 * 3262 * Make certain we don't attempt to look at a device node as 3263 * a ufs inode. 
3264 */ 3265 3266 isdev = IS_DEVVP(svp); 3267 if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) && 3268 ((tdp->i_mode & IFMT) == IFATTRDIR)) || 3269 ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) && 3270 ((tdp->i_mode & IFMT) == IFDIR))) { 3271 error = EINVAL; 3272 goto unlock; 3273 } 3274 3275 sip = VTOI(svp); 3276 if ((svp->v_type == VDIR && 3277 secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) || 3278 (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) { 3279 error = EPERM; 3280 goto unlock; 3281 } 3282 3283 /* 3284 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3285 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3286 * possible, retries the operation. 3287 */ 3288 ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry); 3289 if (indeadlock) 3290 goto retry_link; 3291 error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0, 3292 sip, cr, NULL); 3293 rw_exit(&tdp->i_rwlock); 3294 3295 unlock: 3296 if (ulp) { 3297 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size); 3298 ufs_lockfs_end(ulp); 3299 } 3300 out: 3301 TRACE_2(TR_FAC_UFS, TR_UFS_LINK_END, 3302 "ufs_link_end:tdvp %p error %d", tdvp, error); 3303 return (error); 3304 } 3305 3306 uint64_t ufs_rename_retry_cnt; 3307 uint64_t ufs_rename_upgrade_retry_cnt; 3308 uint64_t ufs_rename_dircheck_retry_cnt; 3309 clock_t ufs_rename_backoff_delay = 1; 3310 3311 /* 3312 * Rename a file or directory. 3313 * We are given the vnode and entry string of the source and the 3314 * vnode and entry string of the place we want to move the source 3315 * to (the target). The essential operation is: 3316 * unlink(target); 3317 * link(source, target); 3318 * unlink(source); 3319 * but "atomically". Can't do full commit without saving state in 3320 * the inode on disk, which isn't feasible at this time. Best we 3321 * can do is always guarantee that the TARGET exists. 3322 */ 3323 3324 /*ARGSUSED*/ 3325 static int 3326 ufs_rename( 3327 struct vnode *sdvp, /* old (source) parent vnode */ 3328 char *snm, /* old (source) entry name */ 3329 struct vnode *tdvp, /* new (target) parent vnode */ 3330 char *tnm, /* new (target) entry name */ 3331 struct cred *cr) 3332 { 3333 struct inode *sip = NULL; /* source inode */ 3334 struct inode *ip = NULL; /* check inode */ 3335 struct inode *sdp; /* old (source) parent inode */ 3336 struct inode *tdp; /* new (target) parent inode */ 3337 struct vnode *tvp = NULL; /* target vnode, if it exists */ 3338 struct vnode *realvp; 3339 struct ufsvfs *ufsvfsp; 3340 struct ulockfs *ulp; 3341 struct ufs_slot slot; 3342 timestruc_t now; 3343 int error; 3344 int issync; 3345 int trans_size; 3346 krwlock_t *first_lock; 3347 krwlock_t *second_lock; 3348 krwlock_t *reverse_lock; 3349 3350 TRACE_1(TR_FAC_UFS, TR_UFS_RENAME_START, 3351 "ufs_rename_start:sdvp %p", sdvp); 3352 3353 3354 sdp = VTOI(sdvp); 3355 slot.fbp = NULL; 3356 ufsvfsp = sdp->i_ufsvfs; 3357 retry_rename: 3358 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); 3359 if (error) 3360 goto out; 3361 3362 if (ulp) 3363 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, 3364 trans_size = (int)TOP_RENAME_SIZE(sdp)); 3365 3366 if (VOP_REALVP(tdvp, &realvp) == 0) 3367 tdvp = realvp; 3368 3369 tdp = VTOI(tdvp); 3370 3371 3372 /* 3373 * We only allow renaming of attributes from ATTRDIR to ATTRDIR. 3374 */ 3375 if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) { 3376 error = EINVAL; 3377 goto unlock; 3378 } 3379 3380 /* 3381 * Look up inode of file we're supposed to rename. 
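 * The timestamp taken just before this lookup is compared against the
 * source inode's i_ctime once all of the locks are finally held; if
 * i_ctime has advanced past 'now', another thread may have removed or
 * replaced the source entry in the meantime, so the source directory is
 * re-checked before the rename is allowed to proceed.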
3382 */ 3383 gethrestime(&now); 3384 if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) { 3385 if (error == EAGAIN) { 3386 if (ulp) { 3387 TRANS_END_CSYNC(ufsvfsp, error, issync, 3388 TOP_RENAME, trans_size); 3389 ufs_lockfs_end(ulp); 3390 } 3391 goto retry_rename; 3392 } 3393 3394 goto unlock; 3395 } 3396 3397 /* 3398 * Lock both the source and target directories (they may be 3399 * the same) to provide the atomicity semantics that was 3400 * previously provided by the per file system vfs_rename_lock 3401 * 3402 * with vfs_rename_lock removed to allow simultaneous renames 3403 * within a file system, ufs_dircheckpath can deadlock while 3404 * traversing back to ensure that source is not a parent directory 3405 * of target parent directory. This is because we get into 3406 * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER. 3407 * If the tdp and sdp of the simultaneous renames happen to be 3408 * in the path of each other, it can lead to a deadlock. This 3409 * can be avoided by getting the locks as RW_READER here and then 3410 * upgrading to RW_WRITER after completing the ufs_dircheckpath. 3411 * 3412 * We hold the target directory's i_rwlock after calling 3413 * ufs_lockfs_begin but in many other operations (like ufs_readdir) 3414 * VOP_RWLOCK is explicitly called by the filesystem independent code 3415 * before calling the file system operation. In these cases the order 3416 * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin 3417 * is called). This is fine as long as ufs_lockfs_begin acts as a VOP 3418 * counter but with ufs_quiesce setting the SLOCK bit this becomes a 3419 * synchronizing object which might lead to a deadlock. So we use 3420 * rw_tryenter instead of rw_enter. If we fail to get this lock and 3421 * find that SLOCK bit is set, we call ufs_lockfs_end and restart the 3422 * operation. 3423 */ 3424 retry: 3425 first_lock = &tdp->i_rwlock; 3426 second_lock = &sdp->i_rwlock; 3427 retry_firstlock: 3428 if (!rw_tryenter(first_lock, RW_READER)) { 3429 /* 3430 * We didn't get the lock. Check if the SLOCK is set in the 3431 * ufsvfs. If yes, we might be in a deadlock. Safer to give up 3432 * and wait for SLOCK to be cleared. 3433 */ 3434 3435 if (ulp && ULOCKFS_IS_SLOCK(ulp)) { 3436 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, 3437 trans_size); 3438 ufs_lockfs_end(ulp); 3439 goto retry_rename; 3440 3441 } else { 3442 /* 3443 * SLOCK isn't set so this is a genuine synchronization 3444 * case. Let's try again after giving them a breather. 3445 */ 3446 delay(RETRY_LOCK_DELAY); 3447 goto retry_firstlock; 3448 } 3449 } 3450 /* 3451 * Need to check if the tdp and sdp are same !!! 3452 */ 3453 if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) { 3454 /* 3455 * We didn't get the lock. Check if the SLOCK is set in the 3456 * ufsvfs. If yes, we might be in a deadlock. Safer to give up 3457 * and wait for SLOCK to be cleared. 3458 */ 3459 3460 rw_exit(first_lock); 3461 if (ulp && ULOCKFS_IS_SLOCK(ulp)) { 3462 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, 3463 trans_size); 3464 ufs_lockfs_end(ulp); 3465 goto retry_rename; 3466 3467 } else { 3468 /* 3469 * So we couldn't get the second level peer lock *and* 3470 * the SLOCK bit isn't set. Too bad we can be 3471 * contentding with someone wanting these locks otherway 3472 * round. Reverse the locks in case there is a heavy 3473 * contention for the second level lock. 
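 * For example (illustrative): rename R1 holds tdp->i_rwlock and needs
 * sdp->i_rwlock, while rename R2, working on the same two directories
 * with the roles reversed, holds sdp->i_rwlock and needs tdp->i_rwlock.
 * Retrying in the original order could livelock; swapping first_lock and
 * second_lock makes this thread contend first for the lock it just
 * failed to get, giving one of the two renames a chance to win both.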
3474 */ 3475 reverse_lock = first_lock; 3476 first_lock = second_lock; 3477 second_lock = reverse_lock; 3478 ufs_rename_retry_cnt++; 3479 goto retry_firstlock; 3480 } 3481 } 3482 3483 if (sip == tdp) { 3484 error = EINVAL; 3485 goto errout; 3486 } 3487 /* 3488 * Make sure we can delete the source entry. This requires 3489 * write permission on the containing directory. 3490 * Check for sticky directories. 3491 */ 3492 rw_enter(&sdp->i_contents, RW_READER); 3493 rw_enter(&sip->i_contents, RW_READER); 3494 if ((error = ufs_iaccess(sdp, IWRITE, cr)) != 0 || 3495 (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) { 3496 rw_exit(&sip->i_contents); 3497 rw_exit(&sdp->i_contents); 3498 goto errout; 3499 } 3500 3501 /* 3502 * If this is a rename of a directory and the parent is 3503 * different (".." must be changed), then the source 3504 * directory must not be in the directory hierarchy 3505 * above the target, as this would orphan everything 3506 * below the source directory. Also the user must have 3507 * write permission in the source so as to be able to 3508 * change "..". 3509 */ 3510 if ((((sip->i_mode & IFMT) == IFDIR) || 3511 ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) { 3512 ino_t inum; 3513 3514 if ((error = ufs_iaccess(sip, IWRITE, cr))) { 3515 rw_exit(&sip->i_contents); 3516 rw_exit(&sdp->i_contents); 3517 goto errout; 3518 } 3519 inum = sip->i_number; 3520 rw_exit(&sip->i_contents); 3521 rw_exit(&sdp->i_contents); 3522 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) { 3523 /* 3524 * If we got EAGAIN ufs_dircheckpath detected a 3525 * potential deadlock and backed out. We need 3526 * to retry the operation since sdp and tdp have 3527 * to be released to avoid the deadlock. 3528 */ 3529 if (error == EAGAIN) { 3530 rw_exit(&tdp->i_rwlock); 3531 if (tdp != sdp) 3532 rw_exit(&sdp->i_rwlock); 3533 delay(ufs_rename_backoff_delay); 3534 ufs_rename_dircheck_retry_cnt++; 3535 goto retry; 3536 } 3537 goto errout; 3538 } 3539 } else { 3540 rw_exit(&sip->i_contents); 3541 rw_exit(&sdp->i_contents); 3542 } 3543 3544 3545 /* 3546 * Check for renaming '.' or '..' or alias of '.' 3547 */ 3548 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) { 3549 error = EINVAL; 3550 goto errout; 3551 } 3552 3553 /* 3554 * Simultaneous renames can deadlock in ufs_dircheckpath since it 3555 * tries to traverse back the file tree with both tdp and sdp held 3556 * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks 3557 * as RW_READERS till ufs_dircheckpath is done. 3558 * Now that ufs_dircheckpath is done with, we can upgrade the locks 3559 * to RW_WRITER. 3560 */ 3561 if (!rw_tryupgrade(&tdp->i_rwlock)) { 3562 /* 3563 * The upgrade failed. We got to give away the lock 3564 * as to avoid deadlocking with someone else who is 3565 * waiting for writer lock. With the lock gone, we 3566 * cannot be sure the checks done above will hold 3567 * good when we eventually get them back as writer. 3568 * So if we can't upgrade we drop the locks and retry 3569 * everything again. 3570 */ 3571 rw_exit(&tdp->i_rwlock); 3572 if (tdp != sdp) 3573 rw_exit(&sdp->i_rwlock); 3574 delay(ufs_rename_backoff_delay); 3575 ufs_rename_upgrade_retry_cnt++; 3576 goto retry; 3577 } 3578 if (tdp != sdp) { 3579 if (!rw_tryupgrade(&sdp->i_rwlock)) { 3580 /* 3581 * The upgrade failed. We got to give away the lock 3582 * as to avoid deadlocking with someone else who is 3583 * waiting for writer lock. 
With the lock gone, we 3584 * cannot be sure the checks done above will hold 3585 * good when we eventually get them back as writer. 3586 * So if we can't upgrade we drop the locks and retry 3587 * everything again. 3588 */ 3589 rw_exit(&tdp->i_rwlock); 3590 rw_exit(&sdp->i_rwlock); 3591 delay(ufs_rename_backoff_delay); 3592 ufs_rename_upgrade_retry_cnt++; 3593 goto retry; 3594 } 3595 } 3596 3597 /* 3598 * Now that all the locks are held check to make sure another thread 3599 * didn't slip in and take out the sip. 3600 */ 3601 slot.status = NONE; 3602 if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec || 3603 sip->i_ctime.tv_sec > now.tv_sec) { 3604 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 3605 rw_enter(&sdp->i_contents, RW_WRITER); 3606 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot, 3607 &ip, cr, 0); 3608 rw_exit(&sdp->i_contents); 3609 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock); 3610 if (error) { 3611 goto errout; 3612 } 3613 if (ip == NULL) { 3614 error = ENOENT; 3615 goto errout; 3616 } else { 3617 /* 3618 * If the inode was found need to drop the v_count 3619 * so as not to keep the filesystem from being 3620 * unmounted at a later time. 3621 */ 3622 VN_RELE(ITOV(ip)); 3623 } 3624 3625 /* 3626 * Release the slot.fbp that has the page mapped and 3627 * locked SE_SHARED, and could be used in in 3628 * ufs_direnter_lr() which needs to get the SE_EXCL lock 3629 * on said page. 3630 */ 3631 if (slot.fbp) { 3632 fbrelse(slot.fbp, S_OTHER); 3633 slot.fbp = NULL; 3634 } 3635 } 3636 3637 /* 3638 * Link source to the target. If a target exists, return its 3639 * vnode pointer in tvp. We'll release it after sending the 3640 * vnevent. 3641 */ 3642 if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) { 3643 /* 3644 * ESAME isn't really an error; it indicates that the 3645 * operation should not be done because the source and target 3646 * are the same file, but that no error should be reported. 3647 */ 3648 if (error == ESAME) 3649 error = 0; 3650 goto errout; 3651 } 3652 3653 /* 3654 * Unlink the source. 3655 * Remove the source entry. ufs_dirremove() checks that the entry 3656 * still reflects sip, and returns an error if it doesn't. 3657 * If the entry has changed just forget about it. Release 3658 * the source inode. 3659 */ 3660 if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0, 3661 DR_RENAME, cr, NULL)) == ENOENT) 3662 error = 0; 3663 3664 errout: 3665 if (slot.fbp) 3666 fbrelse(slot.fbp, S_OTHER); 3667 3668 rw_exit(&tdp->i_rwlock); 3669 if (sdp != tdp) { 3670 rw_exit(&sdp->i_rwlock); 3671 } 3672 3673 unlock: 3674 if (ulp) { 3675 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); 3676 ufs_lockfs_end(ulp); 3677 } 3678 3679 /* 3680 * If no errors, send the appropriate events on the source 3681 * and destination (a.k.a, target) vnodes, if they exist. 3682 * This has to be done after the rename transaction has closed. 3683 */ 3684 if (error == 0) { 3685 if (tvp != NULL) 3686 vnevent_rename_dest(tvp); 3687 /* 3688 * Note that if ufs_direnter_lr() returned ESAME then 3689 * this event will still be sent. This isn't expected 3690 * to be a problem for anticipated usage by consumers. 
3691 */ 3692 if (sip != NULL) 3693 vnevent_rename_src(ITOV(sip)); 3694 } 3695 3696 if (tvp != NULL) 3697 VN_RELE(tvp); 3698 3699 if (sip != NULL) 3700 VN_RELE(ITOV(sip)); 3701 3702 out: 3703 TRACE_5(TR_FAC_UFS, TR_UFS_RENAME_END, 3704 "ufs_rename_end:sdvp %p snm %s tdvp %p tnm %s error %d", 3705 sdvp, snm, tdvp, tnm, error); 3706 return (error); 3707 } 3708 3709 /*ARGSUSED*/ 3710 static int 3711 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap, 3712 struct vnode **vpp, struct cred *cr) 3713 { 3714 struct inode *ip; 3715 struct inode *xip; 3716 struct ufsvfs *ufsvfsp; 3717 struct ulockfs *ulp; 3718 int error; 3719 int issync; 3720 int trans_size; 3721 int indeadlock; 3722 int retry = 1; 3723 3724 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 3725 3726 TRACE_1(TR_FAC_UFS, TR_UFS_MKDIR_START, 3727 "ufs_mkdir_start:dvp %p", dvp); 3728 3729 /* 3730 * Can't make directory in attr hidden dir 3731 */ 3732 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3733 return (EINVAL); 3734 3735 again: 3736 ip = VTOI(dvp); 3737 ufsvfsp = ip->i_ufsvfs; 3738 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3739 if (error) 3740 goto out; 3741 if (ulp) 3742 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, 3743 trans_size = (int)TOP_MKDIR_SIZE(ip)); 3744 3745 /* 3746 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3747 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3748 * possible, retries the operation. 3749 */ 3750 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry); 3751 if (indeadlock) 3752 goto again; 3753 3754 error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr, 3755 (retry ? IQUIET : 0)); 3756 if (error == EAGAIN) { 3757 if (ulp) { 3758 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR, 3759 trans_size); 3760 ufs_lockfs_end(ulp); 3761 } 3762 goto again; 3763 } 3764 3765 rw_exit(&ip->i_rwlock); 3766 if (error == 0) { 3767 ip = xip; 3768 *vpp = ITOV(ip); 3769 } else if (error == EEXIST) 3770 VN_RELE(ITOV(xip)); 3771 3772 if (ulp) { 3773 int terr = 0; 3774 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size); 3775 ufs_lockfs_end(ulp); 3776 if (error == 0) 3777 error = terr; 3778 } 3779 out: 3780 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3781 ufs_delete_drain_wait(ufsvfsp, 1); 3782 retry = 0; 3783 goto again; 3784 } 3785 3786 TRACE_2(TR_FAC_UFS, TR_UFS_MKDIR_END, 3787 "ufs_mkdir_end:dvp %p error %d", dvp, error); 3788 return (error); 3789 } 3790 3791 /*ARGSUSED*/ 3792 static int 3793 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr) 3794 { 3795 struct inode *ip = VTOI(vp); 3796 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3797 struct ulockfs *ulp; 3798 vnode_t *rmvp = NULL; /* Vnode of removed directory */ 3799 int error; 3800 int issync; 3801 int trans_size; 3802 int indeadlock; 3803 3804 TRACE_1(TR_FAC_UFS, TR_UFS_RMDIR_START, 3805 "ufs_rmdir_start:vp %p", vp); 3806 3807 /* 3808 * don't let the delete queue get too long 3809 */ 3810 if (ufsvfsp == NULL) { 3811 error = EIO; 3812 goto out; 3813 } 3814 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3815 ufs_delete_drain(vp->v_vfsp, 1, 1); 3816 3817 retry_rmdir: 3818 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK); 3819 if (error) 3820 goto out; 3821 3822 if (ulp) 3823 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR, 3824 trans_size = TOP_RMDIR_SIZE); 3825 3826 /* 3827 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3828 * to avoid i_rwlock, ufs_lockfs_begin deadlock. 
If deadlock 3829 * possible, retries the operation. 3830 */ 3831 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry); 3832 if (indeadlock) 3833 goto retry_rmdir; 3834 error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr, 3835 &rmvp); 3836 rw_exit(&ip->i_rwlock); 3837 3838 if (ulp) { 3839 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR, 3840 trans_size); 3841 ufs_lockfs_end(ulp); 3842 } 3843 3844 /* 3845 * This must be done AFTER the rmdir transaction has closed. 3846 */ 3847 if (rmvp != NULL) { 3848 /* Only send the event if there were no errors */ 3849 if (error == 0) 3850 vnevent_rmdir(rmvp); 3851 VN_RELE(rmvp); 3852 } 3853 out: 3854 TRACE_2(TR_FAC_UFS, TR_UFS_RMDIR_END, 3855 "ufs_rmdir_end:vp %p error %d", vp, error); 3856 3857 return (error); 3858 } 3859 3860 /* ARGSUSED */ 3861 static int 3862 ufs_readdir( 3863 struct vnode *vp, 3864 struct uio *uiop, 3865 struct cred *cr, 3866 int *eofp) 3867 { 3868 struct iovec *iovp; 3869 struct inode *ip; 3870 struct direct *idp; 3871 struct dirent64 *odp; 3872 struct fbuf *fbp; 3873 struct ufsvfs *ufsvfsp; 3874 struct ulockfs *ulp; 3875 caddr_t outbuf; 3876 size_t bufsize; 3877 uint_t offset; 3878 uint_t bytes_wanted, total_bytes_wanted; 3879 int incount = 0; 3880 int outcount = 0; 3881 int error; 3882 3883 ip = VTOI(vp); 3884 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 3885 3886 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_START, 3887 "ufs_readdir_start:vp %p uiop %p", vp, uiop); 3888 3889 if (uiop->uio_loffset >= MAXOFF32_T) { 3890 if (eofp) 3891 *eofp = 1; 3892 return (0); 3893 } 3894 3895 /* 3896 * Check if we have been called with a valid iov_len 3897 * and bail out if not, otherwise we may potentially loop 3898 * forever further down. 3899 */ 3900 if (uiop->uio_iov->iov_len <= 0) { 3901 error = EINVAL; 3902 goto out; 3903 } 3904 3905 /* 3906 * Large Files: When we come here we are guaranteed that 3907 * uio_offset can be used safely. The high word is zero. 3908 */ 3909 3910 ufsvfsp = ip->i_ufsvfs; 3911 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK); 3912 if (error) 3913 goto out; 3914 3915 iovp = uiop->uio_iov; 3916 total_bytes_wanted = iovp->iov_len; 3917 3918 /* Large Files: directory files should not be "large" */ 3919 3920 ASSERT(ip->i_size <= MAXOFF32_T); 3921 3922 /* Force offset to be valid (to guard against bogus lseek() values) */ 3923 offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1); 3924 3925 /* Quit if at end of file or link count of zero (posix) */ 3926 if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) { 3927 if (eofp) 3928 *eofp = 1; 3929 error = 0; 3930 goto unlock; 3931 } 3932 3933 /* 3934 * Get space to change directory entries into fs independent format. 3935 * Do fast alloc for the most commonly used-request size (filesystem 3936 * block size). 3937 */ 3938 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) { 3939 bufsize = total_bytes_wanted; 3940 outbuf = kmem_alloc(bufsize, KM_SLEEP); 3941 odp = (struct dirent64 *)outbuf; 3942 } else { 3943 bufsize = total_bytes_wanted; 3944 odp = (struct dirent64 *)iovp->iov_base; 3945 } 3946 3947 nextblk: 3948 bytes_wanted = total_bytes_wanted; 3949 3950 /* Truncate request to file size */ 3951 if (offset + bytes_wanted > (int)ip->i_size) 3952 bytes_wanted = (int)(ip->i_size - offset); 3953 3954 /* Comply with MAXBSIZE boundary restrictions of fbread() */ 3955 if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE) 3956 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET); 3957 3958 /* 3959 * Read in the next chunk. 
3960 * We are still holding the i_rwlock. 3961 */ 3962 error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp); 3963 3964 if (error) 3965 goto update_inode; 3966 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) && 3967 (!ufsvfsp->vfs_noatime)) { 3968 ip->i_flag |= IACC; 3969 } 3970 incount = 0; 3971 idp = (struct direct *)fbp->fb_addr; 3972 if (idp->d_ino == 0 && idp->d_reclen == 0 && 3973 idp->d_namlen == 0) { 3974 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, " 3975 "fs = %s\n", 3976 (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt); 3977 fbrelse(fbp, S_OTHER); 3978 error = ENXIO; 3979 goto update_inode; 3980 } 3981 /* Transform to file-system independent format */ 3982 while (incount < bytes_wanted) { 3983 /* 3984 * If the current directory entry is mangled, then skip 3985 * to the next block. It would be nice to set the FSBAD 3986 * flag in the super-block so that a fsck is forced on 3987 * next reboot, but locking is a problem. 3988 */ 3989 if (idp->d_reclen & 0x3) { 3990 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3991 break; 3992 } 3993 3994 /* Skip to requested offset and skip empty entries */ 3995 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) { 3996 ushort_t this_reclen = 3997 DIRENT64_RECLEN(idp->d_namlen); 3998 /* Buffer too small for any entries */ 3999 if (!outcount && this_reclen > bufsize) { 4000 fbrelse(fbp, S_OTHER); 4001 error = EINVAL; 4002 goto update_inode; 4003 } 4004 /* If would overrun the buffer, quit */ 4005 if (outcount + this_reclen > bufsize) { 4006 break; 4007 } 4008 /* Take this entry */ 4009 odp->d_ino = (ino64_t)idp->d_ino; 4010 odp->d_reclen = (ushort_t)this_reclen; 4011 odp->d_off = (offset_t)(offset + idp->d_reclen); 4012 4013 /* use strncpy(9f) to zero out uninitialized bytes */ 4014 4015 ASSERT(strlen(idp->d_name) + 1 <= 4016 DIRENT64_NAMELEN(this_reclen)); 4017 (void) strncpy(odp->d_name, idp->d_name, 4018 DIRENT64_NAMELEN(this_reclen)); 4019 outcount += odp->d_reclen; 4020 odp = (struct dirent64 *)((intptr_t)odp + 4021 odp->d_reclen); 4022 ASSERT(outcount <= bufsize); 4023 } 4024 if (idp->d_reclen) { 4025 incount += idp->d_reclen; 4026 offset += idp->d_reclen; 4027 idp = (struct direct *)((intptr_t)idp + idp->d_reclen); 4028 } else { 4029 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 4030 break; 4031 } 4032 } 4033 /* Release the chunk */ 4034 fbrelse(fbp, S_OTHER); 4035 4036 /* Read whole block, but got no entries, read another if not eof */ 4037 4038 /* 4039 * Large Files: casting i_size to int here is not a problem 4040 * because directory sizes are always less than MAXOFF32_T. 4041 * See assertion above. 
4042 */ 4043 4044 if (offset < (int)ip->i_size && !outcount) 4045 goto nextblk; 4046 4047 /* Copy out the entry data */ 4048 if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) { 4049 iovp->iov_base += outcount; 4050 iovp->iov_len -= outcount; 4051 uiop->uio_resid -= outcount; 4052 uiop->uio_offset = offset; 4053 } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, 4054 uiop)) == 0) 4055 uiop->uio_offset = offset; 4056 update_inode: 4057 ITIMES(ip); 4058 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) 4059 kmem_free(outbuf, bufsize); 4060 4061 if (eofp && error == 0) 4062 *eofp = (uiop->uio_offset >= (int)ip->i_size); 4063 unlock: 4064 if (ulp) { 4065 ufs_lockfs_end(ulp); 4066 } 4067 out: 4068 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_END, 4069 "ufs_readdir_end:vp %p error %d", vp, error); 4070 return (error); 4071 } 4072 4073 /*ARGSUSED*/ 4074 static int 4075 ufs_symlink( 4076 struct vnode *dvp, /* ptr to parent dir vnode */ 4077 char *linkname, /* name of symbolic link */ 4078 struct vattr *vap, /* attributes */ 4079 char *target, /* target path */ 4080 struct cred *cr) /* user credentials */ 4081 { 4082 struct inode *ip, *dip = VTOI(dvp); 4083 struct ufsvfs *ufsvfsp = dip->i_ufsvfs; 4084 struct ulockfs *ulp; 4085 int error; 4086 int issync; 4087 int trans_size; 4088 int residual; 4089 int ioflag; 4090 int retry = 1; 4091 4092 TRACE_1(TR_FAC_UFS, TR_UFS_SYMLINK_START, 4093 "ufs_symlink_start:dvp %p", dvp); 4094 4095 /* 4096 * No symlinks in attrdirs at this time 4097 */ 4098 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 4099 return (EINVAL); 4100 4101 again: 4102 ip = (struct inode *)NULL; 4103 vap->va_type = VLNK; 4104 vap->va_rdev = 0; 4105 4106 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK); 4107 if (error) 4108 goto out; 4109 4110 if (ulp) 4111 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK, 4112 trans_size = (int)TOP_SYMLINK_SIZE(dip)); 4113 4114 /* 4115 * We must create the inode before the directory entry, to avoid 4116 * racing with readlink(). ufs_dirmakeinode requires that we 4117 * hold the quota lock as reader, and directory locks as writer. 4118 */ 4119 4120 rw_enter(&dip->i_rwlock, RW_WRITER); 4121 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4122 rw_enter(&dip->i_contents, RW_WRITER); 4123 4124 /* 4125 * Suppress any out of inodes messages if we will retry on 4126 * ENOSP 4127 */ 4128 if (retry) 4129 dip->i_flag |= IQUIET; 4130 4131 error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr); 4132 4133 dip->i_flag &= ~IQUIET; 4134 4135 rw_exit(&dip->i_contents); 4136 rw_exit(&ufsvfsp->vfs_dqrwlock); 4137 rw_exit(&dip->i_rwlock); 4138 4139 if (error) 4140 goto unlock; 4141 4142 /* 4143 * OK. The inode has been created. Write out the data of the 4144 * symbolic link. Since symbolic links are metadata, and should 4145 * remain consistent across a system crash, we need to force the 4146 * data out synchronously. 4147 * 4148 * (This is a change from the semantics in earlier releases, which 4149 * only created symbolic links synchronously if the semi-documented 4150 * 'syncdir' option was set, or if we were being invoked by the NFS 4151 * server, which requires symbolic links to be created synchronously.) 4152 * 4153 * We need to pass in a pointer for the residual length; otherwise 4154 * ufs_rdwri() will always return EIO if it can't write the data, 4155 * even if the error was really ENOSPC or EDQUOT. 
4156 */ 4157 4158 ioflag = FWRITE | FDSYNC; 4159 residual = 0; 4160 4161 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4162 rw_enter(&ip->i_contents, RW_WRITER); 4163 4164 /* 4165 * Suppress file system full messages if we will retry 4166 */ 4167 if (retry) 4168 ip->i_flag |= IQUIET; 4169 4170 error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target), 4171 (offset_t)0, UIO_SYSSPACE, &residual, cr); 4172 4173 ip->i_flag &= ~IQUIET; 4174 4175 if (error) { 4176 rw_exit(&ip->i_contents); 4177 rw_exit(&ufsvfsp->vfs_dqrwlock); 4178 goto remove; 4179 } 4180 4181 /* 4182 * If the link's data is small enough, we can cache it in the inode. 4183 * This is a "fast symbolic link". We don't use the first direct 4184 * block because that's actually used to point at the symbolic link's 4185 * contents on disk; but we know that none of the other direct or 4186 * indirect blocks can be used because symbolic links are restricted 4187 * to be smaller than a file system block. 4188 */ 4189 4190 ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip))); 4191 4192 if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) { 4193 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) { 4194 ip->i_flag |= IFASTSYMLNK; 4195 } else { 4196 int i; 4197 /* error, clear garbage left behind */ 4198 for (i = 1; i < NDADDR; i++) 4199 ip->i_db[i] = 0; 4200 for (i = 0; i < NIADDR; i++) 4201 ip->i_ib[i] = 0; 4202 } 4203 } 4204 4205 rw_exit(&ip->i_contents); 4206 rw_exit(&ufsvfsp->vfs_dqrwlock); 4207 4208 /* 4209 * OK. We've successfully created the symbolic link. All that 4210 * remains is to insert it into the appropriate directory. 4211 */ 4212 4213 rw_enter(&dip->i_rwlock, RW_WRITER); 4214 error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL); 4215 rw_exit(&dip->i_rwlock); 4216 4217 /* 4218 * Fall through into remove-on-error code. We're either done, or we 4219 * need to remove the inode (if we couldn't insert it). 4220 */ 4221 4222 remove: 4223 if (error && (ip != NULL)) { 4224 rw_enter(&ip->i_contents, RW_WRITER); 4225 ip->i_nlink--; 4226 ip->i_flag |= ICHG; 4227 ip->i_seq++; 4228 ufs_setreclaim(ip); 4229 rw_exit(&ip->i_contents); 4230 } 4231 4232 unlock: 4233 if (ip != NULL) 4234 VN_RELE(ITOV(ip)); 4235 4236 if (ulp) { 4237 int terr = 0; 4238 4239 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK, 4240 trans_size); 4241 ufs_lockfs_end(ulp); 4242 if (error == 0) 4243 error = terr; 4244 } 4245 4246 /* 4247 * We may have failed due to lack of an inode or of a block to 4248 * store the target in. Try flushing the delete queue to free 4249 * logically-available things up and try again. 4250 */ 4251 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 4252 ufs_delete_drain_wait(ufsvfsp, 1); 4253 retry = 0; 4254 goto again; 4255 } 4256 4257 out: 4258 TRACE_2(TR_FAC_UFS, TR_UFS_SYMLINK_END, 4259 "ufs_symlink_end:dvp %p error %d", dvp, error); 4260 return (error); 4261 } 4262 4263 /* 4264 * Ufs specific routine used to do ufs io. 
4265 */ 4266 int 4267 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base, 4268 ssize_t len, offset_t offset, enum uio_seg seg, int *aresid, 4269 struct cred *cr) 4270 { 4271 struct uio auio; 4272 struct iovec aiov; 4273 int error; 4274 4275 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 4276 4277 bzero((caddr_t)&auio, sizeof (uio_t)); 4278 bzero((caddr_t)&aiov, sizeof (iovec_t)); 4279 4280 aiov.iov_base = base; 4281 aiov.iov_len = len; 4282 auio.uio_iov = &aiov; 4283 auio.uio_iovcnt = 1; 4284 auio.uio_loffset = offset; 4285 auio.uio_segflg = (short)seg; 4286 auio.uio_resid = len; 4287 4288 if (rw == UIO_WRITE) { 4289 auio.uio_fmode = FWRITE; 4290 auio.uio_extflg = UIO_COPY_DEFAULT; 4291 auio.uio_llimit = curproc->p_fsz_ctl; 4292 error = wrip(ip, &auio, ioflag, cr); 4293 } else { 4294 auio.uio_fmode = FREAD; 4295 auio.uio_extflg = UIO_COPY_CACHED; 4296 auio.uio_llimit = MAXOFFSET_T; 4297 error = rdip(ip, &auio, ioflag, cr); 4298 } 4299 4300 if (aresid) { 4301 *aresid = auio.uio_resid; 4302 } else if (auio.uio_resid) { 4303 error = EIO; 4304 } 4305 return (error); 4306 } 4307 4308 static int 4309 ufs_fid(vp, fidp) 4310 struct vnode *vp; 4311 struct fid *fidp; 4312 { 4313 struct ufid *ufid; 4314 struct inode *ip = VTOI(vp); 4315 4316 if (ip->i_ufsvfs == NULL) 4317 return (EIO); 4318 4319 if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) { 4320 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t); 4321 return (ENOSPC); 4322 } 4323 4324 ufid = (struct ufid *)fidp; 4325 bzero((char *)ufid, sizeof (struct ufid)); 4326 ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t); 4327 ufid->ufid_ino = ip->i_number; 4328 ufid->ufid_gen = ip->i_gen; 4329 4330 return (0); 4331 } 4332 4333 /* ARGSUSED2 */ 4334 static int 4335 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4336 { 4337 struct inode *ip = VTOI(vp); 4338 struct ufsvfs *ufsvfsp; 4339 int forcedirectio; 4340 4341 /* 4342 * Read case is easy. 4343 */ 4344 if (!write_lock) { 4345 rw_enter(&ip->i_rwlock, RW_READER); 4346 return (V_WRITELOCK_FALSE); 4347 } 4348 4349 /* 4350 * Caller has requested a writer lock, but that inhibits any 4351 * concurrency in the VOPs that follow. Acquire the lock shared 4352 * and defer exclusive access until it is known to be needed in 4353 * other VOP handlers. Some cases can be determined here. 4354 */ 4355 4356 /* 4357 * If directio is not set, there is no chance of concurrency, 4358 * so just acquire the lock exclusive. Beware of a forced 4359 * unmount before looking at the mount option. 4360 */ 4361 ufsvfsp = ip->i_ufsvfs; 4362 forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0; 4363 if (!(ip->i_flag & IDIRECTIO || forcedirectio) || 4364 !ufs_allow_shared_writes) { 4365 rw_enter(&ip->i_rwlock, RW_WRITER); 4366 return (V_WRITELOCK_TRUE); 4367 } 4368 4369 /* 4370 * Mandatory locking forces acquiring i_rwlock exclusive. 4371 */ 4372 if (MANDLOCK(vp, ip->i_mode)) { 4373 rw_enter(&ip->i_rwlock, RW_WRITER); 4374 return (V_WRITELOCK_TRUE); 4375 } 4376 4377 /* 4378 * Acquire the lock shared in case a concurrent write follows. 4379 * Mandatory locking could have become enabled before the lock 4380 * was acquired. Re-check and upgrade if needed. 
4381 */ 4382 rw_enter(&ip->i_rwlock, RW_READER); 4383 if (MANDLOCK(vp, ip->i_mode)) { 4384 rw_exit(&ip->i_rwlock); 4385 rw_enter(&ip->i_rwlock, RW_WRITER); 4386 return (V_WRITELOCK_TRUE); 4387 } 4388 return (V_WRITELOCK_FALSE); 4389 } 4390 4391 /*ARGSUSED*/ 4392 static void 4393 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4394 { 4395 struct inode *ip = VTOI(vp); 4396 4397 rw_exit(&ip->i_rwlock); 4398 } 4399 4400 /* ARGSUSED */ 4401 static int 4402 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp) 4403 { 4404 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4405 } 4406 4407 /* ARGSUSED */ 4408 static int 4409 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4410 offset_t offset, struct flk_callback *flk_cbp, struct cred *cr) 4411 { 4412 struct inode *ip = VTOI(vp); 4413 4414 if (ip->i_ufsvfs == NULL) 4415 return (EIO); 4416 4417 /* 4418 * If file is being mapped, disallow frlock. 4419 * XXX I am not holding tlock while checking i_mapcnt because the 4420 * current locking strategy drops all locks before calling fs_frlock. 4421 * So, mapcnt could change before we enter fs_frlock making is 4422 * meaningless to have held tlock in the first place. 4423 */ 4424 if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode)) 4425 return (EAGAIN); 4426 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 4427 } 4428 4429 /* ARGSUSED */ 4430 static int 4431 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4432 offset_t offset, cred_t *cr, caller_context_t *ct) 4433 { 4434 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 4435 struct ulockfs *ulp; 4436 int error; 4437 4438 if ((error = convoff(vp, bfp, 0, offset)) == 0) { 4439 if (cmd == F_FREESP) { 4440 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4441 ULOCKFS_SPACE_MASK); 4442 if (error) 4443 return (error); 4444 error = ufs_freesp(vp, bfp, flag, cr); 4445 } else if (cmd == F_ALLOCSP) { 4446 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4447 ULOCKFS_FALLOCATE_MASK); 4448 if (error) 4449 return (error); 4450 error = ufs_allocsp(vp, bfp, cr); 4451 } else 4452 return (EINVAL); /* Command not handled here */ 4453 4454 if (ulp) 4455 ufs_lockfs_end(ulp); 4456 4457 } 4458 return (error); 4459 } 4460 4461 /* 4462 * Used to determine if read ahead should be done. Also used to 4463 * to determine when write back occurs. 4464 */ 4465 #define CLUSTSZ(ip) ((ip)->i_ufsvfs->vfs_ioclustsz) 4466 4467 /* 4468 * A faster version of ufs_getpage. 4469 * 4470 * We optimize by inlining the pvn_getpages iterator, eliminating 4471 * calls to bmap_read if file doesn't have UFS holes, and avoiding 4472 * the overhead of page_exists(). 4473 * 4474 * When files has UFS_HOLES and ufs_getpage is called with S_READ, 4475 * we set *protp to PROT_READ to avoid calling bmap_read. This approach 4476 * victimizes performance when a file with UFS holes is faulted 4477 * first in the S_READ mode, and then in the S_WRITE mode. We will get 4478 * two MMU faults in this case. 4479 * 4480 * XXX - the inode fields which control the sequential mode are not 4481 * protected by any mutex. The read ahead will act wild if 4482 * multiple processes will access the file concurrently and 4483 * some of them in sequential mode. One particulary bad case 4484 * is if another thread will change the value of i_nextrio between 4485 * the time this thread tests the i_nextrio value and then reads it 4486 * again to use it as the offset for the read ahead. 
4487 */ 4488 static int 4489 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 4490 page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr, 4491 enum seg_rw rw, struct cred *cr) 4492 { 4493 u_offset_t uoff = (u_offset_t)off; /* type conversion */ 4494 u_offset_t pgoff; 4495 u_offset_t eoff; 4496 struct inode *ip = VTOI(vp); 4497 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 4498 struct fs *fs; 4499 struct ulockfs *ulp; 4500 page_t **pl; 4501 caddr_t pgaddr; 4502 krw_t rwtype; 4503 int err; 4504 int has_holes; 4505 int beyond_eof; 4506 int seqmode; 4507 int pgsize = PAGESIZE; 4508 int dolock; 4509 int do_qlock; 4510 int trans_size; 4511 4512 TRACE_1(TR_FAC_UFS, TR_UFS_GETPAGE_START, 4513 "ufs_getpage_start:vp %p", vp); 4514 4515 ASSERT((uoff & PAGEOFFSET) == 0); 4516 4517 if (protp) 4518 *protp = PROT_ALL; 4519 4520 /* 4521 * Obey the lockfs protocol 4522 */ 4523 err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg, 4524 rw == S_READ || rw == S_EXEC, protp); 4525 if (err) 4526 goto out; 4527 4528 fs = ufsvfsp->vfs_fs; 4529 4530 if (ulp && (rw == S_CREATE || rw == S_WRITE) && 4531 !(vp->v_flag & VISSWAP)) { 4532 /* 4533 * Try to start a transaction, will return if blocking is 4534 * expected to occur and the address space is not the 4535 * kernel address space. 4536 */ 4537 trans_size = TOP_GETPAGE_SIZE(ip); 4538 if (seg->s_as != &kas) { 4539 TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, 4540 trans_size, err) 4541 if (err == EWOULDBLOCK) { 4542 /* 4543 * Use EDEADLK here because the VM code 4544 * can normally never see this error. 4545 */ 4546 err = EDEADLK; 4547 ufs_lockfs_end(ulp); 4548 goto out; 4549 } 4550 } else { 4551 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4552 } 4553 } 4554 4555 if (vp->v_flag & VNOMAP) { 4556 err = ENOSYS; 4557 goto unlock; 4558 } 4559 4560 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 4561 4562 rwtype = RW_READER; /* start as a reader */ 4563 dolock = (rw_owner(&ip->i_contents) != curthread); 4564 /* 4565 * If this thread owns the lock, i.e., this thread grabbed it 4566 * as writer somewhere above, then we don't need to grab the 4567 * lock as reader in this routine. 4568 */ 4569 do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread); 4570 4571 retrylock: 4572 if (dolock) { 4573 /* 4574 * Grab the quota lock if we need to call 4575 * bmap_write() below (with i_contents as writer). 4576 */ 4577 if (do_qlock && rwtype == RW_WRITER) 4578 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4579 rw_enter(&ip->i_contents, rwtype); 4580 } 4581 4582 /* 4583 * We may be getting called as a side effect of a bmap using 4584 * fbread() when the blocks might be being allocated and the 4585 * size has not yet been up'ed. In this case we want to be 4586 * able to return zero pages if we get back UFS_HOLE from 4587 * calling bmap for a non write case here. We also might have 4588 * to read some frags from the disk into a page if we are 4589 * extending the number of frags for a given lbn in bmap(). 4590 * Large Files: The read of i_size here is atomic because 4591 * i_contents is held here. If dolock is zero, the lock 4592 * is held in bmap routines. 
4593 */ 4594 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 4595 if (beyond_eof && seg != segkmap) { 4596 if (dolock) { 4597 rw_exit(&ip->i_contents); 4598 if (do_qlock && rwtype == RW_WRITER) 4599 rw_exit(&ufsvfsp->vfs_dqrwlock); 4600 } 4601 err = EFAULT; 4602 goto unlock; 4603 } 4604 4605 /* 4606 * Must hold i_contents lock throughout the call to pvn_getpages 4607 * since locked pages are returned from each call to ufs_getapage. 4608 * Must *not* return locked pages and then try for contents lock 4609 * due to lock ordering requirements (inode > page) 4610 */ 4611 4612 has_holes = bmap_has_holes(ip); 4613 4614 if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) { 4615 int blk_size; 4616 u_offset_t offset; 4617 4618 /* 4619 * We must acquire the RW_WRITER lock in order to 4620 * call bmap_write(). 4621 */ 4622 if (dolock && rwtype == RW_READER) { 4623 rwtype = RW_WRITER; 4624 4625 /* 4626 * Grab the quota lock before 4627 * upgrading i_contents, but if we can't grab it 4628 * don't wait here due to lock order: 4629 * vfs_dqrwlock > i_contents. 4630 */ 4631 if (do_qlock && rw_tryenter(&ufsvfsp->vfs_dqrwlock, 4632 RW_READER) == 0) { 4633 rw_exit(&ip->i_contents); 4634 goto retrylock; 4635 } 4636 if (!rw_tryupgrade(&ip->i_contents)) { 4637 rw_exit(&ip->i_contents); 4638 if (do_qlock) 4639 rw_exit(&ufsvfsp->vfs_dqrwlock); 4640 goto retrylock; 4641 } 4642 } 4643 4644 /* 4645 * May be allocating disk blocks for holes here as 4646 * a result of mmap faults. write(2) does the bmap_write 4647 * in rdip/wrip, not here. We are not dealing with frags 4648 * in this case. 4649 */ 4650 /* 4651 * Large Files: We cast fs_bmask field to offset_t 4652 * just as we do for MAXBMASK because uoff is a 64-bit 4653 * data type. fs_bmask will still be a 32-bit type 4654 * as we cannot change any ondisk data structures. 4655 */ 4656 4657 offset = uoff & (offset_t)fs->fs_bmask; 4658 while (offset < uoff + len) { 4659 blk_size = (int)blksize(fs, ip, lblkno(fs, offset)); 4660 err = bmap_write(ip, offset, blk_size, 4661 BI_NORMAL, NULL, cr); 4662 if (ip->i_flag & (ICHG|IUPD)) 4663 ip->i_seq++; 4664 if (err) 4665 goto update_inode; 4666 offset += blk_size; /* XXX - make this contig */ 4667 } 4668 } 4669 4670 /* 4671 * Can be a reader from now on. 4672 */ 4673 if (dolock && rwtype == RW_WRITER) { 4674 rw_downgrade(&ip->i_contents); 4675 /* 4676 * We can release vfs_dqrwlock early so do it, but make 4677 * sure we don't try to release it again at the bottom. 4678 */ 4679 if (do_qlock) { 4680 rw_exit(&ufsvfsp->vfs_dqrwlock); 4681 do_qlock = 0; 4682 } 4683 } 4684 4685 /* 4686 * We remove PROT_WRITE in cases when the file has UFS holes 4687 * because we don't want to call bmap_read() to check each 4688 * page if it is backed with a disk block. 4689 */ 4690 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) 4691 *protp &= ~PROT_WRITE; 4692 4693 err = 0; 4694 4695 /* 4696 * The loop looks up pages in the range [off, off + len). 4697 * For each page, we first check if we should initiate an asynchronous 4698 * read ahead before we call page_lookup (we may sleep in page_lookup 4699 * for a previously initiated disk read). 4700 */ 4701 eoff = (uoff + len); 4702 for (pgoff = uoff, pgaddr = addr, pl = plarr; 4703 pgoff < eoff; /* empty */) { 4704 page_t *pp; 4705 u_offset_t nextrio; 4706 se_t se; 4707 int retval; 4708 4709 se = ((rw == S_CREATE || rw == S_OTHER) ? 
SE_EXCL : SE_SHARED); 4710 4711 /* Handle async getpage (faultahead) */ 4712 if (plarr == NULL) { 4713 ip->i_nextrio = pgoff; 4714 (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4715 pgoff += pgsize; 4716 pgaddr += pgsize; 4717 continue; 4718 } 4719 /* 4720 * Check if we should initiate read ahead of next cluster. 4721 * We call page_exists only when we need to confirm that 4722 * we have the current page before we initiate the read ahead. 4723 */ 4724 nextrio = ip->i_nextrio; 4725 if (seqmode && 4726 pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 4727 nextrio < ip->i_size && page_exists(vp, pgoff)) { 4728 retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4729 /* 4730 * We always read ahead the next cluster of data 4731 * starting from i_nextrio. If the page (vp,nextrio) 4732 * is actually in core at this point, the routine 4733 * ufs_getpage_ra() will stop pre-fetching data 4734 * until we read that page in a synchronized manner 4735 * through ufs_getpage_miss(). So, we should increase 4736 * i_nextrio if the page (vp, nextrio) exists. 4737 */ 4738 if ((retval == 0) && page_exists(vp, nextrio)) { 4739 ip->i_nextrio = nextrio + pgsize; 4740 } 4741 } 4742 4743 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 4744 /* 4745 * We found the page in the page cache. 4746 */ 4747 *pl++ = pp; 4748 pgoff += pgsize; 4749 pgaddr += pgsize; 4750 len -= pgsize; 4751 plsz -= pgsize; 4752 } else { 4753 /* 4754 * We have to create the page, or read it from disk. 4755 */ 4756 if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr, 4757 pl, plsz, rw, seqmode)) 4758 goto error; 4759 4760 while (*pl != NULL) { 4761 pl++; 4762 pgoff += pgsize; 4763 pgaddr += pgsize; 4764 len -= pgsize; 4765 plsz -= pgsize; 4766 } 4767 } 4768 } 4769 4770 /* 4771 * Return pages up to plsz if they are in the page cache. 4772 * We cannot return pages if there is a chance that they are 4773 * backed with a UFS hole and rw is S_WRITE or S_CREATE. 4774 */ 4775 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 4776 4777 ASSERT((protp == NULL) || 4778 !(has_holes && (*protp & PROT_WRITE))); 4779 4780 eoff = pgoff + plsz; 4781 while (pgoff < eoff) { 4782 page_t *pp; 4783 4784 if ((pp = page_lookup_nowait(vp, pgoff, 4785 SE_SHARED)) == NULL) 4786 break; 4787 4788 *pl++ = pp; 4789 pgoff += pgsize; 4790 plsz -= pgsize; 4791 } 4792 } 4793 4794 if (plarr) 4795 *pl = NULL; /* Terminate page list */ 4796 ip->i_nextr = pgoff; 4797 4798 error: 4799 if (err && plarr) { 4800 /* 4801 * Release any pages we have locked. 4802 */ 4803 while (pl > &plarr[0]) 4804 page_unlock(*--pl); 4805 4806 plarr[0] = NULL; 4807 } 4808 4809 update_inode: 4810 /* 4811 * If the inode is not already marked for IACC (in rdip() for read) 4812 * and the inode is not marked for no access time update (in wrip() 4813 * for write) then update the inode access time and mod time now. 
4814 */ 4815 if ((ip->i_flag & (IACC | INOACC)) == 0) { 4816 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) { 4817 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 4818 (fs->fs_ronly == 0) && 4819 (!ufsvfsp->vfs_noatime)) { 4820 mutex_enter(&ip->i_tlock); 4821 ip->i_flag |= IACC; 4822 ITIMES_NOLOCK(ip); 4823 mutex_exit(&ip->i_tlock); 4824 } 4825 } 4826 } 4827 4828 if (dolock) { 4829 rw_exit(&ip->i_contents); 4830 if (do_qlock && rwtype == RW_WRITER) 4831 rw_exit(&ufsvfsp->vfs_dqrwlock); 4832 } 4833 4834 unlock: 4835 if (ulp) { 4836 if ((rw == S_CREATE || rw == S_WRITE) && 4837 !(vp->v_flag & VISSWAP)) { 4838 TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4839 } 4840 ufs_lockfs_end(ulp); 4841 } 4842 out: 4843 TRACE_2(TR_FAC_UFS, TR_UFS_GETPAGE_END, 4844 "ufs_getpage_end:vp %p error %d", vp, err); 4845 return (err); 4846 } 4847 4848 /* 4849 * ufs_getpage_miss is called when ufs_getpage missed the page in the page 4850 * cache. The page is either read from the disk, or it's created. 4851 * A page is created (without disk read) if rw == S_CREATE, or if 4852 * the page is not backed with a real disk block (UFS hole). 4853 */ 4854 /* ARGSUSED */ 4855 static int 4856 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg, 4857 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq) 4858 { 4859 struct inode *ip = VTOI(vp); 4860 page_t *pp; 4861 daddr_t bn; 4862 size_t io_len; 4863 int crpage = 0; 4864 int err; 4865 int contig; 4866 int bsize = ip->i_fs->fs_bsize; 4867 4868 /* 4869 * Figure out whether the page can be created, or must be 4870 * must be read from the disk. 4871 */ 4872 if (rw == S_CREATE) 4873 crpage = 1; 4874 else { 4875 contig = 0; 4876 if (err = bmap_read(ip, off, &bn, &contig)) 4877 return (err); 4878 4879 crpage = (bn == UFS_HOLE); 4880 4881 /* 4882 * If its also a fallocated block that hasn't been written to 4883 * yet, we will treat it just like a UFS_HOLE and create 4884 * a zero page for it 4885 */ 4886 if (ISFALLOCBLK(ip, bn)) 4887 crpage = 1; 4888 } 4889 4890 if (crpage) { 4891 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, 4892 addr)) == NULL) { 4893 return (ufs_fault(vp, 4894 "ufs_getpage_miss: page_create == NULL")); 4895 } 4896 4897 if (rw != S_CREATE) 4898 pagezero(pp, 0, PAGESIZE); 4899 4900 io_len = PAGESIZE; 4901 } else { 4902 u_offset_t io_off; 4903 uint_t xlen; 4904 struct buf *bp; 4905 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 4906 4907 /* 4908 * If access is not in sequential order, we read from disk 4909 * in bsize units. 4910 * 4911 * We limit the size of the transfer to bsize if we are reading 4912 * from the beginning of the file. Note in this situation we 4913 * will hedge our bets and initiate an async read ahead of 4914 * the second block. 4915 */ 4916 if (!seq || off == 0) 4917 contig = MIN(contig, bsize); 4918 4919 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4920 &io_len, off, contig, 0); 4921 4922 /* 4923 * Some other thread has entered the page. 4924 * ufs_getpage will retry page_lookup. 4925 */ 4926 if (pp == NULL) { 4927 pl[0] = NULL; 4928 return (0); 4929 } 4930 4931 /* 4932 * Zero part of the page which we are not 4933 * going to read from the disk. 
4934 */ 4935 xlen = io_len & PAGEOFFSET; 4936 if (xlen != 0) 4937 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4938 4939 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ); 4940 bp->b_edev = ip->i_dev; 4941 bp->b_dev = cmpdev(ip->i_dev); 4942 bp->b_blkno = bn; 4943 bp->b_un.b_addr = (caddr_t)0; 4944 bp->b_file = ip->i_vnode; 4945 bp->b_offset = off; 4946 4947 if (ufsvfsp->vfs_log) { 4948 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4949 } else if (ufsvfsp->vfs_snapshot) { 4950 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4951 } else { 4952 ufsvfsp->vfs_iotstamp = lbolt; 4953 ub.ub_getpages.value.ul++; 4954 (void) bdev_strategy(bp); 4955 lwp_stat_update(LWP_STAT_INBLK, 1); 4956 } 4957 4958 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK); 4959 4960 /* 4961 * If the file access is sequential, initiate read ahead 4962 * of the next cluster. 4963 */ 4964 if (seq && ip->i_nextrio < ip->i_size) 4965 (void) ufs_getpage_ra(vp, off, seg, addr); 4966 err = biowait(bp); 4967 pageio_done(bp); 4968 4969 if (err) { 4970 pvn_read_done(pp, B_ERROR); 4971 return (err); 4972 } 4973 } 4974 4975 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4976 return (0); 4977 } 4978 4979 /* 4980 * Read ahead a cluster from the disk. Returns the length in bytes. 4981 */ 4982 static int 4983 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr) 4984 { 4985 struct inode *ip = VTOI(vp); 4986 page_t *pp; 4987 u_offset_t io_off = ip->i_nextrio; 4988 ufsvfs_t *ufsvfsp; 4989 caddr_t addr2 = addr + (io_off - off); 4990 struct buf *bp; 4991 daddr_t bn; 4992 size_t io_len; 4993 int err; 4994 int contig; 4995 int xlen; 4996 int bsize = ip->i_fs->fs_bsize; 4997 4998 /* 4999 * If the directio advisory is in effect on this file, 5000 * then do not do buffered read ahead. Read ahead makes 5001 * it more difficult on threads using directio as they 5002 * will be forced to flush the pages from this vnode. 5003 */ 5004 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5005 return (0); 5006 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) 5007 return (0); 5008 5009 /* 5010 * Is this test needed? 5011 */ 5012 if (addr2 >= seg->s_base + seg->s_size) 5013 return (0); 5014 5015 contig = 0; 5016 err = bmap_read(ip, io_off, &bn, &contig); 5017 /* 5018 * If its a UFS_HOLE or a fallocated block, do not perform 5019 * any read ahead's since there probably is nothing to read ahead 5020 */ 5021 if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn)) 5022 return (0); 5023 5024 /* 5025 * Limit the transfer size to bsize if this is the 2nd block. 
5026 */ 5027 if (io_off == (u_offset_t)bsize) 5028 contig = MIN(contig, bsize); 5029 5030 if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off, 5031 &io_len, io_off, contig, 1)) == NULL) 5032 return (0); 5033 5034 /* 5035 * Zero part of page which we are not going to read from disk 5036 */ 5037 if ((xlen = (io_len & PAGEOFFSET)) > 0) 5038 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 5039 5040 ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK; 5041 5042 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC); 5043 bp->b_edev = ip->i_dev; 5044 bp->b_dev = cmpdev(ip->i_dev); 5045 bp->b_blkno = bn; 5046 bp->b_un.b_addr = (caddr_t)0; 5047 bp->b_file = ip->i_vnode; 5048 bp->b_offset = off; 5049 5050 if (ufsvfsp->vfs_log) { 5051 lufs_read_strategy(ufsvfsp->vfs_log, bp); 5052 } else if (ufsvfsp->vfs_snapshot) { 5053 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5054 } else { 5055 ufsvfsp->vfs_iotstamp = lbolt; 5056 ub.ub_getras.value.ul++; 5057 (void) bdev_strategy(bp); 5058 lwp_stat_update(LWP_STAT_INBLK, 1); 5059 } 5060 5061 return (io_len); 5062 } 5063 5064 int ufs_delay = 1; 5065 /* 5066 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC} 5067 * 5068 * LMXXX - the inode really ought to contain a pointer to one of these 5069 * async args. Stuff gunk in there and just hand the whole mess off. 5070 * This would replace i_delaylen, i_delayoff. 5071 */ 5072 /*ARGSUSED*/ 5073 static int 5074 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 5075 struct cred *cr) 5076 { 5077 struct inode *ip = VTOI(vp); 5078 int err = 0; 5079 5080 if (vp->v_count == 0) { 5081 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0")); 5082 } 5083 5084 TRACE_1(TR_FAC_UFS, TR_UFS_PUTPAGE_START, 5085 "ufs_putpage_start:vp %p", vp); 5086 5087 /* 5088 * XXX - Why should this check be made here? 5089 */ 5090 if (vp->v_flag & VNOMAP) { 5091 err = ENOSYS; 5092 goto errout; 5093 } 5094 5095 if (ip->i_ufsvfs == NULL) { 5096 err = EIO; 5097 goto errout; 5098 } 5099 5100 if (flags & B_ASYNC) { 5101 if (ufs_delay && len && 5102 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 5103 mutex_enter(&ip->i_tlock); 5104 /* 5105 * If nobody stalled, start a new cluster. 5106 */ 5107 if (ip->i_delaylen == 0) { 5108 ip->i_delayoff = off; 5109 ip->i_delaylen = len; 5110 mutex_exit(&ip->i_tlock); 5111 goto errout; 5112 } 5113 /* 5114 * If we have a full cluster or they are not contig, 5115 * then push last cluster and start over. 5116 */ 5117 if (ip->i_delaylen >= CLUSTSZ(ip) || 5118 ip->i_delayoff + ip->i_delaylen != off) { 5119 u_offset_t doff; 5120 size_t dlen; 5121 5122 doff = ip->i_delayoff; 5123 dlen = ip->i_delaylen; 5124 ip->i_delayoff = off; 5125 ip->i_delaylen = len; 5126 mutex_exit(&ip->i_tlock); 5127 err = ufs_putpages(vp, doff, dlen, 5128 flags, cr); 5129 /* LMXXX - flags are new val, not old */ 5130 goto errout; 5131 } 5132 /* 5133 * There is something there, it's not full, and 5134 * it is contig. 5135 */ 5136 ip->i_delaylen += len; 5137 mutex_exit(&ip->i_tlock); 5138 goto errout; 5139 } 5140 /* 5141 * Must have weird flags or we are not clustering. 5142 */ 5143 } 5144 5145 err = ufs_putpages(vp, off, len, flags, cr); 5146 5147 errout: 5148 TRACE_2(TR_FAC_UFS, TR_UFS_PUTPAGE_END, 5149 "ufs_putpage_end:vp %p error %d", vp, err); 5150 return (err); 5151 } 5152 5153 /* 5154 * If len == 0, do from off to EOF. 
5155 * 5156 * The normal cases should be len == 0 & off == 0 (entire vp list), 5157 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 5158 * (from pageout). 5159 */ 5160 /*ARGSUSED*/ 5161 static int 5162 ufs_putpages( 5163 struct vnode *vp, 5164 offset_t off, 5165 size_t len, 5166 int flags, 5167 struct cred *cr) 5168 { 5169 u_offset_t io_off; 5170 u_offset_t eoff; 5171 struct inode *ip = VTOI(vp); 5172 page_t *pp; 5173 size_t io_len; 5174 int err = 0; 5175 int dolock; 5176 5177 if (vp->v_count == 0) 5178 return (ufs_fault(vp, "ufs_putpages: v_count == 0")); 5179 /* 5180 * Acquire the readers/write inode lock before locking 5181 * any pages in this inode. 5182 * The inode lock is held during i/o. 5183 */ 5184 if (len == 0) { 5185 mutex_enter(&ip->i_tlock); 5186 ip->i_delayoff = ip->i_delaylen = 0; 5187 mutex_exit(&ip->i_tlock); 5188 } 5189 dolock = (rw_owner(&ip->i_contents) != curthread); 5190 if (dolock) { 5191 /* 5192 * Must synchronize this thread and any possible thread 5193 * operating in the window of vulnerability in wrip(). 5194 * It is dangerous to allow both a thread doing a putpage 5195 * and a thread writing, so serialize them. The exception 5196 * is when the thread in wrip() does something which causes 5197 * a putpage operation. Then, the thread must be allowed 5198 * to continue. It may encounter a bmap_read problem in 5199 * ufs_putapage, but that is handled in ufs_putapage. 5200 * Allow async writers to proceed, we don't want to block 5201 * the pageout daemon. 5202 */ 5203 if (ip->i_writer == curthread) 5204 rw_enter(&ip->i_contents, RW_READER); 5205 else { 5206 for (;;) { 5207 rw_enter(&ip->i_contents, RW_READER); 5208 mutex_enter(&ip->i_tlock); 5209 /* 5210 * If there is no thread in the critical 5211 * section of wrip(), then proceed. 5212 * Otherwise, wait until there isn't one. 5213 */ 5214 if (ip->i_writer == NULL) { 5215 mutex_exit(&ip->i_tlock); 5216 break; 5217 } 5218 rw_exit(&ip->i_contents); 5219 /* 5220 * Bounce async writers when we have a writer 5221 * working on this file so we don't deadlock 5222 * the pageout daemon. 5223 */ 5224 if (flags & B_ASYNC) { 5225 mutex_exit(&ip->i_tlock); 5226 return (0); 5227 } 5228 cv_wait(&ip->i_wrcv, &ip->i_tlock); 5229 mutex_exit(&ip->i_tlock); 5230 } 5231 } 5232 } 5233 5234 if (!vn_has_cached_data(vp)) { 5235 if (dolock) 5236 rw_exit(&ip->i_contents); 5237 return (0); 5238 } 5239 5240 if (len == 0) { 5241 /* 5242 * Search the entire vp list for pages >= off. 5243 */ 5244 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage, 5245 flags, cr); 5246 } else { 5247 /* 5248 * Loop over all offsets in the range looking for 5249 * pages to deal with. 5250 */ 5251 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0) 5252 eoff = MIN(off + len, eoff); 5253 else 5254 eoff = off + len; 5255 5256 for (io_off = off; io_off < eoff; io_off += io_len) { 5257 /* 5258 * If we are not invalidating, synchronously 5259 * freeing or writing pages, use the routine 5260 * page_lookup_nowait() to prevent reclaiming 5261 * them from the free list. 5262 */ 5263 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 5264 pp = page_lookup(vp, io_off, 5265 (flags & (B_INVAL | B_FREE)) ? 5266 SE_EXCL : SE_SHARED); 5267 } else { 5268 pp = page_lookup_nowait(vp, io_off, 5269 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 5270 } 5271 5272 if (pp == NULL || pvn_getdirty(pp, flags) == 0) 5273 io_len = PAGESIZE; 5274 else { 5275 u_offset_t *io_offp = &io_off; 5276 5277 err = ufs_putapage(vp, pp, io_offp, &io_len, 5278 flags, cr); 5279 if (err != 0) 5280 break; 5281 /* 5282 * "io_off" and "io_len" are returned as 5283 * the range of pages we actually wrote. 5284 * This allows us to skip ahead more quickly 5285 * since several pages may've been dealt 5286 * with by this iteration of the loop. 5287 */ 5288 } 5289 } 5290 } 5291 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) { 5292 /* 5293 * We have just sync'ed back all the pages on 5294 * the inode, turn off the IMODTIME flag. 5295 */ 5296 mutex_enter(&ip->i_tlock); 5297 ip->i_flag &= ~IMODTIME; 5298 mutex_exit(&ip->i_tlock); 5299 } 5300 if (dolock) 5301 rw_exit(&ip->i_contents); 5302 return (err); 5303 } 5304 5305 static void 5306 ufs_iodone(buf_t *bp) 5307 { 5308 struct inode *ip; 5309 5310 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 5311 5312 bp->b_iodone = NULL; 5313 5314 ip = VTOI(bp->b_pages->p_vnode); 5315 5316 mutex_enter(&ip->i_tlock); 5317 if (ip->i_writes >= ufs_LW) { 5318 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW) 5319 if (ufs_WRITES) 5320 cv_broadcast(&ip->i_wrcv); /* wake all up */ 5321 } else { 5322 ip->i_writes -= bp->b_bcount; 5323 } 5324 5325 mutex_exit(&ip->i_tlock); 5326 iodone(bp); 5327 } 5328 5329 /* 5330 * Write out a single page, possibly klustering adjacent 5331 * dirty pages. The inode lock must be held. 5332 * 5333 * LMXXX - bsize < pagesize not done. 5334 */ 5335 /*ARGSUSED*/ 5336 int 5337 ufs_putapage( 5338 struct vnode *vp, 5339 page_t *pp, 5340 u_offset_t *offp, 5341 size_t *lenp, /* return values */ 5342 int flags, 5343 struct cred *cr) 5344 { 5345 u_offset_t io_off; 5346 u_offset_t off; 5347 struct inode *ip = VTOI(vp); 5348 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 5349 struct fs *fs; 5350 struct buf *bp; 5351 size_t io_len; 5352 daddr_t bn; 5353 int err; 5354 int contig; 5355 5356 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 5357 5358 TRACE_1(TR_FAC_UFS, TR_UFS_PUTAPAGE_START, 5359 "ufs_putapage_start:vp %p", vp); 5360 5361 if (ufsvfsp == NULL) { 5362 err = EIO; 5363 goto out_trace; 5364 } 5365 5366 fs = ip->i_fs; 5367 ASSERT(fs->fs_ronly == 0); 5368 5369 /* 5370 * If the modified time on the inode has not already been 5371 * set elsewhere (e.g. for write/setattr) we set the time now. 5372 * This gives us approximate modified times for mmap'ed files 5373 * which are modified via stores in the user address space. 5374 */ 5375 if ((ip->i_flag & IMODTIME) == 0) { 5376 mutex_enter(&ip->i_tlock); 5377 ip->i_flag |= IUPD; 5378 ip->i_seq++; 5379 ITIMES_NOLOCK(ip); 5380 mutex_exit(&ip->i_tlock); 5381 } 5382 5383 /* 5384 * Align the request to a block boundry (for old file systems), 5385 * and go ask bmap() how contiguous things are for this file. 5386 */ 5387 off = pp->p_offset & (offset_t)fs->fs_bmask; /* block align it */ 5388 contig = 0; 5389 err = bmap_read(ip, off, &bn, &contig); 5390 if (err) 5391 goto out; 5392 if (bn == UFS_HOLE) { /* putpage never allocates */ 5393 /* 5394 * logging device is in error mode; simply return EIO 5395 */ 5396 if (TRANS_ISERROR(ufsvfsp)) { 5397 err = EIO; 5398 goto out; 5399 } 5400 /* 5401 * Oops, the thread in the window in wrip() did some 5402 * sort of operation which caused a putpage in the bad 5403 * range. 
In this case, just return an error which will 5404 * cause the software modified bit on the page to be set 5405 * and the page will get written out again later. 5406 */ 5407 if (ip->i_writer == curthread) { 5408 err = EIO; 5409 goto out; 5410 } 5411 /* 5412 * If the pager is trying to push a page in the bad range 5413 * just tell it to try again later when things are better. 5414 */ 5415 if (flags & B_ASYNC) { 5416 err = EAGAIN; 5417 goto out; 5418 } 5419 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE"); 5420 goto out; 5421 } 5422 5423 /* 5424 * If it is a fallocate'd block, reverse the negativity since 5425 * we are now writing to it. 5426 */ 5427 if (ISFALLOCBLK(ip, bn)) { 5428 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn)); 5429 if (err) 5430 goto out; 5431 5432 bn = -bn; 5433 } 5434 5435 /* 5436 * Take the length (of contiguous bytes) passed back from bmap() 5437 * and _try_ to get a set of pages covering that extent. 5438 */ 5439 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags); 5440 5441 /* 5442 * May have run out of memory and not clustered backwards. 5443 * off p_offset 5444 * [ pp - 1 ][ pp ] 5445 * [ block ] 5446 * We told bmap off, so we have to adjust the bn accordingly. 5447 */ 5448 if (io_off > off) { 5449 bn += btod(io_off - off); 5450 contig -= (io_off - off); 5451 } 5452 5453 /* 5454 * bmap was careful to tell us the right size, so use that. 5455 * There might be unallocated frags at the end. 5456 * LMXXX - bzero the end of the page? We must be writing after EOF. 5457 */ 5458 if (io_len > contig) { 5459 ASSERT(io_len - contig < fs->fs_bsize); 5460 io_len -= (io_len - contig); 5461 } 5462 5463 /* 5464 * Handle the case where we are writing the last page after EOF. 5465 * 5466 * XXX - just a patch for i-mt3. 5467 */ 5468 if (io_len == 0) { 5469 ASSERT(pp->p_offset >= (u_offset_t)(roundup(ip->i_size, 5470 PAGESIZE))); 5471 io_len = PAGESIZE; 5472 } 5473 5474 bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags); 5475 5476 ULOCKFS_SET_MOD(ITOUL(ip)); 5477 5478 bp->b_edev = ip->i_dev; 5479 bp->b_dev = cmpdev(ip->i_dev); 5480 bp->b_blkno = bn; 5481 bp->b_un.b_addr = (caddr_t)0; 5482 bp->b_file = ip->i_vnode; 5483 5484 if (TRANS_ISTRANS(ufsvfsp)) { 5485 if ((ip->i_mode & IFMT) == IFSHAD) { 5486 TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD); 5487 } else if (ufsvfsp->vfs_qinod == ip) { 5488 TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR, 5489 0, 0); 5490 } 5491 } 5492 5493 /* write throttle */ 5494 5495 ASSERT(bp->b_iodone == NULL); 5496 bp->b_iodone = (int (*)())ufs_iodone; 5497 mutex_enter(&ip->i_tlock); 5498 ip->i_writes += bp->b_bcount; 5499 mutex_exit(&ip->i_tlock); 5500 5501 if (bp->b_flags & B_ASYNC) { 5502 if (ufsvfsp->vfs_log) { 5503 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5504 } else if (ufsvfsp->vfs_snapshot) { 5505 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5506 } else { 5507 ufsvfsp->vfs_iotstamp = lbolt; 5508 ub.ub_putasyncs.value.ul++; 5509 (void) bdev_strategy(bp); 5510 lwp_stat_update(LWP_STAT_OUBLK, 1); 5511 } 5512 } else { 5513 if (ufsvfsp->vfs_log) { 5514 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5515 } else if (ufsvfsp->vfs_snapshot) { 5516 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5517 } else { 5518 ufsvfsp->vfs_iotstamp = lbolt; 5519 ub.ub_putsyncs.value.ul++; 5520 (void) bdev_strategy(bp); 5521 lwp_stat_update(LWP_STAT_OUBLK, 1); 5522 } 5523 err = biowait(bp); 5524 pageio_done(bp); 5525 pvn_write_done(pp, ((err) ?
B_ERROR : 0) | B_WRITE | flags); 5526 } 5527 5528 pp = NULL; 5529 5530 out: 5531 if (err != 0 && pp != NULL) 5532 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 5533 5534 if (offp) 5535 *offp = io_off; 5536 if (lenp) 5537 *lenp = io_len; 5538 out_trace: 5539 TRACE_2(TR_FAC_UFS, TR_UFS_PUTAPAGE_END, 5540 "ufs_putapage_end:vp %p error %d", vp, err); 5541 return (err); 5542 } 5543 5544 /* ARGSUSED */ 5545 static int 5546 ufs_map(struct vnode *vp, 5547 offset_t off, 5548 struct as *as, 5549 caddr_t *addrp, 5550 size_t len, 5551 uchar_t prot, 5552 uchar_t maxprot, 5553 uint_t flags, 5554 struct cred *cr) 5555 { 5556 struct segvn_crargs vn_a; 5557 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5558 struct ulockfs *ulp; 5559 int error; 5560 5561 TRACE_1(TR_FAC_UFS, TR_UFS_MAP_START, 5562 "ufs_map_start:vp %p", vp); 5563 5564 retry_map: 5565 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK); 5566 if (error) 5567 goto out; 5568 5569 if (vp->v_flag & VNOMAP) { 5570 error = ENOSYS; 5571 goto unlock; 5572 } 5573 5574 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) { 5575 error = ENXIO; 5576 goto unlock; 5577 } 5578 5579 if (vp->v_type != VREG) { 5580 error = ENODEV; 5581 goto unlock; 5582 } 5583 5584 /* 5585 * If file is being locked, disallow mapping. 5586 */ 5587 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) { 5588 error = EAGAIN; 5589 goto unlock; 5590 } 5591 5592 as_rangelock(as); 5593 if ((flags & MAP_FIXED) == 0) { 5594 map_addr(addrp, len, off, 1, flags); 5595 if (*addrp == NULL) { 5596 as_rangeunlock(as); 5597 error = ENOMEM; 5598 goto unlock; 5599 } 5600 } else { 5601 /* 5602 * User specified address - blow away any previous mappings 5603 */ 5604 (void) as_unmap(as, *addrp, len); 5605 } 5606 5607 vn_a.vp = vp; 5608 vn_a.offset = (u_offset_t)off; 5609 vn_a.type = flags & MAP_TYPE; 5610 vn_a.prot = prot; 5611 vn_a.maxprot = maxprot; 5612 vn_a.cred = cr; 5613 vn_a.amp = NULL; 5614 vn_a.flags = flags & ~MAP_TYPE; 5615 vn_a.szc = 0; 5616 vn_a.lgrp_mem_policy_flags = 0; 5617 5618 retry_lock: 5619 if (!AS_LOCK_TRYENTER(ias, &as->a_lock, RW_WRITER)) { 5620 /* 5621 * We didn't get the lock. Check if the SLOCK is set in the 5622 * ufsvfs. If yes, we might be in a deadlock. Safer to give up 5623 * and wait for SLOCK to be cleared. 5624 */ 5625 5626 if (ulp && ULOCKFS_IS_SLOCK(ulp)) { 5627 as_rangeunlock(as); 5628 ufs_lockfs_end(ulp); 5629 goto retry_map; 5630 } else { 5631 /* 5632 * SLOCK isn't set so this is a genuine synchronization 5633 * case. Let's try again after giving them a breather. 
5634 */ 5635 delay(RETRY_LOCK_DELAY); 5636 goto retry_lock; 5637 } 5638 } 5639 error = as_map_locked(as, *addrp, len, segvn_create, &vn_a); 5640 as_rangeunlock(as); 5641 5642 unlock: 5643 if (ulp) { 5644 ufs_lockfs_end(ulp); 5645 } 5646 out: 5647 TRACE_2(TR_FAC_UFS, TR_UFS_MAP_END, 5648 "ufs_map_end:vp %p error %d", vp, error); 5649 return (error); 5650 } 5651 5652 /* ARGSUSED */ 5653 static int 5654 ufs_addmap(struct vnode *vp, 5655 offset_t off, 5656 struct as *as, 5657 caddr_t addr, 5658 size_t len, 5659 uchar_t prot, 5660 uchar_t maxprot, 5661 uint_t flags, 5662 struct cred *cr) 5663 { 5664 struct inode *ip = VTOI(vp); 5665 5666 if (vp->v_flag & VNOMAP) { 5667 return (ENOSYS); 5668 } 5669 5670 mutex_enter(&ip->i_tlock); 5671 ip->i_mapcnt += btopr(len); 5672 mutex_exit(&ip->i_tlock); 5673 return (0); 5674 } 5675 5676 /*ARGSUSED*/ 5677 static int 5678 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 5679 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 5680 struct cred *cr) 5681 { 5682 struct inode *ip = VTOI(vp); 5683 5684 if (vp->v_flag & VNOMAP) { 5685 return (ENOSYS); 5686 } 5687 5688 mutex_enter(&ip->i_tlock); 5689 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 5690 ASSERT(ip->i_mapcnt >= 0); 5691 mutex_exit(&ip->i_tlock); 5692 return (0); 5693 } 5694 /* 5695 * Return the answer requested to poll() for non-device files 5696 */ 5697 struct pollhead ufs_pollhd; 5698 5699 /* ARGSUSED */ 5700 int 5701 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp) 5702 { 5703 struct ufsvfs *ufsvfsp; 5704 5705 *revp = 0; 5706 ufsvfsp = VTOI(vp)->i_ufsvfs; 5707 5708 if (!ufsvfsp) { 5709 *revp = POLLHUP; 5710 goto out; 5711 } 5712 5713 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) || 5714 ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) { 5715 *revp |= POLLERR; 5716 5717 } else { 5718 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly && 5719 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5720 *revp |= POLLOUT; 5721 5722 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly && 5723 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5724 *revp |= POLLWRBAND; 5725 5726 if (ev & POLLIN) 5727 *revp |= POLLIN; 5728 5729 if (ev & POLLRDNORM) 5730 *revp |= POLLRDNORM; 5731 5732 if (ev & POLLRDBAND) 5733 *revp |= POLLRDBAND; 5734 } 5735 5736 if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP))) 5737 *revp |= POLLPRI; 5738 out: 5739 *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL; 5740 5741 return (0); 5742 } 5743 5744 /* ARGSUSED */ 5745 static int 5746 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr) 5747 { 5748 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5749 struct ulockfs *ulp = NULL; 5750 struct inode *sip = NULL; 5751 int error; 5752 struct inode *ip = VTOI(vp); 5753 int issync; 5754 5755 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK); 5756 if (error) 5757 return (error); 5758 5759 switch (cmd) { 5760 /* 5761 * Have to handle _PC_NAME_MAX here, because the normal way 5762 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()] 5763 * results in a lock ordering reversal between 5764 * ufs_lockfs_{begin,end}() and 5765 * ufs_thread_{suspend,continue}(). 5766 * 5767 * Keep in sync with ufs_statvfs(). 
5768 */ 5769 case _PC_NAME_MAX: 5770 *valp = MAXNAMLEN; 5771 break; 5772 5773 case _PC_FILESIZEBITS: 5774 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 5775 *valp = UFS_FILESIZE_BITS; 5776 else 5777 *valp = 32; 5778 break; 5779 5780 case _PC_XATTR_EXISTS: 5781 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5782 5783 error = ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, 5784 cr); 5785 if (error == 0 && sip != NULL) { 5786 /* Start transaction */ 5787 if (ulp) { 5788 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 5789 TOP_RMDIR, TOP_RMDIR_SIZE); 5790 } 5791 /* 5792 * Is directory empty 5793 */ 5794 rw_enter(&sip->i_rwlock, RW_WRITER); 5795 rw_enter(&sip->i_contents, RW_WRITER); 5796 if (ufs_xattrdirempty(sip, 5797 sip->i_number, CRED())) { 5798 rw_enter(&ip->i_contents, RW_WRITER); 5799 ufs_unhook_shadow(ip, sip); 5800 rw_exit(&ip->i_contents); 5801 5802 *valp = 0; 5803 5804 } else 5805 *valp = 1; 5806 rw_exit(&sip->i_contents); 5807 rw_exit(&sip->i_rwlock); 5808 if (ulp) { 5809 TRANS_END_CSYNC(ufsvfsp, error, issync, 5810 TOP_RMDIR, TOP_RMDIR_SIZE); 5811 } 5812 VN_RELE(ITOV(sip)); 5813 } else if (error == ENOENT) { 5814 *valp = 0; 5815 error = 0; 5816 } 5817 } else { 5818 error = fs_pathconf(vp, cmd, valp, cr); 5819 } 5820 break; 5821 5822 case _PC_ACL_ENABLED: 5823 *valp = _ACL_ACLENT_ENABLED; 5824 break; 5825 5826 case _PC_MIN_HOLE_SIZE: 5827 *valp = (ulong_t)ip->i_fs->fs_bsize; 5828 break; 5829 5830 default: 5831 error = fs_pathconf(vp, cmd, valp, cr); 5832 } 5833 5834 if (ulp != NULL) { 5835 ufs_lockfs_end(ulp); 5836 } 5837 return (error); 5838 } 5839 5840 int ufs_pageio_writes, ufs_pageio_reads; 5841 5842 /*ARGSUSED*/ 5843 static int 5844 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5845 int flags, struct cred *cr) 5846 { 5847 struct inode *ip = VTOI(vp); 5848 struct ufsvfs *ufsvfsp; 5849 page_t *npp = NULL, *opp = NULL, *cpp = pp; 5850 struct buf *bp; 5851 daddr_t bn; 5852 size_t done_len = 0, cur_len = 0; 5853 int err = 0; 5854 int contig = 0; 5855 int dolock; 5856 int vmpss = 0; 5857 struct ulockfs *ulp; 5858 5859 if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp && 5860 vp->v_mpssdata != NULL) { 5861 vmpss = 1; 5862 } 5863 5864 dolock = (rw_owner(&ip->i_contents) != curthread); 5865 /* 5866 * We need a better check. Ideally, we would use another 5867 * vnodeops so that hlocked and forcibly unmounted file 5868 * systems would return EIO where appropriate and w/o the 5869 * need for these checks. 5870 */ 5871 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5872 return (EIO); 5873 5874 /* 5875 * For vmpss (pp can be NULL) case respect the quiesce protocol. 5876 * ul_lock must be taken before locking pages so we can't use it here 5877 * if pp is non NULL because segvn already locked pages 5878 * SE_EXCL. Instead we rely on the fact that a forced umount or 5879 * applying a filesystem lock via ufs_fiolfs() will block in the 5880 * implicit call to ufs_flush() until we unlock the pages after the 5881 * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend 5882 * above 0 until they are done. We have to be careful not to increment 5883 * ul_vnops_cnt here after forceful unmount hlocks the file system. 5884 * 5885 * If pp is NULL use ul_lock to make sure we don't increment 5886 * ul_vnops_cnt after forceful unmount hlocks the file system. 
5887 */ 5888 if (vmpss || pp == NULL) { 5889 ulp = &ufsvfsp->vfs_ulockfs; 5890 if (pp == NULL) 5891 mutex_enter(&ulp->ul_lock); 5892 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) { 5893 if (pp == NULL) { 5894 mutex_exit(&ulp->ul_lock); 5895 } 5896 return (vmpss ? EIO : EINVAL); 5897 } 5898 atomic_add_long(&ulp->ul_vnops_cnt, 1); 5899 if (pp == NULL) 5900 mutex_exit(&ulp->ul_lock); 5901 if (ufs_quiesce_pend) { 5902 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5903 cv_broadcast(&ulp->ul_cv); 5904 return (vmpss ? EIO : EINVAL); 5905 } 5906 } 5907 5908 if (dolock) { 5909 /* 5910 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to 5911 * handle a fault against a segment that maps vnode pages with 5912 * large mappings. Segvn creates pages and holds them locked 5913 * SE_EXCL during VOP_PAGEIO() call. In this case we have to 5914 * use rw_tryenter() to avoid a potential deadlock since in 5915 * lock order i_contents needs to be taken first. 5916 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails. 5917 */ 5918 if (!vmpss) { 5919 rw_enter(&ip->i_contents, RW_READER); 5920 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) { 5921 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5922 cv_broadcast(&ulp->ul_cv); 5923 return (EDEADLK); 5924 } 5925 } 5926 5927 /* 5928 * Return an error to segvn because the pagefault request is beyond 5929 * PAGESIZE rounded EOF. 5930 */ 5931 if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) { 5932 if (dolock) 5933 rw_exit(&ip->i_contents); 5934 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5935 cv_broadcast(&ulp->ul_cv); 5936 return (EFAULT); 5937 } 5938 5939 if (pp == NULL) { 5940 if (bmap_has_holes(ip)) { 5941 err = ENOSYS; 5942 } else { 5943 err = EINVAL; 5944 } 5945 if (dolock) 5946 rw_exit(&ip->i_contents); 5947 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5948 cv_broadcast(&ulp->ul_cv); 5949 return (err); 5950 } 5951 5952 /* 5953 * Break the io request into chunks, one for each contiguous 5954 * stretch of disk blocks in the target file. 5955 */ 5956 while (done_len < io_len) { 5957 ASSERT(cpp); 5958 contig = 0; 5959 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len), 5960 &bn, &contig)) 5961 break; 5962 5963 if (bn == UFS_HOLE) { /* No holey swapfiles */ 5964 if (vmpss) { 5965 err = EFAULT; 5966 break; 5967 } 5968 err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE"); 5969 break; 5970 } 5971 5972 cur_len = MIN(io_len - done_len, contig); 5973 /* 5974 * Zero out a page beyond EOF, when the last block of 5975 * a file is a UFS fragment so that ufs_pageio() can be used 5976 * instead of ufs_getpage() to handle faults against 5977 * segvn segments that use large pages. 
5978 */ 5979 page_list_break(&cpp, &npp, btopr(cur_len)); 5980 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) { 5981 size_t xlen = cur_len & PAGEOFFSET; 5982 pagezero(cpp->p_prev, xlen, PAGESIZE - xlen); 5983 } 5984 5985 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 5986 ASSERT(bp != NULL); 5987 5988 bp->b_edev = ip->i_dev; 5989 bp->b_dev = cmpdev(ip->i_dev); 5990 bp->b_blkno = bn; 5991 bp->b_un.b_addr = (caddr_t)0; 5992 bp->b_file = ip->i_vnode; 5993 5994 ufsvfsp->vfs_iotstamp = lbolt; 5995 ub.ub_pageios.value.ul++; 5996 if (ufsvfsp->vfs_snapshot) 5997 fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp); 5998 else 5999 (void) bdev_strategy(bp); 6000 6001 if (flags & B_READ) 6002 ufs_pageio_reads++; 6003 else 6004 ufs_pageio_writes++; 6005 if (flags & B_READ) 6006 lwp_stat_update(LWP_STAT_INBLK, 1); 6007 else 6008 lwp_stat_update(LWP_STAT_OUBLK, 1); 6009 /* 6010 * If the request is not B_ASYNC, wait for i/o to complete 6011 * and re-assemble the page list to return to the caller. 6012 * If it is B_ASYNC we leave the page list in pieces and 6013 * cleanup() will dispose of them. 6014 */ 6015 if ((flags & B_ASYNC) == 0) { 6016 err = biowait(bp); 6017 pageio_done(bp); 6018 if (err) 6019 break; 6020 page_list_concat(&opp, &cpp); 6021 } 6022 cpp = npp; 6023 npp = NULL; 6024 if (flags & B_READ) 6025 cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t); 6026 done_len += cur_len; 6027 } 6028 ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len)); 6029 if (err) { 6030 if (flags & B_ASYNC) { 6031 /* Cleanup unprocessed parts of list */ 6032 page_list_concat(&cpp, &npp); 6033 if (flags & B_READ) 6034 pvn_read_done(cpp, B_ERROR); 6035 else 6036 pvn_write_done(cpp, B_ERROR); 6037 } else { 6038 /* Re-assemble list and let caller clean up */ 6039 page_list_concat(&opp, &cpp); 6040 page_list_concat(&opp, &npp); 6041 } 6042 } 6043 6044 if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) && 6045 ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) { 6046 mutex_enter(&ip->i_tlock); 6047 ip->i_flag |= IACC; 6048 ITIMES_NOLOCK(ip); 6049 mutex_exit(&ip->i_tlock); 6050 } 6051 6052 if (dolock) 6053 rw_exit(&ip->i_contents); 6054 if (vmpss && !atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 6055 cv_broadcast(&ulp->ul_cv); 6056 return (err); 6057 } 6058 6059 /* 6060 * Called when the kernel is in a frozen state to dump data 6061 * directly to the device. It uses a private dump data structure, 6062 * set up by dump_ctl, to locate the correct disk block to which to dump. 6063 */ 6064 static int 6065 ufs_dump(vnode_t *vp, caddr_t addr, int ldbn, int dblks) 6066 { 6067 u_offset_t file_size; 6068 struct inode *ip = VTOI(vp); 6069 struct fs *fs = ip->i_fs; 6070 daddr_t dbn, lfsbn; 6071 int disk_blks = fs->fs_bsize >> DEV_BSHIFT; 6072 int error = 0; 6073 int ndbs, nfsbs; 6074 6075 /* 6076 * forced unmount case 6077 */ 6078 if (ip->i_ufsvfs == NULL) 6079 return (EIO); 6080 /* 6081 * Validate the inode that it has not been modified since 6082 * the dump structure is allocated. 
6083 */ 6084 mutex_enter(&ip->i_tlock); 6085 if ((dump_info == NULL) || 6086 (dump_info->ip != ip) || 6087 (dump_info->time.tv_sec != ip->i_mtime.tv_sec) || 6088 (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) { 6089 mutex_exit(&ip->i_tlock); 6090 return (-1); 6091 } 6092 mutex_exit(&ip->i_tlock); 6093 6094 /* 6095 * See that the file has room for this write 6096 */ 6097 UFS_GET_ISIZE(&file_size, ip); 6098 6099 if (ldbtob((offset_t)(ldbn + dblks)) > file_size) 6100 return (ENOSPC); 6101 6102 /* 6103 * Find the physical disk block numbers from the dump 6104 * private data structure directly and write out the data 6105 * in contiguous block lumps 6106 */ 6107 while (dblks > 0 && !error) { 6108 lfsbn = (daddr_t)lblkno(fs, ldbtob((offset_t)ldbn)); 6109 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks; 6110 nfsbs = 1; 6111 ndbs = disk_blks - ldbn % disk_blks; 6112 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn + 6113 nfsbs]) == dbn + ndbs) { 6114 nfsbs++; 6115 ndbs += disk_blks; 6116 } 6117 if (ndbs > dblks) 6118 ndbs = dblks; 6119 error = bdev_dump(ip->i_dev, addr, dbn, ndbs); 6120 addr += ldbtob((offset_t)ndbs); 6121 dblks -= ndbs; 6122 ldbn += ndbs; 6123 } 6124 return (error); 6125 6126 } 6127 6128 /* 6129 * Prepare the file system before and after the dump operation. 6130 * 6131 * action = DUMP_ALLOC: 6132 * Preparation before dump, allocate dump private data structure 6133 * to hold all the direct and indirect block info for dump. 6134 * 6135 * action = DUMP_FREE: 6136 * Clean up after dump, deallocate the dump private data structure. 6137 * 6138 * action = DUMP_SCAN: 6139 * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space; 6140 * if found, the starting file-relative DEV_BSIZE lbn is written 6141 * to *bklp; that lbn is intended for use with VOP_DUMP() 6142 */ 6143 static int 6144 ufs_dumpctl(vnode_t *vp, int action, int *blkp) 6145 { 6146 struct inode *ip = VTOI(vp); 6147 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 6148 struct fs *fs; 6149 daddr32_t *dblk, *storeblk; 6150 daddr32_t *nextblk, *endblk; 6151 struct buf *bp; 6152 int i, entry, entries; 6153 int n, ncontig; 6154 6155 /* 6156 * check for forced unmount 6157 */ 6158 if (ufsvfsp == NULL) 6159 return (EIO); 6160 6161 if (action == DUMP_ALLOC) { 6162 /* 6163 * alloc and record dump_info 6164 */ 6165 if (dump_info != NULL) 6166 return (EINVAL); 6167 6168 ASSERT(vp->v_type == VREG); 6169 fs = ufsvfsp->vfs_fs; 6170 6171 rw_enter(&ip->i_contents, RW_READER); 6172 6173 if (bmap_has_holes(ip)) { 6174 rw_exit(&ip->i_contents); 6175 return (EFAULT); 6176 } 6177 6178 /* 6179 * calculate and allocate space needed according to i_size 6180 */ 6181 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size)); 6182 if ((dump_info = (struct dump *) 6183 kmem_alloc(sizeof (struct dump) + 6184 (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) { 6185 rw_exit(&ip->i_contents); 6186 return (ENOMEM); 6187 } 6188 6189 /* Start saving the info */ 6190 dump_info->fsbs = entries; 6191 dump_info->ip = ip; 6192 storeblk = &dump_info->dblk[0]; 6193 6194 /* Direct Blocks */ 6195 for (entry = 0; entry < NDADDR && entry < entries; entry++) 6196 *storeblk++ = ip->i_db[entry]; 6197 6198 /* Indirect Blocks */ 6199 for (i = 0; i < NIADDR; i++) { 6200 int error = 0; 6201 6202 bp = UFS_BREAD(ufsvfsp, 6203 ip->i_dev, fsbtodb(fs, ip->i_ib[i]), 6204 fs->fs_bsize); 6205 if (bp->b_flags & B_ERROR) 6206 error = EIO; 6207 else { 6208 dblk = bp->b_un.b_daddr; 6209 if ((storeblk = save_dblks(ip, ufsvfsp, 6210 storeblk, dblk, i, entries)) == NULL) 
					error = EIO;
			}

			brelse(bp);

			if (error != 0) {
				kmem_free(dump_info, sizeof (struct dump) +
				    (entries - 1) * sizeof (daddr32_t));
				rw_exit(&ip->i_contents);
				dump_info = NULL;
				return (error);
			}
		}
		/* and time stamp the information */
		mutex_enter(&ip->i_tlock);
		dump_info->time = ip->i_mtime;
		mutex_exit(&ip->i_tlock);

		rw_exit(&ip->i_contents);
	} else if (action == DUMP_FREE) {
		/*
		 * free dump_info
		 */
		if (dump_info == NULL)
			return (EINVAL);
		entries = dump_info->fsbs - 1;
		kmem_free(dump_info, sizeof (struct dump) +
		    entries * sizeof (daddr32_t));
		dump_info = NULL;
	} else if (action == DUMP_SCAN) {
		/*
		 * scan dump_info
		 */
		if (dump_info == NULL)
			return (EINVAL);

		dblk = dump_info->dblk;
		nextblk = dblk + 1;
		endblk = dblk + dump_info->fsbs - 1;
		fs = ufsvfsp->vfs_fs;
		ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);

		/*
		 * scan dblk[] entries; contig fs space is found when:
		 * ((current blkno + frags per block) == next blkno)
		 */
		n = 0;
		while (n < ncontig && dblk < endblk) {
			if ((*dblk + fs->fs_frag) == *nextblk)
				n++;
			else
				n = 0;
			dblk++;
			nextblk++;
		}

		/*
		 * index is where size bytes of contig space begins;
		 * conversion from index to the file's DEV_BSIZE lbn
		 * is equivalent to: (index * fs_bsize) / DEV_BSIZE
		 */
		if (n == ncontig) {
			i = (dblk - dump_info->dblk) - ncontig;
			*blkp = i << (fs->fs_bshift - DEV_BSHIFT);
		} else
			return (EFAULT);
	}
	return (0);
}

/*
 * Recursive helper function for ufs_dumpctl(). It follows the indirect file
 * system blocks until it reaches the disk block addresses, which are
 * then stored into the given buffer, storeblk.
 */
static daddr32_t *
save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp, daddr32_t *storeblk,
    daddr32_t *dblk, int level, int entries)
{
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp;
	int i;

	if (level == 0) {
		for (i = 0; i < NINDIR(fs); i++) {
			if (storeblk - dump_info->dblk >= entries)
				break;
			*storeblk++ = dblk[i];
		}
		return (storeblk);
	}
	for (i = 0; i < NINDIR(fs); i++) {
		if (storeblk - dump_info->dblk >= entries)
			break;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (NULL);
		}
		storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
		    level - 1, entries);
		brelse(bp);

		if (storeblk == NULL)
			return (NULL);
	}
	return (storeblk);
}

/* ARGSUSED */
static int
ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
    struct cred *cr)
{
	struct inode *ip = VTOI(vp);
	struct ulockfs *ulp;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	ulong_t vsa_mask = vsap->vsa_mask;
	int err = EINVAL;

	TRACE_3(TR_FAC_UFS, TR_UFS_GETSECATTR_START,
	    "ufs_getsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);

	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);

	/*
	 * Only grab locks if needed - they're not needed to check vsa_mask
	 * or if the mask contains no acl flags.
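	 * If the mask contains no acl flags we fall through and return
	 * the initial EINVAL without touching the file system.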
	 */
	if (vsa_mask != 0) {
		if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
		    ULOCKFS_GETATTR_MASK))
			return (err);

		rw_enter(&ip->i_contents, RW_READER);
		err = ufs_acl_get(ip, vsap, flag, cr);
		rw_exit(&ip->i_contents);

		if (ulp)
			ufs_lockfs_end(ulp);
	}
	TRACE_1(TR_FAC_UFS, TR_UFS_GETSECATTR_END,
	    "ufs_getsecattr_end:vp %p", vp);
	return (err);
}

/* ARGSUSED */
static int
ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
{
	struct inode *ip = VTOI(vp);
	struct ulockfs *ulp = NULL;
	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
	ulong_t vsa_mask = vsap->vsa_mask;
	int err;
	int haverwlock = 1;
	int trans_size;
	int donetrans = 0;
	int retry = 1;

	TRACE_3(TR_FAC_UFS, TR_UFS_SETSECATTR_START,
	    "ufs_setsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);

	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));

	/* Abort now if the request is either empty or invalid. */
	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
	if ((vsa_mask == 0) ||
	    ((vsap->vsa_aclentp == NULL) &&
	    (vsap->vsa_dfaclentp == NULL))) {
		err = EINVAL;
		goto out;
	}

	/*
	 * Following convention, if this is a directory then we acquire the
	 * inode's i_rwlock after starting a UFS logging transaction;
	 * otherwise, we acquire it beforehand. Since we were called (and
	 * must therefore return) with the lock held, we will have to drop it,
	 * and later reacquire it, if operating on a directory.
	 */
	if (vp->v_type == VDIR) {
		rw_exit(&ip->i_rwlock);
		haverwlock = 0;
	} else {
		/* Upgrade the lock if required. */
		if (!rw_write_held(&ip->i_rwlock)) {
			rw_exit(&ip->i_rwlock);
			rw_enter(&ip->i_rwlock, RW_WRITER);
		}
	}

again:
	ASSERT(!(vp->v_type == VDIR && haverwlock));
	if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
		ulp = NULL;
		retry = 0;
		goto out;
	}

	/*
	 * Check that the file system supports this operation. Note that
	 * ufs_lockfs_begin() will have checked that the file system had
	 * not been forcibly unmounted.
	 */
	if (ufsvfsp->vfs_fs->fs_ronly) {
		err = EROFS;
		goto out;
	}
	if (ufsvfsp->vfs_nosetsec) {
		err = ENOSYS;
		goto out;
	}

	if (ulp) {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
		    trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
		donetrans = 1;
	}

	if (vp->v_type == VDIR) {
		rw_enter(&ip->i_rwlock, RW_WRITER);
		haverwlock = 1;
	}

	ASSERT(haverwlock);

	/* Do the actual work. */
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Suppress out-of-inodes messages if we will retry.
	 */
	if (retry)
		ip->i_flag |= IQUIET;
	err = ufs_acl_set(ip, vsap, flag, cr);
	ip->i_flag &= ~IQUIET;
	rw_exit(&ip->i_contents);

out:
	if (ulp) {
		if (donetrans) {
			/*
			 * top_end_async() can eventually call
			 * top_end_sync(), which can block. We must
			 * therefore observe the lock-ordering protocol
			 * here as well.
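			 * For directories that means dropping i_rwlock
			 * before ending the transaction, mirroring the
			 * ordering used when the transaction was started.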
			 */
			if (vp->v_type == VDIR) {
				rw_exit(&ip->i_rwlock);
				haverwlock = 0;
			}
			TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
		}
		ufs_lockfs_end(ulp);
	}
	/*
	 * If no inodes are available, try scaring a logically-
	 * free one out of the delete queue to someplace
	 * that we can find it.
	 */
	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		if (vp->v_type == VDIR && haverwlock) {
			rw_exit(&ip->i_rwlock);
			haverwlock = 0;
		}
		goto again;
	}
	/*
	 * If we need to reacquire the lock then it is safe to do so
	 * as a reader. This is because ufs_rwunlock(), which will be
	 * called by our caller after we return, does not differentiate
	 * between shared and exclusive locks.
	 */
	if (!haverwlock) {
		ASSERT(vp->v_type == VDIR);
		rw_enter(&ip->i_rwlock, RW_READER);
	}

	TRACE_1(TR_FAC_UFS, TR_UFS_SETSECATTR_END,
	    "ufs_setsecattr_end:vp %p", vp);
	return (err);
}