/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/ksynch.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/policy.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_snap.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>!
*/ 82 #include <sys/errno.h> 83 #include <sys/fssnap_if.h> 84 #include <sys/unistd.h> 85 #include <sys/sunddi.h> 86 87 #include <sys/filio.h> /* _FIOIO */ 88 89 #include <vm/hat.h> 90 #include <vm/page.h> 91 #include <vm/pvn.h> 92 #include <vm/as.h> 93 #include <vm/seg.h> 94 #include <vm/seg_map.h> 95 #include <vm/seg_vn.h> 96 #include <vm/seg_kmem.h> 97 #include <vm/rm.h> 98 #include <sys/swap.h> 99 100 #include <fs/fs_subr.h> 101 102 static struct instats ins; 103 104 static int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 105 static int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *, 106 caddr_t, struct page **, size_t, enum seg_rw, int); 107 static int ufs_open(struct vnode **, int, struct cred *); 108 static int ufs_close(struct vnode *, int, int, offset_t, struct cred *); 109 static int ufs_read(struct vnode *, struct uio *, int, struct cred *, 110 struct caller_context *); 111 static int ufs_write(struct vnode *, struct uio *, int, struct cred *, 112 struct caller_context *); 113 static int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *); 114 static int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *); 115 static int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *, 116 caller_context_t *); 117 static int ufs_access(struct vnode *, int, int, struct cred *); 118 static int ufs_lookup(struct vnode *, char *, struct vnode **, 119 struct pathname *, int, struct vnode *, struct cred *); 120 static int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl, 121 int, struct vnode **, struct cred *, int); 122 static int ufs_remove(struct vnode *, char *, struct cred *); 123 static int ufs_link(struct vnode *, struct vnode *, char *, struct cred *); 124 static int ufs_rename(struct vnode *, char *, struct vnode *, char *, 125 struct cred *); 126 static int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **, 127 struct cred *); 128 static int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *); 129 static int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *); 130 static int ufs_symlink(struct vnode *, char *, struct vattr *, char *, 131 struct cred *); 132 static int ufs_readlink(struct vnode *, struct uio *, struct cred *); 133 static int ufs_fsync(struct vnode *, int, struct cred *); 134 static void ufs_inactive(struct vnode *, struct cred *); 135 static int ufs_fid(struct vnode *, struct fid *); 136 static int ufs_rwlock(struct vnode *, int, caller_context_t *); 137 static void ufs_rwunlock(struct vnode *, int, caller_context_t *); 138 static int ufs_seek(struct vnode *, offset_t, offset_t *); 139 static int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 140 struct flk_callback *, struct cred *); 141 static int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t, 142 cred_t *, caller_context_t *); 143 static int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *, 144 struct page **, size_t, struct seg *, caddr_t, 145 enum seg_rw, struct cred *); 146 static int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *); 147 static int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *); 148 static int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 149 uchar_t, uchar_t, uint_t, struct cred *); 150 static int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 151 uchar_t, uchar_t, uint_t, struct cred *); 152 static int ufs_delmap(struct vnode *, offset_t, struct as 
*, caddr_t, size_t, 153 uint_t, uint_t, uint_t, struct cred *); 154 static int ufs_poll(vnode_t *, short, int, short *, struct pollhead **); 155 static int ufs_dump(vnode_t *, caddr_t, int, int); 156 static int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *); 157 static int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int, 158 struct cred *); 159 static int ufs_dump(vnode_t *, caddr_t, int, int); 160 static int ufs_dumpctl(vnode_t *, int, int *); 161 static daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *, 162 daddr32_t *, int, int); 163 static int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 164 static int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 165 166 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *); 167 168 /* 169 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions. 170 * 171 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet. 172 */ 173 struct vnodeops *ufs_vnodeops; 174 175 /* NOTE: "not blkd" below means that the operation isn't blocked by lockfs */ 176 const fs_operation_def_t ufs_vnodeops_template[] = { 177 VOPNAME_OPEN, { .vop_open = ufs_open }, /* not blkd */ 178 VOPNAME_CLOSE, { .vop_close = ufs_close }, /* not blkd */ 179 VOPNAME_READ, { .vop_read = ufs_read }, 180 VOPNAME_WRITE, { .vop_write = ufs_write }, 181 VOPNAME_IOCTL, { .vop_ioctl = ufs_ioctl }, 182 VOPNAME_GETATTR, { .vop_getattr = ufs_getattr }, 183 VOPNAME_SETATTR, { .vop_setattr = ufs_setattr }, 184 VOPNAME_ACCESS, { .vop_access = ufs_access }, 185 VOPNAME_LOOKUP, { .vop_lookup = ufs_lookup }, 186 VOPNAME_CREATE, { .vop_create = ufs_create }, 187 VOPNAME_REMOVE, { .vop_remove = ufs_remove }, 188 VOPNAME_LINK, { .vop_link = ufs_link }, 189 VOPNAME_RENAME, { .vop_rename = ufs_rename }, 190 VOPNAME_MKDIR, { .vop_mkdir = ufs_mkdir }, 191 VOPNAME_RMDIR, { .vop_rmdir = ufs_rmdir }, 192 VOPNAME_READDIR, { .vop_readdir = ufs_readdir }, 193 VOPNAME_SYMLINK, { .vop_symlink = ufs_symlink }, 194 VOPNAME_READLINK, { .vop_readlink = ufs_readlink }, 195 VOPNAME_FSYNC, { .vop_fsync = ufs_fsync }, 196 VOPNAME_INACTIVE, { .vop_inactive = ufs_inactive }, /* not blkd */ 197 VOPNAME_FID, { .vop_fid = ufs_fid }, 198 VOPNAME_RWLOCK, { .vop_rwlock = ufs_rwlock }, /* not blkd */ 199 VOPNAME_RWUNLOCK, { .vop_rwunlock = ufs_rwunlock }, /* not blkd */ 200 VOPNAME_SEEK, { .vop_seek = ufs_seek }, 201 VOPNAME_FRLOCK, { .vop_frlock = ufs_frlock }, 202 VOPNAME_SPACE, { .vop_space = ufs_space }, 203 VOPNAME_GETPAGE, { .vop_getpage = ufs_getpage }, 204 VOPNAME_PUTPAGE, { .vop_putpage = ufs_putpage }, 205 VOPNAME_MAP, { .vop_map = ufs_map }, 206 VOPNAME_ADDMAP, { .vop_addmap = ufs_addmap }, /* not blkd */ 207 VOPNAME_DELMAP, { .vop_delmap = ufs_delmap }, /* not blkd */ 208 VOPNAME_POLL, { .vop_poll = ufs_poll }, /* not blkd */ 209 VOPNAME_DUMP, { .vop_dump = ufs_dump }, 210 VOPNAME_PATHCONF, { .vop_pathconf = ufs_l_pathconf }, 211 VOPNAME_PAGEIO, { .vop_pageio = ufs_pageio }, 212 VOPNAME_DUMPCTL, { .vop_dumpctl = ufs_dumpctl }, 213 VOPNAME_GETSECATTR, { .vop_getsecattr = ufs_getsecattr }, 214 VOPNAME_SETSECATTR, { .vop_setsecattr = ufs_setsecattr }, 215 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 216 NULL, NULL 217 }; 218 219 #define MAX_BACKFILE_COUNT 9999 220 221 /* 222 * Created by ufs_dumpctl() to store a file's disk block info into memory. 223 * Used by ufs_dump() to dump data to disk directly. 
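/*
 * Illustrative sketch, not part of the original source: the structure
 * declared just below ends in a one-element placeholder array (dblk[1]),
 * the usual idiom for a variable-length record.  Such a record is
 * over-allocated so the placeholder really holds "count" entries.  The
 * example_hdr type and example_alloc() helper here are hypothetical and
 * only show the sizing rule, not the actual ufs_dumpctl() code.
 */
struct example_hdr {
	int		count;		/* number of valid entries in blk[] */
	daddr32_t	blk[1];		/* really "count" entries long */
};

static struct example_hdr *
example_alloc(int nblks)
{
	/* one entry is already accounted for by blk[1] */
	size_t sz = sizeof (struct example_hdr) +
	    (nblks - 1) * sizeof (daddr32_t);
	struct example_hdr *hp = kmem_zalloc(sz, KM_SLEEP);

	hp->count = nblks;
	return (hp);
}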
224 */ 225 struct dump { 226 struct inode *ip; /* the file we contain */ 227 daddr_t fsbs; /* number of blocks stored */ 228 struct timeval32 time; /* time stamp for the struct */ 229 daddr32_t dblk[1]; /* place holder for block info */ 230 }; 231 232 static struct dump *dump_info = NULL; 233 234 /* 235 * Previously there was no special action required for ordinary files. 236 * (Devices are handled through the device file system.) 237 * Now we support Large Files and Large File API requires open to 238 * fail if file is large. 239 * We could take care to prevent data corruption 240 * by doing an atomic check of size and truncate if file is opened with 241 * FTRUNC flag set but traditionally this is being done by the vfs/vnode 242 * layers. So taking care of truncation here is a change in the existing 243 * semantics of VOP_OPEN and therefore we chose not to implement any thing 244 * here. The check for the size of the file > 2GB is being done at the 245 * vfs layer in routine vn_open(). 246 */ 247 248 /* ARGSUSED */ 249 static int 250 ufs_open(struct vnode **vpp, int flag, struct cred *cr) 251 { 252 TRACE_1(TR_FAC_UFS, TR_UFS_OPEN, "ufs_open:vpp %p", vpp); 253 return (0); 254 } 255 256 /*ARGSUSED*/ 257 static int 258 ufs_close(struct vnode *vp, int flag, int count, offset_t offset, 259 struct cred *cr) 260 { 261 TRACE_1(TR_FAC_UFS, TR_UFS_CLOSE, "ufs_close:vp %p", vp); 262 263 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 264 cleanshares(vp, ttoproc(curthread)->p_pid); 265 266 /* 267 * Push partially filled cluster at last close. 268 * ``last close'' is approximated because the dnlc 269 * may have a hold on the vnode. 270 * Checking for VBAD here will also act as a forced umount check. 271 */ 272 if (vp->v_count <= 2 && vp->v_type != VBAD) { 273 struct inode *ip = VTOI(vp); 274 if (ip->i_delaylen) { 275 ins.in_poc.value.ul++; 276 (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen, 277 B_ASYNC | B_FREE, cr); 278 ip->i_delaylen = 0; 279 } 280 } 281 282 return (0); 283 } 284 285 /*ARGSUSED*/ 286 static int 287 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 288 struct caller_context *ct) 289 { 290 struct inode *ip = VTOI(vp); 291 struct ufsvfs *ufsvfsp; 292 struct ulockfs *ulp = NULL; 293 int error = 0; 294 int intrans = 0; 295 296 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 297 TRACE_3(TR_FAC_UFS, TR_UFS_READ_START, 298 "ufs_read_start:vp %p uiop %p ioflag %x", 299 vp, uiop, ioflag); 300 301 /* 302 * Mandatory locking needs to be done before ufs_lockfs_begin() 303 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep. 304 */ 305 if (MANDLOCK(vp, ip->i_mode)) { 306 /* 307 * ufs_getattr ends up being called by chklock 308 */ 309 error = chklock(vp, FREAD, uiop->uio_loffset, 310 uiop->uio_resid, uiop->uio_fmode, ct); 311 if (error) 312 goto out; 313 } 314 315 ufsvfsp = ip->i_ufsvfs; 316 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK); 317 if (error) 318 goto out; 319 320 /* 321 * In the case that a directory is opened for reading as a file 322 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set. 323 * The locking order had to be changed to avoid a deadlock with 324 * an update taking place on that directory at the same time. 
325 */ 326 if ((ip->i_mode & IFMT) == IFDIR) { 327 328 rw_enter(&ip->i_contents, RW_READER); 329 error = rdip(ip, uiop, ioflag, cr); 330 rw_exit(&ip->i_contents); 331 332 if (error) { 333 if (ulp) 334 ufs_lockfs_end(ulp); 335 goto out; 336 } 337 338 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 339 TRANS_ISTRANS(ufsvfsp)) { 340 rw_exit(&ip->i_rwlock); 341 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 342 error); 343 ASSERT(!error); 344 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 345 TOP_READ_SIZE); 346 rw_enter(&ip->i_rwlock, RW_READER); 347 } 348 } else { 349 /* 350 * Only transact reads to files opened for sync-read and 351 * sync-write on a file system that is not write locked. 352 * 353 * The ``not write locked'' check prevents problems with 354 * enabling/disabling logging on a busy file system. E.g., 355 * logging exists at the beginning of the read but does not 356 * at the end. 357 * 358 */ 359 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 360 TRANS_ISTRANS(ufsvfsp)) { 361 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 362 error); 363 ASSERT(!error); 364 intrans = 1; 365 } 366 367 rw_enter(&ip->i_contents, RW_READER); 368 error = rdip(ip, uiop, ioflag, cr); 369 rw_exit(&ip->i_contents); 370 371 if (intrans) { 372 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 373 TOP_READ_SIZE); 374 } 375 } 376 377 if (ulp) { 378 ufs_lockfs_end(ulp); 379 } 380 out: 381 382 TRACE_2(TR_FAC_UFS, TR_UFS_READ_END, 383 "ufs_read_end:vp %p error %d", vp, error); 384 return (error); 385 } 386 387 extern int ufs_HW; /* high water mark */ 388 extern int ufs_LW; /* low water mark */ 389 int ufs_WRITES = 1; /* XXX - enable/disable */ 390 int ufs_throttles = 0; /* throttling count */ 391 int ufs_allow_shared_writes = 1; /* directio shared writes */ 392 393 static int 394 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag) 395 { 396 int shared_write; 397 398 /* 399 * If the FDSYNC flag is set then ignore the global 400 * ufs_allow_shared_writes in this case. 401 */ 402 shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes; 403 404 /* 405 * Filter to determine if this request is suitable as a 406 * concurrent rewrite. This write must not allocate blocks 407 * by extending the file or filling in holes. No use trying 408 * through FSYNC descriptors as the inode will be synchronously 409 * updated after the write. The uio structure has not yet been 410 * checked for sanity, so assume nothing. 411 */ 412 return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) && 413 (uiop->uio_loffset >= (offset_t)0) && 414 (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) && 415 ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) && 416 !(ioflag & FSYNC) && !bmap_has_holes(ip) && 417 shared_write); 418 } 419 420 /*ARGSUSED*/ 421 static int 422 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr, 423 caller_context_t *ct) 424 { 425 struct inode *ip = VTOI(vp); 426 struct ufsvfs *ufsvfsp; 427 struct ulockfs *ulp; 428 int retry = 1; 429 int error, resv, resid = 0; 430 int directio_status; 431 int exclusive; 432 int rewriteflg; 433 long start_resid = uiop->uio_resid; 434 435 TRACE_3(TR_FAC_UFS, TR_UFS_WRITE_START, 436 "ufs_write_start:vp %p uiop %p ioflag %x", 437 vp, uiop, ioflag); 438 439 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 440 441 retry_mandlock: 442 /* 443 * Mandatory locking needs to be done before ufs_lockfs_begin() 444 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep. 
445 * Check for forced unmounts normally done in ufs_lockfs_begin(). 446 */ 447 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 448 error = EIO; 449 goto out; 450 } 451 if (MANDLOCK(vp, ip->i_mode)) { 452 453 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 454 455 /* 456 * ufs_getattr ends up being called by chklock 457 */ 458 error = chklock(vp, FWRITE, uiop->uio_loffset, 459 uiop->uio_resid, uiop->uio_fmode, ct); 460 if (error) 461 goto out; 462 } 463 464 /* i_rwlock can change in chklock */ 465 exclusive = rw_write_held(&ip->i_rwlock); 466 rewriteflg = ufs_check_rewrite(ip, uiop, ioflag); 467 468 /* 469 * Check for fast-path special case of directio re-writes. 470 */ 471 if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) && 472 !exclusive && rewriteflg) { 473 474 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 475 if (error) 476 goto out; 477 478 rw_enter(&ip->i_contents, RW_READER); 479 error = ufs_directio_write(ip, uiop, ioflag, 1, cr, 480 &directio_status); 481 if (directio_status == DIRECTIO_SUCCESS) { 482 uint_t i_flag_save; 483 484 if (start_resid != uiop->uio_resid) 485 error = 0; 486 /* 487 * Special treatment of access times for re-writes. 488 * If IMOD is not already set, then convert it 489 * to IMODACC for this operation. This defers 490 * entering a delta into the log until the inode 491 * is flushed. This mimics what is done for read 492 * operations and inode access time. 493 */ 494 mutex_enter(&ip->i_tlock); 495 i_flag_save = ip->i_flag; 496 ip->i_flag |= IUPD | ICHG; 497 ip->i_seq++; 498 ITIMES_NOLOCK(ip); 499 if ((i_flag_save & IMOD) == 0) { 500 ip->i_flag &= ~IMOD; 501 ip->i_flag |= IMODACC; 502 } 503 mutex_exit(&ip->i_tlock); 504 rw_exit(&ip->i_contents); 505 if (ulp) 506 ufs_lockfs_end(ulp); 507 goto out; 508 } 509 rw_exit(&ip->i_contents); 510 if (ulp) 511 ufs_lockfs_end(ulp); 512 } 513 514 if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) { 515 rw_exit(&ip->i_rwlock); 516 rw_enter(&ip->i_rwlock, RW_WRITER); 517 /* 518 * Mandatory locking could have been enabled 519 * after dropping the i_rwlock. 520 */ 521 if (MANDLOCK(vp, ip->i_mode)) 522 goto retry_mandlock; 523 } 524 525 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 526 if (error) 527 goto out; 528 529 /* 530 * Amount of log space needed for this write 531 */ 532 if (!rewriteflg || !(ioflag & FDSYNC)) 533 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid); 534 535 /* 536 * Throttle writes. 537 */ 538 if (ufs_WRITES && (ip->i_writes > ufs_HW)) { 539 mutex_enter(&ip->i_tlock); 540 while (ip->i_writes > ufs_HW) { 541 ufs_throttles++; 542 cv_wait(&ip->i_wrcv, &ip->i_tlock); 543 } 544 mutex_exit(&ip->i_tlock); 545 } 546 547 /* 548 * Enter Transaction 549 * 550 * If the write is a rewrite there is no need to open a transaction 551 * if the FDSYNC flag is set and not the FSYNC. In this case just 552 * set the IMODACC flag to modify do the update at a later time 553 * thus avoiding the overhead of the logging transaction that is 554 * not required. 
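/*
 * Illustrative sketch, not part of the original source: the i_writes
 * throttle above is one half of a high/low watermark scheme.  The other
 * half runs when outstanding write bytes complete; it would look roughly
 * like the hypothetical helper below.  The real wakeup lives with the
 * write-completion code, not in this function, and is shown here only
 * to make the cv_wait() above easier to follow.
 */
static void
example_write_done(struct inode *ip, int nbytes)
{
	mutex_enter(&ip->i_tlock);
	ip->i_writes -= nbytes;
	/* wake throttled writers only once we are safely below low water */
	if (ip->i_writes <= ufs_LW)
		cv_broadcast(&ip->i_wrcv);
	mutex_exit(&ip->i_tlock);
}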
 */
	if (ioflag & (FSYNC|FDSYNC)) {
		if (ulp) {
			if (rewriteflg) {
				uint_t i_flag_save;

				rw_enter(&ip->i_contents, RW_READER);
				mutex_enter(&ip->i_tlock);
				i_flag_save = ip->i_flag;
				ip->i_flag |= IUPD | ICHG;
				ip->i_seq++;
				ITIMES_NOLOCK(ip);
				if ((i_flag_save & IMOD) == 0) {
					ip->i_flag &= ~IMOD;
					ip->i_flag |= IMODACC;
				}
				mutex_exit(&ip->i_tlock);
				rw_exit(&ip->i_contents);
			} else {
				int terr = 0;
				TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
				    terr);
				ASSERT(!terr);
			}
		}
	} else {
		if (ulp)
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Write the file
	 */
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = ip->i_size;
	}

	/*
	 * Mild optimisation, don't call ufs_trans_write() unless we have to.
	 * Also, suppress file system full messages if we will retry.
	 */
	if (retry)
		ip->i_flag |= IQUIET;
	if (resid) {
		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
	} else {
		error = wrip(ip, uiop, ioflag, cr);
	}
	ip->i_flag &= ~IQUIET;

	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/*
	 * Leave Transaction
	 */
	if (ulp) {
		if (ioflag & (FSYNC|FDSYNC)) {
			if (!rewriteflg) {
				int terr = 0;

				TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
				    resv);
				if (error == 0)
					error = terr;
			}
		} else {
			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
		}
		ufs_lockfs_end(ulp);
	}
out:
	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		/*
		 * Any blocks tied up in pending deletes?
		 */
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto retry_mandlock;
	}

	if (error == ENOSPC && (start_resid != uiop->uio_resid))
		error = 0;

	TRACE_2(TR_FAC_UFS, TR_UFS_WRITE_END,
	    "ufs_write_end:vp %p error %d", vp, error);
	return (error);
}

/*
 * Don't cache write blocks to files with the sticky bit set.
 * Used to keep swap files from blowing the page cache on a server.
 */
int stickyhack = 1;

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
int freebehind = 1;
int smallfile = 0;
u_offset_t smallfile64 = 32 * 1024;

/*
 * While we should, in most cases, cache the pages for write, we
 * may also want to cache the pages for read as long as they are
 * frequently re-usable.
 *
 * If cache_read_ahead = 1, the pages for read will go to the tail
 * of the cache list when they are released, otherwise go to the head.
 */
int cache_read_ahead = 0;

/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files.  It takes
 * longer to re-read from disk multiple small files than it does reading
 * one large one sequentially.  As system memory grows customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else.
 * If we lose the bet, it means some other thread is burdened with the
 * page free we did not do.  If we win we save a free and reclaim.
 *
 * Freeing it at the tail vs the head of cachelist is a bet that the
 * page will survive until the next read.  It's also saying that this
 * page is more likely to be re-used than a page freed some time ago
 * and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]:
 *
 *	0 < offset < smallfile1			: pages are not freed.
 *	smallfile1 < offset < smallfile2	: pages freed to tail of cachelist.
 *	smallfile2 < offset			: pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
 *
 *	smallfile1 = (free memory / ncpu) / 1000
 *	smallfile2 = (free memory / ncpu) / 10
 *
 * A few example values:
 *
 *	Free Mem (in Bytes)	[smallfile1; smallfile2]	[smallfile1; smallfile2]
 *				ncpus_online = 4		ncpus_online = 64
 *	-------------------	------------------------	------------------------
 *	1G			[256K;  25M]			[ 32K; 1.5M]
 *	10G			[2.5M; 250M]			[156K;  15M]
 *	100G			[ 25M; 2.5G]			[1.5M; 150M]
 *
 */

#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* lbolt value of when to recompute */
uint_t smallfile1_d = SMALLFILE1_D;
uint_t smallfile2_d = SMALLFILE2_D;

/*
 * wrip does the real work of write requests for ufs.
 */
int
wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	rlim64_t limit = uio->uio_llimit;
	u_offset_t off;
	u_offset_t old_i_size;
	struct fs *fs;
	struct vnode *vp;
	struct ufsvfs *ufsvfsp;
	caddr_t base;
	long start_resid = uio->uio_resid;	/* save starting resid */
	long premove_resid;			/* resid before uiomove() */
	uint_t flags;
	int newpage;
	int iupdat_flag, directio_status;
	int n, on, mapon;
	int error, pagecreate;
	int do_dqrwlock;			/* drop/reacquire vfs_dqrwlock */
	int32_t iblocks;
	int new_iblocks;

	/*
	 * ip->i_size is incremented before the uiomove
	 * is done on a write.  If the move fails (bad user
	 * address) reset ip->i_size.
	 * The better way would be to increment ip->i_size
	 * only if the uiomove succeeds.
	 */
	int i_size_changed = 0;
	o_mode_t type;
	int i_seq_needed = 0;

	vp = ITOV(ip);

	/*
	 * check for forced unmount - should not happen as
	 * the request passed the lockfs checks.
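/*
 * Illustrative sketch, not part of the original source: the smallfile
 * bounds described above, computed for one row of the example table.
 * With 1GB free and 4 CPUs: 1GB/4 = 256MB per CPU, so smallfile1 is
 * about 256MB/1000 = 256KB and smallfile2 about 256MB/10 = 25MB, each
 * clamped to be at least smallfile64 (32KB).  The helper name is
 * hypothetical; the live code does this inline in rdip().
 */
static void
example_smallfile_bounds(uint64_t freebytes, uint_t ncpus,
    u_offset_t *sf1, u_offset_t *sf2)
{
	uint64_t percpu = freebytes / ncpus;

	*sf1 = MAX(percpu / SMALLFILE1_D, (uint64_t)smallfile64);
	*sf2 = MAX(percpu / SMALLFILE2_D, *sf1);
}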
765 */ 766 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 767 return (EIO); 768 769 fs = ip->i_fs; 770 771 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 772 "ufs_wrip_start:vp %p", vp); 773 774 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 775 776 /* check for valid filetype */ 777 type = ip->i_mode & IFMT; 778 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 779 (type != IFLNK) && (type != IFSHAD)) { 780 return (EIO); 781 } 782 783 /* 784 * the actual limit of UFS file size 785 * is UFS_MAXOFFSET_T 786 */ 787 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 788 limit = MAXOFFSET_T; 789 790 if (uio->uio_loffset >= limit) { 791 proc_t *p = ttoproc(curthread); 792 793 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 794 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 795 796 mutex_enter(&p->p_lock); 797 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 798 p, RCA_UNSAFE_SIGINFO); 799 mutex_exit(&p->p_lock); 800 return (EFBIG); 801 } 802 803 /* 804 * if largefiles are disallowed, the limit is 805 * the pre-largefiles value of 2GB 806 */ 807 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 808 limit = MIN(UFS_MAXOFFSET_T, limit); 809 else 810 limit = MIN(MAXOFF32_T, limit); 811 812 if (uio->uio_loffset < (offset_t)0) { 813 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 814 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 815 return (EINVAL); 816 } 817 if (uio->uio_resid == 0) { 818 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 819 "ufs_wrip_end:vp %p error %d", vp, 0); 820 return (0); 821 } 822 823 if (uio->uio_loffset >= limit) 824 return (EFBIG); 825 826 ip->i_flag |= INOACC; /* don't update ref time in getpage */ 827 828 if (ioflag & (FSYNC|FDSYNC)) { 829 ip->i_flag |= ISYNC; 830 iupdat_flag = 1; 831 } 832 /* 833 * Try to go direct 834 */ 835 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 836 uio->uio_llimit = limit; 837 error = ufs_directio_write(ip, uio, ioflag, 0, cr, 838 &directio_status); 839 /* 840 * If ufs_directio wrote to the file or set the flags, 841 * we need to update i_seq, but it may be deferred. 842 */ 843 if (start_resid != uio->uio_resid || 844 (ip->i_flag & (ICHG|IUPD))) { 845 i_seq_needed = 1; 846 ip->i_flag |= ISEQ; 847 } 848 if (directio_status == DIRECTIO_SUCCESS) 849 goto out; 850 } 851 852 /* 853 * Behavior with respect to dropping/reacquiring vfs_dqrwlock: 854 * 855 * o shadow inodes: vfs_dqrwlock is not held at all 856 * o quota updates: vfs_dqrwlock is read or write held 857 * o other updates: vfs_dqrwlock is read held 858 * 859 * The first case is the only one where we do not hold 860 * vfs_dqrwlock at all while entering wrip(). 861 * We must make sure not to downgrade/drop vfs_dqrwlock if we 862 * have it as writer, i.e. if we are updating the quota inode. 863 * There is no potential deadlock scenario in this case as 864 * ufs_getpage() takes care of this and avoids reacquiring 865 * vfs_dqrwlock in that case. 866 * 867 * This check is done here since the above conditions do not change 868 * and we possibly loop below, so save a few cycles. 869 */ 870 if ((type == IFSHAD) || 871 (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) { 872 do_dqrwlock = 0; 873 } else { 874 do_dqrwlock = 1; 875 } 876 877 /* 878 * Large Files: We cast MAXBMASK to offset_t 879 * inorder to mask out the higher bits. Since offset_t 880 * is a signed value, the high order bit set in MAXBMASK 881 * value makes it do the right thing by having all bits 1 882 * in the higher word. May be removed for _SOLARIS64_. 
883 */ 884 885 fs = ip->i_fs; 886 do { 887 u_offset_t uoff = uio->uio_loffset; 888 off = uoff & (offset_t)MAXBMASK; 889 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 890 on = (int)blkoff(fs, uoff); 891 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid); 892 new_iblocks = 1; 893 894 if (type == IFREG && uoff + n >= limit) { 895 if (uoff >= limit) { 896 error = EFBIG; 897 goto out; 898 } 899 /* 900 * since uoff + n >= limit, 901 * therefore n >= limit - uoff, and n is an int 902 * so it is safe to cast it to an int 903 */ 904 n = (int)(limit - (rlim64_t)uoff); 905 } 906 if (uoff + n > ip->i_size) { 907 /* 908 * We are extending the length of the file. 909 * bmap is used so that we are sure that 910 * if we need to allocate new blocks, that it 911 * is done here before we up the file size. 912 */ 913 error = bmap_write(ip, uoff, (int)(on + n), 914 mapon == 0, NULL, cr); 915 /* 916 * bmap_write never drops i_contents so if 917 * the flags are set it changed the file. 918 */ 919 if (ip->i_flag & (ICHG|IUPD)) { 920 i_seq_needed = 1; 921 ip->i_flag |= ISEQ; 922 } 923 if (error) 924 break; 925 /* 926 * There is a window of vulnerability here. 927 * The sequence of operations: allocate file 928 * system blocks, uiomove the data into pages, 929 * and then update the size of the file in the 930 * inode, must happen atomically. However, due 931 * to current locking constraints, this can not 932 * be done. 933 */ 934 ASSERT(ip->i_writer == NULL); 935 ip->i_writer = curthread; 936 i_size_changed = 1; 937 /* 938 * If we are writing from the beginning of 939 * the mapping, we can just create the 940 * pages without having to read them. 941 */ 942 pagecreate = (mapon == 0); 943 } else if (n == MAXBSIZE) { 944 /* 945 * Going to do a whole mappings worth, 946 * so we can just create the pages w/o 947 * having to read them in. But before 948 * we do that, we need to make sure any 949 * needed blocks are allocated first. 950 */ 951 iblocks = ip->i_blocks; 952 error = bmap_write(ip, uoff, (int)(on + n), 953 BI_ALLOC_ONLY, NULL, cr); 954 /* 955 * bmap_write never drops i_contents so if 956 * the flags are set it changed the file. 957 */ 958 if (ip->i_flag & (ICHG|IUPD)) { 959 i_seq_needed = 1; 960 ip->i_flag |= ISEQ; 961 } 962 if (error) 963 break; 964 pagecreate = 1; 965 /* 966 * check if the new created page needed the 967 * allocation of new disk blocks. 968 */ 969 if (iblocks == ip->i_blocks) 970 new_iblocks = 0; /* no new blocks allocated */ 971 } else { 972 pagecreate = 0; 973 /* 974 * In sync mode flush the indirect blocks which 975 * may have been allocated and not written on 976 * disk. In above cases bmap_write will allocate 977 * in sync mode. 978 */ 979 if (ioflag & (FSYNC|FDSYNC)) { 980 error = ufs_indirblk_sync(ip, uoff); 981 if (error) 982 break; 983 } 984 } 985 986 /* 987 * At this point we can enter ufs_getpage() in one 988 * of two ways: 989 * 1) segmap_getmapflt() calls ufs_getpage() when the 990 * forcefault parameter is true (pagecreate == 0) 991 * 2) uiomove() causes a page fault. 992 * 993 * We have to drop the contents lock to prevent the VM 994 * system from trying to reaquire it in ufs_getpage() 995 * should the uiomove cause a pagefault. 996 * 997 * We have to drop the reader vfs_dqrwlock here as well. 
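/*
 * Illustrative sketch, not part of the original source: the window
 * arithmetic used by the loop above, worked through on one offset.
 * The value 8192 below assumes the usual MAXBSIZE/fs_bsize; the numbers
 * are only an example.
 */
static void
example_map_window(void)
{
	u_offset_t uoff = 21554;			/* arbitrary file offset */
	u_offset_t off = uoff & (offset_t)MAXBMASK;	/* 16384: window base */
	int mapon = (int)(uoff & (offset_t)MAXBOFFSET);	/* 5170: byte within window */
	int n = (int)MIN(8192 - mapon, 4096);		/* 3022 of a 4096-byte request */

	/* the remaining 1074 bytes are handled by the next loop iteration */
	(void) off;
	(void) n;
}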
998 */ 999 rw_exit(&ip->i_contents); 1000 if (do_dqrwlock) { 1001 ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 1002 ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock))); 1003 rw_exit(&ufsvfsp->vfs_dqrwlock); 1004 } 1005 1006 newpage = 0; 1007 premove_resid = uio->uio_resid; 1008 if (vpm_enable) { 1009 /* 1010 * Copy data. If new pages are created, part of 1011 * the page that is not written will be initizliazed 1012 * with zeros. 1013 */ 1014 error = vpm_data_copy(vp, (off + mapon), (uint_t)n, 1015 uio, !pagecreate, &newpage, 0, S_WRITE); 1016 } else { 1017 1018 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1019 (uint_t)n, !pagecreate, S_WRITE); 1020 1021 /* 1022 * segmap_pagecreate() returns 1 if it calls 1023 * page_create_va() to allocate any pages. 1024 */ 1025 1026 if (pagecreate) 1027 newpage = segmap_pagecreate(segkmap, base, 1028 (size_t)n, 0); 1029 1030 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 1031 } 1032 1033 /* 1034 * If "newpage" is set, then a new page was created and it 1035 * does not contain valid data, so it needs to be initialized 1036 * at this point. 1037 * Otherwise the page contains old data, which was overwritten 1038 * partially or as a whole in uiomove. 1039 * If there is only one iovec structure within uio, then 1040 * on error uiomove will not be able to update uio->uio_loffset 1041 * and we would zero the whole page here! 1042 * 1043 * If uiomove fails because of an error, the old valid data 1044 * is kept instead of filling the rest of the page with zero's. 1045 */ 1046 if (!vpm_enable && newpage && 1047 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 1048 /* 1049 * We created pages w/o initializing them completely, 1050 * thus we need to zero the part that wasn't set up. 1051 * This happens on most EOF write cases and if 1052 * we had some sort of error during the uiomove. 1053 */ 1054 int nzero, nmoved; 1055 1056 nmoved = (int)(uio->uio_loffset - (off + mapon)); 1057 ASSERT(nmoved >= 0 && nmoved <= n); 1058 nzero = roundup(on + n, PAGESIZE) - nmoved; 1059 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 1060 (void) kzero(base + mapon + nmoved, (uint_t)nzero); 1061 } 1062 1063 /* 1064 * Unlock the pages allocated by page_create_va() 1065 * in segmap_pagecreate() 1066 */ 1067 if (!vpm_enable && newpage) 1068 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 1069 1070 /* 1071 * If the size of the file changed, then update the 1072 * size field in the inode now. This can't be done 1073 * before the call to segmap_pageunlock or there is 1074 * a potential deadlock with callers to ufs_putpage(). 1075 * They will be holding i_contents and trying to lock 1076 * a page, while this thread is holding a page locked 1077 * and trying to acquire i_contents. 1078 */ 1079 if (i_size_changed) { 1080 rw_enter(&ip->i_contents, RW_WRITER); 1081 old_i_size = ip->i_size; 1082 UFS_SET_ISIZE(uoff + n, ip); 1083 TRANS_INODE(ufsvfsp, ip); 1084 /* 1085 * file has grown larger than 2GB. Set flag 1086 * in superblock to indicate this, if it 1087 * is not already set. 
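/*
 * Illustrative sketch, not part of the original source: the zero-fill
 * arithmetic used above when uiomove() fails part way into a page it
 * created.  Assume a 4K PAGESIZE, a transfer of n = 4096 bytes at
 * mapon = on = 0, and a fault after 100 bytes were copied; the numbers
 * in the comments are just that worked example.
 */
static void
example_zero_tail(caddr_t base, u_offset_t loffset, u_offset_t off,
    int mapon, int on, int n)
{
	int nmoved = (int)(loffset - (off + mapon));	/* 100 bytes made it */
	int nzero = roundup(on + n, PAGESIZE) - nmoved;	/* 4096 - 100 = 3996 */

	/* wipe the uninitialized tail so stale data never reaches disk */
	(void) kzero(base + mapon + nmoved, (uint_t)nzero);
}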
1088 */ 1089 if ((ip->i_size > MAXOFF32_T) && 1090 !(fs->fs_flags & FSLARGEFILES)) { 1091 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1092 mutex_enter(&ufsvfsp->vfs_lock); 1093 fs->fs_flags |= FSLARGEFILES; 1094 ufs_sbwrite(ufsvfsp); 1095 mutex_exit(&ufsvfsp->vfs_lock); 1096 } 1097 mutex_enter(&ip->i_tlock); 1098 ip->i_writer = NULL; 1099 cv_broadcast(&ip->i_wrcv); 1100 mutex_exit(&ip->i_tlock); 1101 rw_exit(&ip->i_contents); 1102 } 1103 1104 if (error) { 1105 /* 1106 * If we failed on a write, we may have already 1107 * allocated file blocks as well as pages. It's 1108 * hard to undo the block allocation, but we must 1109 * be sure to invalidate any pages that may have 1110 * been allocated. 1111 * 1112 * If the page was created without initialization 1113 * then we must check if it should be possible 1114 * to destroy the new page and to keep the old data 1115 * on the disk. 1116 * 1117 * It is possible to destroy the page without 1118 * having to write back its contents only when 1119 * - the size of the file keeps unchanged 1120 * - bmap_write() did not allocate new disk blocks 1121 * it is possible to create big files using "seek" and 1122 * write to the end of the file. A "write" to a 1123 * position before the end of the file would not 1124 * change the size of the file but it would allocate 1125 * new disk blocks. 1126 * - uiomove intended to overwrite the whole page. 1127 * - a new page was created (newpage == 1). 1128 */ 1129 1130 if (i_size_changed == 0 && new_iblocks == 0 && 1131 newpage) { 1132 1133 /* unwind what uiomove eventually last did */ 1134 uio->uio_resid = premove_resid; 1135 1136 /* 1137 * destroy the page, do not write ambiguous 1138 * data to the disk. 1139 */ 1140 flags = SM_DESTROY; 1141 } else { 1142 /* 1143 * write the page back to the disk, if dirty, 1144 * and remove the page from the cache. 1145 */ 1146 flags = SM_INVAL; 1147 } 1148 1149 if (vpm_enable) { 1150 /* 1151 * Flush pages. 1152 */ 1153 (void) vpm_sync_pages(vp, off, n, flags); 1154 } else { 1155 (void) segmap_release(segkmap, base, flags); 1156 } 1157 } else { 1158 flags = 0; 1159 /* 1160 * Force write back for synchronous write cases. 1161 */ 1162 if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) { 1163 /* 1164 * If the sticky bit is set but the 1165 * execute bit is not set, we do a 1166 * synchronous write back and free 1167 * the page when done. We set up swap 1168 * files to be handled this way to 1169 * prevent servers from keeping around 1170 * the client's swap pages too long. 1171 * XXX - there ought to be a better way. 1172 */ 1173 if (IS_SWAPVP(vp)) { 1174 flags = SM_WRITE | SM_FREE | 1175 SM_DONTNEED; 1176 iupdat_flag = 0; 1177 } else { 1178 flags = SM_WRITE; 1179 } 1180 } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 1181 /* 1182 * Have written a whole block. 1183 * Start an asynchronous write and 1184 * mark the buffer to indicate that 1185 * it won't be needed again soon. 1186 */ 1187 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 1188 } 1189 if (vpm_enable) { 1190 /* 1191 * Flush pages. 1192 */ 1193 error = vpm_sync_pages(vp, off, n, flags); 1194 } else { 1195 error = segmap_release(segkmap, base, flags); 1196 } 1197 /* 1198 * If the operation failed and is synchronous, 1199 * then we need to unwind what uiomove() last 1200 * did so we can potentially return an error to 1201 * the caller. If this write operation was 1202 * done in two pieces and the first succeeded, 1203 * then we won't return an error for the second 1204 * piece that failed. 
However, we only want to 1205 * return a resid value that reflects what was 1206 * really done. 1207 * 1208 * Failures for non-synchronous operations can 1209 * be ignored since the page subsystem will 1210 * retry the operation until it succeeds or the 1211 * file system is unmounted. 1212 */ 1213 if (error) { 1214 if ((ioflag & (FSYNC | FDSYNC)) || 1215 type == IFDIR) { 1216 uio->uio_resid = premove_resid; 1217 } else { 1218 error = 0; 1219 } 1220 } 1221 } 1222 1223 /* 1224 * Re-acquire contents lock. 1225 * If it was dropped, reacquire reader vfs_dqrwlock as well. 1226 */ 1227 if (do_dqrwlock) 1228 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1229 rw_enter(&ip->i_contents, RW_WRITER); 1230 1231 /* 1232 * If the uiomove() failed or if a synchronous 1233 * page push failed, fix up i_size. 1234 */ 1235 if (error) { 1236 if (i_size_changed) { 1237 /* 1238 * The uiomove failed, and we 1239 * allocated blocks,so get rid 1240 * of them. 1241 */ 1242 (void) ufs_itrunc(ip, old_i_size, 0, cr); 1243 } 1244 } else { 1245 /* 1246 * XXX - Can this be out of the loop? 1247 */ 1248 ip->i_flag |= IUPD | ICHG; 1249 /* 1250 * Only do one increase of i_seq for multiple 1251 * pieces. Because we drop locks, record 1252 * the fact that we changed the timestamp and 1253 * are deferring the increase in case another thread 1254 * pushes our timestamp update. 1255 */ 1256 i_seq_needed = 1; 1257 ip->i_flag |= ISEQ; 1258 if (i_size_changed) 1259 ip->i_flag |= IATTCHG; 1260 if ((ip->i_mode & (IEXEC | (IEXEC >> 3) | 1261 (IEXEC >> 6))) != 0 && 1262 (ip->i_mode & (ISUID | ISGID)) != 0 && 1263 secpolicy_vnode_setid_retain(cr, 1264 (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) { 1265 /* 1266 * Clear Set-UID & Set-GID bits on 1267 * successful write if not privileged 1268 * and at least one of the execute bits 1269 * is set. If we always clear Set-GID, 1270 * mandatory file and record locking is 1271 * unuseable. 1272 */ 1273 ip->i_mode &= ~(ISUID | ISGID); 1274 } 1275 } 1276 /* 1277 * In the case the FDSYNC flag is set and this is a 1278 * "rewrite" we won't log a delta. 1279 * The FSYNC flag overrides all cases. 1280 */ 1281 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) { 1282 TRANS_INODE(ufsvfsp, ip); 1283 } 1284 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1285 1286 out: 1287 /* 1288 * Make sure i_seq is increased at least once per write 1289 */ 1290 if (i_seq_needed) { 1291 ip->i_seq++; 1292 ip->i_flag &= ~ISEQ; /* no longer deferred */ 1293 } 1294 1295 /* 1296 * Inode is updated according to this table - 1297 * 1298 * FSYNC FDSYNC(posix.4) 1299 * -------------------------- 1300 * always@ IATTCHG|IBDWRITE 1301 * 1302 * @ - If we are doing synchronous write the only time we should 1303 * not be sync'ing the ip here is if we have the stickyhack 1304 * activated, the file is marked with the sticky bit and 1305 * no exec bit, the file length has not been changed and 1306 * no new blocks have been allocated during this write. 1307 */ 1308 1309 if ((ip->i_flag & ISYNC) != 0) { 1310 /* 1311 * we have eliminated nosync 1312 */ 1313 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 1314 ((ioflag & FSYNC) && iupdat_flag)) { 1315 ufs_iupdat(ip, 1); 1316 } 1317 } 1318 1319 /* 1320 * If we've already done a partial-write, terminate 1321 * the write but return no error unless the error is ENOSPC 1322 * because the caller can detect this and free resources and 1323 * try again. 
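/*
 * Illustrative sketch, not part of the original source: what the
 * partial-write rule above means to a caller.  The byte counts and the
 * EDQUOT error are made-up example values.
 */
static void
example_partial_write_result(void)
{
	long start_resid = 8000;	/* caller asked for 8000 bytes */
	long end_resid = 3000;		/* 5000 were written before the failure */
	int error = EDQUOT;		/* any error other than ENOSPC ... */

	if ((start_resid != end_resid) && (error != ENOSPC))
		error = 0;		/* ... is folded into a short write */
	/* error is now 0: the caller simply sees 5000 bytes written */
	(void) error;
}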
1324 */ 1325 if ((start_resid != uio->uio_resid) && (error != ENOSPC)) 1326 error = 0; 1327 1328 ip->i_flag &= ~(INOACC | ISYNC); 1329 ITIMES_NOLOCK(ip); 1330 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1331 "ufs_wrip_end:vp %p error %d", vp, error); 1332 return (error); 1333 } 1334 1335 /* 1336 * rdip does the real work of read requests for ufs. 1337 */ 1338 int 1339 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr) 1340 { 1341 u_offset_t off; 1342 caddr_t base; 1343 struct fs *fs; 1344 struct ufsvfs *ufsvfsp; 1345 struct vnode *vp; 1346 long oresid = uio->uio_resid; 1347 u_offset_t n, on, mapon; 1348 int error = 0; 1349 int doupdate = 1; 1350 uint_t flags; 1351 int dofree, directio_status; 1352 krw_t rwtype; 1353 o_mode_t type; 1354 1355 vp = ITOV(ip); 1356 1357 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 1358 "ufs_rdip_start:vp %p", vp); 1359 1360 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 1361 1362 ufsvfsp = ip->i_ufsvfs; 1363 1364 if (ufsvfsp == NULL) 1365 return (EIO); 1366 1367 fs = ufsvfsp->vfs_fs; 1368 1369 /* check for valid filetype */ 1370 type = ip->i_mode & IFMT; 1371 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 1372 (type != IFLNK) && (type != IFSHAD)) { 1373 return (EIO); 1374 } 1375 1376 if (uio->uio_loffset > UFS_MAXOFFSET_T) { 1377 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1378 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1379 error = 0; 1380 goto out; 1381 } 1382 if (uio->uio_loffset < (offset_t)0) { 1383 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1384 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1385 return (EINVAL); 1386 } 1387 if (uio->uio_resid == 0) { 1388 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1389 "ufs_rdip_end:vp %p error %d", vp, 0); 1390 return (0); 1391 } 1392 1393 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) && 1394 (!ufsvfsp->vfs_noatime)) { 1395 mutex_enter(&ip->i_tlock); 1396 ip->i_flag |= IACC; 1397 mutex_exit(&ip->i_tlock); 1398 } 1399 /* 1400 * Try to go direct 1401 */ 1402 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 1403 error = ufs_directio_read(ip, uio, cr, &directio_status); 1404 if (directio_status == DIRECTIO_SUCCESS) 1405 goto out; 1406 } 1407 1408 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 1409 1410 do { 1411 offset_t diff; 1412 u_offset_t uoff = uio->uio_loffset; 1413 off = uoff & (offset_t)MAXBMASK; 1414 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET); 1415 on = (u_offset_t)blkoff(fs, uoff); 1416 n = MIN((u_offset_t)fs->fs_bsize - on, 1417 (u_offset_t)uio->uio_resid); 1418 1419 diff = ip->i_size - uoff; 1420 1421 if (diff <= (offset_t)0) { 1422 error = 0; 1423 goto out; 1424 } 1425 if (diff < (offset_t)n) 1426 n = (int)diff; 1427 1428 /* 1429 * We update smallfile2 and smallfile1 at most every second. 
1430 */ 1431 if (lbolt >= smallfile_update) { 1432 uint64_t percpufreeb; 1433 if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D; 1434 if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D; 1435 percpufreeb = ptob((uint64_t)freemem) / ncpus_online; 1436 smallfile1 = percpufreeb / smallfile1_d; 1437 smallfile2 = percpufreeb / smallfile2_d; 1438 smallfile1 = MAX(smallfile1, smallfile); 1439 smallfile1 = MAX(smallfile1, smallfile64); 1440 smallfile2 = MAX(smallfile1, smallfile2); 1441 smallfile_update = lbolt + hz; 1442 } 1443 1444 dofree = freebehind && 1445 ip->i_nextr == (off & PAGEMASK) && off > smallfile1; 1446 1447 /* 1448 * At this point we can enter ufs_getpage() in one of two 1449 * ways: 1450 * 1) segmap_getmapflt() calls ufs_getpage() when the 1451 * forcefault parameter is true (value of 1 is passed) 1452 * 2) uiomove() causes a page fault. 1453 * 1454 * We cannot hold onto an i_contents reader lock without 1455 * risking deadlock in ufs_getpage() so drop a reader lock. 1456 * The ufs_getpage() dolock logic already allows for a 1457 * thread holding i_contents as writer to work properly 1458 * so we keep a writer lock. 1459 */ 1460 if (rwtype == RW_READER) 1461 rw_exit(&ip->i_contents); 1462 1463 if (vpm_enable) { 1464 /* 1465 * Copy data. 1466 */ 1467 error = vpm_data_copy(vp, (off + mapon), (uint_t)n, 1468 uio, 1, NULL, 0, S_READ); 1469 } else { 1470 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1471 (uint_t)n, 1, S_READ); 1472 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 1473 } 1474 1475 flags = 0; 1476 if (!error) { 1477 /* 1478 * If reading sequential we won't need this 1479 * buffer again soon. For offsets in range 1480 * [smallfile1, smallfile2] release the pages 1481 * at the tail of the cache list, larger 1482 * offsets are released at the head. 1483 */ 1484 if (dofree) { 1485 flags = SM_FREE | SM_ASYNC; 1486 if ((cache_read_ahead == 0) && 1487 (off > smallfile2)) 1488 flags |= SM_DONTNEED; 1489 } 1490 /* 1491 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 1492 * we want to make sure that the page which has 1493 * been read, is written on disk if it is dirty. 1494 * And corresponding indirect blocks should also 1495 * be flushed out. 1496 */ 1497 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 1498 flags &= ~SM_ASYNC; 1499 flags |= SM_WRITE; 1500 } 1501 if (vpm_enable) { 1502 error = vpm_sync_pages(vp, off, n, flags); 1503 } else { 1504 error = segmap_release(segkmap, base, flags); 1505 } 1506 } else { 1507 if (vpm_enable) { 1508 (void) vpm_sync_pages(vp, off, n, flags); 1509 } else { 1510 (void) segmap_release(segkmap, base, flags); 1511 } 1512 } 1513 1514 if (rwtype == RW_READER) 1515 rw_enter(&ip->i_contents, rwtype); 1516 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1517 out: 1518 /* 1519 * Inode is updated according to this table if FRSYNC is set. 1520 * 1521 * FSYNC FDSYNC(posix.4) 1522 * -------------------------- 1523 * always IATTCHG|IBDWRITE 1524 */ 1525 /* 1526 * The inode is not updated if we're logging and the inode is a 1527 * directory with FRSYNC, FSYNC and FDSYNC flags set. 1528 */ 1529 if (ioflag & FRSYNC) { 1530 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) { 1531 doupdate = 0; 1532 } 1533 if (doupdate) { 1534 if ((ioflag & FSYNC) || 1535 ((ioflag & FDSYNC) && 1536 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 1537 ufs_iupdat(ip, 1); 1538 } 1539 } 1540 } 1541 /* 1542 * If we've already done a partial read, terminate 1543 * the read but return no error. 
1544 */ 1545 if (oresid != uio->uio_resid) 1546 error = 0; 1547 ITIMES(ip); 1548 1549 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1550 "ufs_rdip_end:vp %p error %d", vp, error); 1551 return (error); 1552 } 1553 1554 /* ARGSUSED */ 1555 static int 1556 ufs_ioctl( 1557 struct vnode *vp, 1558 int cmd, 1559 intptr_t arg, 1560 int flag, 1561 struct cred *cr, 1562 int *rvalp) 1563 { 1564 struct lockfs lockfs, lockfs_out; 1565 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 1566 char *comment, *original_comment; 1567 struct fs *fs; 1568 struct ulockfs *ulp; 1569 offset_t off; 1570 extern int maxphys; 1571 int error; 1572 int issync; 1573 int trans_size; 1574 1575 1576 /* 1577 * forcibly unmounted 1578 */ 1579 if (ufsvfsp == NULL) { 1580 return (EIO); 1581 } 1582 1583 fs = ufsvfsp->vfs_fs; 1584 1585 if (cmd == Q_QUOTACTL) { 1586 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK); 1587 if (error) 1588 return (error); 1589 1590 if (ulp) { 1591 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, 1592 TOP_SETQUOTA_SIZE(fs)); 1593 } 1594 1595 error = quotactl(vp, arg, flag, cr); 1596 1597 if (ulp) { 1598 TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, 1599 TOP_SETQUOTA_SIZE(fs)); 1600 ufs_lockfs_end(ulp); 1601 } 1602 return (error); 1603 } 1604 1605 switch (cmd) { 1606 case _FIOLFS: 1607 /* 1608 * file system locking 1609 */ 1610 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1611 return (EPERM); 1612 1613 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1614 if (copyin((caddr_t)arg, &lockfs, 1615 sizeof (struct lockfs))) 1616 return (EFAULT); 1617 } 1618 #ifdef _SYSCALL32_IMPL 1619 else { 1620 struct lockfs32 lockfs32; 1621 /* Translate ILP32 lockfs to LP64 lockfs */ 1622 if (copyin((caddr_t)arg, &lockfs32, 1623 sizeof (struct lockfs32))) 1624 return (EFAULT); 1625 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1626 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1627 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1628 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1629 lockfs.lf_comment = 1630 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1631 } 1632 #endif /* _SYSCALL32_IMPL */ 1633 1634 if (lockfs.lf_comlen) { 1635 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) 1636 return (ENAMETOOLONG); 1637 comment = kmem_alloc(lockfs.lf_comlen, 1638 KM_SLEEP); 1639 if (copyin(lockfs.lf_comment, comment, 1640 lockfs.lf_comlen)) { 1641 kmem_free(comment, lockfs.lf_comlen); 1642 return (EFAULT); 1643 } 1644 original_comment = lockfs.lf_comment; 1645 lockfs.lf_comment = comment; 1646 } 1647 if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) { 1648 lockfs.lf_comment = original_comment; 1649 1650 if ((flag & DATAMODEL_MASK) == 1651 DATAMODEL_NATIVE) { 1652 (void) copyout(&lockfs, (caddr_t)arg, 1653 sizeof (struct lockfs)); 1654 } 1655 #ifdef _SYSCALL32_IMPL 1656 else { 1657 struct lockfs32 lockfs32; 1658 /* Translate LP64 to ILP32 lockfs */ 1659 lockfs32.lf_lock = 1660 (uint32_t)lockfs.lf_lock; 1661 lockfs32.lf_flags = 1662 (uint32_t)lockfs.lf_flags; 1663 lockfs32.lf_key = 1664 (uint32_t)lockfs.lf_key; 1665 lockfs32.lf_comlen = 1666 (uint32_t)lockfs.lf_comlen; 1667 lockfs32.lf_comment = 1668 (uint32_t)(uintptr_t)lockfs.lf_comment; 1669 (void) copyout(&lockfs32, (caddr_t)arg, 1670 sizeof (struct lockfs32)); 1671 } 1672 #endif /* _SYSCALL32_IMPL */ 1673 1674 } else { 1675 if (lockfs.lf_comlen) 1676 kmem_free(comment, lockfs.lf_comlen); 1677 } 1678 return (error); 1679 1680 case _FIOLFSS: 1681 /* 1682 * get file system locking status 1683 */ 1684 1685 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1686 if (copyin((caddr_t)arg, &lockfs, 1687 sizeof (struct 
lockfs))) 1688 return (EFAULT); 1689 } 1690 #ifdef _SYSCALL32_IMPL 1691 else { 1692 struct lockfs32 lockfs32; 1693 /* Translate ILP32 lockfs to LP64 lockfs */ 1694 if (copyin((caddr_t)arg, &lockfs32, 1695 sizeof (struct lockfs32))) 1696 return (EFAULT); 1697 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1698 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1699 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1700 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1701 lockfs.lf_comment = 1702 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1703 } 1704 #endif /* _SYSCALL32_IMPL */ 1705 1706 if (error = ufs_fiolfss(vp, &lockfs_out)) 1707 return (error); 1708 lockfs.lf_lock = lockfs_out.lf_lock; 1709 lockfs.lf_key = lockfs_out.lf_key; 1710 lockfs.lf_flags = lockfs_out.lf_flags; 1711 lockfs.lf_comlen = MIN(lockfs.lf_comlen, 1712 lockfs_out.lf_comlen); 1713 1714 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1715 if (copyout(&lockfs, (caddr_t)arg, 1716 sizeof (struct lockfs))) 1717 return (EFAULT); 1718 } 1719 #ifdef _SYSCALL32_IMPL 1720 else { 1721 /* Translate LP64 to ILP32 lockfs */ 1722 struct lockfs32 lockfs32; 1723 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock; 1724 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags; 1725 lockfs32.lf_key = (uint32_t)lockfs.lf_key; 1726 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen; 1727 lockfs32.lf_comment = 1728 (uint32_t)(uintptr_t)lockfs.lf_comment; 1729 if (copyout(&lockfs32, (caddr_t)arg, 1730 sizeof (struct lockfs32))) 1731 return (EFAULT); 1732 } 1733 #endif /* _SYSCALL32_IMPL */ 1734 1735 if (lockfs.lf_comlen && 1736 lockfs.lf_comment && lockfs_out.lf_comment) 1737 if (copyout(lockfs_out.lf_comment, 1738 lockfs.lf_comment, 1739 lockfs.lf_comlen)) 1740 return (EFAULT); 1741 return (0); 1742 1743 case _FIOSATIME: 1744 /* 1745 * set access time 1746 */ 1747 1748 /* 1749 * if mounted w/o atime, return quietly. 1750 * I briefly thought about returning ENOSYS, but 1751 * figured that most apps would consider this fatal 1752 * but the idea is to make this as seamless as poss. 
1753 */ 1754 if (ufsvfsp->vfs_noatime) 1755 return (0); 1756 1757 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1758 ULOCKFS_SETATTR_MASK); 1759 if (error) 1760 return (error); 1761 1762 if (ulp) { 1763 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp)); 1764 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 1765 TOP_SETATTR, trans_size); 1766 } 1767 1768 error = ufs_fiosatime(vp, (struct timeval *)arg, 1769 flag, cr); 1770 1771 if (ulp) { 1772 TRANS_END_CSYNC(ufsvfsp, error, issync, 1773 TOP_SETATTR, trans_size); 1774 ufs_lockfs_end(ulp); 1775 } 1776 return (error); 1777 1778 case _FIOSDIO: 1779 /* 1780 * set delayed-io 1781 */ 1782 return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr)); 1783 1784 case _FIOGDIO: 1785 /* 1786 * get delayed-io 1787 */ 1788 return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr)); 1789 1790 case _FIOIO: 1791 /* 1792 * inode open 1793 */ 1794 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1795 ULOCKFS_VGET_MASK); 1796 if (error) 1797 return (error); 1798 1799 error = ufs_fioio(vp, (struct fioio *)arg, flag, cr); 1800 1801 if (ulp) { 1802 ufs_lockfs_end(ulp); 1803 } 1804 return (error); 1805 1806 case _FIOFFS: 1807 /* 1808 * file system flush (push w/invalidate) 1809 */ 1810 if ((caddr_t)arg != NULL) 1811 return (EINVAL); 1812 return (ufs_fioffs(vp, NULL, cr)); 1813 1814 case _FIOISBUSY: 1815 /* 1816 * Contract-private interface for Legato 1817 * Purge this vnode from the DNLC and decide 1818 * if this vnode is busy (*arg == 1) or not 1819 * (*arg == 0) 1820 */ 1821 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1822 return (EPERM); 1823 error = ufs_fioisbusy(vp, (int *)arg, cr); 1824 return (error); 1825 1826 case _FIODIRECTIO: 1827 return (ufs_fiodirectio(vp, (int)arg, cr)); 1828 1829 case _FIOTUNE: 1830 /* 1831 * Tune the file system (aka setting fs attributes) 1832 */ 1833 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1834 ULOCKFS_SETATTR_MASK); 1835 if (error) 1836 return (error); 1837 1838 error = ufs_fiotune(vp, (struct fiotune *)arg, cr); 1839 1840 if (ulp) 1841 ufs_lockfs_end(ulp); 1842 return (error); 1843 1844 case _FIOLOGENABLE: 1845 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1846 return (EPERM); 1847 return (ufs_fiologenable(vp, (void *)arg, cr, flag)); 1848 1849 case _FIOLOGDISABLE: 1850 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1851 return (EPERM); 1852 return (ufs_fiologdisable(vp, (void *)arg, cr, flag)); 1853 1854 case _FIOISLOG: 1855 return (ufs_fioislog(vp, (void *)arg, cr, flag)); 1856 1857 case _FIOSNAPSHOTCREATE_MULTI: 1858 { 1859 struct fiosnapcreate_multi fc, *fcp; 1860 size_t fcm_size; 1861 1862 if (copyin((void *)arg, &fc, sizeof (fc))) 1863 return (EFAULT); 1864 if (fc.backfilecount > MAX_BACKFILE_COUNT) 1865 return (EINVAL); 1866 fcm_size = sizeof (struct fiosnapcreate_multi) + 1867 (fc.backfilecount - 1) * sizeof (int); 1868 fcp = (struct fiosnapcreate_multi *) 1869 kmem_alloc(fcm_size, KM_SLEEP); 1870 if (copyin((void *)arg, fcp, fcm_size)) { 1871 kmem_free(fcp, fcm_size); 1872 return (EFAULT); 1873 } 1874 error = ufs_snap_create(vp, fcp, cr); 1875 if (!error && copyout(fcp, (void *)arg, fcm_size)) 1876 error = EFAULT; 1877 kmem_free(fcp, fcm_size); 1878 return (error); 1879 } 1880 1881 case _FIOSNAPSHOTDELETE: 1882 { 1883 struct fiosnapdelete fc; 1884 1885 if (copyin((void *)arg, &fc, sizeof (fc))) 1886 return (EFAULT); 1887 error = ufs_snap_delete(vp, &fc, cr); 1888 if (!error && copyout(&fc, (void *)arg, sizeof (fc))) 1889 error = EFAULT; 1890 return (error); 1891 } 1892 1893 case _FIOGETSUPERBLOCK: 1894 if (copyout(fs, (void *)arg, SBSIZE)) 1895 return 
(EFAULT);
1896 return (0);
1897
1898 case _FIOGETMAXPHYS:
1899 if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1900 return (EFAULT);
1901 return (0);
1902
1903 /*
1904 * The following 3 ioctls are for TSufs support
1905 * although they could potentially be used elsewhere
1906 */
1907 case _FIO_SET_LUFS_DEBUG:
1908 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1909 return (EPERM);
1910 lufs_debug = (uint32_t)arg;
1911 return (0);
1912
1913 case _FIO_SET_LUFS_ERROR:
1914 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1915 return (EPERM);
1916 TRANS_SETERROR(ufsvfsp);
1917 return (0);
1918
1919 case _FIO_GET_TOP_STATS:
1920 {
1921 fio_lufs_stats_t *ls;
1922 ml_unit_t *ul = ufsvfsp->vfs_log;
1923
1924 ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1925 ls->ls_debug = ul->un_debug; /* return debug value */
1926 /* Copy structure if statistics are being kept */
1927 if (ul->un_logmap->mtm_tops) {
1928 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1929 }
1930 error = 0;
1931 if (copyout(ls, (void *)arg, sizeof (*ls)))
1932 error = EFAULT;
1933 kmem_free(ls, sizeof (*ls));
1934 return (error);
1935 }
1936
1937 case _FIO_SEEK_DATA:
1938 case _FIO_SEEK_HOLE:
1939 if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1940 return (EFAULT);
1941 /* offset parameter is in/out */
1942 error = ufs_fio_holey(vp, cmd, &off);
1943 if (error)
1944 return (error);
1945 if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1946 return (EFAULT);
1947 return (0);
1948
1949 default:
1950 return (ENOTTY);
1951 }
1952 }
1953
1954 /* ARGSUSED */
1955 static int
1956 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1957 struct cred *cr)
1958 {
1959 struct inode *ip = VTOI(vp);
1960 struct ufsvfs *ufsvfsp;
1961 int err;
1962
1963 TRACE_2(TR_FAC_UFS, TR_UFS_GETATTR_START,
1964 "ufs_getattr_start:vp %p flags %x", vp, flags);
1965
1966 if (vap->va_mask == AT_SIZE) {
1967 /*
1968 * for performance, if only the size is requested don't bother
1969 * with anything else.
1970 */
1971 UFS_GET_ISIZE(&vap->va_size, ip);
1972 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END,
1973 "ufs_getattr_end:vp %p", vp);
1974 return (0);
1975 }
1976
1977 /*
1978 * inlined lockfs checks
1979 */
1980 ufsvfsp = ip->i_ufsvfs;
1981 if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
1982 err = EIO;
1983 goto out;
1984 }
1985
1986 rw_enter(&ip->i_contents, RW_READER);
1987 /*
1988 * Return all the attributes. This should be refined so
1989 * that it only returns what's asked for.
1990 */
1991
1992 /*
1993 * Copy from inode table.
1994 */
1995 vap->va_type = vp->v_type;
1996 vap->va_mode = ip->i_mode & MODEMASK;
1997 /*
1998 * If there is an ACL and there is a mask entry, then do the
1999 * extra work that completes the equivalent of an acltomode(3)
2000 * call. According to POSIX P1003.1e, the acl mask should be
2001 * returned in the group permissions field.
2002 *
2003 * - start with the original permission and mode bits (from above)
2004 * - clear the group owner bits
2005 * - add in the mask bits.
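 *
 * For example (purely illustrative): with group bits rwx (070) in
 * i_mode and an ACL mask entry of r-- (acl_maskbits 04), the code
 * below computes (va_mode & ~070) | ((04 & PERMMASK) << 3), so a
 * 0770 file is reported to callers as 0740, matching what
 * acltomode(3) would derive from the full ACL.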
2006 */ 2007 if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) { 2008 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3); 2009 vap->va_mode |= 2010 (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3; 2011 } 2012 vap->va_uid = ip->i_uid; 2013 vap->va_gid = ip->i_gid; 2014 vap->va_fsid = ip->i_dev; 2015 vap->va_nodeid = (ino64_t)ip->i_number; 2016 vap->va_nlink = ip->i_nlink; 2017 vap->va_size = ip->i_size; 2018 if (vp->v_type == VCHR || vp->v_type == VBLK) 2019 vap->va_rdev = ip->i_rdev; 2020 else 2021 vap->va_rdev = 0; /* not a b/c spec. */ 2022 mutex_enter(&ip->i_tlock); 2023 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 2024 vap->va_seq = ip->i_seq; 2025 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 2026 vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000; 2027 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 2028 vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000; 2029 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 2030 vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000; 2031 mutex_exit(&ip->i_tlock); 2032 2033 switch (ip->i_mode & IFMT) { 2034 2035 case IFBLK: 2036 vap->va_blksize = MAXBSIZE; /* was BLKDEV_IOSIZE */ 2037 break; 2038 2039 case IFCHR: 2040 vap->va_blksize = MAXBSIZE; 2041 break; 2042 2043 default: 2044 vap->va_blksize = ip->i_fs->fs_bsize; 2045 break; 2046 } 2047 vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks; 2048 rw_exit(&ip->i_contents); 2049 err = 0; 2050 2051 out: 2052 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, "ufs_getattr_end:vp %p", vp); 2053 2054 return (err); 2055 } 2056 2057 /*ARGSUSED4*/ 2058 static int 2059 ufs_setattr( 2060 struct vnode *vp, 2061 struct vattr *vap, 2062 int flags, 2063 struct cred *cr, 2064 caller_context_t *ct) 2065 { 2066 struct inode *ip = VTOI(vp); 2067 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2068 struct fs *fs; 2069 struct ulockfs *ulp; 2070 char *errmsg1; 2071 char *errmsg2; 2072 long blocks; 2073 long int mask = vap->va_mask; 2074 size_t len1, len2; 2075 int issync; 2076 int trans_size; 2077 int dotrans; 2078 int dorwlock; 2079 int error; 2080 int owner_change; 2081 int dodqlock; 2082 timestruc_t now; 2083 vattr_t oldva; 2084 int retry = 1; 2085 int indeadlock; 2086 2087 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_START, 2088 "ufs_setattr_start:vp %p flags %x", vp, flags); 2089 2090 /* 2091 * Cannot set these attributes. 2092 */ 2093 if (mask & AT_NOSET) { 2094 error = EINVAL; 2095 goto out; 2096 } 2097 2098 /* 2099 * check for forced unmount 2100 */ 2101 if (ufsvfsp == NULL) 2102 return (EIO); 2103 2104 fs = ufsvfsp->vfs_fs; 2105 if (fs->fs_ronly != 0) 2106 return (EROFS); 2107 2108 again: 2109 errmsg1 = NULL; 2110 errmsg2 = NULL; 2111 dotrans = 0; 2112 dorwlock = 0; 2113 dodqlock = 0; 2114 2115 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK); 2116 if (error) 2117 goto out; 2118 2119 /* 2120 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 2121 * This follows the protocol for read()/write(). 2122 */ 2123 if (vp->v_type != VDIR) { 2124 /* 2125 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to 2126 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 2127 * possible, retries the operation. 2128 */ 2129 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file); 2130 if (indeadlock) { 2131 if (ulp) 2132 ufs_lockfs_end(ulp); 2133 goto again; 2134 } 2135 dorwlock = 1; 2136 } 2137 2138 /* 2139 * Truncate file. Must have write permission and not be a directory. 
2140 */ 2141 if (mask & AT_SIZE) { 2142 rw_enter(&ip->i_contents, RW_WRITER); 2143 if (vp->v_type == VDIR) { 2144 error = EISDIR; 2145 goto update_inode; 2146 } 2147 if (error = ufs_iaccess(ip, IWRITE, cr)) 2148 goto update_inode; 2149 2150 rw_exit(&ip->i_contents); 2151 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr); 2152 if (error) { 2153 rw_enter(&ip->i_contents, RW_WRITER); 2154 goto update_inode; 2155 } 2156 } 2157 2158 if (ulp) { 2159 trans_size = (int)TOP_SETATTR_SIZE(ip); 2160 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size); 2161 ++dotrans; 2162 } 2163 2164 /* 2165 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 2166 * This follows the protocol established by 2167 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 2168 */ 2169 if (vp->v_type == VDIR) { 2170 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR, 2171 retry_dir); 2172 if (indeadlock) 2173 goto again; 2174 dorwlock = 1; 2175 } 2176 2177 /* 2178 * Grab quota lock if we are changing the file's owner. 2179 */ 2180 if (mask & AT_UID) { 2181 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2182 dodqlock = 1; 2183 } 2184 rw_enter(&ip->i_contents, RW_WRITER); 2185 2186 oldva.va_mode = ip->i_mode; 2187 oldva.va_uid = ip->i_uid; 2188 oldva.va_gid = ip->i_gid; 2189 2190 vap->va_mask &= ~AT_SIZE; 2191 /* 2192 * ufs_iaccess is "close enough"; that's because it doesn't 2193 * map the defines. 2194 */ 2195 error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2196 ufs_iaccess, ip); 2197 if (error) 2198 goto update_inode; 2199 2200 mask = vap->va_mask; 2201 2202 /* 2203 * Change file access modes. 2204 */ 2205 if (mask & AT_MODE) { 2206 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT); 2207 TRANS_INODE(ufsvfsp, ip); 2208 ip->i_flag |= ICHG; 2209 if (stickyhack) { 2210 mutex_enter(&vp->v_lock); 2211 if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 2212 vp->v_flag |= VSWAPLIKE; 2213 else 2214 vp->v_flag &= ~VSWAPLIKE; 2215 mutex_exit(&vp->v_lock); 2216 } 2217 } 2218 if (mask & (AT_UID|AT_GID)) { 2219 if (mask & AT_UID) { 2220 /* 2221 * Don't change ownership of the quota inode. 2222 */ 2223 if (ufsvfsp->vfs_qinod == ip) { 2224 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED); 2225 error = EINVAL; 2226 goto update_inode; 2227 } 2228 2229 /* 2230 * No real ownership change. 2231 */ 2232 if (ip->i_uid == vap->va_uid) { 2233 blocks = 0; 2234 owner_change = 0; 2235 } 2236 /* 2237 * Remove the blocks and the file, from the old user's 2238 * quota. 2239 */ 2240 else { 2241 blocks = ip->i_blocks; 2242 owner_change = 1; 2243 2244 (void) chkdq(ip, -blocks, /* force */ 1, cr, 2245 (char **)NULL, (size_t *)NULL); 2246 (void) chkiq(ufsvfsp, /* change */ -1, ip, 2247 (uid_t)ip->i_uid, 2248 /* force */ 1, cr, 2249 (char **)NULL, (size_t *)NULL); 2250 dqrele(ip->i_dquot); 2251 } 2252 2253 ip->i_uid = vap->va_uid; 2254 2255 /* 2256 * There is a real ownership change. 2257 */ 2258 if (owner_change) { 2259 /* 2260 * Add the blocks and the file to the new 2261 * user's quota. 2262 */ 2263 ip->i_dquot = getinoquota(ip); 2264 (void) chkdq(ip, blocks, /* force */ 1, cr, 2265 &errmsg1, &len1); 2266 (void) chkiq(ufsvfsp, /* change */ 1, 2267 (struct inode *)NULL, 2268 (uid_t)ip->i_uid, 2269 /* force */ 1, cr, 2270 &errmsg2, &len2); 2271 } 2272 } 2273 if (mask & AT_GID) { 2274 ip->i_gid = vap->va_gid; 2275 } 2276 TRANS_INODE(ufsvfsp, ip); 2277 ip->i_flag |= ICHG; 2278 } 2279 /* 2280 * Change file access or modified times. 
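 *
 * Note that the on-disk inode keeps only microsecond resolution, so
 * the nanosecond values in the vattr are truncated below (tv_nsec /
 * 1000), and an explicit mtime update also refreshes ctime, which is
 * clamped at TIME32_MAX from 2038 onward.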
2281 */
2282 if (mask & (AT_ATIME|AT_MTIME)) {
2283 /* Check that the time value is within ufs range */
2284 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2285 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2286 error = EOVERFLOW;
2287 goto update_inode;
2288 }
2289
2290 /*
2291 * if the "noatime" mount option is set and only atime
2292 * update is requested, do nothing. No error is returned.
2293 */
2294 if ((ufsvfsp->vfs_noatime) &&
2295 ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2296 goto skip_atime;
2297
2298 if (mask & AT_ATIME) {
2299 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2300 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2301 ip->i_flag &= ~IACC;
2302 }
2303 if (mask & AT_MTIME) {
2304 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2305 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2306 gethrestime(&now);
2307 if (now.tv_sec > TIME32_MAX) {
2308 /*
2309 * In 2038, ctime sticks forever.
2310 */
2311 ip->i_ctime.tv_sec = TIME32_MAX;
2312 ip->i_ctime.tv_usec = 0;
2313 } else {
2314 ip->i_ctime.tv_sec = now.tv_sec;
2315 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2316 }
2317 ip->i_flag &= ~(IUPD|ICHG);
2318 ip->i_flag |= IMODTIME;
2319 }
2320 TRANS_INODE(ufsvfsp, ip);
2321 ip->i_flag |= IMOD;
2322 }
2323
2324 skip_atime:
2325 /*
2326 * The presence of a shadow inode may indicate an ACL, but does
2327 * not imply an ACL. Future FSD types should be handled here too
2328 * and check for the presence of the attribute-specific data
2329 * before referencing it.
2330 */
2331 if (ip->i_shadow) {
2332 /*
2333 * XXX if ufs_iupdat is changed to sandbagged write fix
2334 * ufs_acl_setattr to push ip to keep acls consistent
2335 *
2336 * Suppress out of inodes messages if we will retry.
2337 */
2338 if (retry)
2339 ip->i_flag |= IQUIET;
2340 error = ufs_acl_setattr(ip, vap, cr);
2341 ip->i_flag &= ~IQUIET;
2342 }
2343
2344 update_inode:
2345 /*
2346 * Setattr always increases the sequence number
2347 */
2348 ip->i_seq++;
2349
2350 /*
2351 * if nfsd and not logging, push synchronously
2352 */
2353 if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2354 ufs_iupdat(ip, 1);
2355 } else {
2356 ITIMES_NOLOCK(ip);
2357 }
2358
2359 rw_exit(&ip->i_contents);
2360 if (dodqlock) {
2361 rw_exit(&ufsvfsp->vfs_dqrwlock);
2362 }
2363 if (dorwlock)
2364 rw_exit(&ip->i_rwlock);
2365
2366 if (ulp) {
2367 if (dotrans) {
2368 int terr = 0;
2369 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2370 trans_size);
2371 if (error == 0)
2372 error = terr;
2373 }
2374 ufs_lockfs_end(ulp);
2375 }
2376 out:
2377 /*
2378 * If out of inodes or blocks, see if we can free something
2379 * up from the delete queue.
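 * Only a single retry is attempted: 'retry' is cleared before jumping
 * back to 'again', and the path is taken only when logging is enabled
 * (TRANS_ISTRANS).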
2380 */ 2381 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 2382 ufs_delete_drain_wait(ufsvfsp, 1); 2383 retry = 0; 2384 if (errmsg1 != NULL) 2385 kmem_free(errmsg1, len1); 2386 if (errmsg2 != NULL) 2387 kmem_free(errmsg2, len2); 2388 goto again; 2389 } 2390 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_END, 2391 "ufs_setattr_end:vp %p error %d", vp, error); 2392 if (errmsg1 != NULL) { 2393 uprintf(errmsg1); 2394 kmem_free(errmsg1, len1); 2395 } 2396 if (errmsg2 != NULL) { 2397 uprintf(errmsg2); 2398 kmem_free(errmsg2, len2); 2399 } 2400 return (error); 2401 } 2402 2403 /*ARGSUSED*/ 2404 static int 2405 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr) 2406 { 2407 struct inode *ip = VTOI(vp); 2408 int error; 2409 2410 TRACE_3(TR_FAC_UFS, TR_UFS_ACCESS_START, 2411 "ufs_access_start:vp %p mode %x flags %x", vp, mode, flags); 2412 2413 if (ip->i_ufsvfs == NULL) 2414 return (EIO); 2415 2416 rw_enter(&ip->i_contents, RW_READER); 2417 2418 /* 2419 * The ufs_iaccess function wants to be called with 2420 * mode bits expressed as "ufs specific" bits. 2421 * I.e., VWRITE|VREAD|VEXEC do not make sense to 2422 * ufs_iaccess() but IWRITE|IREAD|IEXEC do. 2423 * But since they're the same we just pass the vnode mode 2424 * bit but just verify that assumption at compile time. 2425 */ 2426 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC 2427 #error "ufs_access needs to map Vmodes to Imodes" 2428 #endif 2429 error = ufs_iaccess(ip, mode, cr); 2430 2431 rw_exit(&ip->i_contents); 2432 2433 TRACE_2(TR_FAC_UFS, TR_UFS_ACCESS_END, 2434 "ufs_access_end:vp %p error %d", vp, error); 2435 return (error); 2436 } 2437 2438 /* ARGSUSED */ 2439 static int 2440 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr) 2441 { 2442 struct inode *ip = VTOI(vp); 2443 struct ufsvfs *ufsvfsp; 2444 struct ulockfs *ulp; 2445 int error; 2446 int fastsymlink; 2447 2448 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_START, 2449 "ufs_readlink_start:vp %p uiop %p", uiop, vp); 2450 2451 if (vp->v_type != VLNK) { 2452 error = EINVAL; 2453 goto nolockout; 2454 } 2455 2456 /* 2457 * If the symbolic link is empty there is nothing to read. 
2458 * Fast-track these empty symbolic links 2459 */ 2460 if (ip->i_size == 0) { 2461 error = 0; 2462 goto nolockout; 2463 } 2464 2465 ufsvfsp = ip->i_ufsvfs; 2466 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK); 2467 if (error) 2468 goto nolockout; 2469 /* 2470 * The ip->i_rwlock protects the data blocks used for FASTSYMLINK 2471 */ 2472 again: 2473 fastsymlink = 0; 2474 if (ip->i_flag & IFASTSYMLNK) { 2475 rw_enter(&ip->i_rwlock, RW_READER); 2476 rw_enter(&ip->i_contents, RW_READER); 2477 if (ip->i_flag & IFASTSYMLNK) { 2478 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 2479 (ip->i_fs->fs_ronly == 0) && 2480 (!ufsvfsp->vfs_noatime)) { 2481 mutex_enter(&ip->i_tlock); 2482 ip->i_flag |= IACC; 2483 mutex_exit(&ip->i_tlock); 2484 } 2485 error = uiomove((caddr_t)&ip->i_db[1], 2486 MIN(ip->i_size, uiop->uio_resid), 2487 UIO_READ, uiop); 2488 ITIMES(ip); 2489 ++fastsymlink; 2490 } 2491 rw_exit(&ip->i_contents); 2492 rw_exit(&ip->i_rwlock); 2493 } 2494 if (!fastsymlink) { 2495 ssize_t size; /* number of bytes read */ 2496 caddr_t basep; /* pointer to input data */ 2497 ino_t ino; 2498 long igen; 2499 struct uio tuio; /* temp uio struct */ 2500 struct uio *tuiop; 2501 iovec_t tiov; /* temp iovec struct */ 2502 char kbuf[FSL_SIZE]; /* buffer to hold fast symlink */ 2503 int tflag = 0; /* flag to indicate temp vars used */ 2504 2505 ino = ip->i_number; 2506 igen = ip->i_gen; 2507 size = uiop->uio_resid; 2508 basep = uiop->uio_iov->iov_base; 2509 tuiop = uiop; 2510 2511 rw_enter(&ip->i_rwlock, RW_WRITER); 2512 rw_enter(&ip->i_contents, RW_WRITER); 2513 if (ip->i_flag & IFASTSYMLNK) { 2514 rw_exit(&ip->i_contents); 2515 rw_exit(&ip->i_rwlock); 2516 goto again; 2517 } 2518 2519 /* can this be a fast symlink and is it a user buffer? */ 2520 if (ip->i_size <= FSL_SIZE && 2521 (uiop->uio_segflg == UIO_USERSPACE || 2522 uiop->uio_segflg == UIO_USERISPACE)) { 2523 2524 bzero(&tuio, sizeof (struct uio)); 2525 /* 2526 * setup a kernel buffer to read link into. this 2527 * is to fix a race condition where the user buffer 2528 * got corrupted before copying it into the inode. 
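 *
 * In other words, when the link is small enough to become a fast
 * symlink (i_size <= FSL_SIZE) and the caller's buffer is in user
 * space, the data is staged through the local kbuf[]; only a fully
 * successful read is later kcopy()'d into i_db[1] and then
 * uiomove()'d out to the caller.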
2529 */ 2530 size = ip->i_size; 2531 tiov.iov_len = size; 2532 tiov.iov_base = kbuf; 2533 tuio.uio_iov = &tiov; 2534 tuio.uio_iovcnt = 1; 2535 tuio.uio_offset = uiop->uio_offset; 2536 tuio.uio_segflg = UIO_SYSSPACE; 2537 tuio.uio_fmode = uiop->uio_fmode; 2538 tuio.uio_extflg = uiop->uio_extflg; 2539 tuio.uio_limit = uiop->uio_limit; 2540 tuio.uio_resid = size; 2541 2542 basep = tuio.uio_iov->iov_base; 2543 tuiop = &tuio; 2544 tflag = 1; 2545 } 2546 2547 error = rdip(ip, tuiop, 0, cr); 2548 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) { 2549 rw_exit(&ip->i_contents); 2550 rw_exit(&ip->i_rwlock); 2551 goto out; 2552 } 2553 2554 if (tflag == 0) 2555 size -= uiop->uio_resid; 2556 2557 if ((tflag == 0 && ip->i_size <= FSL_SIZE && 2558 ip->i_size == size) || (tflag == 1 && 2559 tuio.uio_resid == 0)) { 2560 error = kcopy(basep, &ip->i_db[1], ip->i_size); 2561 if (error == 0) { 2562 ip->i_flag |= IFASTSYMLNK; 2563 /* 2564 * free page 2565 */ 2566 (void) VOP_PUTPAGE(ITOV(ip), 2567 (offset_t)0, PAGESIZE, 2568 (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC), 2569 cr); 2570 } else { 2571 int i; 2572 /* error, clear garbage left behind */ 2573 for (i = 1; i < NDADDR; i++) 2574 ip->i_db[i] = 0; 2575 for (i = 0; i < NIADDR; i++) 2576 ip->i_ib[i] = 0; 2577 } 2578 } 2579 if (tflag == 1) { 2580 /* now, copy it into the user buffer */ 2581 error = uiomove((caddr_t)kbuf, 2582 MIN(size, uiop->uio_resid), 2583 UIO_READ, uiop); 2584 } 2585 rw_exit(&ip->i_contents); 2586 rw_exit(&ip->i_rwlock); 2587 } 2588 out: 2589 if (ulp) { 2590 ufs_lockfs_end(ulp); 2591 } 2592 nolockout: 2593 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_END, 2594 "ufs_readlink_end:vp %p error %d", vp, error); 2595 2596 return (error); 2597 } 2598 2599 /* ARGSUSED */ 2600 static int 2601 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr) 2602 { 2603 struct inode *ip = VTOI(vp); 2604 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2605 struct ulockfs *ulp; 2606 int error; 2607 2608 TRACE_1(TR_FAC_UFS, TR_UFS_FSYNC_START, 2609 "ufs_fsync_start:vp %p", vp); 2610 2611 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK); 2612 if (error) 2613 return (error); 2614 2615 if (TRANS_ISTRANS(ufsvfsp)) { 2616 /* 2617 * First push out any data pages 2618 */ 2619 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2620 (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) { 2621 error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 2622 0, CRED()); 2623 if (error) 2624 goto out; 2625 } 2626 2627 /* 2628 * Delta any delayed inode times updates 2629 * and push inode to log. 2630 * All other inode deltas will have already been delta'd 2631 * and will be pushed during the commit. 2632 */ 2633 if (!(syncflag & FDSYNC) && 2634 ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) { 2635 if (ulp) { 2636 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC, 2637 TOP_SYNCIP_SIZE); 2638 } 2639 rw_enter(&ip->i_contents, RW_READER); 2640 mutex_enter(&ip->i_tlock); 2641 ip->i_flag &= ~IMODTIME; 2642 mutex_exit(&ip->i_tlock); 2643 ufs_iupdat(ip, I_SYNC); 2644 rw_exit(&ip->i_contents); 2645 if (ulp) { 2646 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC, 2647 TOP_SYNCIP_SIZE); 2648 } 2649 } 2650 2651 /* 2652 * Commit the Moby transaction 2653 * 2654 * Deltas have already been made so we just need to 2655 * commit them with a synchronous transaction. 2656 * TRANS_BEGIN_SYNC() will return an error 2657 * if there are no deltas to commit, for an 2658 * empty transaction. 
2659 */ 2660 if (ulp) { 2661 TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE, 2662 error); 2663 if (error) { 2664 error = 0; /* commit wasn't needed */ 2665 goto out; 2666 } 2667 TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC, 2668 TOP_COMMIT_SIZE); 2669 } 2670 } else { /* not logging */ 2671 if (!(IS_SWAPVP(vp))) 2672 if (syncflag & FNODSYNC) { 2673 /* Just update the inode only */ 2674 TRANS_IUPDAT(ip, 1); 2675 error = 0; 2676 } else if (syncflag & FDSYNC) 2677 /* Do data-synchronous writes */ 2678 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC); 2679 else 2680 /* Do synchronous writes */ 2681 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC); 2682 2683 rw_enter(&ip->i_contents, RW_WRITER); 2684 if (!error) 2685 error = ufs_sync_indir(ip); 2686 rw_exit(&ip->i_contents); 2687 } 2688 out: 2689 if (ulp) { 2690 ufs_lockfs_end(ulp); 2691 } 2692 TRACE_2(TR_FAC_UFS, TR_UFS_FSYNC_END, 2693 "ufs_fsync_end:vp %p error %d", vp, error); 2694 return (error); 2695 } 2696 2697 /*ARGSUSED*/ 2698 static void 2699 ufs_inactive(struct vnode *vp, struct cred *cr) 2700 { 2701 ufs_iinactive(VTOI(vp)); 2702 } 2703 2704 /* 2705 * Unix file system operations having to do with directory manipulation. 2706 */ 2707 int ufs_lookup_idle_count = 2; /* Number of inodes to idle each time */ 2708 /* ARGSUSED */ 2709 static int 2710 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 2711 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr) 2712 { 2713 struct inode *ip; 2714 struct inode *sip; 2715 struct inode *xip; 2716 struct ufsvfs *ufsvfsp; 2717 struct ulockfs *ulp; 2718 struct vnode *vp; 2719 int error; 2720 2721 TRACE_2(TR_FAC_UFS, TR_UFS_LOOKUP_START, 2722 "ufs_lookup_start:dvp %p name %s", dvp, nm); 2723 2724 2725 /* 2726 * Check flags for type of lookup (regular file or attribute file) 2727 */ 2728 2729 ip = VTOI(dvp); 2730 2731 if (flags & LOOKUP_XATTR) { 2732 2733 /* 2734 * We don't allow recursive attributes... 2735 * Maybe someday we will. 2736 */ 2737 if ((ip->i_cflags & IXATTR)) { 2738 return (EINVAL); 2739 } 2740 2741 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) { 2742 error = ufs_xattr_getattrdir(dvp, &sip, flags, cr); 2743 if (error) { 2744 *vpp = NULL; 2745 goto out; 2746 } 2747 2748 vp = ITOV(sip); 2749 dnlc_update(dvp, XATTR_DIR_NAME, vp); 2750 } 2751 2752 /* 2753 * Check accessibility of directory. 2754 */ 2755 if (vp == DNLC_NO_VNODE) { 2756 VN_RELE(vp); 2757 error = ENOENT; 2758 goto out; 2759 } 2760 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr)) != 0) { 2761 VN_RELE(vp); 2762 goto out; 2763 } 2764 2765 *vpp = vp; 2766 return (0); 2767 } 2768 2769 /* 2770 * Check for a null component, which we should treat as 2771 * looking at dvp from within it's parent, so we don't 2772 * need a call to ufs_iaccess(), as it has already been 2773 * done. 2774 */ 2775 if (nm[0] == 0) { 2776 VN_HOLD(dvp); 2777 error = 0; 2778 *vpp = dvp; 2779 goto out; 2780 } 2781 2782 /* 2783 * Check for "." ie itself. this is a quick check and 2784 * avoids adding "." into the dnlc (which have been seen 2785 * to occupy >10% of the cache). 2786 */ 2787 if ((nm[0] == '.') && (nm[1] == 0)) { 2788 /* 2789 * Don't return without checking accessibility 2790 * of the directory. We only need the lock if 2791 * we are going to return it. 2792 */ 2793 if ((error = ufs_iaccess(ip, IEXEC, cr)) == 0) { 2794 VN_HOLD(dvp); 2795 *vpp = dvp; 2796 } 2797 goto out; 2798 } 2799 2800 /* 2801 * Fast path: Check the directory name lookup cache. 
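 * A hit returns a held vnode, or the negative-entry marker
 * DNLC_NO_VNODE which is mapped to ENOENT below; either way the
 * IEXEC check on the parent directory is still made, so the cache
 * never bypasses permission checking.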
2802 */ 2803 if (vp = dnlc_lookup(dvp, nm)) { 2804 /* 2805 * Check accessibility of directory. 2806 */ 2807 if ((error = ufs_iaccess(ip, IEXEC, cr)) != 0) { 2808 VN_RELE(vp); 2809 goto out; 2810 } 2811 if (vp == DNLC_NO_VNODE) { 2812 VN_RELE(vp); 2813 error = ENOENT; 2814 goto out; 2815 } 2816 xip = VTOI(vp); 2817 ulp = NULL; 2818 goto fastpath; 2819 } 2820 2821 /* 2822 * Keep the idle queue from getting too long by 2823 * idling two inodes before attempting to allocate another. 2824 * This operation must be performed before entering 2825 * lockfs or a transaction. 2826 */ 2827 if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat) 2828 if ((curthread->t_flag & T_DONTBLOCK) == 0) { 2829 ins.in_lidles.value.ul += ufs_lookup_idle_count; 2830 ufs_idle_some(ufs_lookup_idle_count); 2831 } 2832 2833 retry_lookup: 2834 ufsvfsp = ip->i_ufsvfs; 2835 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK); 2836 if (error) 2837 goto out; 2838 2839 error = ufs_dirlook(ip, nm, &xip, cr, 1); 2840 2841 fastpath: 2842 if (error == 0) { 2843 ip = xip; 2844 *vpp = ITOV(ip); 2845 2846 /* 2847 * If vnode is a device return special vnode instead. 2848 */ 2849 if (IS_DEVVP(*vpp)) { 2850 struct vnode *newvp; 2851 2852 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, 2853 cr); 2854 VN_RELE(*vpp); 2855 if (newvp == NULL) 2856 error = ENOSYS; 2857 else 2858 *vpp = newvp; 2859 } 2860 } 2861 if (ulp) { 2862 ufs_lockfs_end(ulp); 2863 } 2864 2865 if (error == EAGAIN) 2866 goto retry_lookup; 2867 2868 out: 2869 TRACE_3(TR_FAC_UFS, TR_UFS_LOOKUP_END, 2870 "ufs_lookup_end:dvp %p name %s error %d", vpp, nm, error); 2871 return (error); 2872 } 2873 2874 static int 2875 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl, 2876 int mode, struct vnode **vpp, struct cred *cr, int flag) 2877 { 2878 struct inode *ip; 2879 struct inode *xip; 2880 struct inode *dip; 2881 struct vnode *xvp; 2882 struct ufsvfs *ufsvfsp; 2883 struct ulockfs *ulp; 2884 int error; 2885 int issync; 2886 int truncflag; 2887 int trans_size; 2888 int noentry; 2889 int defer_dip_seq_update = 0; /* need to defer update of dip->i_seq */ 2890 int retry = 1; 2891 int indeadlock; 2892 2893 TRACE_1(TR_FAC_UFS, TR_UFS_CREATE_START, 2894 "ufs_create_start:dvp %p", dvp); 2895 2896 again: 2897 ip = VTOI(dvp); 2898 ufsvfsp = ip->i_ufsvfs; 2899 truncflag = 0; 2900 2901 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK); 2902 if (error) 2903 goto out; 2904 2905 if (ulp) { 2906 trans_size = (int)TOP_CREATE_SIZE(ip); 2907 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size); 2908 } 2909 2910 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 2911 vap->va_mode &= ~VSVTX; 2912 2913 if (*name == '\0') { 2914 /* 2915 * Null component name refers to the directory itself. 2916 */ 2917 VN_HOLD(dvp); 2918 /* 2919 * Even though this is an error case, we need to grab the 2920 * quota lock since the error handling code below is common. 2921 */ 2922 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2923 rw_enter(&ip->i_contents, RW_WRITER); 2924 error = EEXIST; 2925 } else { 2926 xip = NULL; 2927 noentry = 0; 2928 /* 2929 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 2930 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 2931 * possible, retries the operation. 
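 *
 * When the macro does report a potential deadlock through
 * 'indeadlock', it appears to have already ended the csync
 * transaction and dropped the lockfs hold: the caller below jumps
 * straight back to 'again' and restarts from ufs_lockfs_begin
 * without any explicit cleanup.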
2932 */ 2933 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE, 2934 retry_dir); 2935 if (indeadlock) 2936 goto again; 2937 2938 xvp = dnlc_lookup(dvp, name); 2939 if (xvp == DNLC_NO_VNODE) { 2940 noentry = 1; 2941 VN_RELE(xvp); 2942 xvp = NULL; 2943 } 2944 if (xvp) { 2945 rw_exit(&ip->i_rwlock); 2946 if (error = ufs_iaccess(ip, IEXEC, cr)) { 2947 VN_RELE(xvp); 2948 } else { 2949 error = EEXIST; 2950 xip = VTOI(xvp); 2951 } 2952 } else { 2953 /* 2954 * Suppress file system full message if we will retry 2955 */ 2956 error = ufs_direnter_cm(ip, name, DE_CREATE, 2957 vap, &xip, cr, 2958 (noentry | (retry ? IQUIET : 0))); 2959 if (error == EAGAIN) { 2960 if (ulp) { 2961 TRANS_END_CSYNC(ufsvfsp, error, issync, 2962 TOP_CREATE, trans_size); 2963 ufs_lockfs_end(ulp); 2964 } 2965 goto again; 2966 } 2967 rw_exit(&ip->i_rwlock); 2968 } 2969 ip = xip; 2970 if (ip != NULL) { 2971 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2972 rw_enter(&ip->i_contents, RW_WRITER); 2973 } 2974 } 2975 2976 /* 2977 * If the file already exists and this is a non-exclusive create, 2978 * check permissions and allow access for non-directories. 2979 * Read-only create of an existing directory is also allowed. 2980 * We fail an exclusive create of anything which already exists. 2981 */ 2982 if (error == EEXIST) { 2983 dip = VTOI(dvp); 2984 if (excl == NONEXCL) { 2985 if ((((ip->i_mode & IFMT) == IFDIR) || 2986 ((ip->i_mode & IFMT) == IFATTRDIR)) && 2987 (mode & IWRITE)) 2988 error = EISDIR; 2989 else if (mode) 2990 error = ufs_iaccess(ip, mode, cr); 2991 else 2992 error = 0; 2993 } 2994 if (error) { 2995 rw_exit(&ip->i_contents); 2996 rw_exit(&ufsvfsp->vfs_dqrwlock); 2997 VN_RELE(ITOV(ip)); 2998 goto unlock; 2999 } 3000 /* 3001 * If the error EEXIST was set, then i_seq can not 3002 * have been updated. The sequence number interface 3003 * is defined such that a non-error VOP_CREATE must 3004 * increase the dir va_seq it by at least one. If we 3005 * have cleared the error, increase i_seq. Note that 3006 * we are increasing the dir i_seq and in rare cases 3007 * ip may actually be from the dvp, so we already have 3008 * the locks and it will not be subject to truncation. 3009 * In case we have to update i_seq of the parent 3010 * directory dip, we have to defer it till we have 3011 * released our locks on ip due to lock ordering requirements. 3012 */ 3013 if (ip != dip) 3014 defer_dip_seq_update = 1; 3015 else 3016 ip->i_seq++; 3017 3018 if (((ip->i_mode & IFMT) == IFREG) && 3019 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 3020 /* 3021 * Truncate regular files, if requested by caller. 3022 * Grab i_rwlock to make sure no one else is 3023 * currently writing to the file (we promised 3024 * bmap we would do this). 3025 * Must get the locks in the correct order. 3026 */ 3027 if (ip->i_size == 0) { 3028 ip->i_flag |= ICHG | IUPD; 3029 ip->i_seq++; 3030 TRANS_INODE(ufsvfsp, ip); 3031 } else { 3032 /* 3033 * Large Files: Why this check here? 3034 * Though we do it in vn_create() we really 3035 * want to guarantee that we do not destroy 3036 * Large file data by atomically checking 3037 * the size while holding the contents 3038 * lock. 
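 *
 * As a hypothetical example: a 32-bit application that is not
 * large-file aware (so FOFFMAX is not set in 'flag') opening an
 * existing file larger than MAXOFF32_T with O_TRUNC would otherwise
 * destroy data it cannot even address; the check below fails such a
 * create with EOVERFLOW instead.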
3039 */ 3040 if (flag && !(flag & FOFFMAX) && 3041 ((ip->i_mode & IFMT) == IFREG) && 3042 (ip->i_size > (offset_t)MAXOFF32_T)) { 3043 rw_exit(&ip->i_contents); 3044 rw_exit(&ufsvfsp->vfs_dqrwlock); 3045 error = EOVERFLOW; 3046 goto unlock; 3047 } 3048 if (TRANS_ISTRANS(ufsvfsp)) 3049 truncflag++; 3050 else { 3051 rw_exit(&ip->i_contents); 3052 rw_exit(&ufsvfsp->vfs_dqrwlock); 3053 ufs_tryirwlock_trans(&ip->i_rwlock, 3054 RW_WRITER, TOP_CREATE, 3055 retry_file); 3056 if (indeadlock) { 3057 VN_RELE(ITOV(ip)); 3058 goto again; 3059 } 3060 rw_enter(&ufsvfsp->vfs_dqrwlock, 3061 RW_READER); 3062 rw_enter(&ip->i_contents, RW_WRITER); 3063 (void) ufs_itrunc(ip, (u_offset_t)0, 0, 3064 cr); 3065 rw_exit(&ip->i_rwlock); 3066 } 3067 } 3068 } 3069 } 3070 3071 if (error) { 3072 if (ip != NULL) { 3073 rw_exit(&ufsvfsp->vfs_dqrwlock); 3074 rw_exit(&ip->i_contents); 3075 } 3076 goto unlock; 3077 } 3078 3079 *vpp = ITOV(ip); 3080 ITIMES(ip); 3081 rw_exit(&ip->i_contents); 3082 rw_exit(&ufsvfsp->vfs_dqrwlock); 3083 3084 /* 3085 * If vnode is a device return special vnode instead. 3086 */ 3087 if (!error && IS_DEVVP(*vpp)) { 3088 struct vnode *newvp; 3089 3090 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 3091 VN_RELE(*vpp); 3092 if (newvp == NULL) { 3093 error = ENOSYS; 3094 goto unlock; 3095 } 3096 truncflag = 0; 3097 *vpp = newvp; 3098 } 3099 unlock: 3100 3101 /* 3102 * Do the deferred update of the parent directory's sequence 3103 * number now. 3104 */ 3105 if (defer_dip_seq_update == 1) { 3106 rw_enter(&dip->i_contents, RW_READER); 3107 mutex_enter(&dip->i_tlock); 3108 dip->i_seq++; 3109 mutex_exit(&dip->i_tlock); 3110 rw_exit(&dip->i_contents); 3111 } 3112 3113 if (ulp) { 3114 int terr = 0; 3115 3116 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE, 3117 trans_size); 3118 3119 /* 3120 * If we haven't had a more interesting failure 3121 * already, then anything that might've happened 3122 * here should be reported. 3123 */ 3124 if (error == 0) 3125 error = terr; 3126 } 3127 3128 if (!error && truncflag) { 3129 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc); 3130 if (indeadlock) { 3131 if (ulp) 3132 ufs_lockfs_end(ulp); 3133 VN_RELE(ITOV(ip)); 3134 goto again; 3135 } 3136 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr); 3137 rw_exit(&ip->i_rwlock); 3138 } 3139 3140 if (ulp) 3141 ufs_lockfs_end(ulp); 3142 3143 /* 3144 * If no inodes available, try to free one up out of the 3145 * pending delete queue. 
3146 */ 3147 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3148 ufs_delete_drain_wait(ufsvfsp, 1); 3149 retry = 0; 3150 goto again; 3151 } 3152 3153 out: 3154 TRACE_3(TR_FAC_UFS, TR_UFS_CREATE_END, 3155 "ufs_create_end:dvp %p name %s error %d", vpp, name, error); 3156 return (error); 3157 } 3158 3159 extern int ufs_idle_max; 3160 /*ARGSUSED*/ 3161 static int 3162 ufs_remove(struct vnode *vp, char *nm, struct cred *cr) 3163 { 3164 struct inode *ip = VTOI(vp); 3165 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3166 struct ulockfs *ulp; 3167 vnode_t *rmvp = NULL; /* Vnode corresponding to name being removed */ 3168 int indeadlock; 3169 int error; 3170 int issync; 3171 int trans_size; 3172 3173 TRACE_1(TR_FAC_UFS, TR_UFS_REMOVE_START, 3174 "ufs_remove_start:vp %p", vp); 3175 3176 /* 3177 * don't let the delete queue get too long 3178 */ 3179 if (ufsvfsp == NULL) { 3180 error = EIO; 3181 goto out; 3182 } 3183 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3184 ufs_delete_drain(vp->v_vfsp, 1, 1); 3185 3186 retry_remove: 3187 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK); 3188 if (error) 3189 goto out; 3190 3191 if (ulp) 3192 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 3193 trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp))); 3194 3195 /* 3196 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3197 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3198 * possible, retries the operation. 3199 */ 3200 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry); 3201 if (indeadlock) 3202 goto retry_remove; 3203 error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0, 3204 DR_REMOVE, cr, &rmvp); 3205 rw_exit(&ip->i_rwlock); 3206 3207 if (ulp) { 3208 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size); 3209 ufs_lockfs_end(ulp); 3210 } 3211 3212 /* 3213 * This must be called after the remove transaction is closed. 3214 */ 3215 if (rmvp != NULL) { 3216 /* Only send the event if there were no errors */ 3217 if (error == 0) 3218 vnevent_remove(rmvp); 3219 VN_RELE(rmvp); 3220 } 3221 out: 3222 TRACE_3(TR_FAC_UFS, TR_UFS_REMOVE_END, 3223 "ufs_remove_end:vp %p name %s error %d", vp, nm, error); 3224 return (error); 3225 } 3226 3227 /* 3228 * Link a file or a directory. Only privileged processes are allowed to 3229 * make links to directories. 3230 */ 3231 static int 3232 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr) 3233 { 3234 struct inode *sip; 3235 struct inode *tdp = VTOI(tdvp); 3236 struct ufsvfs *ufsvfsp = tdp->i_ufsvfs; 3237 struct ulockfs *ulp; 3238 struct vnode *realvp; 3239 int error; 3240 int issync; 3241 int trans_size; 3242 int isdev; 3243 int indeadlock; 3244 3245 TRACE_1(TR_FAC_UFS, TR_UFS_LINK_START, 3246 "ufs_link_start:tdvp %p", tdvp); 3247 3248 retry_link: 3249 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK); 3250 if (error) 3251 goto out; 3252 3253 if (ulp) 3254 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK, 3255 trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp))); 3256 3257 if (VOP_REALVP(svp, &realvp) == 0) 3258 svp = realvp; 3259 3260 /* 3261 * Make sure link for extended attributes is valid 3262 * We only support hard linking of attr in ATTRDIR to ATTRDIR 3263 * 3264 * Make certain we don't attempt to look at a device node as 3265 * a ufs inode. 
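 *
 * Concretely, the test below rejects linking a non-IXATTR source
 * into an extended attribute directory (IFATTRDIR) and linking an
 * IXATTR source into an ordinary directory (IFDIR); device special
 * vnodes are exempted because VTOI() must not be applied to them.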
3266 */ 3267 3268 isdev = IS_DEVVP(svp); 3269 if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) && 3270 ((tdp->i_mode & IFMT) == IFATTRDIR)) || 3271 ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) && 3272 ((tdp->i_mode & IFMT) == IFDIR))) { 3273 error = EINVAL; 3274 goto unlock; 3275 } 3276 3277 sip = VTOI(svp); 3278 if ((svp->v_type == VDIR && 3279 secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) || 3280 (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) { 3281 error = EPERM; 3282 goto unlock; 3283 } 3284 3285 /* 3286 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3287 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3288 * possible, retries the operation. 3289 */ 3290 ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry); 3291 if (indeadlock) 3292 goto retry_link; 3293 error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0, 3294 sip, cr, NULL); 3295 rw_exit(&tdp->i_rwlock); 3296 3297 unlock: 3298 if (ulp) { 3299 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size); 3300 ufs_lockfs_end(ulp); 3301 } 3302 out: 3303 TRACE_2(TR_FAC_UFS, TR_UFS_LINK_END, 3304 "ufs_link_end:tdvp %p error %d", tdvp, error); 3305 return (error); 3306 } 3307 3308 uint64_t ufs_rename_retry_cnt; 3309 uint64_t ufs_rename_upgrade_retry_cnt; 3310 uint64_t ufs_rename_dircheck_retry_cnt; 3311 clock_t ufs_rename_backoff_delay = 1; 3312 3313 /* 3314 * Rename a file or directory. 3315 * We are given the vnode and entry string of the source and the 3316 * vnode and entry string of the place we want to move the source 3317 * to (the target). The essential operation is: 3318 * unlink(target); 3319 * link(source, target); 3320 * unlink(source); 3321 * but "atomically". Can't do full commit without saving state in 3322 * the inode on disk, which isn't feasible at this time. Best we 3323 * can do is always guarantee that the TARGET exists. 3324 */ 3325 3326 /*ARGSUSED*/ 3327 static int 3328 ufs_rename( 3329 struct vnode *sdvp, /* old (source) parent vnode */ 3330 char *snm, /* old (source) entry name */ 3331 struct vnode *tdvp, /* new (target) parent vnode */ 3332 char *tnm, /* new (target) entry name */ 3333 struct cred *cr) 3334 { 3335 struct inode *sip = NULL; /* source inode */ 3336 struct inode *ip = NULL; /* check inode */ 3337 struct inode *sdp; /* old (source) parent inode */ 3338 struct inode *tdp; /* new (target) parent inode */ 3339 struct vnode *tvp = NULL; /* target vnode, if it exists */ 3340 struct vnode *realvp; 3341 struct ufsvfs *ufsvfsp; 3342 struct ulockfs *ulp; 3343 struct ufs_slot slot; 3344 timestruc_t now; 3345 int error; 3346 int issync; 3347 int trans_size; 3348 krwlock_t *first_lock; 3349 krwlock_t *second_lock; 3350 krwlock_t *reverse_lock; 3351 3352 TRACE_1(TR_FAC_UFS, TR_UFS_RENAME_START, 3353 "ufs_rename_start:sdvp %p", sdvp); 3354 3355 3356 sdp = VTOI(sdvp); 3357 slot.fbp = NULL; 3358 ufsvfsp = sdp->i_ufsvfs; 3359 retry_rename: 3360 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); 3361 if (error) 3362 goto out; 3363 3364 if (ulp) 3365 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, 3366 trans_size = (int)TOP_RENAME_SIZE(sdp)); 3367 3368 if (VOP_REALVP(tdvp, &realvp) == 0) 3369 tdvp = realvp; 3370 3371 tdp = VTOI(tdvp); 3372 3373 3374 /* 3375 * We only allow renaming of attributes from ATTRDIR to ATTRDIR. 3376 */ 3377 if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) { 3378 error = EINVAL; 3379 goto unlock; 3380 } 3381 3382 /* 3383 * Look up inode of file we're supposed to rename. 
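 *
 * The gethrestime() snapshot taken just below is compared with sip's
 * ctime after every lock has finally been acquired; if the source
 * inode changed in the meantime, the source directory is searched
 * again (ufs_dircheckforname) to confirm the entry is still present
 * before the rename goes ahead.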
3384 */
3385 gethrestime(&now);
3386 if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) {
3387 if (error == EAGAIN) {
3388 if (ulp) {
3389 TRANS_END_CSYNC(ufsvfsp, error, issync,
3390 TOP_RENAME, trans_size);
3391 ufs_lockfs_end(ulp);
3392 }
3393 goto retry_rename;
3394 }
3395
3396 goto unlock;
3397 }
3398
3399 /*
3400 * Lock both the source and target directories (they may be
3401 * the same) to provide the atomicity semantics that were
3402 * previously provided by the per file system vfs_rename_lock.
3403 *
3404 * With vfs_rename_lock removed to allow simultaneous renames
3405 * within a file system, ufs_dircheckpath can deadlock while
3406 * traversing back to ensure that source is not a parent directory
3407 * of target parent directory. This is because we get into
3408 * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3409 * If the tdp and sdp of the simultaneous renames happen to be
3410 * in the path of each other, it can lead to a deadlock. This
3411 * can be avoided by getting the locks as RW_READER here and then
3412 * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3413 *
3414 * We hold the target directory's i_rwlock after calling
3415 * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3416 * VOP_RWLOCK is explicitly called by the filesystem independent code
3417 * before calling the file system operation. In these cases the order
3418 * is reversed (i.e. i_rwlock is taken first and then ufs_lockfs_begin
3419 * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3420 * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3421 * synchronizing object which might lead to a deadlock. So we use
3422 * rw_tryenter instead of rw_enter. If we fail to get this lock and
3423 * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3424 * operation.
3425 */
3426 retry:
3427 first_lock = &tdp->i_rwlock;
3428 second_lock = &sdp->i_rwlock;
3429 retry_firstlock:
3430 if (!rw_tryenter(first_lock, RW_READER)) {
3431 /*
3432 * We didn't get the lock. Check if the SLOCK is set in the
3433 * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3434 * and wait for SLOCK to be cleared.
3435 */
3436
3437 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3438 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3439 trans_size);
3440 ufs_lockfs_end(ulp);
3441 goto retry_rename;
3442
3443 } else {
3444 /*
3445 * SLOCK isn't set so this is a genuine synchronization
3446 * case. Let's try again after giving them a breather.
3447 */
3448 delay(RETRY_LOCK_DELAY);
3449 goto retry_firstlock;
3450 }
3451 }
3452 /*
3453 * Need to check whether tdp and sdp are the same.
3454 */
3455 if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3456 /*
3457 * We didn't get the lock. Check if the SLOCK is set in the
3458 * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3459 * and wait for SLOCK to be cleared.
3460 */
3461
3462 rw_exit(first_lock);
3463 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3464 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3465 trans_size);
3466 ufs_lockfs_end(ulp);
3467 goto retry_rename;
3468
3469 } else {
3470 /*
3471 * So we couldn't get the second level peer lock *and*
3472 * the SLOCK bit isn't set. Too bad; we could be
3473 * contending with someone wanting these locks the other
3474 * way round. Reverse the locks in case there is heavy
3475 * contention for the second level lock.
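 * On the next pass the previously contended lock is therefore tried
 * first; ufs_rename_retry_cnt below keeps a count of how often this
 * reversal was needed.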
3476 */ 3477 reverse_lock = first_lock; 3478 first_lock = second_lock; 3479 second_lock = reverse_lock; 3480 ufs_rename_retry_cnt++; 3481 goto retry_firstlock; 3482 } 3483 } 3484 3485 if (sip == tdp) { 3486 error = EINVAL; 3487 goto errout; 3488 } 3489 /* 3490 * Make sure we can delete the source entry. This requires 3491 * write permission on the containing directory. 3492 * Check for sticky directories. 3493 */ 3494 rw_enter(&sdp->i_contents, RW_READER); 3495 rw_enter(&sip->i_contents, RW_READER); 3496 if ((error = ufs_iaccess(sdp, IWRITE, cr)) != 0 || 3497 (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) { 3498 rw_exit(&sip->i_contents); 3499 rw_exit(&sdp->i_contents); 3500 goto errout; 3501 } 3502 3503 /* 3504 * If this is a rename of a directory and the parent is 3505 * different (".." must be changed), then the source 3506 * directory must not be in the directory hierarchy 3507 * above the target, as this would orphan everything 3508 * below the source directory. Also the user must have 3509 * write permission in the source so as to be able to 3510 * change "..". 3511 */ 3512 if ((((sip->i_mode & IFMT) == IFDIR) || 3513 ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) { 3514 ino_t inum; 3515 3516 if ((error = ufs_iaccess(sip, IWRITE, cr))) { 3517 rw_exit(&sip->i_contents); 3518 rw_exit(&sdp->i_contents); 3519 goto errout; 3520 } 3521 inum = sip->i_number; 3522 rw_exit(&sip->i_contents); 3523 rw_exit(&sdp->i_contents); 3524 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) { 3525 /* 3526 * If we got EAGAIN ufs_dircheckpath detected a 3527 * potential deadlock and backed out. We need 3528 * to retry the operation since sdp and tdp have 3529 * to be released to avoid the deadlock. 3530 */ 3531 if (error == EAGAIN) { 3532 rw_exit(&tdp->i_rwlock); 3533 if (tdp != sdp) 3534 rw_exit(&sdp->i_rwlock); 3535 delay(ufs_rename_backoff_delay); 3536 ufs_rename_dircheck_retry_cnt++; 3537 goto retry; 3538 } 3539 goto errout; 3540 } 3541 } else { 3542 rw_exit(&sip->i_contents); 3543 rw_exit(&sdp->i_contents); 3544 } 3545 3546 3547 /* 3548 * Check for renaming '.' or '..' or alias of '.' 3549 */ 3550 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) { 3551 error = EINVAL; 3552 goto errout; 3553 } 3554 3555 /* 3556 * Simultaneous renames can deadlock in ufs_dircheckpath since it 3557 * tries to traverse back the file tree with both tdp and sdp held 3558 * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks 3559 * as RW_READERS till ufs_dircheckpath is done. 3560 * Now that ufs_dircheckpath is done with, we can upgrade the locks 3561 * to RW_WRITER. 3562 */ 3563 if (!rw_tryupgrade(&tdp->i_rwlock)) { 3564 /* 3565 * The upgrade failed. We got to give away the lock 3566 * as to avoid deadlocking with someone else who is 3567 * waiting for writer lock. With the lock gone, we 3568 * cannot be sure the checks done above will hold 3569 * good when we eventually get them back as writer. 3570 * So if we can't upgrade we drop the locks and retry 3571 * everything again. 3572 */ 3573 rw_exit(&tdp->i_rwlock); 3574 if (tdp != sdp) 3575 rw_exit(&sdp->i_rwlock); 3576 delay(ufs_rename_backoff_delay); 3577 ufs_rename_upgrade_retry_cnt++; 3578 goto retry; 3579 } 3580 if (tdp != sdp) { 3581 if (!rw_tryupgrade(&sdp->i_rwlock)) { 3582 /* 3583 * The upgrade failed. We got to give away the lock 3584 * as to avoid deadlocking with someone else who is 3585 * waiting for writer lock. 
With the lock gone, we 3586 * cannot be sure the checks done above will hold 3587 * good when we eventually get them back as writer. 3588 * So if we can't upgrade we drop the locks and retry 3589 * everything again. 3590 */ 3591 rw_exit(&tdp->i_rwlock); 3592 rw_exit(&sdp->i_rwlock); 3593 delay(ufs_rename_backoff_delay); 3594 ufs_rename_upgrade_retry_cnt++; 3595 goto retry; 3596 } 3597 } 3598 3599 /* 3600 * Now that all the locks are held check to make sure another thread 3601 * didn't slip in and take out the sip. 3602 */ 3603 slot.status = NONE; 3604 if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec || 3605 sip->i_ctime.tv_sec > now.tv_sec) { 3606 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 3607 rw_enter(&sdp->i_contents, RW_WRITER); 3608 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot, 3609 &ip, cr, 0); 3610 rw_exit(&sdp->i_contents); 3611 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock); 3612 if (error) { 3613 goto errout; 3614 } 3615 if (ip == NULL) { 3616 error = ENOENT; 3617 goto errout; 3618 } else { 3619 /* 3620 * If the inode was found need to drop the v_count 3621 * so as not to keep the filesystem from being 3622 * unmounted at a later time. 3623 */ 3624 VN_RELE(ITOV(ip)); 3625 } 3626 3627 /* 3628 * Release the slot.fbp that has the page mapped and 3629 * locked SE_SHARED, and could be used in in 3630 * ufs_direnter_lr() which needs to get the SE_EXCL lock 3631 * on said page. 3632 */ 3633 if (slot.fbp) { 3634 fbrelse(slot.fbp, S_OTHER); 3635 slot.fbp = NULL; 3636 } 3637 } 3638 3639 /* 3640 * Link source to the target. If a target exists, return its 3641 * vnode pointer in tvp. We'll release it after sending the 3642 * vnevent. 3643 */ 3644 if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) { 3645 /* 3646 * ESAME isn't really an error; it indicates that the 3647 * operation should not be done because the source and target 3648 * are the same file, but that no error should be reported. 3649 */ 3650 if (error == ESAME) 3651 error = 0; 3652 goto errout; 3653 } 3654 3655 /* 3656 * Unlink the source. 3657 * Remove the source entry. ufs_dirremove() checks that the entry 3658 * still reflects sip, and returns an error if it doesn't. 3659 * If the entry has changed just forget about it. Release 3660 * the source inode. 3661 */ 3662 if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0, 3663 DR_RENAME, cr, NULL)) == ENOENT) 3664 error = 0; 3665 3666 errout: 3667 if (slot.fbp) 3668 fbrelse(slot.fbp, S_OTHER); 3669 3670 rw_exit(&tdp->i_rwlock); 3671 if (sdp != tdp) { 3672 rw_exit(&sdp->i_rwlock); 3673 } 3674 3675 unlock: 3676 if (ulp) { 3677 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); 3678 ufs_lockfs_end(ulp); 3679 } 3680 3681 /* 3682 * If no errors, send the appropriate events on the source 3683 * and destination (a.k.a, target) vnodes, if they exist. 3684 * This has to be done after the rename transaction has closed. 3685 */ 3686 if (error == 0) { 3687 if (tvp != NULL) 3688 vnevent_rename_dest(tvp); 3689 /* 3690 * Note that if ufs_direnter_lr() returned ESAME then 3691 * this event will still be sent. This isn't expected 3692 * to be a problem for anticipated usage by consumers. 
3693 */ 3694 if (sip != NULL) 3695 vnevent_rename_src(ITOV(sip)); 3696 } 3697 3698 if (tvp != NULL) 3699 VN_RELE(tvp); 3700 3701 if (sip != NULL) 3702 VN_RELE(ITOV(sip)); 3703 3704 out: 3705 TRACE_5(TR_FAC_UFS, TR_UFS_RENAME_END, 3706 "ufs_rename_end:sdvp %p snm %s tdvp %p tnm %s error %d", 3707 sdvp, snm, tdvp, tnm, error); 3708 return (error); 3709 } 3710 3711 /*ARGSUSED*/ 3712 static int 3713 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap, 3714 struct vnode **vpp, struct cred *cr) 3715 { 3716 struct inode *ip; 3717 struct inode *xip; 3718 struct ufsvfs *ufsvfsp; 3719 struct ulockfs *ulp; 3720 int error; 3721 int issync; 3722 int trans_size; 3723 int indeadlock; 3724 int retry = 1; 3725 3726 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 3727 3728 TRACE_1(TR_FAC_UFS, TR_UFS_MKDIR_START, 3729 "ufs_mkdir_start:dvp %p", dvp); 3730 3731 /* 3732 * Can't make directory in attr hidden dir 3733 */ 3734 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3735 return (EINVAL); 3736 3737 again: 3738 ip = VTOI(dvp); 3739 ufsvfsp = ip->i_ufsvfs; 3740 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3741 if (error) 3742 goto out; 3743 if (ulp) 3744 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, 3745 trans_size = (int)TOP_MKDIR_SIZE(ip)); 3746 3747 /* 3748 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3749 * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 3750 * possible, retries the operation. 3751 */ 3752 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry); 3753 if (indeadlock) 3754 goto again; 3755 3756 error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr, 3757 (retry ? IQUIET : 0)); 3758 if (error == EAGAIN) { 3759 if (ulp) { 3760 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR, 3761 trans_size); 3762 ufs_lockfs_end(ulp); 3763 } 3764 goto again; 3765 } 3766 3767 rw_exit(&ip->i_rwlock); 3768 if (error == 0) { 3769 ip = xip; 3770 *vpp = ITOV(ip); 3771 } else if (error == EEXIST) 3772 VN_RELE(ITOV(xip)); 3773 3774 if (ulp) { 3775 int terr = 0; 3776 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size); 3777 ufs_lockfs_end(ulp); 3778 if (error == 0) 3779 error = terr; 3780 } 3781 out: 3782 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3783 ufs_delete_drain_wait(ufsvfsp, 1); 3784 retry = 0; 3785 goto again; 3786 } 3787 3788 TRACE_2(TR_FAC_UFS, TR_UFS_MKDIR_END, 3789 "ufs_mkdir_end:dvp %p error %d", dvp, error); 3790 return (error); 3791 } 3792 3793 /*ARGSUSED*/ 3794 static int 3795 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr) 3796 { 3797 struct inode *ip = VTOI(vp); 3798 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3799 struct ulockfs *ulp; 3800 vnode_t *rmvp = NULL; /* Vnode of removed directory */ 3801 int error; 3802 int issync; 3803 int trans_size; 3804 int indeadlock; 3805 3806 TRACE_1(TR_FAC_UFS, TR_UFS_RMDIR_START, 3807 "ufs_rmdir_start:vp %p", vp); 3808 3809 /* 3810 * don't let the delete queue get too long 3811 */ 3812 if (ufsvfsp == NULL) { 3813 error = EIO; 3814 goto out; 3815 } 3816 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3817 ufs_delete_drain(vp->v_vfsp, 1, 1); 3818 3819 retry_rmdir: 3820 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK); 3821 if (error) 3822 goto out; 3823 3824 if (ulp) 3825 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR, 3826 trans_size = TOP_RMDIR_SIZE); 3827 3828 /* 3829 * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK 3830 * to avoid i_rwlock, ufs_lockfs_begin deadlock. 
If deadlock 3831 * possible, retries the operation. 3832 */ 3833 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry); 3834 if (indeadlock) 3835 goto retry_rmdir; 3836 error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr, 3837 &rmvp); 3838 rw_exit(&ip->i_rwlock); 3839 3840 if (ulp) { 3841 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR, 3842 trans_size); 3843 ufs_lockfs_end(ulp); 3844 } 3845 3846 /* 3847 * This must be done AFTER the rmdir transaction has closed. 3848 */ 3849 if (rmvp != NULL) { 3850 /* Only send the event if there were no errors */ 3851 if (error == 0) 3852 vnevent_rmdir(rmvp); 3853 VN_RELE(rmvp); 3854 } 3855 out: 3856 TRACE_2(TR_FAC_UFS, TR_UFS_RMDIR_END, 3857 "ufs_rmdir_end:vp %p error %d", vp, error); 3858 3859 return (error); 3860 } 3861 3862 /* ARGSUSED */ 3863 static int 3864 ufs_readdir( 3865 struct vnode *vp, 3866 struct uio *uiop, 3867 struct cred *cr, 3868 int *eofp) 3869 { 3870 struct iovec *iovp; 3871 struct inode *ip; 3872 struct direct *idp; 3873 struct dirent64 *odp; 3874 struct fbuf *fbp; 3875 struct ufsvfs *ufsvfsp; 3876 struct ulockfs *ulp; 3877 caddr_t outbuf; 3878 size_t bufsize; 3879 uint_t offset; 3880 uint_t bytes_wanted, total_bytes_wanted; 3881 int incount = 0; 3882 int outcount = 0; 3883 int error; 3884 3885 ip = VTOI(vp); 3886 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 3887 3888 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_START, 3889 "ufs_readdir_start:vp %p uiop %p", vp, uiop); 3890 3891 if (uiop->uio_loffset >= MAXOFF32_T) { 3892 if (eofp) 3893 *eofp = 1; 3894 return (0); 3895 } 3896 3897 /* 3898 * Check if we have been called with a valid iov_len 3899 * and bail out if not, otherwise we may potentially loop 3900 * forever further down. 3901 */ 3902 if (uiop->uio_iov->iov_len <= 0) { 3903 error = EINVAL; 3904 goto out; 3905 } 3906 3907 /* 3908 * Large Files: When we come here we are guaranteed that 3909 * uio_offset can be used safely. The high word is zero. 3910 */ 3911 3912 ufsvfsp = ip->i_ufsvfs; 3913 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK); 3914 if (error) 3915 goto out; 3916 3917 iovp = uiop->uio_iov; 3918 total_bytes_wanted = iovp->iov_len; 3919 3920 /* Large Files: directory files should not be "large" */ 3921 3922 ASSERT(ip->i_size <= MAXOFF32_T); 3923 3924 /* Force offset to be valid (to guard against bogus lseek() values) */ 3925 offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1); 3926 3927 /* Quit if at end of file or link count of zero (posix) */ 3928 if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) { 3929 if (eofp) 3930 *eofp = 1; 3931 error = 0; 3932 goto unlock; 3933 } 3934 3935 /* 3936 * Get space to change directory entries into fs independent format. 3937 * Do fast alloc for the most commonly used-request size (filesystem 3938 * block size). 3939 */ 3940 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) { 3941 bufsize = total_bytes_wanted; 3942 outbuf = kmem_alloc(bufsize, KM_SLEEP); 3943 odp = (struct dirent64 *)outbuf; 3944 } else { 3945 bufsize = total_bytes_wanted; 3946 odp = (struct dirent64 *)iovp->iov_base; 3947 } 3948 3949 nextblk: 3950 bytes_wanted = total_bytes_wanted; 3951 3952 /* Truncate request to file size */ 3953 if (offset + bytes_wanted > (int)ip->i_size) 3954 bytes_wanted = (int)(ip->i_size - offset); 3955 3956 /* Comply with MAXBSIZE boundary restrictions of fbread() */ 3957 if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE) 3958 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET); 3959 3960 /* 3961 * Read in the next chunk. 
3962 * We are still holding the i_rwlock. 3963 */ 3964 error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp); 3965 3966 if (error) 3967 goto update_inode; 3968 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) && 3969 (!ufsvfsp->vfs_noatime)) { 3970 ip->i_flag |= IACC; 3971 } 3972 incount = 0; 3973 idp = (struct direct *)fbp->fb_addr; 3974 if (idp->d_ino == 0 && idp->d_reclen == 0 && 3975 idp->d_namlen == 0) { 3976 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, " 3977 "fs = %s\n", 3978 (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt); 3979 fbrelse(fbp, S_OTHER); 3980 error = ENXIO; 3981 goto update_inode; 3982 } 3983 /* Transform to file-system independent format */ 3984 while (incount < bytes_wanted) { 3985 /* 3986 * If the current directory entry is mangled, then skip 3987 * to the next block. It would be nice to set the FSBAD 3988 * flag in the super-block so that a fsck is forced on 3989 * next reboot, but locking is a problem. 3990 */ 3991 if (idp->d_reclen & 0x3) { 3992 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3993 break; 3994 } 3995 3996 /* Skip to requested offset and skip empty entries */ 3997 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) { 3998 ushort_t this_reclen = 3999 DIRENT64_RECLEN(idp->d_namlen); 4000 /* Buffer too small for any entries */ 4001 if (!outcount && this_reclen > bufsize) { 4002 fbrelse(fbp, S_OTHER); 4003 error = EINVAL; 4004 goto update_inode; 4005 } 4006 /* If would overrun the buffer, quit */ 4007 if (outcount + this_reclen > bufsize) { 4008 break; 4009 } 4010 /* Take this entry */ 4011 odp->d_ino = (ino64_t)idp->d_ino; 4012 odp->d_reclen = (ushort_t)this_reclen; 4013 odp->d_off = (offset_t)(offset + idp->d_reclen); 4014 4015 /* use strncpy(9f) to zero out uninitialized bytes */ 4016 4017 ASSERT(strlen(idp->d_name) + 1 <= 4018 DIRENT64_NAMELEN(this_reclen)); 4019 (void) strncpy(odp->d_name, idp->d_name, 4020 DIRENT64_NAMELEN(this_reclen)); 4021 outcount += odp->d_reclen; 4022 odp = (struct dirent64 *)((intptr_t)odp + 4023 odp->d_reclen); 4024 ASSERT(outcount <= bufsize); 4025 } 4026 if (idp->d_reclen) { 4027 incount += idp->d_reclen; 4028 offset += idp->d_reclen; 4029 idp = (struct direct *)((intptr_t)idp + idp->d_reclen); 4030 } else { 4031 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 4032 break; 4033 } 4034 } 4035 /* Release the chunk */ 4036 fbrelse(fbp, S_OTHER); 4037 4038 /* Read whole block, but got no entries, read another if not eof */ 4039 4040 /* 4041 * Large Files: casting i_size to int here is not a problem 4042 * because directory sizes are always less than MAXOFF32_T. 4043 * See assertion above. 
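 *
 * Two copy-out paths follow: when the caller passed a single
 * UIO_SYSSPACE iovec the dirent64 records were assembled directly in
 * the caller's buffer, so only iov_base, iov_len and uio_resid need
 * to be advanced; otherwise the records were staged in 'outbuf' and
 * are moved out with a single uiomove().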
4044 */ 4045 4046 if (offset < (int)ip->i_size && !outcount) 4047 goto nextblk; 4048 4049 /* Copy out the entry data */ 4050 if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) { 4051 iovp->iov_base += outcount; 4052 iovp->iov_len -= outcount; 4053 uiop->uio_resid -= outcount; 4054 uiop->uio_offset = offset; 4055 } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, 4056 uiop)) == 0) 4057 uiop->uio_offset = offset; 4058 update_inode: 4059 ITIMES(ip); 4060 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) 4061 kmem_free(outbuf, bufsize); 4062 4063 if (eofp && error == 0) 4064 *eofp = (uiop->uio_offset >= (int)ip->i_size); 4065 unlock: 4066 if (ulp) { 4067 ufs_lockfs_end(ulp); 4068 } 4069 out: 4070 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_END, 4071 "ufs_readdir_end:vp %p error %d", vp, error); 4072 return (error); 4073 } 4074 4075 /*ARGSUSED*/ 4076 static int 4077 ufs_symlink( 4078 struct vnode *dvp, /* ptr to parent dir vnode */ 4079 char *linkname, /* name of symbolic link */ 4080 struct vattr *vap, /* attributes */ 4081 char *target, /* target path */ 4082 struct cred *cr) /* user credentials */ 4083 { 4084 struct inode *ip, *dip = VTOI(dvp); 4085 struct ufsvfs *ufsvfsp = dip->i_ufsvfs; 4086 struct ulockfs *ulp; 4087 int error; 4088 int issync; 4089 int trans_size; 4090 int residual; 4091 int ioflag; 4092 int retry = 1; 4093 4094 TRACE_1(TR_FAC_UFS, TR_UFS_SYMLINK_START, 4095 "ufs_symlink_start:dvp %p", dvp); 4096 4097 /* 4098 * No symlinks in attrdirs at this time 4099 */ 4100 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 4101 return (EINVAL); 4102 4103 again: 4104 ip = (struct inode *)NULL; 4105 vap->va_type = VLNK; 4106 vap->va_rdev = 0; 4107 4108 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK); 4109 if (error) 4110 goto out; 4111 4112 if (ulp) 4113 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK, 4114 trans_size = (int)TOP_SYMLINK_SIZE(dip)); 4115 4116 /* 4117 * We must create the inode before the directory entry, to avoid 4118 * racing with readlink(). ufs_dirmakeinode requires that we 4119 * hold the quota lock as reader, and directory locks as writer. 4120 */ 4121 4122 rw_enter(&dip->i_rwlock, RW_WRITER); 4123 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4124 rw_enter(&dip->i_contents, RW_WRITER); 4125 4126 /* 4127 * Suppress any out of inodes messages if we will retry on 4128 * ENOSP 4129 */ 4130 if (retry) 4131 dip->i_flag |= IQUIET; 4132 4133 error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr); 4134 4135 dip->i_flag &= ~IQUIET; 4136 4137 rw_exit(&dip->i_contents); 4138 rw_exit(&ufsvfsp->vfs_dqrwlock); 4139 rw_exit(&dip->i_rwlock); 4140 4141 if (error) 4142 goto unlock; 4143 4144 /* 4145 * OK. The inode has been created. Write out the data of the 4146 * symbolic link. Since symbolic links are metadata, and should 4147 * remain consistent across a system crash, we need to force the 4148 * data out synchronously. 4149 * 4150 * (This is a change from the semantics in earlier releases, which 4151 * only created symbolic links synchronously if the semi-documented 4152 * 'syncdir' option was set, or if we were being invoked by the NFS 4153 * server, which requires symbolic links to be created synchronously.) 4154 * 4155 * We need to pass in a pointer for the residual length; otherwise 4156 * ufs_rdwri() will always return EIO if it can't write the data, 4157 * even if the error was really ENOSPC or EDQUOT. 
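 * (As the ufs_rdwri() code below shows, a short write is turned into
 * EIO only when the caller passes a NULL aresid; passing &residual
 * here lets the real wrip() error, e.g. ENOSPC or EDQUOT, come back
 * unchanged.)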
4158 */ 4159 4160 ioflag = FWRITE | FDSYNC; 4161 residual = 0; 4162 4163 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4164 rw_enter(&ip->i_contents, RW_WRITER); 4165 4166 /* 4167 * Suppress file system full messages if we will retry 4168 */ 4169 if (retry) 4170 ip->i_flag |= IQUIET; 4171 4172 error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target), 4173 (offset_t)0, UIO_SYSSPACE, &residual, cr); 4174 4175 ip->i_flag &= ~IQUIET; 4176 4177 if (error) { 4178 rw_exit(&ip->i_contents); 4179 rw_exit(&ufsvfsp->vfs_dqrwlock); 4180 goto remove; 4181 } 4182 4183 /* 4184 * If the link's data is small enough, we can cache it in the inode. 4185 * This is a "fast symbolic link". We don't use the first direct 4186 * block because that's actually used to point at the symbolic link's 4187 * contents on disk; but we know that none of the other direct or 4188 * indirect blocks can be used because symbolic links are restricted 4189 * to be smaller than a file system block. 4190 */ 4191 4192 ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip))); 4193 4194 if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) { 4195 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) { 4196 ip->i_flag |= IFASTSYMLNK; 4197 } else { 4198 int i; 4199 /* error, clear garbage left behind */ 4200 for (i = 1; i < NDADDR; i++) 4201 ip->i_db[i] = 0; 4202 for (i = 0; i < NIADDR; i++) 4203 ip->i_ib[i] = 0; 4204 } 4205 } 4206 4207 rw_exit(&ip->i_contents); 4208 rw_exit(&ufsvfsp->vfs_dqrwlock); 4209 4210 /* 4211 * OK. We've successfully created the symbolic link. All that 4212 * remains is to insert it into the appropriate directory. 4213 */ 4214 4215 rw_enter(&dip->i_rwlock, RW_WRITER); 4216 error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL); 4217 rw_exit(&dip->i_rwlock); 4218 4219 /* 4220 * Fall through into remove-on-error code. We're either done, or we 4221 * need to remove the inode (if we couldn't insert it). 4222 */ 4223 4224 remove: 4225 if (error && (ip != NULL)) { 4226 rw_enter(&ip->i_contents, RW_WRITER); 4227 ip->i_nlink--; 4228 ip->i_flag |= ICHG; 4229 ip->i_seq++; 4230 ufs_setreclaim(ip); 4231 rw_exit(&ip->i_contents); 4232 } 4233 4234 unlock: 4235 if (ip != NULL) 4236 VN_RELE(ITOV(ip)); 4237 4238 if (ulp) { 4239 int terr = 0; 4240 4241 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK, 4242 trans_size); 4243 ufs_lockfs_end(ulp); 4244 if (error == 0) 4245 error = terr; 4246 } 4247 4248 /* 4249 * We may have failed due to lack of an inode or of a block to 4250 * store the target in. Try flushing the delete queue to free 4251 * logically-available things up and try again. 4252 */ 4253 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 4254 ufs_delete_drain_wait(ufsvfsp, 1); 4255 retry = 0; 4256 goto again; 4257 } 4258 4259 out: 4260 TRACE_2(TR_FAC_UFS, TR_UFS_SYMLINK_END, 4261 "ufs_symlink_end:dvp %p error %d", dvp, error); 4262 return (error); 4263 } 4264 4265 /* 4266 * Ufs specific routine used to do ufs io. 
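 * It wraps a single iovec in a local uio and hands it to wrip() or
 * rdip(); the caller must already hold ip->i_contents. A typical
 * write call is the one made for the symlink data above:
 *
 *	err = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, target,
 *	    strlen(target), (offset_t)0, UIO_SYSSPACE, &residual, cr);
 *
 * If aresid is NULL, any residual count is reported as EIO.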
4267 */ 4268 int 4269 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base, 4270 ssize_t len, offset_t offset, enum uio_seg seg, int *aresid, 4271 struct cred *cr) 4272 { 4273 struct uio auio; 4274 struct iovec aiov; 4275 int error; 4276 4277 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 4278 4279 bzero((caddr_t)&auio, sizeof (uio_t)); 4280 bzero((caddr_t)&aiov, sizeof (iovec_t)); 4281 4282 aiov.iov_base = base; 4283 aiov.iov_len = len; 4284 auio.uio_iov = &aiov; 4285 auio.uio_iovcnt = 1; 4286 auio.uio_loffset = offset; 4287 auio.uio_segflg = (short)seg; 4288 auio.uio_resid = len; 4289 4290 if (rw == UIO_WRITE) { 4291 auio.uio_fmode = FWRITE; 4292 auio.uio_extflg = UIO_COPY_DEFAULT; 4293 auio.uio_llimit = curproc->p_fsz_ctl; 4294 error = wrip(ip, &auio, ioflag, cr); 4295 } else { 4296 auio.uio_fmode = FREAD; 4297 auio.uio_extflg = UIO_COPY_CACHED; 4298 auio.uio_llimit = MAXOFFSET_T; 4299 error = rdip(ip, &auio, ioflag, cr); 4300 } 4301 4302 if (aresid) { 4303 *aresid = auio.uio_resid; 4304 } else if (auio.uio_resid) { 4305 error = EIO; 4306 } 4307 return (error); 4308 } 4309 4310 static int 4311 ufs_fid(vp, fidp) 4312 struct vnode *vp; 4313 struct fid *fidp; 4314 { 4315 struct ufid *ufid; 4316 struct inode *ip = VTOI(vp); 4317 4318 if (ip->i_ufsvfs == NULL) 4319 return (EIO); 4320 4321 if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) { 4322 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t); 4323 return (ENOSPC); 4324 } 4325 4326 ufid = (struct ufid *)fidp; 4327 bzero((char *)ufid, sizeof (struct ufid)); 4328 ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t); 4329 ufid->ufid_ino = ip->i_number; 4330 ufid->ufid_gen = ip->i_gen; 4331 4332 return (0); 4333 } 4334 4335 /* ARGSUSED2 */ 4336 static int 4337 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4338 { 4339 struct inode *ip = VTOI(vp); 4340 struct ufsvfs *ufsvfsp; 4341 int forcedirectio; 4342 4343 /* 4344 * Read case is easy. 4345 */ 4346 if (!write_lock) { 4347 rw_enter(&ip->i_rwlock, RW_READER); 4348 return (V_WRITELOCK_FALSE); 4349 } 4350 4351 /* 4352 * Caller has requested a writer lock, but that inhibits any 4353 * concurrency in the VOPs that follow. Acquire the lock shared 4354 * and defer exclusive access until it is known to be needed in 4355 * other VOP handlers. Some cases can be determined here. 4356 */ 4357 4358 /* 4359 * If directio is not set, there is no chance of concurrency, 4360 * so just acquire the lock exclusive. Beware of a forced 4361 * unmount before looking at the mount option. 4362 */ 4363 ufsvfsp = ip->i_ufsvfs; 4364 forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0; 4365 if (!(ip->i_flag & IDIRECTIO || forcedirectio) || 4366 !ufs_allow_shared_writes) { 4367 rw_enter(&ip->i_rwlock, RW_WRITER); 4368 return (V_WRITELOCK_TRUE); 4369 } 4370 4371 /* 4372 * Mandatory locking forces acquiring i_rwlock exclusive. 4373 */ 4374 if (MANDLOCK(vp, ip->i_mode)) { 4375 rw_enter(&ip->i_rwlock, RW_WRITER); 4376 return (V_WRITELOCK_TRUE); 4377 } 4378 4379 /* 4380 * Acquire the lock shared in case a concurrent write follows. 4381 * Mandatory locking could have become enabled before the lock 4382 * was acquired. Re-check and upgrade if needed. 
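 * Whichever way the lock is finally taken, the return value
 * (V_WRITELOCK_TRUE or V_WRITELOCK_FALSE) tells the caller whether an
 * exclusive hold was actually acquired; ufs_rwunlock() below is a
 * plain rw_exit(), which releases either mode.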
4383 */ 4384 rw_enter(&ip->i_rwlock, RW_READER); 4385 if (MANDLOCK(vp, ip->i_mode)) { 4386 rw_exit(&ip->i_rwlock); 4387 rw_enter(&ip->i_rwlock, RW_WRITER); 4388 return (V_WRITELOCK_TRUE); 4389 } 4390 return (V_WRITELOCK_FALSE); 4391 } 4392 4393 /*ARGSUSED*/ 4394 static void 4395 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4396 { 4397 struct inode *ip = VTOI(vp); 4398 4399 rw_exit(&ip->i_rwlock); 4400 } 4401 4402 /* ARGSUSED */ 4403 static int 4404 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp) 4405 { 4406 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4407 } 4408 4409 /* ARGSUSED */ 4410 static int 4411 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4412 offset_t offset, struct flk_callback *flk_cbp, struct cred *cr) 4413 { 4414 struct inode *ip = VTOI(vp); 4415 4416 if (ip->i_ufsvfs == NULL) 4417 return (EIO); 4418 4419 /* 4420 * If file is being mapped, disallow frlock. 4421 * XXX I am not holding tlock while checking i_mapcnt because the 4422 * current locking strategy drops all locks before calling fs_frlock. 4423 * So, mapcnt could change before we enter fs_frlock making it 4424 * meaningless to have held tlock in the first place. 4425 */ 4426 if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode)) 4427 return (EAGAIN); 4428 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 4429 } 4430 4431 /* ARGSUSED */ 4432 static int 4433 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4434 offset_t offset, cred_t *cr, caller_context_t *ct) 4435 { 4436 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 4437 struct ulockfs *ulp; 4438 int error; 4439 4440 if ((error = convoff(vp, bfp, 0, offset)) == 0) { 4441 if (cmd == F_FREESP) { 4442 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4443 ULOCKFS_SPACE_MASK); 4444 if (error) 4445 return (error); 4446 error = ufs_freesp(vp, bfp, flag, cr); 4447 } else if (cmd == F_ALLOCSP) { 4448 error = ufs_lockfs_begin(ufsvfsp, &ulp, 4449 ULOCKFS_FALLOCATE_MASK); 4450 if (error) 4451 return (error); 4452 error = ufs_allocsp(vp, bfp, cr); 4453 } else 4454 return (EINVAL); /* Command not handled here */ 4455 4456 if (ulp) 4457 ufs_lockfs_end(ulp); 4458 4459 } 4460 return (error); 4461 } 4462 4463 /* 4464 * Used to determine if read ahead should be done. Also used to 4465 * determine when write back occurs. 4466 */ 4467 #define CLUSTSZ(ip) ((ip)->i_ufsvfs->vfs_ioclustsz) 4468 4469 /* 4470 * A faster version of ufs_getpage. 4471 * 4472 * We optimize by inlining the pvn_getpages iterator, eliminating 4473 * calls to bmap_read if file doesn't have UFS holes, and avoiding 4474 * the overhead of page_exists(). 4475 * 4476 * When a file has UFS_HOLES and ufs_getpage is called with S_READ, 4477 * we set *protp to PROT_READ to avoid calling bmap_read. This approach 4478 * victimizes performance when a file with UFS holes is faulted 4479 * first in the S_READ mode, and then in the S_WRITE mode. We will get 4480 * two MMU faults in this case. 4481 * 4482 * XXX - the inode fields which control the sequential mode are not 4483 * protected by any mutex. The read ahead will act wild if 4484 * multiple processes will access the file concurrently and 4485 * some of them in sequential mode. One particularly bad case 4486 * is if another thread will change the value of i_nextrio between 4487 * the time this thread tests the i_nextrio value and then reads it 4488 * again to use it as the offset for the read ahead.
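 * A sketch of that interleaving: thread A sees that pgoff lies within
 * CLUSTSZ(ip) of i_nextrio and decides to read ahead; thread B then
 * advances i_nextrio; when A finally calls ufs_getpage_ra() the i/o is
 * issued at B's new i_nextrio rather than the offset A just checked.
 * The cost is wasted or misplaced read ahead i/o.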
4489 */ 4490 static int 4491 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 4492 page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr, 4493 enum seg_rw rw, struct cred *cr) 4494 { 4495 u_offset_t uoff = (u_offset_t)off; /* type conversion */ 4496 u_offset_t pgoff; 4497 u_offset_t eoff; 4498 struct inode *ip = VTOI(vp); 4499 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 4500 struct fs *fs; 4501 struct ulockfs *ulp; 4502 page_t **pl; 4503 caddr_t pgaddr; 4504 krw_t rwtype; 4505 int err; 4506 int has_holes; 4507 int beyond_eof; 4508 int seqmode; 4509 int pgsize = PAGESIZE; 4510 int dolock; 4511 int do_qlock; 4512 int trans_size; 4513 4514 TRACE_1(TR_FAC_UFS, TR_UFS_GETPAGE_START, 4515 "ufs_getpage_start:vp %p", vp); 4516 4517 ASSERT((uoff & PAGEOFFSET) == 0); 4518 4519 if (protp) 4520 *protp = PROT_ALL; 4521 4522 /* 4523 * Obey the lockfs protocol 4524 */ 4525 err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg, 4526 rw == S_READ || rw == S_EXEC, protp); 4527 if (err) 4528 goto out; 4529 4530 fs = ufsvfsp->vfs_fs; 4531 4532 if (ulp && (rw == S_CREATE || rw == S_WRITE) && 4533 !(vp->v_flag & VISSWAP)) { 4534 /* 4535 * Try to start a transaction, will return if blocking is 4536 * expected to occur and the address space is not the 4537 * kernel address space. 4538 */ 4539 trans_size = TOP_GETPAGE_SIZE(ip); 4540 if (seg->s_as != &kas) { 4541 TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, 4542 trans_size, err) 4543 if (err == EWOULDBLOCK) { 4544 /* 4545 * Use EDEADLK here because the VM code 4546 * can normally never see this error. 4547 */ 4548 err = EDEADLK; 4549 ufs_lockfs_end(ulp); 4550 goto out; 4551 } 4552 } else { 4553 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4554 } 4555 } 4556 4557 if (vp->v_flag & VNOMAP) { 4558 err = ENOSYS; 4559 goto unlock; 4560 } 4561 4562 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 4563 4564 rwtype = RW_READER; /* start as a reader */ 4565 dolock = (rw_owner(&ip->i_contents) != curthread); 4566 /* 4567 * If this thread owns the lock, i.e., this thread grabbed it 4568 * as writer somewhere above, then we don't need to grab the 4569 * lock as reader in this routine. 4570 */ 4571 do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread); 4572 4573 retrylock: 4574 if (dolock) { 4575 /* 4576 * Grab the quota lock if we need to call 4577 * bmap_write() below (with i_contents as writer). 4578 */ 4579 if (do_qlock && rwtype == RW_WRITER) 4580 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4581 rw_enter(&ip->i_contents, rwtype); 4582 } 4583 4584 /* 4585 * We may be getting called as a side effect of a bmap using 4586 * fbread() when the blocks might be being allocated and the 4587 * size has not yet been up'ed. In this case we want to be 4588 * able to return zero pages if we get back UFS_HOLE from 4589 * calling bmap for a non write case here. We also might have 4590 * to read some frags from the disk into a page if we are 4591 * extending the number of frags for a given lbn in bmap(). 4592 * Large Files: The read of i_size here is atomic because 4593 * i_contents is held here. If dolock is zero, the lock 4594 * is held in bmap routines. 
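 * For example, with 8K pages and an i_size of 10000 bytes, a fault
 * covering offsets [8192, 16384) still satisfies
 * uoff + len <= i_size + PAGEOFFSET (16384 <= 18191) and is allowed,
 * while one starting at 16384 is rejected with EFAULT below unless the
 * request comes from segkmap.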
4595 */ 4596 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 4597 if (beyond_eof && seg != segkmap) { 4598 if (dolock) { 4599 rw_exit(&ip->i_contents); 4600 if (do_qlock && rwtype == RW_WRITER) 4601 rw_exit(&ufsvfsp->vfs_dqrwlock); 4602 } 4603 err = EFAULT; 4604 goto unlock; 4605 } 4606 4607 /* 4608 * Must hold i_contents lock throughout the call to pvn_getpages 4609 * since locked pages are returned from each call to ufs_getapage. 4610 * Must *not* return locked pages and then try for contents lock 4611 * due to lock ordering requirements (inode > page) 4612 */ 4613 4614 has_holes = bmap_has_holes(ip); 4615 4616 if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) { 4617 int blk_size; 4618 u_offset_t offset; 4619 4620 /* 4621 * We must acquire the RW_WRITER lock in order to 4622 * call bmap_write(). 4623 */ 4624 if (dolock && rwtype == RW_READER) { 4625 rwtype = RW_WRITER; 4626 4627 /* 4628 * Grab the quota lock before 4629 * upgrading i_contents, but if we can't grab it 4630 * don't wait here due to lock order: 4631 * vfs_dqrwlock > i_contents. 4632 */ 4633 if (do_qlock && rw_tryenter(&ufsvfsp->vfs_dqrwlock, 4634 RW_READER) == 0) { 4635 rw_exit(&ip->i_contents); 4636 goto retrylock; 4637 } 4638 if (!rw_tryupgrade(&ip->i_contents)) { 4639 rw_exit(&ip->i_contents); 4640 if (do_qlock) 4641 rw_exit(&ufsvfsp->vfs_dqrwlock); 4642 goto retrylock; 4643 } 4644 } 4645 4646 /* 4647 * May be allocating disk blocks for holes here as 4648 * a result of mmap faults. write(2) does the bmap_write 4649 * in rdip/wrip, not here. We are not dealing with frags 4650 * in this case. 4651 */ 4652 /* 4653 * Large Files: We cast fs_bmask field to offset_t 4654 * just as we do for MAXBMASK because uoff is a 64-bit 4655 * data type. fs_bmask will still be a 32-bit type 4656 * as we cannot change any ondisk data structures. 4657 */ 4658 4659 offset = uoff & (offset_t)fs->fs_bmask; 4660 while (offset < uoff + len) { 4661 blk_size = (int)blksize(fs, ip, lblkno(fs, offset)); 4662 err = bmap_write(ip, offset, blk_size, 4663 BI_NORMAL, NULL, cr); 4664 if (ip->i_flag & (ICHG|IUPD)) 4665 ip->i_seq++; 4666 if (err) 4667 goto update_inode; 4668 offset += blk_size; /* XXX - make this contig */ 4669 } 4670 } 4671 4672 /* 4673 * Can be a reader from now on. 4674 */ 4675 if (dolock && rwtype == RW_WRITER) { 4676 rw_downgrade(&ip->i_contents); 4677 /* 4678 * We can release vfs_dqrwlock early so do it, but make 4679 * sure we don't try to release it again at the bottom. 4680 */ 4681 if (do_qlock) { 4682 rw_exit(&ufsvfsp->vfs_dqrwlock); 4683 do_qlock = 0; 4684 } 4685 } 4686 4687 /* 4688 * We remove PROT_WRITE in cases when the file has UFS holes 4689 * because we don't want to call bmap_read() to check each 4690 * page if it is backed with a disk block. 4691 */ 4692 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) 4693 *protp &= ~PROT_WRITE; 4694 4695 err = 0; 4696 4697 /* 4698 * The loop looks up pages in the range [off, off + len). 4699 * For each page, we first check if we should initiate an asynchronous 4700 * read ahead before we call page_lookup (we may sleep in page_lookup 4701 * for a previously initiated disk read). 4702 */ 4703 eoff = (uoff + len); 4704 for (pgoff = uoff, pgaddr = addr, pl = plarr; 4705 pgoff < eoff; /* empty */) { 4706 page_t *pp; 4707 u_offset_t nextrio; 4708 se_t se; 4709 int retval; 4710 4711 se = ((rw == S_CREATE || rw == S_OTHER) ? 
SE_EXCL : SE_SHARED); 4712 4713 /* Handle async getpage (faultahead) */ 4714 if (plarr == NULL) { 4715 ip->i_nextrio = pgoff; 4716 (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4717 pgoff += pgsize; 4718 pgaddr += pgsize; 4719 continue; 4720 } 4721 /* 4722 * Check if we should initiate read ahead of next cluster. 4723 * We call page_exists only when we need to confirm that 4724 * we have the current page before we initiate the read ahead. 4725 */ 4726 nextrio = ip->i_nextrio; 4727 if (seqmode && 4728 pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 4729 nextrio < ip->i_size && page_exists(vp, pgoff)) { 4730 retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4731 /* 4732 * We always read ahead the next cluster of data 4733 * starting from i_nextrio. If the page (vp,nextrio) 4734 * is actually in core at this point, the routine 4735 * ufs_getpage_ra() will stop pre-fetching data 4736 * until we read that page in a synchronized manner 4737 * through ufs_getpage_miss(). So, we should increase 4738 * i_nextrio if the page (vp, nextrio) exists. 4739 */ 4740 if ((retval == 0) && page_exists(vp, nextrio)) { 4741 ip->i_nextrio = nextrio + pgsize; 4742 } 4743 } 4744 4745 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 4746 /* 4747 * We found the page in the page cache. 4748 */ 4749 *pl++ = pp; 4750 pgoff += pgsize; 4751 pgaddr += pgsize; 4752 len -= pgsize; 4753 plsz -= pgsize; 4754 } else { 4755 /* 4756 * We have to create the page, or read it from disk. 4757 */ 4758 if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr, 4759 pl, plsz, rw, seqmode)) 4760 goto error; 4761 4762 while (*pl != NULL) { 4763 pl++; 4764 pgoff += pgsize; 4765 pgaddr += pgsize; 4766 len -= pgsize; 4767 plsz -= pgsize; 4768 } 4769 } 4770 } 4771 4772 /* 4773 * Return pages up to plsz if they are in the page cache. 4774 * We cannot return pages if there is a chance that they are 4775 * backed with a UFS hole and rw is S_WRITE or S_CREATE. 4776 */ 4777 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 4778 4779 ASSERT((protp == NULL) || 4780 !(has_holes && (*protp & PROT_WRITE))); 4781 4782 eoff = pgoff + plsz; 4783 while (pgoff < eoff) { 4784 page_t *pp; 4785 4786 if ((pp = page_lookup_nowait(vp, pgoff, 4787 SE_SHARED)) == NULL) 4788 break; 4789 4790 *pl++ = pp; 4791 pgoff += pgsize; 4792 plsz -= pgsize; 4793 } 4794 } 4795 4796 if (plarr) 4797 *pl = NULL; /* Terminate page list */ 4798 ip->i_nextr = pgoff; 4799 4800 error: 4801 if (err && plarr) { 4802 /* 4803 * Release any pages we have locked. 4804 */ 4805 while (pl > &plarr[0]) 4806 page_unlock(*--pl); 4807 4808 plarr[0] = NULL; 4809 } 4810 4811 update_inode: 4812 /* 4813 * If the inode is not already marked for IACC (in rdip() for read) 4814 * and the inode is not marked for no access time update (in wrip() 4815 * for write) then update the inode access time and mod time now. 
4816 */ 4817 if ((ip->i_flag & (IACC | INOACC)) == 0) { 4818 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) { 4819 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 4820 (fs->fs_ronly == 0) && 4821 (!ufsvfsp->vfs_noatime)) { 4822 mutex_enter(&ip->i_tlock); 4823 ip->i_flag |= IACC; 4824 ITIMES_NOLOCK(ip); 4825 mutex_exit(&ip->i_tlock); 4826 } 4827 } 4828 } 4829 4830 if (dolock) { 4831 rw_exit(&ip->i_contents); 4832 if (do_qlock && rwtype == RW_WRITER) 4833 rw_exit(&ufsvfsp->vfs_dqrwlock); 4834 } 4835 4836 unlock: 4837 if (ulp) { 4838 if ((rw == S_CREATE || rw == S_WRITE) && 4839 !(vp->v_flag & VISSWAP)) { 4840 TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4841 } 4842 ufs_lockfs_end(ulp); 4843 } 4844 out: 4845 TRACE_2(TR_FAC_UFS, TR_UFS_GETPAGE_END, 4846 "ufs_getpage_end:vp %p error %d", vp, err); 4847 return (err); 4848 } 4849 4850 /* 4851 * ufs_getpage_miss is called when ufs_getpage missed the page in the page 4852 * cache. The page is either read from the disk, or it's created. 4853 * A page is created (without disk read) if rw == S_CREATE, or if 4854 * the page is not backed with a real disk block (UFS hole). 4855 */ 4856 /* ARGSUSED */ 4857 static int 4858 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg, 4859 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq) 4860 { 4861 struct inode *ip = VTOI(vp); 4862 page_t *pp; 4863 daddr_t bn; 4864 size_t io_len; 4865 int crpage = 0; 4866 int err; 4867 int contig; 4868 int bsize = ip->i_fs->fs_bsize; 4869 4870 /* 4871 * Figure out whether the page can be created, or must be 4872 * read from the disk. 4873 */ 4874 if (rw == S_CREATE) 4875 crpage = 1; 4876 else { 4877 contig = 0; 4878 if (err = bmap_read(ip, off, &bn, &contig)) 4879 return (err); 4880 4881 crpage = (bn == UFS_HOLE); 4882 4883 /* 4884 * If it's also a fallocated block that hasn't been written to 4885 * yet, we will treat it just like a UFS_HOLE and create 4886 * a zero page for it 4887 */ 4888 if (ISFALLOCBLK(ip, bn)) 4889 crpage = 1; 4890 } 4891 4892 if (crpage) { 4893 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, 4894 addr)) == NULL) { 4895 return (ufs_fault(vp, 4896 "ufs_getpage_miss: page_create == NULL")); 4897 } 4898 4899 if (rw != S_CREATE) 4900 pagezero(pp, 0, PAGESIZE); 4901 4902 io_len = PAGESIZE; 4903 } else { 4904 u_offset_t io_off; 4905 uint_t xlen; 4906 struct buf *bp; 4907 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 4908 4909 /* 4910 * If access is not in sequential order, we read from disk 4911 * in bsize units. 4912 * 4913 * We limit the size of the transfer to bsize if we are reading 4914 * from the beginning of the file. Note in this situation we 4915 * will hedge our bets and initiate an async read ahead of 4916 * the second block. 4917 */ 4918 if (!seq || off == 0) 4919 contig = MIN(contig, bsize); 4920 4921 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4922 &io_len, off, contig, 0); 4923 4924 /* 4925 * Some other thread has entered the page. 4926 * ufs_getpage will retry page_lookup. 4927 */ 4928 if (pp == NULL) { 4929 pl[0] = NULL; 4930 return (0); 4931 } 4932 4933 /* 4934 * Zero part of the page which we are not 4935 * going to read from the disk.
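 * For instance, if pvn_read_kluster() above returned an io_len of
 * 6144 bytes on a machine with 8K pages, xlen below is 6144 and the
 * final 2048 bytes of the last page (pp->p_prev) are zeroed rather
 * than read.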
4936 */ 4937 xlen = io_len & PAGEOFFSET; 4938 if (xlen != 0) 4939 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4940 4941 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ); 4942 bp->b_edev = ip->i_dev; 4943 bp->b_dev = cmpdev(ip->i_dev); 4944 bp->b_blkno = bn; 4945 bp->b_un.b_addr = (caddr_t)0; 4946 bp->b_file = ip->i_vnode; 4947 bp->b_offset = off; 4948 4949 if (ufsvfsp->vfs_log) { 4950 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4951 } else if (ufsvfsp->vfs_snapshot) { 4952 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4953 } else { 4954 ufsvfsp->vfs_iotstamp = lbolt; 4955 ub.ub_getpages.value.ul++; 4956 (void) bdev_strategy(bp); 4957 lwp_stat_update(LWP_STAT_INBLK, 1); 4958 } 4959 4960 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK); 4961 4962 /* 4963 * If the file access is sequential, initiate read ahead 4964 * of the next cluster. 4965 */ 4966 if (seq && ip->i_nextrio < ip->i_size) 4967 (void) ufs_getpage_ra(vp, off, seg, addr); 4968 err = biowait(bp); 4969 pageio_done(bp); 4970 4971 if (err) { 4972 pvn_read_done(pp, B_ERROR); 4973 return (err); 4974 } 4975 } 4976 4977 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4978 return (0); 4979 } 4980 4981 /* 4982 * Read ahead a cluster from the disk. Returns the length in bytes. 4983 */ 4984 static int 4985 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr) 4986 { 4987 struct inode *ip = VTOI(vp); 4988 page_t *pp; 4989 u_offset_t io_off = ip->i_nextrio; 4990 ufsvfs_t *ufsvfsp; 4991 caddr_t addr2 = addr + (io_off - off); 4992 struct buf *bp; 4993 daddr_t bn; 4994 size_t io_len; 4995 int err; 4996 int contig; 4997 int xlen; 4998 int bsize = ip->i_fs->fs_bsize; 4999 5000 /* 5001 * If the directio advisory is in effect on this file, 5002 * then do not do buffered read ahead. Read ahead makes 5003 * it more difficult on threads using directio as they 5004 * will be forced to flush the pages from this vnode. 5005 */ 5006 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5007 return (0); 5008 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) 5009 return (0); 5010 5011 /* 5012 * Is this test needed? 5013 */ 5014 if (addr2 >= seg->s_base + seg->s_size) 5015 return (0); 5016 5017 contig = 0; 5018 err = bmap_read(ip, io_off, &bn, &contig); 5019 /* 5020 * If its a UFS_HOLE or a fallocated block, do not perform 5021 * any read ahead's since there probably is nothing to read ahead 5022 */ 5023 if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn)) 5024 return (0); 5025 5026 /* 5027 * Limit the transfer size to bsize if this is the 2nd block. 
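 * This mirrors the bsize cap ufs_getpage_miss() applies when reading
 * the first block of a file, so a start-of-file access only
 * speculates one file system block ahead until sequential access is
 * established.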
5028 */ 5029 if (io_off == (u_offset_t)bsize) 5030 contig = MIN(contig, bsize); 5031 5032 if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off, 5033 &io_len, io_off, contig, 1)) == NULL) 5034 return (0); 5035 5036 /* 5037 * Zero part of page which we are not going to read from disk 5038 */ 5039 if ((xlen = (io_len & PAGEOFFSET)) > 0) 5040 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 5041 5042 ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK; 5043 5044 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC); 5045 bp->b_edev = ip->i_dev; 5046 bp->b_dev = cmpdev(ip->i_dev); 5047 bp->b_blkno = bn; 5048 bp->b_un.b_addr = (caddr_t)0; 5049 bp->b_file = ip->i_vnode; 5050 bp->b_offset = off; 5051 5052 if (ufsvfsp->vfs_log) { 5053 lufs_read_strategy(ufsvfsp->vfs_log, bp); 5054 } else if (ufsvfsp->vfs_snapshot) { 5055 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5056 } else { 5057 ufsvfsp->vfs_iotstamp = lbolt; 5058 ub.ub_getras.value.ul++; 5059 (void) bdev_strategy(bp); 5060 lwp_stat_update(LWP_STAT_INBLK, 1); 5061 } 5062 5063 return (io_len); 5064 } 5065 5066 int ufs_delay = 1; 5067 /* 5068 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC} 5069 * 5070 * LMXXX - the inode really ought to contain a pointer to one of these 5071 * async args. Stuff gunk in there and just hand the whole mess off. 5072 * This would replace i_delaylen, i_delayoff. 5073 */ 5074 /*ARGSUSED*/ 5075 static int 5076 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 5077 struct cred *cr) 5078 { 5079 struct inode *ip = VTOI(vp); 5080 int err = 0; 5081 5082 if (vp->v_count == 0) { 5083 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0")); 5084 } 5085 5086 TRACE_1(TR_FAC_UFS, TR_UFS_PUTPAGE_START, 5087 "ufs_putpage_start:vp %p", vp); 5088 5089 /* 5090 * XXX - Why should this check be made here? 5091 */ 5092 if (vp->v_flag & VNOMAP) { 5093 err = ENOSYS; 5094 goto errout; 5095 } 5096 5097 if (ip->i_ufsvfs == NULL) { 5098 err = EIO; 5099 goto errout; 5100 } 5101 5102 if (flags & B_ASYNC) { 5103 if (ufs_delay && len && 5104 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 5105 mutex_enter(&ip->i_tlock); 5106 /* 5107 * If nobody stalled, start a new cluster. 5108 */ 5109 if (ip->i_delaylen == 0) { 5110 ip->i_delayoff = off; 5111 ip->i_delaylen = len; 5112 mutex_exit(&ip->i_tlock); 5113 goto errout; 5114 } 5115 /* 5116 * If we have a full cluster or they are not contig, 5117 * then push last cluster and start over. 5118 */ 5119 if (ip->i_delaylen >= CLUSTSZ(ip) || 5120 ip->i_delayoff + ip->i_delaylen != off) { 5121 u_offset_t doff; 5122 size_t dlen; 5123 5124 doff = ip->i_delayoff; 5125 dlen = ip->i_delaylen; 5126 ip->i_delayoff = off; 5127 ip->i_delaylen = len; 5128 mutex_exit(&ip->i_tlock); 5129 err = ufs_putpages(vp, doff, dlen, 5130 flags, cr); 5131 /* LMXXX - flags are new val, not old */ 5132 goto errout; 5133 } 5134 /* 5135 * There is something there, it's not full, and 5136 * it is contig. 5137 */ 5138 ip->i_delaylen += len; 5139 mutex_exit(&ip->i_tlock); 5140 goto errout; 5141 } 5142 /* 5143 * Must have weird flags or we are not clustering. 5144 */ 5145 } 5146 5147 err = ufs_putpages(vp, off, len, flags, cr); 5148 5149 errout: 5150 TRACE_2(TR_FAC_UFS, TR_UFS_PUTPAGE_END, 5151 "ufs_putpage_end:vp %p error %d", vp, err); 5152 return (err); 5153 } 5154 5155 /* 5156 * If len == 0, do from off to EOF. 
5157 * 5158 * The normal cases should be len == 0 & off == 0 (entire vp list), 5159 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 5160 * (from pageout). 5161 */ 5162 /*ARGSUSED*/ 5163 static int 5164 ufs_putpages( 5165 struct vnode *vp, 5166 offset_t off, 5167 size_t len, 5168 int flags, 5169 struct cred *cr) 5170 { 5171 u_offset_t io_off; 5172 u_offset_t eoff; 5173 struct inode *ip = VTOI(vp); 5174 page_t *pp; 5175 size_t io_len; 5176 int err = 0; 5177 int dolock; 5178 5179 if (vp->v_count == 0) 5180 return (ufs_fault(vp, "ufs_putpages: v_count == 0")); 5181 /* 5182 * Acquire the readers/write inode lock before locking 5183 * any pages in this inode. 5184 * The inode lock is held during i/o. 5185 */ 5186 if (len == 0) { 5187 mutex_enter(&ip->i_tlock); 5188 ip->i_delayoff = ip->i_delaylen = 0; 5189 mutex_exit(&ip->i_tlock); 5190 } 5191 dolock = (rw_owner(&ip->i_contents) != curthread); 5192 if (dolock) { 5193 /* 5194 * Must synchronize this thread and any possible thread 5195 * operating in the window of vulnerability in wrip(). 5196 * It is dangerous to allow both a thread doing a putpage 5197 * and a thread writing, so serialize them. The exception 5198 * is when the thread in wrip() does something which causes 5199 * a putpage operation. Then, the thread must be allowed 5200 * to continue. It may encounter a bmap_read problem in 5201 * ufs_putapage, but that is handled in ufs_putapage. 5202 * Allow async writers to proceed, we don't want to block 5203 * the pageout daemon. 5204 */ 5205 if (ip->i_writer == curthread) 5206 rw_enter(&ip->i_contents, RW_READER); 5207 else { 5208 for (;;) { 5209 rw_enter(&ip->i_contents, RW_READER); 5210 mutex_enter(&ip->i_tlock); 5211 /* 5212 * If there is no thread in the critical 5213 * section of wrip(), then proceed. 5214 * Otherwise, wait until there isn't one. 5215 */ 5216 if (ip->i_writer == NULL) { 5217 mutex_exit(&ip->i_tlock); 5218 break; 5219 } 5220 rw_exit(&ip->i_contents); 5221 /* 5222 * Bounce async writers when we have a writer 5223 * working on this file so we don't deadlock 5224 * the pageout daemon. 5225 */ 5226 if (flags & B_ASYNC) { 5227 mutex_exit(&ip->i_tlock); 5228 return (0); 5229 } 5230 cv_wait(&ip->i_wrcv, &ip->i_tlock); 5231 mutex_exit(&ip->i_tlock); 5232 } 5233 } 5234 } 5235 5236 if (!vn_has_cached_data(vp)) { 5237 if (dolock) 5238 rw_exit(&ip->i_contents); 5239 return (0); 5240 } 5241 5242 if (len == 0) { 5243 /* 5244 * Search the entire vp list for pages >= off. 5245 */ 5246 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage, 5247 flags, cr); 5248 } else { 5249 /* 5250 * Loop over all offsets in the range looking for 5251 * pages to deal with. 5252 */ 5253 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0) 5254 eoff = MIN(off + len, eoff); 5255 else 5256 eoff = off + len; 5257 5258 for (io_off = off; io_off < eoff; io_off += io_len) { 5259 /* 5260 * If we are not invalidating, synchronously 5261 * freeing or writing pages, use the routine 5262 * page_lookup_nowait() to prevent reclaiming 5263 * them from the free list. 5264 */ 5265 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 5266 pp = page_lookup(vp, io_off, 5267 (flags & (B_INVAL | B_FREE)) ? 5268 SE_EXCL : SE_SHARED); 5269 } else { 5270 pp = page_lookup_nowait(vp, io_off, 5271 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 5272 } 5273 5274 if (pp == NULL || pvn_getdirty(pp, flags) == 0) 5275 io_len = PAGESIZE; 5276 else { 5277 u_offset_t *io_offp = &io_off; 5278 5279 err = ufs_putapage(vp, pp, io_offp, &io_len, 5280 flags, cr); 5281 if (err != 0) 5282 break; 5283 /* 5284 * "io_off" and "io_len" are returned as 5285 * the range of pages we actually wrote. 5286 * This allows us to skip ahead more quickly 5287 * since several pages may've been dealt 5288 * with by this iteration of the loop. 5289 */ 5290 } 5291 } 5292 } 5293 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) { 5294 /* 5295 * We have just sync'ed back all the pages on 5296 * the inode, turn off the IMODTIME flag. 5297 */ 5298 mutex_enter(&ip->i_tlock); 5299 ip->i_flag &= ~IMODTIME; 5300 mutex_exit(&ip->i_tlock); 5301 } 5302 if (dolock) 5303 rw_exit(&ip->i_contents); 5304 return (err); 5305 } 5306 5307 static void 5308 ufs_iodone(buf_t *bp) 5309 { 5310 struct inode *ip; 5311 5312 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 5313 5314 bp->b_iodone = NULL; 5315 5316 ip = VTOI(bp->b_pages->p_vnode); 5317 5318 mutex_enter(&ip->i_tlock); 5319 if (ip->i_writes >= ufs_LW) { 5320 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW) 5321 if (ufs_WRITES) 5322 cv_broadcast(&ip->i_wrcv); /* wake all up */ 5323 } else { 5324 ip->i_writes -= bp->b_bcount; 5325 } 5326 5327 mutex_exit(&ip->i_tlock); 5328 iodone(bp); 5329 } 5330 5331 /* 5332 * Write out a single page, possibly klustering adjacent 5333 * dirty pages. The inode lock must be held. 5334 * 5335 * LMXXX - bsize < pagesize not done. 5336 */ 5337 /*ARGSUSED*/ 5338 int 5339 ufs_putapage( 5340 struct vnode *vp, 5341 page_t *pp, 5342 u_offset_t *offp, 5343 size_t *lenp, /* return values */ 5344 int flags, 5345 struct cred *cr) 5346 { 5347 u_offset_t io_off; 5348 u_offset_t off; 5349 struct inode *ip = VTOI(vp); 5350 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 5351 struct fs *fs; 5352 struct buf *bp; 5353 size_t io_len; 5354 daddr_t bn; 5355 int err; 5356 int contig; 5357 5358 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 5359 5360 TRACE_1(TR_FAC_UFS, TR_UFS_PUTAPAGE_START, 5361 "ufs_putapage_start:vp %p", vp); 5362 5363 if (ufsvfsp == NULL) { 5364 err = EIO; 5365 goto out_trace; 5366 } 5367 5368 fs = ip->i_fs; 5369 ASSERT(fs->fs_ronly == 0); 5370 5371 /* 5372 * If the modified time on the inode has not already been 5373 * set elsewhere (e.g. for write/setattr) we set the time now. 5374 * This gives us approximate modified times for mmap'ed files 5375 * which are modified via stores in the user address space. 5376 */ 5377 if ((ip->i_flag & IMODTIME) == 0) { 5378 mutex_enter(&ip->i_tlock); 5379 ip->i_flag |= IUPD; 5380 ip->i_seq++; 5381 ITIMES_NOLOCK(ip); 5382 mutex_exit(&ip->i_tlock); 5383 } 5384 5385 /* 5386 * Align the request to a block boundry (for old file systems), 5387 * and go ask bmap() how contiguous things are for this file. 5388 */ 5389 off = pp->p_offset & (offset_t)fs->fs_bmask; /* block align it */ 5390 contig = 0; 5391 err = bmap_read(ip, off, &bn, &contig); 5392 if (err) 5393 goto out; 5394 if (bn == UFS_HOLE) { /* putpage never allocates */ 5395 /* 5396 * logging device is in error mode; simply return EIO 5397 */ 5398 if (TRANS_ISERROR(ufsvfsp)) { 5399 err = EIO; 5400 goto out; 5401 } 5402 /* 5403 * Oops, the thread in the window in wrip() did some 5404 * sort of operation which caused a putpage in the bad 5405 * range. 
In this case, just return an error which will 5406 * cause the software modified bit on the page to be set 5407 * and the page will get written out again later. 5408 */ 5409 if (ip->i_writer == curthread) { 5410 err = EIO; 5411 goto out; 5412 } 5413 /* 5414 * If the pager is trying to push a page in the bad range 5415 * just tell him to try again later when things are better. 5416 */ 5417 if (flags & B_ASYNC) { 5418 err = EAGAIN; 5419 goto out; 5420 } 5421 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE"); 5422 goto out; 5423 } 5424 5425 /* 5426 * If it is a fallocate'd block, reverse the negativity since 5427 * we are now writing to it 5428 */ 5429 if (ISFALLOCBLK(ip, bn)) { 5430 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn)); 5431 if (err) 5432 goto out; 5433 5434 bn = -bn; 5435 } 5436 5437 /* 5438 * Take the length (of contiguous bytes) passed back from bmap() 5439 * and _try_ and get a set of pages covering that extent. 5440 */ 5441 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags); 5442 5443 /* 5444 * May have run out of memory and not clustered backwards. 5445 * off p_offset 5446 * [ pp - 1 ][ pp ] 5447 * [ block ] 5448 * We told bmap off, so we have to adjust the bn accordingly. 5449 */ 5450 if (io_off > off) { 5451 bn += btod(io_off - off); 5452 contig -= (io_off - off); 5453 } 5454 5455 /* 5456 * bmap was careful to tell us the right size so use that. 5457 * There might be unallocated frags at the end. 5458 * LMXXX - bzero the end of the page? We must be writing after EOF. 5459 */ 5460 if (io_len > contig) { 5461 ASSERT(io_len - contig < fs->fs_bsize); 5462 io_len -= (io_len - contig); 5463 } 5464 5465 /* 5466 * Handle the case where we are writing the last page after EOF. 5467 * 5468 * XXX - just a patch for i-mt3. 5469 */ 5470 if (io_len == 0) { 5471 ASSERT(pp->p_offset >= (u_offset_t)(roundup(ip->i_size, 5472 PAGESIZE))); 5473 io_len = PAGESIZE; 5474 } 5475 5476 bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags); 5477 5478 ULOCKFS_SET_MOD(ITOUL(ip)); 5479 5480 bp->b_edev = ip->i_dev; 5481 bp->b_dev = cmpdev(ip->i_dev); 5482 bp->b_blkno = bn; 5483 bp->b_un.b_addr = (caddr_t)0; 5484 bp->b_file = ip->i_vnode; 5485 5486 if (TRANS_ISTRANS(ufsvfsp)) { 5487 if ((ip->i_mode & IFMT) == IFSHAD) { 5488 TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD); 5489 } else if (ufsvfsp->vfs_qinod == ip) { 5490 TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR, 5491 0, 0); 5492 } 5493 } 5494 5495 /* write throttle */ 5496 5497 ASSERT(bp->b_iodone == NULL); 5498 bp->b_iodone = (int (*)())ufs_iodone; 5499 mutex_enter(&ip->i_tlock); 5500 ip->i_writes += bp->b_bcount; 5501 mutex_exit(&ip->i_tlock); 5502 5503 if (bp->b_flags & B_ASYNC) { 5504 if (ufsvfsp->vfs_log) { 5505 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5506 } else if (ufsvfsp->vfs_snapshot) { 5507 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5508 } else { 5509 ufsvfsp->vfs_iotstamp = lbolt; 5510 ub.ub_putasyncs.value.ul++; 5511 (void) bdev_strategy(bp); 5512 lwp_stat_update(LWP_STAT_OUBLK, 1); 5513 } 5514 } else { 5515 if (ufsvfsp->vfs_log) { 5516 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5517 } else if (ufsvfsp->vfs_snapshot) { 5518 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5519 } else { 5520 ufsvfsp->vfs_iotstamp = lbolt; 5521 ub.ub_putsyncs.value.ul++; 5522 (void) bdev_strategy(bp); 5523 lwp_stat_update(LWP_STAT_OUBLK, 1); 5524 } 5525 err = biowait(bp); 5526 pageio_done(bp); 5527 pvn_write_done(pp, ((err) ?
B_ERROR : 0) | B_WRITE | flags); 5528 } 5529 5530 pp = NULL; 5531 5532 out: 5533 if (err != 0 && pp != NULL) 5534 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 5535 5536 if (offp) 5537 *offp = io_off; 5538 if (lenp) 5539 *lenp = io_len; 5540 out_trace: 5541 TRACE_2(TR_FAC_UFS, TR_UFS_PUTAPAGE_END, 5542 "ufs_putapage_end:vp %p error %d", vp, err); 5543 return (err); 5544 } 5545 5546 /* ARGSUSED */ 5547 static int 5548 ufs_map(struct vnode *vp, 5549 offset_t off, 5550 struct as *as, 5551 caddr_t *addrp, 5552 size_t len, 5553 uchar_t prot, 5554 uchar_t maxprot, 5555 uint_t flags, 5556 struct cred *cr) 5557 { 5558 struct segvn_crargs vn_a; 5559 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5560 struct ulockfs *ulp; 5561 int error; 5562 5563 TRACE_1(TR_FAC_UFS, TR_UFS_MAP_START, 5564 "ufs_map_start:vp %p", vp); 5565 5566 retry_map: 5567 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK); 5568 if (error) 5569 goto out; 5570 5571 if (vp->v_flag & VNOMAP) { 5572 error = ENOSYS; 5573 goto unlock; 5574 } 5575 5576 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) { 5577 error = ENXIO; 5578 goto unlock; 5579 } 5580 5581 if (vp->v_type != VREG) { 5582 error = ENODEV; 5583 goto unlock; 5584 } 5585 5586 /* 5587 * If file is being locked, disallow mapping. 5588 */ 5589 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) { 5590 error = EAGAIN; 5591 goto unlock; 5592 } 5593 5594 as_rangelock(as); 5595 if ((flags & MAP_FIXED) == 0) { 5596 map_addr(addrp, len, off, 1, flags); 5597 if (*addrp == NULL) { 5598 as_rangeunlock(as); 5599 error = ENOMEM; 5600 goto unlock; 5601 } 5602 } else { 5603 /* 5604 * User specified address - blow away any previous mappings 5605 */ 5606 (void) as_unmap(as, *addrp, len); 5607 } 5608 5609 vn_a.vp = vp; 5610 vn_a.offset = (u_offset_t)off; 5611 vn_a.type = flags & MAP_TYPE; 5612 vn_a.prot = prot; 5613 vn_a.maxprot = maxprot; 5614 vn_a.cred = cr; 5615 vn_a.amp = NULL; 5616 vn_a.flags = flags & ~MAP_TYPE; 5617 vn_a.szc = 0; 5618 vn_a.lgrp_mem_policy_flags = 0; 5619 5620 retry_lock: 5621 if (!AS_LOCK_TRYENTER(ias, &as->a_lock, RW_WRITER)) { 5622 /* 5623 * We didn't get the lock. Check if the SLOCK is set in the 5624 * ufsvfs. If yes, we might be in a deadlock. Safer to give up 5625 * and wait for SLOCK to be cleared. 5626 */ 5627 5628 if (ulp && ULOCKFS_IS_SLOCK(ulp)) { 5629 as_rangeunlock(as); 5630 ufs_lockfs_end(ulp); 5631 goto retry_map; 5632 } else { 5633 /* 5634 * SLOCK isn't set so this is a genuine synchronization 5635 * case. Let's try again after giving them a breather. 
5636 */ 5637 delay(RETRY_LOCK_DELAY); 5638 goto retry_lock; 5639 } 5640 } 5641 error = as_map_locked(as, *addrp, len, segvn_create, &vn_a); 5642 as_rangeunlock(as); 5643 5644 unlock: 5645 if (ulp) { 5646 ufs_lockfs_end(ulp); 5647 } 5648 out: 5649 TRACE_2(TR_FAC_UFS, TR_UFS_MAP_END, 5650 "ufs_map_end:vp %p error %d", vp, error); 5651 return (error); 5652 } 5653 5654 /* ARGSUSED */ 5655 static int 5656 ufs_addmap(struct vnode *vp, 5657 offset_t off, 5658 struct as *as, 5659 caddr_t addr, 5660 size_t len, 5661 uchar_t prot, 5662 uchar_t maxprot, 5663 uint_t flags, 5664 struct cred *cr) 5665 { 5666 struct inode *ip = VTOI(vp); 5667 5668 if (vp->v_flag & VNOMAP) { 5669 return (ENOSYS); 5670 } 5671 5672 mutex_enter(&ip->i_tlock); 5673 ip->i_mapcnt += btopr(len); 5674 mutex_exit(&ip->i_tlock); 5675 return (0); 5676 } 5677 5678 /*ARGSUSED*/ 5679 static int 5680 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 5681 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 5682 struct cred *cr) 5683 { 5684 struct inode *ip = VTOI(vp); 5685 5686 if (vp->v_flag & VNOMAP) { 5687 return (ENOSYS); 5688 } 5689 5690 mutex_enter(&ip->i_tlock); 5691 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 5692 ASSERT(ip->i_mapcnt >= 0); 5693 mutex_exit(&ip->i_tlock); 5694 return (0); 5695 } 5696 /* 5697 * Return the answer requested to poll() for non-device files 5698 */ 5699 struct pollhead ufs_pollhd; 5700 5701 /* ARGSUSED */ 5702 int 5703 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp) 5704 { 5705 struct ufsvfs *ufsvfsp; 5706 5707 *revp = 0; 5708 ufsvfsp = VTOI(vp)->i_ufsvfs; 5709 5710 if (!ufsvfsp) { 5711 *revp = POLLHUP; 5712 goto out; 5713 } 5714 5715 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) || 5716 ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) { 5717 *revp |= POLLERR; 5718 5719 } else { 5720 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly && 5721 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5722 *revp |= POLLOUT; 5723 5724 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly && 5725 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5726 *revp |= POLLWRBAND; 5727 5728 if (ev & POLLIN) 5729 *revp |= POLLIN; 5730 5731 if (ev & POLLRDNORM) 5732 *revp |= POLLRDNORM; 5733 5734 if (ev & POLLRDBAND) 5735 *revp |= POLLRDBAND; 5736 } 5737 5738 if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP))) 5739 *revp |= POLLPRI; 5740 out: 5741 *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL; 5742 5743 return (0); 5744 } 5745 5746 /* ARGSUSED */ 5747 static int 5748 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr) 5749 { 5750 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5751 struct ulockfs *ulp = NULL; 5752 struct inode *sip = NULL; 5753 int error; 5754 struct inode *ip = VTOI(vp); 5755 int issync; 5756 5757 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK); 5758 if (error) 5759 return (error); 5760 5761 switch (cmd) { 5762 /* 5763 * Have to handle _PC_NAME_MAX here, because the normal way 5764 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()] 5765 * results in a lock ordering reversal between 5766 * ufs_lockfs_{begin,end}() and 5767 * ufs_thread_{suspend,continue}(). 5768 * 5769 * Keep in sync with ufs_statvfs(). 
5770 */ 5771 case _PC_NAME_MAX: 5772 *valp = MAXNAMLEN; 5773 break; 5774 5775 case _PC_FILESIZEBITS: 5776 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 5777 *valp = UFS_FILESIZE_BITS; 5778 else 5779 *valp = 32; 5780 break; 5781 5782 case _PC_XATTR_EXISTS: 5783 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5784 5785 error = ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, 5786 cr); 5787 if (error == 0 && sip != NULL) { 5788 /* Start transaction */ 5789 if (ulp) { 5790 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 5791 TOP_RMDIR, TOP_RMDIR_SIZE); 5792 } 5793 /* 5794 * Is directory empty 5795 */ 5796 rw_enter(&sip->i_rwlock, RW_WRITER); 5797 rw_enter(&sip->i_contents, RW_WRITER); 5798 if (ufs_xattrdirempty(sip, 5799 sip->i_number, CRED())) { 5800 rw_enter(&ip->i_contents, RW_WRITER); 5801 ufs_unhook_shadow(ip, sip); 5802 rw_exit(&ip->i_contents); 5803 5804 *valp = 0; 5805 5806 } else 5807 *valp = 1; 5808 rw_exit(&sip->i_contents); 5809 rw_exit(&sip->i_rwlock); 5810 if (ulp) { 5811 TRANS_END_CSYNC(ufsvfsp, error, issync, 5812 TOP_RMDIR, TOP_RMDIR_SIZE); 5813 } 5814 VN_RELE(ITOV(sip)); 5815 } else if (error == ENOENT) { 5816 *valp = 0; 5817 error = 0; 5818 } 5819 } else { 5820 error = fs_pathconf(vp, cmd, valp, cr); 5821 } 5822 break; 5823 5824 case _PC_ACL_ENABLED: 5825 *valp = _ACL_ACLENT_ENABLED; 5826 break; 5827 5828 case _PC_MIN_HOLE_SIZE: 5829 *valp = (ulong_t)ip->i_fs->fs_bsize; 5830 break; 5831 5832 default: 5833 error = fs_pathconf(vp, cmd, valp, cr); 5834 } 5835 5836 if (ulp != NULL) { 5837 ufs_lockfs_end(ulp); 5838 } 5839 return (error); 5840 } 5841 5842 int ufs_pageio_writes, ufs_pageio_reads; 5843 5844 /*ARGSUSED*/ 5845 static int 5846 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5847 int flags, struct cred *cr) 5848 { 5849 struct inode *ip = VTOI(vp); 5850 struct ufsvfs *ufsvfsp; 5851 page_t *npp = NULL, *opp = NULL, *cpp = pp; 5852 struct buf *bp; 5853 daddr_t bn; 5854 size_t done_len = 0, cur_len = 0; 5855 int err = 0; 5856 int contig = 0; 5857 int dolock; 5858 int vmpss = 0; 5859 struct ulockfs *ulp; 5860 5861 if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp && 5862 vp->v_mpssdata != NULL) { 5863 vmpss = 1; 5864 } 5865 5866 dolock = (rw_owner(&ip->i_contents) != curthread); 5867 /* 5868 * We need a better check. Ideally, we would use another 5869 * vnodeops so that hlocked and forcibly unmounted file 5870 * systems would return EIO where appropriate and w/o the 5871 * need for these checks. 5872 */ 5873 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5874 return (EIO); 5875 5876 /* 5877 * For vmpss (pp can be NULL) case respect the quiesce protocol. 5878 * ul_lock must be taken before locking pages so we can't use it here 5879 * if pp is non NULL because segvn already locked pages 5880 * SE_EXCL. Instead we rely on the fact that a forced umount or 5881 * applying a filesystem lock via ufs_fiolfs() will block in the 5882 * implicit call to ufs_flush() until we unlock the pages after the 5883 * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend 5884 * above 0 until they are done. We have to be careful not to increment 5885 * ul_vnops_cnt here after forceful unmount hlocks the file system. 5886 * 5887 * If pp is NULL use ul_lock to make sure we don't increment 5888 * ul_vnops_cnt after forceful unmount hlocks the file system. 
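 * The counting protocol below: ul_vnops_cnt is bumped before the i/o
 * is started and dropped on every exit path; the decrement that takes
 * it to zero (atomic_add_long_nv() returning 0) broadcasts ul_cv so a
 * waiting ufs_quiesce() can proceed.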
5889 */ 5890 if (vmpss || pp == NULL) { 5891 ulp = &ufsvfsp->vfs_ulockfs; 5892 if (pp == NULL) 5893 mutex_enter(&ulp->ul_lock); 5894 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) { 5895 if (pp == NULL) { 5896 mutex_exit(&ulp->ul_lock); 5897 } 5898 return (vmpss ? EIO : EINVAL); 5899 } 5900 atomic_add_long(&ulp->ul_vnops_cnt, 1); 5901 if (pp == NULL) 5902 mutex_exit(&ulp->ul_lock); 5903 if (ufs_quiesce_pend) { 5904 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5905 cv_broadcast(&ulp->ul_cv); 5906 return (vmpss ? EIO : EINVAL); 5907 } 5908 } 5909 5910 if (dolock) { 5911 /* 5912 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to 5913 * handle a fault against a segment that maps vnode pages with 5914 * large mappings. Segvn creates pages and holds them locked 5915 * SE_EXCL during VOP_PAGEIO() call. In this case we have to 5916 * use rw_tryenter() to avoid a potential deadlock since in 5917 * lock order i_contents needs to be taken first. 5918 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails. 5919 */ 5920 if (!vmpss) { 5921 rw_enter(&ip->i_contents, RW_READER); 5922 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) { 5923 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5924 cv_broadcast(&ulp->ul_cv); 5925 return (EDEADLK); 5926 } 5927 } 5928 5929 /* 5930 * Return an error to segvn because the pagefault request is beyond 5931 * PAGESIZE rounded EOF. 5932 */ 5933 if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) { 5934 if (dolock) 5935 rw_exit(&ip->i_contents); 5936 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5937 cv_broadcast(&ulp->ul_cv); 5938 return (EFAULT); 5939 } 5940 5941 if (pp == NULL) { 5942 if (bmap_has_holes(ip)) { 5943 err = ENOSYS; 5944 } else { 5945 err = EINVAL; 5946 } 5947 if (dolock) 5948 rw_exit(&ip->i_contents); 5949 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5950 cv_broadcast(&ulp->ul_cv); 5951 return (err); 5952 } 5953 5954 /* 5955 * Break the io request into chunks, one for each contiguous 5956 * stretch of disk blocks in the target file. 5957 */ 5958 while (done_len < io_len) { 5959 ASSERT(cpp); 5960 contig = 0; 5961 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len), 5962 &bn, &contig)) 5963 break; 5964 5965 if (bn == UFS_HOLE) { /* No holey swapfiles */ 5966 if (vmpss) { 5967 err = EFAULT; 5968 break; 5969 } 5970 err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE"); 5971 break; 5972 } 5973 5974 cur_len = MIN(io_len - done_len, contig); 5975 /* 5976 * Zero out a page beyond EOF, when the last block of 5977 * a file is a UFS fragment so that ufs_pageio() can be used 5978 * instead of ufs_getpage() to handle faults against 5979 * segvn segments that use large pages. 
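 * When that last block is a fragment smaller than a page, cur_len is
 * not a multiple of PAGESIZE, so the tail of the last page in this
 * chunk is zeroed below instead of being read from disk.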
5980 */ 5981 page_list_break(&cpp, &npp, btopr(cur_len)); 5982 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) { 5983 size_t xlen = cur_len & PAGEOFFSET; 5984 pagezero(cpp->p_prev, xlen, PAGESIZE - xlen); 5985 } 5986 5987 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 5988 ASSERT(bp != NULL); 5989 5990 bp->b_edev = ip->i_dev; 5991 bp->b_dev = cmpdev(ip->i_dev); 5992 bp->b_blkno = bn; 5993 bp->b_un.b_addr = (caddr_t)0; 5994 bp->b_file = ip->i_vnode; 5995 5996 ufsvfsp->vfs_iotstamp = lbolt; 5997 ub.ub_pageios.value.ul++; 5998 if (ufsvfsp->vfs_snapshot) 5999 fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp); 6000 else 6001 (void) bdev_strategy(bp); 6002 6003 if (flags & B_READ) 6004 ufs_pageio_reads++; 6005 else 6006 ufs_pageio_writes++; 6007 if (flags & B_READ) 6008 lwp_stat_update(LWP_STAT_INBLK, 1); 6009 else 6010 lwp_stat_update(LWP_STAT_OUBLK, 1); 6011 /* 6012 * If the request is not B_ASYNC, wait for i/o to complete 6013 * and re-assemble the page list to return to the caller. 6014 * If it is B_ASYNC we leave the page list in pieces and 6015 * cleanup() will dispose of them. 6016 */ 6017 if ((flags & B_ASYNC) == 0) { 6018 err = biowait(bp); 6019 pageio_done(bp); 6020 if (err) 6021 break; 6022 page_list_concat(&opp, &cpp); 6023 } 6024 cpp = npp; 6025 npp = NULL; 6026 if (flags & B_READ) 6027 cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t); 6028 done_len += cur_len; 6029 } 6030 ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len)); 6031 if (err) { 6032 if (flags & B_ASYNC) { 6033 /* Cleanup unprocessed parts of list */ 6034 page_list_concat(&cpp, &npp); 6035 if (flags & B_READ) 6036 pvn_read_done(cpp, B_ERROR); 6037 else 6038 pvn_write_done(cpp, B_ERROR); 6039 } else { 6040 /* Re-assemble list and let caller clean up */ 6041 page_list_concat(&opp, &cpp); 6042 page_list_concat(&opp, &npp); 6043 } 6044 } 6045 6046 if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) && 6047 ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) { 6048 mutex_enter(&ip->i_tlock); 6049 ip->i_flag |= IACC; 6050 ITIMES_NOLOCK(ip); 6051 mutex_exit(&ip->i_tlock); 6052 } 6053 6054 if (dolock) 6055 rw_exit(&ip->i_contents); 6056 if (vmpss && !atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 6057 cv_broadcast(&ulp->ul_cv); 6058 return (err); 6059 } 6060 6061 /* 6062 * Called when the kernel is in a frozen state to dump data 6063 * directly to the device. It uses a private dump data structure, 6064 * set up by dump_ctl, to locate the correct disk block to which to dump. 6065 */ 6066 static int 6067 ufs_dump(vnode_t *vp, caddr_t addr, int ldbn, int dblks) 6068 { 6069 u_offset_t file_size; 6070 struct inode *ip = VTOI(vp); 6071 struct fs *fs = ip->i_fs; 6072 daddr_t dbn, lfsbn; 6073 int disk_blks = fs->fs_bsize >> DEV_BSHIFT; 6074 int error = 0; 6075 int ndbs, nfsbs; 6076 6077 /* 6078 * forced unmount case 6079 */ 6080 if (ip->i_ufsvfs == NULL) 6081 return (EIO); 6082 /* 6083 * Validate the inode that it has not been modified since 6084 * the dump structure is allocated. 
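 * The check below compares against the dump_info->ip and i_mtime
 * snapshot recorded by ufs_dumpctl(DUMP_ALLOC); if the file has been
 * modified since the block map was cached, the dump is refused.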
6085 */ 6086 mutex_enter(&ip->i_tlock); 6087 if ((dump_info == NULL) || 6088 (dump_info->ip != ip) || 6089 (dump_info->time.tv_sec != ip->i_mtime.tv_sec) || 6090 (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) { 6091 mutex_exit(&ip->i_tlock); 6092 return (-1); 6093 } 6094 mutex_exit(&ip->i_tlock); 6095 6096 /* 6097 * See that the file has room for this write 6098 */ 6099 UFS_GET_ISIZE(&file_size, ip); 6100 6101 if (ldbtob((offset_t)(ldbn + dblks)) > file_size) 6102 return (ENOSPC); 6103 6104 /* 6105 * Find the physical disk block numbers from the dump 6106 * private data structure directly and write out the data 6107 * in contiguous block lumps 6108 */ 6109 while (dblks > 0 && !error) { 6110 lfsbn = (daddr_t)lblkno(fs, ldbtob((offset_t)ldbn)); 6111 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks; 6112 nfsbs = 1; 6113 ndbs = disk_blks - ldbn % disk_blks; 6114 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn + 6115 nfsbs]) == dbn + ndbs) { 6116 nfsbs++; 6117 ndbs += disk_blks; 6118 } 6119 if (ndbs > dblks) 6120 ndbs = dblks; 6121 error = bdev_dump(ip->i_dev, addr, dbn, ndbs); 6122 addr += ldbtob((offset_t)ndbs); 6123 dblks -= ndbs; 6124 ldbn += ndbs; 6125 } 6126 return (error); 6127 6128 } 6129 6130 /* 6131 * Prepare the file system before and after the dump operation. 6132 * 6133 * action = DUMP_ALLOC: 6134 * Preparation before dump, allocate dump private data structure 6135 * to hold all the direct and indirect block info for dump. 6136 * 6137 * action = DUMP_FREE: 6138 * Clean up after dump, deallocate the dump private data structure. 6139 * 6140 * action = DUMP_SCAN: 6141 * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space; 6142 * if found, the starting file-relative DEV_BSIZE lbn is written 6143 * to *bklp; that lbn is intended for use with VOP_DUMP() 6144 */ 6145 static int 6146 ufs_dumpctl(vnode_t *vp, int action, int *blkp) 6147 { 6148 struct inode *ip = VTOI(vp); 6149 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 6150 struct fs *fs; 6151 daddr32_t *dblk, *storeblk; 6152 daddr32_t *nextblk, *endblk; 6153 struct buf *bp; 6154 int i, entry, entries; 6155 int n, ncontig; 6156 6157 /* 6158 * check for forced unmount 6159 */ 6160 if (ufsvfsp == NULL) 6161 return (EIO); 6162 6163 if (action == DUMP_ALLOC) { 6164 /* 6165 * alloc and record dump_info 6166 */ 6167 if (dump_info != NULL) 6168 return (EINVAL); 6169 6170 ASSERT(vp->v_type == VREG); 6171 fs = ufsvfsp->vfs_fs; 6172 6173 rw_enter(&ip->i_contents, RW_READER); 6174 6175 if (bmap_has_holes(ip)) { 6176 rw_exit(&ip->i_contents); 6177 return (EFAULT); 6178 } 6179 6180 /* 6181 * calculate and allocate space needed according to i_size 6182 */ 6183 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size)); 6184 if ((dump_info = (struct dump *) 6185 kmem_alloc(sizeof (struct dump) + 6186 (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) { 6187 rw_exit(&ip->i_contents); 6188 return (ENOMEM); 6189 } 6190 6191 /* Start saving the info */ 6192 dump_info->fsbs = entries; 6193 dump_info->ip = ip; 6194 storeblk = &dump_info->dblk[0]; 6195 6196 /* Direct Blocks */ 6197 for (entry = 0; entry < NDADDR && entry < entries; entry++) 6198 *storeblk++ = ip->i_db[entry]; 6199 6200 /* Indirect Blocks */ 6201 for (i = 0; i < NIADDR; i++) { 6202 int error = 0; 6203 6204 bp = UFS_BREAD(ufsvfsp, 6205 ip->i_dev, fsbtodb(fs, ip->i_ib[i]), 6206 fs->fs_bsize); 6207 if (bp->b_flags & B_ERROR) 6208 error = EIO; 6209 else { 6210 dblk = bp->b_un.b_daddr; 6211 if ((storeblk = save_dblks(ip, ufsvfsp, 6212 storeblk, dblk, i, entries)) == NULL) 
static int
ufs_dumpctl(vnode_t *vp, int action, int *blkp)
{
        struct inode *ip = VTOI(vp);
        ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs;
        daddr32_t *dblk, *storeblk;
        daddr32_t *nextblk, *endblk;
        struct buf *bp;
        int i, entry, entries;
        int n, ncontig;

        /*
         * check for forced unmount
         */
        if (ufsvfsp == NULL)
                return (EIO);

        if (action == DUMP_ALLOC) {
                /*
                 * alloc and record dump_info
                 */
                if (dump_info != NULL)
                        return (EINVAL);

                ASSERT(vp->v_type == VREG);
                fs = ufsvfsp->vfs_fs;

                rw_enter(&ip->i_contents, RW_READER);

                if (bmap_has_holes(ip)) {
                        rw_exit(&ip->i_contents);
                        return (EFAULT);
                }

                /*
                 * calculate and allocate space needed according to i_size
                 */
                entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
                if ((dump_info = (struct dump *)
                    kmem_alloc(sizeof (struct dump) +
                    (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) {
                        rw_exit(&ip->i_contents);
                        return (ENOMEM);
                }

                /* Start saving the info */
                dump_info->fsbs = entries;
                dump_info->ip = ip;
                storeblk = &dump_info->dblk[0];

                /* Direct Blocks */
                for (entry = 0; entry < NDADDR && entry < entries; entry++)
                        *storeblk++ = ip->i_db[entry];

                /* Indirect Blocks */
                for (i = 0; i < NIADDR; i++) {
                        int error = 0;

                        bp = UFS_BREAD(ufsvfsp,
                            ip->i_dev, fsbtodb(fs, ip->i_ib[i]),
                            fs->fs_bsize);
                        if (bp->b_flags & B_ERROR)
                                error = EIO;
                        else {
                                dblk = bp->b_un.b_daddr;
                                if ((storeblk = save_dblks(ip, ufsvfsp,
                                    storeblk, dblk, i, entries)) == NULL)
                                        error = EIO;
                        }

                        brelse(bp);

                        if (error != 0) {
                                kmem_free(dump_info, sizeof (struct dump) +
                                    (entries - 1) * sizeof (daddr32_t));
                                rw_exit(&ip->i_contents);
                                dump_info = NULL;
                                return (error);
                        }
                }
                /* and time stamp the information */
                mutex_enter(&ip->i_tlock);
                dump_info->time = ip->i_mtime;
                mutex_exit(&ip->i_tlock);

                rw_exit(&ip->i_contents);
        } else if (action == DUMP_FREE) {
                /*
                 * free dump_info
                 */
                if (dump_info == NULL)
                        return (EINVAL);
                entries = dump_info->fsbs - 1;
                kmem_free(dump_info, sizeof (struct dump) +
                    entries * sizeof (daddr32_t));
                dump_info = NULL;
        } else if (action == DUMP_SCAN) {
                /*
                 * scan dump_info
                 */
                if (dump_info == NULL)
                        return (EINVAL);

                dblk = dump_info->dblk;
                nextblk = dblk + 1;
                endblk = dblk + dump_info->fsbs - 1;
                fs = ufsvfsp->vfs_fs;
                ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);

                /*
                 * scan dblk[] entries; contig fs space is found when:
                 * ((current blkno + frags per block) == next blkno)
                 */
                n = 0;
                while (n < ncontig && dblk < endblk) {
                        if ((*dblk + fs->fs_frag) == *nextblk)
                                n++;
                        else
                                n = 0;
                        dblk++;
                        nextblk++;
                }

                /*
                 * index is where size bytes of contig space begins;
                 * conversion from index to the file's DEV_BSIZE lbn
                 * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
                 */
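                /*
                 * Worked example (illustrative numbers only): with
                 * fs_bsize = 8192 (fs_bshift = 13) and DEV_BSIZE = 512
                 * (DEV_BSHIFT = 9), fs_bshift - DEV_BSHIFT = 4, so a
                 * request of *blkp = 64 device blocks asks for
                 * ncontig = 64 >> 4 = 4 contiguous file system blocks,
                 * and a matching run beginning at index 10 is returned
                 * as *blkp = 10 << 4 = 160, i.e. DEV_BSIZE lbn 160.
                 */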
                if (n == ncontig) {
                        i = (dblk - dump_info->dblk) - ncontig;
                        *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
                } else
                        return (EFAULT);
        }
        return (0);
}

/*
 * Recursive helper function for ufs_dumpctl().  It follows the indirect file
 * system blocks until it reaches the disk block addresses, which are
 * then stored into the given buffer, storeblk.
 */
static daddr32_t *
save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp, daddr32_t *storeblk,
    daddr32_t *dblk, int level, int entries)
{
        struct fs *fs = ufsvfsp->vfs_fs;
        struct buf *bp;
        int i;

        if (level == 0) {
                for (i = 0; i < NINDIR(fs); i++) {
                        if (storeblk - dump_info->dblk >= entries)
                                break;
                        *storeblk++ = dblk[i];
                }
                return (storeblk);
        }
        for (i = 0; i < NINDIR(fs); i++) {
                if (storeblk - dump_info->dblk >= entries)
                        break;
                bp = UFS_BREAD(ufsvfsp,
                    ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
                if (bp->b_flags & B_ERROR) {
                        brelse(bp);
                        return (NULL);
                }
                storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
                    level - 1, entries);
                brelse(bp);

                if (storeblk == NULL)
                        return (NULL);
        }
        return (storeblk);
}

/* ARGSUSED */
static int
ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
    struct cred *cr)
{
        struct inode *ip = VTOI(vp);
        struct ulockfs *ulp;
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        ulong_t vsa_mask = vsap->vsa_mask;
        int err = EINVAL;

        TRACE_3(TR_FAC_UFS, TR_UFS_GETSECATTR_START,
            "ufs_getsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);

        vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);

        /*
         * Only grab locks if needed - they're not needed to check vsa_mask
         * or if the mask contains no acl flags.
         */
        if (vsa_mask != 0) {
                if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
                    ULOCKFS_GETATTR_MASK))
                        return (err);

                rw_enter(&ip->i_contents, RW_READER);
                err = ufs_acl_get(ip, vsap, flag, cr);
                rw_exit(&ip->i_contents);

                if (ulp)
                        ufs_lockfs_end(ulp);
        }
        TRACE_1(TR_FAC_UFS, TR_UFS_GETSECATTR_END,
            "ufs_getsecattr_end:vp %p", vp);
        return (err);
}

/* ARGSUSED */
static int
ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
{
        struct inode *ip = VTOI(vp);
        struct ulockfs *ulp = NULL;
        struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
        ulong_t vsa_mask = vsap->vsa_mask;
        int err;
        int haverwlock = 1;
        int trans_size;
        int donetrans = 0;
        int retry = 1;

        TRACE_3(TR_FAC_UFS, TR_UFS_SETSECATTR_START,
            "ufs_setsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);

        ASSERT(RW_LOCK_HELD(&ip->i_rwlock));

        /* Abort now if the request is either empty or invalid. */
        vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
        if ((vsa_mask == 0) ||
            ((vsap->vsa_aclentp == NULL) &&
            (vsap->vsa_dfaclentp == NULL))) {
                err = EINVAL;
                goto out;
        }

        /*
         * Following convention, if this is a directory then we acquire the
         * inode's i_rwlock after starting a UFS logging transaction;
         * otherwise, we acquire it beforehand.  Since we were called (and
         * must therefore return) with the lock held, we will have to drop it,
         * and later reacquire it, if operating on a directory.
         */
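        /*
         * In outline, the directory case below is roughly:
         *
         *      rw_exit(&ip->i_rwlock);                 caller's lock dropped
         *      ufs_lockfs_begin(); TRANS_BEGIN_ASYNC();
         *      rw_enter(&ip->i_rwlock, RW_WRITER);     reacquired inside trans
         *      ... ufs_acl_set() ...
         *      rw_exit(&ip->i_rwlock);
         *      TRANS_END_ASYNC(); ufs_lockfs_end();
         *      rw_enter(&ip->i_rwlock, RW_READER);     held again on return
         *
         * Non-directories instead keep i_rwlock (upgraded to a writer if
         * necessary) across the transaction.
         */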
        if (vp->v_type == VDIR) {
                rw_exit(&ip->i_rwlock);
                haverwlock = 0;
        } else {
                /* Upgrade the lock if required. */
                if (!rw_write_held(&ip->i_rwlock)) {
                        rw_exit(&ip->i_rwlock);
                        rw_enter(&ip->i_rwlock, RW_WRITER);
                }
        }

again:
        ASSERT(!(vp->v_type == VDIR && haverwlock));
        if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
                ulp = NULL;
                retry = 0;
                goto out;
        }

        /*
         * Check that the file system supports this operation.  Note that
         * ufs_lockfs_begin() will have checked that the file system had
         * not been forcibly unmounted.
         */
        if (ufsvfsp->vfs_fs->fs_ronly) {
                err = EROFS;
                goto out;
        }
        if (ufsvfsp->vfs_nosetsec) {
                err = ENOSYS;
                goto out;
        }

        if (ulp) {
                TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
                    trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
                donetrans = 1;
        }

        if (vp->v_type == VDIR) {
                rw_enter(&ip->i_rwlock, RW_WRITER);
                haverwlock = 1;
        }

        ASSERT(haverwlock);

        /* Do the actual work. */
        rw_enter(&ip->i_contents, RW_WRITER);
        /*
         * Suppress out of inodes messages if we will retry.
         */
        if (retry)
                ip->i_flag |= IQUIET;
        err = ufs_acl_set(ip, vsap, flag, cr);
        ip->i_flag &= ~IQUIET;
        rw_exit(&ip->i_contents);

out:
        if (ulp) {
                if (donetrans) {
                        /*
                         * top_end_async() can eventually call
                         * top_end_sync(), which can block.  We must
                         * therefore observe the lock-ordering protocol
                         * here as well.
                         */
                        if (vp->v_type == VDIR) {
                                rw_exit(&ip->i_rwlock);
                                haverwlock = 0;
                        }
                        TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
                }
                ufs_lockfs_end(ulp);
        }
        /*
         * If no inodes available, try scaring a logically-
         * free one out of the delete queue to someplace
         * that we can find it.
         */
        if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
                ufs_delete_drain_wait(ufsvfsp, 1);
                retry = 0;
                if (vp->v_type == VDIR && haverwlock) {
                        rw_exit(&ip->i_rwlock);
                        haverwlock = 0;
                }
                goto again;
        }
        /*
         * If we need to reacquire the lock then it is safe to do so
         * as a reader.  This is because ufs_rwunlock(), which will be
         * called by our caller after we return, does not differentiate
         * between shared and exclusive locks.
         */
        if (!haverwlock) {
                ASSERT(vp->v_type == VDIR);
                rw_enter(&ip->i_rwlock, RW_READER);
        }

        TRACE_1(TR_FAC_UFS, TR_UFS_SETSECATTR_END,
            "ufs_setsecattr_end:vp %p", vp);
        return (err);
}