1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 #include <sys/types.h> 38 #include <sys/t_lock.h> 39 #include <sys/ksynch.h> 40 #include <sys/param.h> 41 #include <sys/time.h> 42 #include <sys/systm.h> 43 #include <sys/sysmacros.h> 44 #include <sys/resource.h> 45 #include <sys/signal.h> 46 #include <sys/cred.h> 47 #include <sys/user.h> 48 #include <sys/buf.h> 49 #include <sys/vfs.h> 50 #include <sys/vnode.h> 51 #include <sys/proc.h> 52 #include <sys/disp.h> 53 #include <sys/file.h> 54 #include <sys/fcntl.h> 55 #include <sys/flock.h> 56 #include <sys/atomic.h> 57 #include <sys/kmem.h> 58 #include <sys/uio.h> 59 #include <sys/dnlc.h> 60 #include <sys/conf.h> 61 #include <sys/mman.h> 62 #include <sys/pathname.h> 63 #include <sys/debug.h> 64 #include <sys/vmsystm.h> 65 #include <sys/cmn_err.h> 66 #include <sys/vtrace.h> 67 #include <sys/filio.h> 68 #include <sys/policy.h> 69 70 #include <sys/fs/ufs_fs.h> 71 #include <sys/fs/ufs_lockfs.h> 72 #include <sys/fs/ufs_filio.h> 73 #include <sys/fs/ufs_inode.h> 74 #include <sys/fs/ufs_fsdir.h> 75 #include <sys/fs/ufs_quota.h> 76 #include <sys/fs/ufs_log.h> 77 #include <sys/fs/ufs_snap.h> 78 #include <sys/fs/ufs_trans.h> 79 #include <sys/fs/ufs_panic.h> 80 #include <sys/fs/ufs_bio.h> 81 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! 
*/ 82 #include <sys/errno.h> 83 #include <sys/fssnap_if.h> 84 #include <sys/unistd.h> 85 #include <sys/sunddi.h> 86 87 #include <sys/filio.h> /* _FIOIO */ 88 89 #include <vm/hat.h> 90 #include <vm/page.h> 91 #include <vm/pvn.h> 92 #include <vm/as.h> 93 #include <vm/seg.h> 94 #include <vm/seg_map.h> 95 #include <vm/seg_vn.h> 96 #include <vm/seg_kmem.h> 97 #include <vm/rm.h> 98 #include <sys/swap.h> 99 100 #include <fs/fs_subr.h> 101 102 static struct instats ins; 103 104 static int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 105 static int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *, 106 caddr_t, struct page **, size_t, enum seg_rw, int); 107 static int ufs_open(struct vnode **, int, struct cred *); 108 static int ufs_close(struct vnode *, int, int, offset_t, struct cred *); 109 static int ufs_read(struct vnode *, struct uio *, int, struct cred *, 110 struct caller_context *); 111 static int ufs_write(struct vnode *, struct uio *, int, struct cred *, 112 struct caller_context *); 113 static int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *); 114 static int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *); 115 static int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *, 116 caller_context_t *); 117 static int ufs_access(struct vnode *, int, int, struct cred *); 118 static int ufs_lookup(struct vnode *, char *, struct vnode **, 119 struct pathname *, int, struct vnode *, struct cred *); 120 static int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl, 121 int, struct vnode **, struct cred *, int); 122 static int ufs_remove(struct vnode *, char *, struct cred *); 123 static int ufs_link(struct vnode *, struct vnode *, char *, struct cred *); 124 static int ufs_rename(struct vnode *, char *, struct vnode *, char *, 125 struct cred *); 126 static int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **, 127 struct cred *); 128 static int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *); 129 static int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *); 130 static int ufs_symlink(struct vnode *, char *, struct vattr *, char *, 131 struct cred *); 132 static int ufs_readlink(struct vnode *, struct uio *, struct cred *); 133 static int ufs_fsync(struct vnode *, int, struct cred *); 134 static void ufs_inactive(struct vnode *, struct cred *); 135 static int ufs_fid(struct vnode *, struct fid *); 136 static int ufs_rwlock(struct vnode *, int, caller_context_t *); 137 static void ufs_rwunlock(struct vnode *, int, caller_context_t *); 138 static int ufs_seek(struct vnode *, offset_t, offset_t *); 139 static int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 140 struct flk_callback *, struct cred *); 141 static int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t, 142 cred_t *, caller_context_t *); 143 static int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *, 144 struct page **, size_t, struct seg *, caddr_t, 145 enum seg_rw, struct cred *); 146 static int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *); 147 static int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *); 148 static int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 149 uchar_t, uchar_t, uint_t, struct cred *); 150 static int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 151 uchar_t, uchar_t, uint_t, struct cred *); 152 static int ufs_delmap(struct vnode *, offset_t, struct as 
*, caddr_t, size_t, 153 uint_t, uint_t, uint_t, struct cred *); 154 static int ufs_poll(vnode_t *, short, int, short *, struct pollhead **); 155 static int ufs_dump(vnode_t *, caddr_t, int, int); 156 static int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *); 157 static int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int, 158 struct cred *); 159 static int ufs_dump(vnode_t *, caddr_t, int, int); 160 static int ufs_dumpctl(vnode_t *, int, int *); 161 static daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *, 162 daddr32_t *, int, int); 163 static int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 164 static int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *); 165 166 /* 167 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions. 168 * 169 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet. 170 */ 171 struct vnodeops *ufs_vnodeops; 172 173 const fs_operation_def_t ufs_vnodeops_template[] = { 174 VOPNAME_OPEN, ufs_open, /* will not be blocked by lockfs */ 175 VOPNAME_CLOSE, ufs_close, /* will not be blocked by lockfs */ 176 VOPNAME_READ, ufs_read, 177 VOPNAME_WRITE, ufs_write, 178 VOPNAME_IOCTL, ufs_ioctl, 179 VOPNAME_GETATTR, ufs_getattr, 180 VOPNAME_SETATTR, ufs_setattr, 181 VOPNAME_ACCESS, ufs_access, 182 VOPNAME_LOOKUP, ufs_lookup, 183 VOPNAME_CREATE, ufs_create, 184 VOPNAME_REMOVE, ufs_remove, 185 VOPNAME_LINK, ufs_link, 186 VOPNAME_RENAME, ufs_rename, 187 VOPNAME_MKDIR, ufs_mkdir, 188 VOPNAME_RMDIR, ufs_rmdir, 189 VOPNAME_READDIR, ufs_readdir, 190 VOPNAME_SYMLINK, ufs_symlink, 191 VOPNAME_READLINK, ufs_readlink, 192 VOPNAME_FSYNC, ufs_fsync, 193 VOPNAME_INACTIVE, (fs_generic_func_p) ufs_inactive, /* not blocked */ 194 VOPNAME_FID, ufs_fid, 195 VOPNAME_RWLOCK, ufs_rwlock, /* not blocked */ 196 VOPNAME_RWUNLOCK, (fs_generic_func_p) ufs_rwunlock, /* not blocked */ 197 VOPNAME_SEEK, ufs_seek, 198 VOPNAME_FRLOCK, ufs_frlock, 199 VOPNAME_SPACE, ufs_space, 200 VOPNAME_GETPAGE, ufs_getpage, 201 VOPNAME_PUTPAGE, ufs_putpage, 202 VOPNAME_MAP, (fs_generic_func_p) ufs_map, 203 VOPNAME_ADDMAP, (fs_generic_func_p) ufs_addmap, /* not blocked */ 204 VOPNAME_DELMAP, ufs_delmap, /* will not be blocked by lockfs */ 205 VOPNAME_POLL, (fs_generic_func_p) ufs_poll, /* not blocked */ 206 VOPNAME_DUMP, ufs_dump, 207 VOPNAME_PATHCONF, ufs_l_pathconf, 208 VOPNAME_PAGEIO, ufs_pageio, 209 VOPNAME_DUMPCTL, ufs_dumpctl, 210 VOPNAME_GETSECATTR, ufs_getsecattr, 211 VOPNAME_SETSECATTR, ufs_setsecattr, 212 VOPNAME_VNEVENT, fs_vnevent_support, 213 NULL, NULL 214 }; 215 216 #define MAX_BACKFILE_COUNT 9999 217 218 /* 219 * Created by ufs_dumpctl() to store a file's disk block info into memory. 220 * Used by ufs_dump() to dump data to disk directly. 221 */ 222 struct dump { 223 struct inode *ip; /* the file we contain */ 224 daddr_t fsbs; /* number of blocks stored */ 225 struct timeval32 time; /* time stamp for the struct */ 226 daddr32_t dblk[1]; /* place holder for block info */ 227 }; 228 229 static struct dump *dump_info = NULL; 230 231 /* 232 * Previously there was no special action required for ordinary files. 233 * (Devices are handled through the device file system.) 234 * Now we support Large Files and Large File API requires open to 235 * fail if file is large. 236 * We could take care to prevent data corruption 237 * by doing an atomic check of size and truncate if file is opened with 238 * FTRUNC flag set but traditionally this is being done by the vfs/vnode 239 * layers. 
So taking care of truncation here is a change in the existing 240 * semantics of VOP_OPEN and therefore we chose not to implement any thing 241 * here. The check for the size of the file > 2GB is being done at the 242 * vfs layer in routine vn_open(). 243 */ 244 245 /* ARGSUSED */ 246 static int 247 ufs_open(struct vnode **vpp, int flag, struct cred *cr) 248 { 249 TRACE_1(TR_FAC_UFS, TR_UFS_OPEN, "ufs_open:vpp %p", vpp); 250 return (0); 251 } 252 253 /*ARGSUSED*/ 254 static int 255 ufs_close(struct vnode *vp, int flag, int count, offset_t offset, 256 struct cred *cr) 257 { 258 TRACE_1(TR_FAC_UFS, TR_UFS_CLOSE, "ufs_close:vp %p", vp); 259 260 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 261 cleanshares(vp, ttoproc(curthread)->p_pid); 262 263 /* 264 * Push partially filled cluster at last close. 265 * ``last close'' is approximated because the dnlc 266 * may have a hold on the vnode. 267 * Checking for VBAD here will also act as a forced umount check. 268 */ 269 if (vp->v_count <= 2 && vp->v_type != VBAD) { 270 struct inode *ip = VTOI(vp); 271 if (ip->i_delaylen) { 272 ins.in_poc.value.ul++; 273 (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen, 274 B_ASYNC | B_FREE, cr); 275 ip->i_delaylen = 0; 276 } 277 } 278 279 return (0); 280 } 281 282 /*ARGSUSED*/ 283 static int 284 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 285 struct caller_context *ct) 286 { 287 struct inode *ip = VTOI(vp); 288 struct ufsvfs *ufsvfsp; 289 struct ulockfs *ulp = NULL; 290 int error = 0; 291 int intrans = 0; 292 293 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 294 TRACE_3(TR_FAC_UFS, TR_UFS_READ_START, 295 "ufs_read_start:vp %p uiop %p ioflag %x", 296 vp, uiop, ioflag); 297 298 /* 299 * Mandatory locking needs to be done before ufs_lockfs_begin() 300 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep. 301 */ 302 if (MANDLOCK(vp, ip->i_mode)) { 303 /* 304 * ufs_getattr ends up being called by chklock 305 */ 306 error = chklock(vp, FREAD, uiop->uio_loffset, 307 uiop->uio_resid, uiop->uio_fmode, ct); 308 if (error) 309 goto out; 310 } 311 312 ufsvfsp = ip->i_ufsvfs; 313 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK); 314 if (error) 315 goto out; 316 317 /* 318 * In the case that a directory is opened for reading as a file 319 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set. 320 * The locking order had to be changed to avoid a deadlock with 321 * an update taking place on that directory at the same time. 322 */ 323 if ((ip->i_mode & IFMT) == IFDIR) { 324 325 rw_enter(&ip->i_contents, RW_READER); 326 error = rdip(ip, uiop, ioflag, cr); 327 rw_exit(&ip->i_contents); 328 329 if (error) { 330 if (ulp) 331 ufs_lockfs_end(ulp); 332 goto out; 333 } 334 335 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 336 TRANS_ISTRANS(ufsvfsp)) { 337 rw_exit(&ip->i_rwlock); 338 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 339 error); 340 ASSERT(!error); 341 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 342 TOP_READ_SIZE); 343 rw_enter(&ip->i_rwlock, RW_READER); 344 } 345 } else { 346 /* 347 * Only transact reads to files opened for sync-read and 348 * sync-write on a file system that is not write locked. 349 * 350 * The ``not write locked'' check prevents problems with 351 * enabling/disabling logging on a busy file system. E.g., 352 * logging exists at the beginning of the read but does not 353 * at the end. 
354 * 355 */ 356 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 357 TRANS_ISTRANS(ufsvfsp)) { 358 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 359 error); 360 ASSERT(!error); 361 intrans = 1; 362 } 363 364 rw_enter(&ip->i_contents, RW_READER); 365 error = rdip(ip, uiop, ioflag, cr); 366 rw_exit(&ip->i_contents); 367 368 if (intrans) { 369 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 370 TOP_READ_SIZE); 371 } 372 } 373 374 if (ulp) { 375 ufs_lockfs_end(ulp); 376 } 377 out: 378 379 TRACE_2(TR_FAC_UFS, TR_UFS_READ_END, 380 "ufs_read_end:vp %p error %d", vp, error); 381 return (error); 382 } 383 384 extern int ufs_HW; /* high water mark */ 385 extern int ufs_LW; /* low water mark */ 386 int ufs_WRITES = 1; /* XXX - enable/disable */ 387 int ufs_throttles = 0; /* throttling count */ 388 int ufs_allow_shared_writes = 1; /* directio shared writes */ 389 390 static int 391 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag) 392 { 393 int shared_write; 394 395 /* 396 * If the FDSYNC flag is set then ignore the global 397 * ufs_allow_shared_writes in this case. 398 */ 399 shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes; 400 401 /* 402 * Filter to determine if this request is suitable as a 403 * concurrent rewrite. This write must not allocate blocks 404 * by extending the file or filling in holes. No use trying 405 * through FSYNC descriptors as the inode will be synchronously 406 * updated after the write. The uio structure has not yet been 407 * checked for sanity, so assume nothing. 408 */ 409 return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) && 410 (uiop->uio_loffset >= (offset_t)0) && 411 (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) && 412 ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) && 413 !(ioflag & FSYNC) && !bmap_has_holes(ip) && 414 shared_write); 415 } 416 417 /*ARGSUSED*/ 418 static int 419 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr, 420 caller_context_t *ct) 421 { 422 struct inode *ip = VTOI(vp); 423 struct ufsvfs *ufsvfsp; 424 struct ulockfs *ulp; 425 int retry = 1; 426 int error, resv, resid = 0; 427 int directio_status; 428 int exclusive; 429 int rewriteflg; 430 long start_resid = uiop->uio_resid; 431 432 TRACE_3(TR_FAC_UFS, TR_UFS_WRITE_START, 433 "ufs_write_start:vp %p uiop %p ioflag %x", 434 vp, uiop, ioflag); 435 436 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 437 438 retry_mandlock: 439 /* 440 * Mandatory locking needs to be done before ufs_lockfs_begin() 441 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep. 442 * Check for forced unmounts normally done in ufs_lockfs_begin(). 443 */ 444 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 445 error = EIO; 446 goto out; 447 } 448 if (MANDLOCK(vp, ip->i_mode)) { 449 450 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 451 452 /* 453 * ufs_getattr ends up being called by chklock 454 */ 455 error = chklock(vp, FWRITE, uiop->uio_loffset, 456 uiop->uio_resid, uiop->uio_fmode, ct); 457 if (error) 458 goto out; 459 } 460 461 /* i_rwlock can change in chklock */ 462 exclusive = rw_write_held(&ip->i_rwlock); 463 rewriteflg = ufs_check_rewrite(ip, uiop, ioflag); 464 465 /* 466 * Check for fast-path special case of directio re-writes. 
467 */ 468 if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) && 469 !exclusive && rewriteflg) { 470 471 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 472 if (error) 473 goto out; 474 475 rw_enter(&ip->i_contents, RW_READER); 476 error = ufs_directio_write(ip, uiop, ioflag, 1, cr, 477 &directio_status); 478 if (directio_status == DIRECTIO_SUCCESS) { 479 uint_t i_flag_save; 480 481 if (start_resid != uiop->uio_resid) 482 error = 0; 483 /* 484 * Special treatment of access times for re-writes. 485 * If IMOD is not already set, then convert it 486 * to IMODACC for this operation. This defers 487 * entering a delta into the log until the inode 488 * is flushed. This mimics what is done for read 489 * operations and inode access time. 490 */ 491 mutex_enter(&ip->i_tlock); 492 i_flag_save = ip->i_flag; 493 ip->i_flag |= IUPD | ICHG; 494 ip->i_seq++; 495 ITIMES_NOLOCK(ip); 496 if ((i_flag_save & IMOD) == 0) { 497 ip->i_flag &= ~IMOD; 498 ip->i_flag |= IMODACC; 499 } 500 mutex_exit(&ip->i_tlock); 501 rw_exit(&ip->i_contents); 502 if (ulp) 503 ufs_lockfs_end(ulp); 504 goto out; 505 } 506 rw_exit(&ip->i_contents); 507 if (ulp) 508 ufs_lockfs_end(ulp); 509 } 510 511 if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) { 512 rw_exit(&ip->i_rwlock); 513 rw_enter(&ip->i_rwlock, RW_WRITER); 514 /* 515 * Mandatory locking could have been enabled 516 * after dropping the i_rwlock. 517 */ 518 if (MANDLOCK(vp, ip->i_mode)) 519 goto retry_mandlock; 520 } 521 522 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 523 if (error) 524 goto out; 525 526 /* 527 * Amount of log space needed for this write 528 */ 529 if (!rewriteflg || !(ioflag & FDSYNC)) 530 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid); 531 532 /* 533 * Throttle writes. 534 */ 535 if (ufs_WRITES && (ip->i_writes > ufs_HW)) { 536 mutex_enter(&ip->i_tlock); 537 while (ip->i_writes > ufs_HW) { 538 ufs_throttles++; 539 cv_wait(&ip->i_wrcv, &ip->i_tlock); 540 } 541 mutex_exit(&ip->i_tlock); 542 } 543 544 /* 545 * Enter Transaction 546 * 547 * If the write is a rewrite there is no need to open a transaction 548 * if the FDSYNC flag is set and not the FSYNC. In this case just 549 * set the IMODACC flag to modify do the update at a later time 550 * thus avoiding the overhead of the logging transaction that is 551 * not required. 552 */ 553 if (ioflag & (FSYNC|FDSYNC)) { 554 if (ulp) { 555 if (rewriteflg) { 556 uint_t i_flag_save; 557 558 rw_enter(&ip->i_contents, RW_READER); 559 mutex_enter(&ip->i_tlock); 560 i_flag_save = ip->i_flag; 561 ip->i_flag |= IUPD | ICHG; 562 ip->i_seq++; 563 ITIMES_NOLOCK(ip); 564 if ((i_flag_save & IMOD) == 0) { 565 ip->i_flag &= ~IMOD; 566 ip->i_flag |= IMODACC; 567 } 568 mutex_exit(&ip->i_tlock); 569 rw_exit(&ip->i_contents); 570 } else { 571 int terr = 0; 572 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, 573 terr); 574 ASSERT(!terr); 575 } 576 } 577 } else { 578 if (ulp) 579 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv); 580 } 581 582 /* 583 * Write the file 584 */ 585 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 586 rw_enter(&ip->i_contents, RW_WRITER); 587 if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) { 588 /* 589 * In append mode start at end of file. 590 */ 591 uiop->uio_loffset = ip->i_size; 592 } 593 594 /* 595 * Mild optimisation, don't call ufs_trans_write() unless we have to 596 * Also, suppress file system full messages if we will retry. 
597 */ 598 if (retry) 599 ip->i_flag |= IQUIET; 600 if (resid) { 601 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid); 602 } else { 603 error = wrip(ip, uiop, ioflag, cr); 604 } 605 ip->i_flag &= ~IQUIET; 606 607 rw_exit(&ip->i_contents); 608 rw_exit(&ufsvfsp->vfs_dqrwlock); 609 610 /* 611 * Leave Transaction 612 */ 613 if (ulp) { 614 if (ioflag & (FSYNC|FDSYNC)) { 615 if (!rewriteflg) { 616 int terr = 0; 617 618 TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC, 619 resv); 620 if (error == 0) 621 error = terr; 622 } 623 } else { 624 TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv); 625 } 626 ufs_lockfs_end(ulp); 627 } 628 out: 629 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 630 /* 631 * Any blocks tied up in pending deletes? 632 */ 633 ufs_delete_drain_wait(ufsvfsp, 1); 634 retry = 0; 635 goto retry_mandlock; 636 } 637 638 if (error == ENOSPC && (start_resid != uiop->uio_resid)) 639 error = 0; 640 641 TRACE_2(TR_FAC_UFS, TR_UFS_WRITE_END, 642 "ufs_write_end:vp %p error %d", vp, error); 643 return (error); 644 } 645 646 /* 647 * Don't cache write blocks to files with the sticky bit set. 648 * Used to keep swap files from blowing the page cache on a server. 649 */ 650 int stickyhack = 1; 651 652 /* 653 * Free behind hacks. The pager is busted. 654 * XXX - need to pass the information down to writedone() in a flag like B_SEQ 655 * or B_FREE_IF_TIGHT_ON_MEMORY. 656 */ 657 int freebehind = 1; 658 int smallfile = 0; 659 u_offset_t smallfile64 = 32 * 1024; 660 661 /* 662 * While we should, in most cases, cache the pages for write, we 663 * may also want to cache the pages for read as long as they are 664 * frequently re-usable. 665 * 666 * If cache_read_ahead = 1, the pages for read will go to the tail 667 * of the cache list when they are released, otherwise go to the head. 668 */ 669 int cache_read_ahead = 0; 670 671 /* 672 * Freebehind exists so that as we read large files sequentially we 673 * don't consume most of memory with pages from a few files. It takes 674 * longer to re-read from disk multiple small files as it does reading 675 * one large one sequentially. As system memory grows customers need 676 * to retain bigger chunks of files in memory. The advent of the 677 * cachelist opens up of the possibility freeing pages to the head or 678 * tail of the list. 679 * 680 * Not freeing a page is a bet that the page will be read again before 681 * it's segmap slot is needed for something else. If we loose the bet, 682 * it means some other thread is burdened with the page free we did 683 * not do. If we win we save a free and reclaim. 684 * 685 * Freeing it at the tail vs the head of cachelist is a bet that the 686 * page will survive until the next read. It's also saying that this 687 * page is more likely to be re-used than a page freed some time ago 688 * and never reclaimed. 689 * 690 * Freebehind maintains a range of file offset [smallfile1; smallfile2] 691 * 692 * 0 < offset < smallfile1 : pages are not freed. 693 * smallfile1 < offset < smallfile2 : pages freed to tail of cachelist. 694 * smallfile2 < offset : pages freed to head of cachelist. 695 * 696 * The range is computed at most once per second and depends on 697 * freemem and ncpus_online. Both parameters are bounded to be 698 * >= smallfile && >= smallfile64. 
699 * 700 * smallfile1 = (free memory / ncpu) / 1000 701 * smallfile2 = (free memory / ncpu) / 10 702 * 703 * A few examples values: 704 * 705 * Free Mem (in Bytes) [smallfile1; smallfile2] [smallfile1; smallfile2] 706 * ncpus_online = 4 ncpus_online = 64 707 * ------------------ ----------------------- ----------------------- 708 * 1G [256K; 25M] [32K; 1.5M] 709 * 10G [2.5M; 250M] [156K; 15M] 710 * 100G [25M; 2.5G] [1.5M; 150M] 711 * 712 */ 713 714 #define SMALLFILE1_D 1000 715 #define SMALLFILE2_D 10 716 static u_offset_t smallfile1 = 32 * 1024; 717 static u_offset_t smallfile2 = 32 * 1024; 718 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */ 719 uint_t smallfile1_d = SMALLFILE1_D; 720 uint_t smallfile2_d = SMALLFILE2_D; 721 722 /* 723 * wrip does the real work of write requests for ufs. 724 */ 725 int 726 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr) 727 { 728 rlim64_t limit = uio->uio_llimit; 729 u_offset_t off; 730 u_offset_t old_i_size; 731 struct fs *fs; 732 struct vnode *vp; 733 struct ufsvfs *ufsvfsp; 734 caddr_t base; 735 long start_resid = uio->uio_resid; /* save starting resid */ 736 long premove_resid; /* resid before uiomove() */ 737 uint_t flags; 738 int newpage; 739 int iupdat_flag, directio_status; 740 int n, on, mapon; 741 int error, pagecreate; 742 int do_dqrwlock; /* drop/reacquire vfs_dqrwlock */ 743 int32_t iblocks; 744 int new_iblocks; 745 746 /* 747 * ip->i_size is incremented before the uiomove 748 * is done on a write. If the move fails (bad user 749 * address) reset ip->i_size. 750 * The better way would be to increment ip->i_size 751 * only if the uiomove succeeds. 752 */ 753 int i_size_changed = 0; 754 o_mode_t type; 755 int i_seq_needed = 0; 756 757 vp = ITOV(ip); 758 759 /* 760 * check for forced unmount - should not happen as 761 * the request passed the lockfs checks. 
762 */ 763 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 764 return (EIO); 765 766 fs = ip->i_fs; 767 768 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 769 "ufs_wrip_start:vp %p", vp); 770 771 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 772 773 /* check for valid filetype */ 774 type = ip->i_mode & IFMT; 775 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 776 (type != IFLNK) && (type != IFSHAD)) { 777 return (EIO); 778 } 779 780 /* 781 * the actual limit of UFS file size 782 * is UFS_MAXOFFSET_T 783 */ 784 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 785 limit = MAXOFFSET_T; 786 787 if (uio->uio_loffset >= limit) { 788 proc_t *p = ttoproc(curthread); 789 790 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 791 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 792 793 mutex_enter(&p->p_lock); 794 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 795 p, RCA_UNSAFE_SIGINFO); 796 mutex_exit(&p->p_lock); 797 return (EFBIG); 798 } 799 800 /* 801 * if largefiles are disallowed, the limit is 802 * the pre-largefiles value of 2GB 803 */ 804 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 805 limit = MIN(UFS_MAXOFFSET_T, limit); 806 else 807 limit = MIN(MAXOFF32_T, limit); 808 809 if (uio->uio_loffset < (offset_t)0) { 810 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 811 "ufs_wrip_end:vp %p error %d", vp, EINVAL); 812 return (EINVAL); 813 } 814 if (uio->uio_resid == 0) { 815 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 816 "ufs_wrip_end:vp %p error %d", vp, 0); 817 return (0); 818 } 819 820 if (uio->uio_loffset >= limit) 821 return (EFBIG); 822 823 ip->i_flag |= INOACC; /* don't update ref time in getpage */ 824 825 if (ioflag & (FSYNC|FDSYNC)) { 826 ip->i_flag |= ISYNC; 827 iupdat_flag = 1; 828 } 829 /* 830 * Try to go direct 831 */ 832 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 833 uio->uio_llimit = limit; 834 error = ufs_directio_write(ip, uio, ioflag, 0, cr, 835 &directio_status); 836 /* 837 * If ufs_directio wrote to the file or set the flags, 838 * we need to update i_seq, but it may be deferred. 839 */ 840 if (start_resid != uio->uio_resid || 841 (ip->i_flag & (ICHG|IUPD))) { 842 i_seq_needed = 1; 843 ip->i_flag |= ISEQ; 844 } 845 if (directio_status == DIRECTIO_SUCCESS) 846 goto out; 847 } 848 849 /* 850 * Behavior with respect to dropping/reacquiring vfs_dqrwlock: 851 * 852 * o shadow inodes: vfs_dqrwlock is not held at all 853 * o quota updates: vfs_dqrwlock is read or write held 854 * o other updates: vfs_dqrwlock is read held 855 * 856 * The first case is the only one where we do not hold 857 * vfs_dqrwlock at all while entering wrip(). 858 * We must make sure not to downgrade/drop vfs_dqrwlock if we 859 * have it as writer, i.e. if we are updating the quota inode. 860 * There is no potential deadlock scenario in this case as 861 * ufs_getpage() takes care of this and avoids reacquiring 862 * vfs_dqrwlock in that case. 863 * 864 * This check is done here since the above conditions do not change 865 * and we possibly loop below, so save a few cycles. 866 */ 867 if ((type == IFSHAD) || 868 (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) { 869 do_dqrwlock = 0; 870 } else { 871 do_dqrwlock = 1; 872 } 873 874 /* 875 * Large Files: We cast MAXBMASK to offset_t 876 * inorder to mask out the higher bits. Since offset_t 877 * is a signed value, the high order bit set in MAXBMASK 878 * value makes it do the right thing by having all bits 1 879 * in the higher word. May be removed for _SOLARIS64_. 
880 */ 881 882 fs = ip->i_fs; 883 do { 884 u_offset_t uoff = uio->uio_loffset; 885 off = uoff & (offset_t)MAXBMASK; 886 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 887 on = (int)blkoff(fs, uoff); 888 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid); 889 new_iblocks = 1; 890 891 if (type == IFREG && uoff + n >= limit) { 892 if (uoff >= limit) { 893 error = EFBIG; 894 goto out; 895 } 896 /* 897 * since uoff + n >= limit, 898 * therefore n >= limit - uoff, and n is an int 899 * so it is safe to cast it to an int 900 */ 901 n = (int)(limit - (rlim64_t)uoff); 902 } 903 if (uoff + n > ip->i_size) { 904 /* 905 * We are extending the length of the file. 906 * bmap is used so that we are sure that 907 * if we need to allocate new blocks, that it 908 * is done here before we up the file size. 909 */ 910 error = bmap_write(ip, uoff, (int)(on + n), 911 mapon == 0, cr); 912 /* 913 * bmap_write never drops i_contents so if 914 * the flags are set it changed the file. 915 */ 916 if (ip->i_flag & (ICHG|IUPD)) { 917 i_seq_needed = 1; 918 ip->i_flag |= ISEQ; 919 } 920 if (error) 921 break; 922 /* 923 * There is a window of vulnerability here. 924 * The sequence of operations: allocate file 925 * system blocks, uiomove the data into pages, 926 * and then update the size of the file in the 927 * inode, must happen atomically. However, due 928 * to current locking constraints, this can not 929 * be done. 930 */ 931 ASSERT(ip->i_writer == NULL); 932 ip->i_writer = curthread; 933 i_size_changed = 1; 934 /* 935 * If we are writing from the beginning of 936 * the mapping, we can just create the 937 * pages without having to read them. 938 */ 939 pagecreate = (mapon == 0); 940 } else if (n == MAXBSIZE) { 941 /* 942 * Going to do a whole mappings worth, 943 * so we can just create the pages w/o 944 * having to read them in. But before 945 * we do that, we need to make sure any 946 * needed blocks are allocated first. 947 */ 948 iblocks = ip->i_blocks; 949 error = bmap_write(ip, uoff, (int)(on + n), 1, cr); 950 /* 951 * bmap_write never drops i_contents so if 952 * the flags are set it changed the file. 953 */ 954 if (ip->i_flag & (ICHG|IUPD)) { 955 i_seq_needed = 1; 956 ip->i_flag |= ISEQ; 957 } 958 if (error) 959 break; 960 pagecreate = 1; 961 /* 962 * check if the new created page needed the 963 * allocation of new disk blocks. 964 */ 965 if (iblocks == ip->i_blocks) 966 new_iblocks = 0; /* no new blocks allocated */ 967 } else { 968 pagecreate = 0; 969 /* 970 * In sync mode flush the indirect blocks which 971 * may have been allocated and not written on 972 * disk. In above cases bmap_write will allocate 973 * in sync mode. 974 */ 975 if (ioflag & (FSYNC|FDSYNC)) { 976 error = ufs_indirblk_sync(ip, uoff); 977 if (error) 978 break; 979 } 980 } 981 982 /* 983 * At this point we can enter ufs_getpage() in one 984 * of two ways: 985 * 1) segmap_getmapflt() calls ufs_getpage() when the 986 * forcefault parameter is true (pagecreate == 0) 987 * 2) uiomove() causes a page fault. 988 * 989 * We have to drop the contents lock to prevent the VM 990 * system from trying to reaquire it in ufs_getpage() 991 * should the uiomove cause a pagefault. 992 * 993 * We have to drop the reader vfs_dqrwlock here as well. 
994 */ 995 rw_exit(&ip->i_contents); 996 if (do_dqrwlock) { 997 ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 998 ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock))); 999 rw_exit(&ufsvfsp->vfs_dqrwlock); 1000 } 1001 1002 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1003 (uint_t)n, !pagecreate, S_WRITE); 1004 1005 /* 1006 * segmap_pagecreate() returns 1 if it calls 1007 * page_create_va() to allocate any pages. 1008 */ 1009 newpage = 0; 1010 1011 if (pagecreate) 1012 newpage = segmap_pagecreate(segkmap, base, 1013 (size_t)n, 0); 1014 1015 premove_resid = uio->uio_resid; 1016 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 1017 1018 /* 1019 * If "newpage" is set, then a new page was created and it 1020 * does not contain valid data, so it needs to be initialized 1021 * at this point. 1022 * Otherwise the page contains old data, which was overwritten 1023 * partially or as a whole in uiomove. 1024 * If there is only one iovec structure within uio, then 1025 * on error uiomove will not be able to update uio->uio_loffset 1026 * and we would zero the whole page here! 1027 * 1028 * If uiomove fails because of an error, the old valid data 1029 * is kept instead of filling the rest of the page with zero's. 1030 */ 1031 if (newpage && 1032 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 1033 /* 1034 * We created pages w/o initializing them completely, 1035 * thus we need to zero the part that wasn't set up. 1036 * This happens on most EOF write cases and if 1037 * we had some sort of error during the uiomove. 1038 */ 1039 int nzero, nmoved; 1040 1041 nmoved = (int)(uio->uio_loffset - (off + mapon)); 1042 ASSERT(nmoved >= 0 && nmoved <= n); 1043 nzero = roundup(on + n, PAGESIZE) - nmoved; 1044 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 1045 (void) kzero(base + mapon + nmoved, (uint_t)nzero); 1046 } 1047 1048 /* 1049 * Unlock the pages allocated by page_create_va() 1050 * in segmap_pagecreate() 1051 */ 1052 if (newpage) 1053 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 1054 1055 /* 1056 * If the size of the file changed, then update the 1057 * size field in the inode now. This can't be done 1058 * before the call to segmap_pageunlock or there is 1059 * a potential deadlock with callers to ufs_putpage(). 1060 * They will be holding i_contents and trying to lock 1061 * a page, while this thread is holding a page locked 1062 * and trying to acquire i_contents. 1063 */ 1064 if (i_size_changed) { 1065 rw_enter(&ip->i_contents, RW_WRITER); 1066 old_i_size = ip->i_size; 1067 UFS_SET_ISIZE(uoff + n, ip); 1068 TRANS_INODE(ufsvfsp, ip); 1069 /* 1070 * file has grown larger than 2GB. Set flag 1071 * in superblock to indicate this, if it 1072 * is not already set. 1073 */ 1074 if ((ip->i_size > MAXOFF32_T) && 1075 !(fs->fs_flags & FSLARGEFILES)) { 1076 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1077 mutex_enter(&ufsvfsp->vfs_lock); 1078 fs->fs_flags |= FSLARGEFILES; 1079 ufs_sbwrite(ufsvfsp); 1080 mutex_exit(&ufsvfsp->vfs_lock); 1081 } 1082 mutex_enter(&ip->i_tlock); 1083 ip->i_writer = NULL; 1084 cv_broadcast(&ip->i_wrcv); 1085 mutex_exit(&ip->i_tlock); 1086 rw_exit(&ip->i_contents); 1087 } 1088 1089 if (error) { 1090 /* 1091 * If we failed on a write, we may have already 1092 * allocated file blocks as well as pages. It's 1093 * hard to undo the block allocation, but we must 1094 * be sure to invalidate any pages that may have 1095 * been allocated. 
1096 * 1097 * If the page was created without initialization 1098 * then we must check if it should be possible 1099 * to destroy the new page and to keep the old data 1100 * on the disk. 1101 * 1102 * It is possible to destroy the page without 1103 * having to write back its contents only when 1104 * - the size of the file keeps unchanged 1105 * - bmap_write() did not allocate new disk blocks 1106 * it is possible to create big files using "seek" and 1107 * write to the end of the file. A "write" to a 1108 * position before the end of the file would not 1109 * change the size of the file but it would allocate 1110 * new disk blocks. 1111 * - uiomove intended to overwrite the whole page. 1112 * - a new page was created (newpage == 1). 1113 */ 1114 1115 if (i_size_changed == 0 && new_iblocks == 0 && 1116 newpage) { 1117 1118 /* unwind what uiomove eventually last did */ 1119 uio->uio_resid = premove_resid; 1120 1121 /* 1122 * destroy the page, do not write ambiguous 1123 * data to the disk. 1124 */ 1125 flags = SM_DESTROY; 1126 } else { 1127 /* 1128 * write the page back to the disk, if dirty, 1129 * and remove the page from the cache. 1130 */ 1131 flags = SM_INVAL; 1132 } 1133 (void) segmap_release(segkmap, base, flags); 1134 } else { 1135 flags = 0; 1136 /* 1137 * Force write back for synchronous write cases. 1138 */ 1139 if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) { 1140 /* 1141 * If the sticky bit is set but the 1142 * execute bit is not set, we do a 1143 * synchronous write back and free 1144 * the page when done. We set up swap 1145 * files to be handled this way to 1146 * prevent servers from keeping around 1147 * the client's swap pages too long. 1148 * XXX - there ought to be a better way. 1149 */ 1150 if (IS_SWAPVP(vp)) { 1151 flags = SM_WRITE | SM_FREE | 1152 SM_DONTNEED; 1153 iupdat_flag = 0; 1154 } else { 1155 flags = SM_WRITE; 1156 } 1157 } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 1158 /* 1159 * Have written a whole block. 1160 * Start an asynchronous write and 1161 * mark the buffer to indicate that 1162 * it won't be needed again soon. 1163 */ 1164 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 1165 } 1166 error = segmap_release(segkmap, base, flags); 1167 /* 1168 * If the operation failed and is synchronous, 1169 * then we need to unwind what uiomove() last 1170 * did so we can potentially return an error to 1171 * the caller. If this write operation was 1172 * done in two pieces and the first succeeded, 1173 * then we won't return an error for the second 1174 * piece that failed. However, we only want to 1175 * return a resid value that reflects what was 1176 * really done. 1177 * 1178 * Failures for non-synchronous operations can 1179 * be ignored since the page subsystem will 1180 * retry the operation until it succeeds or the 1181 * file system is unmounted. 1182 */ 1183 if (error) { 1184 if ((ioflag & (FSYNC | FDSYNC)) || 1185 type == IFDIR) { 1186 uio->uio_resid = premove_resid; 1187 } else { 1188 error = 0; 1189 } 1190 } 1191 } 1192 1193 /* 1194 * Re-acquire contents lock. 1195 * If it was dropped, reacquire reader vfs_dqrwlock as well. 1196 */ 1197 if (do_dqrwlock) 1198 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1199 rw_enter(&ip->i_contents, RW_WRITER); 1200 1201 /* 1202 * If the uiomove() failed or if a synchronous 1203 * page push failed, fix up i_size. 1204 */ 1205 if (error) { 1206 if (i_size_changed) { 1207 /* 1208 * The uiomove failed, and we 1209 * allocated blocks,so get rid 1210 * of them. 
1211 */ 1212 (void) ufs_itrunc(ip, old_i_size, 0, cr); 1213 } 1214 } else { 1215 /* 1216 * XXX - Can this be out of the loop? 1217 */ 1218 ip->i_flag |= IUPD | ICHG; 1219 /* 1220 * Only do one increase of i_seq for multiple 1221 * pieces. Because we drop locks, record 1222 * the fact that we changed the timestamp and 1223 * are deferring the increase in case another thread 1224 * pushes our timestamp update. 1225 */ 1226 i_seq_needed = 1; 1227 ip->i_flag |= ISEQ; 1228 if (i_size_changed) 1229 ip->i_flag |= IATTCHG; 1230 if ((ip->i_mode & (IEXEC | (IEXEC >> 3) | 1231 (IEXEC >> 6))) != 0 && 1232 (ip->i_mode & (ISUID | ISGID)) != 0 && 1233 secpolicy_vnode_setid_retain(cr, 1234 (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) { 1235 /* 1236 * Clear Set-UID & Set-GID bits on 1237 * successful write if not privileged 1238 * and at least one of the execute bits 1239 * is set. If we always clear Set-GID, 1240 * mandatory file and record locking is 1241 * unuseable. 1242 */ 1243 ip->i_mode &= ~(ISUID | ISGID); 1244 } 1245 } 1246 /* 1247 * In the case the FDSYNC flag is set and this is a 1248 * "rewrite" we won't log a delta. 1249 * The FSYNC flag overrides all cases. 1250 */ 1251 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) { 1252 TRANS_INODE(ufsvfsp, ip); 1253 } 1254 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1255 1256 out: 1257 /* 1258 * Make sure i_seq is increased at least once per write 1259 */ 1260 if (i_seq_needed) { 1261 ip->i_seq++; 1262 ip->i_flag &= ~ISEQ; /* no longer deferred */ 1263 } 1264 1265 /* 1266 * Inode is updated according to this table - 1267 * 1268 * FSYNC FDSYNC(posix.4) 1269 * -------------------------- 1270 * always@ IATTCHG|IBDWRITE 1271 * 1272 * @ - If we are doing synchronous write the only time we should 1273 * not be sync'ing the ip here is if we have the stickyhack 1274 * activated, the file is marked with the sticky bit and 1275 * no exec bit, the file length has not been changed and 1276 * no new blocks have been allocated during this write. 1277 */ 1278 1279 if ((ip->i_flag & ISYNC) != 0) { 1280 /* 1281 * we have eliminated nosync 1282 */ 1283 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 1284 ((ioflag & FSYNC) && iupdat_flag)) { 1285 ufs_iupdat(ip, 1); 1286 } 1287 } 1288 1289 /* 1290 * If we've already done a partial-write, terminate 1291 * the write but return no error unless the error is ENOSPC 1292 * because the caller can detect this and free resources and 1293 * try again. 1294 */ 1295 if ((start_resid != uio->uio_resid) && (error != ENOSPC)) 1296 error = 0; 1297 1298 ip->i_flag &= ~(INOACC | ISYNC); 1299 ITIMES_NOLOCK(ip); 1300 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1301 "ufs_wrip_end:vp %p error %d", vp, error); 1302 return (error); 1303 } 1304 1305 /* 1306 * rdip does the real work of read requests for ufs. 
1307 */ 1308 int 1309 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr) 1310 { 1311 u_offset_t off; 1312 caddr_t base; 1313 struct fs *fs; 1314 struct ufsvfs *ufsvfsp; 1315 struct vnode *vp; 1316 long oresid = uio->uio_resid; 1317 u_offset_t n, on, mapon; 1318 int error = 0; 1319 int doupdate = 1; 1320 uint_t flags; 1321 int dofree, directio_status; 1322 krw_t rwtype; 1323 o_mode_t type; 1324 1325 vp = ITOV(ip); 1326 1327 TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START, 1328 "ufs_rdip_start:vp %p", vp); 1329 1330 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 1331 1332 ufsvfsp = ip->i_ufsvfs; 1333 1334 if (ufsvfsp == NULL) 1335 return (EIO); 1336 1337 fs = ufsvfsp->vfs_fs; 1338 1339 /* check for valid filetype */ 1340 type = ip->i_mode & IFMT; 1341 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 1342 (type != IFLNK) && (type != IFSHAD)) { 1343 return (EIO); 1344 } 1345 1346 if (uio->uio_loffset > UFS_MAXOFFSET_T) { 1347 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1348 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1349 error = 0; 1350 goto out; 1351 } 1352 if (uio->uio_loffset < (offset_t)0) { 1353 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1354 "ufs_rdip_end:vp %p error %d", vp, EINVAL); 1355 return (EINVAL); 1356 } 1357 if (uio->uio_resid == 0) { 1358 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1359 "ufs_rdip_end:vp %p error %d", vp, 0); 1360 return (0); 1361 } 1362 1363 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) && 1364 (!ufsvfsp->vfs_noatime)) { 1365 mutex_enter(&ip->i_tlock); 1366 ip->i_flag |= IACC; 1367 mutex_exit(&ip->i_tlock); 1368 } 1369 /* 1370 * Try to go direct 1371 */ 1372 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 1373 error = ufs_directio_read(ip, uio, cr, &directio_status); 1374 if (directio_status == DIRECTIO_SUCCESS) 1375 goto out; 1376 } 1377 1378 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 1379 1380 do { 1381 offset_t diff; 1382 u_offset_t uoff = uio->uio_loffset; 1383 off = uoff & (offset_t)MAXBMASK; 1384 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET); 1385 on = (u_offset_t)blkoff(fs, uoff); 1386 n = MIN((u_offset_t)fs->fs_bsize - on, 1387 (u_offset_t)uio->uio_resid); 1388 1389 diff = ip->i_size - uoff; 1390 1391 if (diff <= (offset_t)0) { 1392 error = 0; 1393 goto out; 1394 } 1395 if (diff < (offset_t)n) 1396 n = (int)diff; 1397 1398 /* 1399 * We update smallfile2 and smallfile1 at most every second. 1400 */ 1401 if (lbolt >= smallfile_update) { 1402 uint64_t percpufreeb; 1403 if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D; 1404 if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D; 1405 percpufreeb = ptob((uint64_t)freemem) / ncpus_online; 1406 smallfile1 = percpufreeb / smallfile1_d; 1407 smallfile2 = percpufreeb / smallfile2_d; 1408 smallfile1 = MAX(smallfile1, smallfile); 1409 smallfile1 = MAX(smallfile1, smallfile64); 1410 smallfile2 = MAX(smallfile1, smallfile2); 1411 smallfile_update = lbolt + hz; 1412 } 1413 1414 dofree = freebehind && 1415 ip->i_nextr == (off & PAGEMASK) && off > smallfile1; 1416 1417 /* 1418 * At this point we can enter ufs_getpage() in one of two 1419 * ways: 1420 * 1) segmap_getmapflt() calls ufs_getpage() when the 1421 * forcefault parameter is true (value of 1 is passed) 1422 * 2) uiomove() causes a page fault. 1423 * 1424 * We cannot hold onto an i_contents reader lock without 1425 * risking deadlock in ufs_getpage() so drop a reader lock. 1426 * The ufs_getpage() dolock logic already allows for a 1427 * thread holding i_contents as writer to work properly 1428 * so we keep a writer lock. 
1429 */ 1430 if (rwtype == RW_READER) 1431 rw_exit(&ip->i_contents); 1432 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1433 (uint_t)n, 1, S_READ); 1434 1435 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 1436 1437 flags = 0; 1438 if (!error) { 1439 /* 1440 * If reading sequential we won't need this 1441 * buffer again soon. For offsets in range 1442 * [smallfile1, smallfile2] release the pages 1443 * at the tail of the cache list, larger 1444 * offsets are released at the head. 1445 */ 1446 if (dofree) { 1447 flags = SM_FREE | SM_ASYNC; 1448 if ((cache_read_ahead == 0) && 1449 (off > smallfile2)) 1450 flags |= SM_DONTNEED; 1451 } 1452 /* 1453 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 1454 * we want to make sure that the page which has 1455 * been read, is written on disk if it is dirty. 1456 * And corresponding indirect blocks should also 1457 * be flushed out. 1458 */ 1459 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 1460 flags &= ~SM_ASYNC; 1461 flags |= SM_WRITE; 1462 } 1463 error = segmap_release(segkmap, base, flags); 1464 } else 1465 (void) segmap_release(segkmap, base, flags); 1466 1467 if (rwtype == RW_READER) 1468 rw_enter(&ip->i_contents, rwtype); 1469 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1470 out: 1471 /* 1472 * Inode is updated according to this table if FRSYNC is set. 1473 * 1474 * FSYNC FDSYNC(posix.4) 1475 * -------------------------- 1476 * always IATTCHG|IBDWRITE 1477 */ 1478 /* 1479 * The inode is not updated if we're logging and the inode is a 1480 * directory with FRSYNC, FSYNC and FDSYNC flags set. 1481 */ 1482 if (ioflag & FRSYNC) { 1483 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) { 1484 doupdate = 0; 1485 } 1486 if (doupdate) { 1487 if ((ioflag & FSYNC) || 1488 ((ioflag & FDSYNC) && 1489 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 1490 ufs_iupdat(ip, 1); 1491 } 1492 } 1493 } 1494 /* 1495 * If we've already done a partial read, terminate 1496 * the read but return no error. 
1497 */ 1498 if (oresid != uio->uio_resid) 1499 error = 0; 1500 ITIMES(ip); 1501 1502 TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END, 1503 "ufs_rdip_end:vp %p error %d", vp, error); 1504 return (error); 1505 } 1506 1507 /* ARGSUSED */ 1508 static int 1509 ufs_ioctl( 1510 struct vnode *vp, 1511 int cmd, 1512 intptr_t arg, 1513 int flag, 1514 struct cred *cr, 1515 int *rvalp) 1516 { 1517 struct lockfs lockfs, lockfs_out; 1518 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 1519 char *comment, *original_comment; 1520 struct fs *fs; 1521 struct ulockfs *ulp; 1522 offset_t off; 1523 extern int maxphys; 1524 int error; 1525 int issync; 1526 int trans_size; 1527 1528 1529 /* 1530 * forcibly unmounted 1531 */ 1532 if (ufsvfsp == NULL) { 1533 return (EIO); 1534 } 1535 1536 fs = ufsvfsp->vfs_fs; 1537 1538 if (cmd == Q_QUOTACTL) { 1539 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK); 1540 if (error) 1541 return (error); 1542 1543 if (ulp) { 1544 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, 1545 TOP_SETQUOTA_SIZE(fs)); 1546 } 1547 1548 error = quotactl(vp, arg, flag, cr); 1549 1550 if (ulp) { 1551 TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, 1552 TOP_SETQUOTA_SIZE(fs)); 1553 ufs_lockfs_end(ulp); 1554 } 1555 return (error); 1556 } 1557 1558 switch (cmd) { 1559 case _FIOLFS: 1560 /* 1561 * file system locking 1562 */ 1563 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1564 return (EPERM); 1565 1566 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1567 if (copyin((caddr_t)arg, &lockfs, 1568 sizeof (struct lockfs))) 1569 return (EFAULT); 1570 } 1571 #ifdef _SYSCALL32_IMPL 1572 else { 1573 struct lockfs32 lockfs32; 1574 /* Translate ILP32 lockfs to LP64 lockfs */ 1575 if (copyin((caddr_t)arg, &lockfs32, 1576 sizeof (struct lockfs32))) 1577 return (EFAULT); 1578 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1579 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1580 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1581 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1582 lockfs.lf_comment = 1583 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1584 } 1585 #endif /* _SYSCALL32_IMPL */ 1586 1587 if (lockfs.lf_comlen) { 1588 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) 1589 return (ENAMETOOLONG); 1590 comment = kmem_alloc(lockfs.lf_comlen, 1591 KM_SLEEP); 1592 if (copyin(lockfs.lf_comment, comment, 1593 lockfs.lf_comlen)) { 1594 kmem_free(comment, lockfs.lf_comlen); 1595 return (EFAULT); 1596 } 1597 original_comment = lockfs.lf_comment; 1598 lockfs.lf_comment = comment; 1599 } 1600 if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) { 1601 lockfs.lf_comment = original_comment; 1602 1603 if ((flag & DATAMODEL_MASK) == 1604 DATAMODEL_NATIVE) { 1605 (void) copyout(&lockfs, (caddr_t)arg, 1606 sizeof (struct lockfs)); 1607 } 1608 #ifdef _SYSCALL32_IMPL 1609 else { 1610 struct lockfs32 lockfs32; 1611 /* Translate LP64 to ILP32 lockfs */ 1612 lockfs32.lf_lock = 1613 (uint32_t)lockfs.lf_lock; 1614 lockfs32.lf_flags = 1615 (uint32_t)lockfs.lf_flags; 1616 lockfs32.lf_key = 1617 (uint32_t)lockfs.lf_key; 1618 lockfs32.lf_comlen = 1619 (uint32_t)lockfs.lf_comlen; 1620 lockfs32.lf_comment = 1621 (uint32_t)(uintptr_t)lockfs.lf_comment; 1622 (void) copyout(&lockfs32, (caddr_t)arg, 1623 sizeof (struct lockfs32)); 1624 } 1625 #endif /* _SYSCALL32_IMPL */ 1626 1627 } else { 1628 if (lockfs.lf_comlen) 1629 kmem_free(comment, lockfs.lf_comlen); 1630 } 1631 return (error); 1632 1633 case _FIOLFSS: 1634 /* 1635 * get file system locking status 1636 */ 1637 1638 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1639 if (copyin((caddr_t)arg, &lockfs, 1640 sizeof (struct 
lockfs))) 1641 return (EFAULT); 1642 } 1643 #ifdef _SYSCALL32_IMPL 1644 else { 1645 struct lockfs32 lockfs32; 1646 /* Translate ILP32 lockfs to LP64 lockfs */ 1647 if (copyin((caddr_t)arg, &lockfs32, 1648 sizeof (struct lockfs32))) 1649 return (EFAULT); 1650 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock; 1651 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags; 1652 lockfs.lf_key = (ulong_t)lockfs32.lf_key; 1653 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen; 1654 lockfs.lf_comment = 1655 (caddr_t)(uintptr_t)lockfs32.lf_comment; 1656 } 1657 #endif /* _SYSCALL32_IMPL */ 1658 1659 if (error = ufs_fiolfss(vp, &lockfs_out)) 1660 return (error); 1661 lockfs.lf_lock = lockfs_out.lf_lock; 1662 lockfs.lf_key = lockfs_out.lf_key; 1663 lockfs.lf_flags = lockfs_out.lf_flags; 1664 lockfs.lf_comlen = MIN(lockfs.lf_comlen, 1665 lockfs_out.lf_comlen); 1666 1667 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 1668 if (copyout(&lockfs, (caddr_t)arg, 1669 sizeof (struct lockfs))) 1670 return (EFAULT); 1671 } 1672 #ifdef _SYSCALL32_IMPL 1673 else { 1674 /* Translate LP64 to ILP32 lockfs */ 1675 struct lockfs32 lockfs32; 1676 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock; 1677 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags; 1678 lockfs32.lf_key = (uint32_t)lockfs.lf_key; 1679 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen; 1680 lockfs32.lf_comment = 1681 (uint32_t)(uintptr_t)lockfs.lf_comment; 1682 if (copyout(&lockfs32, (caddr_t)arg, 1683 sizeof (struct lockfs32))) 1684 return (EFAULT); 1685 } 1686 #endif /* _SYSCALL32_IMPL */ 1687 1688 if (lockfs.lf_comlen && 1689 lockfs.lf_comment && lockfs_out.lf_comment) 1690 if (copyout(lockfs_out.lf_comment, 1691 lockfs.lf_comment, 1692 lockfs.lf_comlen)) 1693 return (EFAULT); 1694 return (0); 1695 1696 case _FIOSATIME: 1697 /* 1698 * set access time 1699 */ 1700 1701 /* 1702 * if mounted w/o atime, return quietly. 1703 * I briefly thought about returning ENOSYS, but 1704 * figured that most apps would consider this fatal 1705 * but the idea is to make this as seamless as poss. 
1706 */ 1707 if (ufsvfsp->vfs_noatime) 1708 return (0); 1709 1710 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1711 ULOCKFS_SETATTR_MASK); 1712 if (error) 1713 return (error); 1714 1715 if (ulp) { 1716 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp)); 1717 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 1718 TOP_SETATTR, trans_size); 1719 } 1720 1721 error = ufs_fiosatime(vp, (struct timeval *)arg, 1722 flag, cr); 1723 1724 if (ulp) { 1725 TRANS_END_CSYNC(ufsvfsp, error, issync, 1726 TOP_SETATTR, trans_size); 1727 ufs_lockfs_end(ulp); 1728 } 1729 return (error); 1730 1731 case _FIOSDIO: 1732 /* 1733 * set delayed-io 1734 */ 1735 return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr)); 1736 1737 case _FIOGDIO: 1738 /* 1739 * get delayed-io 1740 */ 1741 return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr)); 1742 1743 case _FIOIO: 1744 /* 1745 * inode open 1746 */ 1747 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1748 ULOCKFS_VGET_MASK); 1749 if (error) 1750 return (error); 1751 1752 error = ufs_fioio(vp, (struct fioio *)arg, flag, cr); 1753 1754 if (ulp) { 1755 ufs_lockfs_end(ulp); 1756 } 1757 return (error); 1758 1759 case _FIOFFS: 1760 /* 1761 * file system flush (push w/invalidate) 1762 */ 1763 if ((caddr_t)arg != NULL) 1764 return (EINVAL); 1765 return (ufs_fioffs(vp, NULL, cr)); 1766 1767 case _FIOISBUSY: 1768 /* 1769 * Contract-private interface for Legato 1770 * Purge this vnode from the DNLC and decide 1771 * if this vnode is busy (*arg == 1) or not 1772 * (*arg == 0) 1773 */ 1774 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1775 return (EPERM); 1776 error = ufs_fioisbusy(vp, (int *)arg, cr); 1777 return (error); 1778 1779 case _FIODIRECTIO: 1780 return (ufs_fiodirectio(vp, (int)arg, cr)); 1781 1782 case _FIOTUNE: 1783 /* 1784 * Tune the file system (aka setting fs attributes) 1785 */ 1786 error = ufs_lockfs_begin(ufsvfsp, &ulp, 1787 ULOCKFS_SETATTR_MASK); 1788 if (error) 1789 return (error); 1790 1791 error = ufs_fiotune(vp, (struct fiotune *)arg, cr); 1792 1793 if (ulp) 1794 ufs_lockfs_end(ulp); 1795 return (error); 1796 1797 case _FIOLOGENABLE: 1798 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1799 return (EPERM); 1800 return (ufs_fiologenable(vp, (void *)arg, cr, flag)); 1801 1802 case _FIOLOGDISABLE: 1803 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1804 return (EPERM); 1805 return (ufs_fiologdisable(vp, (void *)arg, cr, flag)); 1806 1807 case _FIOISLOG: 1808 return (ufs_fioislog(vp, (void *)arg, cr, flag)); 1809 1810 case _FIOSNAPSHOTCREATE_MULTI: 1811 { 1812 struct fiosnapcreate_multi fc, *fcp; 1813 size_t fcm_size; 1814 1815 if (copyin((void *)arg, &fc, sizeof (fc))) 1816 return (EFAULT); 1817 if (fc.backfilecount > MAX_BACKFILE_COUNT) 1818 return (EINVAL); 1819 fcm_size = sizeof (struct fiosnapcreate_multi) + 1820 (fc.backfilecount - 1) * sizeof (int); 1821 fcp = (struct fiosnapcreate_multi *) 1822 kmem_alloc(fcm_size, KM_SLEEP); 1823 if (copyin((void *)arg, fcp, fcm_size)) { 1824 kmem_free(fcp, fcm_size); 1825 return (EFAULT); 1826 } 1827 error = ufs_snap_create(vp, fcp, cr); 1828 if (!error && copyout(fcp, (void *)arg, fcm_size)) 1829 error = EFAULT; 1830 kmem_free(fcp, fcm_size); 1831 return (error); 1832 } 1833 1834 case _FIOSNAPSHOTDELETE: 1835 { 1836 struct fiosnapdelete fc; 1837 1838 if (copyin((void *)arg, &fc, sizeof (fc))) 1839 return (EFAULT); 1840 error = ufs_snap_delete(vp, &fc, cr); 1841 if (!error && copyout(&fc, (void *)arg, sizeof (fc))) 1842 error = EFAULT; 1843 return (error); 1844 } 1845 1846 case _FIOGETSUPERBLOCK: 1847 if (copyout(fs, (void *)arg, SBSIZE)) 1848 return 
(EFAULT); 1849 return (0); 1850 1851 case _FIOGETMAXPHYS: 1852 if (copyout(&maxphys, (void *)arg, sizeof (maxphys))) 1853 return (EFAULT); 1854 return (0); 1855 1856 /* 1857 * The following 3 ioctls are for TSufs support 1858 * although could potentially be used elsewhere 1859 */ 1860 case _FIO_SET_LUFS_DEBUG: 1861 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1862 return (EPERM); 1863 lufs_debug = (uint32_t)arg; 1864 return (0); 1865 1866 case _FIO_SET_LUFS_ERROR: 1867 if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0) 1868 return (EPERM); 1869 TRANS_SETERROR(ufsvfsp); 1870 return (0); 1871 1872 case _FIO_GET_TOP_STATS: 1873 { 1874 fio_lufs_stats_t *ls; 1875 ml_unit_t *ul = ufsvfsp->vfs_log; 1876 1877 ls = kmem_zalloc(sizeof (*ls), KM_SLEEP); 1878 ls->ls_debug = ul->un_debug; /* return debug value */ 1879 /* Copy stucture if statistics are being kept */ 1880 if (ul->un_logmap->mtm_tops) { 1881 ls->ls_topstats = *(ul->un_logmap->mtm_tops); 1882 } 1883 error = 0; 1884 if (copyout(ls, (void *)arg, sizeof (*ls))) 1885 error = EFAULT; 1886 kmem_free(ls, sizeof (*ls)); 1887 return (error); 1888 } 1889 1890 case _FIO_SEEK_DATA: 1891 case _FIO_SEEK_HOLE: 1892 if (ddi_copyin((void *)arg, &off, sizeof (off), flag)) 1893 return (EFAULT); 1894 /* offset paramater is in/out */ 1895 error = ufs_fio_holey(vp, cmd, &off); 1896 if (error) 1897 return (error); 1898 if (ddi_copyout(&off, (void *)arg, sizeof (off), flag)) 1899 return (EFAULT); 1900 return (0); 1901 1902 default: 1903 return (ENOTTY); 1904 } 1905 } 1906 1907 /* ARGSUSED */ 1908 static int 1909 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags, 1910 struct cred *cr) 1911 { 1912 struct inode *ip = VTOI(vp); 1913 struct ufsvfs *ufsvfsp; 1914 int err; 1915 1916 TRACE_2(TR_FAC_UFS, TR_UFS_GETATTR_START, 1917 "ufs_getattr_start:vp %p flags %x", vp, flags); 1918 1919 if (vap->va_mask == AT_SIZE) { 1920 /* 1921 * for performance, if only the size is requested don't bother 1922 * with anything else. 1923 */ 1924 UFS_GET_ISIZE(&vap->va_size, ip); 1925 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, 1926 "ufs_getattr_end:vp %p", vp); 1927 return (0); 1928 } 1929 1930 /* 1931 * inlined lockfs checks 1932 */ 1933 ufsvfsp = ip->i_ufsvfs; 1934 if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) { 1935 err = EIO; 1936 goto out; 1937 } 1938 1939 rw_enter(&ip->i_contents, RW_READER); 1940 /* 1941 * Return all the attributes. This should be refined so 1942 * that it only returns what's asked for. 1943 */ 1944 1945 /* 1946 * Copy from inode table. 1947 */ 1948 vap->va_type = vp->v_type; 1949 vap->va_mode = ip->i_mode & MODEMASK; 1950 /* 1951 * If there is an ACL and there is a mask entry, then do the 1952 * extra work that completes the equivalent of an acltomode(3) 1953 * call. According to POSIX P1003.1e, the acl mask should be 1954 * returned in the group permissions field. 1955 * 1956 * - start with the original permission and mode bits (from above) 1957 * - clear the group owner bits 1958 * - add in the mask bits. 
1959 */ 1960 if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) { 1961 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3); 1962 vap->va_mode |= 1963 (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3; 1964 } 1965 vap->va_uid = ip->i_uid; 1966 vap->va_gid = ip->i_gid; 1967 vap->va_fsid = ip->i_dev; 1968 vap->va_nodeid = (ino64_t)ip->i_number; 1969 vap->va_nlink = ip->i_nlink; 1970 vap->va_size = ip->i_size; 1971 if (vp->v_type == VCHR || vp->v_type == VBLK) 1972 vap->va_rdev = ip->i_rdev; 1973 else 1974 vap->va_rdev = 0; /* not a b/c spec. */ 1975 mutex_enter(&ip->i_tlock); 1976 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 1977 vap->va_seq = ip->i_seq; 1978 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 1979 vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000; 1980 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 1981 vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000; 1982 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 1983 vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000; 1984 mutex_exit(&ip->i_tlock); 1985 1986 switch (ip->i_mode & IFMT) { 1987 1988 case IFBLK: 1989 vap->va_blksize = MAXBSIZE; /* was BLKDEV_IOSIZE */ 1990 break; 1991 1992 case IFCHR: 1993 vap->va_blksize = MAXBSIZE; 1994 break; 1995 1996 default: 1997 vap->va_blksize = ip->i_fs->fs_bsize; 1998 break; 1999 } 2000 vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks; 2001 rw_exit(&ip->i_contents); 2002 err = 0; 2003 2004 out: 2005 TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, "ufs_getattr_end:vp %p", vp); 2006 2007 return (err); 2008 } 2009 2010 /*ARGSUSED4*/ 2011 static int 2012 ufs_setattr( 2013 struct vnode *vp, 2014 struct vattr *vap, 2015 int flags, 2016 struct cred *cr, 2017 caller_context_t *ct) 2018 { 2019 struct inode *ip = VTOI(vp); 2020 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2021 struct fs *fs; 2022 struct ulockfs *ulp; 2023 char *errmsg1; 2024 char *errmsg2; 2025 long blocks; 2026 long int mask = vap->va_mask; 2027 size_t len1, len2; 2028 int issync; 2029 int trans_size; 2030 int dotrans; 2031 int dorwlock; 2032 int error; 2033 int owner_change; 2034 int dodqlock; 2035 timestruc_t now; 2036 vattr_t oldva; 2037 int retry = 1; 2038 2039 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_START, 2040 "ufs_setattr_start:vp %p flags %x", vp, flags); 2041 2042 /* 2043 * Cannot set these attributes. 2044 */ 2045 if (mask & AT_NOSET) { 2046 error = EINVAL; 2047 goto out; 2048 } 2049 2050 /* 2051 * check for forced unmount 2052 */ 2053 if (ufsvfsp == NULL) 2054 return (EIO); 2055 2056 fs = ufsvfsp->vfs_fs; 2057 if (fs->fs_ronly != 0) 2058 return (EROFS); 2059 2060 again: 2061 errmsg1 = NULL; 2062 errmsg2 = NULL; 2063 dotrans = 0; 2064 dorwlock = 0; 2065 dodqlock = 0; 2066 2067 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK); 2068 if (error) 2069 goto out; 2070 2071 /* 2072 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 2073 * This follows the protocol for read()/write(). 2074 */ 2075 if (vp->v_type != VDIR) { 2076 rw_enter(&ip->i_rwlock, RW_WRITER); 2077 dorwlock = 1; 2078 } 2079 2080 /* 2081 * Truncate file. Must have write permission and not be a directory. 
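* The size change is applied first, via TRANS_ITRUNC() and before the
* TOP_SETATTR transaction is begun below (TRANS_ITRUNC() handles its
* own transaction bookkeeping when logging); AT_SIZE is cleared from
* the mask further down because by then the new size is already in
* place.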
2082 */ 2083 if (mask & AT_SIZE) { 2084 rw_enter(&ip->i_contents, RW_WRITER); 2085 if (vp->v_type == VDIR) { 2086 error = EISDIR; 2087 goto update_inode; 2088 } 2089 if (error = ufs_iaccess(ip, IWRITE, cr)) 2090 goto update_inode; 2091 2092 rw_exit(&ip->i_contents); 2093 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr); 2094 if (error) { 2095 rw_enter(&ip->i_contents, RW_WRITER); 2096 goto update_inode; 2097 } 2098 } 2099 2100 if (ulp) { 2101 trans_size = (int)TOP_SETATTR_SIZE(ip); 2102 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size); 2103 ++dotrans; 2104 } 2105 2106 /* 2107 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 2108 * This follows the protocol established by 2109 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 2110 */ 2111 if (vp->v_type == VDIR) { 2112 rw_enter(&ip->i_rwlock, RW_WRITER); 2113 dorwlock = 1; 2114 } 2115 2116 /* 2117 * Grab quota lock if we are changing the file's owner. 2118 */ 2119 if (mask & AT_UID) { 2120 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2121 dodqlock = 1; 2122 } 2123 rw_enter(&ip->i_contents, RW_WRITER); 2124 2125 oldva.va_mode = ip->i_mode; 2126 oldva.va_uid = ip->i_uid; 2127 oldva.va_gid = ip->i_gid; 2128 2129 vap->va_mask &= ~AT_SIZE; 2130 /* 2131 * ufs_iaccess is "close enough"; that's because it doesn't 2132 * map the defines. 2133 */ 2134 error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2135 ufs_iaccess, ip); 2136 if (error) 2137 goto update_inode; 2138 2139 mask = vap->va_mask; 2140 2141 /* 2142 * Change file access modes. 2143 */ 2144 if (mask & AT_MODE) { 2145 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT); 2146 TRANS_INODE(ufsvfsp, ip); 2147 ip->i_flag |= ICHG; 2148 if (stickyhack) { 2149 mutex_enter(&vp->v_lock); 2150 if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 2151 vp->v_flag |= VSWAPLIKE; 2152 else 2153 vp->v_flag &= ~VSWAPLIKE; 2154 mutex_exit(&vp->v_lock); 2155 } 2156 } 2157 if (mask & (AT_UID|AT_GID)) { 2158 if (mask & AT_UID) { 2159 /* 2160 * Don't change ownership of the quota inode. 2161 */ 2162 if (ufsvfsp->vfs_qinod == ip) { 2163 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED); 2164 error = EINVAL; 2165 goto update_inode; 2166 } 2167 2168 /* 2169 * No real ownership change. 2170 */ 2171 if (ip->i_uid == vap->va_uid) { 2172 blocks = 0; 2173 owner_change = 0; 2174 } 2175 /* 2176 * Remove the blocks and the file, from the old user's 2177 * quota. 2178 */ 2179 else { 2180 blocks = ip->i_blocks; 2181 owner_change = 1; 2182 2183 (void) chkdq(ip, -blocks, /* force */ 1, cr, 2184 (char **)NULL, (size_t *)NULL); 2185 (void) chkiq(ufsvfsp, /* change */ -1, ip, 2186 (uid_t)ip->i_uid, 2187 /* force */ 1, cr, 2188 (char **)NULL, (size_t *)NULL); 2189 dqrele(ip->i_dquot); 2190 } 2191 2192 ip->i_uid = vap->va_uid; 2193 2194 /* 2195 * There is a real ownership change. 2196 */ 2197 if (owner_change) { 2198 /* 2199 * Add the blocks and the file to the new 2200 * user's quota. 2201 */ 2202 ip->i_dquot = getinoquota(ip); 2203 (void) chkdq(ip, blocks, /* force */ 1, cr, 2204 &errmsg1, &len1); 2205 (void) chkiq(ufsvfsp, /* change */ 1, 2206 (struct inode *)NULL, 2207 (uid_t)ip->i_uid, 2208 /* force */ 1, cr, 2209 &errmsg2, &len2); 2210 } 2211 } 2212 if (mask & AT_GID) { 2213 ip->i_gid = vap->va_gid; 2214 } 2215 TRANS_INODE(ufsvfsp, ip); 2216 ip->i_flag |= ICHG; 2217 } 2218 /* 2219 * Change file access or modified times. 
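* The on-disk inode stores 32-bit seconds and microseconds, so times
* that do not fit are rejected with EOVERFLOW and the caller's
* nanosecond values are truncated to whole microseconds. On a noatime
* mount a request that changes only the access time is silently
* ignored, and an explicit mtime change also stamps ctime with the
* current time (pinned at TIME32_MAX from 2038 on).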
2220 */ 2221 if (mask & (AT_ATIME|AT_MTIME)) { 2222 /* Check that the time value is within ufs range */ 2223 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2224 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2225 error = EOVERFLOW; 2226 goto update_inode; 2227 } 2228 2229 /* 2230 * if the "noaccess" mount option is set and only atime 2231 * update is requested, do nothing. No error is returned. 2232 */ 2233 if ((ufsvfsp->vfs_noatime) && 2234 ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME)) 2235 goto skip_atime; 2236 2237 if (mask & AT_ATIME) { 2238 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2239 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2240 ip->i_flag &= ~IACC; 2241 } 2242 if (mask & AT_MTIME) { 2243 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2244 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2245 gethrestime(&now); 2246 if (now.tv_sec > TIME32_MAX) { 2247 /* 2248 * In 2038, ctime sticks forever.. 2249 */ 2250 ip->i_ctime.tv_sec = TIME32_MAX; 2251 ip->i_ctime.tv_usec = 0; 2252 } else { 2253 ip->i_ctime.tv_sec = now.tv_sec; 2254 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2255 } 2256 ip->i_flag &= ~(IUPD|ICHG); 2257 ip->i_flag |= IMODTIME; 2258 } 2259 TRANS_INODE(ufsvfsp, ip); 2260 ip->i_flag |= IMOD; 2261 } 2262 2263 skip_atime: 2264 /* 2265 * The presence of a shadow inode may indicate an ACL, but does 2266 * not imply an ACL. Future FSD types should be handled here too 2267 * and check for the presence of the attribute-specific data 2268 * before referencing it. 2269 */ 2270 if (ip->i_shadow) { 2271 /* 2272 * XXX if ufs_iupdat is changed to sandbagged write fix 2273 * ufs_acl_setattr to push ip to keep acls consistent 2274 * 2275 * Suppress out of inodes messages if we will retry. 2276 */ 2277 if (retry) 2278 ip->i_flag |= IQUIET; 2279 error = ufs_acl_setattr(ip, vap, cr); 2280 ip->i_flag &= ~IQUIET; 2281 } 2282 2283 update_inode: 2284 /* 2285 * Setattr always increases the sequence number 2286 */ 2287 ip->i_seq++; 2288 2289 /* 2290 * if nfsd and not logging; push synchronously 2291 */ 2292 if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) { 2293 ufs_iupdat(ip, 1); 2294 } else { 2295 ITIMES_NOLOCK(ip); 2296 } 2297 2298 rw_exit(&ip->i_contents); 2299 if (dodqlock) { 2300 rw_exit(&ufsvfsp->vfs_dqrwlock); 2301 } 2302 if (dorwlock) 2303 rw_exit(&ip->i_rwlock); 2304 2305 if (ulp) { 2306 if (dotrans) { 2307 int terr = 0; 2308 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR, 2309 trans_size); 2310 if (error == 0) 2311 error = terr; 2312 } 2313 ufs_lockfs_end(ulp); 2314 } 2315 out: 2316 /* 2317 * If out of inodes or blocks, see if we can free something 2318 * up from the delete queue. 
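* The retry is attempted once: ufs_delete_drain_wait() gives the
* delete queue a chance to return inodes and blocks, any quota
* warning messages from the failed pass are freed, and control goes
* back to "again" with retry cleared. The same pattern appears in
* ufs_create(), ufs_mkdir() and ufs_symlink() below.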
2319 */ 2320 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 2321 ufs_delete_drain_wait(ufsvfsp, 1); 2322 retry = 0; 2323 if (errmsg1 != NULL) 2324 kmem_free(errmsg1, len1); 2325 if (errmsg2 != NULL) 2326 kmem_free(errmsg2, len2); 2327 goto again; 2328 } 2329 TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_END, 2330 "ufs_setattr_end:vp %p error %d", vp, error); 2331 if (errmsg1 != NULL) { 2332 uprintf(errmsg1); 2333 kmem_free(errmsg1, len1); 2334 } 2335 if (errmsg2 != NULL) { 2336 uprintf(errmsg2); 2337 kmem_free(errmsg2, len2); 2338 } 2339 return (error); 2340 } 2341 2342 /*ARGSUSED*/ 2343 static int 2344 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr) 2345 { 2346 struct inode *ip = VTOI(vp); 2347 int error; 2348 2349 TRACE_3(TR_FAC_UFS, TR_UFS_ACCESS_START, 2350 "ufs_access_start:vp %p mode %x flags %x", vp, mode, flags); 2351 2352 if (ip->i_ufsvfs == NULL) 2353 return (EIO); 2354 2355 rw_enter(&ip->i_contents, RW_READER); 2356 2357 /* 2358 * The ufs_iaccess function wants to be called with 2359 * mode bits expressed as "ufs specific" bits. 2360 * I.e., VWRITE|VREAD|VEXEC do not make sense to 2361 * ufs_iaccess() but IWRITE|IREAD|IEXEC do. 2362 * But since they're the same we just pass the vnode mode 2363 * bit but just verify that assumption at compile time. 2364 */ 2365 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC 2366 #error "ufs_access needs to map Vmodes to Imodes" 2367 #endif 2368 error = ufs_iaccess(ip, mode, cr); 2369 2370 rw_exit(&ip->i_contents); 2371 2372 TRACE_2(TR_FAC_UFS, TR_UFS_ACCESS_END, 2373 "ufs_access_end:vp %p error %d", vp, error); 2374 return (error); 2375 } 2376 2377 /* ARGSUSED */ 2378 static int 2379 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr) 2380 { 2381 struct inode *ip = VTOI(vp); 2382 struct ufsvfs *ufsvfsp; 2383 struct ulockfs *ulp; 2384 int error; 2385 int fastsymlink; 2386 2387 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_START, 2388 "ufs_readlink_start:vp %p uiop %p", uiop, vp); 2389 2390 if (vp->v_type != VLNK) { 2391 error = EINVAL; 2392 goto nolockout; 2393 } 2394 2395 /* 2396 * If the symbolic link is empty there is nothing to read. 
2397 * Fast-track these empty symbolic links 2398 */ 2399 if (ip->i_size == 0) { 2400 error = 0; 2401 goto nolockout; 2402 } 2403 2404 ufsvfsp = ip->i_ufsvfs; 2405 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK); 2406 if (error) 2407 goto nolockout; 2408 /* 2409 * The ip->i_rwlock protects the data blocks used for FASTSYMLINK 2410 */ 2411 again: 2412 fastsymlink = 0; 2413 if (ip->i_flag & IFASTSYMLNK) { 2414 rw_enter(&ip->i_rwlock, RW_READER); 2415 rw_enter(&ip->i_contents, RW_READER); 2416 if (ip->i_flag & IFASTSYMLNK) { 2417 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 2418 (ip->i_fs->fs_ronly == 0) && 2419 (!ufsvfsp->vfs_noatime)) { 2420 mutex_enter(&ip->i_tlock); 2421 ip->i_flag |= IACC; 2422 mutex_exit(&ip->i_tlock); 2423 } 2424 error = uiomove((caddr_t)&ip->i_db[1], 2425 MIN(ip->i_size, uiop->uio_resid), 2426 UIO_READ, uiop); 2427 ITIMES(ip); 2428 ++fastsymlink; 2429 } 2430 rw_exit(&ip->i_contents); 2431 rw_exit(&ip->i_rwlock); 2432 } 2433 if (!fastsymlink) { 2434 ssize_t size; /* number of bytes read */ 2435 caddr_t basep; /* pointer to input data */ 2436 ino_t ino; 2437 long igen; 2438 struct uio tuio; /* temp uio struct */ 2439 struct uio *tuiop; 2440 iovec_t tiov; /* temp iovec struct */ 2441 char kbuf[FSL_SIZE]; /* buffer to hold fast symlink */ 2442 int tflag = 0; /* flag to indicate temp vars used */ 2443 2444 ino = ip->i_number; 2445 igen = ip->i_gen; 2446 size = uiop->uio_resid; 2447 basep = uiop->uio_iov->iov_base; 2448 tuiop = uiop; 2449 2450 rw_enter(&ip->i_rwlock, RW_WRITER); 2451 rw_enter(&ip->i_contents, RW_WRITER); 2452 if (ip->i_flag & IFASTSYMLNK) { 2453 rw_exit(&ip->i_contents); 2454 rw_exit(&ip->i_rwlock); 2455 goto again; 2456 } 2457 2458 /* can this be a fast symlink and is it a user buffer? */ 2459 if (ip->i_size <= FSL_SIZE && 2460 (uiop->uio_segflg == UIO_USERSPACE || 2461 uiop->uio_segflg == UIO_USERISPACE)) { 2462 2463 bzero(&tuio, sizeof (struct uio)); 2464 /* 2465 * setup a kernel buffer to read link into. this 2466 * is to fix a race condition where the user buffer 2467 * got corrupted before copying it into the inode. 
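* That is, the target is first read into the on-stack kbuf through a
* UIO_SYSSPACE uio, so another thread sharing the address space cannot
* rewrite the user buffer between rdip() and the kcopy() into i_db[];
* only after the in-inode copy is in place is the data uiomove()d out
* to the caller's buffer.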
2468 */ 2469 size = ip->i_size; 2470 tiov.iov_len = size; 2471 tiov.iov_base = kbuf; 2472 tuio.uio_iov = &tiov; 2473 tuio.uio_iovcnt = 1; 2474 tuio.uio_offset = uiop->uio_offset; 2475 tuio.uio_segflg = UIO_SYSSPACE; 2476 tuio.uio_fmode = uiop->uio_fmode; 2477 tuio.uio_extflg = uiop->uio_extflg; 2478 tuio.uio_limit = uiop->uio_limit; 2479 tuio.uio_resid = size; 2480 2481 basep = tuio.uio_iov->iov_base; 2482 tuiop = &tuio; 2483 tflag = 1; 2484 } 2485 2486 error = rdip(ip, tuiop, 0, cr); 2487 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) { 2488 rw_exit(&ip->i_contents); 2489 rw_exit(&ip->i_rwlock); 2490 goto out; 2491 } 2492 2493 if (tflag == 0) 2494 size -= uiop->uio_resid; 2495 2496 if ((tflag == 0 && ip->i_size <= FSL_SIZE && 2497 ip->i_size == size) || (tflag == 1 && 2498 tuio.uio_resid == 0)) { 2499 error = kcopy(basep, &ip->i_db[1], ip->i_size); 2500 if (error == 0) { 2501 ip->i_flag |= IFASTSYMLNK; 2502 /* 2503 * free page 2504 */ 2505 (void) VOP_PUTPAGE(ITOV(ip), 2506 (offset_t)0, PAGESIZE, 2507 (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC), 2508 cr); 2509 } else { 2510 int i; 2511 /* error, clear garbage left behind */ 2512 for (i = 1; i < NDADDR; i++) 2513 ip->i_db[i] = 0; 2514 for (i = 0; i < NIADDR; i++) 2515 ip->i_ib[i] = 0; 2516 } 2517 } 2518 if (tflag == 1) { 2519 /* now, copy it into the user buffer */ 2520 error = uiomove((caddr_t)kbuf, 2521 MIN(size, uiop->uio_resid), 2522 UIO_READ, uiop); 2523 } 2524 rw_exit(&ip->i_contents); 2525 rw_exit(&ip->i_rwlock); 2526 } 2527 out: 2528 if (ulp) { 2529 ufs_lockfs_end(ulp); 2530 } 2531 nolockout: 2532 TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_END, 2533 "ufs_readlink_end:vp %p error %d", vp, error); 2534 2535 return (error); 2536 } 2537 2538 /* ARGSUSED */ 2539 static int 2540 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr) 2541 { 2542 struct inode *ip = VTOI(vp); 2543 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 2544 struct ulockfs *ulp; 2545 int error; 2546 2547 TRACE_1(TR_FAC_UFS, TR_UFS_FSYNC_START, 2548 "ufs_fsync_start:vp %p", vp); 2549 2550 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK); 2551 if (error) 2552 return (error); 2553 2554 if (TRANS_ISTRANS(ufsvfsp)) { 2555 /* 2556 * First push out any data pages 2557 */ 2558 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2559 (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) { 2560 error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 2561 0, CRED()); 2562 if (error) 2563 goto out; 2564 } 2565 2566 /* 2567 * Delta any delayed inode times updates 2568 * and push inode to log. 2569 * All other inode deltas will have already been delta'd 2570 * and will be pushed during the commit. 2571 */ 2572 if (!(syncflag & FDSYNC) && 2573 ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) { 2574 if (ulp) { 2575 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC, 2576 TOP_SYNCIP_SIZE); 2577 } 2578 rw_enter(&ip->i_contents, RW_READER); 2579 mutex_enter(&ip->i_tlock); 2580 ip->i_flag &= ~IMODTIME; 2581 mutex_exit(&ip->i_tlock); 2582 ufs_iupdat(ip, I_SYNC); 2583 rw_exit(&ip->i_contents); 2584 if (ulp) { 2585 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC, 2586 TOP_SYNCIP_SIZE); 2587 } 2588 } 2589 2590 /* 2591 * Commit the Moby transaction 2592 * 2593 * Deltas have already been made so we just need to 2594 * commit them with a synchronous transaction. 2595 * TRANS_BEGIN_SYNC() will return an error 2596 * if there are no deltas to commit, for an 2597 * empty transaction. 
2598 */ 2599 if (ulp) { 2600 TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE, 2601 error); 2602 if (error) { 2603 error = 0; /* commit wasn't needed */ 2604 goto out; 2605 } 2606 TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC, 2607 TOP_COMMIT_SIZE); 2608 } 2609 } else { /* not logging */ 2610 if (!(IS_SWAPVP(vp))) 2611 if (syncflag & FNODSYNC) { 2612 /* Just update the inode only */ 2613 TRANS_IUPDAT(ip, 1); 2614 error = 0; 2615 } else if (syncflag & FDSYNC) 2616 /* Do data-synchronous writes */ 2617 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC); 2618 else 2619 /* Do synchronous writes */ 2620 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC); 2621 2622 rw_enter(&ip->i_contents, RW_WRITER); 2623 if (!error) 2624 error = ufs_sync_indir(ip); 2625 rw_exit(&ip->i_contents); 2626 } 2627 out: 2628 if (ulp) { 2629 ufs_lockfs_end(ulp); 2630 } 2631 TRACE_2(TR_FAC_UFS, TR_UFS_FSYNC_END, 2632 "ufs_fsync_end:vp %p error %d", vp, error); 2633 return (error); 2634 } 2635 2636 /*ARGSUSED*/ 2637 static void 2638 ufs_inactive(struct vnode *vp, struct cred *cr) 2639 { 2640 ufs_iinactive(VTOI(vp)); 2641 } 2642 2643 /* 2644 * Unix file system operations having to do with directory manipulation. 2645 */ 2646 int ufs_lookup_idle_count = 2; /* Number of inodes to idle each time */ 2647 /* ARGSUSED */ 2648 static int 2649 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 2650 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr) 2651 { 2652 struct inode *ip; 2653 struct inode *sip; 2654 struct inode *xip; 2655 struct ufsvfs *ufsvfsp; 2656 struct ulockfs *ulp; 2657 struct vnode *vp; 2658 int error; 2659 2660 TRACE_2(TR_FAC_UFS, TR_UFS_LOOKUP_START, 2661 "ufs_lookup_start:dvp %p name %s", dvp, nm); 2662 2663 2664 /* 2665 * Check flags for type of lookup (regular file or attribute file) 2666 */ 2667 2668 ip = VTOI(dvp); 2669 2670 if (flags & LOOKUP_XATTR) { 2671 2672 /* 2673 * We don't allow recursive attributes... 2674 * Maybe someday we will. 2675 */ 2676 if ((ip->i_cflags & IXATTR)) { 2677 return (EINVAL); 2678 } 2679 2680 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) { 2681 error = ufs_xattr_getattrdir(dvp, &sip, flags, cr); 2682 if (error) { 2683 *vpp = NULL; 2684 goto out; 2685 } 2686 2687 vp = ITOV(sip); 2688 dnlc_update(dvp, XATTR_DIR_NAME, vp); 2689 } 2690 2691 /* 2692 * Check accessibility of directory. 2693 */ 2694 if (vp == DNLC_NO_VNODE) { 2695 VN_RELE(vp); 2696 error = ENOENT; 2697 goto out; 2698 } 2699 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr)) != 0) { 2700 VN_RELE(vp); 2701 goto out; 2702 } 2703 2704 *vpp = vp; 2705 return (0); 2706 } 2707 2708 /* 2709 * Check for a null component, which we should treat as 2710 * looking at dvp from within it's parent, so we don't 2711 * need a call to ufs_iaccess(), as it has already been 2712 * done. 2713 */ 2714 if (nm[0] == 0) { 2715 VN_HOLD(dvp); 2716 error = 0; 2717 *vpp = dvp; 2718 goto out; 2719 } 2720 2721 /* 2722 * Check for "." ie itself. this is a quick check and 2723 * avoids adding "." into the dnlc (which have been seen 2724 * to occupy >10% of the cache). 2725 */ 2726 if ((nm[0] == '.') && (nm[1] == 0)) { 2727 /* 2728 * Don't return without checking accessibility 2729 * of the directory. We only need the lock if 2730 * we are going to return it. 2731 */ 2732 if ((error = ufs_iaccess(ip, IEXEC, cr)) == 0) { 2733 VN_HOLD(dvp); 2734 *vpp = dvp; 2735 } 2736 goto out; 2737 } 2738 2739 /* 2740 * Fast path: Check the directory name lookup cache. 
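* dnlc_lookup() returns the cached vnode already held. DNLC_NO_VNODE
* is the negative-cache sentinel and becomes ENOENT once the directory
* has passed the IEXEC check; a real hit jumps to "fastpath" with ulp
* left NULL, bypassing lockfs and ufs_dirlook() entirely.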
2741 */ 2742 if (vp = dnlc_lookup(dvp, nm)) { 2743 /* 2744 * Check accessibility of directory. 2745 */ 2746 if ((error = ufs_iaccess(ip, IEXEC, cr)) != 0) { 2747 VN_RELE(vp); 2748 goto out; 2749 } 2750 if (vp == DNLC_NO_VNODE) { 2751 VN_RELE(vp); 2752 error = ENOENT; 2753 goto out; 2754 } 2755 xip = VTOI(vp); 2756 ulp = NULL; 2757 goto fastpath; 2758 } 2759 2760 /* 2761 * Keep the idle queue from getting too long by 2762 * idling two inodes before attempting to allocate another. 2763 * This operation must be performed before entering 2764 * lockfs or a transaction. 2765 */ 2766 if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat) 2767 if ((curthread->t_flag & T_DONTBLOCK) == 0) { 2768 ins.in_lidles.value.ul += ufs_lookup_idle_count; 2769 ufs_idle_some(ufs_lookup_idle_count); 2770 } 2771 2772 ufsvfsp = ip->i_ufsvfs; 2773 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK); 2774 if (error) 2775 goto out; 2776 2777 error = ufs_dirlook(ip, nm, &xip, cr, 1); 2778 2779 fastpath: 2780 if (error == 0) { 2781 ip = xip; 2782 *vpp = ITOV(ip); 2783 2784 /* 2785 * If vnode is a device return special vnode instead. 2786 */ 2787 if (IS_DEVVP(*vpp)) { 2788 struct vnode *newvp; 2789 2790 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, 2791 cr); 2792 VN_RELE(*vpp); 2793 if (newvp == NULL) 2794 error = ENOSYS; 2795 else 2796 *vpp = newvp; 2797 } 2798 } 2799 if (ulp) { 2800 ufs_lockfs_end(ulp); 2801 } 2802 2803 out: 2804 TRACE_3(TR_FAC_UFS, TR_UFS_LOOKUP_END, 2805 "ufs_lookup_end:dvp %p name %s error %d", vpp, nm, error); 2806 return (error); 2807 } 2808 2809 static int 2810 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl, 2811 int mode, struct vnode **vpp, struct cred *cr, int flag) 2812 { 2813 struct inode *ip; 2814 struct inode *xip; 2815 struct inode *dip; 2816 struct vnode *xvp; 2817 struct ufsvfs *ufsvfsp; 2818 struct ulockfs *ulp; 2819 int error; 2820 int issync; 2821 int truncflag; 2822 int trans_size; 2823 int noentry; 2824 int defer_dip_seq_update = 0; /* need to defer update of dip->i_seq */ 2825 int retry = 1; 2826 2827 TRACE_1(TR_FAC_UFS, TR_UFS_CREATE_START, 2828 "ufs_create_start:dvp %p", dvp); 2829 2830 again: 2831 ip = VTOI(dvp); 2832 ufsvfsp = ip->i_ufsvfs; 2833 truncflag = 0; 2834 2835 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK); 2836 if (error) 2837 goto out; 2838 2839 if (ulp) { 2840 trans_size = (int)TOP_CREATE_SIZE(ip); 2841 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size); 2842 } 2843 2844 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 2845 vap->va_mode &= ~VSVTX; 2846 2847 if (*name == '\0') { 2848 /* 2849 * Null component name refers to the directory itself. 2850 */ 2851 VN_HOLD(dvp); 2852 /* 2853 * Even though this is an error case, we need to grab the 2854 * quota lock since the error handling code below is common. 2855 */ 2856 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2857 rw_enter(&ip->i_contents, RW_WRITER); 2858 error = EEXIST; 2859 } else { 2860 xip = NULL; 2861 noentry = 0; 2862 rw_enter(&ip->i_rwlock, RW_WRITER); 2863 xvp = dnlc_lookup(dvp, name); 2864 if (xvp == DNLC_NO_VNODE) { 2865 noentry = 1; 2866 VN_RELE(xvp); 2867 xvp = NULL; 2868 } 2869 if (xvp) { 2870 rw_exit(&ip->i_rwlock); 2871 if (error = ufs_iaccess(ip, IEXEC, cr)) { 2872 VN_RELE(xvp); 2873 } else { 2874 error = EEXIST; 2875 xip = VTOI(xvp); 2876 } 2877 } else { 2878 /* 2879 * Suppress file system full message if we will retry 2880 */ 2881 error = ufs_direnter_cm(ip, name, DE_CREATE, 2882 vap, &xip, cr, 2883 (noentry | (retry ? 
IQUIET : 0))); 2884 rw_exit(&ip->i_rwlock); 2885 } 2886 ip = xip; 2887 if (ip != NULL) { 2888 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 2889 rw_enter(&ip->i_contents, RW_WRITER); 2890 } 2891 } 2892 2893 /* 2894 * If the file already exists and this is a non-exclusive create, 2895 * check permissions and allow access for non-directories. 2896 * Read-only create of an existing directory is also allowed. 2897 * We fail an exclusive create of anything which already exists. 2898 */ 2899 if (error == EEXIST) { 2900 dip = VTOI(dvp); 2901 if (excl == NONEXCL) { 2902 if ((((ip->i_mode & IFMT) == IFDIR) || 2903 ((ip->i_mode & IFMT) == IFATTRDIR)) && 2904 (mode & IWRITE)) 2905 error = EISDIR; 2906 else if (mode) 2907 error = ufs_iaccess(ip, mode, cr); 2908 else 2909 error = 0; 2910 } 2911 if (error) { 2912 rw_exit(&ip->i_contents); 2913 rw_exit(&ufsvfsp->vfs_dqrwlock); 2914 VN_RELE(ITOV(ip)); 2915 goto unlock; 2916 } 2917 /* 2918 * If the error EEXIST was set, then i_seq can not 2919 * have been updated. The sequence number interface 2920 * is defined such that a non-error VOP_CREATE must 2921 * increase the dir va_seq it by at least one. If we 2922 * have cleared the error, increase i_seq. Note that 2923 * we are increasing the dir i_seq and in rare cases 2924 * ip may actually be from the dvp, so we already have 2925 * the locks and it will not be subject to truncation. 2926 * In case we have to update i_seq of the parent 2927 * directory dip, we have to defer it till we have 2928 * released our locks on ip due to lock ordering requirements. 2929 */ 2930 if (ip != dip) 2931 defer_dip_seq_update = 1; 2932 else 2933 ip->i_seq++; 2934 2935 if (((ip->i_mode & IFMT) == IFREG) && 2936 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 2937 /* 2938 * Truncate regular files, if requested by caller. 2939 * Grab i_rwlock to make sure no one else is 2940 * currently writing to the file (we promised 2941 * bmap we would do this). 2942 * Must get the locks in the correct order. 2943 */ 2944 if (ip->i_size == 0) { 2945 ip->i_flag |= ICHG | IUPD; 2946 ip->i_seq++; 2947 TRANS_INODE(ufsvfsp, ip); 2948 } else { 2949 /* 2950 * Large Files: Why this check here? 2951 * Though we do it in vn_create() we really 2952 * want to guarantee that we do not destroy 2953 * Large file data by atomically checking 2954 * the size while holding the contents 2955 * lock. 2956 */ 2957 if (flag && !(flag & FOFFMAX) && 2958 ((ip->i_mode & IFMT) == IFREG) && 2959 (ip->i_size > (offset_t)MAXOFF32_T)) { 2960 rw_exit(&ip->i_contents); 2961 rw_exit(&ufsvfsp->vfs_dqrwlock); 2962 error = EOVERFLOW; 2963 goto unlock; 2964 } 2965 if (TRANS_ISTRANS(ufsvfsp)) 2966 truncflag++; 2967 else { 2968 rw_exit(&ip->i_contents); 2969 rw_exit(&ufsvfsp->vfs_dqrwlock); 2970 rw_enter(&ip->i_rwlock, RW_WRITER); 2971 rw_enter(&ufsvfsp->vfs_dqrwlock, 2972 RW_READER); 2973 rw_enter(&ip->i_contents, RW_WRITER); 2974 (void) ufs_itrunc(ip, (u_offset_t)0, 0, 2975 cr); 2976 rw_exit(&ip->i_rwlock); 2977 } 2978 } 2979 } 2980 } 2981 2982 if (error) { 2983 if (ip != NULL) { 2984 rw_exit(&ufsvfsp->vfs_dqrwlock); 2985 rw_exit(&ip->i_contents); 2986 } 2987 goto unlock; 2988 } 2989 2990 *vpp = ITOV(ip); 2991 ITIMES(ip); 2992 rw_exit(&ip->i_contents); 2993 rw_exit(&ufsvfsp->vfs_dqrwlock); 2994 2995 /* 2996 * If vnode is a device return special vnode instead. 
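* specvp() hands back the shared specfs vnode for the device and the
* ufs vnode's hold is dropped; truncflag is cleared as well, since the
* deferred TRANS_ITRUNC() below has no meaning for a device node.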
2997 */ 2998 if (!error && IS_DEVVP(*vpp)) { 2999 struct vnode *newvp; 3000 3001 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 3002 VN_RELE(*vpp); 3003 if (newvp == NULL) { 3004 error = ENOSYS; 3005 goto unlock; 3006 } 3007 truncflag = 0; 3008 *vpp = newvp; 3009 } 3010 unlock: 3011 3012 /* 3013 * Do the deferred update of the parent directory's sequence 3014 * number now. 3015 */ 3016 if (defer_dip_seq_update == 1) { 3017 rw_enter(&dip->i_contents, RW_READER); 3018 mutex_enter(&dip->i_tlock); 3019 dip->i_seq++; 3020 mutex_exit(&dip->i_tlock); 3021 rw_exit(&dip->i_contents); 3022 } 3023 3024 if (ulp) { 3025 int terr = 0; 3026 3027 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE, 3028 trans_size); 3029 3030 /* 3031 * If we haven't had a more interesting failure 3032 * already, then anything that might've happened 3033 * here should be reported. 3034 */ 3035 if (error == 0) 3036 error = terr; 3037 } 3038 3039 if (!error && truncflag) { 3040 rw_enter(&ip->i_rwlock, RW_WRITER); 3041 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr); 3042 rw_exit(&ip->i_rwlock); 3043 } 3044 3045 if (ulp) 3046 ufs_lockfs_end(ulp); 3047 3048 /* 3049 * If no inodes available, try to free one up out of the 3050 * pending delete queue. 3051 */ 3052 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3053 ufs_delete_drain_wait(ufsvfsp, 1); 3054 retry = 0; 3055 goto again; 3056 } 3057 3058 out: 3059 TRACE_3(TR_FAC_UFS, TR_UFS_CREATE_END, 3060 "ufs_create_end:dvp %p name %s error %d", vpp, name, error); 3061 return (error); 3062 } 3063 3064 extern int ufs_idle_max; 3065 /*ARGSUSED*/ 3066 static int 3067 ufs_remove(struct vnode *vp, char *nm, struct cred *cr) 3068 { 3069 struct inode *ip = VTOI(vp); 3070 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3071 struct ulockfs *ulp; 3072 vnode_t *rmvp = NULL; /* Vnode corresponding to name being removed */ 3073 int error; 3074 int issync; 3075 int trans_size; 3076 3077 TRACE_1(TR_FAC_UFS, TR_UFS_REMOVE_START, 3078 "ufs_remove_start:vp %p", vp); 3079 3080 /* 3081 * don't let the delete queue get too long 3082 */ 3083 if (ufsvfsp == NULL) { 3084 error = EIO; 3085 goto out; 3086 } 3087 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3088 ufs_delete_drain(vp->v_vfsp, 1, 1); 3089 3090 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK); 3091 if (error) 3092 goto out; 3093 3094 if (ulp) 3095 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 3096 trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp))); 3097 3098 rw_enter(&ip->i_rwlock, RW_WRITER); 3099 error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0, 3100 DR_REMOVE, cr, &rmvp); 3101 rw_exit(&ip->i_rwlock); 3102 3103 if (ulp) { 3104 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size); 3105 ufs_lockfs_end(ulp); 3106 } 3107 3108 /* 3109 * This must be called after the remove transaction is closed. 3110 */ 3111 if (rmvp != NULL) { 3112 /* Only send the event if there were no errors */ 3113 if (error == 0) 3114 vnevent_remove(rmvp); 3115 VN_RELE(rmvp); 3116 } 3117 out: 3118 TRACE_3(TR_FAC_UFS, TR_UFS_REMOVE_END, 3119 "ufs_remove_end:vp %p name %s error %d", vp, nm, error); 3120 return (error); 3121 } 3122 3123 /* 3124 * Link a file or a directory. Only privileged processes are allowed to 3125 * make links to directories. 
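* Directory links require secpolicy_fs_linkdir(); linking a file the
* caller does not own requires secpolicy_basic_link(). In addition,
* extended-attribute files may only be linked into attribute
* directories, and ordinary files may not be.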
3126 */ 3127 static int 3128 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr) 3129 { 3130 struct inode *sip; 3131 struct inode *tdp = VTOI(tdvp); 3132 struct ufsvfs *ufsvfsp = tdp->i_ufsvfs; 3133 struct ulockfs *ulp; 3134 struct vnode *realvp; 3135 int error; 3136 int issync; 3137 int trans_size; 3138 int isdev; 3139 3140 TRACE_1(TR_FAC_UFS, TR_UFS_LINK_START, 3141 "ufs_link_start:tdvp %p", tdvp); 3142 3143 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK); 3144 if (error) 3145 goto out; 3146 3147 if (ulp) 3148 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK, 3149 trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp))); 3150 3151 if (VOP_REALVP(svp, &realvp) == 0) 3152 svp = realvp; 3153 3154 /* 3155 * Make sure link for extended attributes is valid 3156 * We only support hard linking of attr in ATTRDIR to ATTRDIR 3157 * 3158 * Make certain we don't attempt to look at a device node as 3159 * a ufs inode. 3160 */ 3161 3162 isdev = IS_DEVVP(svp); 3163 if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) && 3164 ((tdp->i_mode & IFMT) == IFATTRDIR)) || 3165 ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) && 3166 ((tdp->i_mode & IFMT) == IFDIR))) { 3167 error = EINVAL; 3168 goto unlock; 3169 } 3170 3171 sip = VTOI(svp); 3172 if ((svp->v_type == VDIR && 3173 secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) || 3174 (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) { 3175 error = EPERM; 3176 goto unlock; 3177 } 3178 rw_enter(&tdp->i_rwlock, RW_WRITER); 3179 error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0, 3180 sip, cr, NULL); 3181 rw_exit(&tdp->i_rwlock); 3182 3183 unlock: 3184 if (ulp) { 3185 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size); 3186 ufs_lockfs_end(ulp); 3187 } 3188 out: 3189 TRACE_2(TR_FAC_UFS, TR_UFS_LINK_END, 3190 "ufs_link_end:tdvp %p error %d", tdvp, error); 3191 return (error); 3192 } 3193 3194 uint64_t ufs_rename_retry_cnt; 3195 uint64_t ufs_rename_upgrade_retry_cnt; 3196 uint64_t ufs_rename_dircheck_retry_cnt; 3197 clock_t ufs_rename_backoff_delay = 1; 3198 3199 /* 3200 * Rename a file or directory. 3201 * We are given the vnode and entry string of the source and the 3202 * vnode and entry string of the place we want to move the source 3203 * to (the target). The essential operation is: 3204 * unlink(target); 3205 * link(source, target); 3206 * unlink(source); 3207 * but "atomically". Can't do full commit without saving state in 3208 * the inode on disk, which isn't feasible at this time. Best we 3209 * can do is always guarantee that the TARGET exists. 
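* In practice the new name is created before the old one is removed,
* so the worst a badly timed crash can leave behind is the file linked
* under both the old and the new name; it is never left with neither.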
3210 */ 3211 3212 /*ARGSUSED*/ 3213 static int 3214 ufs_rename( 3215 struct vnode *sdvp, /* old (source) parent vnode */ 3216 char *snm, /* old (source) entry name */ 3217 struct vnode *tdvp, /* new (target) parent vnode */ 3218 char *tnm, /* new (target) entry name */ 3219 struct cred *cr) 3220 { 3221 struct inode *sip = NULL; /* source inode */ 3222 struct inode *ip = NULL; /* check inode */ 3223 struct inode *sdp; /* old (source) parent inode */ 3224 struct inode *tdp; /* new (target) parent inode */ 3225 struct vnode *tvp = NULL; /* target vnode, if it exists */ 3226 struct vnode *realvp; 3227 struct ufsvfs *ufsvfsp; 3228 struct ulockfs *ulp; 3229 struct slot slot; 3230 timestruc_t now; 3231 int error; 3232 int issync; 3233 int trans_size; 3234 3235 TRACE_1(TR_FAC_UFS, TR_UFS_RENAME_START, 3236 "ufs_rename_start:sdvp %p", sdvp); 3237 3238 3239 sdp = VTOI(sdvp); 3240 slot.fbp = NULL; 3241 ufsvfsp = sdp->i_ufsvfs; 3242 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); 3243 if (error) 3244 goto out; 3245 3246 if (ulp) 3247 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, 3248 trans_size = (int)TOP_RENAME_SIZE(sdp)); 3249 3250 if (VOP_REALVP(tdvp, &realvp) == 0) 3251 tdvp = realvp; 3252 3253 tdp = VTOI(tdvp); 3254 3255 3256 /* 3257 * We only allow renaming of attributes from ATTRDIR to ATTRDIR. 3258 */ 3259 if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) { 3260 error = EINVAL; 3261 goto unlock; 3262 } 3263 3264 /* 3265 * Look up inode of file we're supposed to rename. 3266 */ 3267 gethrestime(&now); 3268 if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) { 3269 goto unlock; 3270 } 3271 3272 /* 3273 * Lock both the source and target directories (they may be 3274 * the same) to provide the atomicity semantics that was 3275 * previously provided by the per file system vfs_rename_lock 3276 * 3277 * with vfs_rename_lock removed to allow simultaneous renames 3278 * within a file system, ufs_dircheckpath can deadlock while 3279 * traversing back to ensure that source is not a parent directory 3280 * of target parent directory. This is because we get into 3281 * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER. 3282 * If the tdp and sdp of the simultaneous renames happen to be 3283 * in the path of each other, it can lead to a deadlock. This 3284 * can be avoided by getting the locks as RW_READER here and then 3285 * upgrading to RW_WRITER after completing the ufs_dircheckpath. 3286 */ 3287 retry: 3288 rw_enter(&tdp->i_rwlock, RW_READER); 3289 if (tdp != sdp) { 3290 /* 3291 * We're locking 2 peer level locks, so must use tryenter 3292 * on the 2nd to avoid deadlocks that would occur 3293 * if we renamed a->b and b->a concurrently. 3294 */ 3295 if (!rw_tryenter(&sdp->i_rwlock, RW_READER)) { 3296 /* 3297 * Reverse the lock grabs in case we have heavy 3298 * contention on the 2nd lock. 3299 */ 3300 rw_exit(&tdp->i_rwlock); 3301 rw_enter(&sdp->i_rwlock, RW_READER); 3302 if (!rw_tryenter(&tdp->i_rwlock, RW_READER)) { 3303 ufs_rename_retry_cnt++; 3304 rw_exit(&sdp->i_rwlock); 3305 goto retry; 3306 } 3307 } 3308 } 3309 3310 if (sip == tdp) { 3311 error = EINVAL; 3312 goto errout; 3313 } 3314 /* 3315 * Make sure we can delete the source entry. This requires 3316 * write permission on the containing directory. 3317 * Check for sticky directories. 
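* ufs_sticky_remove_access() applies the usual S_ISVTX rule: in a
* sticky directory only the owner of the entry, the owner of the
* directory, or a suitably privileged caller may remove or rename it.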
3318 */ 3319 rw_enter(&sdp->i_contents, RW_READER); 3320 rw_enter(&sip->i_contents, RW_READER); 3321 if ((error = ufs_iaccess(sdp, IWRITE, cr)) != 0 || 3322 (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) { 3323 rw_exit(&sip->i_contents); 3324 rw_exit(&sdp->i_contents); 3325 goto errout; 3326 } 3327 3328 /* 3329 * If this is a rename of a directory and the parent is 3330 * different (".." must be changed), then the source 3331 * directory must not be in the directory hierarchy 3332 * above the target, as this would orphan everything 3333 * below the source directory. Also the user must have 3334 * write permission in the source so as to be able to 3335 * change "..". 3336 */ 3337 if ((((sip->i_mode & IFMT) == IFDIR) || 3338 ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) { 3339 ino_t inum; 3340 3341 if ((error = ufs_iaccess(sip, IWRITE, cr))) { 3342 rw_exit(&sip->i_contents); 3343 rw_exit(&sdp->i_contents); 3344 goto errout; 3345 } 3346 inum = sip->i_number; 3347 rw_exit(&sip->i_contents); 3348 rw_exit(&sdp->i_contents); 3349 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) { 3350 /* 3351 * If we got EAGAIN ufs_dircheckpath detected a 3352 * potential deadlock and backed out. We need 3353 * to retry the operation since sdp and tdp have 3354 * to be released to avoid the deadlock. 3355 */ 3356 if (error == EAGAIN) { 3357 rw_exit(&tdp->i_rwlock); 3358 if (tdp != sdp) 3359 rw_exit(&sdp->i_rwlock); 3360 delay(ufs_rename_backoff_delay); 3361 ufs_rename_dircheck_retry_cnt++; 3362 goto retry; 3363 } 3364 goto errout; 3365 } 3366 } else { 3367 rw_exit(&sip->i_contents); 3368 rw_exit(&sdp->i_contents); 3369 } 3370 3371 3372 /* 3373 * Check for renaming '.' or '..' or alias of '.' 3374 */ 3375 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) { 3376 error = EINVAL; 3377 goto errout; 3378 } 3379 3380 /* 3381 * Simultaneous renames can deadlock in ufs_dircheckpath since it 3382 * tries to traverse back the file tree with both tdp and sdp held 3383 * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks 3384 * as RW_READERS till ufs_dircheckpath is done. 3385 * Now that ufs_dircheckpath is done with, we can upgrade the locks 3386 * to RW_WRITER. 3387 */ 3388 if (!rw_tryupgrade(&tdp->i_rwlock)) { 3389 /* 3390 * The upgrade failed. We got to give away the lock 3391 * as to avoid deadlocking with someone else who is 3392 * waiting for writer lock. With the lock gone, we 3393 * cannot be sure the checks done above will hold 3394 * good when we eventually get them back as writer. 3395 * So if we can't upgrade we drop the locks and retry 3396 * everything again. 3397 */ 3398 rw_exit(&tdp->i_rwlock); 3399 if (tdp != sdp) 3400 rw_exit(&sdp->i_rwlock); 3401 delay(ufs_rename_backoff_delay); 3402 ufs_rename_upgrade_retry_cnt++; 3403 goto retry; 3404 } 3405 if (tdp != sdp) { 3406 if (!rw_tryupgrade(&sdp->i_rwlock)) { 3407 /* 3408 * The upgrade failed. We got to give away the lock 3409 * as to avoid deadlocking with someone else who is 3410 * waiting for writer lock. With the lock gone, we 3411 * cannot be sure the checks done above will hold 3412 * good when we eventually get them back as writer. 3413 * So if we can't upgrade we drop the locks and retry 3414 * everything again. 
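* The backoff (ufs_rename_backoff_delay) keeps two renames that keep
* colliding from spinning, and the retry re-runs the permission and
* ancestry checks from scratch once the locks are re-acquired;
* ufs_rename_upgrade_retry_cnt counts how often this happens.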
3415 */ 3416 rw_exit(&tdp->i_rwlock); 3417 rw_exit(&sdp->i_rwlock); 3418 delay(ufs_rename_backoff_delay); 3419 ufs_rename_upgrade_retry_cnt++; 3420 goto retry; 3421 } 3422 } 3423 3424 /* 3425 * Now that all the locks are held check to make sure another thread 3426 * didn't slip in and take out the sip. 3427 */ 3428 slot.status = NONE; 3429 if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec || 3430 sip->i_ctime.tv_sec > now.tv_sec) { 3431 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 3432 rw_enter(&sdp->i_contents, RW_WRITER); 3433 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot, 3434 &ip, cr, 0); 3435 rw_exit(&sdp->i_contents); 3436 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock); 3437 if (error) { 3438 goto errout; 3439 } 3440 if (ip == NULL) { 3441 error = ENOENT; 3442 goto errout; 3443 } else { 3444 /* 3445 * If the inode was found need to drop the v_count 3446 * so as not to keep the filesystem from being 3447 * unmounted at a later time. 3448 */ 3449 VN_RELE(ITOV(ip)); 3450 } 3451 3452 /* 3453 * Release the slot.fbp that has the page mapped and 3454 * locked SE_SHARED, and could be used in in 3455 * ufs_direnter_lr() which needs to get the SE_EXCL lock 3456 * on said page. 3457 */ 3458 if (slot.fbp) { 3459 fbrelse(slot.fbp, S_OTHER); 3460 slot.fbp = NULL; 3461 } 3462 } 3463 3464 /* 3465 * Link source to the target. If a target exists, return its 3466 * vnode pointer in tvp. We'll release it after sending the 3467 * vnevent. 3468 */ 3469 if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) { 3470 /* 3471 * ESAME isn't really an error; it indicates that the 3472 * operation should not be done because the source and target 3473 * are the same file, but that no error should be reported. 3474 */ 3475 if (error == ESAME) 3476 error = 0; 3477 goto errout; 3478 } 3479 3480 /* 3481 * Unlink the source. 3482 * Remove the source entry. ufs_dirremove() checks that the entry 3483 * still reflects sip, and returns an error if it doesn't. 3484 * If the entry has changed just forget about it. Release 3485 * the source inode. 3486 */ 3487 if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0, 3488 DR_RENAME, cr, NULL)) == ENOENT) 3489 error = 0; 3490 3491 errout: 3492 if (slot.fbp) 3493 fbrelse(slot.fbp, S_OTHER); 3494 3495 rw_exit(&tdp->i_rwlock); 3496 if (sdp != tdp) { 3497 rw_exit(&sdp->i_rwlock); 3498 } 3499 3500 unlock: 3501 if (ulp) { 3502 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); 3503 ufs_lockfs_end(ulp); 3504 } 3505 3506 /* 3507 * If no errors, send the appropriate events on the source 3508 * and destination (a.k.a, target) vnodes, if they exist. 3509 * This has to be done after the rename transaction has closed. 3510 */ 3511 if (error == 0) { 3512 if (tvp != NULL) 3513 vnevent_rename_dest(tvp); 3514 /* 3515 * Note that if ufs_direnter_lr() returned ESAME then 3516 * this event will still be sent. This isn't expected 3517 * to be a problem for anticipated usage by consumers. 
3518 */ 3519 if (sip != NULL) 3520 vnevent_rename_src(ITOV(sip)); 3521 } 3522 3523 if (tvp != NULL) 3524 VN_RELE(tvp); 3525 3526 if (sip != NULL) 3527 VN_RELE(ITOV(sip)); 3528 3529 out: 3530 TRACE_5(TR_FAC_UFS, TR_UFS_RENAME_END, 3531 "ufs_rename_end:sdvp %p snm %s tdvp %p tnm %s error %d", 3532 sdvp, snm, tdvp, tnm, error); 3533 return (error); 3534 } 3535 3536 /*ARGSUSED*/ 3537 static int 3538 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap, 3539 struct vnode **vpp, struct cred *cr) 3540 { 3541 struct inode *ip; 3542 struct inode *xip; 3543 struct ufsvfs *ufsvfsp; 3544 struct ulockfs *ulp; 3545 int error; 3546 int issync; 3547 int trans_size; 3548 int retry = 1; 3549 3550 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 3551 3552 TRACE_1(TR_FAC_UFS, TR_UFS_MKDIR_START, 3553 "ufs_mkdir_start:dvp %p", dvp); 3554 3555 /* 3556 * Can't make directory in attr hidden dir 3557 */ 3558 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3559 return (EINVAL); 3560 3561 again: 3562 ip = VTOI(dvp); 3563 ufsvfsp = ip->i_ufsvfs; 3564 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3565 if (error) 3566 goto out; 3567 if (ulp) 3568 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, 3569 trans_size = (int)TOP_MKDIR_SIZE(ip)); 3570 3571 rw_enter(&ip->i_rwlock, RW_WRITER); 3572 3573 error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr, 3574 (retry ? IQUIET : 0)); 3575 3576 rw_exit(&ip->i_rwlock); 3577 if (error == 0) { 3578 ip = xip; 3579 *vpp = ITOV(ip); 3580 } else if (error == EEXIST) 3581 VN_RELE(ITOV(xip)); 3582 3583 if (ulp) { 3584 int terr = 0; 3585 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size); 3586 ufs_lockfs_end(ulp); 3587 if (error == 0) 3588 error = terr; 3589 } 3590 out: 3591 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3592 ufs_delete_drain_wait(ufsvfsp, 1); 3593 retry = 0; 3594 goto again; 3595 } 3596 3597 TRACE_2(TR_FAC_UFS, TR_UFS_MKDIR_END, 3598 "ufs_mkdir_end:dvp %p error %d", dvp, error); 3599 return (error); 3600 } 3601 3602 /*ARGSUSED*/ 3603 static int 3604 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr) 3605 { 3606 struct inode *ip = VTOI(vp); 3607 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 3608 struct ulockfs *ulp; 3609 vnode_t *rmvp = NULL; /* Vnode of removed directory */ 3610 int error; 3611 int issync; 3612 3613 TRACE_1(TR_FAC_UFS, TR_UFS_RMDIR_START, 3614 "ufs_rmdir_start:vp %p", vp); 3615 3616 /* 3617 * don't let the delete queue get too long 3618 */ 3619 if (ufsvfsp == NULL) { 3620 error = EIO; 3621 goto out; 3622 } 3623 if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max) 3624 ufs_delete_drain(vp->v_vfsp, 1, 1); 3625 3626 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK); 3627 if (error) 3628 goto out; 3629 3630 if (ulp) 3631 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR, TOP_RMDIR_SIZE); 3632 3633 rw_enter(&ip->i_rwlock, RW_WRITER); 3634 error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr, 3635 &rmvp); 3636 rw_exit(&ip->i_rwlock); 3637 3638 if (ulp) { 3639 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR, 3640 TOP_RMDIR_SIZE); 3641 ufs_lockfs_end(ulp); 3642 } 3643 3644 /* 3645 * This must be done AFTER the rmdir transaction has closed. 
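* ufs_dirremove() returned the removed directory's vnode held in
* rmvp; the remove event is posted and that hold dropped only now,
* once TRANS_END_CSYNC() has closed the TOP_RMDIR transaction.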
3646 */ 3647 if (rmvp != NULL) { 3648 /* Only send the event if there were no errors */ 3649 if (error == 0) 3650 vnevent_rmdir(rmvp); 3651 VN_RELE(rmvp); 3652 } 3653 out: 3654 TRACE_2(TR_FAC_UFS, TR_UFS_RMDIR_END, 3655 "ufs_rmdir_end:vp %p error %d", vp, error); 3656 3657 return (error); 3658 } 3659 3660 /* ARGSUSED */ 3661 static int 3662 ufs_readdir( 3663 struct vnode *vp, 3664 struct uio *uiop, 3665 struct cred *cr, 3666 int *eofp) 3667 { 3668 struct iovec *iovp; 3669 struct inode *ip; 3670 struct direct *idp; 3671 struct dirent64 *odp; 3672 struct fbuf *fbp; 3673 struct ufsvfs *ufsvfsp; 3674 struct ulockfs *ulp; 3675 caddr_t outbuf; 3676 size_t bufsize; 3677 uint_t offset; 3678 uint_t bytes_wanted, total_bytes_wanted; 3679 int incount = 0; 3680 int outcount = 0; 3681 int error; 3682 3683 ip = VTOI(vp); 3684 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 3685 3686 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_START, 3687 "ufs_readdir_start:vp %p uiop %p", vp, uiop); 3688 3689 if (uiop->uio_loffset >= MAXOFF32_T) { 3690 if (eofp) 3691 *eofp = 1; 3692 return (0); 3693 } 3694 3695 /* 3696 * Check if we have been called with a valid iov_len 3697 * and bail out if not, otherwise we may potentially loop 3698 * forever further down. 3699 */ 3700 if (uiop->uio_iov->iov_len <= 0) { 3701 error = EINVAL; 3702 goto out; 3703 } 3704 3705 /* 3706 * Large Files: When we come here we are guaranteed that 3707 * uio_offset can be used safely. The high word is zero. 3708 */ 3709 3710 ufsvfsp = ip->i_ufsvfs; 3711 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK); 3712 if (error) 3713 goto out; 3714 3715 iovp = uiop->uio_iov; 3716 total_bytes_wanted = iovp->iov_len; 3717 3718 /* Large Files: directory files should not be "large" */ 3719 3720 ASSERT(ip->i_size <= MAXOFF32_T); 3721 3722 /* Force offset to be valid (to guard against bogus lseek() values) */ 3723 offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1); 3724 3725 /* Quit if at end of file or link count of zero (posix) */ 3726 if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) { 3727 if (eofp) 3728 *eofp = 1; 3729 error = 0; 3730 goto unlock; 3731 } 3732 3733 /* 3734 * Get space to change directory entries into fs independent format. 3735 * Do fast alloc for the most commonly used-request size (filesystem 3736 * block size). 3737 */ 3738 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) { 3739 bufsize = total_bytes_wanted; 3740 outbuf = kmem_alloc(bufsize, KM_SLEEP); 3741 odp = (struct dirent64 *)outbuf; 3742 } else { 3743 bufsize = total_bytes_wanted; 3744 odp = (struct dirent64 *)iovp->iov_base; 3745 } 3746 3747 nextblk: 3748 bytes_wanted = total_bytes_wanted; 3749 3750 /* Truncate request to file size */ 3751 if (offset + bytes_wanted > (int)ip->i_size) 3752 bytes_wanted = (int)(ip->i_size - offset); 3753 3754 /* Comply with MAXBSIZE boundary restrictions of fbread() */ 3755 if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE) 3756 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET); 3757 3758 /* 3759 * Read in the next chunk. 3760 * We are still holding the i_rwlock. 
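* fbread() maps one chunk of at most MAXBSIZE (hence the trimming
* above). The loop below turns each on-disk struct direct into a
* dirent64, skipping empty slots (d_ino == 0); a d_reclen that is not
* 4-byte aligned marks a mangled entry, and scanning resumes at the
* next DIRBLKSIZ boundary.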
3761 */ 3762 error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp); 3763 3764 if (error) 3765 goto update_inode; 3766 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) && 3767 (!ufsvfsp->vfs_noatime)) { 3768 ip->i_flag |= IACC; 3769 } 3770 incount = 0; 3771 idp = (struct direct *)fbp->fb_addr; 3772 if (idp->d_ino == 0 && idp->d_reclen == 0 && 3773 idp->d_namlen == 0) { 3774 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, " 3775 "fs = %s\n", 3776 (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt); 3777 fbrelse(fbp, S_OTHER); 3778 error = ENXIO; 3779 goto update_inode; 3780 } 3781 /* Transform to file-system independent format */ 3782 while (incount < bytes_wanted) { 3783 /* 3784 * If the current directory entry is mangled, then skip 3785 * to the next block. It would be nice to set the FSBAD 3786 * flag in the super-block so that a fsck is forced on 3787 * next reboot, but locking is a problem. 3788 */ 3789 if (idp->d_reclen & 0x3) { 3790 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3791 break; 3792 } 3793 3794 /* Skip to requested offset and skip empty entries */ 3795 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) { 3796 ushort_t this_reclen = 3797 DIRENT64_RECLEN(idp->d_namlen); 3798 /* Buffer too small for any entries */ 3799 if (!outcount && this_reclen > bufsize) { 3800 fbrelse(fbp, S_OTHER); 3801 error = EINVAL; 3802 goto update_inode; 3803 } 3804 /* If would overrun the buffer, quit */ 3805 if (outcount + this_reclen > bufsize) { 3806 break; 3807 } 3808 /* Take this entry */ 3809 odp->d_ino = (ino64_t)idp->d_ino; 3810 odp->d_reclen = (ushort_t)this_reclen; 3811 odp->d_off = (offset_t)(offset + idp->d_reclen); 3812 3813 /* use strncpy(9f) to zero out uninitialized bytes */ 3814 3815 ASSERT(strlen(idp->d_name) + 1 <= 3816 DIRENT64_NAMELEN(this_reclen)); 3817 (void) strncpy(odp->d_name, idp->d_name, 3818 DIRENT64_NAMELEN(this_reclen)); 3819 outcount += odp->d_reclen; 3820 odp = (struct dirent64 *)((intptr_t)odp + 3821 odp->d_reclen); 3822 ASSERT(outcount <= bufsize); 3823 } 3824 if (idp->d_reclen) { 3825 incount += idp->d_reclen; 3826 offset += idp->d_reclen; 3827 idp = (struct direct *)((intptr_t)idp + idp->d_reclen); 3828 } else { 3829 offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1); 3830 break; 3831 } 3832 } 3833 /* Release the chunk */ 3834 fbrelse(fbp, S_OTHER); 3835 3836 /* Read whole block, but got no entries, read another if not eof */ 3837 3838 /* 3839 * Large Files: casting i_size to int here is not a problem 3840 * because directory sizes are always less than MAXOFF32_T. 3841 * See assertion above. 
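* Once some entries have been produced, the copy-out below either
* adjusts the caller's single kernel-space iovec in place (the
* dirent64 records were built straight into it) or uiomove()s them
* out of outbuf; either way uio_offset ends up at the directory
* offset where the next readdir() should resume.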
3842 */ 3843 3844 if (offset < (int)ip->i_size && !outcount) 3845 goto nextblk; 3846 3847 /* Copy out the entry data */ 3848 if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) { 3849 iovp->iov_base += outcount; 3850 iovp->iov_len -= outcount; 3851 uiop->uio_resid -= outcount; 3852 uiop->uio_offset = offset; 3853 } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, 3854 uiop)) == 0) 3855 uiop->uio_offset = offset; 3856 update_inode: 3857 ITIMES(ip); 3858 if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) 3859 kmem_free(outbuf, bufsize); 3860 3861 if (eofp && error == 0) 3862 *eofp = (uiop->uio_offset >= (int)ip->i_size); 3863 unlock: 3864 if (ulp) { 3865 ufs_lockfs_end(ulp); 3866 } 3867 out: 3868 TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_END, 3869 "ufs_readdir_end:vp %p error %d", vp, error); 3870 return (error); 3871 } 3872 3873 /*ARGSUSED*/ 3874 static int 3875 ufs_symlink( 3876 struct vnode *dvp, /* ptr to parent dir vnode */ 3877 char *linkname, /* name of symbolic link */ 3878 struct vattr *vap, /* attributes */ 3879 char *target, /* target path */ 3880 struct cred *cr) /* user credentials */ 3881 { 3882 struct inode *ip, *dip = VTOI(dvp); 3883 struct ufsvfs *ufsvfsp = dip->i_ufsvfs; 3884 struct ulockfs *ulp; 3885 int error; 3886 int issync; 3887 int trans_size; 3888 int residual; 3889 int ioflag; 3890 int retry = 1; 3891 3892 TRACE_1(TR_FAC_UFS, TR_UFS_SYMLINK_START, 3893 "ufs_symlink_start:dvp %p", dvp); 3894 3895 /* 3896 * No symlinks in attrdirs at this time 3897 */ 3898 if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR) 3899 return (EINVAL); 3900 3901 again: 3902 ip = (struct inode *)NULL; 3903 vap->va_type = VLNK; 3904 vap->va_rdev = 0; 3905 3906 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK); 3907 if (error) 3908 goto out; 3909 3910 if (ulp) 3911 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK, 3912 trans_size = (int)TOP_SYMLINK_SIZE(dip)); 3913 3914 /* 3915 * We must create the inode before the directory entry, to avoid 3916 * racing with readlink(). ufs_dirmakeinode requires that we 3917 * hold the quota lock as reader, and directory locks as writer. 3918 */ 3919 3920 rw_enter(&dip->i_rwlock, RW_WRITER); 3921 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3922 rw_enter(&dip->i_contents, RW_WRITER); 3923 3924 /* 3925 * Suppress any out of inodes messages if we will retry on 3926 * ENOSP 3927 */ 3928 if (retry) 3929 dip->i_flag |= IQUIET; 3930 3931 error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr); 3932 3933 dip->i_flag &= ~IQUIET; 3934 3935 rw_exit(&dip->i_contents); 3936 rw_exit(&ufsvfsp->vfs_dqrwlock); 3937 rw_exit(&dip->i_rwlock); 3938 3939 if (error) 3940 goto unlock; 3941 3942 /* 3943 * OK. The inode has been created. Write out the data of the 3944 * symbolic link. Since symbolic links are metadata, and should 3945 * remain consistent across a system crash, we need to force the 3946 * data out synchronously. 3947 * 3948 * (This is a change from the semantics in earlier releases, which 3949 * only created symbolic links synchronously if the semi-documented 3950 * 'syncdir' option was set, or if we were being invoked by the NFS 3951 * server, which requires symbolic links to be created synchronously.) 3952 * 3953 * We need to pass in a pointer for the residual length; otherwise 3954 * ufs_rdwri() will always return EIO if it can't write the data, 3955 * even if the error was really ENOSPC or EDQUOT. 
3956 */ 3957 3958 ioflag = FWRITE | FDSYNC; 3959 residual = 0; 3960 3961 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3962 rw_enter(&ip->i_contents, RW_WRITER); 3963 3964 /* 3965 * Suppress file system full messages if we will retry 3966 */ 3967 if (retry) 3968 ip->i_flag |= IQUIET; 3969 3970 error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target), 3971 (offset_t)0, UIO_SYSSPACE, &residual, cr); 3972 3973 ip->i_flag &= ~IQUIET; 3974 3975 if (error) { 3976 rw_exit(&ip->i_contents); 3977 rw_exit(&ufsvfsp->vfs_dqrwlock); 3978 goto remove; 3979 } 3980 3981 /* 3982 * If the link's data is small enough, we can cache it in the inode. 3983 * This is a "fast symbolic link". We don't use the first direct 3984 * block because that's actually used to point at the symbolic link's 3985 * contents on disk; but we know that none of the other direct or 3986 * indirect blocks can be used because symbolic links are restricted 3987 * to be smaller than a file system block. 3988 */ 3989 3990 ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip))); 3991 3992 if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) { 3993 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) { 3994 ip->i_flag |= IFASTSYMLNK; 3995 } else { 3996 int i; 3997 /* error, clear garbage left behind */ 3998 for (i = 1; i < NDADDR; i++) 3999 ip->i_db[i] = 0; 4000 for (i = 0; i < NIADDR; i++) 4001 ip->i_ib[i] = 0; 4002 } 4003 } 4004 4005 rw_exit(&ip->i_contents); 4006 rw_exit(&ufsvfsp->vfs_dqrwlock); 4007 4008 /* 4009 * OK. We've successfully created the symbolic link. All that 4010 * remains is to insert it into the appropriate directory. 4011 */ 4012 4013 rw_enter(&dip->i_rwlock, RW_WRITER); 4014 error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL); 4015 rw_exit(&dip->i_rwlock); 4016 4017 /* 4018 * Fall through into remove-on-error code. We're either done, or we 4019 * need to remove the inode (if we couldn't insert it). 4020 */ 4021 4022 remove: 4023 if (error && (ip != NULL)) { 4024 rw_enter(&ip->i_contents, RW_WRITER); 4025 ip->i_nlink--; 4026 ip->i_flag |= ICHG; 4027 ip->i_seq++; 4028 ufs_setreclaim(ip); 4029 rw_exit(&ip->i_contents); 4030 } 4031 4032 unlock: 4033 if (ip != NULL) 4034 VN_RELE(ITOV(ip)); 4035 4036 if (ulp) { 4037 int terr = 0; 4038 4039 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK, 4040 trans_size); 4041 ufs_lockfs_end(ulp); 4042 if (error == 0) 4043 error = terr; 4044 } 4045 4046 /* 4047 * We may have failed due to lack of an inode or of a block to 4048 * store the target in. Try flushing the delete queue to free 4049 * logically-available things up and try again. 4050 */ 4051 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 4052 ufs_delete_drain_wait(ufsvfsp, 1); 4053 retry = 0; 4054 goto again; 4055 } 4056 4057 out: 4058 TRACE_2(TR_FAC_UFS, TR_UFS_SYMLINK_END, 4059 "ufs_symlink_end:dvp %p error %d", dvp, error); 4060 return (error); 4061 } 4062 4063 /* 4064 * Ufs specific routine used to do ufs io. 
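* ufs_rdwri() builds a single-iovec uio around the caller's buffer and
* hands it to wrip() or rdip(); i_contents must already be held. If
* aresid is NULL a short transfer becomes EIO, otherwise the residual
* byte count is returned through it. ufs_symlink() above uses it as:
*
*	error = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, target,
*	    strlen(target), (offset_t)0, UIO_SYSSPACE, &residual, cr);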
4065 */ 4066 int 4067 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base, 4068 ssize_t len, offset_t offset, enum uio_seg seg, int *aresid, 4069 struct cred *cr) 4070 { 4071 struct uio auio; 4072 struct iovec aiov; 4073 int error; 4074 4075 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 4076 4077 bzero((caddr_t)&auio, sizeof (uio_t)); 4078 bzero((caddr_t)&aiov, sizeof (iovec_t)); 4079 4080 aiov.iov_base = base; 4081 aiov.iov_len = len; 4082 auio.uio_iov = &aiov; 4083 auio.uio_iovcnt = 1; 4084 auio.uio_loffset = offset; 4085 auio.uio_segflg = (short)seg; 4086 auio.uio_resid = len; 4087 4088 if (rw == UIO_WRITE) { 4089 auio.uio_fmode = FWRITE; 4090 auio.uio_extflg = UIO_COPY_DEFAULT; 4091 auio.uio_llimit = curproc->p_fsz_ctl; 4092 error = wrip(ip, &auio, ioflag, cr); 4093 } else { 4094 auio.uio_fmode = FREAD; 4095 auio.uio_extflg = UIO_COPY_CACHED; 4096 auio.uio_llimit = MAXOFFSET_T; 4097 error = rdip(ip, &auio, ioflag, cr); 4098 } 4099 4100 if (aresid) { 4101 *aresid = auio.uio_resid; 4102 } else if (auio.uio_resid) { 4103 error = EIO; 4104 } 4105 return (error); 4106 } 4107 4108 static int 4109 ufs_fid(vp, fidp) 4110 struct vnode *vp; 4111 struct fid *fidp; 4112 { 4113 struct ufid *ufid; 4114 struct inode *ip = VTOI(vp); 4115 4116 if (ip->i_ufsvfs == NULL) 4117 return (EIO); 4118 4119 if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) { 4120 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t); 4121 return (ENOSPC); 4122 } 4123 4124 ufid = (struct ufid *)fidp; 4125 bzero((char *)ufid, sizeof (struct ufid)); 4126 ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t); 4127 ufid->ufid_ino = ip->i_number; 4128 ufid->ufid_gen = ip->i_gen; 4129 4130 return (0); 4131 } 4132 4133 /* ARGSUSED2 */ 4134 static int 4135 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4136 { 4137 struct inode *ip = VTOI(vp); 4138 struct ufsvfs *ufsvfsp; 4139 int forcedirectio; 4140 4141 /* 4142 * Read case is easy. 4143 */ 4144 if (!write_lock) { 4145 rw_enter(&ip->i_rwlock, RW_READER); 4146 return (V_WRITELOCK_FALSE); 4147 } 4148 4149 /* 4150 * Caller has requested a writer lock, but that inhibits any 4151 * concurrency in the VOPs that follow. Acquire the lock shared 4152 * and defer exclusive access until it is known to be needed in 4153 * other VOP handlers. Some cases can be determined here. 4154 */ 4155 4156 /* 4157 * If directio is not set, there is no chance of concurrency, 4158 * so just acquire the lock exclusive. Beware of a forced 4159 * unmount before looking at the mount option. 4160 */ 4161 ufsvfsp = ip->i_ufsvfs; 4162 forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0; 4163 if (!(ip->i_flag & IDIRECTIO || forcedirectio) || 4164 !ufs_allow_shared_writes) { 4165 rw_enter(&ip->i_rwlock, RW_WRITER); 4166 return (V_WRITELOCK_TRUE); 4167 } 4168 4169 /* 4170 * Mandatory locking forces acquiring i_rwlock exclusive. 4171 */ 4172 if (MANDLOCK(vp, ip->i_mode)) { 4173 rw_enter(&ip->i_rwlock, RW_WRITER); 4174 return (V_WRITELOCK_TRUE); 4175 } 4176 4177 /* 4178 * Acquire the lock shared in case a concurrent write follows. 4179 * Mandatory locking could have become enabled before the lock 4180 * was acquired. Re-check and upgrade if needed. 
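 *
 * (The "upgrade" below is implemented by dropping the reader lock and
 * re-acquiring it as writer rather than by an in-place upgrade.  A
 * hypothetical rw_tryupgrade()-first variant, shown only as a sketch
 * and not the code that follows, would be:
 *
 *	rw_enter(&ip->i_rwlock, RW_READER);
 *	if (MANDLOCK(vp, ip->i_mode)) {
 *		if (!rw_tryupgrade(&ip->i_rwlock)) {
 *			rw_exit(&ip->i_rwlock);
 *			rw_enter(&ip->i_rwlock, RW_WRITER);
 *		}
 *		return (V_WRITELOCK_TRUE);
 *	}
 *	return (V_WRITELOCK_FALSE);
 *
 * Either form returns V_WRITELOCK_TRUE once mandatory locking is seen,
 * so blocking for the writer lock is acceptable here.)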
*/ 4182 rw_enter(&ip->i_rwlock, RW_READER); 4183 if (MANDLOCK(vp, ip->i_mode)) { 4184 rw_exit(&ip->i_rwlock); 4185 rw_enter(&ip->i_rwlock, RW_WRITER); 4186 return (V_WRITELOCK_TRUE); 4187 } 4188 return (V_WRITELOCK_FALSE); 4189 } 4190 4191 /*ARGSUSED*/ 4192 static void 4193 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) 4194 { 4195 struct inode *ip = VTOI(vp); 4196 4197 rw_exit(&ip->i_rwlock); 4198 } 4199 4200 /* ARGSUSED */ 4201 static int 4202 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp) 4203 { 4204 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4205 } 4206 4207 /* ARGSUSED */ 4208 static int 4209 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 4210 offset_t offset, struct flk_callback *flk_cbp, struct cred *cr) 4211 { 4212 struct inode *ip = VTOI(vp); 4213 4214 if (ip->i_ufsvfs == NULL) 4215 return (EIO); 4216 4217 /* 4218 * If file is being mapped, disallow frlock. 4219 * XXX I am not holding tlock while checking i_mapcnt because the 4220 * current locking strategy drops all locks before calling fs_frlock. 4221 * So, mapcnt could change before we enter fs_frlock making it 4222 * meaningless to have held tlock in the first place. 4223 */ 4224 if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode)) 4225 return (EAGAIN); 4226 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 4227 } 4228 4229 /* ARGSUSED */ 4230 static int 4231 ufs_space( 4232 struct vnode *vp, 4233 int cmd, 4234 struct flock64 *bfp, 4235 int flag, 4236 offset_t offset, 4237 cred_t *cr, 4238 caller_context_t *ct) 4239 { 4240 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 4241 struct ulockfs *ulp; 4242 int error; 4243 4244 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SPACE_MASK); 4245 if (error) 4246 return (error); 4247 4248 4249 if (cmd != F_FREESP) 4250 error = EINVAL; 4251 else if ((error = convoff(vp, bfp, 0, offset)) == 0) 4252 error = ufs_freesp(vp, bfp, flag, cr); 4253 4254 if (ulp) 4255 ufs_lockfs_end(ulp); 4256 return (error); 4257 } 4258 4259 /* 4260 * Used to determine if read ahead should be done. Also used to 4261 * determine when write back occurs. 4262 */ 4263 #define CLUSTSZ(ip) ((ip)->i_ufsvfs->vfs_ioclustsz) 4264 4265 /* 4266 * A faster version of ufs_getpage. 4267 * 4268 * We optimize by inlining the pvn_getpages iterator, eliminating 4269 * calls to bmap_read if the file doesn't have UFS holes, and avoiding 4270 * the overhead of page_exists(). 4271 * 4272 * When a file has UFS holes and ufs_getpage is called with S_READ, 4273 * we set *protp to PROT_READ to avoid calling bmap_read. This approach 4274 * victimizes performance when a file with UFS holes is faulted 4275 * first in the S_READ mode, and then in the S_WRITE mode. We will get 4276 * two MMU faults in this case. 4277 * 4278 * XXX - the inode fields which control the sequential mode are not 4279 * protected by any mutex. The read ahead can behave erratically if 4280 * multiple processes access the file concurrently and 4281 * some of them are in sequential mode. One particularly bad case 4282 * is when another thread changes the value of i_nextrio between 4283 * the time this thread tests the i_nextrio value and the time it reads it 4284 * again to use it as the offset for the read ahead.
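 *
 * A purely illustrative interleaving of the problem (not taken from a
 * real trace):
 *
 *	thread A				thread B
 *	nextrio = ip->i_nextrio;
 *						ip->i_nextrio = <other offset>;
 *	issue read ahead at the now
 *	stale nextrio ...
 *
 * The damage is presumably limited to wasted or misplaced read ahead
 * I/O rather than to data integrity, which is why it is noted as an
 * XXX instead of being fixed with a lock.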
4285 */ 4286 static int 4287 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 4288 page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr, 4289 enum seg_rw rw, struct cred *cr) 4290 { 4291 u_offset_t uoff = (u_offset_t)off; /* type conversion */ 4292 u_offset_t pgoff; 4293 u_offset_t eoff; 4294 struct inode *ip = VTOI(vp); 4295 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 4296 struct fs *fs; 4297 struct ulockfs *ulp; 4298 page_t **pl; 4299 caddr_t pgaddr; 4300 krw_t rwtype; 4301 int err; 4302 int has_holes; 4303 int beyond_eof; 4304 int seqmode; 4305 int pgsize = PAGESIZE; 4306 int dolock; 4307 int do_qlock; 4308 int trans_size; 4309 4310 TRACE_1(TR_FAC_UFS, TR_UFS_GETPAGE_START, 4311 "ufs_getpage_start:vp %p", vp); 4312 4313 ASSERT((uoff & PAGEOFFSET) == 0); 4314 4315 if (protp) 4316 *protp = PROT_ALL; 4317 4318 /* 4319 * Obey the lockfs protocol 4320 */ 4321 err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg, 4322 rw == S_READ || rw == S_EXEC, protp); 4323 if (err) 4324 goto out; 4325 4326 fs = ufsvfsp->vfs_fs; 4327 4328 if (ulp && (rw == S_CREATE || rw == S_WRITE) && 4329 !(vp->v_flag & VISSWAP)) { 4330 /* 4331 * Try to start a transaction, will return if blocking is 4332 * expected to occur and the address space is not the 4333 * kernel address space. 4334 */ 4335 trans_size = TOP_GETPAGE_SIZE(ip); 4336 if (seg->s_as != &kas) { 4337 TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, 4338 trans_size, err) 4339 if (err == EWOULDBLOCK) { 4340 /* 4341 * Use EDEADLK here because the VM code 4342 * can normally never see this error. 4343 */ 4344 err = EDEADLK; 4345 ufs_lockfs_end(ulp); 4346 goto out; 4347 } 4348 } else { 4349 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4350 } 4351 } 4352 4353 if (vp->v_flag & VNOMAP) { 4354 err = ENOSYS; 4355 goto unlock; 4356 } 4357 4358 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 4359 4360 rwtype = RW_READER; /* start as a reader */ 4361 dolock = (rw_owner(&ip->i_contents) != curthread); 4362 /* 4363 * If this thread owns the lock, i.e., this thread grabbed it 4364 * as writer somewhere above, then we don't need to grab the 4365 * lock as reader in this routine. 4366 */ 4367 do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread); 4368 4369 retrylock: 4370 if (dolock) { 4371 /* 4372 * Grab the quota lock if we need to call 4373 * bmap_write() below (with i_contents as writer). 4374 */ 4375 if (do_qlock && rwtype == RW_WRITER) 4376 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 4377 rw_enter(&ip->i_contents, rwtype); 4378 } 4379 4380 /* 4381 * We may be getting called as a side effect of a bmap using 4382 * fbread() when the blocks might be being allocated and the 4383 * size has not yet been up'ed. In this case we want to be 4384 * able to return zero pages if we get back UFS_HOLE from 4385 * calling bmap for a non write case here. We also might have 4386 * to read some frags from the disk into a page if we are 4387 * extending the number of frags for a given lbn in bmap(). 4388 * Large Files: The read of i_size here is atomic because 4389 * i_contents is held here. If dolock is zero, the lock 4390 * is held in bmap routines. 
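 *
 * As a numeric illustration of the test below: with 8K pages
 * (PAGEOFFSET == 8191) and i_size == 10000, the request is rejected
 * with EFAULT only once uoff + len exceeds 18191 (and only when the
 * fault did not come in through segkmap).  The extra page of slack is
 * what lets faults against the page containing EOF succeed, and
 * segkmap is exempted entirely for the fbread()/bmap case described
 * above, where i_size has not yet been up'ed.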
4391 */ 4392 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 4393 if (beyond_eof && seg != segkmap) { 4394 if (dolock) { 4395 rw_exit(&ip->i_contents); 4396 if (do_qlock && rwtype == RW_WRITER) 4397 rw_exit(&ufsvfsp->vfs_dqrwlock); 4398 } 4399 err = EFAULT; 4400 goto unlock; 4401 } 4402 4403 /* 4404 * Must hold i_contents lock throughout the call to pvn_getpages 4405 * since locked pages are returned from each call to ufs_getapage. 4406 * Must *not* return locked pages and then try for contents lock 4407 * due to lock ordering requirements (inode > page) 4408 */ 4409 4410 has_holes = bmap_has_holes(ip); 4411 4412 if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) { 4413 int blk_size; 4414 u_offset_t offset; 4415 4416 /* 4417 * We must acquire the RW_WRITER lock in order to 4418 * call bmap_write(). 4419 */ 4420 if (dolock && rwtype == RW_READER) { 4421 rwtype = RW_WRITER; 4422 4423 /* 4424 * Grab the quota lock before 4425 * upgrading i_contents, but if we can't grab it 4426 * don't wait here due to lock order: 4427 * vfs_dqrwlock > i_contents. 4428 */ 4429 if (do_qlock && rw_tryenter(&ufsvfsp->vfs_dqrwlock, 4430 RW_READER) == 0) { 4431 rw_exit(&ip->i_contents); 4432 goto retrylock; 4433 } 4434 if (!rw_tryupgrade(&ip->i_contents)) { 4435 rw_exit(&ip->i_contents); 4436 if (do_qlock) 4437 rw_exit(&ufsvfsp->vfs_dqrwlock); 4438 goto retrylock; 4439 } 4440 } 4441 4442 /* 4443 * May be allocating disk blocks for holes here as 4444 * a result of mmap faults. write(2) does the bmap_write 4445 * in rdip/wrip, not here. We are not dealing with frags 4446 * in this case. 4447 */ 4448 /* 4449 * Large Files: We cast fs_bmask field to offset_t 4450 * just as we do for MAXBMASK because uoff is a 64-bit 4451 * data type. fs_bmask will still be a 32-bit type 4452 * as we cannot change any ondisk data structures. 4453 */ 4454 4455 offset = uoff & (offset_t)fs->fs_bmask; 4456 while (offset < uoff + len) { 4457 blk_size = (int)blksize(fs, ip, lblkno(fs, offset)); 4458 err = bmap_write(ip, offset, blk_size, 0, cr); 4459 if (ip->i_flag & (ICHG|IUPD)) 4460 ip->i_seq++; 4461 if (err) 4462 goto update_inode; 4463 offset += blk_size; /* XXX - make this contig */ 4464 } 4465 } 4466 4467 /* 4468 * Can be a reader from now on. 4469 */ 4470 if (dolock && rwtype == RW_WRITER) { 4471 rw_downgrade(&ip->i_contents); 4472 /* 4473 * We can release vfs_dqrwlock early so do it, but make 4474 * sure we don't try to release it again at the bottom. 4475 */ 4476 if (do_qlock) { 4477 rw_exit(&ufsvfsp->vfs_dqrwlock); 4478 do_qlock = 0; 4479 } 4480 } 4481 4482 /* 4483 * We remove PROT_WRITE in cases when the file has UFS holes 4484 * because we don't want to call bmap_read() to check each 4485 * page if it is backed with a disk block. 4486 */ 4487 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) 4488 *protp &= ~PROT_WRITE; 4489 4490 err = 0; 4491 4492 /* 4493 * The loop looks up pages in the range [off, off + len). 4494 * For each page, we first check if we should initiate an asynchronous 4495 * read ahead before we call page_lookup (we may sleep in page_lookup 4496 * for a previously initiated disk read). 4497 */ 4498 eoff = (uoff + len); 4499 for (pgoff = uoff, pgaddr = addr, pl = plarr; 4500 pgoff < eoff; /* empty */) { 4501 page_t *pp; 4502 u_offset_t nextrio; 4503 se_t se; 4504 int retval; 4505 4506 se = ((rw == S_CREATE || rw == S_OTHER) ? 
SE_EXCL : SE_SHARED); 4507 4508 /* Handle async getpage (faultahead) */ 4509 if (plarr == NULL) { 4510 ip->i_nextrio = pgoff; 4511 (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4512 pgoff += pgsize; 4513 pgaddr += pgsize; 4514 continue; 4515 } 4516 /* 4517 * Check if we should initiate read ahead of next cluster. 4518 * We call page_exists only when we need to confirm that 4519 * we have the current page before we initiate the read ahead. 4520 */ 4521 nextrio = ip->i_nextrio; 4522 if (seqmode && 4523 pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 4524 nextrio < ip->i_size && page_exists(vp, pgoff)) { 4525 retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr); 4526 /* 4527 * We always read ahead the next cluster of data 4528 * starting from i_nextrio. If the page (vp,nextrio) 4529 * is actually in core at this point, the routine 4530 * ufs_getpage_ra() will stop pre-fetching data 4531 * until we read that page in a synchronized manner 4532 * through ufs_getpage_miss(). So, we should increase 4533 * i_nextrio if the page (vp, nextrio) exists. 4534 */ 4535 if ((retval == 0) && page_exists(vp, nextrio)) { 4536 ip->i_nextrio = nextrio + pgsize; 4537 } 4538 } 4539 4540 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 4541 /* 4542 * We found the page in the page cache. 4543 */ 4544 *pl++ = pp; 4545 pgoff += pgsize; 4546 pgaddr += pgsize; 4547 len -= pgsize; 4548 plsz -= pgsize; 4549 } else { 4550 /* 4551 * We have to create the page, or read it from disk. 4552 */ 4553 if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr, 4554 pl, plsz, rw, seqmode)) 4555 goto error; 4556 4557 while (*pl != NULL) { 4558 pl++; 4559 pgoff += pgsize; 4560 pgaddr += pgsize; 4561 len -= pgsize; 4562 plsz -= pgsize; 4563 } 4564 } 4565 } 4566 4567 /* 4568 * Return pages up to plsz if they are in the page cache. 4569 * We cannot return pages if there is a chance that they are 4570 * backed with a UFS hole and rw is S_WRITE or S_CREATE. 4571 */ 4572 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 4573 4574 ASSERT((protp == NULL) || 4575 !(has_holes && (*protp & PROT_WRITE))); 4576 4577 eoff = pgoff + plsz; 4578 while (pgoff < eoff) { 4579 page_t *pp; 4580 4581 if ((pp = page_lookup_nowait(vp, pgoff, 4582 SE_SHARED)) == NULL) 4583 break; 4584 4585 *pl++ = pp; 4586 pgoff += pgsize; 4587 plsz -= pgsize; 4588 } 4589 } 4590 4591 if (plarr) 4592 *pl = NULL; /* Terminate page list */ 4593 ip->i_nextr = pgoff; 4594 4595 error: 4596 if (err && plarr) { 4597 /* 4598 * Release any pages we have locked. 4599 */ 4600 while (pl > &plarr[0]) 4601 page_unlock(*--pl); 4602 4603 plarr[0] = NULL; 4604 } 4605 4606 update_inode: 4607 /* 4608 * If the inode is not already marked for IACC (in rdip() for read) 4609 * and the inode is not marked for no access time update (in wrip() 4610 * for write) then update the inode access time and mod time now. 
4611 */ 4612 if ((ip->i_flag & (IACC | INOACC)) == 0) { 4613 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) { 4614 if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && 4615 (fs->fs_ronly == 0) && 4616 (!ufsvfsp->vfs_noatime)) { 4617 mutex_enter(&ip->i_tlock); 4618 ip->i_flag |= IACC; 4619 ITIMES_NOLOCK(ip); 4620 mutex_exit(&ip->i_tlock); 4621 } 4622 } 4623 } 4624 4625 if (dolock) { 4626 rw_exit(&ip->i_contents); 4627 if (do_qlock && rwtype == RW_WRITER) 4628 rw_exit(&ufsvfsp->vfs_dqrwlock); 4629 } 4630 4631 unlock: 4632 if (ulp) { 4633 if ((rw == S_CREATE || rw == S_WRITE) && 4634 !(vp->v_flag & VISSWAP)) { 4635 TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size); 4636 } 4637 ufs_lockfs_end(ulp); 4638 } 4639 out: 4640 TRACE_2(TR_FAC_UFS, TR_UFS_GETPAGE_END, 4641 "ufs_getpage_end:vp %p error %d", vp, err); 4642 return (err); 4643 } 4644 4645 /* 4646 * ufs_getpage_miss is called when ufs_getpage missed the page in the page 4647 * cache. The page is either read from the disk, or it's created. 4648 * A page is created (without disk read) if rw == S_CREATE, or if 4649 * the page is not backed with a real disk block (UFS hole). 4650 */ 4651 /* ARGSUSED */ 4652 static int 4653 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg, 4654 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq) 4655 { 4656 struct inode *ip = VTOI(vp); 4657 page_t *pp; 4658 daddr_t bn; 4659 size_t io_len; 4660 int crpage; 4661 int err; 4662 int contig; 4663 int bsize = ip->i_fs->fs_bsize; 4664 4665 /* 4666 * Figure out whether the page can be created, or must be 4667 * must be read from the disk. 4668 */ 4669 if (rw == S_CREATE) 4670 crpage = 1; 4671 else { 4672 contig = 0; 4673 if (err = bmap_read(ip, off, &bn, &contig)) 4674 return (err); 4675 crpage = (bn == UFS_HOLE); 4676 } 4677 4678 if (crpage) { 4679 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, 4680 addr)) == NULL) { 4681 return (ufs_fault(vp, 4682 "ufs_getpage_miss: page_create == NULL")); 4683 } 4684 4685 if (rw != S_CREATE) 4686 pagezero(pp, 0, PAGESIZE); 4687 io_len = PAGESIZE; 4688 } else { 4689 u_offset_t io_off; 4690 uint_t xlen; 4691 struct buf *bp; 4692 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 4693 4694 /* 4695 * If access is not in sequential order, we read from disk 4696 * in bsize units. 4697 * 4698 * We limit the size of the transfer to bsize if we are reading 4699 * from the beginning of the file. Note in this situation we 4700 * will hedge our bets and initiate an async read ahead of 4701 * the second block. 4702 */ 4703 if (!seq || off == 0) 4704 contig = MIN(contig, bsize); 4705 4706 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4707 &io_len, off, contig, 0); 4708 4709 /* 4710 * Some other thread has entered the page. 4711 * ufs_getpage will retry page_lookup. 4712 */ 4713 if (pp == NULL) { 4714 pl[0] = NULL; 4715 return (0); 4716 } 4717 4718 /* 4719 * Zero part of the page which we are not 4720 * going to read from the disk. 
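 * For example, with 8K pages (PAGESIZE == 8192) an io_len of 10240
 * gives xlen == 10240 & 8191 == 2048, so the pagezero() below clears
 * bytes 2048 through 8191 of the last page in the kluster
 * (pp->p_prev), i.e. the tail that the device read will not fill.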
4721 */ 4722 xlen = io_len & PAGEOFFSET; 4723 if (xlen != 0) 4724 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4725 4726 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ); 4727 bp->b_edev = ip->i_dev; 4728 bp->b_dev = cmpdev(ip->i_dev); 4729 bp->b_blkno = bn; 4730 bp->b_un.b_addr = (caddr_t)0; 4731 bp->b_file = ip->i_vnode; 4732 bp->b_offset = off; 4733 4734 if (ufsvfsp->vfs_log) { 4735 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4736 } else if (ufsvfsp->vfs_snapshot) { 4737 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4738 } else { 4739 ufsvfsp->vfs_iotstamp = lbolt; 4740 ub.ub_getpages.value.ul++; 4741 (void) bdev_strategy(bp); 4742 lwp_stat_update(LWP_STAT_INBLK, 1); 4743 } 4744 4745 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK); 4746 4747 /* 4748 * If the file access is sequential, initiate read ahead 4749 * of the next cluster. 4750 */ 4751 if (seq && ip->i_nextrio < ip->i_size) 4752 (void) ufs_getpage_ra(vp, off, seg, addr); 4753 err = biowait(bp); 4754 pageio_done(bp); 4755 4756 if (err) { 4757 pvn_read_done(pp, B_ERROR); 4758 return (err); 4759 } 4760 } 4761 4762 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4763 return (0); 4764 } 4765 4766 /* 4767 * Read ahead a cluster from the disk. Returns the length in bytes. 4768 */ 4769 static int 4770 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr) 4771 { 4772 struct inode *ip = VTOI(vp); 4773 page_t *pp; 4774 u_offset_t io_off = ip->i_nextrio; 4775 ufsvfs_t *ufsvfsp; 4776 caddr_t addr2 = addr + (io_off - off); 4777 struct buf *bp; 4778 daddr_t bn; 4779 size_t io_len; 4780 int contig; 4781 int xlen; 4782 int bsize = ip->i_fs->fs_bsize; 4783 4784 /* 4785 * If the directio advisory is in effect on this file, 4786 * then do not do buffered read ahead. Read ahead makes 4787 * it more difficult on threads using directio as they 4788 * will be forced to flush the pages from this vnode. 4789 */ 4790 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 4791 return (0); 4792 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) 4793 return (0); 4794 4795 /* 4796 * Is this test needed? 4797 */ 4798 if (addr2 >= seg->s_base + seg->s_size) 4799 return (0); 4800 4801 contig = 0; 4802 if (bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UFS_HOLE) 4803 return (0); 4804 4805 /* 4806 * Limit the transfer size to bsize if this is the 2nd block. 
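 * (A read of the start of the file is itself clamped to bsize in
 * ufs_getpage_miss(), which then "hedges its bets" with an async read
 * ahead of the second block; the clamp here presumably keeps that
 * speculative I/O to a single file system block until sequential
 * access has actually been observed.  With bsize == 8192, for
 * instance, contig is reduced to at most 8192 when io_off == 8192.)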
4807 */ 4808 if (io_off == (u_offset_t)bsize) 4809 contig = MIN(contig, bsize); 4810 4811 if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off, 4812 &io_len, io_off, contig, 1)) == NULL) 4813 return (0); 4814 4815 /* 4816 * Zero part of page which we are not going to read from disk 4817 */ 4818 if ((xlen = (io_len & PAGEOFFSET)) > 0) 4819 pagezero(pp->p_prev, xlen, PAGESIZE - xlen); 4820 4821 ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK; 4822 4823 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC); 4824 bp->b_edev = ip->i_dev; 4825 bp->b_dev = cmpdev(ip->i_dev); 4826 bp->b_blkno = bn; 4827 bp->b_un.b_addr = (caddr_t)0; 4828 bp->b_file = ip->i_vnode; 4829 bp->b_offset = off; 4830 4831 if (ufsvfsp->vfs_log) { 4832 lufs_read_strategy(ufsvfsp->vfs_log, bp); 4833 } else if (ufsvfsp->vfs_snapshot) { 4834 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 4835 } else { 4836 ufsvfsp->vfs_iotstamp = lbolt; 4837 ub.ub_getras.value.ul++; 4838 (void) bdev_strategy(bp); 4839 lwp_stat_update(LWP_STAT_INBLK, 1); 4840 } 4841 4842 return (io_len); 4843 } 4844 4845 int ufs_delay = 1; 4846 /* 4847 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC} 4848 * 4849 * LMXXX - the inode really ought to contain a pointer to one of these 4850 * async args. Stuff gunk in there and just hand the whole mess off. 4851 * This would replace i_delaylen, i_delayoff. 4852 */ 4853 /*ARGSUSED*/ 4854 static int 4855 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 4856 struct cred *cr) 4857 { 4858 struct inode *ip = VTOI(vp); 4859 int err = 0; 4860 4861 if (vp->v_count == 0) { 4862 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0")); 4863 } 4864 4865 TRACE_1(TR_FAC_UFS, TR_UFS_PUTPAGE_START, 4866 "ufs_putpage_start:vp %p", vp); 4867 4868 /* 4869 * XXX - Why should this check be made here? 4870 */ 4871 if (vp->v_flag & VNOMAP) { 4872 err = ENOSYS; 4873 goto errout; 4874 } 4875 4876 if (ip->i_ufsvfs == NULL) { 4877 err = EIO; 4878 goto errout; 4879 } 4880 4881 if (flags & B_ASYNC) { 4882 if (ufs_delay && len && 4883 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 4884 mutex_enter(&ip->i_tlock); 4885 /* 4886 * If nobody stalled, start a new cluster. 4887 */ 4888 if (ip->i_delaylen == 0) { 4889 ip->i_delayoff = off; 4890 ip->i_delaylen = len; 4891 mutex_exit(&ip->i_tlock); 4892 goto errout; 4893 } 4894 /* 4895 * If we have a full cluster or they are not contig, 4896 * then push last cluster and start over. 4897 */ 4898 if (ip->i_delaylen >= CLUSTSZ(ip) || 4899 ip->i_delayoff + ip->i_delaylen != off) { 4900 u_offset_t doff; 4901 size_t dlen; 4902 4903 doff = ip->i_delayoff; 4904 dlen = ip->i_delaylen; 4905 ip->i_delayoff = off; 4906 ip->i_delaylen = len; 4907 mutex_exit(&ip->i_tlock); 4908 err = ufs_putpages(vp, doff, dlen, 4909 flags, cr); 4910 /* LMXXX - flags are new val, not old */ 4911 goto errout; 4912 } 4913 /* 4914 * There is something there, it's not full, and 4915 * it is contig. 4916 */ 4917 ip->i_delaylen += len; 4918 mutex_exit(&ip->i_tlock); 4919 goto errout; 4920 } 4921 /* 4922 * Must have weird flags or we are not clustering. 4923 */ 4924 } 4925 4926 err = ufs_putpages(vp, off, len, flags, cr); 4927 4928 errout: 4929 TRACE_2(TR_FAC_UFS, TR_UFS_PUTPAGE_END, 4930 "ufs_putpage_end:vp %p error %d", vp, err); 4931 return (err); 4932 } 4933 4934 /* 4935 * If len == 0, do from off to EOF. 
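 * (For example, a call of the form ufs_putpages(vp, 0, 0, B_ASYNC, cr)
 * asynchronously pushes every dirty page cached for the vnode.)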
4936 * 4937 * The normal cases should be len == 0 & off == 0 (entire vp list), 4938 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4939 * (from pageout). 4940 */ 4941 /*ARGSUSED*/ 4942 static int 4943 ufs_putpages( 4944 struct vnode *vp, 4945 offset_t off, 4946 size_t len, 4947 int flags, 4948 struct cred *cr) 4949 { 4950 u_offset_t io_off; 4951 u_offset_t eoff; 4952 struct inode *ip = VTOI(vp); 4953 page_t *pp; 4954 size_t io_len; 4955 int err = 0; 4956 int dolock; 4957 4958 if (vp->v_count == 0) 4959 return (ufs_fault(vp, "ufs_putpages: v_count == 0")); 4960 /* 4961 * Acquire the readers/write inode lock before locking 4962 * any pages in this inode. 4963 * The inode lock is held during i/o. 4964 */ 4965 if (len == 0) { 4966 mutex_enter(&ip->i_tlock); 4967 ip->i_delayoff = ip->i_delaylen = 0; 4968 mutex_exit(&ip->i_tlock); 4969 } 4970 dolock = (rw_owner(&ip->i_contents) != curthread); 4971 if (dolock) { 4972 /* 4973 * Must synchronize this thread and any possible thread 4974 * operating in the window of vulnerability in wrip(). 4975 * It is dangerous to allow both a thread doing a putpage 4976 * and a thread writing, so serialize them. The exception 4977 * is when the thread in wrip() does something which causes 4978 * a putpage operation. Then, the thread must be allowed 4979 * to continue. It may encounter a bmap_read problem in 4980 * ufs_putapage, but that is handled in ufs_putapage. 4981 * Allow async writers to proceed, we don't want to block 4982 * the pageout daemon. 4983 */ 4984 if (ip->i_writer == curthread) 4985 rw_enter(&ip->i_contents, RW_READER); 4986 else { 4987 for (;;) { 4988 rw_enter(&ip->i_contents, RW_READER); 4989 mutex_enter(&ip->i_tlock); 4990 /* 4991 * If there is no thread in the critical 4992 * section of wrip(), then proceed. 4993 * Otherwise, wait until there isn't one. 4994 */ 4995 if (ip->i_writer == NULL) { 4996 mutex_exit(&ip->i_tlock); 4997 break; 4998 } 4999 rw_exit(&ip->i_contents); 5000 /* 5001 * Bounce async writers when we have a writer 5002 * working on this file so we don't deadlock 5003 * the pageout daemon. 5004 */ 5005 if (flags & B_ASYNC) { 5006 mutex_exit(&ip->i_tlock); 5007 return (0); 5008 } 5009 cv_wait(&ip->i_wrcv, &ip->i_tlock); 5010 mutex_exit(&ip->i_tlock); 5011 } 5012 } 5013 } 5014 5015 if (!vn_has_cached_data(vp)) { 5016 if (dolock) 5017 rw_exit(&ip->i_contents); 5018 return (0); 5019 } 5020 5021 if (len == 0) { 5022 /* 5023 * Search the entire vp list for pages >= off. 5024 */ 5025 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage, 5026 flags, cr); 5027 } else { 5028 /* 5029 * Loop over all offsets in the range looking for 5030 * pages to deal with. 5031 */ 5032 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0) 5033 eoff = MIN(off + len, eoff); 5034 else 5035 eoff = off + len; 5036 5037 for (io_off = off; io_off < eoff; io_off += io_len) { 5038 /* 5039 * If we are not invalidating, synchronously 5040 * freeing or writing pages, use the routine 5041 * page_lookup_nowait() to prevent reclaiming 5042 * them from the free list. 5043 */ 5044 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 5045 pp = page_lookup(vp, io_off, 5046 (flags & (B_INVAL | B_FREE)) ? 5047 SE_EXCL : SE_SHARED); 5048 } else { 5049 pp = page_lookup_nowait(vp, io_off, 5050 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 5051 } 5052 5053 if (pp == NULL || pvn_getdirty(pp, flags) == 0) 5054 io_len = PAGESIZE; 5055 else { 5056 u_offset_t *io_offp = &io_off; 5057 5058 err = ufs_putapage(vp, pp, io_offp, &io_len, 5059 flags, cr); 5060 if (err != 0) 5061 break; 5062 /* 5063 * "io_off" and "io_len" are returned as 5064 * the range of pages we actually wrote. 5065 * This allows us to skip ahead more quickly 5066 * since several pages may've been dealt 5067 * with by this iteration of the loop. 5068 */ 5069 } 5070 } 5071 } 5072 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) { 5073 /* 5074 * We have just sync'ed back all the pages on 5075 * the inode, turn off the IMODTIME flag. 5076 */ 5077 mutex_enter(&ip->i_tlock); 5078 ip->i_flag &= ~IMODTIME; 5079 mutex_exit(&ip->i_tlock); 5080 } 5081 if (dolock) 5082 rw_exit(&ip->i_contents); 5083 return (err); 5084 } 5085 5086 static void 5087 ufs_iodone(buf_t *bp) 5088 { 5089 struct inode *ip; 5090 5091 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 5092 5093 bp->b_iodone = NULL; 5094 5095 ip = VTOI(bp->b_pages->p_vnode); 5096 5097 mutex_enter(&ip->i_tlock); 5098 if (ip->i_writes >= ufs_LW) { 5099 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW) 5100 if (ufs_WRITES) 5101 cv_broadcast(&ip->i_wrcv); /* wake all up */ 5102 } else { 5103 ip->i_writes -= bp->b_bcount; 5104 } 5105 5106 mutex_exit(&ip->i_tlock); 5107 iodone(bp); 5108 } 5109 5110 /* 5111 * Write out a single page, possibly klustering adjacent 5112 * dirty pages. The inode lock must be held. 5113 * 5114 * LMXXX - bsize < pagesize not done. 5115 */ 5116 /*ARGSUSED*/ 5117 int 5118 ufs_putapage( 5119 struct vnode *vp, 5120 page_t *pp, 5121 u_offset_t *offp, 5122 size_t *lenp, /* return values */ 5123 int flags, 5124 struct cred *cr) 5125 { 5126 u_offset_t io_off; 5127 u_offset_t off; 5128 struct inode *ip = VTOI(vp); 5129 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 5130 struct fs *fs; 5131 struct buf *bp; 5132 size_t io_len; 5133 daddr_t bn; 5134 int err; 5135 int contig; 5136 5137 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 5138 5139 TRACE_1(TR_FAC_UFS, TR_UFS_PUTAPAGE_START, 5140 "ufs_putapage_start:vp %p", vp); 5141 5142 if (ufsvfsp == NULL) { 5143 err = EIO; 5144 goto out_trace; 5145 } 5146 5147 fs = ip->i_fs; 5148 ASSERT(fs->fs_ronly == 0); 5149 5150 /* 5151 * If the modified time on the inode has not already been 5152 * set elsewhere (e.g. for write/setattr) we set the time now. 5153 * This gives us approximate modified times for mmap'ed files 5154 * which are modified via stores in the user address space. 5155 */ 5156 if ((ip->i_flag & IMODTIME) == 0) { 5157 mutex_enter(&ip->i_tlock); 5158 ip->i_flag |= IUPD; 5159 ip->i_seq++; 5160 ITIMES_NOLOCK(ip); 5161 mutex_exit(&ip->i_tlock); 5162 } 5163 5164 /* 5165 * Align the request to a block boundry (for old file systems), 5166 * and go ask bmap() how contiguous things are for this file. 5167 */ 5168 off = pp->p_offset & (offset_t)fs->fs_bmask; /* block align it */ 5169 contig = 0; 5170 err = bmap_read(ip, off, &bn, &contig); 5171 if (err) 5172 goto out; 5173 if (bn == UFS_HOLE) { /* putpage never allocates */ 5174 /* 5175 * logging device is in error mode; simply return EIO 5176 */ 5177 if (TRANS_ISERROR(ufsvfsp)) { 5178 err = EIO; 5179 goto out; 5180 } 5181 /* 5182 * Oops, the thread in the window in wrip() did some 5183 * sort of operation which caused a putpage in the bad 5184 * range. 
In this case, just return an error which will 5185 * cause the software modified bit on the page to set 5186 * and the page will get written out again later. 5187 */ 5188 if (ip->i_writer == curthread) { 5189 err = EIO; 5190 goto out; 5191 } 5192 /* 5193 * If the pager is trying to push a page in the bad range 5194 * just tell him to try again later when things are better. 5195 */ 5196 if (flags & B_ASYNC) { 5197 err = EAGAIN; 5198 goto out; 5199 } 5200 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE"); 5201 goto out; 5202 } 5203 5204 /* 5205 * Take the length (of contiguous bytes) passed back from bmap() 5206 * and _try_ and get a set of pages covering that extent. 5207 */ 5208 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags); 5209 5210 /* 5211 * May have run out of memory and not clustered backwards. 5212 * off p_offset 5213 * [ pp - 1 ][ pp ] 5214 * [ block ] 5215 * We told bmap off, so we have to adjust the bn accordingly. 5216 */ 5217 if (io_off > off) { 5218 bn += btod(io_off - off); 5219 contig -= (io_off - off); 5220 } 5221 5222 /* 5223 * bmap was carefull to tell us the right size so use that. 5224 * There might be unallocated frags at the end. 5225 * LMXXX - bzero the end of the page? We must be writing after EOF. 5226 */ 5227 if (io_len > contig) { 5228 ASSERT(io_len - contig < fs->fs_bsize); 5229 io_len -= (io_len - contig); 5230 } 5231 5232 /* 5233 * Handle the case where we are writing the last page after EOF. 5234 * 5235 * XXX - just a patch for i-mt3. 5236 */ 5237 if (io_len == 0) { 5238 ASSERT(pp->p_offset >= (u_offset_t)(roundup(ip->i_size, 5239 PAGESIZE))); 5240 io_len = PAGESIZE; 5241 } 5242 5243 bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags); 5244 5245 ULOCKFS_SET_MOD(ITOUL(ip)); 5246 5247 bp->b_edev = ip->i_dev; 5248 bp->b_dev = cmpdev(ip->i_dev); 5249 bp->b_blkno = bn; 5250 bp->b_un.b_addr = (caddr_t)0; 5251 bp->b_file = ip->i_vnode; 5252 5253 if (TRANS_ISTRANS(ufsvfsp)) { 5254 if ((ip->i_mode & IFMT) == IFSHAD) { 5255 TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD); 5256 } else if (ufsvfsp->vfs_qinod == ip) { 5257 TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR, 5258 0, 0); 5259 } 5260 } 5261 5262 /* write throttle */ 5263 5264 ASSERT(bp->b_iodone == NULL); 5265 bp->b_iodone = (int (*)())ufs_iodone; 5266 mutex_enter(&ip->i_tlock); 5267 ip->i_writes += bp->b_bcount; 5268 mutex_exit(&ip->i_tlock); 5269 5270 if (bp->b_flags & B_ASYNC) { 5271 if (ufsvfsp->vfs_log) { 5272 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5273 } else if (ufsvfsp->vfs_snapshot) { 5274 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5275 } else { 5276 ufsvfsp->vfs_iotstamp = lbolt; 5277 ub.ub_putasyncs.value.ul++; 5278 (void) bdev_strategy(bp); 5279 lwp_stat_update(LWP_STAT_OUBLK, 1); 5280 } 5281 } else { 5282 if (ufsvfsp->vfs_log) { 5283 lufs_write_strategy(ufsvfsp->vfs_log, bp); 5284 } else if (ufsvfsp->vfs_snapshot) { 5285 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 5286 } else { 5287 ufsvfsp->vfs_iotstamp = lbolt; 5288 ub.ub_putsyncs.value.ul++; 5289 (void) bdev_strategy(bp); 5290 lwp_stat_update(LWP_STAT_OUBLK, 1); 5291 } 5292 err = biowait(bp); 5293 pageio_done(bp); 5294 pvn_write_done(pp, ((err) ? 
B_ERROR : 0) | B_WRITE | flags); 5295 } 5296 5297 pp = NULL; 5298 5299 out: 5300 if (err != 0 && pp != NULL) 5301 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 5302 5303 if (offp) 5304 *offp = io_off; 5305 if (lenp) 5306 *lenp = io_len; 5307 out_trace: 5308 TRACE_2(TR_FAC_UFS, TR_UFS_PUTAPAGE_END, 5309 "ufs_putapage_end:vp %p error %d", vp, err); 5310 return (err); 5311 } 5312 5313 /* ARGSUSED */ 5314 static int 5315 ufs_map(struct vnode *vp, 5316 offset_t off, 5317 struct as *as, 5318 caddr_t *addrp, 5319 size_t len, 5320 uchar_t prot, 5321 uchar_t maxprot, 5322 uint_t flags, 5323 struct cred *cr) 5324 { 5325 struct segvn_crargs vn_a; 5326 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5327 struct ulockfs *ulp; 5328 int error; 5329 5330 TRACE_1(TR_FAC_UFS, TR_UFS_MAP_START, 5331 "ufs_map_start:vp %p", vp); 5332 5333 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK); 5334 if (error) 5335 goto out; 5336 5337 if (vp->v_flag & VNOMAP) { 5338 error = ENOSYS; 5339 goto unlock; 5340 } 5341 5342 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) { 5343 error = ENXIO; 5344 goto unlock; 5345 } 5346 5347 if (vp->v_type != VREG) { 5348 error = ENODEV; 5349 goto unlock; 5350 } 5351 5352 /* 5353 * If file is being locked, disallow mapping. 5354 */ 5355 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) { 5356 error = EAGAIN; 5357 goto unlock; 5358 } 5359 5360 as_rangelock(as); 5361 if ((flags & MAP_FIXED) == 0) { 5362 map_addr(addrp, len, off, 1, flags); 5363 if (*addrp == NULL) { 5364 as_rangeunlock(as); 5365 error = ENOMEM; 5366 goto unlock; 5367 } 5368 } else { 5369 /* 5370 * User specified address - blow away any previous mappings 5371 */ 5372 (void) as_unmap(as, *addrp, len); 5373 } 5374 5375 vn_a.vp = vp; 5376 vn_a.offset = (u_offset_t)off; 5377 vn_a.type = flags & MAP_TYPE; 5378 vn_a.prot = prot; 5379 vn_a.maxprot = maxprot; 5380 vn_a.cred = cr; 5381 vn_a.amp = NULL; 5382 vn_a.flags = flags & ~MAP_TYPE; 5383 vn_a.szc = 0; 5384 vn_a.lgrp_mem_policy_flags = 0; 5385 5386 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5387 as_rangeunlock(as); 5388 5389 unlock: 5390 if (ulp) { 5391 ufs_lockfs_end(ulp); 5392 } 5393 out: 5394 TRACE_2(TR_FAC_UFS, TR_UFS_MAP_END, 5395 "ufs_map_end:vp %p error %d", vp, error); 5396 return (error); 5397 } 5398 5399 /* ARGSUSED */ 5400 static int 5401 ufs_addmap(struct vnode *vp, 5402 offset_t off, 5403 struct as *as, 5404 caddr_t addr, 5405 size_t len, 5406 uchar_t prot, 5407 uchar_t maxprot, 5408 uint_t flags, 5409 struct cred *cr) 5410 { 5411 struct inode *ip = VTOI(vp); 5412 5413 if (vp->v_flag & VNOMAP) { 5414 return (ENOSYS); 5415 } 5416 5417 mutex_enter(&ip->i_tlock); 5418 ip->i_mapcnt += btopr(len); 5419 mutex_exit(&ip->i_tlock); 5420 return (0); 5421 } 5422 5423 /*ARGSUSED*/ 5424 static int 5425 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 5426 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 5427 struct cred *cr) 5428 { 5429 struct inode *ip = VTOI(vp); 5430 5431 if (vp->v_flag & VNOMAP) { 5432 return (ENOSYS); 5433 } 5434 5435 mutex_enter(&ip->i_tlock); 5436 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 5437 ASSERT(ip->i_mapcnt >= 0); 5438 mutex_exit(&ip->i_tlock); 5439 return (0); 5440 } 5441 /* 5442 * Return the answer requested to poll() for non-device files 5443 */ 5444 struct pollhead ufs_pollhd; 5445 5446 /* ARGSUSED */ 5447 int 5448 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp) 5449 { 5450 struct ufsvfs *ufsvfsp; 5451 5452 *revp = 0; 5453 ufsvfsp = 
VTOI(vp)->i_ufsvfs; 5454 5455 if (!ufsvfsp) { 5456 *revp = POLLHUP; 5457 goto out; 5458 } 5459 5460 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) || 5461 ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) { 5462 *revp |= POLLERR; 5463 5464 } else { 5465 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly && 5466 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5467 *revp |= POLLOUT; 5468 5469 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly && 5470 !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs)) 5471 *revp |= POLLWRBAND; 5472 5473 if (ev & POLLIN) 5474 *revp |= POLLIN; 5475 5476 if (ev & POLLRDNORM) 5477 *revp |= POLLRDNORM; 5478 5479 if (ev & POLLRDBAND) 5480 *revp |= POLLRDBAND; 5481 } 5482 5483 if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP))) 5484 *revp |= POLLPRI; 5485 out: 5486 *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL; 5487 5488 return (0); 5489 } 5490 5491 /* ARGSUSED */ 5492 static int 5493 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr) 5494 { 5495 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 5496 struct ulockfs *ulp = NULL; 5497 struct inode *sip = NULL; 5498 int error; 5499 struct inode *ip = VTOI(vp); 5500 int issync; 5501 5502 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK); 5503 if (error) 5504 return (error); 5505 5506 switch (cmd) { 5507 /* 5508 * Have to handle _PC_NAME_MAX here, because the normal way 5509 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()] 5510 * results in a lock ordering reversal between 5511 * ufs_lockfs_{begin,end}() and 5512 * ufs_thread_{suspend,continue}(). 5513 * 5514 * Keep in sync with ufs_statvfs(). 5515 */ 5516 case _PC_NAME_MAX: 5517 *valp = MAXNAMLEN; 5518 break; 5519 5520 case _PC_FILESIZEBITS: 5521 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 5522 *valp = UFS_FILESIZE_BITS; 5523 else 5524 *valp = 32; 5525 break; 5526 5527 case _PC_XATTR_EXISTS: 5528 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5529 5530 error = ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, 5531 cr); 5532 if (error == 0 && sip != NULL) { 5533 /* Start transaction */ 5534 if (ulp) { 5535 TRANS_BEGIN_CSYNC(ufsvfsp, issync, 5536 TOP_RMDIR, TOP_RMDIR_SIZE); 5537 } 5538 /* 5539 * Is directory empty 5540 */ 5541 rw_enter(&sip->i_rwlock, RW_WRITER); 5542 rw_enter(&sip->i_contents, RW_WRITER); 5543 if (ufs_xattrdirempty(sip, 5544 sip->i_number, CRED())) { 5545 rw_enter(&ip->i_contents, RW_WRITER); 5546 ufs_unhook_shadow(ip, sip); 5547 rw_exit(&ip->i_contents); 5548 5549 *valp = 0; 5550 5551 } else 5552 *valp = 1; 5553 rw_exit(&sip->i_contents); 5554 rw_exit(&sip->i_rwlock); 5555 if (ulp) { 5556 TRANS_END_CSYNC(ufsvfsp, error, issync, 5557 TOP_RMDIR, TOP_RMDIR_SIZE); 5558 } 5559 VN_RELE(ITOV(sip)); 5560 } else if (error == ENOENT) { 5561 *valp = 0; 5562 error = 0; 5563 } 5564 } else { 5565 error = fs_pathconf(vp, cmd, valp, cr); 5566 } 5567 break; 5568 5569 case _PC_ACL_ENABLED: 5570 *valp = _ACL_ACLENT_ENABLED; 5571 break; 5572 5573 case _PC_MIN_HOLE_SIZE: 5574 *valp = (ulong_t)ip->i_fs->fs_bsize; 5575 break; 5576 5577 default: 5578 error = fs_pathconf(vp, cmd, valp, cr); 5579 } 5580 5581 if (ulp != NULL) { 5582 ufs_lockfs_end(ulp); 5583 } 5584 return (error); 5585 } 5586 5587 int ufs_pageio_writes, ufs_pageio_reads; 5588 5589 /*ARGSUSED*/ 5590 static int 5591 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5592 int flags, struct cred *cr) 5593 { 5594 struct inode *ip = VTOI(vp); 5595 struct ufsvfs *ufsvfsp; 5596 page_t *npp = NULL, *opp = NULL, *cpp = pp; 5597 struct buf *bp; 5598 daddr_t bn; 5599 size_t done_len = 0, 
cur_len = 0; 5600 int err = 0; 5601 int contig = 0; 5602 int dolock; 5603 int vmpss = 0; 5604 struct ulockfs *ulp; 5605 5606 if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp && 5607 vp->v_mpssdata != NULL) { 5608 vmpss = 1; 5609 } 5610 5611 dolock = (rw_owner(&ip->i_contents) != curthread); 5612 /* 5613 * We need a better check. Ideally, we would use another 5614 * vnodeops so that hlocked and forcibly unmounted file 5615 * systems would return EIO where appropriate and w/o the 5616 * need for these checks. 5617 */ 5618 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 5619 return (EIO); 5620 5621 /* 5622 * For vmpss (pp can be NULL) case respect the quiesce protocol. 5623 * ul_lock must be taken before locking pages so we can't use it here 5624 * if pp is non NULL because segvn already locked pages 5625 * SE_EXCL. Instead we rely on the fact that a forced umount or 5626 * applying a filesystem lock via ufs_fiolfs() will block in the 5627 * implicit call to ufs_flush() until we unlock the pages after the 5628 * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend 5629 * above 0 until they are done. We have to be careful not to increment 5630 * ul_vnops_cnt here after forceful unmount hlocks the file system. 5631 * 5632 * If pp is NULL use ul_lock to make sure we don't increment 5633 * ul_vnops_cnt after forceful unmount hlocks the file system. 5634 */ 5635 if (vmpss || pp == NULL) { 5636 ulp = &ufsvfsp->vfs_ulockfs; 5637 if (pp == NULL) 5638 mutex_enter(&ulp->ul_lock); 5639 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) { 5640 if (pp == NULL) { 5641 mutex_exit(&ulp->ul_lock); 5642 } 5643 return (vmpss ? EIO : EINVAL); 5644 } 5645 atomic_add_long(&ulp->ul_vnops_cnt, 1); 5646 if (pp == NULL) 5647 mutex_exit(&ulp->ul_lock); 5648 if (ufs_quiesce_pend) { 5649 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5650 cv_broadcast(&ulp->ul_cv); 5651 return (vmpss ? EIO : EINVAL); 5652 } 5653 } 5654 5655 if (dolock) { 5656 /* 5657 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to 5658 * handle a fault against a segment that maps vnode pages with 5659 * large mappings. Segvn creates pages and holds them locked 5660 * SE_EXCL during VOP_PAGEIO() call. In this case we have to 5661 * use rw_tryenter() to avoid a potential deadlock since in 5662 * lock order i_contents needs to be taken first. 5663 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails. 5664 */ 5665 if (!vmpss) { 5666 rw_enter(&ip->i_contents, RW_READER); 5667 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) { 5668 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5669 cv_broadcast(&ulp->ul_cv); 5670 return (EDEADLK); 5671 } 5672 } 5673 5674 /* 5675 * Return an error to segvn because the pagefault request is beyond 5676 * PAGESIZE rounded EOF. 5677 */ 5678 if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) { 5679 if (dolock) 5680 rw_exit(&ip->i_contents); 5681 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5682 cv_broadcast(&ulp->ul_cv); 5683 return (EFAULT); 5684 } 5685 5686 if (pp == NULL) { 5687 if (bmap_has_holes(ip)) { 5688 err = ENOSYS; 5689 } else { 5690 err = EINVAL; 5691 } 5692 if (dolock) 5693 rw_exit(&ip->i_contents); 5694 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5695 cv_broadcast(&ulp->ul_cv); 5696 return (err); 5697 } 5698 5699 /* 5700 * Break the io request into chunks, one for each contiguous 5701 * stretch of disk blocks in the target file. 
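 * As an illustration (numbers invented for the example): a 96K request
 * against a file laid out as two contiguous 48K extents makes two
 * trips around the loop below; the first bmap_read() reports
 * contig == 48K, so cur_len = MIN(96K - 0, 48K) == 48K is issued, and
 * the second trip issues the remaining 48K at io_off + done_len.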
5702 */ 5703 while (done_len < io_len) { 5704 ASSERT(cpp); 5705 contig = 0; 5706 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len), 5707 &bn, &contig)) 5708 break; 5709 5710 if (bn == UFS_HOLE) { /* No holey swapfiles */ 5711 if (vmpss) { 5712 err = EFAULT; 5713 break; 5714 } 5715 err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE"); 5716 break; 5717 } 5718 5719 cur_len = MIN(io_len - done_len, contig); 5720 /* 5721 * Zero out a page beyond EOF, when the last block of 5722 * a file is a UFS fragment so that ufs_pageio() can be used 5723 * instead of ufs_getpage() to handle faults against 5724 * segvn segments that use large pages. 5725 */ 5726 page_list_break(&cpp, &npp, btopr(cur_len)); 5727 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) { 5728 size_t xlen = cur_len & PAGEOFFSET; 5729 pagezero(cpp->p_prev, xlen, PAGESIZE - xlen); 5730 } 5731 5732 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 5733 ASSERT(bp != NULL); 5734 5735 bp->b_edev = ip->i_dev; 5736 bp->b_dev = cmpdev(ip->i_dev); 5737 bp->b_blkno = bn; 5738 bp->b_un.b_addr = (caddr_t)0; 5739 bp->b_file = ip->i_vnode; 5740 5741 ufsvfsp->vfs_iotstamp = lbolt; 5742 ub.ub_pageios.value.ul++; 5743 if (ufsvfsp->vfs_snapshot) 5744 fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp); 5745 else 5746 (void) bdev_strategy(bp); 5747 5748 if (flags & B_READ) 5749 ufs_pageio_reads++; 5750 else 5751 ufs_pageio_writes++; 5752 if (flags & B_READ) 5753 lwp_stat_update(LWP_STAT_INBLK, 1); 5754 else 5755 lwp_stat_update(LWP_STAT_OUBLK, 1); 5756 /* 5757 * If the request is not B_ASYNC, wait for i/o to complete 5758 * and re-assemble the page list to return to the caller. 5759 * If it is B_ASYNC we leave the page list in pieces and 5760 * cleanup() will dispose of them. 5761 */ 5762 if ((flags & B_ASYNC) == 0) { 5763 err = biowait(bp); 5764 pageio_done(bp); 5765 if (err) 5766 break; 5767 page_list_concat(&opp, &cpp); 5768 } 5769 cpp = npp; 5770 npp = NULL; 5771 if (flags & B_READ) 5772 cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t); 5773 done_len += cur_len; 5774 } 5775 ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len)); 5776 if (err) { 5777 if (flags & B_ASYNC) { 5778 /* Cleanup unprocessed parts of list */ 5779 page_list_concat(&cpp, &npp); 5780 if (flags & B_READ) 5781 pvn_read_done(cpp, B_ERROR); 5782 else 5783 pvn_write_done(cpp, B_ERROR); 5784 } else { 5785 /* Re-assemble list and let caller clean up */ 5786 page_list_concat(&opp, &cpp); 5787 page_list_concat(&opp, &npp); 5788 } 5789 } 5790 5791 if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) && 5792 ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) { 5793 mutex_enter(&ip->i_tlock); 5794 ip->i_flag |= IACC; 5795 ITIMES_NOLOCK(ip); 5796 mutex_exit(&ip->i_tlock); 5797 } 5798 5799 if (dolock) 5800 rw_exit(&ip->i_contents); 5801 if (vmpss && !atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 5802 cv_broadcast(&ulp->ul_cv); 5803 return (err); 5804 } 5805 5806 /* 5807 * Called when the kernel is in a frozen state to dump data 5808 * directly to the device. It uses a private dump data structure, 5809 * set up by dump_ctl, to locate the correct disk block to which to dump. 
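 * As a worked example (numbers invented for the example): with
 * fs_bsize == 8192 and DEV_BSIZE == 512, disk_blks == 16; a request
 * starting at ldbn == 35 falls in file system block
 * lfsbn == lblkno(fs, ldbtob(35)) == 2, and the first target device
 * block is fsbtodb(fs, dump_info->dblk[2]) + (35 % 16), i.e. 3
 * sectors into that block.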
5810 */ 5811 static int 5812 ufs_dump(vnode_t *vp, caddr_t addr, int ldbn, int dblks) 5813 { 5814 u_offset_t file_size; 5815 struct inode *ip = VTOI(vp); 5816 struct fs *fs = ip->i_fs; 5817 daddr_t dbn, lfsbn; 5818 int disk_blks = fs->fs_bsize >> DEV_BSHIFT; 5819 int error = 0; 5820 int ndbs, nfsbs; 5821 5822 /* 5823 * forced unmount case 5824 */ 5825 if (ip->i_ufsvfs == NULL) 5826 return (EIO); 5827 /* 5828 * Validate the inode that it has not been modified since 5829 * the dump structure is allocated. 5830 */ 5831 mutex_enter(&ip->i_tlock); 5832 if ((dump_info == NULL) || 5833 (dump_info->ip != ip) || 5834 (dump_info->time.tv_sec != ip->i_mtime.tv_sec) || 5835 (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) { 5836 mutex_exit(&ip->i_tlock); 5837 return (-1); 5838 } 5839 mutex_exit(&ip->i_tlock); 5840 5841 /* 5842 * See that the file has room for this write 5843 */ 5844 UFS_GET_ISIZE(&file_size, ip); 5845 5846 if (ldbtob((offset_t)(ldbn + dblks)) > file_size) 5847 return (ENOSPC); 5848 5849 /* 5850 * Find the physical disk block numbers from the dump 5851 * private data structure directly and write out the data 5852 * in contiguous block lumps 5853 */ 5854 while (dblks > 0 && !error) { 5855 lfsbn = (daddr_t)lblkno(fs, ldbtob((offset_t)ldbn)); 5856 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks; 5857 nfsbs = 1; 5858 ndbs = disk_blks - ldbn % disk_blks; 5859 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn + 5860 nfsbs]) == dbn + ndbs) { 5861 nfsbs++; 5862 ndbs += disk_blks; 5863 } 5864 if (ndbs > dblks) 5865 ndbs = dblks; 5866 error = bdev_dump(ip->i_dev, addr, dbn, ndbs); 5867 addr += ldbtob((offset_t)ndbs); 5868 dblks -= ndbs; 5869 ldbn += ndbs; 5870 } 5871 return (error); 5872 5873 } 5874 5875 /* 5876 * Prepare the file system before and after the dump operation. 5877 * 5878 * action = DUMP_ALLOC: 5879 * Preparation before dump, allocate dump private data structure 5880 * to hold all the direct and indirect block info for dump. 5881 * 5882 * action = DUMP_FREE: 5883 * Clean up after dump, deallocate the dump private data structure. 
5884 * 5885 * action = DUMP_SCAN: 5886 * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space; 5887 * if found, the starting file-relative DEV_BSIZE lbn is written 5888 * to *bklp; that lbn is intended for use with VOP_DUMP() 5889 */ 5890 static int 5891 ufs_dumpctl(vnode_t *vp, int action, int *blkp) 5892 { 5893 struct inode *ip = VTOI(vp); 5894 ufsvfs_t *ufsvfsp = ip->i_ufsvfs; 5895 struct fs *fs; 5896 daddr32_t *dblk, *storeblk; 5897 daddr32_t *nextblk, *endblk; 5898 struct buf *bp; 5899 int i, entry, entries; 5900 int n, ncontig; 5901 5902 /* 5903 * check for forced unmount 5904 */ 5905 if (ufsvfsp == NULL) 5906 return (EIO); 5907 5908 if (action == DUMP_ALLOC) { 5909 /* 5910 * alloc and record dump_info 5911 */ 5912 if (dump_info != NULL) 5913 return (EINVAL); 5914 5915 ASSERT(vp->v_type == VREG); 5916 fs = ufsvfsp->vfs_fs; 5917 5918 rw_enter(&ip->i_contents, RW_READER); 5919 5920 if (bmap_has_holes(ip)) { 5921 rw_exit(&ip->i_contents); 5922 return (EFAULT); 5923 } 5924 5925 /* 5926 * calculate and allocate space needed according to i_size 5927 */ 5928 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size)); 5929 if ((dump_info = (struct dump *) 5930 kmem_alloc(sizeof (struct dump) + 5931 (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) { 5932 rw_exit(&ip->i_contents); 5933 return (ENOMEM); 5934 } 5935 5936 /* Start saving the info */ 5937 dump_info->fsbs = entries; 5938 dump_info->ip = ip; 5939 storeblk = &dump_info->dblk[0]; 5940 5941 /* Direct Blocks */ 5942 for (entry = 0; entry < NDADDR && entry < entries; entry++) 5943 *storeblk++ = ip->i_db[entry]; 5944 5945 /* Indirect Blocks */ 5946 for (i = 0; i < NIADDR; i++) { 5947 int error = 0; 5948 5949 bp = UFS_BREAD(ufsvfsp, 5950 ip->i_dev, fsbtodb(fs, ip->i_ib[i]), 5951 fs->fs_bsize); 5952 if (bp->b_flags & B_ERROR) 5953 error = EIO; 5954 else { 5955 dblk = bp->b_un.b_daddr; 5956 if ((storeblk = save_dblks(ip, ufsvfsp, 5957 storeblk, dblk, i, entries)) == NULL) 5958 error = EIO; 5959 } 5960 5961 brelse(bp); 5962 5963 if (error != 0) { 5964 kmem_free(dump_info, sizeof (struct dump) + 5965 (entries - 1) * sizeof (daddr32_t)); 5966 rw_exit(&ip->i_contents); 5967 dump_info = NULL; 5968 return (error); 5969 } 5970 } 5971 /* and time stamp the information */ 5972 mutex_enter(&ip->i_tlock); 5973 dump_info->time = ip->i_mtime; 5974 mutex_exit(&ip->i_tlock); 5975 5976 rw_exit(&ip->i_contents); 5977 } else if (action == DUMP_FREE) { 5978 /* 5979 * free dump_info 5980 */ 5981 if (dump_info == NULL) 5982 return (EINVAL); 5983 entries = dump_info->fsbs - 1; 5984 kmem_free(dump_info, sizeof (struct dump) + 5985 entries * sizeof (daddr32_t)); 5986 dump_info = NULL; 5987 } else if (action == DUMP_SCAN) { 5988 /* 5989 * scan dump_info 5990 */ 5991 if (dump_info == NULL) 5992 return (EINVAL); 5993 5994 dblk = dump_info->dblk; 5995 nextblk = dblk + 1; 5996 endblk = dblk + dump_info->fsbs - 1; 5997 fs = ufsvfsp->vfs_fs; 5998 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT); 5999 6000 /* 6001 * scan dblk[] entries; contig fs space is found when: 6002 * ((current blkno + frags per block) == next blkno) 6003 */ 6004 n = 0; 6005 while (n < ncontig && dblk < endblk) { 6006 if ((*dblk + fs->fs_frag) == *nextblk) 6007 n++; 6008 else 6009 n = 0; 6010 dblk++; 6011 nextblk++; 6012 } 6013 6014 /* 6015 * index is where size bytes of contig space begins; 6016 * conversion from index to the file's DEV_BSIZE lbn 6017 * is equivalent to: (index * fs_bsize) / DEV_BSIZE 6018 */ 6019 if (n == ncontig) { 6020 i = (dblk - dump_info->dblk) - ncontig; 6021 *blkp 
= i << (fs->fs_bshift - DEV_BSHIFT); 6022 } else 6023 return (EFAULT); 6024 } 6025 return (0); 6026 } 6027 6028 /* 6029 * Recursive helper function for ufs_dumpctl(). It follows the indirect file 6030 * system blocks until it reaches the the disk block addresses, which are 6031 * then stored into the given buffer, storeblk. 6032 */ 6033 static daddr32_t * 6034 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp, daddr32_t *storeblk, 6035 daddr32_t *dblk, int level, int entries) 6036 { 6037 struct fs *fs = ufsvfsp->vfs_fs; 6038 struct buf *bp; 6039 int i; 6040 6041 if (level == 0) { 6042 for (i = 0; i < NINDIR(fs); i++) { 6043 if (storeblk - dump_info->dblk >= entries) 6044 break; 6045 *storeblk++ = dblk[i]; 6046 } 6047 return (storeblk); 6048 } 6049 for (i = 0; i < NINDIR(fs); i++) { 6050 if (storeblk - dump_info->dblk >= entries) 6051 break; 6052 bp = UFS_BREAD(ufsvfsp, 6053 ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize); 6054 if (bp->b_flags & B_ERROR) { 6055 brelse(bp); 6056 return (NULL); 6057 } 6058 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr, 6059 level - 1, entries); 6060 brelse(bp); 6061 6062 if (storeblk == NULL) 6063 return (NULL); 6064 } 6065 return (storeblk); 6066 } 6067 6068 /* ARGSUSED */ 6069 static int 6070 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, 6071 struct cred *cr) 6072 { 6073 struct inode *ip = VTOI(vp); 6074 struct ulockfs *ulp; 6075 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 6076 ulong_t vsa_mask = vsap->vsa_mask; 6077 int err = EINVAL; 6078 6079 TRACE_3(TR_FAC_UFS, TR_UFS_GETSECATTR_START, 6080 "ufs_getsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag); 6081 6082 vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT); 6083 6084 /* 6085 * Only grab locks if needed - they're not needed to check vsa_mask 6086 * or if the mask contains no acl flags. 6087 */ 6088 if (vsa_mask != 0) { 6089 if (err = ufs_lockfs_begin(ufsvfsp, &ulp, 6090 ULOCKFS_GETATTR_MASK)) 6091 return (err); 6092 6093 rw_enter(&ip->i_contents, RW_READER); 6094 err = ufs_acl_get(ip, vsap, flag, cr); 6095 rw_exit(&ip->i_contents); 6096 6097 if (ulp) 6098 ufs_lockfs_end(ulp); 6099 } 6100 TRACE_1(TR_FAC_UFS, TR_UFS_GETSECATTR_END, 6101 "ufs_getsecattr_end:vp %p", vp); 6102 return (err); 6103 } 6104 6105 /* ARGSUSED */ 6106 static int 6107 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr) 6108 { 6109 struct inode *ip = VTOI(vp); 6110 struct ulockfs *ulp = NULL; 6111 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; 6112 ulong_t vsa_mask = vsap->vsa_mask; 6113 int err; 6114 int haverwlock = 1; 6115 int trans_size; 6116 int donetrans = 0; 6117 int retry = 1; 6118 6119 6120 TRACE_3(TR_FAC_UFS, TR_UFS_SETSECATTR_START, 6121 "ufs_setsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag); 6122 6123 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 6124 6125 /* Abort now if the request is either empty or invalid. */ 6126 vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT); 6127 if ((vsa_mask == 0) || 6128 ((vsap->vsa_aclentp == NULL) && 6129 (vsap->vsa_dfaclentp == NULL))) { 6130 err = EINVAL; 6131 goto out; 6132 } 6133 6134 /* 6135 * Following convention, if this is a directory then we acquire the 6136 * inode's i_rwlock after starting a UFS logging transaction; 6137 * otherwise, we acquire it beforehand. Since we were called (and 6138 * must therefore return) with the lock held, we will have to drop it, 6139 * and later reacquire it, if operating on a directory. 
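 * For a directory the resulting sequence below is therefore, in
 * outline (error paths omitted):
 *
 *	rw_exit(&ip->i_rwlock);			-- drop the caller's lock
 *	ufs_lockfs_begin(...);
 *	TRANS_BEGIN_ASYNC(...);			-- start the transaction
 *	rw_enter(&ip->i_rwlock, RW_WRITER);
 *	rw_enter(&ip->i_contents, RW_WRITER);
 *	ufs_acl_set(...);
 *	rw_exit(&ip->i_contents);
 *	rw_exit(&ip->i_rwlock);			-- before ending the transaction
 *	TRANS_END_ASYNC(...);
 *	ufs_lockfs_end(...);
 *	rw_enter(&ip->i_rwlock, RW_READER);	-- hand the lock back to the caller
 *
 * which keeps i_rwlock acquisition inside the transaction, matching
 * the convention noted above.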
6140 */ 6141 if (vp->v_type == VDIR) { 6142 rw_exit(&ip->i_rwlock); 6143 haverwlock = 0; 6144 } else { 6145 /* Upgrade the lock if required. */ 6146 if (!rw_write_held(&ip->i_rwlock)) { 6147 rw_exit(&ip->i_rwlock); 6148 rw_enter(&ip->i_rwlock, RW_WRITER); 6149 } 6150 } 6151 6152 again: 6153 ASSERT(!(vp->v_type == VDIR && haverwlock)); 6154 if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) { 6155 ulp = NULL; 6156 retry = 0; 6157 goto out; 6158 } 6159 6160 /* 6161 * Check that the file system supports this operation. Note that 6162 * ufs_lockfs_begin() will have checked that the file system had 6163 * not been forcibly unmounted. 6164 */ 6165 if (ufsvfsp->vfs_fs->fs_ronly) { 6166 err = EROFS; 6167 goto out; 6168 } 6169 if (ufsvfsp->vfs_nosetsec) { 6170 err = ENOSYS; 6171 goto out; 6172 } 6173 6174 if (ulp) { 6175 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR, 6176 trans_size = TOP_SETSECATTR_SIZE(VTOI(vp))); 6177 donetrans = 1; 6178 } 6179 6180 if (vp->v_type == VDIR) { 6181 rw_enter(&ip->i_rwlock, RW_WRITER); 6182 haverwlock = 1; 6183 } 6184 6185 ASSERT(haverwlock); 6186 6187 /* Do the actual work. */ 6188 rw_enter(&ip->i_contents, RW_WRITER); 6189 /* 6190 * Suppress out of inodes messages if we will retry. 6191 */ 6192 if (retry) 6193 ip->i_flag |= IQUIET; 6194 err = ufs_acl_set(ip, vsap, flag, cr); 6195 ip->i_flag &= ~IQUIET; 6196 rw_exit(&ip->i_contents); 6197 6198 out: 6199 if (ulp) { 6200 if (donetrans) { 6201 /* 6202 * top_end_async() can eventually call 6203 * top_end_sync(), which can block. We must 6204 * therefore observe the lock-ordering protocol 6205 * here as well. 6206 */ 6207 if (vp->v_type == VDIR) { 6208 rw_exit(&ip->i_rwlock); 6209 haverwlock = 0; 6210 } 6211 TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size); 6212 } 6213 ufs_lockfs_end(ulp); 6214 } 6215 /* 6216 * If no inodes available, try scaring a logically- 6217 * free one out of the delete queue to someplace 6218 * that we can find it. 6219 */ 6220 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 6221 ufs_delete_drain_wait(ufsvfsp, 1); 6222 retry = 0; 6223 if (vp->v_type == VDIR && haverwlock) { 6224 rw_exit(&ip->i_rwlock); 6225 haverwlock = 0; 6226 } 6227 goto again; 6228 } 6229 /* 6230 * If we need to reacquire the lock then it is safe to do so 6231 * as a reader. This is because ufs_rwunlock(), which will be 6232 * called by our caller after we return, does not differentiate 6233 * between shared and exclusive locks. 6234 */ 6235 if (!haverwlock) { 6236 ASSERT(vp->v_type == VDIR); 6237 rw_enter(&ip->i_rwlock, RW_READER); 6238 } 6239 6240 TRACE_1(TR_FAC_UFS, TR_UFS_SETSECATTR_END, 6241 "ufs_setsecattr_end:vp %p", vp); 6242 return (err); 6243 } 6244
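/*
 * Illustrative appendix (not part of the original source): the single
 * iovec/uio construction used by ufs_rdwri() above is the standard way
 * to turn a flat "buffer + length + offset" request into the uio form
 * that wrip()/rdip() expect.  The helper below is a minimal sketch of
 * that pattern in isolation; it is guarded out of the build and its
 * name is hypothetical.
 */
#ifdef notdef
static void
example_setup_uio(uio_t *uiop, iovec_t *iovp, caddr_t base, ssize_t len,
	offset_t offset, enum uio_seg seg)
{
	bzero((caddr_t)uiop, sizeof (uio_t));
	bzero((caddr_t)iovp, sizeof (iovec_t));

	iovp->iov_base = base;		/* caller's buffer */
	iovp->iov_len = len;
	uiop->uio_iov = iovp;		/* exactly one segment */
	uiop->uio_iovcnt = 1;
	uiop->uio_loffset = offset;	/* starting file offset */
	uiop->uio_segflg = (short)seg;	/* kernel or user addresses */
	uiop->uio_resid = len;		/* bytes remaining to transfer */
}
#endif /* notdef */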