/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int	nfs_bio(struct buf *, cred_t *);
static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static void	nfs_delmap_callback(struct as *, void *, uint_t);

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
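/*
 * NFS_EOF is deliberately outside the errno range so internal callers
 * can distinguish an end-of-data condition from a genuine error before
 * mapping it back to a normal return value.
 */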
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int	nfs_open(vnode_t **, int, cred_t *);
static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_access(vnode_t *, int, int, cred_t *);
static int	nfs_accessx(void *, int, cred_t *);
static int	nfs_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs_fsync(vnode_t *, int, cred_t *);
static void	nfs_inactive(vnode_t *, cred_t *);
static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
			int, vnode_t *, cred_t *);
static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs_remove(vnode_t *, char *, cred_t *);
static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs_fid(vnode_t *, fid_t *);
static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs_seek(vnode_t *, offset_t, offset_t *);
static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs_realvp(vnode_t *, vnode_t **);
static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);

struct vnodeops *nfs_vnodeops;

const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN, nfs_open,
	VOPNAME_CLOSE, nfs_close,
	VOPNAME_READ, nfs_read,
	VOPNAME_WRITE, nfs_write,
	VOPNAME_IOCTL, nfs_ioctl,
	VOPNAME_GETATTR, nfs_getattr,
	VOPNAME_SETATTR, nfs_setattr,
	VOPNAME_ACCESS, nfs_access,
	VOPNAME_LOOKUP, nfs_lookup,
	VOPNAME_CREATE, nfs_create,
	VOPNAME_REMOVE, nfs_remove,
	VOPNAME_LINK, nfs_link,
	VOPNAME_RENAME, nfs_rename,
	VOPNAME_MKDIR, nfs_mkdir,
	VOPNAME_RMDIR, nfs_rmdir,
	VOPNAME_READDIR, nfs_readdir,
	VOPNAME_SYMLINK, nfs_symlink,
	VOPNAME_READLINK, nfs_readlink,
	VOPNAME_FSYNC, nfs_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) nfs_inactive,
	VOPNAME_FID, nfs_fid,
	VOPNAME_RWLOCK, nfs_rwlock,
	VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs_rwunlock,
	VOPNAME_SEEK, nfs_seek,
	VOPNAME_FRLOCK, nfs_frlock,
	VOPNAME_SPACE, nfs_space,
	VOPNAME_REALVP, nfs_realvp,
	VOPNAME_GETPAGE, nfs_getpage,
	VOPNAME_PUTPAGE, nfs_putpage,
	VOPNAME_MAP, (fs_generic_func_p) nfs_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) nfs_addmap,
	VOPNAME_DELMAP, nfs_delmap,
	VOPNAME_DUMP, nfs_dump,
	VOPNAME_PATHCONF, nfs_pathconf,
	VOPNAME_PAGEIO, nfs_pageio,
	VOPNAME_SETSECATTR, nfs_setsecattr,
	VOPNAME_GETSECATTR, nfs_getsecattr,
	VOPNAME_SHRLOCK, nfs_shrlock,
	NULL, NULL
};

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs_getvnodeops(void)
{
	return (nfs_vnodeops);
}

/* ARGSUSED */
static int
nfs_open(vnode_t **vpp, int flag, cred_t *cr)
{
	int error;
	struct vattr va;
	rnode_t *rp;
	vnode_t *vp;

	vp = *vpp;
	rp = VTOR(vp);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	mutex_enter(&rp->r_statelock);
	if (rp->r_cred == NULL) {
		crhold(cr);
		rp->r_cred = cr;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If there is no cached data or if close-to-open
	 * consistency checking is turned off, we can avoid
	 * the over the wire getattr.  Otherwise, if the
	 * file system is mounted readonly, then just verify
	 * the caches are up to date using the normal mechanism.
	 * Else, if the file is not mmap'd, then just mark
	 * the attributes as timed out.  They will be refreshed
	 * and the caches validated prior to being used.
	 * Else, the file system is mounted writeable so
	 * force an over the wire GETATTR in order to ensure
	 * that all cached data is valid.
	 */
	if (vp->v_count > 1 ||
	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
		if (vn_is_readonly(vp))
			error = nfs_validate_caches(vp, cr);
		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
			PURGE_ATTRCACHE(vp);
			error = 0;
		} else {
			va.va_mask = AT_ALL;
			error = nfs_getattr_otw(vp, &va, cr);
		}
	} else
		error = 0;

	return (error);
}
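/*
 * Note that the "count" argument below is the close count passed in
 * by the VFS layer: lock and share cleanup happens on every close,
 * while the cache flush and the close-to-open attribute refresh only
 * run on the last close (count == 1).
 */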
static int
nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	rnode_t *rp;
	int error;
	struct vattr va;

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to
		 * another zone will fail, and one can't lock an NFS file
		 * and then change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the final nfs_getattr_otw() call) should
		 * be OK. If the zone is going away nfs_async_inactive() will
		 * throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled quicker
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 *	to-open" semantics, then start an asynchronous flush
	 *	of all of the pages in the file.
	 * else the file system was not mounted using the "no-close-
	 *	to-open" semantics, then do a synchronous flush and
	 *	commit of all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs_getattr_otw(vp, &va, cr);

	return (error);
}

/* ARGSUSED */
static int
nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	if (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		size_t resid = 0;

		/*
		 * Let's try to do the read in as large a chunk as we can
		 * (Filesystem (NFS client) bsize if possible/needed).
		 * For V3, this is 32K and for V2, this is 8K.
		 */
		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			n = MIN(uiop->uio_resid, bufsize);
			error = nfsread(vp, base, uiop->uio_offset, n,
			    &resid, cr);
			if (!error) {
				n -= resid;
				error = uiomove(base, n, UIO_READ, uiop);
			}
		} while (!error && uiop->uio_resid > 0 && n > 0);
		kmem_free(base, bufsize);
		return (error);
	}

	error = 0;
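	/*
	 * Cached read path: map the file through segkmap (or vpm) one
	 * MAXBSIZE window at a time.  "off" is the window-aligned file
	 * offset and "on" the offset within that window; e.g. with an
	 * 8K MAXBSIZE, file offset 0x2345 gives off 0x2000 and on 0x345.
	 */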
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs_validate_caches(vp, cr);
		if (error)
			break;

		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n,
			    1, S_READ);
			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
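/*
 * On entry the vnode layer has already taken r_rwlock via nfs_rwlock()
 * (as a reader only in the direct-I/O case).  The FAPPEND code below
 * upgrades a reader hold to a writer so that fetching the file size
 * and writing at that offset are atomic with respect to other local
 * writers.
 */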
/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr)
{
	struct buf *bp;
	int error;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	error = nfs_bio(bp, cr);

	bp_mapout(bp);
	pageio_done(bp);

	return (error);
}

/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 */
static int
nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfswriteargs wa;
	struct nfsattrstat ns;
	int error;
	int tsize;
	int douprintf;

	douprintf = 1;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	wa.wa_args = &wa.wa_args_buf;
	wa.wa_fhandle = *VTOFH(vp);
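	/*
	 * The kstat_runq_enter()/kstat_runq_exit() pairs around the RPC
	 * below feed this mount's I/O kstats, which is how tools like
	 * iostat report NFS traffic; the write counters themselves are
	 * only bumped once the server replies successfully.
	 */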
	do {
		tsize = MIN(mi->mi_curwrite, count);
		wa.wa_data = base;
		wa.wa_begoff = offset;
		wa.wa_totcount = tsize;
		wa.wa_count = tsize;
		wa.wa_offset = offset;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}
		wa.wa_mblk = NULL;
		do {
			error = rfs2call(mi, RFS_WRITE,
			    xdr_writeargs, (caddr_t)&wa,
			    xdr_attrstat, (caddr_t)&ns, cr,
			    &douprintf, &ns.ns_status, 0, NULL);
		} while (error == ENFS_TRYAGAIN);
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(ns.ns_status);
			/*
			 * Can't check for stale fhandle and purge caches
			 * here because pages are held by nfs_getpage.
			 * Just mark the attribute cache as timed out
			 * and set RWRITEATTR to indicate that the file
			 * was modified with a WRITE operation.
			 */
			if (!error) {
				count -= tsize;
				base += tsize;
				offset += tsize;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
					    tsize;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_OUBLK, 1);
				mutex_enter(&rp->r_statelock);
				PURGE_ATTRCACHE_LOCKED(rp);
				rp->r_flags |= RWRITEATTR;
				mutex_exit(&rp->r_statelock);
			}
		}
	} while (!error && count);

	return (error);
}

/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 */
static int
nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
	cred_t *cr)
{
	mntinfo_t *mi;
	struct nfsreadargs ra;
	struct nfsrdresult rr;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	douprintf = 1;

	ra.ra_fhandle = *VTOFH(vp);

	fi.vp = vp;
	fi.fhp = (caddr_t)&ra.ra_fhandle;
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			tsize = MIN(mi->mi_curread, count);
			rr.rr_data = base;
			ra.ra_offset = offset;
			ra.ra_totcount = tsize;
			ra.ra_count = tsize;
			t = gethrtime();
			error = rfs2call(mi, RFS_READ,
			    xdr_readargs, (caddr_t)&ra,
			    xdr_rdresult, (caddr_t)&rr, cr,
			    &douprintf, &rr.rr_status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(rr.rr_status);
			if (!error) {
				count -= rr.rr_count;
				base += rr.rr_count;
				offset += rr.rr_count;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
					    rr.rr_count;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_INBLK, 1);
			}
		}
	} while (!error && count && rr.rr_count == tsize);
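	/*
	 * A short read (rr.rr_count < tsize) means the server reached
	 * end of file, which terminates the loop above; whatever was
	 * not read is reported back through *residp.
	 */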
	*residp = count;

	if (!error) {
		/*
		 * Since no error occurred, we have the current
		 * attributes and we need to do a cache check and then
		 * potentially update the cached attributes.  We can't
		 * use the normal attribute check and cache mechanisms
		 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If they have, then
		 * we just mark the attributes as out of date.  The
		 * next time that the attributes are checked, they will
		 * be out of date, new attributes will be fetched, and
		 * the page cache will be flushed.  If the attributes
		 * weren't changed, then we just update the cached
		 * attributes with these attributes.
		 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by server may have minimal
		 * permissions sometimes denying access to users having
		 * proper access.  To get the proper attributes, mark
		 * the attributes as expired so that they will be
		 * refetched via the NFS_ACL GETATTR2 procedure.
		 */
		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
		mutex_enter(&rp->r_statelock);
		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
		    (mi->mi_flags & MI_ACL)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			if (rp->r_mtime <= t) {
				nfs_attrcache_va(vp, &va);
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	switch (cmd) {
	case _FIODIRECTIO:
		return (nfs_directio(vp, (int)arg, cr));
	default:
		return (ENOTTY);
	}
}

static int
nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	rp = VTOR(vp);
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
			mutex_enter(&rp->r_statelock);
			rp->r_gcount++;
			mutex_exit(&rp->r_statelock);
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
			mutex_enter(&rp->r_statelock);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				if (!rp->r_error)
					rp->r_error = error;
			}
			if (--rp->r_gcount == 0)
				cv_broadcast(&rp->r_cv);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (nfsgetattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	uint_t mask;
	struct vattr va;

	mask = vap->va_mask;

	if (mask & AT_NOSET)
		return (EINVAL);

	if ((mask & AT_SIZE) &&
	    vap->va_type == VREG &&
	    vap->va_size > MAXOFF32_T)
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	va.va_mask = AT_UID | AT_MODE;

	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
	    vp);

	if (error)
		return (error);

	return (nfssetattr(vp, vap, flags, cr));
}

static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	struct nfssaargs args;
	struct nfsattrstat ns;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	mntinfo_t *mi;
	vsecattr_t *vsp;
	hrtime_t t;

	mask = vap->va_mask;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * If the system call was utime(2) or utimes(2) and the
	 * application did not specify the times, then set the
	 * mtime nanosecond field to 1 billion.  This will get
	 * translated from 1 billion nanoseconds to 1 million
	 * microseconds in the over the wire request.  The
	 * server will use 1 million in the microsecond field
	 * to tell whether both the mtime and atime should be
	 * set to the server's current time.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
		vap->va_mtime.tv_nsec = 1000000000;
		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			error = vattr_to_sattr(vap, &args.saa_sa);
		} else {
			/*
			 * Use server times. vap time values will not be used.
			 * To ensure no time overflow, make sure vap has
			 * valid values, but retain the original values.
			 */
			timestruc_t	mtime = vap->va_mtime;
			timestruc_t	atime = vap->va_atime;
			time_t		now;

			now = gethrestime_sec();
			if (NFS_TIME_T_OK(now)) {
				/* Just in case server does not know of this */
				vap->va_mtime.tv_sec = now;
				vap->va_atime.tv_sec = now;
			} else {
				vap->va_mtime.tv_sec = 0;
				vap->va_atime.tv_sec = 0;
			}
			error = vattr_to_sattr(vap, &args.saa_sa);
			/* set vap times back on */
			vap->va_mtime = mtime;
			vap->va_atime = atime;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr(vap, &args.saa_sa);
	}
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}
	args.saa_fh = *VTOFH(vp);

	va.va_mask = AT_MODE;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

	mi = VTOMI(vp);

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_SETATTR,
	    xdr_saargs, (caddr_t)&args,
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error) {
			/*
			 * If changing the size of the file, invalidate
			 * any local cached data which is no longer part
			 * of the file.  We also possibly invalidate the
			 * last page in the file.  We could use
			 * pvn_vpzero(), but this would mark the page as
			 * modified and require it to be written back to
			 * the server for no particularly good reason.
			 * This way, if we access it, then we bring it
			 * back in.  A read should be cheaper than a
			 * write.
			 */
			if (mask & AT_SIZE) {
				nfs_invalidate_pages(vp,
				    (vap->va_size & PAGEMASK), cr);
			}
			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * refetched via the NFS_ACL GETATTR2 procedure.
			 */
			if (mi->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			/*
			 * This next check attempts to deal with NFS
			 * servers which can not handle increasing
			 * the size of the file via setattr.  Most
			 * of these servers do not return an error,
			 * but do not change the size of the file.
			 * Hence, this check and then attempt to set
			 * the file size by writing 1 byte at the
			 * offset of the end of the file that we need.
			 */
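			/*
			 * Note that the zero byte below is written at
			 * offset va_size - 1, so a successful write
			 * leaves the file exactly va_size bytes long.
			 */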
			if ((mask & AT_SIZE) &&
			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
				char zb = '\0';

				error = nfswrite(vp, &zb,
				    vap->va_size - sizeof (zb),
				    sizeof (zb), cr);
			}
			/*
			 * Some servers will change the mode to clear the
			 * setuid and setgid bits when changing the uid or
			 * gid.  The client needs to compensate appropriately.
			 */
			if (mask & (AT_UID | AT_GID)) {
				int terror;

				va.va_mask = AT_MODE;
				terror = nfsgetattr(vp, &va, cr);
				if (!terror &&
				    (((mask & AT_MODE) &&
				    va.va_mode != vap->va_mode) ||
				    (!(mask & AT_MODE) &&
				    va.va_mode != omode))) {
					va.va_mask = AT_MODE;
					if (mask & AT_MODE)
						va.va_mode = vap->va_mode;
					else
						va.va_mode = omode;
					(void) nfssetattr(vp, &va, 0, cr);
				}
			}
		} else {
			PURGE_ATTRCACHE(vp);
			PURGE_STALE_FH(error, vp, cr);
		}
	} else {
		PURGE_ATTRCACHE(vp);
	}

	return (error);
}

static int
nfs_accessx(void *vp, int mode, cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
	return (nfs_access(vp, mode, 0, cr));
}

static int
nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	struct vattr va;
	int error;
	mntinfo_t *mi;
	int shift = 0;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI_ACL) {
		error = acl_access2(vp, mode, flags, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	va.va_mask = AT_MODE | AT_UID | AT_GID;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Disallow write attempts on read-only
	 * file systems, unless the file is a
	 * device node.
	 */
	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
		return (EROFS);

	/*
	 * Disallow attempts to access mandatory lock files.
	 */
	if ((mode & (VWRITE | VREAD | VEXEC)) &&
	    MANDLOCK(vp, va.va_mode))
		return (EACCES);

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group,
	 * then check public access.
	 */
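	/*
	 * The shift moves the relevant permission triple (owner,
	 * group or other) up into the owner position before masking.
	 * For example, for a mode 0754 file and a requester who is a
	 * group member but not the owner, shift is 3 and 0754 << 3
	 * places the group bits (5 => r-x) under the VREAD/VWRITE/
	 * VEXEC bits being tested.
	 */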
	if (crgetuid(cr) != va.va_uid) {
		shift += 3;
		if (!groupmember(va.va_gid, cr))
			shift += 3;
	}
found:
	mode &= ~(va.va_mode << shift);
	if (mode == 0)
		return (0);

	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
}

static int nfs_do_symlink_cache = 1;

static int
nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	int error;
	struct nfsrdlnres rl;
	rnode_t *rp;
	int douprintf;
	failinfo_t fi;

	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO.  This violates the XNFS spec and
	 * the RFC 1094, which are wrong anyway.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}

	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	error = rfs2call(VTOMI(vp), RFS_READLINK,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_rdlnres, (caddr_t)&rl, cr,
	    &douprintf, &rl.rl_status, 0, &fi);

	if (error) {
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		return (error);
	}

	error = geterrno(rl.rl_status);
	if (!error) {
		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
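		/*
		 * Cache the link text if caching is enabled and nobody
		 * raced us to it.  On success the buffer is handed to
		 * the rnode, so it must not be freed here; the loser
		 * of the race just frees its copy.
		 */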
		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = rl.rl_data;
				rp->r_symlink.len = (int)rl.rl_count;
				rp->r_symlink.size = NFS_MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
			}
		} else {
			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		}
	} else {
		PURGE_STALE_FH(error, vp, cr);
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
	}

	/*
	 * Conform to UFS semantics (see comment above)
	 */
	return (error == ENXIO ? EINVAL : error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
static int
nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
	if (!error)
		error = VTOR(vp)->r_error;
	return (error);
}

/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
static void
nfs_inactive(vnode_t *vp, cred_t *cr)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way, the
		 * unl* fields could become inconsistent with respect to
		 * each other due to a race condition between this code
		 * and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			struct nfsdiropargs da;
			enum nfsstat status;
			int douprintf;
			int error;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs(&da, unlname, unldvp);

			douprintf = 1;

			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
			    xdr_diropargs, (caddr_t)&da,
			    xdr_enum, (caddr_t)&status, unlcred,
			    &douprintf, &status, 0, NULL);

			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
				nfs_purge_rddir_cache(unldvp);
			PURGE_ATTRCACHE(unldvp);

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}

/*
 * Remote file system operations having to do with directory manipulation.
 */

static int
nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}

static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
static int nfs_lookup_dnlc_hits = 0;
static int nfs_lookup_dnlc_misses = 0;
static int nfs_lookup_dnlc_neg_hits = 0;
static int nfs_lookup_dnlc_disappears = 0;
static int nfs_lookup_dnlc_lookups = 0;
#endif

/* ARGSUSED */
int
nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have the RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
	error = nfslookup_dnlc(dvp, nm, vpp, cr);
	if (error || *vpp != NULL)
		return (error);

callit:
	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);

	return (error);
}

static int
nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.  The DNLC is rechecked
	 * just in case this entry got invalidated during the call
	 * to nfs_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not on the server.  Any operations to
	 * the server will fail with ESTALE.
	 */
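	/*
	 * dnlc_lookup() returns a held vnode; the hold is dropped
	 * immediately below because only the identity of the hit
	 * matters at that point -- the lookup is redone once the
	 * caches have been validated.
	 */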
#ifdef DEBUG
	nfs_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		VN_RELE(vp);
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			error = nfs_access(dvp, VEXEC, 0, cr);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			*vpp = vp;
#ifdef DEBUG
			nfs_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs_lookup_dnlc_misses++;
#endif

	*vpp = NULL;

	return (0);
}

static int
nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
	int rfscall_flags)
{
	int error;
	struct nfsdiropargs da;
	struct nfsdiropres dr;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	ASSERT(*nm != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	setdiropargs(&da, nm, dvp);

	fi.vp = dvp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
	    xdr_diropargs, (caddr_t)&da,
	    xdr_diropres, (caddr_t)&dr, cr,
	    &douprintf, &dr.dr_status, rfscall_flags, &fi);

	if (!error) {
		error = geterrno(dr.dr_status);
		if (!error) {
			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * refetched via the NFS_ACL GETATTR2 procedure.
			 */
			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(*vpp);
			}
			if (!(rfscall_flags & RFSCALL_SOFT))
				dnlc_update(dvp, nm, *vpp);
		} else {
			PURGE_STALE_FH(error, dvp, cr);
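			/*
			 * A miss is worth remembering too: with
			 * negative caching enabled, an ENOENT reply
			 * is recorded as a DNLC_NO_VNODE entry so
			 * that repeated lookups of a nonexistent
			 * name stay off the wire.
			 */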
			if (error == ENOENT && nfs_lookup_neg_cache)
				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
		}
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
{
	int error;
	struct nfscreatargs args;
	struct nfsdiropres dr;
	int douprintf;
	vnode_t *vp;
	rnode_t *rp;
	struct vattr vattr;
	rnode_t *drp;
	vnode_t *tempvp;
	hrtime_t t;

	drp = VTOR(dvp);

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
		return (EINTR);

	/*
	 * We make a copy of the attributes because the caller does not
	 * expect us to change what va points to.
	 */
	vattr = *va;

	/*
	 * If the pathname is "", just use dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		error = 0;
		VN_HOLD(dvp);
		vp = dvp;
	/*
	 * If the pathname is ".", just use dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	} else if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr);
		if (error) {
			nfs_rw_exit(&drp->r_rwlock);
			return (error);
		}
		VN_HOLD(dvp);
		vp = dvp;
	/*
	 * We need to go over the wire, just to be sure whether the
	 * file exists or not.  Using the DNLC can be dangerous in
	 * this case when making a decision regarding existence.
	 */
	} else {
		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
	}
	if (!error) {
		if (exclusive == EXCL)
			error = EEXIST;
		else if (vp->v_type == VDIR && (mode & VWRITE))
			error = EISDIR;
		else {
			/*
			 * If vnode is a device, create special vnode.
			 */
			if (IS_DEVVP(vp)) {
				tempvp = vp;
				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
				VN_RELE(tempvp);
			}
			if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
				if ((vattr.va_mask & AT_SIZE) &&
				    vp->v_type == VREG) {
					vattr.va_mask = AT_SIZE;
					error = nfssetattr(vp, &vattr, 0, cr);
				}
			}
		}
		nfs_rw_exit(&drp->r_rwlock);
		if (error) {
			VN_RELE(vp);
		} else
			*vpp = vp;
		return (error);
	}

	ASSERT(vattr.va_mask & AT_TYPE);
	if (vattr.va_type == VREG) {
		ASSERT(vattr.va_mask & AT_MODE);
		if (MANDMODE(vattr.va_mode)) {
			nfs_rw_exit(&drp->r_rwlock);
			return (EACCES);
		}
	}

	dnlc_remove(dvp, nm);

	setdiropargs(&args.ca_da, nm, dvp);

	/*
	 * Decide what the group-id of the created file should be.
	 * Set it in attribute list as advisory...then do a setattr
	 * if the server didn't get it right the first time.
	 */
	error = setdirgid(dvp, &vattr.va_gid, cr);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}
	vattr.va_mask |= AT_GID;

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can wack the protocol
	 */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

	/*
	 * dev_t is uint_t in 5.x and short in 4.x.  4.x supports 8-bit
	 * majors where 5.x supports 14-bit majors, and 5.x supports 18
	 * bits in the minor number where 4.x supports 8 bits.  If the
	 * 5.x major/minor numbers fit in the 4.x format, compress the
	 * device number before sending it.  Otherwise, the 4.x server
	 * will not create the device with the correct device number and
	 * nothing can be done about this.
	 */
	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
		dev_t d = vattr.va_rdev;
		dev32_t dev32;

		if (vattr.va_type == VCHR)
			vattr.va_mode |= IFCHR;
		else
			vattr.va_mode |= IFBLK;

		(void) cmpldev(&dev32, d);
		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
			vattr.va_size = (u_offset_t)dev32;
		else
			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);

		vattr.va_mask |= AT_MODE|AT_SIZE;
	} else if (vattr.va_type == VFIFO) {
		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
		vattr.va_mask |= AT_MODE|AT_SIZE;
	} else if (vattr.va_type == VSOCK) {
		vattr.va_mode |= IFSOCK;
		/*
		 * To avoid triggering bugs in the servers, set AT_SIZE
		 * (all other RFS_CREATE calls set this).
		 */
		vattr.va_size = 0;
		vattr.va_mask |= AT_MODE|AT_SIZE;
	}

	args.ca_sa = &args.ca_sa_buf;
	error = vattr_to_sattr(&vattr, args.ca_sa);
	if (error) {
		/* req time field(s) overflow - return immediately */
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(VTOMI(dvp), RFS_CREATE,
	    xdr_creatargs, (caddr_t)&args,
	    xdr_diropres, (caddr_t)&dr, cr,
	    &douprintf, &dr.dr_status, 0, NULL);

	PURGE_ATTRCACHE(dvp);	/* mod time changed */

	if (!error) {
		error = geterrno(dr.dr_status);
		if (!error) {
			if (HAVE_RDDIR_CACHE(drp))
				nfs_purge_rddir_cache(dvp);
			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
			    dvp->v_vfsp, t, cr, NULL, NULL);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * refetched via the NFS_ACL GETATTR2 procedure.
			 */
			if (VTOMI(vp)->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			dnlc_update(dvp, nm, vp);
			rp = VTOR(vp);
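			/*
			 * A create that truncates (e.g. O_CREAT|O_TRUNC)
			 * comes through with size 0; drop any pages cached
			 * from a previous incarnation of the file so stale
			 * data can't be read back.
			 */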
2097 */ 2098 if (vattr.va_gid != rp->r_attr.va_gid) { 2099 vattr.va_mask = AT_GID; 2100 (void) nfssetattr(vp, &vattr, 0, cr); 2101 } 2102 2103 /* 2104 * If the vnode is a device, create a special vnode. 2105 */ 2106 if (IS_DEVVP(vp)) { 2107 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2108 VN_RELE(vp); 2109 } else 2110 *vpp = vp; 2111 } else { 2112 PURGE_STALE_FH(error, dvp, cr); 2113 } 2114 } 2115 2116 nfs_rw_exit(&drp->r_rwlock); 2117 2118 return (error); 2119 } 2120 2121 /* 2122 * Weirdness: if the vnode to be removed is open, 2123 * we rename it instead of removing it, and nfs_inactive 2124 * will remove the new name. 2125 */ 2126 static int 2127 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr) 2128 { 2129 int error; 2130 struct nfsdiropargs da; 2131 enum nfsstat status; 2132 vnode_t *vp; 2133 char *tmpname; 2134 int douprintf; 2135 rnode_t *rp; 2136 rnode_t *drp; 2137 2138 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2139 return (EPERM); 2140 drp = VTOR(dvp); 2141 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2142 return (EINTR); 2143 2144 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2145 if (error) { 2146 nfs_rw_exit(&drp->r_rwlock); 2147 return (error); 2148 } 2149 2150 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2151 VN_RELE(vp); 2152 nfs_rw_exit(&drp->r_rwlock); 2153 return (EPERM); 2154 } 2155 2156 /* 2157 * First just remove the entry from the name cache, as it 2158 * is most likely the only entry for this vp. 2159 */ 2160 dnlc_remove(dvp, nm); 2161 2162 /* 2163 * If the file has a v_count > 1 then there may be more than one 2164 * entry in the name cache due to multiple links or an open file, 2165 * but we don't have the real reference count so flush all 2166 * possible entries. 2167 */ 2168 if (vp->v_count > 1) 2169 dnlc_purge_vp(vp); 2170 2171 /* 2172 * Now we have the real reference count on the vnode 2173 */ 2174 rp = VTOR(vp); 2175 mutex_enter(&rp->r_statelock); 2176 if (vp->v_count > 1 && 2177 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2178 mutex_exit(&rp->r_statelock); 2179 tmpname = newname(); 2180 error = nfsrename(dvp, nm, dvp, tmpname, cr); 2181 if (error) 2182 kmem_free(tmpname, MAXNAMELEN); 2183 else { 2184 mutex_enter(&rp->r_statelock); 2185 if (rp->r_unldvp == NULL) { 2186 VN_HOLD(dvp); 2187 rp->r_unldvp = dvp; 2188 if (rp->r_unlcred != NULL) 2189 crfree(rp->r_unlcred); 2190 crhold(cr); 2191 rp->r_unlcred = cr; 2192 rp->r_unlname = tmpname; 2193 } else { 2194 kmem_free(rp->r_unlname, MAXNAMELEN); 2195 rp->r_unlname = tmpname; 2196 } 2197 mutex_exit(&rp->r_statelock); 2198 } 2199 } else { 2200 mutex_exit(&rp->r_statelock); 2201 /* 2202 * We need to flush any dirty pages which happen to 2203 * be hanging around before removing the file. This 2204 * shouldn't happen very often and mostly on file 2205 * systems mounted "nocto". 2206 */ 2207 if (vn_has_cached_data(vp) && 2208 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2209 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr); 2210 if (error && (error == ENOSPC || error == EDQUOT)) { 2211 mutex_enter(&rp->r_statelock); 2212 if (!rp->r_error) 2213 rp->r_error = error; 2214 mutex_exit(&rp->r_statelock); 2215 } 2216 } 2217 2218 setdiropargs(&da, nm, dvp); 2219 2220 douprintf = 1; 2221 2222 error = rfs2call(VTOMI(dvp), RFS_REMOVE, 2223 xdr_diropargs, (caddr_t)&da, 2224 xdr_enum, (caddr_t)&status, cr, 2225 &douprintf, &status, 0, NULL); 2226 2227 /* 2228 * The xattr dir may be gone after the last attr is removed, 2229 * so flush it from dnlc.
2230 */ 2231 if (dvp->v_flag & V_XATTRDIR) 2232 dnlc_purge_vp(dvp); 2233 2234 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2235 PURGE_ATTRCACHE(vp); /* link count changed */ 2236 2237 if (!error) { 2238 error = geterrno(status); 2239 if (!error) { 2240 if (HAVE_RDDIR_CACHE(drp)) 2241 nfs_purge_rddir_cache(dvp); 2242 } else { 2243 PURGE_STALE_FH(error, dvp, cr); 2244 } 2245 } 2246 } 2247 2248 VN_RELE(vp); 2249 2250 nfs_rw_exit(&drp->r_rwlock); 2251 2252 return (error); 2253 } 2254 2255 static int 2256 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 2257 { 2258 int error; 2259 struct nfslinkargs args; 2260 enum nfsstat status; 2261 vnode_t *realvp; 2262 int douprintf; 2263 rnode_t *tdrp; 2264 2265 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2266 return (EPERM); 2267 if (VOP_REALVP(svp, &realvp) == 0) 2268 svp = realvp; 2269 2270 args.la_from = VTOFH(svp); 2271 setdiropargs(&args.la_to, tnm, tdvp); 2272 2273 tdrp = VTOR(tdvp); 2274 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2275 return (EINTR); 2276 2277 dnlc_remove(tdvp, tnm); 2278 2279 douprintf = 1; 2280 2281 error = rfs2call(VTOMI(svp), RFS_LINK, 2282 xdr_linkargs, (caddr_t)&args, 2283 xdr_enum, (caddr_t)&status, cr, 2284 &douprintf, &status, 0, NULL); 2285 2286 PURGE_ATTRCACHE(tdvp); /* mod time changed */ 2287 PURGE_ATTRCACHE(svp); /* link count changed */ 2288 2289 if (!error) { 2290 error = geterrno(status); 2291 if (!error) { 2292 if (HAVE_RDDIR_CACHE(tdrp)) 2293 nfs_purge_rddir_cache(tdvp); 2294 } 2295 } 2296 2297 nfs_rw_exit(&tdrp->r_rwlock); 2298 2299 return (error); 2300 } 2301 2302 static int 2303 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2304 { 2305 vnode_t *realvp; 2306 2307 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2308 return (EPERM); 2309 if (VOP_REALVP(ndvp, &realvp) == 0) 2310 ndvp = realvp; 2311 2312 return (nfsrename(odvp, onm, ndvp, nnm, cr)); 2313 } 2314 2315 /* 2316 * nfsrename does the real work of renaming in NFS Version 2. 2317 */ 2318 static int 2319 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2320 { 2321 int error; 2322 enum nfsstat status; 2323 struct nfsrnmargs args; 2324 int douprintf; 2325 vnode_t *nvp; 2326 vnode_t *ovp = NULL; 2327 char *tmpname; 2328 rnode_t *rp; 2329 rnode_t *odrp; 2330 rnode_t *ndrp; 2331 2332 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2333 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2334 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2335 return (EINVAL); 2336 2337 odrp = VTOR(odvp); 2338 ndrp = VTOR(ndvp); 2339 if ((intptr_t)odrp < (intptr_t)ndrp) { 2340 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2341 return (EINTR); 2342 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2343 nfs_rw_exit(&odrp->r_rwlock); 2344 return (EINTR); 2345 } 2346 } else { 2347 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2348 return (EINTR); 2349 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2350 nfs_rw_exit(&ndrp->r_rwlock); 2351 return (EINTR); 2352 } 2353 } 2354 2355 /* 2356 * Lookup the target file. If it exists, it needs to be 2357 * checked to see whether it is a mount point and whether 2358 * it is active (open). 2359 */ 2360 error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 2361 if (!error) { 2362 /* 2363 * If this file has been mounted on, then just 2364 * return busy because renaming to it would remove 2365 * the mounted file system from the name space. 
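* For example (illustrative): with a file system mounted on /a/b,
* renaming /a/c on top of /a/b would leave that mounted file system
* with no name in the tree, so the request is refused with EBUSY.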
2366 */ 2367 if (vn_mountedvfs(nvp) != NULL) { 2368 VN_RELE(nvp); 2369 nfs_rw_exit(&odrp->r_rwlock); 2370 nfs_rw_exit(&ndrp->r_rwlock); 2371 return (EBUSY); 2372 } 2373 2374 /* 2375 * Purge the name cache of all references to this vnode 2376 * so that we can check the reference count to infer 2377 * whether it is active or not. 2378 */ 2379 /* 2380 * First just remove the entry from the name cache, as it 2381 * is most likely the only entry for this vp. 2382 */ 2383 dnlc_remove(ndvp, nnm); 2384 /* 2385 * If the file has a v_count > 1 then there may be more 2386 * than one entry in the name cache due to multiple links 2387 * or an open file, but we don't have the real reference 2388 * count so flush all possible entries. 2389 */ 2390 if (nvp->v_count > 1) 2391 dnlc_purge_vp(nvp); 2392 2393 /* 2394 * If the vnode is active and is not a directory, 2395 * arrange to rename it to a 2396 * temporary file so that it will continue to be 2397 * accessible. This implements the "unlink-open-file" 2398 * semantics for the target of a rename operation. 2399 * Before doing this though, make sure that the 2400 * source and target files are not already the same. 2401 */ 2402 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 2403 /* 2404 * Lookup the source name. 2405 */ 2406 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, 2407 cr, 0); 2408 2409 /* 2410 * The source name *should* already exist. 2411 */ 2412 if (error) { 2413 VN_RELE(nvp); 2414 nfs_rw_exit(&odrp->r_rwlock); 2415 nfs_rw_exit(&ndrp->r_rwlock); 2416 return (error); 2417 } 2418 2419 /* 2420 * Compare the two vnodes. If they are the same, 2421 * just release all held vnodes and return success. 2422 */ 2423 if (ovp == nvp) { 2424 VN_RELE(ovp); 2425 VN_RELE(nvp); 2426 nfs_rw_exit(&odrp->r_rwlock); 2427 nfs_rw_exit(&ndrp->r_rwlock); 2428 return (0); 2429 } 2430 2431 /* 2432 * Can't mix and match directories and non- 2433 * directories in rename operations. We already 2434 * know that the target is not a directory. If 2435 * the source is a directory, return an error. 2436 */ 2437 if (ovp->v_type == VDIR) { 2438 VN_RELE(ovp); 2439 VN_RELE(nvp); 2440 nfs_rw_exit(&odrp->r_rwlock); 2441 nfs_rw_exit(&ndrp->r_rwlock); 2442 return (ENOTDIR); 2443 } 2444 2445 /* 2446 * The target file exists, is not the same as 2447 * the source file, and is active. Link it 2448 * to a temporary filename to avoid having 2449 * the server remove the file completely. 2450 */ 2451 tmpname = newname(); 2452 error = nfs_link(ndvp, nvp, tmpname, cr); 2453 if (error == EOPNOTSUPP) { 2454 error = nfs_rename(ndvp, nnm, ndvp, tmpname, 2455 cr); 2456 } 2457 if (error) { 2458 kmem_free(tmpname, MAXNAMELEN); 2459 VN_RELE(ovp); 2460 VN_RELE(nvp); 2461 nfs_rw_exit(&odrp->r_rwlock); 2462 nfs_rw_exit(&ndrp->r_rwlock); 2463 return (error); 2464 } 2465 rp = VTOR(nvp); 2466 mutex_enter(&rp->r_statelock); 2467 if (rp->r_unldvp == NULL) { 2468 VN_HOLD(ndvp); 2469 rp->r_unldvp = ndvp; 2470 if (rp->r_unlcred != NULL) 2471 crfree(rp->r_unlcred); 2472 crhold(cr); 2473 rp->r_unlcred = cr; 2474 rp->r_unlname = tmpname; 2475 } else { 2476 kmem_free(rp->r_unlname, MAXNAMELEN); 2477 rp->r_unlname = tmpname; 2478 } 2479 mutex_exit(&rp->r_statelock); 2480 } 2481 2482 VN_RELE(nvp); 2483 } 2484 2485 if (ovp == NULL) { 2486 /* 2487 * When renaming directories to be a subdirectory of a 2488 * different parent, the dnlc entry for ".." will no 2489 * longer be valid, so it must be removed.
2490 * 2491 * We do a lookup here to determine whether we are renaming 2492 * a directory and we need to check if we are renaming 2493 * an unlinked file. This might have already been done 2494 * in previous code, so we check ovp == NULL to avoid 2495 * doing it twice. 2496 */ 2497 2498 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 2499 2500 /* 2501 * The source name *should* already exist. 2502 */ 2503 if (error) { 2504 nfs_rw_exit(&odrp->r_rwlock); 2505 nfs_rw_exit(&ndrp->r_rwlock); 2506 return (error); 2507 } 2508 ASSERT(ovp != NULL); 2509 } 2510 2511 dnlc_remove(odvp, onm); 2512 dnlc_remove(ndvp, nnm); 2513 2514 setdiropargs(&args.rna_from, onm, odvp); 2515 setdiropargs(&args.rna_to, nnm, ndvp); 2516 2517 douprintf = 1; 2518 2519 error = rfs2call(VTOMI(odvp), RFS_RENAME, 2520 xdr_rnmargs, (caddr_t)&args, 2521 xdr_enum, (caddr_t)&status, cr, 2522 &douprintf, &status, 0, NULL); 2523 2524 PURGE_ATTRCACHE(odvp); /* mod time changed */ 2525 PURGE_ATTRCACHE(ndvp); /* mod time changed */ 2526 2527 if (!error) { 2528 error = geterrno(status); 2529 if (!error) { 2530 if (HAVE_RDDIR_CACHE(odrp)) 2531 nfs_purge_rddir_cache(odvp); 2532 if (HAVE_RDDIR_CACHE(ndrp)) 2533 nfs_purge_rddir_cache(ndvp); 2534 /* 2535 * when renaming directories to be a subdirectory of a 2536 * different parent, the dnlc entry for ".." will no 2537 * longer be valid, so it must be removed 2538 */ 2539 rp = VTOR(ovp); 2540 if (ndvp != odvp) { 2541 if (ovp->v_type == VDIR) { 2542 dnlc_remove(ovp, ".."); 2543 if (HAVE_RDDIR_CACHE(rp)) 2544 nfs_purge_rddir_cache(ovp); 2545 } 2546 } 2547 2548 /* 2549 * If we are renaming the unlinked file, update the 2550 * r_unldvp and r_unlname as needed. 2551 */ 2552 mutex_enter(&rp->r_statelock); 2553 if (rp->r_unldvp != NULL) { 2554 if (strcmp(rp->r_unlname, onm) == 0) { 2555 (void) strncpy(rp->r_unlname, 2556 nnm, MAXNAMELEN); 2557 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 2558 2559 if (ndvp != rp->r_unldvp) { 2560 VN_RELE(rp->r_unldvp); 2561 rp->r_unldvp = ndvp; 2562 VN_HOLD(ndvp); 2563 } 2564 } 2565 } 2566 mutex_exit(&rp->r_statelock); 2567 } else { 2568 /* 2569 * System V defines rename to return EEXIST, not 2570 * ENOTEMPTY if the target directory is not empty. 2571 * Over the wire, the error is NFSERR_ENOTEMPTY 2572 * which geterrno maps to ENOTEMPTY. 2573 */ 2574 if (error == ENOTEMPTY) 2575 error = EEXIST; 2576 } 2577 } 2578 2579 VN_RELE(ovp); 2580 2581 nfs_rw_exit(&odrp->r_rwlock); 2582 nfs_rw_exit(&ndrp->r_rwlock); 2583 2584 return (error); 2585 } 2586 2587 static int 2588 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 2589 { 2590 int error; 2591 struct nfscreatargs args; 2592 struct nfsdiropres dr; 2593 int douprintf; 2594 rnode_t *drp; 2595 hrtime_t t; 2596 2597 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2598 return (EPERM); 2599 2600 setdiropargs(&args.ca_da, nm, dvp); 2601 2602 /* 2603 * Decide what the group-id and set-gid bit of the created directory 2604 * should be. May have to do a setattr to get the gid right. 
2605 */ 2606 error = setdirgid(dvp, &va->va_gid, cr); 2607 if (error) 2608 return (error); 2609 error = setdirmode(dvp, &va->va_mode, cr); 2610 if (error) 2611 return (error); 2612 va->va_mask |= AT_MODE|AT_GID; 2613 2614 args.ca_sa = &args.ca_sa_buf; 2615 error = vattr_to_sattr(va, args.ca_sa); 2616 if (error) { 2617 /* req time field(s) overflow - return immediately */ 2618 return (error); 2619 } 2620 2621 drp = VTOR(dvp); 2622 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2623 return (EINTR); 2624 2625 dnlc_remove(dvp, nm); 2626 2627 douprintf = 1; 2628 2629 t = gethrtime(); 2630 2631 error = rfs2call(VTOMI(dvp), RFS_MKDIR, 2632 xdr_creatargs, (caddr_t)&args, 2633 xdr_diropres, (caddr_t)&dr, cr, 2634 &douprintf, &dr.dr_status, 0, NULL); 2635 2636 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2637 2638 if (!error) { 2639 error = geterrno(dr.dr_status); 2640 if (!error) { 2641 if (HAVE_RDDIR_CACHE(drp)) 2642 nfs_purge_rddir_cache(dvp); 2643 /* 2644 * The attributes returned by RFS_MKDIR cannot 2645 * be depended upon, so mark the attribute cache 2646 * as purged. A subsequent GETATTR will get the 2647 * correct attributes from the server. 2648 */ 2649 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2650 dvp->v_vfsp, t, cr, NULL, NULL); 2651 PURGE_ATTRCACHE(*vpp); 2652 dnlc_update(dvp, nm, *vpp); 2653 2654 /* 2655 * Make sure the gid was set correctly. 2656 * If not, try to set it (but don't lose 2657 * any sleep over it). 2658 */ 2659 if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) { 2660 va->va_mask = AT_GID; 2661 (void) nfssetattr(*vpp, va, 0, cr); 2662 } 2663 } else { 2664 PURGE_STALE_FH(error, dvp, cr); 2665 } 2666 } 2667 2668 nfs_rw_exit(&drp->r_rwlock); 2669 2670 return (error); 2671 } 2672 2673 static int 2674 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 2675 { 2676 int error; 2677 enum nfsstat status; 2678 struct nfsdiropargs da; 2679 vnode_t *vp; 2680 int douprintf; 2681 rnode_t *drp; 2682 2683 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2684 return (EPERM); 2685 drp = VTOR(dvp); 2686 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2687 return (EINTR); 2688 2689 /* 2690 * Attempt to prevent a rmdir(".") from succeeding. 2691 */ 2692 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2693 if (error) { 2694 nfs_rw_exit(&drp->r_rwlock); 2695 return (error); 2696 } 2697 2698 if (vp == cdir) { 2699 VN_RELE(vp); 2700 nfs_rw_exit(&drp->r_rwlock); 2701 return (EINVAL); 2702 } 2703 2704 setdiropargs(&da, nm, dvp); 2705 2706 /* 2707 * First just remove the entry from the name cache, as it 2708 * is most likely an entry for this vp. 2709 */ 2710 dnlc_remove(dvp, nm); 2711 2712 /* 2713 * If the vnode reference count is greater than one, then 2714 * there may be additional references in the DNLC which will 2715 * need to be purged. First, try removing the entry for 2716 * the parent directory and see if that removes the additional 2717 * reference(s). If that doesn't do it, then use dnlc_purge_vp 2718 * to completely remove any references to the directory which 2719 * might still exist in the DNLC.
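* (This works because each DNLC entry holds a reference on the
* vnodes it names, so dropping the ".." entry can by itself bring
* v_count back down to one; the recheck below then skips the full
* purge in that common case.)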
2720 */ 2721 if (vp->v_count > 1) { 2722 dnlc_remove(vp, ".."); 2723 if (vp->v_count > 1) 2724 dnlc_purge_vp(vp); 2725 } 2726 2727 douprintf = 1; 2728 2729 error = rfs2call(VTOMI(dvp), RFS_RMDIR, 2730 xdr_diropargs, (caddr_t)&da, 2731 xdr_enum, (caddr_t)&status, cr, 2732 &douprintf, &status, 0, NULL); 2733 2734 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2735 2736 if (error) { 2737 VN_RELE(vp); 2738 nfs_rw_exit(&drp->r_rwlock); 2739 return (error); 2740 } 2741 2742 error = geterrno(status); 2743 if (!error) { 2744 if (HAVE_RDDIR_CACHE(drp)) 2745 nfs_purge_rddir_cache(dvp); 2746 if (HAVE_RDDIR_CACHE(VTOR(vp))) 2747 nfs_purge_rddir_cache(vp); 2748 } else { 2749 PURGE_STALE_FH(error, dvp, cr); 2750 /* 2751 * System V defines rmdir to return EEXIST, not 2752 * ENOTEMPTY if the directory is not empty. Over 2753 * the wire, the error is NFSERR_ENOTEMPTY which 2754 * geterrno maps to ENOTEMPTY. 2755 */ 2756 if (error == ENOTEMPTY) 2757 error = EEXIST; 2758 } 2759 2760 VN_RELE(vp); 2761 2762 nfs_rw_exit(&drp->r_rwlock); 2763 2764 return (error); 2765 } 2766 2767 static int 2768 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 2769 { 2770 int error; 2771 struct nfsslargs args; 2772 enum nfsstat status; 2773 int douprintf; 2774 rnode_t *drp; 2775 2776 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2777 return (EPERM); 2778 setdiropargs(&args.sla_from, lnm, dvp); 2779 args.sla_sa = &args.sla_sa_buf; 2780 error = vattr_to_sattr(tva, args.sla_sa); 2781 if (error) { 2782 /* req time field(s) overflow - return immediately */ 2783 return (error); 2784 } 2785 args.sla_tnm = tnm; 2786 2787 drp = VTOR(dvp); 2788 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2789 return (EINTR); 2790 2791 dnlc_remove(dvp, lnm); 2792 2793 douprintf = 1; 2794 2795 error = rfs2call(VTOMI(dvp), RFS_SYMLINK, 2796 xdr_slargs, (caddr_t)&args, 2797 xdr_enum, (caddr_t)&status, cr, 2798 &douprintf, &status, 0, NULL); 2799 2800 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2801 2802 if (!error) { 2803 error = geterrno(status); 2804 if (!error) { 2805 if (HAVE_RDDIR_CACHE(drp)) 2806 nfs_purge_rddir_cache(dvp); 2807 } else { 2808 PURGE_STALE_FH(error, dvp, cr); 2809 } 2810 } 2811 2812 nfs_rw_exit(&drp->r_rwlock); 2813 2814 return (error); 2815 } 2816 2817 #ifdef DEBUG 2818 static int nfs_readdir_cache_hits = 0; 2819 static int nfs_readdir_cache_shorts = 0; 2820 static int nfs_readdir_cache_waits = 0; 2821 static int nfs_readdir_cache_misses = 0; 2822 static int nfs_readdir_readahead = 0; 2823 #endif 2824 2825 static int nfs_shrinkreaddir = 0; 2826 2827 /* 2828 * Read directory entries. 2829 * There are some weird things to look out for here. The uio_offset 2830 * field is either 0 or it is the offset returned from a previous 2831 * readdir. It is an opaque value used by the server to find the 2832 * correct directory block to read. The count field is the number 2833 * of blocks to read on the server. This is advisory only, the server 2834 * may return only one block's worth of entries. Entries may be compressed 2835 * on the server. 
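* An illustrative exchange (the cookie is opaque to the client):
*
*	uiop->uio_offset = 0;		first READDIR of the directory
*	...server returns a block of entries plus the next cookie...
*	uiop->uio_offset = cookie;	the next call resumes from there
*
* The client never interprets the cookie; it simply stores the value
* returned by the server and hands it back on the following request.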
2836 */ 2837 static int 2838 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 2839 { 2840 int error; 2841 size_t count; 2842 rnode_t *rp; 2843 rddir_cache *rdc; 2844 rddir_cache *nrdc; 2845 rddir_cache *rrdc; 2846 #ifdef DEBUG 2847 int missed; 2848 #endif 2849 rddir_cache srdc; 2850 avl_index_t where; 2851 2852 rp = VTOR(vp); 2853 2854 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2855 if (nfs_zone() != VTOMI(vp)->mi_zone) 2856 return (EIO); 2857 /* 2858 * Make sure that the directory cache is valid. 2859 */ 2860 if (HAVE_RDDIR_CACHE(rp)) { 2861 if (nfs_disable_rddir_cache) { 2862 /* 2863 * Setting nfs_disable_rddir_cache in /etc/system 2864 * allows interoperability with servers that do not 2865 * properly update the attributes of directories. 2866 * Any cached information gets purged before an 2867 * access is made to it. 2868 */ 2869 nfs_purge_rddir_cache(vp); 2870 } else { 2871 error = nfs_validate_caches(vp, cr); 2872 if (error) 2873 return (error); 2874 } 2875 } 2876 2877 /* 2878 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an 2879 * RFS_READDIR request with rda_count set to more than 0x400. So 2880 * we reduce the request size here purely for compatibility. 2881 * 2882 * In general, this is no longer required. However, if a server 2883 * is discovered which cannot handle requests larger than 1024, 2884 * nfs_shrinkreaddir can be set to 1 to enable this backwards 2885 * compatibility. 2886 * 2887 * In any case, the request size is limited to NFS_MAXDATA bytes. 2888 */ 2889 count = MIN(uiop->uio_iov->iov_len, 2890 nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA); 2891 2892 nrdc = NULL; 2893 #ifdef DEBUG 2894 missed = 0; 2895 #endif 2896 top: 2897 /* 2898 * Short circuit the last readdir, which always returns 0 bytes. 2899 * This can be done after the directory has been read through 2900 * completely at least once. This will set r_direof, which 2901 * can be used to find the value of the last cookie. 2902 */ 2903 mutex_enter(&rp->r_statelock); 2904 if (rp->r_direof != NULL && 2905 uiop->uio_offset == rp->r_direof->nfs_ncookie) { 2906 mutex_exit(&rp->r_statelock); 2907 #ifdef DEBUG 2908 nfs_readdir_cache_shorts++; 2909 #endif 2910 if (eofp) 2911 *eofp = 1; 2912 if (nrdc != NULL) 2913 rddir_cache_rele(nrdc); 2914 return (0); 2915 } 2916 /* 2917 * Look for a cache entry. Cache entries are identified 2918 * by the NFS cookie value and the byte count requested. 2919 */ 2920 srdc.nfs_cookie = uiop->uio_offset; 2921 srdc.buflen = count; 2922 rdc = avl_find(&rp->r_dir, &srdc, &where); 2923 if (rdc != NULL) { 2924 rddir_cache_hold(rdc); 2925 /* 2926 * If the cache entry is in the process of being 2927 * filled in, wait until this completes. The 2928 * RDDIRWAIT bit is set to indicate that someone 2929 * is waiting, and when the thread currently 2930 * filling the entry is done, it should do a 2931 * cv_broadcast to wake up all of the threads 2932 * waiting for it to finish. 2933 */ 2934 if (rdc->flags & RDDIR) { 2935 nfs_rw_exit(&rp->r_rwlock); 2936 rdc->flags |= RDDIRWAIT; 2937 #ifdef DEBUG 2938 nfs_readdir_cache_waits++; 2939 #endif 2940 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) { 2941 /* 2942 * We got interrupted, probably 2943 * the user typed ^C or an alarm 2944 * fired. We free the new entry 2945 * if we allocated one.
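* On a normal wakeup we instead drop our hold and retry from the
* top, since the entry may have been filled, failed, or flushed
* while we were asleep.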
2946 */ 2947 mutex_exit(&rp->r_statelock); 2948 (void) nfs_rw_enter_sig(&rp->r_rwlock, 2949 RW_READER, FALSE); 2950 rddir_cache_rele(rdc); 2951 if (nrdc != NULL) 2952 rddir_cache_rele(nrdc); 2953 return (EINTR); 2954 } 2955 mutex_exit(&rp->r_statelock); 2956 (void) nfs_rw_enter_sig(&rp->r_rwlock, 2957 RW_READER, FALSE); 2958 rddir_cache_rele(rdc); 2959 goto top; 2960 } 2961 /* 2962 * Check to see if a readdir is required to 2963 * fill the entry. If so, mark this entry 2964 * as being filled, remove our reference, 2965 * and branch to the code to fill the entry. 2966 */ 2967 if (rdc->flags & RDDIRREQ) { 2968 rdc->flags &= ~RDDIRREQ; 2969 rdc->flags |= RDDIR; 2970 if (nrdc != NULL) 2971 rddir_cache_rele(nrdc); 2972 nrdc = rdc; 2973 mutex_exit(&rp->r_statelock); 2974 goto bottom; 2975 } 2976 #ifdef DEBUG 2977 if (!missed) 2978 nfs_readdir_cache_hits++; 2979 #endif 2980 /* 2981 * If an error occurred while attempting 2982 * to fill the cache entry, just return it. 2983 */ 2984 if (rdc->error) { 2985 error = rdc->error; 2986 mutex_exit(&rp->r_statelock); 2987 rddir_cache_rele(rdc); 2988 if (nrdc != NULL) 2989 rddir_cache_rele(nrdc); 2990 return (error); 2991 } 2992 2993 /* 2994 * The cache entry is complete and good; 2995 * copyout the dirent structs to the calling 2996 * thread. 2997 */ 2998 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 2999 3000 /* 3001 * If no error occurred during the copyout, 3002 * update the offset in the uio struct to 3003 * contain the value of the next cookie 3004 * and set the eof value appropriately. 3005 */ 3006 if (!error) { 3007 uiop->uio_offset = rdc->nfs_ncookie; 3008 if (eofp) 3009 *eofp = rdc->eof; 3010 } 3011 3012 /* 3013 * Decide whether to do readahead. Don't if 3014 * we have already read to the end of the directory. 3015 */ 3016 if (rdc->eof) { 3017 rp->r_direof = rdc; 3018 mutex_exit(&rp->r_statelock); 3019 rddir_cache_rele(rdc); 3020 if (nrdc != NULL) 3021 rddir_cache_rele(nrdc); 3022 return (error); 3023 } 3024 3025 /* 3026 * Check to see whether we found an entry 3027 * for the readahead. If so, we don't need 3028 * to do anything further, so free the new 3029 * entry if one was allocated. Otherwise, 3030 * allocate a new entry, add it to the cache, 3031 * and then initiate an asynchronous readdir 3032 * operation to fill it. 3033 */ 3034 srdc.nfs_cookie = rdc->nfs_ncookie; 3035 srdc.buflen = count; 3036 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3037 if (rrdc != NULL) { 3038 if (nrdc != NULL) 3039 rddir_cache_rele(nrdc); 3040 } else { 3041 if (nrdc != NULL) 3042 rrdc = nrdc; 3043 else { 3044 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3045 } 3046 if (rrdc != NULL) { 3047 rrdc->nfs_cookie = rdc->nfs_ncookie; 3048 rrdc->buflen = count; 3049 avl_insert(&rp->r_dir, rrdc, where); 3050 rddir_cache_hold(rrdc); 3051 mutex_exit(&rp->r_statelock); 3052 rddir_cache_rele(rdc); 3053 #ifdef DEBUG 3054 nfs_readdir_readahead++; 3055 #endif 3056 nfs_async_readdir(vp, rrdc, cr, nfsreaddir); 3057 return (error); 3058 } 3059 } 3060 3061 mutex_exit(&rp->r_statelock); 3062 rddir_cache_rele(rdc); 3063 return (error); 3064 } 3065 3066 /* 3067 * Didn't find an entry in the cache. Construct a new empty 3068 * entry and link it into the cache. Other processes attempting 3069 * to access this entry will need to wait until it is filled in. 3070 * 3071 * Since kmem_alloc may block, another pass through the cache 3072 * will need to be taken to make sure that another process 3073 * hasn't already added an entry to the cache for this request.
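* The allocate-then-recheck pattern used here, in outline:
*
*	drop r_statelock
*	nrdc = rddir_cache_alloc(KM_SLEEP);	may block
*	goto top: retake r_statelock and search the AVL tree again
*	insert nrdc only if no other thread added a matching entry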
3074 */ 3075 if (nrdc == NULL) { 3076 mutex_exit(&rp->r_statelock); 3077 nrdc = rddir_cache_alloc(KM_SLEEP); 3078 nrdc->nfs_cookie = uiop->uio_offset; 3079 nrdc->buflen = count; 3080 goto top; 3081 } 3082 3083 /* 3084 * Add this entry to the cache. 3085 */ 3086 avl_insert(&rp->r_dir, nrdc, where); 3087 rddir_cache_hold(nrdc); 3088 mutex_exit(&rp->r_statelock); 3089 3090 bottom: 3091 #ifdef DEBUG 3092 missed = 1; 3093 nfs_readdir_cache_misses++; 3094 #endif 3095 /* 3096 * Do the readdir. 3097 */ 3098 error = nfsreaddir(vp, nrdc, cr); 3099 3100 /* 3101 * If this operation failed, just return the error which occurred. 3102 */ 3103 if (error != 0) 3104 return (error); 3105 3106 /* 3107 * Since the RPC operation will have taken some time and blocked 3108 * this process, another pass through the cache will need to be 3109 * taken to find the correct cache entry. It is possible that 3110 * the correct cache entry will not be there (although one was 3111 * added) because the directory changed during the RPC operation 3112 * and the readdir cache was flushed. In this case, just start 3113 * over. It is hoped that this will not happen too often... :-) 3114 */ 3115 nrdc = NULL; 3116 goto top; 3117 /* NOTREACHED */ 3118 } 3119 3120 static int 3121 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3122 { 3123 int error; 3124 struct nfsrddirargs rda; 3125 struct nfsrddirres rd; 3126 rnode_t *rp; 3127 mntinfo_t *mi; 3128 uint_t count; 3129 int douprintf; 3130 failinfo_t fi, *fip; 3131 3132 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3133 count = rdc->buflen; 3134 3135 rp = VTOR(vp); 3136 mi = VTOMI(vp); 3137 3138 rda.rda_fh = *VTOFH(vp); 3139 rda.rda_offset = rdc->nfs_cookie; 3140 3141 /* 3142 * NFS client failover support: 3143 * suppress failover unless we have a zero cookie. 3144 */ 3145 if (rdc->nfs_cookie == (off_t)0) { 3146 fi.vp = vp; 3147 fi.fhp = (caddr_t)&rda.rda_fh; 3148 fi.copyproc = nfscopyfh; 3149 fi.lookupproc = nfslookup; 3150 fi.xattrdirproc = acl_getxattrdir2; 3151 fip = &fi; 3152 } else { 3153 fip = NULL; 3154 } 3155 3156 rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3157 rd.rd_size = count; 3158 rd.rd_offset = rda.rda_offset; 3159 3160 douprintf = 1; 3161 3162 if (mi->mi_io_kstats) { 3163 mutex_enter(&mi->mi_lock); 3164 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3165 mutex_exit(&mi->mi_lock); 3166 } 3167 3168 do { 3169 rda.rda_count = MIN(count, mi->mi_curread); 3170 error = rfs2call(mi, RFS_READDIR, 3171 xdr_rddirargs, (caddr_t)&rda, 3172 xdr_getrddirres, (caddr_t)&rd, cr, 3173 &douprintf, &rd.rd_status, 0, fip); 3174 } while (error == ENFS_TRYAGAIN); 3175 3176 if (mi->mi_io_kstats) { 3177 mutex_enter(&mi->mi_lock); 3178 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3179 mutex_exit(&mi->mi_lock); 3180 } 3181 3182 /* 3183 * Since we are actually doing a READDIR RPC, we must have 3184 * exclusive access to the cache entry being filled. Thus, 3185 * it is safe to update all fields except for the flags 3186 * field. The r_statelock in the rnode must be held to 3187 * prevent two different threads from simultaneously 3188 * attempting to update the flags field. This can happen 3189 * if we are turning off RDDIR and the other thread is 3190 * trying to set RDDIRWAIT. 3191 */ 3192 ASSERT(rdc->flags & RDDIR); 3193 if (!error) { 3194 error = geterrno(rd.rd_status); 3195 if (!error) { 3196 rdc->nfs_ncookie = rd.rd_offset; 3197 rdc->eof = rd.rd_eof ?
1 : 0; 3198 rdc->entlen = rd.rd_size; 3199 ASSERT(rdc->entlen <= rdc->buflen); 3200 #ifdef DEBUG 3201 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, 3202 KM_SLEEP); 3203 #else 3204 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3205 #endif 3206 bcopy(rd.rd_entries, rdc->entries, rdc->entlen); 3207 rdc->error = 0; 3208 if (mi->mi_io_kstats) { 3209 mutex_enter(&mi->mi_lock); 3210 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3211 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 3212 rd.rd_size; 3213 mutex_exit(&mi->mi_lock); 3214 } 3215 } else { 3216 PURGE_STALE_FH(error, vp, cr); 3217 } 3218 } 3219 if (error) { 3220 rdc->entries = NULL; 3221 rdc->error = error; 3222 } 3223 kmem_free(rd.rd_entries, rdc->buflen); 3224 3225 mutex_enter(&rp->r_statelock); 3226 rdc->flags &= ~RDDIR; 3227 if (rdc->flags & RDDIRWAIT) { 3228 rdc->flags &= ~RDDIRWAIT; 3229 cv_broadcast(&rdc->cv); 3230 } 3231 if (error) 3232 rdc->flags |= RDDIRREQ; 3233 mutex_exit(&rp->r_statelock); 3234 3235 rddir_cache_rele(rdc); 3236 3237 return (error); 3238 } 3239 3240 #ifdef DEBUG 3241 static int nfs_bio_do_stop = 0; 3242 #endif 3243 3244 static int 3245 nfs_bio(struct buf *bp, cred_t *cr) 3246 { 3247 rnode_t *rp = VTOR(bp->b_vp); 3248 int count; 3249 int error; 3250 cred_t *cred; 3251 uint_t offset; 3252 3253 DTRACE_IO1(start, struct buf *, bp); 3254 3255 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 3256 offset = dbtob(bp->b_blkno); 3257 3258 if (bp->b_flags & B_READ) { 3259 mutex_enter(&rp->r_statelock); 3260 if (rp->r_cred != NULL) { 3261 cred = rp->r_cred; 3262 crhold(cred); 3263 } else { 3264 rp->r_cred = cr; 3265 crhold(cr); 3266 cred = cr; 3267 crhold(cred); 3268 } 3269 mutex_exit(&rp->r_statelock); 3270 read_again: 3271 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr, 3272 offset, bp->b_bcount, &bp->b_resid, cred); 3273 crfree(cred); 3274 if (!error) { 3275 if (bp->b_resid) { 3276 /* 3277 * Didn't get it all because we hit EOF, 3278 * zero all the memory beyond the EOF. 3279 */ 3280 /* bzero(rdaddr + */ 3281 bzero(bp->b_un.b_addr + 3282 bp->b_bcount - bp->b_resid, bp->b_resid); 3283 } 3284 mutex_enter(&rp->r_statelock); 3285 if (bp->b_resid == bp->b_bcount && 3286 offset >= rp->r_size) { 3287 /* 3288 * We didn't read anything at all as we are 3289 * past EOF. Return an error indicator back 3290 * but don't destroy the pages (yet). 
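* NFS_EOF is a private sentinel rather than a real errno; callers
* such as nfs_getapage() use it to tell end-of-file apart from a
* genuine failure.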
3291 */ 3292 error = NFS_EOF; 3293 } 3294 mutex_exit(&rp->r_statelock); 3295 } else if (error == EACCES) { 3296 mutex_enter(&rp->r_statelock); 3297 if (cred != cr) { 3298 if (rp->r_cred != NULL) 3299 crfree(rp->r_cred); 3300 rp->r_cred = cr; 3301 crhold(cr); 3302 cred = cr; 3303 crhold(cred); 3304 mutex_exit(&rp->r_statelock); 3305 goto read_again; 3306 } 3307 mutex_exit(&rp->r_statelock); 3308 } 3309 } else { 3310 if (!(rp->r_flags & RSTALE)) { 3311 mutex_enter(&rp->r_statelock); 3312 if (rp->r_cred != NULL) { 3313 cred = rp->r_cred; 3314 crhold(cred); 3315 } else { 3316 rp->r_cred = cr; 3317 crhold(cr); 3318 cred = cr; 3319 crhold(cred); 3320 } 3321 mutex_exit(&rp->r_statelock); 3322 write_again: 3323 mutex_enter(&rp->r_statelock); 3324 count = MIN(bp->b_bcount, rp->r_size - offset); 3325 mutex_exit(&rp->r_statelock); 3326 if (count < 0) 3327 cmn_err(CE_PANIC, "nfs_bio: write count < 0"); 3328 #ifdef DEBUG 3329 if (count == 0) { 3330 zcmn_err(getzoneid(), CE_WARN, 3331 "nfs_bio: zero length write at %d", 3332 offset); 3333 nfs_printfhandle(&rp->r_fh); 3334 if (nfs_bio_do_stop) 3335 debug_enter("nfs_bio"); 3336 } 3337 #endif 3338 error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset, 3339 count, cred); 3340 if (error == EACCES) { 3341 mutex_enter(&rp->r_statelock); 3342 if (cred != cr) { 3343 if (rp->r_cred != NULL) 3344 crfree(rp->r_cred); 3345 rp->r_cred = cr; 3346 crhold(cr); 3347 crfree(cred); 3348 cred = cr; 3349 crhold(cred); 3350 mutex_exit(&rp->r_statelock); 3351 goto write_again; 3352 } 3353 mutex_exit(&rp->r_statelock); 3354 } 3355 bp->b_error = error; 3356 if (error && error != EINTR) { 3357 /* 3358 * Don't print EDQUOT errors on the console. 3359 * Don't print asynchronous EACCES errors. 3360 * Don't print EFBIG errors. 3361 * Print all other write errors. 3362 */ 3363 if (error != EDQUOT && error != EFBIG && 3364 (error != EACCES || 3365 !(bp->b_flags & B_ASYNC))) 3366 nfs_write_error(bp->b_vp, error, cred); 3367 /* 3368 * Update r_error and r_flags as appropriate. 3369 * If the error was ESTALE, then mark the 3370 * rnode as not being writeable and save 3371 * the error status. Otherwise, save any 3372 * errors which occur from asynchronous 3373 * page invalidations. Any errors occurring 3374 * from other operations should be saved 3375 * by the caller. 
3376 */ 3377 mutex_enter(&rp->r_statelock); 3378 if (error == ESTALE) { 3379 rp->r_flags |= RSTALE; 3380 if (!rp->r_error) 3381 rp->r_error = error; 3382 } else if (!rp->r_error && 3383 (bp->b_flags & 3384 (B_INVAL|B_FORCE|B_ASYNC)) == 3385 (B_INVAL|B_FORCE|B_ASYNC)) { 3386 rp->r_error = error; 3387 } 3388 mutex_exit(&rp->r_statelock); 3389 } 3390 crfree(cred); 3391 } else 3392 error = rp->r_error; 3393 } 3394 3395 if (error != 0 && error != NFS_EOF) 3396 bp->b_flags |= B_ERROR; 3397 3398 DTRACE_IO1(done, struct buf *, bp); 3399 3400 return (error); 3401 } 3402 3403 static int 3404 nfs_fid(vnode_t *vp, fid_t *fidp) 3405 { 3406 struct nfs_fid *fp; 3407 rnode_t *rp; 3408 3409 rp = VTOR(vp); 3410 3411 if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) { 3412 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short); 3413 return (ENOSPC); 3414 } 3415 fp = (struct nfs_fid *)fidp; 3416 fp->nf_pad = 0; 3417 fp->nf_len = sizeof (struct nfs_fid) - sizeof (short); 3418 bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE); 3419 return (0); 3420 } 3421 3422 /* ARGSUSED2 */ 3423 static int 3424 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3425 { 3426 rnode_t *rp = VTOR(vp); 3427 3428 if (!write_lock) { 3429 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3430 return (V_WRITELOCK_FALSE); 3431 } 3432 3433 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) { 3434 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3435 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp)) 3436 return (V_WRITELOCK_FALSE); 3437 nfs_rw_exit(&rp->r_rwlock); 3438 } 3439 3440 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 3441 return (V_WRITELOCK_TRUE); 3442 } 3443 3444 /* ARGSUSED */ 3445 static void 3446 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3447 { 3448 rnode_t *rp = VTOR(vp); 3449 3450 nfs_rw_exit(&rp->r_rwlock); 3451 } 3452 3453 /* ARGSUSED */ 3454 static int 3455 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 3456 { 3457 3458 /* 3459 * Because we stuff the readdir cookie into the offset field, 3460 * someone may attempt to do an lseek with the cookie, which 3461 * we want to succeed. 3462 */ 3463 if (vp->v_type == VDIR) 3464 return (0); 3465 if (*noffp < 0 || *noffp > MAXOFF32_T) 3466 return (EINVAL); 3467 return (0); 3468 } 3469 3470 /* 3471 * Number of NFS_MAXDATA blocks to read ahead, 3472 * optimized for 100 base-T. 3473 */ 3474 static int nfs_nra = 4; 3475 3476 #ifdef DEBUG 3477 static int nfs_lostpage = 0; /* number of times we lost original page */ 3478 #endif 3479 3480 /* 3481 * Return all the pages from [off..off+len) in the file 3482 */ 3483 static int 3484 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 3485 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3486 enum seg_rw rw, cred_t *cr) 3487 { 3488 rnode_t *rp; 3489 int error; 3490 mntinfo_t *mi; 3491 3492 if (vp->v_flag & VNOMAP) 3493 return (ENOSYS); 3494 3495 ASSERT(off <= MAXOFF32_T); 3496 if (nfs_zone() != VTOMI(vp)->mi_zone) 3497 return (EIO); 3498 if (protp != NULL) 3499 *protp = PROT_ALL; 3500 3501 /* 3502 * Now validate that the caches are up to date. 3503 */ 3504 error = nfs_validate_caches(vp, cr); 3505 if (error) 3506 return (error); 3507 3508 rp = VTOR(vp); 3509 mi = VTOMI(vp); 3510 retry: 3511 mutex_enter(&rp->r_statelock); 3512 3513 /* 3514 * Don't create dirty pages faster than they 3515 * can be cleaned so that the system doesn't 3516 * get imbalanced.
If the async queue is 3517 * maxed out, then wait for it to drain before 3518 * creating more dirty pages. Also, wait for 3519 * any threads doing pagewalks in the vop_getattr 3520 * entry points so that they don't block for 3521 * long periods. 3522 */ 3523 if (rw == S_CREATE) { 3524 while ((mi->mi_max_threads != 0 && 3525 rp->r_awcount > 2 * mi->mi_max_threads) || 3526 rp->r_gcount > 0) 3527 cv_wait(&rp->r_cv, &rp->r_statelock); 3528 } 3529 3530 /* 3531 * If we are getting called as a side effect of an nfs_write() 3532 * operation, the local file size might not be extended yet. 3533 * In this case we want to be able to return pages of zeroes. 3534 */ 3535 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 3536 mutex_exit(&rp->r_statelock); 3537 return (EFAULT); /* beyond EOF */ 3538 } 3539 3540 mutex_exit(&rp->r_statelock); 3541 3542 if (len <= PAGESIZE) { 3543 error = nfs_getapage(vp, off, len, protp, pl, plsz, 3544 seg, addr, rw, cr); 3545 } else { 3546 error = pvn_getpages(nfs_getapage, vp, off, len, protp, 3547 pl, plsz, seg, addr, rw, cr); 3548 } 3549 3550 switch (error) { 3551 case NFS_EOF: 3552 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 3553 goto retry; 3554 case ESTALE: 3555 PURGE_STALE_FH(error, vp, cr); 3556 } 3557 3558 return (error); 3559 } 3560 3561 /* 3562 * Called from pvn_getpages or nfs_getpage to get a particular page. 3563 */ 3564 /* ARGSUSED */ 3565 static int 3566 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 3567 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3568 enum seg_rw rw, cred_t *cr) 3569 { 3570 rnode_t *rp; 3571 uint_t bsize; 3572 struct buf *bp; 3573 page_t *pp; 3574 u_offset_t lbn; 3575 u_offset_t io_off; 3576 u_offset_t blkoff; 3577 u_offset_t rablkoff; 3578 size_t io_len; 3579 uint_t blksize; 3580 int error; 3581 int readahead; 3582 int readahead_issued = 0; 3583 int ra_window; /* readahead window */ 3584 page_t *pagefound; 3585 3586 if (nfs_zone() != VTOMI(vp)->mi_zone) 3587 return (EIO); 3588 rp = VTOR(vp); 3589 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3590 3591 reread: 3592 bp = NULL; 3593 pp = NULL; 3594 pagefound = NULL; 3595 3596 if (pl != NULL) 3597 pl[0] = NULL; 3598 3599 error = 0; 3600 lbn = off / bsize; 3601 blkoff = lbn * bsize; 3602 3603 /* 3604 * Queueing up the readahead before doing the synchronous read 3605 * results in a significant increase in read throughput because 3606 * of the increased parallelism between the async threads and 3607 * the process context. 3608 */ 3609 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 3610 rw != S_CREATE && 3611 !(vp->v_flag & VNOCACHE)) { 3612 mutex_enter(&rp->r_statelock); 3613 3614 /* 3615 * Calculate the number of readaheads to do. 3616 * a) No readaheads at offset = 0. 3617 * b) Do maximum(nfs_nra) readaheads when the readahead 3618 * window is closed. 3619 * c) Do between 1 and (nfs_nra - 1) readaheads depending 3620 * upon how far open or closed the readahead window is. 3621 * d) No readaheads if rp->r_nextr is not within the scope 3622 * of the readahead window (random i/o).
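* Worked example, assuming nfs_nra == 4: a strictly sequential
* reader arrives with blkoff == rp->r_nextr, the window is closed,
* and 4 readaheads are queued. If instead rp->r_nextr - blkoff ==
* 2 * bsize, then ra_window == 2 and only 4 - 2 == 2 readaheads
* are issued.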
3623 */ 3624 3625 if (off == 0) 3626 readahead = 0; 3627 else if (blkoff == rp->r_nextr) 3628 readahead = nfs_nra; 3629 else if (rp->r_nextr > blkoff && 3630 ((ra_window = (rp->r_nextr - blkoff) / bsize) 3631 <= (nfs_nra - 1))) 3632 readahead = nfs_nra - ra_window; 3633 else 3634 readahead = 0; 3635 3636 rablkoff = rp->r_nextr; 3637 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 3638 mutex_exit(&rp->r_statelock); 3639 if (nfs_async_readahead(vp, rablkoff + bsize, 3640 addr + (rablkoff + bsize - off), seg, cr, 3641 nfs_readahead) < 0) { 3642 mutex_enter(&rp->r_statelock); 3643 break; 3644 } 3645 readahead--; 3646 rablkoff += bsize; 3647 /* 3648 * Indicate that we did a readahead so 3649 * readahead offset is not updated 3650 * by the synchronous read below. 3651 */ 3652 readahead_issued = 1; 3653 mutex_enter(&rp->r_statelock); 3654 /* 3655 * set readahead offset to 3656 * offset of last async readahead 3657 * request. 3658 */ 3659 rp->r_nextr = rablkoff; 3660 } 3661 mutex_exit(&rp->r_statelock); 3662 } 3663 3664 again: 3665 if ((pagefound = page_exists(vp, off)) == NULL) { 3666 if (pl == NULL) { 3667 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 3668 nfs_readahead); 3669 } else if (rw == S_CREATE) { 3670 /* 3671 * Block for this page is not allocated, or the offset 3672 * is beyond the current allocation size, or we're 3673 * allocating a swap slot and the page was not found, 3674 * so allocate it and return a zero page. 3675 */ 3676 if ((pp = page_create_va(vp, off, 3677 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 3678 cmn_err(CE_PANIC, "nfs_getapage: page_create"); 3679 io_len = PAGESIZE; 3680 mutex_enter(&rp->r_statelock); 3681 rp->r_nextr = off + PAGESIZE; 3682 mutex_exit(&rp->r_statelock); 3683 } else { 3684 /* 3685 * Need to go to server to get a BLOCK, exception to 3686 * that being while reading at offset = 0 or doing 3687 * random i/o, in that case read only a PAGE. 3688 */ 3689 mutex_enter(&rp->r_statelock); 3690 if (blkoff < rp->r_size && 3691 blkoff + bsize >= rp->r_size) { 3692 /* 3693 * If only a block or less is left in 3694 * the file, read all that is remaining. 3695 */ 3696 if (rp->r_size <= off) { 3697 /* 3698 * Trying to access beyond EOF, 3699 * set up to get at least one page. 3700 */ 3701 blksize = off + PAGESIZE - blkoff; 3702 } else 3703 blksize = rp->r_size - blkoff; 3704 } else if ((off == 0) || 3705 (off != rp->r_nextr && !readahead_issued)) { 3706 blksize = PAGESIZE; 3707 blkoff = off; /* block = page here */ 3708 } else 3709 blksize = bsize; 3710 mutex_exit(&rp->r_statelock); 3711 3712 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3713 &io_len, blkoff, blksize, 0); 3714 3715 /* 3716 * Some other thread has entered the page, 3717 * so just use it. 3718 */ 3719 if (pp == NULL) 3720 goto again; 3721 3722 /* 3723 * Now round the request size up to page boundaries. 3724 * This ensures that the entire page will be 3725 * initialized to zeroes if EOF is encountered. 3726 */ 3727 io_len = ptob(btopr(io_len)); 3728 3729 bp = pageio_setup(pp, io_len, vp, B_READ); 3730 ASSERT(bp != NULL); 3731 3732 /* 3733 * pageio_setup should have set b_addr to 0. This 3734 * is correct since we want to do I/O on a page 3735 * boundary. bp_mapin will use this addr to calculate 3736 * an offset, and then set b_addr to the kernel virtual 3737 * address it allocated for us. 
3738 */ 3739 ASSERT(bp->b_un.b_addr == 0); 3740 3741 bp->b_edev = 0; 3742 bp->b_dev = 0; 3743 bp->b_lblkno = lbtodb(io_off); 3744 bp->b_file = vp; 3745 bp->b_offset = (offset_t)off; 3746 bp_mapin(bp); 3747 3748 /* 3749 * If doing a write beyond what we believe is EOF, 3750 * don't bother trying to read the pages from the 3751 * server, we'll just zero the pages here. We 3752 * don't check that the rw flag is S_WRITE here 3753 * because some implementations may attempt a 3754 * read access to the buffer before copying data. 3755 */ 3756 mutex_enter(&rp->r_statelock); 3757 if (io_off >= rp->r_size && seg == segkmap) { 3758 mutex_exit(&rp->r_statelock); 3759 bzero(bp->b_un.b_addr, io_len); 3760 } else { 3761 mutex_exit(&rp->r_statelock); 3762 error = nfs_bio(bp, cr); 3763 } 3764 3765 /* 3766 * Unmap the buffer before freeing it. 3767 */ 3768 bp_mapout(bp); 3769 pageio_done(bp); 3770 3771 if (error == NFS_EOF) { 3772 /* 3773 * If doing a write system call just return 3774 * zeroed pages, else user tried to get pages 3775 * beyond EOF, return error. We don't check 3776 * that the rw flag is S_WRITE here because 3777 * some implementations may attempt a read 3778 * access to the buffer before copying data. 3779 */ 3780 if (seg == segkmap) 3781 error = 0; 3782 else 3783 error = EFAULT; 3784 } 3785 3786 if (!readahead_issued && !error) { 3787 mutex_enter(&rp->r_statelock); 3788 rp->r_nextr = io_off + io_len; 3789 mutex_exit(&rp->r_statelock); 3790 } 3791 } 3792 } 3793 3794 out: 3795 if (pl == NULL) 3796 return (error); 3797 3798 if (error) { 3799 if (pp != NULL) 3800 pvn_read_done(pp, B_ERROR); 3801 return (error); 3802 } 3803 3804 if (pagefound) { 3805 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 3806 3807 /* 3808 * Page exists in the cache, acquire the appropriate lock. 3809 * If this fails, start all over again. 3810 */ 3811 if ((pp = page_lookup(vp, off, se)) == NULL) { 3812 #ifdef DEBUG 3813 nfs_lostpage++; 3814 #endif 3815 goto reread; 3816 } 3817 pl[0] = pp; 3818 pl[1] = NULL; 3819 return (0); 3820 } 3821 3822 if (pp != NULL) 3823 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3824 3825 return (error); 3826 } 3827 3828 static void 3829 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 3830 cred_t *cr) 3831 { 3832 int error; 3833 page_t *pp; 3834 u_offset_t io_off; 3835 size_t io_len; 3836 struct buf *bp; 3837 uint_t bsize, blksize; 3838 rnode_t *rp = VTOR(vp); 3839 3840 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3841 3842 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3843 3844 mutex_enter(&rp->r_statelock); 3845 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 3846 /* 3847 * If less than a block left in file read less 3848 * than a block. 3849 */ 3850 blksize = rp->r_size - blkoff; 3851 } else 3852 blksize = bsize; 3853 mutex_exit(&rp->r_statelock); 3854 3855 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 3856 &io_off, &io_len, blkoff, blksize, 1); 3857 /* 3858 * The isra flag passed to the kluster function is 1, we may have 3859 * gotten a return value of NULL for a variety of reasons (# of free 3860 * pages < minfree, someone entered the page on the vnode etc). In all 3861 * cases, we want to punt on the readahead. 3862 */ 3863 if (pp == NULL) 3864 return; 3865 3866 /* 3867 * Now round the request size up to page boundaries. 3868 * This ensures that the entire page will be 3869 * initialized to zeroes if EOF is encountered. 
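/*
 * For example, with 4K pages an io_len of 5000 bytes becomes
 * ptob(btopr(5000)) == 8192: btopr() rounds up to two pages and
 * ptob() converts pages back to bytes.
 */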
3870 */ 3871 io_len = ptob(btopr(io_len)); 3872 3873 bp = pageio_setup(pp, io_len, vp, B_READ); 3874 ASSERT(bp != NULL); 3875 3876 /* 3877 * pageio_setup should have set b_addr to 0. This is correct since 3878 * we want to do I/O on a page boundary. bp_mapin() will use this addr 3879 * to calculate an offset, and then set b_addr to the kernel virtual 3880 * address it allocated for us. 3881 */ 3882 ASSERT(bp->b_un.b_addr == 0); 3883 3884 bp->b_edev = 0; 3885 bp->b_dev = 0; 3886 bp->b_lblkno = lbtodb(io_off); 3887 bp->b_file = vp; 3888 bp->b_offset = (offset_t)blkoff; 3889 bp_mapin(bp); 3890 3891 /* 3892 * If doing a write beyond what we believe is EOF, don't bother trying 3893 * to read the pages from the server, we'll just zero the pages here. 3894 * We don't check that the rw flag is S_WRITE here because some 3895 * implementations may attempt a read access to the buffer before 3896 * copying data. 3897 */ 3898 mutex_enter(&rp->r_statelock); 3899 if (io_off >= rp->r_size && seg == segkmap) { 3900 mutex_exit(&rp->r_statelock); 3901 bzero(bp->b_un.b_addr, io_len); 3902 error = 0; 3903 } else { 3904 mutex_exit(&rp->r_statelock); 3905 error = nfs_bio(bp, cr); 3906 if (error == NFS_EOF) 3907 error = 0; 3908 } 3909 3910 /* 3911 * Unmap the buffer before freeing it. 3912 */ 3913 bp_mapout(bp); 3914 pageio_done(bp); 3915 3916 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 3917 3918 /* 3919 * In case of error set readahead offset 3920 * to the lowest offset. 3921 * pvn_read_done() calls VN_DISPOSE to destroy the pages 3922 */ 3923 if (error && rp->r_nextr > io_off) { 3924 mutex_enter(&rp->r_statelock); 3925 if (rp->r_nextr > io_off) 3926 rp->r_nextr = io_off; 3927 mutex_exit(&rp->r_statelock); 3928 } 3929 } 3930 3931 /* 3932 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 3933 * If len == 0, do from off to EOF. 3934 * 3935 * The normal cases should be len == 0 && off == 0 (entire vp list), 3936 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 3937 * (from pageout). 3938 */ 3939 static int 3940 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 3941 { 3942 int error; 3943 rnode_t *rp; 3944 3945 ASSERT(cr != NULL); 3946 3947 /* 3948 * XXX - Why should this check be made here? 3949 */ 3950 if (vp->v_flag & VNOMAP) 3951 return (ENOSYS); 3952 3953 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 3954 return (0); 3955 3956 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 3957 return (EIO); 3958 ASSERT(off <= MAXOFF32_T); 3959 3960 rp = VTOR(vp); 3961 mutex_enter(&rp->r_statelock); 3962 rp->r_count++; 3963 mutex_exit(&rp->r_statelock); 3964 error = nfs_putpages(vp, off, len, flags, cr); 3965 mutex_enter(&rp->r_statelock); 3966 rp->r_count--; 3967 cv_broadcast(&rp->r_cv); 3968 mutex_exit(&rp->r_statelock); 3969 3970 return (error); 3971 } 3972 3973 /* 3974 * Write out a single page, possibly klustering adjacent dirty pages. 
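* ("Klustering" means letting pvn_write_kluster() gather the other
* dirty pages that fall in the same block so that, for instance, with
* 4K pages and an 8K block both pages of the block go to the server
* in a single write rather than two.)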
3975 */ 3976 int 3977 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 3978 int flags, cred_t *cr) 3979 { 3980 u_offset_t io_off; 3981 u_offset_t lbn_off; 3982 u_offset_t lbn; 3983 size_t io_len; 3984 uint_t bsize; 3985 int error; 3986 rnode_t *rp; 3987 3988 ASSERT(!vn_is_readonly(vp)); 3989 ASSERT(pp != NULL); 3990 ASSERT(cr != NULL); 3991 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 3992 3993 rp = VTOR(vp); 3994 ASSERT(rp->r_count > 0); 3995 3996 ASSERT(pp->p_offset <= MAXOFF32_T); 3997 3998 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3999 lbn = pp->p_offset / bsize; 4000 lbn_off = lbn * bsize; 4001 4002 /* 4003 * Find a kluster that fits in one block, or in 4004 * one page if pages are bigger than blocks. If 4005 * there is less file space allocated than a whole 4006 * page, we'll shorten the i/o request below. 4007 */ 4008 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4009 roundup(bsize, PAGESIZE), flags); 4010 4011 /* 4012 * pvn_write_kluster shouldn't have returned a page with an offset 4013 * behind the original page we were given. Verify that. 4014 */ 4015 ASSERT((pp->p_offset / bsize) >= lbn); 4016 4017 /* 4018 * Now pp will have the list of kept dirty pages marked for 4019 * write back. It will also handle invalidation and freeing 4020 * of pages that are not dirty. Check for page length rounding 4021 * problems. 4022 */ 4023 if (io_off + io_len > lbn_off + bsize) { 4024 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4025 io_len = lbn_off + bsize - io_off; 4026 } 4027 /* 4028 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4029 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4030 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4031 * progress and the r_size has not been made consistent with the 4032 * new size of the file. When the uiomove() completes the r_size is 4033 * updated and the RMODINPROGRESS flag is cleared. 4034 * 4035 * Without this handshaking, it is possible 4036 * that nfs(3)_bio() picks up the old value of r_size 4037 * before the uiomove() in writerp() completes. 4038 * This will result in the write through 4039 * nfs(3)_bio() being dropped. 4040 * 4041 * More precisely, there is a window between the time the uiomove() 4042 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4043 * operation intervenes in this window, the page will be picked up, 4044 * because it is dirty (it will be unlocked, unless it was 4045 * pagecreate'd). When the page is picked up as dirty, the dirty 4046 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4047 * checked. This will still be the old size. Therefore the page will 4048 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4049 * the page will be found to be clean and the write will be dropped. 4050 */ 4051 if (rp->r_flags & RMODINPROGRESS) { 4052 mutex_enter(&rp->r_statelock); 4053 if ((rp->r_flags & RMODINPROGRESS) && 4054 rp->r_modaddr + MAXBSIZE > io_off && 4055 rp->r_modaddr < io_off + io_len) { 4056 page_t *plist; 4057 /* 4058 * A write is in progress for this region of the file. 4059 * If we did not detect RMODINPROGRESS here then this 4060 * path through nfs_putapage() would eventually go to 4061 * nfs(3)_bio() and may not write out all of the data 4062 * in the pages. We end up losing data.
So we decide 4063 * to set the modified bit on each page in the page 4064 * list and mark the rnode with RDIRTY. This write 4065 * will be restarted at some later time. 4066 */ 4067 plist = pp; 4068 while (plist != NULL) { 4069 pp = plist; 4070 page_sub(&plist, pp); 4071 hat_setmod(pp); 4072 page_io_unlock(pp); 4073 page_unlock(pp); 4074 } 4075 rp->r_flags |= RDIRTY; 4076 mutex_exit(&rp->r_statelock); 4077 if (offp) 4078 *offp = io_off; 4079 if (lenp) 4080 *lenp = io_len; 4081 return (0); 4082 } 4083 mutex_exit(&rp->r_statelock); 4084 } 4085 4086 if (flags & B_ASYNC) { 4087 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr, 4088 nfs_sync_putapage); 4089 } else 4090 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr); 4091 4092 if (offp) 4093 *offp = io_off; 4094 if (lenp) 4095 *lenp = io_len; 4096 return (error); 4097 } 4098 4099 static int 4100 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4101 int flags, cred_t *cr) 4102 { 4103 int error; 4104 rnode_t *rp; 4105 4106 flags |= B_WRITE; 4107 4108 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4109 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4110 4111 rp = VTOR(vp); 4112 4113 if ((error == ENOSPC || error == EDQUOT || error == EACCES) && 4114 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 4115 if (!(rp->r_flags & ROUTOFSPACE)) { 4116 mutex_enter(&rp->r_statelock); 4117 rp->r_flags |= ROUTOFSPACE; 4118 mutex_exit(&rp->r_statelock); 4119 } 4120 flags |= B_ERROR; 4121 pvn_write_done(pp, flags); 4122 /* 4123 * If this was not an async thread, then try again to 4124 * write out the pages, but this time, also destroy 4125 * them whether or not the write is successful. This 4126 * will prevent memory from filling up with these 4127 * pages and destroying them is the only alternative 4128 * if they can't be written out. 4129 * 4130 * Don't do this if this is an async thread because 4131 * when the pages are unlocked in pvn_write_done, 4132 * some other thread could have come along, locked 4133 * them, and queued for an async thread. It would be 4134 * possible for all of the async threads to be tied 4135 * up waiting to lock the pages again and they would 4136 * all already be locked and waiting for an async 4137 * thread to handle them. Deadlock. 4138 */ 4139 if (!(flags & B_ASYNC)) { 4140 error = nfs_putpage(vp, io_off, io_len, 4141 B_INVAL | B_FORCE, cr); 4142 } 4143 } else { 4144 if (error) 4145 flags |= B_ERROR; 4146 else if (rp->r_flags & ROUTOFSPACE) { 4147 mutex_enter(&rp->r_statelock); 4148 rp->r_flags &= ~ROUTOFSPACE; 4149 mutex_exit(&rp->r_statelock); 4150 } 4151 pvn_write_done(pp, flags); 4152 } 4153 4154 return (error); 4155 } 4156 4157 static int 4158 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4159 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 4160 { 4161 struct segvn_crargs vn_a; 4162 int error; 4163 rnode_t *rp; 4164 struct vattr va; 4165 4166 if (nfs_zone() != VTOMI(vp)->mi_zone) 4167 return (EIO); 4168 4169 if (vp->v_flag & VNOMAP) 4170 return (ENOSYS); 4171 4172 if (off > MAXOFF32_T) 4173 return (EFBIG); 4174 4175 if (off < 0 || off + len < 0) 4176 return (ENXIO); 4177 4178 if (vp->v_type != VREG) 4179 return (ENODEV); 4180 4181 /* 4182 * If there is cached data and if close-to-open consistency 4183 * checking is not turned off and if the file system is not 4184 * mounted readonly, then force an over the wire getattr. 4185 * Otherwise, just invoke nfsgetattr to get a copy of the 4186 * attributes. 
The attribute cache will be used unless it 4187 * is timed out and if it is, then an over the wire getattr 4188 * will be issued. 4189 */ 4190 va.va_mask = AT_ALL; 4191 if (vn_has_cached_data(vp) && 4192 !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp)) 4193 error = nfs_getattr_otw(vp, &va, cr); 4194 else 4195 error = nfsgetattr(vp, &va, cr); 4196 if (error) 4197 return (error); 4198 4199 /* 4200 * Check to see if the vnode is currently marked as not cachable. 4201 * This means portions of the file are locked (through VOP_FRLOCK). 4202 * In this case the map request must be refused. We use 4203 * rp->r_lkserlock to avoid a race with concurrent lock requests. 4204 */ 4205 rp = VTOR(vp); 4206 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) 4207 return (EINTR); 4208 4209 if (vp->v_flag & VNOCACHE) { 4210 error = EAGAIN; 4211 goto done; 4212 } 4213 4214 /* 4215 * Don't allow concurrent locks and mapping if mandatory locking is 4216 * enabled. 4217 */ 4218 if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) && 4219 MANDLOCK(vp, va.va_mode)) { 4220 error = EAGAIN; 4221 goto done; 4222 } 4223 4224 as_rangelock(as); 4225 if (!(flags & MAP_FIXED)) { 4226 map_addr(addrp, len, off, 1, flags); 4227 if (*addrp == NULL) { 4228 as_rangeunlock(as); 4229 error = ENOMEM; 4230 goto done; 4231 } 4232 } else { 4233 /* 4234 * User specified address - blow away any previous mappings 4235 */ 4236 (void) as_unmap(as, *addrp, len); 4237 } 4238 4239 vn_a.vp = vp; 4240 vn_a.offset = off; 4241 vn_a.type = (flags & MAP_TYPE); 4242 vn_a.prot = (uchar_t)prot; 4243 vn_a.maxprot = (uchar_t)maxprot; 4244 vn_a.flags = (flags & ~MAP_TYPE); 4245 vn_a.cred = cr; 4246 vn_a.amp = NULL; 4247 vn_a.szc = 0; 4248 vn_a.lgrp_mem_policy_flags = 0; 4249 4250 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4251 as_rangeunlock(as); 4252 4253 done: 4254 nfs_rw_exit(&rp->r_lkserlock); 4255 return (error); 4256 } 4257 4258 /* ARGSUSED */ 4259 static int 4260 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4261 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 4262 { 4263 rnode_t *rp; 4264 4265 if (vp->v_flag & VNOMAP) 4266 return (ENOSYS); 4267 if (nfs_zone() != VTOMI(vp)->mi_zone) 4268 return (EIO); 4269 4270 /* 4271 * Need to hold rwlock while incrementing the mapcnt so that 4272 * mmap'ing can be serialized with writes so that the caching 4273 * can be handled correctly. 4274 */ 4275 rp = VTOR(vp); 4276 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 4277 return (EINTR); 4278 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 4279 nfs_rw_exit(&rp->r_rwlock); 4280 4281 return (0); 4282 } 4283 4284 static int 4285 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 4286 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 4287 { 4288 netobj lm_fh; 4289 int rc; 4290 u_offset_t start, end; 4291 rnode_t *rp; 4292 int error = 0, intr = INTR(vp); 4293 4294 /* check for valid cmd parameter */ 4295 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 4296 return (EINVAL); 4297 if (nfs_zone() != VTOMI(vp)->mi_zone) 4298 return (EIO); 4299 4300 /* Verify l_type. 
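 *
 * As a userland illustration (an editorial sketch, not code from this
 * file): a whole-file read lock arrives at this switch via fcntl(2).
 * The descriptor must have been opened for reading, or the FREAD
 * check below fails the request with EBADF.
 *
 *	struct flock fl;
 *	fl.l_type = F_RDLCK;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;			(zero length means to end of file)
 *	(void) fcntl(fd, F_SETLKW, &fl);  (F_SETLKW waits; F_SETLK does not)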
*/ 4301 switch (bfp->l_type) { 4302 case F_RDLCK: 4303 if (cmd != F_GETLK && !(flag & FREAD)) 4304 return (EBADF); 4305 break; 4306 case F_WRLCK: 4307 if (cmd != F_GETLK && !(flag & FWRITE)) 4308 return (EBADF); 4309 break; 4310 case F_UNLCK: 4311 intr = 0; 4312 break; 4313 4314 default: 4315 return (EINVAL); 4316 } 4317 4318 /* check the validity of the lock range */ 4319 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 4320 return (rc); 4321 if (rc = flk_check_lock_data(start, end, MAXOFF32_T)) 4322 return (rc); 4323 4324 /* 4325 * If the filesystem is mounted using local locking, pass the 4326 * request off to the local locking code. 4327 */ 4328 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 4329 if (offset > MAXOFF32_T) 4330 return (EFBIG); 4331 if (cmd == F_SETLK || cmd == F_SETLKW) { 4332 /* 4333 * For complete safety, we should be holding 4334 * r_lkserlock. However, we can't call 4335 * lm_safelock and then fs_frlock while 4336 * holding r_lkserlock, so just invoke 4337 * lm_safelock and expect that this will 4338 * catch enough of the cases. 4339 */ 4340 if (!lm_safelock(vp, bfp, cr)) 4341 return (EAGAIN); 4342 } 4343 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 4344 } 4345 4346 rp = VTOR(vp); 4347 4348 /* 4349 * Check whether the given lock request can proceed, given the 4350 * current file mappings. 4351 */ 4352 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 4353 return (EINTR); 4354 if (cmd == F_SETLK || cmd == F_SETLKW) { 4355 if (!lm_safelock(vp, bfp, cr)) { 4356 rc = EAGAIN; 4357 goto done; 4358 } 4359 } 4360 4361 /* 4362 * Flush the cache after waiting for async I/O to finish. For new 4363 * locks, this is so that the process gets the latest bits from the 4364 * server. For unlocks, this is so that other clients see the 4365 * latest bits once the file has been unlocked. If currently dirty 4366 * pages can't be flushed, then don't allow a lock to be set. But 4367 * allow unlocks to succeed, to avoid having orphan locks on the 4368 * server. 4369 */ 4370 if (cmd != F_GETLK) { 4371 mutex_enter(&rp->r_statelock); 4372 while (rp->r_count > 0) { 4373 if (intr) { 4374 klwp_t *lwp = ttolwp(curthread); 4375 4376 if (lwp != NULL) 4377 lwp->lwp_nostop++; 4378 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 4379 if (lwp != NULL) 4380 lwp->lwp_nostop--; 4381 rc = EINTR; 4382 break; 4383 } 4384 if (lwp != NULL) 4385 lwp->lwp_nostop--; 4386 } else 4387 cv_wait(&rp->r_cv, &rp->r_statelock); 4388 } 4389 mutex_exit(&rp->r_statelock); 4390 if (rc != 0) 4391 goto done; 4392 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 4393 if (error) { 4394 if (error == ENOSPC || error == EDQUOT) { 4395 mutex_enter(&rp->r_statelock); 4396 if (!rp->r_error) 4397 rp->r_error = error; 4398 mutex_exit(&rp->r_statelock); 4399 } 4400 if (bfp->l_type != F_UNLCK) { 4401 rc = ENOLCK; 4402 goto done; 4403 } 4404 } 4405 } 4406 4407 lm_fh.n_len = sizeof (fhandle_t); 4408 lm_fh.n_bytes = (char *)VTOFH(vp); 4409 4410 /* 4411 * Call the lock manager to do the real work of contacting 4412 * the server and obtaining the lock. 4413 */ 4414 rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp); 4415 4416 if (rc == 0) 4417 nfs_lockcompletion(vp, cmd); 4418 4419 done: 4420 nfs_rw_exit(&rp->r_lkserlock); 4421 return (rc); 4422 } 4423 4424 /* 4425 * Free storage space associated with the specified vnode. The portion 4426 * to be freed is specified by bfp->l_start and bfp->l_len (already 4427 * normalized to a "whence" of 0). 
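 *
 * As an illustration (an editorial sketch of standard fcntl(2) usage,
 * not code from this file), truncating a file to one megabyte reaches
 * this entry point roughly as follows:
 *
 *	struct flock fl;
 *	bzero(&fl, sizeof (fl));
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 1024 * 1024;
 *	fl.l_len = 0;			(the only supported case: free to EOF)
 *	(void) fcntl(fd, F_FREESP, &fl);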
4428 * 4429 * This is an experimental facility whose continued existence is not 4430 * guaranteed. Currently, we only support the special case 4431 * of l_len == 0, meaning free to end of file. 4432 */ 4433 /* ARGSUSED */ 4434 static int 4435 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 4436 offset_t offset, cred_t *cr, caller_context_t *ct) 4437 { 4438 int error; 4439 4440 ASSERT(vp->v_type == VREG); 4441 if (cmd != F_FREESP) 4442 return (EINVAL); 4443 4444 if (offset > MAXOFF32_T) 4445 return (EFBIG); 4446 4447 if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) || 4448 (bfp->l_len > MAXOFF32_T)) 4449 return (EFBIG); 4450 4451 if (nfs_zone() != VTOMI(vp)->mi_zone) 4452 return (EIO); 4453 4454 error = convoff(vp, bfp, 0, offset); 4455 if (!error) { 4456 ASSERT(bfp->l_start >= 0); 4457 if (bfp->l_len == 0) { 4458 struct vattr va; 4459 4460 /* 4461 * ftruncate should not change the ctime and 4462 * mtime if we truncate the file to its 4463 * previous size. 4464 */ 4465 va.va_mask = AT_SIZE; 4466 error = nfsgetattr(vp, &va, cr); 4467 if (error || va.va_size == bfp->l_start) 4468 return (error); 4469 va.va_mask = AT_SIZE; 4470 va.va_size = bfp->l_start; 4471 error = nfssetattr(vp, &va, 0, cr); 4472 } else 4473 error = EINVAL; 4474 } 4475 4476 return (error); 4477 } 4478 4479 /* ARGSUSED */ 4480 static int 4481 nfs_realvp(vnode_t *vp, vnode_t **vpp) 4482 { 4483 4484 return (EINVAL); 4485 } 4486 4487 /* 4488 * Setup and add an address space callback to do the work of the delmap call. 4489 * The callback will (and must be) deleted in the actual callback function. 4490 * 4491 * This is done in order to take care of the problem that we have with holding 4492 * the address space's a_lock for a long period of time (e.g. if the NFS server 4493 * is down). Callbacks will be executed in the address space code while the 4494 * a_lock is not held. Holding the address space's a_lock causes things such 4495 * as ps and fork to hang because they are trying to acquire this lock as well. 4496 */ 4497 /* ARGSUSED */ 4498 static int 4499 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4500 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 4501 { 4502 int caller_found; 4503 int error; 4504 rnode_t *rp; 4505 nfs_delmap_args_t *dmapp; 4506 nfs_delmapcall_t *delmap_call; 4507 4508 if (vp->v_flag & VNOMAP) 4509 return (ENOSYS); 4510 /* 4511 * A process may not change zones if it has NFS pages mmap'ed 4512 * in, so we can't legitimately get here from the wrong zone. 4513 */ 4514 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4515 4516 rp = VTOR(vp); 4517 4518 /* 4519 * The way that the address space of this process deletes its mapping 4520 * of this file is via the following call chains: 4521 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4522 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4523 * 4524 * With the use of address space callbacks we are allowed to drop the 4525 * address space lock, a_lock, while executing the NFS operations that 4526 * need to go over the wire. Returning EAGAIN to the caller of this 4527 * function is what drives the execution of the callback that we add 4528 * below. The callback will be executed by the address space code 4529 * after dropping the a_lock. When the callback is finished, since 4530 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 4531 * is called again on the same segment to finish the rest of the work 4532 * that needs to happen during unmapping. 
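 *
 * Schematically (an editorial sketch of the two passes described
 * above):
 *
 *	segvn_unmap()
 *	  VOP_DELMAP() -> nfs_delmap()		first call:
 *	    nfs_init_delmapcall(), as_add_callback(), return EAGAIN
 *	as layer drops a_lock and runs nfs_delmap_callback()
 *	segvn_unmap()				retried by the as layer
 *	  VOP_DELMAP() -> nfs_delmap()		second call:
 *	    nfs_find_and_delete_delmapcall() finds the first call's
 *	    entry and returns the error recorded by the callback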
4533 	 *
4534 	 * This action of calling back into the segment driver causes
4535 	 * nfs_delmap() to get called again, but since the callback was
4536 	 * already executed at this point, it already did the work and there
4537 	 * is nothing left for us to do.
4538 	 *
4539 	 * To summarize:
4540 	 * - The first time nfs_delmap() is called in this chain, we add
4541 	 *   the caller associated with this delmap to the delmap caller
4542 	 *   list, add the callback, and return EAGAIN.
4543 	 * - The second time nfs_delmap() is called, we find this caller
4544 	 *   in the delmap caller list, conclude that there is no more
4545 	 *   work to do, remove the caller from the list, and return the
4546 	 *   error that was recorded during the callback execution.
4547 	 */
4548 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4549 	if (caller_found) {
4550 		/*
4551 		 * 'error' is from the actual delmap operations.  To avoid
4552 		 * hangs, we need to handle the return of EAGAIN differently
4553 		 * since this is what drives the callback execution.
4554 		 * In this case we don't want to return EAGAIN, because
4555 		 * there is no callback left to execute.
4556 		 */
4557 		if (error == EAGAIN)
4558 			return (0);
4559 		else
4560 			return (error);
4561 	}
4562 
4563 	/* current caller was not in the list */
4564 	delmap_call = nfs_init_delmapcall();
4565 
4566 	mutex_enter(&rp->r_statelock);
4567 	list_insert_tail(&rp->r_indelmap, delmap_call);
4568 	mutex_exit(&rp->r_statelock);
4569 
4570 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4571 
4572 	dmapp->vp = vp;
4573 	dmapp->off = off;
4574 	dmapp->addr = addr;
4575 	dmapp->len = len;
4576 	dmapp->prot = prot;
4577 	dmapp->maxprot = maxprot;
4578 	dmapp->flags = flags;
4579 	dmapp->cr = cr;
4580 	dmapp->caller = delmap_call;
4581 
4582 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4583 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4584 
4585 	return (error ? error : EAGAIN);
4586 }
4587 
4588 /*
4589  * Remove some pages from an mmap'd vnode.  Just update the
4590  * count of pages.  If doing close-to-open, then flush all
4591  * of the pages associated with this file.  Otherwise, start
4592  * an asynchronous page flush to write out any dirty pages.
4593  * This will also associate a credential with the rnode which
4594  * can be used to write the pages.
4595  */
4596 /* ARGSUSED */
4597 static void
4598 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4599 {
4600 	int error;
4601 	rnode_t *rp;
4602 	mntinfo_t *mi;
4603 	nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg;
4604 
4605 	rp = VTOR(dmapp->vp);
4606 	mi = VTOMI(dmapp->vp);
4607 
4608 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4609 	ASSERT(rp->r_mapcnt >= 0);
4610 
4611 	/*
4612 	 * Initiate a page flush if there are pages, the file system
4613 	 * was not mounted readonly, the segment was mapped shared, and
4614 	 * the pages themselves were writeable.
4615 	 */
4616 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4617 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4618 		mutex_enter(&rp->r_statelock);
4619 		rp->r_flags |= RDIRTY;
4620 		mutex_exit(&rp->r_statelock);
4621 		/*
4622 		 * If this is a cross-zone access a sync putpage won't work, so
4623 		 * the best we can do is try an async putpage.  That seems
4624 		 * better than something more draconian such as discarding the
4625 		 * dirty pages.
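		 * (The async request can still be serviced because it is
		 * queued to the mount's async worker threads rather than
		 * performed in the caller's context.)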
4626 		 */
4627 		if ((mi->mi_flags & MI_NOCTO) ||
4628 		    nfs_zone() != mi->mi_zone)
4629 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4630 			    B_ASYNC, dmapp->cr);
4631 		else
4632 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4633 			    0, dmapp->cr);
4634 		if (!error) {
4635 			mutex_enter(&rp->r_statelock);
4636 			error = rp->r_error;
4637 			rp->r_error = 0;
4638 			mutex_exit(&rp->r_statelock);
4639 		}
4640 	} else
4641 		error = 0;
4642 
4643 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4644 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4645 		    B_INVAL, dmapp->cr);
4646 
4647 	dmapp->caller->error = error;
4648 	(void) as_delete_callback(as, arg);
4649 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4650 }
4651 
4652 /* ARGSUSED */
4653 static int
4654 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
4655 {
4656 	int error = 0;
4657 
4658 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4659 		return (EIO);
4660 	/*
4661 	 * This looks a little odd because it is written in a general
4662 	 * manner although only a few of the cases are used.  If cntl()
4663 	 * ever gets widely used, the outer switch will make more sense.
4664 	 */
4665 
4666 	switch (cmd) {
4667 
4668 	/*
4669 	 * Large file spec: the answer to this query is a hardcoded
4670 	 * constant based on what the protocol supports (32-bit offsets,
	 * hence the MAXOFF32_T checks throughout this file).
4671 	 */
4672 	case _PC_FILESIZEBITS:
4673 		*valp = 32;
4674 		return (0);
4675 
4676 	case _PC_LINK_MAX:
4677 	case _PC_NAME_MAX:
4678 	case _PC_PATH_MAX:
4679 	case _PC_SYMLINK_MAX:
4680 	case _PC_CHOWN_RESTRICTED:
4681 	case _PC_NO_TRUNC: {
4682 		mntinfo_t *mi;
4683 		struct pathcnf *pc;
4684 
4685 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4686 			return (EINVAL);
4687 		error = _PC_ISSET(cmd, pc->pc_mask);	/* error or bool */
4688 		switch (cmd) {
4689 		case _PC_LINK_MAX:
4690 			*valp = pc->pc_link_max;
4691 			break;
4692 		case _PC_NAME_MAX:
4693 			*valp = pc->pc_name_max;
4694 			break;
4695 		case _PC_PATH_MAX:
4696 		case _PC_SYMLINK_MAX:
4697 			*valp = pc->pc_path_max;
4698 			break;
4699 		case _PC_CHOWN_RESTRICTED:
4700 		case _PC_NO_TRUNC:
4701 			/*
4702 			 * If we got here, error is really a boolean which
4703 			 * indicates whether cmd is set or not.
4704 			 */
4705 			*valp = error ? 1 : 0;
4706 			error = 0;
4707 			break;
4715 		}
4716 		return (error ? EINVAL : 0);
4717 	}
4718 
4719 	case _PC_XATTR_EXISTS:
4720 		*valp = 0;
4721 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4722 			vnode_t *avp;
4723 			rnode_t *rp;
4724 			mntinfo_t *mi = VTOMI(vp);
4725 
4726 			if (!(mi->mi_flags & MI_EXTATTR))
4727 				return (0);
4728 
4729 			rp = VTOR(vp);
4730 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4731 			    INTR(vp)))
4732 				return (EINTR);
4733 
4734 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4735 			if (error || avp == NULL)
4736 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4737 
4738 			nfs_rw_exit(&rp->r_rwlock);
4739 
4740 			if (error == 0 && avp != NULL) {
4741 				VN_RELE(avp);
4742 				*valp = 1;
4743 			}
4744 		}
4745 		return (error ? EINVAL : 0);
4746 
4747 	case _PC_ACL_ENABLED:
4748 		*valp = _ACL_ACLENT_ENABLED;
4749 		return (0);
4750 
4751 	default:
4752 		return (EINVAL);
4753 	}
4754 }
4755 
4756 /*
4757  * Called by an async thread to do synchronous pageio.  Do the i/o,
4758  * wait for it to complete, and clean up the page list when done.
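 * The cleanup is handled by pvn_read_done()/pvn_write_done() below,
 * which unlock the pages and pass B_ERROR through to the VM layer
 * when the i/o failed.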
 */
4760 static int
4761 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4762 	int flags, cred_t *cr)
4763 {
4764 	int error;
4765 
4766 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4767 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4768 	if (flags & B_READ)
4769 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4770 	else
4771 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4772 	return (error);
4773 }
4774 
4775 static int
4776 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4777 	int flags, cred_t *cr)
4778 {
4779 	int error;
4780 	rnode_t *rp;
4781 
4782 	if (pp == NULL)
4783 		return (EINVAL);
4784 
4785 	if (io_off > MAXOFF32_T)
4786 		return (EFBIG);
4787 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4788 		return (EIO);
4789 	rp = VTOR(vp);
4790 	mutex_enter(&rp->r_statelock);
4791 	rp->r_count++;
4792 	mutex_exit(&rp->r_statelock);
4793 
4794 	if (flags & B_ASYNC) {
4795 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4796 		    nfs_sync_pageio);
4797 	} else
4798 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4799 	mutex_enter(&rp->r_statelock);
4800 	rp->r_count--;
4801 	cv_broadcast(&rp->r_cv);
4802 	mutex_exit(&rp->r_statelock);
4803 	return (error);
4804 }
4805 
4806 static int
4807 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4808 {
4809 	int error;
4810 	mntinfo_t *mi;
4811 
4812 	mi = VTOMI(vp);
4813 
4814 	if (nfs_zone() != mi->mi_zone)
4815 		return (EIO);
4816 	if (mi->mi_flags & MI_ACL) {
4817 		error = acl_setacl2(vp, vsecattr, flag, cr);
		/*
		 * Re-check MI_ACL: the ACL call may clear the flag (e.g.
		 * if it discovers that the server does not support the
		 * ACL protocol); fall through to ENOSYS in that case.
		 */
4818 		if (mi->mi_flags & MI_ACL)
4819 			return (error);
4820 	}
4821 
4822 	return (ENOSYS);
4823 }
4824 
4825 static int
4826 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4827 {
4828 	int error;
4829 	mntinfo_t *mi;
4830 
4831 	mi = VTOMI(vp);
4832 
4833 	if (nfs_zone() != mi->mi_zone)
4834 		return (EIO);
4835 	if (mi->mi_flags & MI_ACL) {
4836 		error = acl_getacl2(vp, vsecattr, flag, cr);
		/*
		 * As above: re-check in case the ACL call cleared MI_ACL;
		 * if so, fall through and fabricate a minimal ACL.
		 */
4837 		if (mi->mi_flags & MI_ACL)
4838 			return (error);
4839 	}
4840 
4841 	return (fs_fab_acl(vp, vsecattr, flag, cr));
4842 }
4843 
4844 static int
4845 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
4846 {
4847 	int error;
4848 	struct shrlock nshr;
4849 	struct nfs_owner nfs_owner;
4850 	netobj lm_fh;
4851 
4852 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4853 		return (EIO);
4854 
4855 	/*
4856 	 * check for valid cmd parameter
4857 	 */
4858 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
4859 		return (EINVAL);
4860 
4861 	/*
4862 	 * Check access permissions
4863 	 */
4864 	if (cmd == F_SHARE &&
4865 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
4866 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
4867 		return (EBADF);
4868 
4869 	/*
4870 	 * If the filesystem is mounted using local locking, pass the
4871 	 * request off to the local share code.
4872 	 */
4873 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
4874 		return (fs_shrlock(vp, cmd, shr, flag, cr));
4875 
4876 	switch (cmd) {
4877 	case F_SHARE:
4878 	case F_UNSHARE:
4879 		lm_fh.n_len = sizeof (fhandle_t);
4880 		lm_fh.n_bytes = (char *)VTOFH(vp);
4881 
4882 		/*
4883 		 * If we are passed an owner that is too large to fit in
4884 		 * an nfs_owner, it is likely a recursive call from the
4885 		 * lock manager client, so pass it straight through.  If
4886 		 * it is not an nfs_owner, simply return an error.
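		 *
		 * Schematically, the remote owner built below for the
		 * non-recursive case is (an editorial sketch; the
		 * authoritative layout is struct nfs_owner):
		 *
		 *	magic	NFS_OWNER_MAGIC, so it can be recognized
		 *	hname	this client's nodename, from uts_nodename()
		 *	lowner	the caller's local owner bytes, verbatim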
		 */
4888 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
4889 			if (((struct nfs_owner *)shr->s_owner)->magic !=
4890 			    NFS_OWNER_MAGIC)
4891 				return (EINVAL);
4892 
4893 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
4894 				error = set_errno(error);
4895 			}
4896 			return (error);
4897 		}
4898 		/*
4899 		 * The owner of a remote share reservation is a combination
4900 		 * of a magic number, the hostname, and the local owner.
4901 		 */
4902 		bzero(&nfs_owner, sizeof (nfs_owner));
4903 		nfs_owner.magic = NFS_OWNER_MAGIC;
4904 		(void) strncpy(nfs_owner.hname, uts_nodename(),
4905 		    sizeof (nfs_owner.hname));
4906 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
4907 		nshr.s_access = shr->s_access;
4908 		nshr.s_deny = shr->s_deny;
4909 		nshr.s_sysid = 0;
4910 		nshr.s_pid = ttoproc(curthread)->p_pid;
4911 		nshr.s_own_len = sizeof (nfs_owner);
4912 		nshr.s_owner = (caddr_t)&nfs_owner;
4913 
4914 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
4915 			error = set_errno(error);
4916 		}
4917 
4918 		break;
4919 
4920 	case F_HASREMOTELOCKS:
4921 		/*
4922 		 * The NFS client can't store remote locks itself.
4923 		 */
4924 		shr->s_access = 0;
4925 		error = 0;
4926 		break;
4927 
4928 	default:
4929 		error = EINVAL;
4930 		break;
4931 	}
4932 
4933 	return (error);
4934 }
4935 
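/*
 * Editorial illustration (a hedged userland sketch, not part of this
 * module): nfs_pathconf() above is what ultimately answers
 * pathconf(2) queries on files from mounts served by this client.
 * Since _PC_FILESIZEBITS is hardwired to 32 here, a caller such as
 *
 *	long bits = pathconf("/mnt/file", _PC_FILESIZEBITS);
 *
 * (with "/mnt/file" a hypothetical path on such a mount) would see
 * 32, matching the MAXOFF32_T limits enforced throughout this file.
 */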