/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 *	All rights reserved.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int	nfs_bio(struct buf *, cred_t *);
static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static void	nfs_delmap_callback(struct as *, void *, uint_t);

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then calling the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int	nfs_open(vnode_t **, int, cred_t *);
static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_access(vnode_t *, int, int, cred_t *);
static int	nfs_accessx(void *, int, cred_t *);
static int	nfs_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs_fsync(vnode_t *, int, cred_t *);
static void	nfs_inactive(vnode_t *, cred_t *);
static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
			int, vnode_t *, cred_t *);
static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs_remove(vnode_t *, char *, cred_t *);
static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs_fid(vnode_t *, fid_t *);
static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs_seek(vnode_t *, offset_t, offset_t *);
static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs_realvp(vnode_t *, vnode_t **);
static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);

struct vnodeops *nfs_vnodeops;

const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN, { .vop_open = nfs_open },
	VOPNAME_CLOSE, { .vop_close = nfs_close },
	VOPNAME_READ, { .vop_read = nfs_read },
	VOPNAME_WRITE, { .vop_write = nfs_write },
	VOPNAME_IOCTL, { .vop_ioctl = nfs_ioctl },
	VOPNAME_GETATTR, { .vop_getattr = nfs_getattr },
	VOPNAME_SETATTR, { .vop_setattr = nfs_setattr },
	VOPNAME_ACCESS, { .vop_access = nfs_access },
	VOPNAME_LOOKUP, { .vop_lookup = nfs_lookup },
	VOPNAME_CREATE, { .vop_create = nfs_create },
	VOPNAME_REMOVE, { .vop_remove = nfs_remove },
	VOPNAME_LINK, { .vop_link = nfs_link },
	VOPNAME_RENAME, { .vop_rename = nfs_rename },
	VOPNAME_MKDIR, { .vop_mkdir = nfs_mkdir },
	VOPNAME_RMDIR, { .vop_rmdir = nfs_rmdir },
	VOPNAME_READDIR, { .vop_readdir = nfs_readdir },
	VOPNAME_SYMLINK, { .vop_symlink = nfs_symlink },
	VOPNAME_READLINK, { .vop_readlink = nfs_readlink },
	VOPNAME_FSYNC, { .vop_fsync = nfs_fsync },
	VOPNAME_INACTIVE, { .vop_inactive = nfs_inactive },
	VOPNAME_FID, { .vop_fid = nfs_fid },
	VOPNAME_RWLOCK, { .vop_rwlock = nfs_rwlock },
	VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs_rwunlock },
	VOPNAME_SEEK, { .vop_seek = nfs_seek },
	VOPNAME_FRLOCK, { .vop_frlock = nfs_frlock },
	VOPNAME_SPACE, { .vop_space = nfs_space },
	VOPNAME_REALVP, { .vop_realvp = nfs_realvp },
	VOPNAME_GETPAGE, { .vop_getpage = nfs_getpage },
	VOPNAME_PUTPAGE, { .vop_putpage = nfs_putpage },
	VOPNAME_MAP, { .vop_map = nfs_map },
	VOPNAME_ADDMAP, { .vop_addmap = nfs_addmap },
	VOPNAME_DELMAP, { .vop_delmap = nfs_delmap },
	VOPNAME_DUMP, { .vop_dump = nfs_dump },
	VOPNAME_PATHCONF, { .vop_pathconf = nfs_pathconf },
	VOPNAME_PAGEIO, { .vop_pageio = nfs_pageio },
	VOPNAME_SETSECATTR, { .vop_setsecattr = nfs_setsecattr },
	VOPNAME_GETSECATTR, { .vop_getsecattr = nfs_getsecattr },
	VOPNAME_SHRLOCK, { .vop_shrlock = nfs_shrlock },
	VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs_getvnodeops(void)
{
	return (nfs_vnodeops);
}

/* ARGSUSED */
static int
nfs_open(vnode_t **vpp, int flag, cred_t *cr)
{
	int error;
	struct vattr va;
	rnode_t *rp;
	vnode_t *vp;

	vp = *vpp;
	rp = VTOR(vp);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	mutex_enter(&rp->r_statelock);
	if (rp->r_cred == NULL) {
		crhold(cr);
		rp->r_cred = cr;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If there is no cached data or if close-to-open
	 * consistency checking is turned off, we can avoid
	 * the over the wire getattr.  Otherwise, if the
	 * file system is mounted readonly, then just verify
	 * the caches are up to date using the normal mechanism.
	 * Else, if the file is not mmap'd, then just mark
	 * the attributes as timed out.  They will be refreshed
	 * and the caches validated prior to being used.
	 * Else, the file system is mounted writeable so
	 * force an over the wire GETATTR in order to ensure
	 * that all cached data is valid.
	 */
	if (vp->v_count > 1 ||
	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
		if (vn_is_readonly(vp))
			error = nfs_validate_caches(vp, cr);
		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
			PURGE_ATTRCACHE(vp);
			error = 0;
		} else {
			va.va_mask = AT_ALL;
			error = nfs_getattr_otw(vp, &va, cr);
		}
	} else
		error = 0;

	return (error);
}

static int
nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	rnode_t *rp;
	int error;
	struct vattr va;

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to another
		 * zone will fail, and one can't lock an NFS file and then
		 * change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the final nfs_getattr_otw() call) should
		 * be OK.  If the zone is going away nfs_async_inactive() will
		 * throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled quicker
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 *	to-open" semantics, then start an asynchronous flush
	 *	of all of the pages in the file.
	 * else the file system was not mounted using the "no-close-
	 *	to-open" semantics, then do a synchronous flush and
	 *	commit of all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs_getattr_otw(vp, &va, cr);

	return (error);
}

/* ARGSUSED */
static int
nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	if (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		size_t resid = 0;

		/*
		 * Let's try to do read in as large a chunk as we can
		 * (Filesystem (NFS client) bsize if possible/needed).
		 * For V3, this is 32K and for V2, this is 8K.
		 */
		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			n = MIN(uiop->uio_resid, bufsize);
			error = nfsread(vp, base, uiop->uio_offset, n,
			    &resid, cr);
			if (!error) {
				n -= resid;
				error = uiomove(base, n, UIO_READ, uiop);
			}
		} while (!error && uiop->uio_resid > 0 && n > 0);
		kmem_free(base, bufsize);
		return (error);
	}

	error = 0;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs_validate_caches(vp, cr);
		if (error)
			break;

		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n,
			    1, S_READ);
			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}

/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr)
{
	struct buf *bp;
	int error;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	error = nfs_bio(bp, cr);

	bp_mapout(bp);
	pageio_done(bp);

	return (error);
}

/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 */
static int
nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfswriteargs wa;
	struct nfsattrstat ns;
	int error;
	int tsize;
	int douprintf;

	douprintf = 1;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	wa.wa_args = &wa.wa_args_buf;
	wa.wa_fhandle = *VTOFH(vp);

	do {
		tsize = MIN(mi->mi_curwrite, count);
		wa.wa_data = base;
		wa.wa_begoff = offset;
		wa.wa_totcount = tsize;
		wa.wa_count = tsize;
		wa.wa_offset = offset;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}
		wa.wa_mblk = NULL;
		do {
			error = rfs2call(mi, RFS_WRITE,
			    xdr_writeargs, (caddr_t)&wa,
			    xdr_attrstat, (caddr_t)&ns, cr,
			    &douprintf, &ns.ns_status, 0, NULL);
		} while (error == ENFS_TRYAGAIN);
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(ns.ns_status);
			/*
			 * Can't check for stale fhandle and purge caches
			 * here because pages are held by nfs_getpage.
			 * Just mark the attribute cache as timed out
			 * and set RWRITEATTR to indicate that the file
			 * was modified with a WRITE operation.
			 */
			if (!error) {
				count -= tsize;
				base += tsize;
				offset += tsize;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
					    tsize;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_OUBLK, 1);
				mutex_enter(&rp->r_statelock);
				PURGE_ATTRCACHE_LOCKED(rp);
				rp->r_flags |= RWRITEATTR;
				mutex_exit(&rp->r_statelock);
			}
		}
	} while (!error && count);

	return (error);
}
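
/*
 * A worked example of the chunking loop in nfswrite() above, assuming
 * mi_curwrite is the NFS Version 2 transfer size of 8192 bytes: a
 * 20000 byte request goes out as three synchronous WRITE calls of
 * 8192, 8192, and 3616 bytes, with base and offset advanced by tsize
 * after each successful reply.
 */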

/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 */
static int
nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
	cred_t *cr)
{
	mntinfo_t *mi;
	struct nfsreadargs ra;
	struct nfsrdresult rr;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	douprintf = 1;

	ra.ra_fhandle = *VTOFH(vp);

	fi.vp = vp;
	fi.fhp = (caddr_t)&ra.ra_fhandle;
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			tsize = MIN(mi->mi_curread, count);
			rr.rr_data = base;
			ra.ra_offset = offset;
			ra.ra_totcount = tsize;
			ra.ra_count = tsize;
			t = gethrtime();
			error = rfs2call(mi, RFS_READ,
			    xdr_readargs, (caddr_t)&ra,
			    xdr_rdresult, (caddr_t)&rr, cr,
			    &douprintf, &rr.rr_status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(rr.rr_status);
			if (!error) {
				count -= rr.rr_count;
				base += rr.rr_count;
				offset += rr.rr_count;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
					    rr.rr_count;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_INBLK, 1);
			}
		}
	} while (!error && count && rr.rr_count == tsize);

	*residp = count;

	if (!error) {
		/*
		 * Since no error occurred, we have the current
		 * attributes and we need to do a cache check and then
		 * potentially update the cached attributes.  We can't
		 * use the normal attribute check and cache mechanisms
		 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If they have, then we
		 * just mark the attributes as out of date.  The next
		 * time that the attributes are checked, they will be
		 * out of date, new attributes will be fetched, and
		 * the page cache will be flushed.  If the attributes
		 * weren't changed, then we just update the cached
		 * attributes with these attributes.
		 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by server may have minimal
		 * permissions sometimes denying access to users having
		 * proper access.  To get the proper attributes, mark
		 * the attributes as expired so that they will be
		 * regotten via the NFS_ACL GETATTR2 procedure.
		 */
		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
		mutex_enter(&rp->r_statelock);
		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
		    (mi->mi_flags & MI_ACL)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			if (rp->r_mtime <= t) {
				nfs_attrcache_va(vp, &va);
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	switch (cmd) {
	case _FIODIRECTIO:
		return (nfs_directio(vp, (int)arg, cr));
	default:
		return (ENOTTY);
	}
}

static int
nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	rp = VTOR(vp);
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
			mutex_enter(&rp->r_statelock);
			rp->r_gcount++;
			mutex_exit(&rp->r_statelock);
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
			mutex_enter(&rp->r_statelock);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				if (!rp->r_error)
					rp->r_error = error;
			}
			if (--rp->r_gcount == 0)
				cv_broadcast(&rp->r_cv);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (nfsgetattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	uint_t mask;
	struct vattr va;

	mask = vap->va_mask;

	if (mask & AT_NOSET)
		return (EINVAL);

	if ((mask & AT_SIZE) &&
	    vap->va_type == VREG &&
	    vap->va_size > MAXOFF32_T)
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	va.va_mask = AT_UID | AT_MODE;

	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
	    vp);

	if (error)
		return (error);

	return (nfssetattr(vp, vap, flags, cr));
}

static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	struct nfssaargs args;
	struct nfsattrstat ns;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	mntinfo_t *mi;
	vsecattr_t *vsp;
	hrtime_t t;

	mask = vap->va_mask;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * If the system call was utime(2) or utimes(2) and the
	 * application did not specify the times, then set the
	 * mtime nanosecond field to 1 billion.  This will get
	 * translated from 1 billion nanoseconds to 1 million
	 * microseconds in the over the wire request.  The
	 * server will use 1 million in the microsecond field
	 * to tell whether both the mtime and atime should be
	 * set to the server's current time.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
		vap->va_mtime.tv_nsec = 1000000000;
		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			error = vattr_to_sattr(vap, &args.saa_sa);
		} else {
			/*
			 * Use server times. vap time values will not be used.
			 * To ensure no time overflow, make sure vap has
			 * valid values, but retain the original values.
			 */
			timestruc_t	mtime = vap->va_mtime;
			timestruc_t	atime = vap->va_atime;
			time_t		now;

			now = gethrestime_sec();
			if (NFS_TIME_T_OK(now)) {
				/* Just in case server does not know of this */
				vap->va_mtime.tv_sec = now;
				vap->va_atime.tv_sec = now;
			} else {
				vap->va_mtime.tv_sec = 0;
				vap->va_atime.tv_sec = 0;
			}
			error = vattr_to_sattr(vap, &args.saa_sa);
			/* set vap times back on */
			vap->va_mtime = mtime;
			vap->va_atime = atime;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr(vap, &args.saa_sa);
	}
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}
	args.saa_fh = *VTOFH(vp);

	va.va_mask = AT_MODE;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

	mi = VTOMI(vp);

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_SETATTR,
	    xdr_saargs, (caddr_t)&args,
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error) {
			/*
			 * If changing the size of the file, invalidate
			 * any local cached data which is no longer part
			 * of the file.  We also possibly invalidate the
			 * last page in the file.  We could use
			 * pvn_vpzero(), but this would mark the page as
			 * modified and require it to be written back to
			 * the server for no particularly good reason.
			 * This way, if we access it, then we bring it
			 * back in.  A read should be cheaper than a
			 * write.
			 */
			if (mask & AT_SIZE) {
				nfs_invalidate_pages(vp,
				    (vap->va_size & PAGEMASK), cr);
			}
			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * regotten via the NFS_ACL GETATTR2 procedure.
			 */
			if (mi->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			/*
			 * This next check attempts to deal with NFS
			 * servers which can not handle increasing
			 * the size of the file via setattr.
			 * Most of these servers do not return an error,
			 * but do not change the size of the file.  Hence,
			 * this check and then attempt to set the file
			 * size by writing 1 byte at the offset of the
			 * end of the file that we need.
			 */
			if ((mask & AT_SIZE) &&
			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
				char zb = '\0';

				error = nfswrite(vp, &zb,
				    vap->va_size - sizeof (zb),
				    sizeof (zb), cr);
			}
			/*
			 * Some servers will change the mode to clear the
			 * setuid and setgid bits when changing the uid or
			 * gid.  The client needs to compensate appropriately.
			 */
			if (mask & (AT_UID | AT_GID)) {
				int terror;

				va.va_mask = AT_MODE;
				terror = nfsgetattr(vp, &va, cr);
				if (!terror &&
				    (((mask & AT_MODE) &&
				    va.va_mode != vap->va_mode) ||
				    (!(mask & AT_MODE) &&
				    va.va_mode != omode))) {
					va.va_mask = AT_MODE;
					if (mask & AT_MODE)
						va.va_mode = vap->va_mode;
					else
						va.va_mode = omode;
					(void) nfssetattr(vp, &va, 0, cr);
				}
			}
		} else {
			PURGE_ATTRCACHE(vp);
			PURGE_STALE_FH(error, vp, cr);
		}
	} else {
		PURGE_ATTRCACHE(vp);
	}

	return (error);
}

static int
nfs_accessx(void *vp, int mode, cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
	return (nfs_access(vp, mode, 0, cr));
}

static int
nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	struct vattr va;
	int error;
	mntinfo_t *mi;
	int shift = 0;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI_ACL) {
		error = acl_access2(vp, mode, flags, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	va.va_mask = AT_MODE | AT_UID | AT_GID;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Disallow write attempts on read-only
	 * file systems, unless the file is a
	 * device node.
	 */
	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
		return (EROFS);

	/*
	 * Disallow attempts to access mandatory lock files.
	 */
	if ((mode & (VWRITE | VREAD | VEXEC)) &&
	    MANDLOCK(vp, va.va_mode))
		return (EACCES);

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group,
	 * then check public access.
	 */
	if (crgetuid(cr) != va.va_uid) {
		shift += 3;
		if (!groupmember(va.va_gid, cr))
			shift += 3;
	}
found:
	mode &= ~(va.va_mode << shift);
	if (mode == 0)
		return (0);

	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
}
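
/*
 * A worked example of the mode check in nfs_access() above: VREAD,
 * VWRITE and VEXEC line up with the owner permission bits, so shifting
 * va_mode left by 3 lines the group bits up with the request, and by 6
 * the "other" bits.  For a VREAD|VWRITE request on a file with mode
 * 0754 made by a caller who is in the file's group but is not the
 * owner, shift is 3: read is granted, write is not, so VWRITE survives
 * the mask and secpolicy_vnode_access() gets the final say.
 */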

static int nfs_do_symlink_cache = 1;

static int
nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	int error;
	struct nfsrdlnres rl;
	rnode_t *rp;
	int douprintf;
	failinfo_t fi;

	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO.  This violates the XNFS spec and
	 * the RFC 1094, which are wrong any way.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}


	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	error = rfs2call(VTOMI(vp), RFS_READLINK,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_rdlnres, (caddr_t)&rl, cr,
	    &douprintf, &rl.rl_status, 0, &fi);

	if (error) {

		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		return (error);
	}

	error = geterrno(rl.rl_status);
	if (!error) {
		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = rl.rl_data;
				rp->r_symlink.len = (int)rl.rl_count;
				rp->r_symlink.size = NFS_MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);

				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
			}
		} else {

			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		}
	} else {
		PURGE_STALE_FH(error, vp, cr);

		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
	}

	/*
	 * Conform to UFS semantics (see comment above)
	 */
	return (error == ENXIO ? EINVAL : error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
static int
nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
	if (!error)
		error = VTOR(vp)->r_error;
	return (error);
}


/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
static void
nfs_inactive(vnode_t *vp, cred_t *cr)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way, the
		 * unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			struct nfsdiropargs da;
			enum nfsstat status;
			int douprintf;
			int error;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs(&da, unlname, unldvp);

			douprintf = 1;

			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
			    xdr_diropargs, (caddr_t)&da,
			    xdr_enum, (caddr_t)&status, unlcred,
			    &douprintf, &status, 0, NULL);

			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
				nfs_purge_rddir_cache(unldvp);
			PURGE_ATTRCACHE(unldvp);

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}

/*
 * Remote file system operations having to do with directory manipulation.
 */

static int
nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}

static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
static int nfs_lookup_dnlc_hits = 0;
static int nfs_lookup_dnlc_misses = 0;
static int nfs_lookup_dnlc_neg_hits = 0;
static int nfs_lookup_dnlc_disappears = 0;
static int nfs_lookup_dnlc_lookups = 0;
#endif

/* ARGSUSED */
int
nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have the RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
	error = nfslookup_dnlc(dvp, nm, vpp, cr);
	if (error || *vpp != NULL)
		return (error);

callit:
	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);

	return (error);
}

static int
nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.
	 * The DNLC is rechecked just in case this entry got invalidated
	 * during the call to nfs_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not exist on the server.  Any operations
	 * to the server will fail with ESTALE.
	 */
#ifdef DEBUG
	nfs_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		VN_RELE(vp);
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			error = nfs_access(dvp, VEXEC, 0, cr);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			*vpp = vp;
#ifdef DEBUG
			nfs_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs_lookup_dnlc_misses++;
#endif

	*vpp = NULL;

	return (0);
}

static int
nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
	int rfscall_flags)
{
	int error;
	struct nfsdiropargs da;
	struct nfsdiropres dr;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	ASSERT(*nm != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	setdiropargs(&da, nm, dvp);

	fi.vp = dvp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
	    xdr_diropargs, (caddr_t)&da,
	    xdr_diropres, (caddr_t)&dr, cr,
	    &douprintf, &dr.dr_status, rfscall_flags, &fi);

	if (!error) {
		error = geterrno(dr.dr_status);
		if (!error) {
			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * regotten via the NFS_ACL GETATTR2 procedure.
			 */
			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(*vpp);
			}
			if (!(rfscall_flags & RFSCALL_SOFT))
				dnlc_update(dvp, nm, *vpp);
		} else {
			PURGE_STALE_FH(error, dvp, cr);
			if (error == ENOENT && nfs_lookup_neg_cache)
				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
		}
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
{
	int error;
	struct nfscreatargs args;
	struct nfsdiropres dr;
	int douprintf;
	vnode_t *vp;
	rnode_t *rp;
	struct vattr vattr;
	rnode_t *drp;
	vnode_t *tempvp;
	hrtime_t t;

	drp = VTOR(dvp);

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
		return (EINTR);

	/*
	 * We make a copy of the attributes because the caller does not
	 * expect us to change what va points to.
	 */
	vattr = *va;

	/*
	 * If the pathname is "", just use dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		error = 0;
		VN_HOLD(dvp);
		vp = dvp;
	/*
	 * If the pathname is ".", just use dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	} else if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr);
		if (error) {
			nfs_rw_exit(&drp->r_rwlock);
			return (error);
		}
		VN_HOLD(dvp);
		vp = dvp;
	/*
	 * We need to go over the wire, just to be sure whether the
	 * file exists or not.  Using the DNLC can be dangerous in
	 * this case when making a decision regarding existence.
	 */
	} else {
		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
	}
	if (!error) {
		if (exclusive == EXCL)
			error = EEXIST;
		else if (vp->v_type == VDIR && (mode & VWRITE))
			error = EISDIR;
		else {
			/*
			 * If vnode is a device, create special vnode.
			 */
			if (IS_DEVVP(vp)) {
				tempvp = vp;
				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
				VN_RELE(tempvp);
			}
			if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
				if ((vattr.va_mask & AT_SIZE) &&
				    vp->v_type == VREG) {
					vattr.va_mask = AT_SIZE;
					error = nfssetattr(vp, &vattr, 0, cr);
				}
			}
		}
		nfs_rw_exit(&drp->r_rwlock);
		if (error) {
			VN_RELE(vp);
		} else {
			/*
			 * existing file got truncated, notify.
			 */
			vnevent_create(vp);
			*vpp = vp;
		}
		return (error);
	}

	ASSERT(vattr.va_mask & AT_TYPE);
	if (vattr.va_type == VREG) {
		ASSERT(vattr.va_mask & AT_MODE);
		if (MANDMODE(vattr.va_mode)) {
			nfs_rw_exit(&drp->r_rwlock);
			return (EACCES);
		}
	}

	dnlc_remove(dvp, nm);

	setdiropargs(&args.ca_da, nm, dvp);

	/*
	 * Decide what the group-id of the created file should be.
	 * Set it in attribute list as advisory...then do a setattr
	 * if the server didn't get it right the first time.
1995 */
1996 error = setdirgid(dvp, &vattr.va_gid, cr);
1997 if (error) {
1998 nfs_rw_exit(&drp->r_rwlock);
1999 return (error);
2000 }
2001 vattr.va_mask |= AT_GID;
2002
2003 /*
2004 * This is a completely gross hack to make mknod
2005 * work over the wire until we can whack the protocol
2006 */
2007 #define IFCHR 0020000 /* character special */
2008 #define IFBLK 0060000 /* block special */
2009 #define IFSOCK 0140000 /* socket */
2010
2011 /*
2012 * dev_t is uint_t in 5.x and short in 4.x. 4.x supports
2013 * 8 bit majors; 5.x supports 14 bit majors. 5.x supports 18
2014 * bits in the minor number where 4.x supports 8 bits. If the 5.x
2015 * major/minor numbers are <= 8 bits long, compress the device
2016 * number before sending it. Otherwise, the 4.x server will not
2017 * create the device with the correct device number and nothing can be
2018 * done about this.
2019 */
2020 if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2021 dev_t d = vattr.va_rdev;
2022 dev32_t dev32;
2023
2024 if (vattr.va_type == VCHR)
2025 vattr.va_mode |= IFCHR;
2026 else
2027 vattr.va_mode |= IFBLK;
2028
2029 (void) cmpldev(&dev32, d);
2030 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2031 vattr.va_size = (u_offset_t)dev32;
2032 else
2033 vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2034
2035 vattr.va_mask |= AT_MODE|AT_SIZE;
2036 } else if (vattr.va_type == VFIFO) {
2037 vattr.va_mode |= IFCHR; /* xtra kludge for namedpipe */
2038 vattr.va_size = (u_offset_t)NFS_FIFO_DEV; /* blech */
2039 vattr.va_mask |= AT_MODE|AT_SIZE;
2040 } else if (vattr.va_type == VSOCK) {
2041 vattr.va_mode |= IFSOCK;
2042 /*
2043 * To avoid triggering bugs in the servers, set AT_SIZE
2044 * (all other RFS_CREATE calls set this).
2045 */
2046 vattr.va_size = 0;
2047 vattr.va_mask |= AT_MODE|AT_SIZE;
2048 }
2049
2050 args.ca_sa = &args.ca_sa_buf;
2051 error = vattr_to_sattr(&vattr, args.ca_sa);
2052 if (error) {
2053 /* req time field(s) overflow - return immediately */
2054 nfs_rw_exit(&drp->r_rwlock);
2055 return (error);
2056 }
2057
2058 douprintf = 1;
2059
2060 t = gethrtime();
2061
2062 error = rfs2call(VTOMI(dvp), RFS_CREATE,
2063 xdr_creatargs, (caddr_t)&args,
2064 xdr_diropres, (caddr_t)&dr, cr,
2065 &douprintf, &dr.dr_status, 0, NULL);
2066
2067 PURGE_ATTRCACHE(dvp); /* mod time changed */
2068
2069 if (!error) {
2070 error = geterrno(dr.dr_status);
2071 if (!error) {
2072 if (HAVE_RDDIR_CACHE(drp))
2073 nfs_purge_rddir_cache(dvp);
2074 vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2075 dvp->v_vfsp, t, cr, NULL, NULL);
2076 /*
2077 * If NFS_ACL is supported on the server, then the
2078 * attributes returned by the server may have minimal
2079 * permissions, sometimes denying access to users having
2080 * proper access. To get the proper attributes, mark
2081 * the attributes as expired so that they will be
2082 * re-fetched via the NFS_ACL GETATTR2 procedure.
2083 */
2084 if (VTOMI(vp)->mi_flags & MI_ACL) {
2085 PURGE_ATTRCACHE(vp);
2086 }
2087 dnlc_update(dvp, nm, vp);
2088 rp = VTOR(vp);
2089 if (vattr.va_size == 0) {
2090 mutex_enter(&rp->r_statelock);
2091 rp->r_size = 0;
2092 mutex_exit(&rp->r_statelock);
2093 if (vn_has_cached_data(vp)) {
2094 ASSERT(vp->v_type != VCHR);
2095 nfs_invalidate_pages(vp,
2096 (u_offset_t)0, cr);
2097 }
2098 }
2099
2100 /*
2101 * Make sure the gid was set correctly.
2102 * If not, try to set it (but don't lose
2103 * any sleep over it).
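* The result of that nfssetattr() is deliberately discarded (note the
* (void) cast below); a wrong group id is not treated as a failure of
* the create itself.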
2104 */ 2105 if (vattr.va_gid != rp->r_attr.va_gid) { 2106 vattr.va_mask = AT_GID; 2107 (void) nfssetattr(vp, &vattr, 0, cr); 2108 } 2109 2110 /* 2111 * If vnode is a device create special vnode 2112 */ 2113 if (IS_DEVVP(vp)) { 2114 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2115 VN_RELE(vp); 2116 } else 2117 *vpp = vp; 2118 } else { 2119 PURGE_STALE_FH(error, dvp, cr); 2120 } 2121 } 2122 2123 nfs_rw_exit(&drp->r_rwlock); 2124 2125 return (error); 2126 } 2127 2128 /* 2129 * Weirdness: if the vnode to be removed is open 2130 * we rename it instead of removing it and nfs_inactive 2131 * will remove the new name. 2132 */ 2133 static int 2134 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr) 2135 { 2136 int error; 2137 struct nfsdiropargs da; 2138 enum nfsstat status; 2139 vnode_t *vp; 2140 char *tmpname; 2141 int douprintf; 2142 rnode_t *rp; 2143 rnode_t *drp; 2144 2145 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2146 return (EPERM); 2147 drp = VTOR(dvp); 2148 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2149 return (EINTR); 2150 2151 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2152 if (error) { 2153 nfs_rw_exit(&drp->r_rwlock); 2154 return (error); 2155 } 2156 2157 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2158 VN_RELE(vp); 2159 nfs_rw_exit(&drp->r_rwlock); 2160 return (EPERM); 2161 } 2162 2163 /* 2164 * First just remove the entry from the name cache, as it 2165 * is most likely the only entry for this vp. 2166 */ 2167 dnlc_remove(dvp, nm); 2168 2169 /* 2170 * If the file has a v_count > 1 then there may be more than one 2171 * entry in the name cache due multiple links or an open file, 2172 * but we don't have the real reference count so flush all 2173 * possible entries. 2174 */ 2175 if (vp->v_count > 1) 2176 dnlc_purge_vp(vp); 2177 2178 /* 2179 * Now we have the real reference count on the vnode 2180 */ 2181 rp = VTOR(vp); 2182 mutex_enter(&rp->r_statelock); 2183 if (vp->v_count > 1 && 2184 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2185 mutex_exit(&rp->r_statelock); 2186 tmpname = newname(); 2187 error = nfsrename(dvp, nm, dvp, tmpname, cr); 2188 if (error) 2189 kmem_free(tmpname, MAXNAMELEN); 2190 else { 2191 mutex_enter(&rp->r_statelock); 2192 if (rp->r_unldvp == NULL) { 2193 VN_HOLD(dvp); 2194 rp->r_unldvp = dvp; 2195 if (rp->r_unlcred != NULL) 2196 crfree(rp->r_unlcred); 2197 crhold(cr); 2198 rp->r_unlcred = cr; 2199 rp->r_unlname = tmpname; 2200 } else { 2201 kmem_free(rp->r_unlname, MAXNAMELEN); 2202 rp->r_unlname = tmpname; 2203 } 2204 mutex_exit(&rp->r_statelock); 2205 } 2206 } else { 2207 mutex_exit(&rp->r_statelock); 2208 /* 2209 * We need to flush any dirty pages which happen to 2210 * be hanging around before removing the file. This 2211 * shouldn't happen very often and mostly on file 2212 * systems mounted "nocto". 2213 */ 2214 if (vn_has_cached_data(vp) && 2215 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2216 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr); 2217 if (error && (error == ENOSPC || error == EDQUOT)) { 2218 mutex_enter(&rp->r_statelock); 2219 if (!rp->r_error) 2220 rp->r_error = error; 2221 mutex_exit(&rp->r_statelock); 2222 } 2223 } 2224 2225 setdiropargs(&da, nm, dvp); 2226 2227 douprintf = 1; 2228 2229 error = rfs2call(VTOMI(dvp), RFS_REMOVE, 2230 xdr_diropargs, (caddr_t)&da, 2231 xdr_enum, (caddr_t)&status, cr, 2232 &douprintf, &status, 0, NULL); 2233 2234 /* 2235 * The xattr dir may be gone after last attr is removed, 2236 * so flush it from dnlc. 
2237 */ 2238 if (dvp->v_flag & V_XATTRDIR) 2239 dnlc_purge_vp(dvp); 2240 2241 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2242 PURGE_ATTRCACHE(vp); /* link count changed */ 2243 2244 if (!error) { 2245 error = geterrno(status); 2246 if (!error) { 2247 if (HAVE_RDDIR_CACHE(drp)) 2248 nfs_purge_rddir_cache(dvp); 2249 } else { 2250 PURGE_STALE_FH(error, dvp, cr); 2251 } 2252 } 2253 } 2254 2255 if (error == 0) { 2256 vnevent_remove(vp, dvp, nm); 2257 } 2258 VN_RELE(vp); 2259 2260 nfs_rw_exit(&drp->r_rwlock); 2261 2262 return (error); 2263 } 2264 2265 static int 2266 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 2267 { 2268 int error; 2269 struct nfslinkargs args; 2270 enum nfsstat status; 2271 vnode_t *realvp; 2272 int douprintf; 2273 rnode_t *tdrp; 2274 2275 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2276 return (EPERM); 2277 if (VOP_REALVP(svp, &realvp) == 0) 2278 svp = realvp; 2279 2280 args.la_from = VTOFH(svp); 2281 setdiropargs(&args.la_to, tnm, tdvp); 2282 2283 tdrp = VTOR(tdvp); 2284 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2285 return (EINTR); 2286 2287 dnlc_remove(tdvp, tnm); 2288 2289 douprintf = 1; 2290 2291 error = rfs2call(VTOMI(svp), RFS_LINK, 2292 xdr_linkargs, (caddr_t)&args, 2293 xdr_enum, (caddr_t)&status, cr, 2294 &douprintf, &status, 0, NULL); 2295 2296 PURGE_ATTRCACHE(tdvp); /* mod time changed */ 2297 PURGE_ATTRCACHE(svp); /* link count changed */ 2298 2299 if (!error) { 2300 error = geterrno(status); 2301 if (!error) { 2302 if (HAVE_RDDIR_CACHE(tdrp)) 2303 nfs_purge_rddir_cache(tdvp); 2304 } 2305 } 2306 2307 nfs_rw_exit(&tdrp->r_rwlock); 2308 2309 if (!error) { 2310 /* 2311 * Notify the source file of this link operation. 2312 */ 2313 vnevent_link(svp); 2314 } 2315 return (error); 2316 } 2317 2318 static int 2319 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2320 { 2321 vnode_t *realvp; 2322 2323 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2324 return (EPERM); 2325 if (VOP_REALVP(ndvp, &realvp) == 0) 2326 ndvp = realvp; 2327 2328 return (nfsrename(odvp, onm, ndvp, nnm, cr)); 2329 } 2330 2331 /* 2332 * nfsrename does the real work of renaming in NFS Version 2. 2333 */ 2334 static int 2335 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2336 { 2337 int error; 2338 enum nfsstat status; 2339 struct nfsrnmargs args; 2340 int douprintf; 2341 vnode_t *nvp = NULL; 2342 vnode_t *ovp = NULL; 2343 char *tmpname; 2344 rnode_t *rp; 2345 rnode_t *odrp; 2346 rnode_t *ndrp; 2347 2348 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2349 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2350 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2351 return (EINVAL); 2352 2353 odrp = VTOR(odvp); 2354 ndrp = VTOR(ndvp); 2355 if ((intptr_t)odrp < (intptr_t)ndrp) { 2356 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2357 return (EINTR); 2358 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2359 nfs_rw_exit(&odrp->r_rwlock); 2360 return (EINTR); 2361 } 2362 } else { 2363 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2364 return (EINTR); 2365 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2366 nfs_rw_exit(&ndrp->r_rwlock); 2367 return (EINTR); 2368 } 2369 } 2370 2371 /* 2372 * Lookup the target file. If it exists, it needs to be 2373 * checked to see whether it is a mount point and whether 2374 * it is active (open). 
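*
* A summary of the handling below: a target which turns out to be a
* mount point causes the rename to fail with EBUSY, and an open,
* non-directory target is first linked to a temporary name so that its
* data stays reachable (the "unlink-open-file" semantics).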
2375 */ 2376 error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 2377 if (!error) { 2378 /* 2379 * If this file has been mounted on, then just 2380 * return busy because renaming to it would remove 2381 * the mounted file system from the name space. 2382 */ 2383 if (vn_mountedvfs(nvp) != NULL) { 2384 VN_RELE(nvp); 2385 nfs_rw_exit(&odrp->r_rwlock); 2386 nfs_rw_exit(&ndrp->r_rwlock); 2387 return (EBUSY); 2388 } 2389 2390 /* 2391 * Purge the name cache of all references to this vnode 2392 * so that we can check the reference count to infer 2393 * whether it is active or not. 2394 */ 2395 /* 2396 * First just remove the entry from the name cache, as it 2397 * is most likely the only entry for this vp. 2398 */ 2399 dnlc_remove(ndvp, nnm); 2400 /* 2401 * If the file has a v_count > 1 then there may be more 2402 * than one entry in the name cache due multiple links 2403 * or an open file, but we don't have the real reference 2404 * count so flush all possible entries. 2405 */ 2406 if (nvp->v_count > 1) 2407 dnlc_purge_vp(nvp); 2408 2409 /* 2410 * If the vnode is active and is not a directory, 2411 * arrange to rename it to a 2412 * temporary file so that it will continue to be 2413 * accessible. This implements the "unlink-open-file" 2414 * semantics for the target of a rename operation. 2415 * Before doing this though, make sure that the 2416 * source and target files are not already the same. 2417 */ 2418 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 2419 /* 2420 * Lookup the source name. 2421 */ 2422 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, 2423 cr, 0); 2424 2425 /* 2426 * The source name *should* already exist. 2427 */ 2428 if (error) { 2429 VN_RELE(nvp); 2430 nfs_rw_exit(&odrp->r_rwlock); 2431 nfs_rw_exit(&ndrp->r_rwlock); 2432 return (error); 2433 } 2434 2435 /* 2436 * Compare the two vnodes. If they are the same, 2437 * just release all held vnodes and return success. 2438 */ 2439 if (ovp == nvp) { 2440 VN_RELE(ovp); 2441 VN_RELE(nvp); 2442 nfs_rw_exit(&odrp->r_rwlock); 2443 nfs_rw_exit(&ndrp->r_rwlock); 2444 return (0); 2445 } 2446 2447 /* 2448 * Can't mix and match directories and non- 2449 * directories in rename operations. We already 2450 * know that the target is not a directory. If 2451 * the source is a directory, return an error. 2452 */ 2453 if (ovp->v_type == VDIR) { 2454 VN_RELE(ovp); 2455 VN_RELE(nvp); 2456 nfs_rw_exit(&odrp->r_rwlock); 2457 nfs_rw_exit(&ndrp->r_rwlock); 2458 return (ENOTDIR); 2459 } 2460 2461 /* 2462 * The target file exists, is not the same as 2463 * the source file, and is active. Link it 2464 * to a temporary filename to avoid having 2465 * the server removing the file completely. 
2466 */ 2467 tmpname = newname(); 2468 error = nfs_link(ndvp, nvp, tmpname, cr); 2469 if (error == EOPNOTSUPP) { 2470 error = nfs_rename(ndvp, nnm, ndvp, tmpname, 2471 cr); 2472 } 2473 if (error) { 2474 kmem_free(tmpname, MAXNAMELEN); 2475 VN_RELE(ovp); 2476 VN_RELE(nvp); 2477 nfs_rw_exit(&odrp->r_rwlock); 2478 nfs_rw_exit(&ndrp->r_rwlock); 2479 return (error); 2480 } 2481 rp = VTOR(nvp); 2482 mutex_enter(&rp->r_statelock); 2483 if (rp->r_unldvp == NULL) { 2484 VN_HOLD(ndvp); 2485 rp->r_unldvp = ndvp; 2486 if (rp->r_unlcred != NULL) 2487 crfree(rp->r_unlcred); 2488 crhold(cr); 2489 rp->r_unlcred = cr; 2490 rp->r_unlname = tmpname; 2491 } else { 2492 kmem_free(rp->r_unlname, MAXNAMELEN); 2493 rp->r_unlname = tmpname; 2494 } 2495 mutex_exit(&rp->r_statelock); 2496 } 2497 } 2498 2499 if (ovp == NULL) { 2500 /* 2501 * When renaming directories to be a subdirectory of a 2502 * different parent, the dnlc entry for ".." will no 2503 * longer be valid, so it must be removed. 2504 * 2505 * We do a lookup here to determine whether we are renaming 2506 * a directory and we need to check if we are renaming 2507 * an unlinked file. This might have already been done 2508 * in previous code, so we check ovp == NULL to avoid 2509 * doing it twice. 2510 */ 2511 2512 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 2513 2514 /* 2515 * The source name *should* already exist. 2516 */ 2517 if (error) { 2518 nfs_rw_exit(&odrp->r_rwlock); 2519 nfs_rw_exit(&ndrp->r_rwlock); 2520 if (nvp) { 2521 VN_RELE(nvp); 2522 } 2523 return (error); 2524 } 2525 ASSERT(ovp != NULL); 2526 } 2527 2528 dnlc_remove(odvp, onm); 2529 dnlc_remove(ndvp, nnm); 2530 2531 setdiropargs(&args.rna_from, onm, odvp); 2532 setdiropargs(&args.rna_to, nnm, ndvp); 2533 2534 douprintf = 1; 2535 2536 error = rfs2call(VTOMI(odvp), RFS_RENAME, 2537 xdr_rnmargs, (caddr_t)&args, 2538 xdr_enum, (caddr_t)&status, cr, 2539 &douprintf, &status, 0, NULL); 2540 2541 PURGE_ATTRCACHE(odvp); /* mod time changed */ 2542 PURGE_ATTRCACHE(ndvp); /* mod time changed */ 2543 2544 if (!error) { 2545 error = geterrno(status); 2546 if (!error) { 2547 if (HAVE_RDDIR_CACHE(odrp)) 2548 nfs_purge_rddir_cache(odvp); 2549 if (HAVE_RDDIR_CACHE(ndrp)) 2550 nfs_purge_rddir_cache(ndvp); 2551 /* 2552 * when renaming directories to be a subdirectory of a 2553 * different parent, the dnlc entry for ".." will no 2554 * longer be valid, so it must be removed 2555 */ 2556 rp = VTOR(ovp); 2557 if (ndvp != odvp) { 2558 if (ovp->v_type == VDIR) { 2559 dnlc_remove(ovp, ".."); 2560 if (HAVE_RDDIR_CACHE(rp)) 2561 nfs_purge_rddir_cache(ovp); 2562 } 2563 } 2564 2565 /* 2566 * If we are renaming the unlinked file, update the 2567 * r_unldvp and r_unlname as needed. 2568 */ 2569 mutex_enter(&rp->r_statelock); 2570 if (rp->r_unldvp != NULL) { 2571 if (strcmp(rp->r_unlname, onm) == 0) { 2572 (void) strncpy(rp->r_unlname, 2573 nnm, MAXNAMELEN); 2574 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 2575 2576 if (ndvp != rp->r_unldvp) { 2577 VN_RELE(rp->r_unldvp); 2578 rp->r_unldvp = ndvp; 2579 VN_HOLD(ndvp); 2580 } 2581 } 2582 } 2583 mutex_exit(&rp->r_statelock); 2584 } else { 2585 /* 2586 * System V defines rename to return EEXIST, not 2587 * ENOTEMPTY if the target directory is not empty. 2588 * Over the wire, the error is NFSERR_ENOTEMPTY 2589 * which geterrno maps to ENOTEMPTY. 
2590 */ 2591 if (error == ENOTEMPTY) 2592 error = EEXIST; 2593 } 2594 } 2595 2596 if (error == 0) { 2597 if (nvp) 2598 vnevent_rename_dest(nvp, ndvp, nnm); 2599 2600 if (odvp != ndvp) 2601 vnevent_rename_dest_dir(ndvp); 2602 2603 ASSERT(ovp != NULL); 2604 vnevent_rename_src(ovp, odvp, onm); 2605 } 2606 2607 if (nvp) { 2608 VN_RELE(nvp); 2609 } 2610 VN_RELE(ovp); 2611 2612 nfs_rw_exit(&odrp->r_rwlock); 2613 nfs_rw_exit(&ndrp->r_rwlock); 2614 2615 return (error); 2616 } 2617 2618 static int 2619 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 2620 { 2621 int error; 2622 struct nfscreatargs args; 2623 struct nfsdiropres dr; 2624 int douprintf; 2625 rnode_t *drp; 2626 hrtime_t t; 2627 2628 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2629 return (EPERM); 2630 2631 setdiropargs(&args.ca_da, nm, dvp); 2632 2633 /* 2634 * Decide what the group-id and set-gid bit of the created directory 2635 * should be. May have to do a setattr to get the gid right. 2636 */ 2637 error = setdirgid(dvp, &va->va_gid, cr); 2638 if (error) 2639 return (error); 2640 error = setdirmode(dvp, &va->va_mode, cr); 2641 if (error) 2642 return (error); 2643 va->va_mask |= AT_MODE|AT_GID; 2644 2645 args.ca_sa = &args.ca_sa_buf; 2646 error = vattr_to_sattr(va, args.ca_sa); 2647 if (error) { 2648 /* req time field(s) overflow - return immediately */ 2649 return (error); 2650 } 2651 2652 drp = VTOR(dvp); 2653 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2654 return (EINTR); 2655 2656 dnlc_remove(dvp, nm); 2657 2658 douprintf = 1; 2659 2660 t = gethrtime(); 2661 2662 error = rfs2call(VTOMI(dvp), RFS_MKDIR, 2663 xdr_creatargs, (caddr_t)&args, 2664 xdr_diropres, (caddr_t)&dr, cr, 2665 &douprintf, &dr.dr_status, 0, NULL); 2666 2667 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2668 2669 if (!error) { 2670 error = geterrno(dr.dr_status); 2671 if (!error) { 2672 if (HAVE_RDDIR_CACHE(drp)) 2673 nfs_purge_rddir_cache(dvp); 2674 /* 2675 * The attributes returned by RFS_MKDIR can not 2676 * be depended upon, so mark the attribute cache 2677 * as purged. A subsequent GETATTR will get the 2678 * correct attributes from the server. 2679 */ 2680 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2681 dvp->v_vfsp, t, cr, NULL, NULL); 2682 PURGE_ATTRCACHE(*vpp); 2683 dnlc_update(dvp, nm, *vpp); 2684 2685 /* 2686 * Make sure the gid was set correctly. 2687 * If not, try to set it (but don't lose 2688 * any sleep over it). 2689 */ 2690 if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) { 2691 va->va_mask = AT_GID; 2692 (void) nfssetattr(*vpp, va, 0, cr); 2693 } 2694 } else { 2695 PURGE_STALE_FH(error, dvp, cr); 2696 } 2697 } 2698 2699 nfs_rw_exit(&drp->r_rwlock); 2700 2701 return (error); 2702 } 2703 2704 static int 2705 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 2706 { 2707 int error; 2708 enum nfsstat status; 2709 struct nfsdiropargs da; 2710 vnode_t *vp; 2711 int douprintf; 2712 rnode_t *drp; 2713 2714 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2715 return (EPERM); 2716 drp = VTOR(dvp); 2717 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2718 return (EINTR); 2719 2720 /* 2721 * Attempt to prevent a rmdir(".") from succeeding. 
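* This is done by looking up the name and comparing the resulting
* vnode against cdir, the caller's current directory; if they are the
* same vnode, EINVAL is returned before any RMDIR request is sent to
* the server.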
2722 */
2723 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2724 if (error) {
2725 nfs_rw_exit(&drp->r_rwlock);
2726 return (error);
2727 }
2728
2729 if (vp == cdir) {
2730 VN_RELE(vp);
2731 nfs_rw_exit(&drp->r_rwlock);
2732 return (EINVAL);
2733 }
2734
2735 setdiropargs(&da, nm, dvp);
2736
2737 /*
2738 * First just remove the entry from the name cache, as it
2739 * is most likely an entry for this vp.
2740 */
2741 dnlc_remove(dvp, nm);
2742
2743 /*
2744 * If the vnode reference count is greater than one, then
2745 * there may be additional references in the DNLC which will
2746 * need to be purged. First, try removing the entry for
2747 * the parent directory and see if that removes the additional
2748 * reference(s). If that doesn't do it, then use dnlc_purge_vp
2749 * to completely remove any references to the directory which
2750 * might still exist in the DNLC.
2751 */
2752 if (vp->v_count > 1) {
2753 dnlc_remove(vp, "..");
2754 if (vp->v_count > 1)
2755 dnlc_purge_vp(vp);
2756 }
2757
2758 douprintf = 1;
2759
2760 error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2761 xdr_diropargs, (caddr_t)&da,
2762 xdr_enum, (caddr_t)&status, cr,
2763 &douprintf, &status, 0, NULL);
2764
2765 PURGE_ATTRCACHE(dvp); /* mod time changed */
2766
2767 if (error) {
2768 VN_RELE(vp);
2769 nfs_rw_exit(&drp->r_rwlock);
2770 return (error);
2771 }
2772
2773 error = geterrno(status);
2774 if (!error) {
2775 if (HAVE_RDDIR_CACHE(drp))
2776 nfs_purge_rddir_cache(dvp);
2777 if (HAVE_RDDIR_CACHE(VTOR(vp)))
2778 nfs_purge_rddir_cache(vp);
2779 } else {
2780 PURGE_STALE_FH(error, dvp, cr);
2781 /*
2782 * System V defines rmdir to return EEXIST, not
2783 * ENOTEMPTY if the directory is not empty. Over
2784 * the wire, the error is NFSERR_ENOTEMPTY which
2785 * geterrno maps to ENOTEMPTY.
2786 */ 2787 if (error == ENOTEMPTY) 2788 error = EEXIST; 2789 } 2790 2791 if (error == 0) { 2792 vnevent_rmdir(vp, dvp, nm); 2793 } 2794 VN_RELE(vp); 2795 2796 nfs_rw_exit(&drp->r_rwlock); 2797 2798 return (error); 2799 } 2800 2801 static int 2802 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 2803 { 2804 int error; 2805 struct nfsslargs args; 2806 enum nfsstat status; 2807 int douprintf; 2808 rnode_t *drp; 2809 2810 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2811 return (EPERM); 2812 setdiropargs(&args.sla_from, lnm, dvp); 2813 args.sla_sa = &args.sla_sa_buf; 2814 error = vattr_to_sattr(tva, args.sla_sa); 2815 if (error) { 2816 /* req time field(s) overflow - return immediately */ 2817 return (error); 2818 } 2819 args.sla_tnm = tnm; 2820 2821 drp = VTOR(dvp); 2822 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2823 return (EINTR); 2824 2825 dnlc_remove(dvp, lnm); 2826 2827 douprintf = 1; 2828 2829 error = rfs2call(VTOMI(dvp), RFS_SYMLINK, 2830 xdr_slargs, (caddr_t)&args, 2831 xdr_enum, (caddr_t)&status, cr, 2832 &douprintf, &status, 0, NULL); 2833 2834 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2835 2836 if (!error) { 2837 error = geterrno(status); 2838 if (!error) { 2839 if (HAVE_RDDIR_CACHE(drp)) 2840 nfs_purge_rddir_cache(dvp); 2841 } else { 2842 PURGE_STALE_FH(error, dvp, cr); 2843 } 2844 } 2845 2846 nfs_rw_exit(&drp->r_rwlock); 2847 2848 return (error); 2849 } 2850 2851 #ifdef DEBUG 2852 static int nfs_readdir_cache_hits = 0; 2853 static int nfs_readdir_cache_shorts = 0; 2854 static int nfs_readdir_cache_waits = 0; 2855 static int nfs_readdir_cache_misses = 0; 2856 static int nfs_readdir_readahead = 0; 2857 #endif 2858 2859 static int nfs_shrinkreaddir = 0; 2860 2861 /* 2862 * Read directory entries. 2863 * There are some weird things to look out for here. The uio_offset 2864 * field is either 0 or it is the offset returned from a previous 2865 * readdir. It is an opaque value used by the server to find the 2866 * correct directory block to read. The count field is the number 2867 * of blocks to read on the server. This is advisory only, the server 2868 * may return only one block's worth of entries. Entries may be compressed 2869 * on the server. 2870 */ 2871 static int 2872 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 2873 { 2874 int error; 2875 size_t count; 2876 rnode_t *rp; 2877 rddir_cache *rdc; 2878 rddir_cache *nrdc; 2879 rddir_cache *rrdc; 2880 #ifdef DEBUG 2881 int missed; 2882 #endif 2883 rddir_cache srdc; 2884 avl_index_t where; 2885 2886 rp = VTOR(vp); 2887 2888 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2889 if (nfs_zone() != VTOMI(vp)->mi_zone) 2890 return (EIO); 2891 /* 2892 * Make sure that the directory cache is valid. 2893 */ 2894 if (HAVE_RDDIR_CACHE(rp)) { 2895 if (nfs_disable_rddir_cache) { 2896 /* 2897 * Setting nfs_disable_rddir_cache in /etc/system 2898 * allows interoperability with servers that do not 2899 * properly update the attributes of directories. 2900 * Any cached information gets purged before an 2901 * access is made to it. 2902 */ 2903 nfs_purge_rddir_cache(vp); 2904 } else { 2905 error = nfs_validate_caches(vp, cr); 2906 if (error) 2907 return (error); 2908 } 2909 } 2910 2911 /* 2912 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an 2913 * RFS_READDIR request with rda_count set to more than 0x400. So 2914 * we reduce the request size here purely for compatibility. 2915 * 2916 * In general, this is no longer required. 
However, if a server
2917 * is discovered which can not handle requests larger than 1024,
2918 * nfs_shrinkreaddir can be set to 1 to enable this backwards
2919 * compatibility.
2920 *
2921 * In any case, the request size is limited to NFS_MAXDATA bytes.
2922 */
2923 count = MIN(uiop->uio_iov->iov_len,
2924 nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
2925
2926 nrdc = NULL;
2927 #ifdef DEBUG
2928 missed = 0;
2929 #endif
2930 top:
2931 /*
2932 * Short circuit last readdir which always returns 0 bytes.
2933 * This can be done after the directory has been read through
2934 * completely at least once. This will set r_direof which
2935 * can be used to find the value of the last cookie.
2936 */
2937 mutex_enter(&rp->r_statelock);
2938 if (rp->r_direof != NULL &&
2939 uiop->uio_offset == rp->r_direof->nfs_ncookie) {
2940 mutex_exit(&rp->r_statelock);
2941 #ifdef DEBUG
2942 nfs_readdir_cache_shorts++;
2943 #endif
2944 if (eofp)
2945 *eofp = 1;
2946 if (nrdc != NULL)
2947 rddir_cache_rele(nrdc);
2948 return (0);
2949 }
2950 /*
2951 * Look for a cache entry. Cache entries are identified
2952 * by the NFS cookie value and the byte count requested.
2953 */
2954 srdc.nfs_cookie = uiop->uio_offset;
2955 srdc.buflen = count;
2956 rdc = avl_find(&rp->r_dir, &srdc, &where);
2957 if (rdc != NULL) {
2958 rddir_cache_hold(rdc);
2959 /*
2960 * If the cache entry is in the process of being
2961 * filled in, wait until this completes. The
2962 * RDDIRWAIT bit is set to indicate that someone
2963 * is waiting; when the thread currently
2964 * filling the entry is done, it should do a
2965 * cv_broadcast to wake up all of the threads
2966 * waiting for it to finish.
2967 */
2968 if (rdc->flags & RDDIR) {
2969 nfs_rw_exit(&rp->r_rwlock);
2970 rdc->flags |= RDDIRWAIT;
2971 #ifdef DEBUG
2972 nfs_readdir_cache_waits++;
2973 #endif
2974 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2975 /*
2976 * We got interrupted, probably
2977 * the user typed ^C or an alarm
2978 * fired. We free the new entry
2979 * if we allocated one.
2980 */
2981 mutex_exit(&rp->r_statelock);
2982 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2983 RW_READER, FALSE);
2984 rddir_cache_rele(rdc);
2985 if (nrdc != NULL)
2986 rddir_cache_rele(nrdc);
2987 return (EINTR);
2988 }
2989 mutex_exit(&rp->r_statelock);
2990 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2991 RW_READER, FALSE);
2992 rddir_cache_rele(rdc);
2993 goto top;
2994 }
2995 /*
2996 * Check to see if a readdir is required to
2997 * fill the entry. If so, mark this entry
2998 * as being filled, remove our reference,
2999 * and branch to the code to fill the entry.
3000 */
3001 if (rdc->flags & RDDIRREQ) {
3002 rdc->flags &= ~RDDIRREQ;
3003 rdc->flags |= RDDIR;
3004 if (nrdc != NULL)
3005 rddir_cache_rele(nrdc);
3006 nrdc = rdc;
3007 mutex_exit(&rp->r_statelock);
3008 goto bottom;
3009 }
3010 #ifdef DEBUG
3011 if (!missed)
3012 nfs_readdir_cache_hits++;
3013 #endif
3014 /*
3015 * If an error occurred while attempting
3016 * to fill the cache entry, just return it.
3017 */
3018 if (rdc->error) {
3019 error = rdc->error;
3020 mutex_exit(&rp->r_statelock);
3021 rddir_cache_rele(rdc);
3022 if (nrdc != NULL)
3023 rddir_cache_rele(nrdc);
3024 return (error);
3025 }
3026
3027 /*
3028 * The cache entry is complete and good,
3029 * copyout the dirent structs to the calling
3030 * thread.
3031 */ 3032 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 3033 3034 /* 3035 * If no error occurred during the copyout, 3036 * update the offset in the uio struct to 3037 * contain the value of the next cookie 3038 * and set the eof value appropriately. 3039 */ 3040 if (!error) { 3041 uiop->uio_offset = rdc->nfs_ncookie; 3042 if (eofp) 3043 *eofp = rdc->eof; 3044 } 3045 3046 /* 3047 * Decide whether to do readahead. Don't if 3048 * have already read to the end of directory. 3049 */ 3050 if (rdc->eof) { 3051 rp->r_direof = rdc; 3052 mutex_exit(&rp->r_statelock); 3053 rddir_cache_rele(rdc); 3054 if (nrdc != NULL) 3055 rddir_cache_rele(nrdc); 3056 return (error); 3057 } 3058 3059 /* 3060 * Check to see whether we found an entry 3061 * for the readahead. If so, we don't need 3062 * to do anything further, so free the new 3063 * entry if one was allocated. Otherwise, 3064 * allocate a new entry, add it to the cache, 3065 * and then initiate an asynchronous readdir 3066 * operation to fill it. 3067 */ 3068 srdc.nfs_cookie = rdc->nfs_ncookie; 3069 srdc.buflen = count; 3070 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3071 if (rrdc != NULL) { 3072 if (nrdc != NULL) 3073 rddir_cache_rele(nrdc); 3074 } else { 3075 if (nrdc != NULL) 3076 rrdc = nrdc; 3077 else { 3078 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3079 } 3080 if (rrdc != NULL) { 3081 rrdc->nfs_cookie = rdc->nfs_ncookie; 3082 rrdc->buflen = count; 3083 avl_insert(&rp->r_dir, rrdc, where); 3084 rddir_cache_hold(rrdc); 3085 mutex_exit(&rp->r_statelock); 3086 rddir_cache_rele(rdc); 3087 #ifdef DEBUG 3088 nfs_readdir_readahead++; 3089 #endif 3090 nfs_async_readdir(vp, rrdc, cr, nfsreaddir); 3091 return (error); 3092 } 3093 } 3094 3095 mutex_exit(&rp->r_statelock); 3096 rddir_cache_rele(rdc); 3097 return (error); 3098 } 3099 3100 /* 3101 * Didn't find an entry in the cache. Construct a new empty 3102 * entry and link it into the cache. Other processes attempting 3103 * to access this entry will need to wait until it is filled in. 3104 * 3105 * Since kmem_alloc may block, another pass through the cache 3106 * will need to be taken to make sure that another process 3107 * hasn't already added an entry to the cache for this request. 3108 */ 3109 if (nrdc == NULL) { 3110 mutex_exit(&rp->r_statelock); 3111 nrdc = rddir_cache_alloc(KM_SLEEP); 3112 nrdc->nfs_cookie = uiop->uio_offset; 3113 nrdc->buflen = count; 3114 goto top; 3115 } 3116 3117 /* 3118 * Add this entry to the cache. 3119 */ 3120 avl_insert(&rp->r_dir, nrdc, where); 3121 rddir_cache_hold(nrdc); 3122 mutex_exit(&rp->r_statelock); 3123 3124 bottom: 3125 #ifdef DEBUG 3126 missed = 1; 3127 nfs_readdir_cache_misses++; 3128 #endif 3129 /* 3130 * Do the readdir. 3131 */ 3132 error = nfsreaddir(vp, nrdc, cr); 3133 3134 /* 3135 * If this operation failed, just return the error which occurred. 3136 */ 3137 if (error != 0) 3138 return (error); 3139 3140 /* 3141 * Since the RPC operation will have taken sometime and blocked 3142 * this process, another pass through the cache will need to be 3143 * taken to find the correct cache entry. It is possible that 3144 * the correct cache entry will not be there (although one was 3145 * added) because the directory changed during the RPC operation 3146 * and the readdir cache was flushed. In this case, just start 3147 * over. It is hoped that this will not happen too often... 
:-) 3148 */ 3149 nrdc = NULL; 3150 goto top; 3151 /* NOTREACHED */ 3152 } 3153 3154 static int 3155 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3156 { 3157 int error; 3158 struct nfsrddirargs rda; 3159 struct nfsrddirres rd; 3160 rnode_t *rp; 3161 mntinfo_t *mi; 3162 uint_t count; 3163 int douprintf; 3164 failinfo_t fi, *fip; 3165 3166 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3167 count = rdc->buflen; 3168 3169 rp = VTOR(vp); 3170 mi = VTOMI(vp); 3171 3172 rda.rda_fh = *VTOFH(vp); 3173 rda.rda_offset = rdc->nfs_cookie; 3174 3175 /* 3176 * NFS client failover support 3177 * suppress failover unless we have a zero cookie 3178 */ 3179 if (rdc->nfs_cookie == (off_t)0) { 3180 fi.vp = vp; 3181 fi.fhp = (caddr_t)&rda.rda_fh; 3182 fi.copyproc = nfscopyfh; 3183 fi.lookupproc = nfslookup; 3184 fi.xattrdirproc = acl_getxattrdir2; 3185 fip = &fi; 3186 } else { 3187 fip = NULL; 3188 } 3189 3190 rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3191 rd.rd_size = count; 3192 rd.rd_offset = rda.rda_offset; 3193 3194 douprintf = 1; 3195 3196 if (mi->mi_io_kstats) { 3197 mutex_enter(&mi->mi_lock); 3198 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3199 mutex_exit(&mi->mi_lock); 3200 } 3201 3202 do { 3203 rda.rda_count = MIN(count, mi->mi_curread); 3204 error = rfs2call(mi, RFS_READDIR, 3205 xdr_rddirargs, (caddr_t)&rda, 3206 xdr_getrddirres, (caddr_t)&rd, cr, 3207 &douprintf, &rd.rd_status, 0, fip); 3208 } while (error == ENFS_TRYAGAIN); 3209 3210 if (mi->mi_io_kstats) { 3211 mutex_enter(&mi->mi_lock); 3212 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3213 mutex_exit(&mi->mi_lock); 3214 } 3215 3216 /* 3217 * Since we are actually doing a READDIR RPC, we must have 3218 * exclusive access to the cache entry being filled. Thus, 3219 * it is safe to update all fields except for the flags 3220 * field. The r_statelock in the rnode must be held to 3221 * prevent two different threads from simultaneously 3222 * attempting to update the flags field. This can happen 3223 * if we are turning off RDDIR and the other thread is 3224 * trying to set RDDIRWAIT. 3225 */ 3226 ASSERT(rdc->flags & RDDIR); 3227 if (!error) { 3228 error = geterrno(rd.rd_status); 3229 if (!error) { 3230 rdc->nfs_ncookie = rd.rd_offset; 3231 rdc->eof = rd.rd_eof ? 
1 : 0; 3232 rdc->entlen = rd.rd_size; 3233 ASSERT(rdc->entlen <= rdc->buflen); 3234 #ifdef DEBUG 3235 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, 3236 KM_SLEEP); 3237 #else 3238 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3239 #endif 3240 bcopy(rd.rd_entries, rdc->entries, rdc->entlen); 3241 rdc->error = 0; 3242 if (mi->mi_io_kstats) { 3243 mutex_enter(&mi->mi_lock); 3244 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3245 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 3246 rd.rd_size; 3247 mutex_exit(&mi->mi_lock); 3248 } 3249 } else { 3250 PURGE_STALE_FH(error, vp, cr); 3251 } 3252 } 3253 if (error) { 3254 rdc->entries = NULL; 3255 rdc->error = error; 3256 } 3257 kmem_free(rd.rd_entries, rdc->buflen); 3258 3259 mutex_enter(&rp->r_statelock); 3260 rdc->flags &= ~RDDIR; 3261 if (rdc->flags & RDDIRWAIT) { 3262 rdc->flags &= ~RDDIRWAIT; 3263 cv_broadcast(&rdc->cv); 3264 } 3265 if (error) 3266 rdc->flags |= RDDIRREQ; 3267 mutex_exit(&rp->r_statelock); 3268 3269 rddir_cache_rele(rdc); 3270 3271 return (error); 3272 } 3273 3274 #ifdef DEBUG 3275 static int nfs_bio_do_stop = 0; 3276 #endif 3277 3278 static int 3279 nfs_bio(struct buf *bp, cred_t *cr) 3280 { 3281 rnode_t *rp = VTOR(bp->b_vp); 3282 int count; 3283 int error; 3284 cred_t *cred; 3285 uint_t offset; 3286 3287 DTRACE_IO1(start, struct buf *, bp); 3288 3289 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 3290 offset = dbtob(bp->b_blkno); 3291 3292 if (bp->b_flags & B_READ) { 3293 mutex_enter(&rp->r_statelock); 3294 if (rp->r_cred != NULL) { 3295 cred = rp->r_cred; 3296 crhold(cred); 3297 } else { 3298 rp->r_cred = cr; 3299 crhold(cr); 3300 cred = cr; 3301 crhold(cred); 3302 } 3303 mutex_exit(&rp->r_statelock); 3304 read_again: 3305 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr, 3306 offset, bp->b_bcount, &bp->b_resid, cred); 3307 crfree(cred); 3308 if (!error) { 3309 if (bp->b_resid) { 3310 /* 3311 * Didn't get it all because we hit EOF, 3312 * zero all the memory beyond the EOF. 3313 */ 3314 /* bzero(rdaddr + */ 3315 bzero(bp->b_un.b_addr + 3316 bp->b_bcount - bp->b_resid, bp->b_resid); 3317 } 3318 mutex_enter(&rp->r_statelock); 3319 if (bp->b_resid == bp->b_bcount && 3320 offset >= rp->r_size) { 3321 /* 3322 * We didn't read anything at all as we are 3323 * past EOF. Return an error indicator back 3324 * but don't destroy the pages (yet). 
3325 */ 3326 error = NFS_EOF; 3327 } 3328 mutex_exit(&rp->r_statelock); 3329 } else if (error == EACCES) { 3330 mutex_enter(&rp->r_statelock); 3331 if (cred != cr) { 3332 if (rp->r_cred != NULL) 3333 crfree(rp->r_cred); 3334 rp->r_cred = cr; 3335 crhold(cr); 3336 cred = cr; 3337 crhold(cred); 3338 mutex_exit(&rp->r_statelock); 3339 goto read_again; 3340 } 3341 mutex_exit(&rp->r_statelock); 3342 } 3343 } else { 3344 if (!(rp->r_flags & RSTALE)) { 3345 mutex_enter(&rp->r_statelock); 3346 if (rp->r_cred != NULL) { 3347 cred = rp->r_cred; 3348 crhold(cred); 3349 } else { 3350 rp->r_cred = cr; 3351 crhold(cr); 3352 cred = cr; 3353 crhold(cred); 3354 } 3355 mutex_exit(&rp->r_statelock); 3356 write_again: 3357 mutex_enter(&rp->r_statelock); 3358 count = MIN(bp->b_bcount, rp->r_size - offset); 3359 mutex_exit(&rp->r_statelock); 3360 if (count < 0) 3361 cmn_err(CE_PANIC, "nfs_bio: write count < 0"); 3362 #ifdef DEBUG 3363 if (count == 0) { 3364 zcmn_err(getzoneid(), CE_WARN, 3365 "nfs_bio: zero length write at %d", 3366 offset); 3367 nfs_printfhandle(&rp->r_fh); 3368 if (nfs_bio_do_stop) 3369 debug_enter("nfs_bio"); 3370 } 3371 #endif 3372 error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset, 3373 count, cred); 3374 if (error == EACCES) { 3375 mutex_enter(&rp->r_statelock); 3376 if (cred != cr) { 3377 if (rp->r_cred != NULL) 3378 crfree(rp->r_cred); 3379 rp->r_cred = cr; 3380 crhold(cr); 3381 crfree(cred); 3382 cred = cr; 3383 crhold(cred); 3384 mutex_exit(&rp->r_statelock); 3385 goto write_again; 3386 } 3387 mutex_exit(&rp->r_statelock); 3388 } 3389 bp->b_error = error; 3390 if (error && error != EINTR) { 3391 /* 3392 * Don't print EDQUOT errors on the console. 3393 * Don't print asynchronous EACCES errors. 3394 * Don't print EFBIG errors. 3395 * Print all other write errors. 3396 */ 3397 if (error != EDQUOT && error != EFBIG && 3398 (error != EACCES || 3399 !(bp->b_flags & B_ASYNC))) 3400 nfs_write_error(bp->b_vp, error, cred); 3401 /* 3402 * Update r_error and r_flags as appropriate. 3403 * If the error was ESTALE, then mark the 3404 * rnode as not being writeable and save 3405 * the error status. Otherwise, save any 3406 * errors which occur from asynchronous 3407 * page invalidations. Any errors occurring 3408 * from other operations should be saved 3409 * by the caller. 
3410 */
3411 mutex_enter(&rp->r_statelock);
3412 if (error == ESTALE) {
3413 rp->r_flags |= RSTALE;
3414 if (!rp->r_error)
3415 rp->r_error = error;
3416 } else if (!rp->r_error &&
3417 (bp->b_flags &
3418 (B_INVAL|B_FORCE|B_ASYNC)) ==
3419 (B_INVAL|B_FORCE|B_ASYNC)) {
3420 rp->r_error = error;
3421 }
3422 mutex_exit(&rp->r_statelock);
3423 }
3424 crfree(cred);
3425 } else
3426 error = rp->r_error;
3427 }
3428
3429 if (error != 0 && error != NFS_EOF)
3430 bp->b_flags |= B_ERROR;
3431
3432 DTRACE_IO1(done, struct buf *, bp);
3433
3434 return (error);
3435 }
3436
3437 static int
3438 nfs_fid(vnode_t *vp, fid_t *fidp)
3439 {
3440 struct nfs_fid *fp;
3441 rnode_t *rp;
3442
3443 rp = VTOR(vp);
3444
3445 if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3446 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3447 return (ENOSPC);
3448 }
3449 fp = (struct nfs_fid *)fidp;
3450 fp->nf_pad = 0;
3451 fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3452 bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3453 return (0);
3454 }
3455
3456 /* ARGSUSED2 */
3457 static int
3458 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3459 {
3460 rnode_t *rp = VTOR(vp);
3461
3462 if (!write_lock) {
3463 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3464 return (V_WRITELOCK_FALSE);
3465 }
3466
3467 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3468 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3469 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3470 return (V_WRITELOCK_FALSE);
3471 nfs_rw_exit(&rp->r_rwlock);
3472 }
3473
3474 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3475 return (V_WRITELOCK_TRUE);
3476 }
3477
3478 /* ARGSUSED */
3479 static void
3480 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3481 {
3482 rnode_t *rp = VTOR(vp);
3483
3484 nfs_rw_exit(&rp->r_rwlock);
3485 }
3486
3487 /* ARGSUSED */
3488 static int
3489 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
3490 {
3491
3492 /*
3493 * Because we stuff the readdir cookie into the offset field,
3494 * someone may attempt to do an lseek with the cookie, which
3495 * we want to succeed.
3496 */
3497 if (vp->v_type == VDIR)
3498 return (0);
3499 if (*noffp < 0 || *noffp > MAXOFF32_T)
3500 return (EINVAL);
3501 return (0);
3502 }
3503
3504 /*
3505 * number of NFS_MAXDATA blocks to read ahead,
3506 * optimized for 100 base-T.
3507 */
3508 static int nfs_nra = 4;
3509
3510 #ifdef DEBUG
3511 static int nfs_lostpage = 0; /* number of times we lost original page */
3512 #endif
3513
3514 /*
3515 * Return all the pages from [off..off+len) in file
3516 */
3517 static int
3518 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3519 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3520 enum seg_rw rw, cred_t *cr)
3521 {
3522 rnode_t *rp;
3523 int error;
3524 mntinfo_t *mi;
3525
3526 if (vp->v_flag & VNOMAP)
3527 return (ENOSYS);
3528
3529 ASSERT(off <= MAXOFF32_T);
3530 if (nfs_zone() != VTOMI(vp)->mi_zone)
3531 return (EIO);
3532 if (protp != NULL)
3533 *protp = PROT_ALL;
3534
3535 /*
3536 * Now validate that the caches are up to date.
3537 */
3538 error = nfs_validate_caches(vp, cr);
3539 if (error)
3540 return (error);
3541
3542 rp = VTOR(vp);
3543 mi = VTOMI(vp);
3544 retry:
3545 mutex_enter(&rp->r_statelock);
3546
3547 /*
3548 * Don't create dirty pages faster than they
3549 * can be cleaned so that the system doesn't
3550 * get imbalanced.
If the async queue is 3551 * maxed out, then wait for it to drain before 3552 * creating more dirty pages. Also, wait for 3553 * any threads doing pagewalks in the vop_getattr 3554 * entry points so that they don't block for 3555 * long periods. 3556 */ 3557 if (rw == S_CREATE) { 3558 while ((mi->mi_max_threads != 0 && 3559 rp->r_awcount > 2 * mi->mi_max_threads) || 3560 rp->r_gcount > 0) 3561 cv_wait(&rp->r_cv, &rp->r_statelock); 3562 } 3563 3564 /* 3565 * If we are getting called as a side effect of an nfs_write() 3566 * operation the local file size might not be extended yet. 3567 * In this case we want to be able to return pages of zeroes. 3568 */ 3569 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 3570 mutex_exit(&rp->r_statelock); 3571 return (EFAULT); /* beyond EOF */ 3572 } 3573 3574 mutex_exit(&rp->r_statelock); 3575 3576 if (len <= PAGESIZE) { 3577 error = nfs_getapage(vp, off, len, protp, pl, plsz, 3578 seg, addr, rw, cr); 3579 } else { 3580 error = pvn_getpages(nfs_getapage, vp, off, len, protp, 3581 pl, plsz, seg, addr, rw, cr); 3582 } 3583 3584 switch (error) { 3585 case NFS_EOF: 3586 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 3587 goto retry; 3588 case ESTALE: 3589 PURGE_STALE_FH(error, vp, cr); 3590 } 3591 3592 return (error); 3593 } 3594 3595 /* 3596 * Called from pvn_getpages or nfs_getpage to get a particular page. 3597 */ 3598 /* ARGSUSED */ 3599 static int 3600 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 3601 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3602 enum seg_rw rw, cred_t *cr) 3603 { 3604 rnode_t *rp; 3605 uint_t bsize; 3606 struct buf *bp; 3607 page_t *pp; 3608 u_offset_t lbn; 3609 u_offset_t io_off; 3610 u_offset_t blkoff; 3611 u_offset_t rablkoff; 3612 size_t io_len; 3613 uint_t blksize; 3614 int error; 3615 int readahead; 3616 int readahead_issued = 0; 3617 int ra_window; /* readahead window */ 3618 page_t *pagefound; 3619 3620 if (nfs_zone() != VTOMI(vp)->mi_zone) 3621 return (EIO); 3622 rp = VTOR(vp); 3623 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3624 3625 reread: 3626 bp = NULL; 3627 pp = NULL; 3628 pagefound = NULL; 3629 3630 if (pl != NULL) 3631 pl[0] = NULL; 3632 3633 error = 0; 3634 lbn = off / bsize; 3635 blkoff = lbn * bsize; 3636 3637 /* 3638 * Queueing up the readahead before doing the synchronous read 3639 * results in a significant increase in read throughput because 3640 * of the increased parallelism between the async threads and 3641 * the process context. 3642 */ 3643 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 3644 rw != S_CREATE && 3645 !(vp->v_flag & VNOCACHE)) { 3646 mutex_enter(&rp->r_statelock); 3647 3648 /* 3649 * Calculate the number of readaheads to do. 3650 * a) No readaheads at offset = 0. 3651 * b) Do maximum(nfs_nra) readaheads when the readahead 3652 * window is closed. 3653 * c) Do readaheads between 1 to (nfs_nra - 1) depending 3654 * upon how far the readahead window is open or close. 3655 * d) No readaheads if rp->r_nextr is not within the scope 3656 * of the readahead window (random i/o). 
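*
* For example, with nfs_nra == 4: if rp->r_nextr equals blkoff (and
* off is not 0), the window is closed and all four readaheads are
* issued, while if rp->r_nextr is exactly one block ahead of blkoff,
* the window is open by one block and nfs_nra - 1 == 3 readaheads
* are issued.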
3657 */ 3658 3659 if (off == 0) 3660 readahead = 0; 3661 else if (blkoff == rp->r_nextr) 3662 readahead = nfs_nra; 3663 else if (rp->r_nextr > blkoff && 3664 ((ra_window = (rp->r_nextr - blkoff) / bsize) 3665 <= (nfs_nra - 1))) 3666 readahead = nfs_nra - ra_window; 3667 else 3668 readahead = 0; 3669 3670 rablkoff = rp->r_nextr; 3671 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 3672 mutex_exit(&rp->r_statelock); 3673 if (nfs_async_readahead(vp, rablkoff + bsize, 3674 addr + (rablkoff + bsize - off), seg, cr, 3675 nfs_readahead) < 0) { 3676 mutex_enter(&rp->r_statelock); 3677 break; 3678 } 3679 readahead--; 3680 rablkoff += bsize; 3681 /* 3682 * Indicate that we did a readahead so 3683 * readahead offset is not updated 3684 * by the synchronous read below. 3685 */ 3686 readahead_issued = 1; 3687 mutex_enter(&rp->r_statelock); 3688 /* 3689 * set readahead offset to 3690 * offset of last async readahead 3691 * request. 3692 */ 3693 rp->r_nextr = rablkoff; 3694 } 3695 mutex_exit(&rp->r_statelock); 3696 } 3697 3698 again: 3699 if ((pagefound = page_exists(vp, off)) == NULL) { 3700 if (pl == NULL) { 3701 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 3702 nfs_readahead); 3703 } else if (rw == S_CREATE) { 3704 /* 3705 * Block for this page is not allocated, or the offset 3706 * is beyond the current allocation size, or we're 3707 * allocating a swap slot and the page was not found, 3708 * so allocate it and return a zero page. 3709 */ 3710 if ((pp = page_create_va(vp, off, 3711 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 3712 cmn_err(CE_PANIC, "nfs_getapage: page_create"); 3713 io_len = PAGESIZE; 3714 mutex_enter(&rp->r_statelock); 3715 rp->r_nextr = off + PAGESIZE; 3716 mutex_exit(&rp->r_statelock); 3717 } else { 3718 /* 3719 * Need to go to server to get a BLOCK, exception to 3720 * that being while reading at offset = 0 or doing 3721 * random i/o, in that case read only a PAGE. 3722 */ 3723 mutex_enter(&rp->r_statelock); 3724 if (blkoff < rp->r_size && 3725 blkoff + bsize >= rp->r_size) { 3726 /* 3727 * If only a block or less is left in 3728 * the file, read all that is remaining. 3729 */ 3730 if (rp->r_size <= off) { 3731 /* 3732 * Trying to access beyond EOF, 3733 * set up to get at least one page. 3734 */ 3735 blksize = off + PAGESIZE - blkoff; 3736 } else 3737 blksize = rp->r_size - blkoff; 3738 } else if ((off == 0) || 3739 (off != rp->r_nextr && !readahead_issued)) { 3740 blksize = PAGESIZE; 3741 blkoff = off; /* block = page here */ 3742 } else 3743 blksize = bsize; 3744 mutex_exit(&rp->r_statelock); 3745 3746 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3747 &io_len, blkoff, blksize, 0); 3748 3749 /* 3750 * Some other thread has entered the page, 3751 * so just use it. 3752 */ 3753 if (pp == NULL) 3754 goto again; 3755 3756 /* 3757 * Now round the request size up to page boundaries. 3758 * This ensures that the entire page will be 3759 * initialized to zeroes if EOF is encountered. 3760 */ 3761 io_len = ptob(btopr(io_len)); 3762 3763 bp = pageio_setup(pp, io_len, vp, B_READ); 3764 ASSERT(bp != NULL); 3765 3766 /* 3767 * pageio_setup should have set b_addr to 0. This 3768 * is correct since we want to do I/O on a page 3769 * boundary. bp_mapin will use this addr to calculate 3770 * an offset, and then set b_addr to the kernel virtual 3771 * address it allocated for us. 
3772 */ 3773 ASSERT(bp->b_un.b_addr == 0); 3774 3775 bp->b_edev = 0; 3776 bp->b_dev = 0; 3777 bp->b_lblkno = lbtodb(io_off); 3778 bp->b_file = vp; 3779 bp->b_offset = (offset_t)off; 3780 bp_mapin(bp); 3781 3782 /* 3783 * If doing a write beyond what we believe is EOF, 3784 * don't bother trying to read the pages from the 3785 * server, we'll just zero the pages here. We 3786 * don't check that the rw flag is S_WRITE here 3787 * because some implementations may attempt a 3788 * read access to the buffer before copying data. 3789 */ 3790 mutex_enter(&rp->r_statelock); 3791 if (io_off >= rp->r_size && seg == segkmap) { 3792 mutex_exit(&rp->r_statelock); 3793 bzero(bp->b_un.b_addr, io_len); 3794 } else { 3795 mutex_exit(&rp->r_statelock); 3796 error = nfs_bio(bp, cr); 3797 } 3798 3799 /* 3800 * Unmap the buffer before freeing it. 3801 */ 3802 bp_mapout(bp); 3803 pageio_done(bp); 3804 3805 if (error == NFS_EOF) { 3806 /* 3807 * If doing a write system call just return 3808 * zeroed pages, else user tried to get pages 3809 * beyond EOF, return error. We don't check 3810 * that the rw flag is S_WRITE here because 3811 * some implementations may attempt a read 3812 * access to the buffer before copying data. 3813 */ 3814 if (seg == segkmap) 3815 error = 0; 3816 else 3817 error = EFAULT; 3818 } 3819 3820 if (!readahead_issued && !error) { 3821 mutex_enter(&rp->r_statelock); 3822 rp->r_nextr = io_off + io_len; 3823 mutex_exit(&rp->r_statelock); 3824 } 3825 } 3826 } 3827 3828 out: 3829 if (pl == NULL) 3830 return (error); 3831 3832 if (error) { 3833 if (pp != NULL) 3834 pvn_read_done(pp, B_ERROR); 3835 return (error); 3836 } 3837 3838 if (pagefound) { 3839 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 3840 3841 /* 3842 * Page exists in the cache, acquire the appropriate lock. 3843 * If this fails, start all over again. 3844 */ 3845 if ((pp = page_lookup(vp, off, se)) == NULL) { 3846 #ifdef DEBUG 3847 nfs_lostpage++; 3848 #endif 3849 goto reread; 3850 } 3851 pl[0] = pp; 3852 pl[1] = NULL; 3853 return (0); 3854 } 3855 3856 if (pp != NULL) 3857 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3858 3859 return (error); 3860 } 3861 3862 static void 3863 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 3864 cred_t *cr) 3865 { 3866 int error; 3867 page_t *pp; 3868 u_offset_t io_off; 3869 size_t io_len; 3870 struct buf *bp; 3871 uint_t bsize, blksize; 3872 rnode_t *rp = VTOR(vp); 3873 3874 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3875 3876 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3877 3878 mutex_enter(&rp->r_statelock); 3879 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 3880 /* 3881 * If less than a block left in file read less 3882 * than a block. 3883 */ 3884 blksize = rp->r_size - blkoff; 3885 } else 3886 blksize = bsize; 3887 mutex_exit(&rp->r_statelock); 3888 3889 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 3890 &io_off, &io_len, blkoff, blksize, 1); 3891 /* 3892 * The isra flag passed to the kluster function is 1, we may have 3893 * gotten a return value of NULL for a variety of reasons (# of free 3894 * pages < minfree, someone entered the page on the vnode etc). In all 3895 * cases, we want to punt on the readahead. 3896 */ 3897 if (pp == NULL) 3898 return; 3899 3900 /* 3901 * Now round the request size up to page boundaries. 3902 * This ensures that the entire page will be 3903 * initialized to zeroes if EOF is encountered. 
3904 */ 3905 io_len = ptob(btopr(io_len)); 3906 3907 bp = pageio_setup(pp, io_len, vp, B_READ); 3908 ASSERT(bp != NULL); 3909 3910 /* 3911 * pageio_setup should have set b_addr to 0. This is correct since 3912 * we want to do I/O on a page boundary. bp_mapin() will use this addr 3913 * to calculate an offset, and then set b_addr to the kernel virtual 3914 * address it allocated for us. 3915 */ 3916 ASSERT(bp->b_un.b_addr == 0); 3917 3918 bp->b_edev = 0; 3919 bp->b_dev = 0; 3920 bp->b_lblkno = lbtodb(io_off); 3921 bp->b_file = vp; 3922 bp->b_offset = (offset_t)blkoff; 3923 bp_mapin(bp); 3924 3925 /* 3926 * If doing a write beyond what we believe is EOF, don't bother trying 3927 * to read the pages from the server, we'll just zero the pages here. 3928 * We don't check that the rw flag is S_WRITE here because some 3929 * implementations may attempt a read access to the buffer before 3930 * copying data. 3931 */ 3932 mutex_enter(&rp->r_statelock); 3933 if (io_off >= rp->r_size && seg == segkmap) { 3934 mutex_exit(&rp->r_statelock); 3935 bzero(bp->b_un.b_addr, io_len); 3936 error = 0; 3937 } else { 3938 mutex_exit(&rp->r_statelock); 3939 error = nfs_bio(bp, cr); 3940 if (error == NFS_EOF) 3941 error = 0; 3942 } 3943 3944 /* 3945 * Unmap the buffer before freeing it. 3946 */ 3947 bp_mapout(bp); 3948 pageio_done(bp); 3949 3950 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 3951 3952 /* 3953 * In case of error set readahead offset 3954 * to the lowest offset. 3955 * pvn_read_done() calls VN_DISPOSE to destroy the pages 3956 */ 3957 if (error && rp->r_nextr > io_off) { 3958 mutex_enter(&rp->r_statelock); 3959 if (rp->r_nextr > io_off) 3960 rp->r_nextr = io_off; 3961 mutex_exit(&rp->r_statelock); 3962 } 3963 } 3964 3965 /* 3966 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 3967 * If len == 0, do from off to EOF. 3968 * 3969 * The normal cases should be len == 0 && off == 0 (entire vp list), 3970 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 3971 * (from pageout). 3972 */ 3973 static int 3974 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 3975 { 3976 int error; 3977 rnode_t *rp; 3978 3979 ASSERT(cr != NULL); 3980 3981 /* 3982 * XXX - Why should this check be made here? 3983 */ 3984 if (vp->v_flag & VNOMAP) 3985 return (ENOSYS); 3986 3987 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 3988 return (0); 3989 3990 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 3991 return (EIO); 3992 ASSERT(off <= MAXOFF32_T); 3993 3994 rp = VTOR(vp); 3995 mutex_enter(&rp->r_statelock); 3996 rp->r_count++; 3997 mutex_exit(&rp->r_statelock); 3998 error = nfs_putpages(vp, off, len, flags, cr); 3999 mutex_enter(&rp->r_statelock); 4000 rp->r_count--; 4001 cv_broadcast(&rp->r_cv); 4002 mutex_exit(&rp->r_statelock); 4003 4004 return (error); 4005 } 4006 4007 /* 4008 * Write out a single page, possibly klustering adjacent dirty pages. 
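* The kluster is bounded to one block (or to one page when the page
* size is larger than the block size), and io_len is trimmed below so
* that the request never extends past the block containing the
* original page.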
4009 */ 4010 int 4011 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 4012 int flags, cred_t *cr) 4013 { 4014 u_offset_t io_off; 4015 u_offset_t lbn_off; 4016 u_offset_t lbn; 4017 size_t io_len; 4018 uint_t bsize; 4019 int error; 4020 rnode_t *rp; 4021 4022 ASSERT(!vn_is_readonly(vp)); 4023 ASSERT(pp != NULL); 4024 ASSERT(cr != NULL); 4025 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 4026 4027 rp = VTOR(vp); 4028 ASSERT(rp->r_count > 0); 4029 4030 ASSERT(pp->p_offset <= MAXOFF32_T); 4031 4032 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4033 lbn = pp->p_offset / bsize; 4034 lbn_off = lbn * bsize; 4035 4036 /* 4037 * Find a kluster that fits in one block, or in 4038 * one page if pages are bigger than blocks. If 4039 * there is less file space allocated than a whole 4040 * page, we'll shorten the i/o request below. 4041 */ 4042 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4043 roundup(bsize, PAGESIZE), flags); 4044 4045 /* 4046 * pvn_write_kluster shouldn't have returned a page with offset 4047 * behind the original page we were given. Verify that. 4048 */ 4049 ASSERT((pp->p_offset / bsize) >= lbn); 4050 4051 /* 4052 * Now pp will have the list of kept dirty pages marked for 4053 * write back. It will also handle invalidation and freeing 4054 * of pages that are not dirty. Check for page length rounding 4055 * problems. 4056 */ 4057 if (io_off + io_len > lbn_off + bsize) { 4058 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4059 io_len = lbn_off + bsize - io_off; 4060 } 4061 /* 4062 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4063 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4064 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4065 * progress and the r_size has not been made consistent with the 4066 * new size of the file. When the uiomove() completes the r_size is 4067 * updated and the RMODINPROGRESS flag is cleared. 4068 * 4069 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4070 * consistent value of r_size. Without this handshaking, it is 4071 * possible that nfs(3)_bio() picks up the old value of r_size 4072 * before the uiomove() in writerp() completes. This will result 4073 * in the write through nfs(3)_bio() being dropped. 4074 * 4075 * More precisely, there is a window between the time the uiomove() 4076 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4077 * operation intervenes in this window, the page will be picked up, 4078 * because it is dirty (it will be unlocked, unless it was 4079 * pagecreate'd). When the page is picked up as dirty, the dirty 4080 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4081 * checked. This will still be the old size. Therefore the page will 4082 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4083 * the page will be found to be clean and the write will be dropped. 4084 */ 4085 if (rp->r_flags & RMODINPROGRESS) { 4086 mutex_enter(&rp->r_statelock); 4087 if ((rp->r_flags & RMODINPROGRESS) && 4088 rp->r_modaddr + MAXBSIZE > io_off && 4089 rp->r_modaddr < io_off + io_len) { 4090 page_t *plist; 4091 /* 4092 * A write is in progress for this region of the file. 4093 * If we did not detect RMODINPROGRESS here then this 4094 * path through nfs_putapage() would eventually go to 4095 * nfs(3)_bio() and may not write out all of the data 4096 * in the pages. We end up losing data. 
			 * So we decide to set the modified bit on each page
			 * in the page list and mark the rnode with RDIRTY.
			 * This write will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= RDIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs_sync_putapage);
	} else
		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}

static int
nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	flags |= B_WRITE;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & ROUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & ROUTOFSPACE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
	}

	return (error);
}

static int
nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
{
	struct segvn_crargs vn_a;
	int error;
	rnode_t *rp;
	struct vattr va;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off > MAXOFF32_T)
		return (EFBIG);

	if (off < 0 || off + len < 0)
		return (ENXIO);

	if (vp->v_type != VREG)
		return (ENODEV);

	/*
	 * If there is cached data and if close-to-open consistency
	 * checking is not turned off and if the file system is not
	 * mounted readonly, then force an over the wire getattr.
	 * Otherwise, just invoke nfsgetattr to get a copy of the
	 * attributes.
	 * The attribute cache will be used unless it has timed out;
	 * if it has, an over the wire getattr will be issued.
	 */
	va.va_mask = AT_ALL;
	if (vn_has_cached_data(vp) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
		error = nfs_getattr_otw(vp, &va, cr);
	else
		error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Check to see if the vnode is currently marked as not cachable.
	 * This means portions of the file are locked (through VOP_FRLOCK).
	 * In this case the map request must be refused. We use
	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
	 */
	rp = VTOR(vp);
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	if (vp->v_flag & VNOCACHE) {
		error = EAGAIN;
		goto done;
	}

	/*
	 * Don't allow concurrent locks and mapping if mandatory locking is
	 * enabled.
	 */
	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
	    MANDLOCK(vp, va.va_mode)) {
		error = EAGAIN;
		goto done;
	}

	as_rangelock(as);
	if (!(flags & MAP_FIXED)) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			error = ENOMEM;
			goto done;
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = (flags & MAP_TYPE);
	vn_a.prot = (uchar_t)prot;
	vn_a.maxprot = (uchar_t)maxprot;
	vn_a.flags = (flags & ~MAP_TYPE);
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	return (error);
}

/* ARGSUSED */
static int
nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
{
	rnode_t *rp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * Need to hold the rwlock while incrementing the mapcnt so that
	 * mmap'ing is serialized with writes and the caching can be
	 * handled correctly.
	 */
	rp = VTOR(vp);
	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
		return (EINTR);
	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
	nfs_rw_exit(&rp->r_rwlock);

	return (0);
}

static int
nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr)
{
	netobj lm_fh;
	int rc;
	u_offset_t start, end;
	rnode_t *rp;
	int error = 0, intr = INTR(vp);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * Verify l_type.
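	 * For F_SETLK and F_SETLKW the lock type must be consistent with
	 * how the file was opened: a read lock requires FREAD and a write
	 * lock requires FWRITE (F_GETLK only queries, so it is exempt).
	 * Unlocks are made non-interruptible, presumably so that giving up
	 * part way through cannot leave an orphaned lock on the server.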
	 */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		if (offset > MAXOFF32_T)
			return (EFBIG);
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock. However, we can't call
			 * lm_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * lm_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!lm_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
	}

	rp = VTOR(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!lm_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish. For new
	 * locks, this is so that the process gets the latest bits from the
	 * server. For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked. If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set. But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	lm_fh.n_len = sizeof (fhandle_t);
	lm_fh.n_bytes = (char *)VTOFH(vp);

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);

	if (rc == 0)
		nfs_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	return (rc);
}

/*
 * Free storage space associated with the specified vnode. The portion
 * to be freed is specified by bfp->l_start and bfp->l_len (already
 * normalized to a "whence" of 0).
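 * (Requests typically arrive here via fcntl(F_FREESP), which is also how
 * ftruncate() is implemented on Solaris.)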
 *
 * This is an experimental facility whose continued existence is not
 * guaranteed. Currently, we only support the special case
 * of l_len == 0, meaning free to end of file.
 */
/* ARGSUSED */
static int
nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, cred_t *cr, caller_context_t *ct)
{
	int error;

	ASSERT(vp->v_type == VREG);
	if (cmd != F_FREESP)
		return (EINVAL);

	if (offset > MAXOFF32_T)
		return (EFBIG);

	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
	    (bfp->l_len > MAXOFF32_T))
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = convoff(vp, bfp, 0, offset);
	if (!error) {
		ASSERT(bfp->l_start >= 0);
		if (bfp->l_len == 0) {
			struct vattr va;

			/*
			 * ftruncate should not change the ctime and
			 * mtime if we truncate the file to its
			 * previous size.
			 */
			va.va_mask = AT_SIZE;
			error = nfsgetattr(vp, &va, cr);
			if (error || va.va_size == bfp->l_start)
				return (error);
			va.va_mask = AT_SIZE;
			va.va_size = bfp->l_start;
			error = nfssetattr(vp, &va, 0, cr);
		} else
			error = EINVAL;
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_realvp(vnode_t *vp, vnode_t **vpp)
{

	return (EINVAL);
}

/*
 * Set up and add an address space callback to do the work of the delmap call.
 * The callback will be (and must be) deleted in the actual callback function.
 *
 * This is done in order to take care of the problem that we have with holding
 * the address space's a_lock for a long period of time (e.g. if the NFS server
 * is down). Callbacks will be executed in the address space code while the
 * a_lock is not held. Holding the address space's a_lock causes things such
 * as ps and fork to hang because they are trying to acquire this lock as well.
 */
/* ARGSUSED */
static int
nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
{
	int caller_found;
	int error;
	rnode_t *rp;
	nfs_delmap_args_t *dmapp;
	nfs_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire. Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below. The callback will be executed by the address space code
	 * after dropping the a_lock. When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
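	 *
	 * Roughly, the sequence is (a simplified sketch, not verbatim from
	 * the address space code):
	 *   1. segvn_unmap() -> nfs_delmap(): record this caller, add the
	 *	callback, return EAGAIN.
	 *   2. The address space code drops a_lock and runs
	 *	nfs_delmap_callback(), which does the over-the-wire work.
	 *   3. segvn_unmap() -> nfs_delmap() again: find the recorded
	 *	caller and return the error saved by the callback.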
	 *
	 * This action of calling back into the segment driver causes
	 * nfs_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations. To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	error = as_add_callback(as, nfs_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	return (error ? error : EAGAIN);
}

/*
 * Remove some pages from an mmap'd vnode. Just update the
 * count of pages. If doing close-to-open, then flush all
 * of the pages associated with this file. Otherwise, start
 * an asynchronous page flush to write out any dirty pages.
 * This will also associate a credential with the rnode which
 * can be used to write the pages.
 */
/* ARGSUSED */
static void
nfs_delmap_callback(struct as *as, void *arg, uint_t event)
{
	int error;
	rnode_t *rp;
	mntinfo_t *mi;
	nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg;

	rp = VTOR(dmapp->vp);
	mi = VTOMI(dmapp->vp);

	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush if there are pages, the file system
	 * was not mounted readonly, the segment was mapped shared, and
	 * the pages themselves were writeable.
	 */
	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= RDIRTY;
		mutex_exit(&rp->r_statelock);
		/*
		 * If this is a cross-zone access a sync putpage won't work, so
		 * the best we can do is try an async putpage. That seems
		 * better than something more draconian such as discarding the
		 * dirty pages.
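		 * The async path is also taken when close-to-open consistency
		 * is disabled (MI_NOCTO); without close-to-open semantics
		 * there is no requirement that the data reach the server
		 * before the unmap completes.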
		 */
		if ((mi->mi_flags & MI_NOCTO) ||
		    nfs_zone() != mi->mi_zone)
			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
			    B_ASYNC, dmapp->cr);
		else
			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
			    0, dmapp->cr);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		error = 0;

	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr);

	dmapp->caller->error = error;
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
}

/* ARGSUSED */
static int
nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
{
	int error = 0;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * This looks a little weird because it's written in a general
	 * manner but we make little use of cases. If cntl() ever gets
	 * widely used, the outer switch will make more sense.
	 */

	switch (cmd) {

	/*
	 * Large file spec - answer the new query with a hardcoded
	 * constant based on the protocol.
	 */
	case _PC_FILESIZEBITS:
		*valp = 32;
		return (0);

	case _PC_LINK_MAX:
	case _PC_NAME_MAX:
	case _PC_PATH_MAX:
	case _PC_SYMLINK_MAX:
	case _PC_CHOWN_RESTRICTED:
	case _PC_NO_TRUNC: {
		mntinfo_t *mi;
		struct pathcnf *pc;

		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
			return (EINVAL);
		error = _PC_ISSET(cmd, pc->pc_mask);	/* error or bool */
		switch (cmd) {
		case _PC_LINK_MAX:
			*valp = pc->pc_link_max;
			break;
		case _PC_NAME_MAX:
			*valp = pc->pc_name_max;
			break;
		case _PC_PATH_MAX:
		case _PC_SYMLINK_MAX:
			*valp = pc->pc_path_max;
			break;
		case _PC_CHOWN_RESTRICTED:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		case _PC_NO_TRUNC:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		}
		return (error ? EINVAL : 0);
	}

	case _PC_XATTR_EXISTS:
		*valp = 0;
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			vnode_t *avp;
			rnode_t *rp;
			mntinfo_t *mi = VTOMI(vp);

			if (!(mi->mi_flags & MI_EXTATTR))
				return (0);

			rp = VTOR(vp);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
			    INTR(vp)))
				return (EINTR);

			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
			if (error || avp == NULL)
				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);

			nfs_rw_exit(&rp->r_rwlock);

			if (error == 0 && avp != NULL) {
				VN_RELE(avp);
				*valp = 1;
			}
		}
		return (error ? EINVAL : 0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACLENT_ENABLED;
		return (0);

	default:
		return (EINVAL);
	}
}

/*
 * Called by async thread to do synchronous pageio. Do the i/o, wait
 * for it to complete, and clean up the page list when done.
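 * (This is the handler that nfs_pageio() below passes to
 * nfs_async_pageio() for B_ASYNC requests; an async thread then invokes
 * it to perform the I/O and complete the pages.)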
4793 */ 4794 static int 4795 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4796 int flags, cred_t *cr) 4797 { 4798 int error; 4799 4800 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4801 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4802 if (flags & B_READ) 4803 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 4804 else 4805 pvn_write_done(pp, (error ? B_ERROR : 0) | flags); 4806 return (error); 4807 } 4808 4809 static int 4810 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4811 int flags, cred_t *cr) 4812 { 4813 int error; 4814 rnode_t *rp; 4815 4816 if (pp == NULL) 4817 return (EINVAL); 4818 4819 if (io_off > MAXOFF32_T) 4820 return (EFBIG); 4821 if (nfs_zone() != VTOMI(vp)->mi_zone) 4822 return (EIO); 4823 rp = VTOR(vp); 4824 mutex_enter(&rp->r_statelock); 4825 rp->r_count++; 4826 mutex_exit(&rp->r_statelock); 4827 4828 if (flags & B_ASYNC) { 4829 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr, 4830 nfs_sync_pageio); 4831 } else 4832 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4833 mutex_enter(&rp->r_statelock); 4834 rp->r_count--; 4835 cv_broadcast(&rp->r_cv); 4836 mutex_exit(&rp->r_statelock); 4837 return (error); 4838 } 4839 4840 static int 4841 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 4842 { 4843 int error; 4844 mntinfo_t *mi; 4845 4846 mi = VTOMI(vp); 4847 4848 if (nfs_zone() != mi->mi_zone) 4849 return (EIO); 4850 if (mi->mi_flags & MI_ACL) { 4851 error = acl_setacl2(vp, vsecattr, flag, cr); 4852 if (mi->mi_flags & MI_ACL) 4853 return (error); 4854 } 4855 4856 return (ENOSYS); 4857 } 4858 4859 static int 4860 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 4861 { 4862 int error; 4863 mntinfo_t *mi; 4864 4865 mi = VTOMI(vp); 4866 4867 if (nfs_zone() != mi->mi_zone) 4868 return (EIO); 4869 if (mi->mi_flags & MI_ACL) { 4870 error = acl_getacl2(vp, vsecattr, flag, cr); 4871 if (mi->mi_flags & MI_ACL) 4872 return (error); 4873 } 4874 4875 return (fs_fab_acl(vp, vsecattr, flag, cr)); 4876 } 4877 4878 static int 4879 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr) 4880 { 4881 int error; 4882 struct shrlock nshr; 4883 struct nfs_owner nfs_owner; 4884 netobj lm_fh; 4885 4886 if (nfs_zone() != VTOMI(vp)->mi_zone) 4887 return (EIO); 4888 4889 /* 4890 * check for valid cmd parameter 4891 */ 4892 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 4893 return (EINVAL); 4894 4895 /* 4896 * Check access permissions 4897 */ 4898 if (cmd == F_SHARE && 4899 (((shr->s_access & F_RDACC) && !(flag & FREAD)) || 4900 ((shr->s_access & F_WRACC) && !(flag & FWRITE)))) 4901 return (EBADF); 4902 4903 /* 4904 * If the filesystem is mounted using local locking, pass the 4905 * request off to the local share code. 4906 */ 4907 if (VTOMI(vp)->mi_flags & MI_LLOCK) 4908 return (fs_shrlock(vp, cmd, shr, flag, cr)); 4909 4910 switch (cmd) { 4911 case F_SHARE: 4912 case F_UNSHARE: 4913 lm_fh.n_len = sizeof (fhandle_t); 4914 lm_fh.n_bytes = (char *)VTOFH(vp); 4915 4916 /* 4917 * If passed an owner that is too large to fit in an 4918 * nfs_owner it is likely a recursive call from the 4919 * lock manager client and pass it straight through. If 4920 * it is not a nfs_owner then simply return an error. 
4921 */ 4922 if (shr->s_own_len > sizeof (nfs_owner.lowner)) { 4923 if (((struct nfs_owner *)shr->s_owner)->magic != 4924 NFS_OWNER_MAGIC) 4925 return (EINVAL); 4926 4927 if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) { 4928 error = set_errno(error); 4929 } 4930 return (error); 4931 } 4932 /* 4933 * Remote share reservations owner is a combination of 4934 * a magic number, hostname, and the local owner 4935 */ 4936 bzero(&nfs_owner, sizeof (nfs_owner)); 4937 nfs_owner.magic = NFS_OWNER_MAGIC; 4938 (void) strncpy(nfs_owner.hname, uts_nodename(), 4939 sizeof (nfs_owner.hname)); 4940 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len); 4941 nshr.s_access = shr->s_access; 4942 nshr.s_deny = shr->s_deny; 4943 nshr.s_sysid = 0; 4944 nshr.s_pid = ttoproc(curthread)->p_pid; 4945 nshr.s_own_len = sizeof (nfs_owner); 4946 nshr.s_owner = (caddr_t)&nfs_owner; 4947 4948 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) { 4949 error = set_errno(error); 4950 } 4951 4952 break; 4953 4954 case F_HASREMOTELOCKS: 4955 /* 4956 * NFS client can't store remote locks itself 4957 */ 4958 shr->s_access = 0; 4959 error = 0; 4960 break; 4961 4962 default: 4963 error = EINVAL; 4964 break; 4965 } 4966 4967 return (error); 4968 } 4969