1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 26 * All rights reserved. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/types.h> 31 #include <sys/systm.h> 32 #include <sys/cred.h> 33 #include <sys/time.h> 34 #include <sys/vnode.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/file.h> 38 #include <sys/filio.h> 39 #include <sys/uio.h> 40 #include <sys/buf.h> 41 #include <sys/mman.h> 42 #include <sys/pathname.h> 43 #include <sys/dirent.h> 44 #include <sys/debug.h> 45 #include <sys/vmsystm.h> 46 #include <sys/fcntl.h> 47 #include <sys/flock.h> 48 #include <sys/swap.h> 49 #include <sys/errno.h> 50 #include <sys/strsubr.h> 51 #include <sys/sysmacros.h> 52 #include <sys/kmem.h> 53 #include <sys/cmn_err.h> 54 #include <sys/pathconf.h> 55 #include <sys/utsname.h> 56 #include <sys/dnlc.h> 57 #include <sys/acl.h> 58 #include <sys/atomic.h> 59 #include <sys/policy.h> 60 #include <sys/sdt.h> 61 62 #include <rpc/types.h> 63 #include <rpc/auth.h> 64 #include <rpc/clnt.h> 65 66 #include <nfs/nfs.h> 67 #include <nfs/nfs_clnt.h> 68 #include <nfs/rnode.h> 69 #include <nfs/nfs_acl.h> 70 #include <nfs/lm.h> 71 72 #include <vm/hat.h> 73 #include <vm/as.h> 74 #include <vm/page.h> 75 #include <vm/pvn.h> 76 #include <vm/seg.h> 77 #include <vm/seg_map.h> 78 #include <vm/seg_kpm.h> 79 #include <vm/seg_vn.h> 80 81 #include <fs/fs_subr.h> 82 83 #include <sys/ddi.h> 84 85 static int nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 86 cred_t *); 87 static int nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *); 88 static int nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *); 89 static int nfssetattr(vnode_t *, struct vattr *, int, cred_t *); 90 static int nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *); 91 static int nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int); 92 static int nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *, 93 caller_context_t *); 94 static int nfsreaddir(vnode_t *, rddir_cache *, cred_t *); 95 static int nfs_bio(struct buf *, cred_t *); 96 static int nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 97 page_t *[], size_t, struct seg *, caddr_t, 98 enum seg_rw, cred_t *); 99 static void nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 100 cred_t *); 101 static int nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 102 int, cred_t *); 103 static int nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 104 int, cred_t *); 105 static void nfs_delmap_callback(struct as *, void *, uint_t); 106 107 /* 108 * Error flags used to pass information about certain special errors 109 * which need to be handled specially. 110 */ 111 #define NFS_EOF -98 112 113 /* 114 * These are the vnode ops routines which implement the vnode interface to 115 * the networked file system. These routines just take their parameters, 116 * make them look networkish by putting the right info into interface structs, 117 * and then calling the appropriate remote routine(s) to do the work. 118 * 119 * Note on directory name lookup cacheing: If we detect a stale fhandle, 120 * we purge the directory cache relative to that vnode. This way, the 121 * user won't get burned by the cache repeatedly. See <nfs/rnode.h> for 122 * more details on rnode locking. 123 */ 124 125 static int nfs_open(vnode_t **, int, cred_t *, caller_context_t *); 126 static int nfs_close(vnode_t *, int, int, offset_t, cred_t *, 127 caller_context_t *); 128 static int nfs_read(vnode_t *, struct uio *, int, cred_t *, 129 caller_context_t *); 130 static int nfs_write(vnode_t *, struct uio *, int, cred_t *, 131 caller_context_t *); 132 static int nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 133 caller_context_t *); 134 static int nfs_getattr(vnode_t *, struct vattr *, int, cred_t *, 135 caller_context_t *); 136 static int nfs_setattr(vnode_t *, struct vattr *, int, cred_t *, 137 caller_context_t *); 138 static int nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *); 139 static int nfs_accessx(void *, int, cred_t *); 140 static int nfs_readlink(vnode_t *, struct uio *, cred_t *, 141 caller_context_t *); 142 static int nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *); 143 static void nfs_inactive(vnode_t *, cred_t *, caller_context_t *); 144 static int nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *, 145 int, vnode_t *, cred_t *, caller_context_t *, 146 int *, pathname_t *); 147 static int nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl, 148 int, vnode_t **, cred_t *, int, caller_context_t *, 149 vsecattr_t *); 150 static int nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *, 151 int); 152 static int nfs_link(vnode_t *, vnode_t *, char *, cred_t *, 153 caller_context_t *, int); 154 static int nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 155 caller_context_t *, int); 156 static int nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 157 cred_t *, caller_context_t *, int, vsecattr_t *); 158 static int nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 159 caller_context_t *, int); 160 static int nfs_symlink(vnode_t *, char *, struct vattr *, char *, 161 cred_t *, caller_context_t *, int); 162 static int nfs_readdir(vnode_t *, struct uio *, cred_t *, int *, 163 caller_context_t *, int); 164 static int nfs_fid(vnode_t *, fid_t *, caller_context_t *); 165 static int nfs_rwlock(vnode_t *, int, caller_context_t *); 166 static void nfs_rwunlock(vnode_t *, int, caller_context_t *); 167 static int nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 168 static int nfs_getpage(vnode_t *, offset_t, size_t, uint_t *, 169 page_t *[], size_t, struct seg *, caddr_t, 170 enum seg_rw, cred_t *, caller_context_t *); 171 static int nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 172 caller_context_t *); 173 static int nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 174 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 175 static int nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 176 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 177 static int nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 178 struct flk_callback *, cred_t *, caller_context_t *); 179 static int nfs_space(vnode_t *, int, struct flock64 *, int, offset_t, 180 cred_t *, caller_context_t *); 181 static int nfs_realvp(vnode_t *, vnode_t **, caller_context_t *); 182 static int nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 183 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 184 static int nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *, 185 caller_context_t *); 186 static int nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 187 cred_t *, caller_context_t *); 188 static int nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 189 caller_context_t *); 190 static int nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 191 caller_context_t *); 192 static int nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 193 caller_context_t *); 194 195 struct vnodeops *nfs_vnodeops; 196 197 const fs_operation_def_t nfs_vnodeops_template[] = { 198 VOPNAME_OPEN, { .vop_open = nfs_open }, 199 VOPNAME_CLOSE, { .vop_close = nfs_close }, 200 VOPNAME_READ, { .vop_read = nfs_read }, 201 VOPNAME_WRITE, { .vop_write = nfs_write }, 202 VOPNAME_IOCTL, { .vop_ioctl = nfs_ioctl }, 203 VOPNAME_GETATTR, { .vop_getattr = nfs_getattr }, 204 VOPNAME_SETATTR, { .vop_setattr = nfs_setattr }, 205 VOPNAME_ACCESS, { .vop_access = nfs_access }, 206 VOPNAME_LOOKUP, { .vop_lookup = nfs_lookup }, 207 VOPNAME_CREATE, { .vop_create = nfs_create }, 208 VOPNAME_REMOVE, { .vop_remove = nfs_remove }, 209 VOPNAME_LINK, { .vop_link = nfs_link }, 210 VOPNAME_RENAME, { .vop_rename = nfs_rename }, 211 VOPNAME_MKDIR, { .vop_mkdir = nfs_mkdir }, 212 VOPNAME_RMDIR, { .vop_rmdir = nfs_rmdir }, 213 VOPNAME_READDIR, { .vop_readdir = nfs_readdir }, 214 VOPNAME_SYMLINK, { .vop_symlink = nfs_symlink }, 215 VOPNAME_READLINK, { .vop_readlink = nfs_readlink }, 216 VOPNAME_FSYNC, { .vop_fsync = nfs_fsync }, 217 VOPNAME_INACTIVE, { .vop_inactive = nfs_inactive }, 218 VOPNAME_FID, { .vop_fid = nfs_fid }, 219 VOPNAME_RWLOCK, { .vop_rwlock = nfs_rwlock }, 220 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs_rwunlock }, 221 VOPNAME_SEEK, { .vop_seek = nfs_seek }, 222 VOPNAME_FRLOCK, { .vop_frlock = nfs_frlock }, 223 VOPNAME_SPACE, { .vop_space = nfs_space }, 224 VOPNAME_REALVP, { .vop_realvp = nfs_realvp }, 225 VOPNAME_GETPAGE, { .vop_getpage = nfs_getpage }, 226 VOPNAME_PUTPAGE, { .vop_putpage = nfs_putpage }, 227 VOPNAME_MAP, { .vop_map = nfs_map }, 228 VOPNAME_ADDMAP, { .vop_addmap = nfs_addmap }, 229 VOPNAME_DELMAP, { .vop_delmap = nfs_delmap }, 230 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 231 VOPNAME_PATHCONF, { .vop_pathconf = nfs_pathconf }, 232 VOPNAME_PAGEIO, { .vop_pageio = nfs_pageio }, 233 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs_setsecattr }, 234 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs_getsecattr }, 235 VOPNAME_SHRLOCK, { .vop_shrlock = nfs_shrlock }, 236 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 237 NULL, NULL 238 }; 239 240 /* 241 * XXX: This is referenced in modstubs.s 242 */ 243 struct vnodeops * 244 nfs_getvnodeops(void) 245 { 246 return (nfs_vnodeops); 247 } 248 249 /* ARGSUSED */ 250 static int 251 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 252 { 253 int error; 254 struct vattr va; 255 rnode_t *rp; 256 vnode_t *vp; 257 258 vp = *vpp; 259 rp = VTOR(vp); 260 if (nfs_zone() != VTOMI(vp)->mi_zone) 261 return (EIO); 262 mutex_enter(&rp->r_statelock); 263 if (rp->r_cred == NULL) { 264 crhold(cr); 265 rp->r_cred = cr; 266 } 267 mutex_exit(&rp->r_statelock); 268 269 /* 270 * If there is no cached data or if close-to-open 271 * consistency checking is turned off, we can avoid 272 * the over the wire getattr. Otherwise, if the 273 * file system is mounted readonly, then just verify 274 * the caches are up to date using the normal mechanism. 275 * Else, if the file is not mmap'd, then just mark 276 * the attributes as timed out. They will be refreshed 277 * and the caches validated prior to being used. 278 * Else, the file system is mounted writeable so 279 * force an over the wire GETATTR in order to ensure 280 * that all cached data is valid. 281 */ 282 if (vp->v_count > 1 || 283 ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) && 284 !(VTOMI(vp)->mi_flags & MI_NOCTO))) { 285 if (vn_is_readonly(vp)) 286 error = nfs_validate_caches(vp, cr); 287 else if (rp->r_mapcnt == 0 && vp->v_count == 1) { 288 PURGE_ATTRCACHE(vp); 289 error = 0; 290 } else { 291 va.va_mask = AT_ALL; 292 error = nfs_getattr_otw(vp, &va, cr); 293 } 294 } else 295 error = 0; 296 297 return (error); 298 } 299 300 /* ARGSUSED */ 301 static int 302 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 303 caller_context_t *ct) 304 { 305 rnode_t *rp; 306 int error; 307 struct vattr va; 308 309 /* 310 * zone_enter(2) prevents processes from changing zones with NFS files 311 * open; if we happen to get here from the wrong zone we can't do 312 * anything over the wire. 313 */ 314 if (VTOMI(vp)->mi_zone != nfs_zone()) { 315 /* 316 * We could attempt to clean up locks, except we're sure 317 * that the current process didn't acquire any locks on 318 * the file: any attempt to lock a file belong to another zone 319 * will fail, and one can't lock an NFS file and then change 320 * zones, as that fails too. 321 * 322 * Returning an error here is the sane thing to do. A 323 * subsequent call to VN_RELE() which translates to a 324 * nfs_inactive() will clean up state: if the zone of the 325 * vnode's origin is still alive and kicking, an async worker 326 * thread will handle the request (from the correct zone), and 327 * everything (minus the final nfs_getattr_otw() call) should 328 * be OK. If the zone is going away nfs_async_inactive() will 329 * throw away cached pages inline. 330 */ 331 return (EIO); 332 } 333 334 /* 335 * If we are using local locking for this filesystem, then 336 * release all of the SYSV style record locks. Otherwise, 337 * we are doing network locking and we need to release all 338 * of the network locks. All of the locks held by this 339 * process on this file are released no matter what the 340 * incoming reference count is. 341 */ 342 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 343 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 344 cleanshares(vp, ttoproc(curthread)->p_pid); 345 } else 346 nfs_lockrelease(vp, flag, offset, cr); 347 348 if (count > 1) 349 return (0); 350 351 /* 352 * If the file has been `unlinked', then purge the 353 * DNLC so that this vnode will get reycled quicker 354 * and the .nfs* file on the server will get removed. 355 */ 356 rp = VTOR(vp); 357 if (rp->r_unldvp != NULL) 358 dnlc_purge_vp(vp); 359 360 /* 361 * If the file was open for write and there are pages, 362 * then if the file system was mounted using the "no-close- 363 * to-open" semantics, then start an asynchronous flush 364 * of the all of the pages in the file. 365 * else the file system was not mounted using the "no-close- 366 * to-open" semantics, then do a synchronous flush and 367 * commit of all of the dirty and uncommitted pages. 368 * 369 * The asynchronous flush of the pages in the "nocto" path 370 * mostly just associates a cred pointer with the rnode so 371 * writes which happen later will have a better chance of 372 * working. It also starts the data being written to the 373 * server, but without unnecessarily delaying the application. 374 */ 375 if ((flag & FWRITE) && vn_has_cached_data(vp)) { 376 if ((VTOMI(vp)->mi_flags & MI_NOCTO)) { 377 error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, 378 cr, ct); 379 if (error == EAGAIN) 380 error = 0; 381 } else 382 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 383 if (!error) { 384 mutex_enter(&rp->r_statelock); 385 error = rp->r_error; 386 rp->r_error = 0; 387 mutex_exit(&rp->r_statelock); 388 } 389 } else { 390 mutex_enter(&rp->r_statelock); 391 error = rp->r_error; 392 rp->r_error = 0; 393 mutex_exit(&rp->r_statelock); 394 } 395 396 /* 397 * If RWRITEATTR is set, then issue an over the wire GETATTR to 398 * refresh the attribute cache with a set of attributes which 399 * weren't returned from a WRITE. This will enable the close- 400 * to-open processing to work. 401 */ 402 if (rp->r_flags & RWRITEATTR) 403 (void) nfs_getattr_otw(vp, &va, cr); 404 405 return (error); 406 } 407 408 /* ARGSUSED */ 409 static int 410 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 411 caller_context_t *ct) 412 { 413 rnode_t *rp; 414 u_offset_t off; 415 offset_t diff; 416 int on; 417 size_t n; 418 caddr_t base; 419 uint_t flags; 420 int error; 421 mntinfo_t *mi; 422 423 rp = VTOR(vp); 424 mi = VTOMI(vp); 425 426 if (nfs_zone() != mi->mi_zone) 427 return (EIO); 428 429 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 430 431 if (vp->v_type != VREG) 432 return (EISDIR); 433 434 if (uiop->uio_resid == 0) 435 return (0); 436 437 if (uiop->uio_loffset > MAXOFF32_T) 438 return (EFBIG); 439 440 if (uiop->uio_loffset < 0 || 441 uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T) 442 return (EINVAL); 443 444 /* 445 * Bypass VM if caching has been disabled (e.g., locking) or if 446 * using client-side direct I/O and the file is not mmap'd and 447 * there are no cached pages. 448 */ 449 if ((vp->v_flag & VNOCACHE) || 450 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && 451 rp->r_mapcnt == 0 && rp->r_inmap == 0 && 452 !vn_has_cached_data(vp))) { 453 size_t bufsize; 454 size_t resid = 0; 455 456 /* 457 * Let's try to do read in as large a chunk as we can 458 * (Filesystem (NFS client) bsize if possible/needed). 459 * For V3, this is 32K and for V2, this is 8K. 460 */ 461 bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread); 462 base = kmem_alloc(bufsize, KM_SLEEP); 463 do { 464 n = MIN(uiop->uio_resid, bufsize); 465 error = nfsread(vp, base, uiop->uio_offset, n, 466 &resid, cr); 467 if (!error) { 468 n -= resid; 469 error = uiomove(base, n, UIO_READ, uiop); 470 } 471 } while (!error && uiop->uio_resid > 0 && n > 0); 472 kmem_free(base, bufsize); 473 return (error); 474 } 475 476 error = 0; 477 478 do { 479 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 480 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 481 n = MIN(MAXBSIZE - on, uiop->uio_resid); 482 483 error = nfs_validate_caches(vp, cr); 484 if (error) 485 break; 486 487 mutex_enter(&rp->r_statelock); 488 while (rp->r_flags & RINCACHEPURGE) { 489 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 490 mutex_exit(&rp->r_statelock); 491 return (EINTR); 492 } 493 } 494 diff = rp->r_size - uiop->uio_loffset; 495 mutex_exit(&rp->r_statelock); 496 if (diff <= 0) 497 break; 498 if (diff < n) 499 n = (size_t)diff; 500 501 if (vpm_enable) { 502 /* 503 * Copy data. 504 */ 505 error = vpm_data_copy(vp, off + on, n, uiop, 506 1, NULL, 0, S_READ); 507 } else { 508 base = segmap_getmapflt(segkmap, vp, off + on, n, 509 1, S_READ); 510 error = uiomove(base + on, n, UIO_READ, uiop); 511 } 512 513 if (!error) { 514 /* 515 * If read a whole block or read to eof, 516 * won't need this buffer again soon. 517 */ 518 mutex_enter(&rp->r_statelock); 519 if (n + on == MAXBSIZE || 520 uiop->uio_loffset == rp->r_size) 521 flags = SM_DONTNEED; 522 else 523 flags = 0; 524 mutex_exit(&rp->r_statelock); 525 if (vpm_enable) { 526 error = vpm_sync_pages(vp, off, n, flags); 527 } else { 528 error = segmap_release(segkmap, base, flags); 529 } 530 } else { 531 if (vpm_enable) { 532 (void) vpm_sync_pages(vp, off, n, 0); 533 } else { 534 (void) segmap_release(segkmap, base, 0); 535 } 536 } 537 } while (!error && uiop->uio_resid > 0); 538 539 return (error); 540 } 541 542 /* ARGSUSED */ 543 static int 544 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 545 caller_context_t *ct) 546 { 547 rnode_t *rp; 548 u_offset_t off; 549 caddr_t base; 550 uint_t flags; 551 int remainder; 552 size_t n; 553 int on; 554 int error; 555 int resid; 556 offset_t offset; 557 rlim_t limit; 558 mntinfo_t *mi; 559 560 rp = VTOR(vp); 561 562 mi = VTOMI(vp); 563 if (nfs_zone() != mi->mi_zone) 564 return (EIO); 565 if (vp->v_type != VREG) 566 return (EISDIR); 567 568 if (uiop->uio_resid == 0) 569 return (0); 570 571 if (ioflag & FAPPEND) { 572 struct vattr va; 573 574 /* 575 * Must serialize if appending. 576 */ 577 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 578 nfs_rw_exit(&rp->r_rwlock); 579 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 580 INTR(vp))) 581 return (EINTR); 582 } 583 584 va.va_mask = AT_SIZE; 585 error = nfsgetattr(vp, &va, cr); 586 if (error) 587 return (error); 588 uiop->uio_loffset = va.va_size; 589 } 590 591 if (uiop->uio_loffset > MAXOFF32_T) 592 return (EFBIG); 593 594 offset = uiop->uio_loffset + uiop->uio_resid; 595 596 if (uiop->uio_loffset < 0 || offset > MAXOFF32_T) 597 return (EINVAL); 598 599 if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) { 600 limit = MAXOFF32_T; 601 } else { 602 limit = (rlim_t)uiop->uio_llimit; 603 } 604 605 /* 606 * Check to make sure that the process will not exceed 607 * its limit on file size. It is okay to write up to 608 * the limit, but not beyond. Thus, the write which 609 * reaches the limit will be short and the next write 610 * will return an error. 611 */ 612 remainder = 0; 613 if (offset > limit) { 614 remainder = offset - limit; 615 uiop->uio_resid = limit - uiop->uio_offset; 616 if (uiop->uio_resid <= 0) { 617 proc_t *p = ttoproc(curthread); 618 619 uiop->uio_resid += remainder; 620 mutex_enter(&p->p_lock); 621 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 622 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 623 mutex_exit(&p->p_lock); 624 return (EFBIG); 625 } 626 } 627 628 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) 629 return (EINTR); 630 631 /* 632 * Bypass VM if caching has been disabled (e.g., locking) or if 633 * using client-side direct I/O and the file is not mmap'd and 634 * there are no cached pages. 635 */ 636 if ((vp->v_flag & VNOCACHE) || 637 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && 638 rp->r_mapcnt == 0 && rp->r_inmap == 0 && 639 !vn_has_cached_data(vp))) { 640 size_t bufsize; 641 int count; 642 uint_t org_offset; 643 644 nfs_fwrite: 645 if (rp->r_flags & RSTALE) { 646 resid = uiop->uio_resid; 647 offset = uiop->uio_loffset; 648 error = rp->r_error; 649 /* 650 * A close may have cleared r_error, if so, 651 * propagate ESTALE error return properly 652 */ 653 if (error == 0) 654 error = ESTALE; 655 goto bottom; 656 } 657 bufsize = MIN(uiop->uio_resid, mi->mi_curwrite); 658 base = kmem_alloc(bufsize, KM_SLEEP); 659 do { 660 resid = uiop->uio_resid; 661 offset = uiop->uio_loffset; 662 count = MIN(uiop->uio_resid, bufsize); 663 org_offset = uiop->uio_offset; 664 error = uiomove(base, count, UIO_WRITE, uiop); 665 if (!error) { 666 error = nfswrite(vp, base, org_offset, 667 count, cr); 668 } 669 } while (!error && uiop->uio_resid > 0); 670 kmem_free(base, bufsize); 671 goto bottom; 672 } 673 674 do { 675 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 676 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 677 n = MIN(MAXBSIZE - on, uiop->uio_resid); 678 679 resid = uiop->uio_resid; 680 offset = uiop->uio_loffset; 681 682 if (rp->r_flags & RSTALE) { 683 error = rp->r_error; 684 /* 685 * A close may have cleared r_error, if so, 686 * propagate ESTALE error return properly 687 */ 688 if (error == 0) 689 error = ESTALE; 690 break; 691 } 692 693 /* 694 * Don't create dirty pages faster than they 695 * can be cleaned so that the system doesn't 696 * get imbalanced. If the async queue is 697 * maxed out, then wait for it to drain before 698 * creating more dirty pages. Also, wait for 699 * any threads doing pagewalks in the vop_getattr 700 * entry points so that they don't block for 701 * long periods. 702 */ 703 mutex_enter(&rp->r_statelock); 704 while ((mi->mi_max_threads != 0 && 705 rp->r_awcount > 2 * mi->mi_max_threads) || 706 rp->r_gcount > 0) 707 cv_wait(&rp->r_cv, &rp->r_statelock); 708 mutex_exit(&rp->r_statelock); 709 710 /* 711 * Touch the page and fault it in if it is not in core 712 * before segmap_getmapflt or vpm_data_copy can lock it. 713 * This is to avoid the deadlock if the buffer is mapped 714 * to the same file through mmap which we want to write. 715 */ 716 uio_prefaultpages((long)n, uiop); 717 718 if (vpm_enable) { 719 /* 720 * It will use kpm mappings, so no need to 721 * pass an address. 722 */ 723 error = writerp(rp, NULL, n, uiop, 0); 724 } else { 725 if (segmap_kpm) { 726 int pon = uiop->uio_loffset & PAGEOFFSET; 727 size_t pn = MIN(PAGESIZE - pon, 728 uiop->uio_resid); 729 int pagecreate; 730 731 mutex_enter(&rp->r_statelock); 732 pagecreate = (pon == 0) && (pn == PAGESIZE || 733 uiop->uio_loffset + pn >= rp->r_size); 734 mutex_exit(&rp->r_statelock); 735 736 base = segmap_getmapflt(segkmap, vp, off + on, 737 pn, !pagecreate, S_WRITE); 738 739 error = writerp(rp, base + pon, n, uiop, 740 pagecreate); 741 742 } else { 743 base = segmap_getmapflt(segkmap, vp, off + on, 744 n, 0, S_READ); 745 error = writerp(rp, base + on, n, uiop, 0); 746 } 747 } 748 749 if (!error) { 750 if (mi->mi_flags & MI_NOAC) 751 flags = SM_WRITE; 752 else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 753 /* 754 * Have written a whole block. 755 * Start an asynchronous write 756 * and mark the buffer to 757 * indicate that it won't be 758 * needed again soon. 759 */ 760 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 761 } else 762 flags = 0; 763 if ((ioflag & (FSYNC|FDSYNC)) || 764 (rp->r_flags & ROUTOFSPACE)) { 765 flags &= ~SM_ASYNC; 766 flags |= SM_WRITE; 767 } 768 if (vpm_enable) { 769 error = vpm_sync_pages(vp, off, n, flags); 770 } else { 771 error = segmap_release(segkmap, base, flags); 772 } 773 } else { 774 if (vpm_enable) { 775 (void) vpm_sync_pages(vp, off, n, 0); 776 } else { 777 (void) segmap_release(segkmap, base, 0); 778 } 779 /* 780 * In the event that we got an access error while 781 * faulting in a page for a write-only file just 782 * force a write. 783 */ 784 if (error == EACCES) 785 goto nfs_fwrite; 786 } 787 } while (!error && uiop->uio_resid > 0); 788 789 bottom: 790 if (error) { 791 uiop->uio_resid = resid + remainder; 792 uiop->uio_loffset = offset; 793 } else 794 uiop->uio_resid += remainder; 795 796 nfs_rw_exit(&rp->r_lkserlock); 797 798 return (error); 799 } 800 801 /* 802 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 803 */ 804 static int 805 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 806 int flags, cred_t *cr) 807 { 808 struct buf *bp; 809 int error; 810 811 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 812 bp = pageio_setup(pp, len, vp, flags); 813 ASSERT(bp != NULL); 814 815 /* 816 * pageio_setup should have set b_addr to 0. This 817 * is correct since we want to do I/O on a page 818 * boundary. bp_mapin will use this addr to calculate 819 * an offset, and then set b_addr to the kernel virtual 820 * address it allocated for us. 821 */ 822 ASSERT(bp->b_un.b_addr == 0); 823 824 bp->b_edev = 0; 825 bp->b_dev = 0; 826 bp->b_lblkno = lbtodb(off); 827 bp->b_file = vp; 828 bp->b_offset = (offset_t)off; 829 bp_mapin(bp); 830 831 error = nfs_bio(bp, cr); 832 833 bp_mapout(bp); 834 pageio_done(bp); 835 836 return (error); 837 } 838 839 /* 840 * Write to file. Writes to remote server in largest size 841 * chunks that the server can handle. Write is synchronous. 842 */ 843 static int 844 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr) 845 { 846 rnode_t *rp; 847 mntinfo_t *mi; 848 struct nfswriteargs wa; 849 struct nfsattrstat ns; 850 int error; 851 int tsize; 852 int douprintf; 853 854 douprintf = 1; 855 856 rp = VTOR(vp); 857 mi = VTOMI(vp); 858 859 ASSERT(nfs_zone() == mi->mi_zone); 860 861 wa.wa_args = &wa.wa_args_buf; 862 wa.wa_fhandle = *VTOFH(vp); 863 864 do { 865 tsize = MIN(mi->mi_curwrite, count); 866 wa.wa_data = base; 867 wa.wa_begoff = offset; 868 wa.wa_totcount = tsize; 869 wa.wa_count = tsize; 870 wa.wa_offset = offset; 871 872 if (mi->mi_io_kstats) { 873 mutex_enter(&mi->mi_lock); 874 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 875 mutex_exit(&mi->mi_lock); 876 } 877 wa.wa_mblk = NULL; 878 do { 879 error = rfs2call(mi, RFS_WRITE, 880 xdr_writeargs, (caddr_t)&wa, 881 xdr_attrstat, (caddr_t)&ns, cr, 882 &douprintf, &ns.ns_status, 0, NULL); 883 } while (error == ENFS_TRYAGAIN); 884 if (mi->mi_io_kstats) { 885 mutex_enter(&mi->mi_lock); 886 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 887 mutex_exit(&mi->mi_lock); 888 } 889 890 if (!error) { 891 error = geterrno(ns.ns_status); 892 /* 893 * Can't check for stale fhandle and purge caches 894 * here because pages are held by nfs_getpage. 895 * Just mark the attribute cache as timed out 896 * and set RWRITEATTR to indicate that the file 897 * was modified with a WRITE operation. 898 */ 899 if (!error) { 900 count -= tsize; 901 base += tsize; 902 offset += tsize; 903 if (mi->mi_io_kstats) { 904 mutex_enter(&mi->mi_lock); 905 KSTAT_IO_PTR(mi->mi_io_kstats)-> 906 writes++; 907 KSTAT_IO_PTR(mi->mi_io_kstats)-> 908 nwritten += tsize; 909 mutex_exit(&mi->mi_lock); 910 } 911 lwp_stat_update(LWP_STAT_OUBLK, 1); 912 mutex_enter(&rp->r_statelock); 913 PURGE_ATTRCACHE_LOCKED(rp); 914 rp->r_flags |= RWRITEATTR; 915 mutex_exit(&rp->r_statelock); 916 } 917 } 918 } while (!error && count); 919 920 return (error); 921 } 922 923 /* 924 * Read from a file. Reads data in largest chunks our interface can handle. 925 */ 926 static int 927 nfsread(vnode_t *vp, caddr_t base, uint_t offset, 928 int count, size_t *residp, cred_t *cr) 929 { 930 mntinfo_t *mi; 931 struct nfsreadargs ra; 932 struct nfsrdresult rr; 933 int tsize; 934 int error; 935 int douprintf; 936 failinfo_t fi; 937 rnode_t *rp; 938 struct vattr va; 939 hrtime_t t; 940 941 rp = VTOR(vp); 942 mi = VTOMI(vp); 943 944 ASSERT(nfs_zone() == mi->mi_zone); 945 946 douprintf = 1; 947 948 ra.ra_fhandle = *VTOFH(vp); 949 950 fi.vp = vp; 951 fi.fhp = (caddr_t)&ra.ra_fhandle; 952 fi.copyproc = nfscopyfh; 953 fi.lookupproc = nfslookup; 954 fi.xattrdirproc = acl_getxattrdir2; 955 956 do { 957 if (mi->mi_io_kstats) { 958 mutex_enter(&mi->mi_lock); 959 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 960 mutex_exit(&mi->mi_lock); 961 } 962 963 do { 964 tsize = MIN(mi->mi_curread, count); 965 rr.rr_data = base; 966 ra.ra_offset = offset; 967 ra.ra_totcount = tsize; 968 ra.ra_count = tsize; 969 ra.ra_data = base; 970 t = gethrtime(); 971 error = rfs2call(mi, RFS_READ, 972 xdr_readargs, (caddr_t)&ra, 973 xdr_rdresult, (caddr_t)&rr, cr, 974 &douprintf, &rr.rr_status, 0, &fi); 975 } while (error == ENFS_TRYAGAIN); 976 977 if (mi->mi_io_kstats) { 978 mutex_enter(&mi->mi_lock); 979 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 980 mutex_exit(&mi->mi_lock); 981 } 982 983 if (!error) { 984 error = geterrno(rr.rr_status); 985 if (!error) { 986 count -= rr.rr_count; 987 base += rr.rr_count; 988 offset += rr.rr_count; 989 if (mi->mi_io_kstats) { 990 mutex_enter(&mi->mi_lock); 991 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 992 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 993 rr.rr_count; 994 mutex_exit(&mi->mi_lock); 995 } 996 lwp_stat_update(LWP_STAT_INBLK, 1); 997 } 998 } 999 } while (!error && count && rr.rr_count == tsize); 1000 1001 *residp = count; 1002 1003 if (!error) { 1004 /* 1005 * Since no error occurred, we have the current 1006 * attributes and we need to do a cache check and then 1007 * potentially update the cached attributes. We can't 1008 * use the normal attribute check and cache mechanisms 1009 * because they might cause a cache flush which would 1010 * deadlock. Instead, we just check the cache to see 1011 * if the attributes have changed. If it is, then we 1012 * just mark the attributes as out of date. The next 1013 * time that the attributes are checked, they will be 1014 * out of date, new attributes will be fetched, and 1015 * the page cache will be flushed. If the attributes 1016 * weren't changed, then we just update the cached 1017 * attributes with these attributes. 1018 */ 1019 /* 1020 * If NFS_ACL is supported on the server, then the 1021 * attributes returned by server may have minimal 1022 * permissions sometimes denying access to users having 1023 * proper access. To get the proper attributes, mark 1024 * the attributes as expired so that they will be 1025 * regotten via the NFS_ACL GETATTR2 procedure. 1026 */ 1027 error = nattr_to_vattr(vp, &rr.rr_attr, &va); 1028 mutex_enter(&rp->r_statelock); 1029 if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) || 1030 (mi->mi_flags & MI_ACL)) { 1031 mutex_exit(&rp->r_statelock); 1032 PURGE_ATTRCACHE(vp); 1033 } else { 1034 if (rp->r_mtime <= t) { 1035 nfs_attrcache_va(vp, &va); 1036 } 1037 mutex_exit(&rp->r_statelock); 1038 } 1039 } 1040 1041 return (error); 1042 } 1043 1044 /* ARGSUSED */ 1045 static int 1046 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 1047 caller_context_t *ct) 1048 { 1049 1050 if (nfs_zone() != VTOMI(vp)->mi_zone) 1051 return (EIO); 1052 switch (cmd) { 1053 case _FIODIRECTIO: 1054 return (nfs_directio(vp, (int)arg, cr)); 1055 default: 1056 return (ENOTTY); 1057 } 1058 } 1059 1060 /* ARGSUSED */ 1061 static int 1062 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 1063 caller_context_t *ct) 1064 { 1065 int error; 1066 rnode_t *rp; 1067 1068 if (nfs_zone() != VTOMI(vp)->mi_zone) 1069 return (EIO); 1070 /* 1071 * If it has been specified that the return value will 1072 * just be used as a hint, and we are only being asked 1073 * for size, fsid or rdevid, then return the client's 1074 * notion of these values without checking to make sure 1075 * that the attribute cache is up to date. 1076 * The whole point is to avoid an over the wire GETATTR 1077 * call. 1078 */ 1079 rp = VTOR(vp); 1080 if (flags & ATTR_HINT) { 1081 if (vap->va_mask == 1082 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 1083 mutex_enter(&rp->r_statelock); 1084 if (vap->va_mask | AT_SIZE) 1085 vap->va_size = rp->r_size; 1086 if (vap->va_mask | AT_FSID) 1087 vap->va_fsid = rp->r_attr.va_fsid; 1088 if (vap->va_mask | AT_RDEV) 1089 vap->va_rdev = rp->r_attr.va_rdev; 1090 mutex_exit(&rp->r_statelock); 1091 return (0); 1092 } 1093 } 1094 1095 /* 1096 * Only need to flush pages if asking for the mtime 1097 * and if there any dirty pages or any outstanding 1098 * asynchronous (write) requests for this file. 1099 */ 1100 if (vap->va_mask & AT_MTIME) { 1101 if (vn_has_cached_data(vp) && 1102 ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) { 1103 mutex_enter(&rp->r_statelock); 1104 rp->r_gcount++; 1105 mutex_exit(&rp->r_statelock); 1106 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 1107 mutex_enter(&rp->r_statelock); 1108 if (error && (error == ENOSPC || error == EDQUOT)) { 1109 if (!rp->r_error) 1110 rp->r_error = error; 1111 } 1112 if (--rp->r_gcount == 0) 1113 cv_broadcast(&rp->r_cv); 1114 mutex_exit(&rp->r_statelock); 1115 } 1116 } 1117 1118 return (nfsgetattr(vp, vap, cr)); 1119 } 1120 1121 /*ARGSUSED4*/ 1122 static int 1123 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 1124 caller_context_t *ct) 1125 { 1126 int error; 1127 uint_t mask; 1128 struct vattr va; 1129 1130 mask = vap->va_mask; 1131 1132 if (mask & AT_NOSET) 1133 return (EINVAL); 1134 1135 if ((mask & AT_SIZE) && 1136 vap->va_type == VREG && 1137 vap->va_size > MAXOFF32_T) 1138 return (EFBIG); 1139 1140 if (nfs_zone() != VTOMI(vp)->mi_zone) 1141 return (EIO); 1142 1143 va.va_mask = AT_UID | AT_MODE; 1144 1145 error = nfsgetattr(vp, &va, cr); 1146 if (error) 1147 return (error); 1148 1149 error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx, 1150 vp); 1151 1152 if (error) 1153 return (error); 1154 1155 return (nfssetattr(vp, vap, flags, cr)); 1156 } 1157 1158 static int 1159 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr) 1160 { 1161 int error; 1162 uint_t mask; 1163 struct nfssaargs args; 1164 struct nfsattrstat ns; 1165 int douprintf; 1166 rnode_t *rp; 1167 struct vattr va; 1168 mode_t omode; 1169 mntinfo_t *mi; 1170 vsecattr_t *vsp; 1171 hrtime_t t; 1172 1173 mask = vap->va_mask; 1174 1175 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 1176 1177 rp = VTOR(vp); 1178 1179 /* 1180 * Only need to flush pages if there are any pages and 1181 * if the file is marked as dirty in some fashion. The 1182 * file must be flushed so that we can accurately 1183 * determine the size of the file and the cached data 1184 * after the SETATTR returns. A file is considered to 1185 * be dirty if it is either marked with RDIRTY, has 1186 * outstanding i/o's active, or is mmap'd. In this 1187 * last case, we can't tell whether there are dirty 1188 * pages, so we flush just to be sure. 1189 */ 1190 if (vn_has_cached_data(vp) && 1191 ((rp->r_flags & RDIRTY) || 1192 rp->r_count > 0 || 1193 rp->r_mapcnt > 0)) { 1194 ASSERT(vp->v_type != VCHR); 1195 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 1196 if (error && (error == ENOSPC || error == EDQUOT)) { 1197 mutex_enter(&rp->r_statelock); 1198 if (!rp->r_error) 1199 rp->r_error = error; 1200 mutex_exit(&rp->r_statelock); 1201 } 1202 } 1203 1204 /* 1205 * If the system call was utime(2) or utimes(2) and the 1206 * application did not specify the times, then set the 1207 * mtime nanosecond field to 1 billion. This will get 1208 * translated from 1 billion nanoseconds to 1 million 1209 * microseconds in the over the wire request. The 1210 * server will use 1 million in the microsecond field 1211 * to tell whether both the mtime and atime should be 1212 * set to the server's current time. 1213 * 1214 * This is an overload of the protocol and should be 1215 * documented in the NFS Version 2 protocol specification. 1216 */ 1217 if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) { 1218 vap->va_mtime.tv_nsec = 1000000000; 1219 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) && 1220 NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1221 error = vattr_to_sattr(vap, &args.saa_sa); 1222 } else { 1223 /* 1224 * Use server times. vap time values will not be used. 1225 * To ensure no time overflow, make sure vap has 1226 * valid values, but retain the original values. 1227 */ 1228 timestruc_t mtime = vap->va_mtime; 1229 timestruc_t atime = vap->va_atime; 1230 time_t now; 1231 1232 now = gethrestime_sec(); 1233 if (NFS_TIME_T_OK(now)) { 1234 /* Just in case server does not know of this */ 1235 vap->va_mtime.tv_sec = now; 1236 vap->va_atime.tv_sec = now; 1237 } else { 1238 vap->va_mtime.tv_sec = 0; 1239 vap->va_atime.tv_sec = 0; 1240 } 1241 error = vattr_to_sattr(vap, &args.saa_sa); 1242 /* set vap times back on */ 1243 vap->va_mtime = mtime; 1244 vap->va_atime = atime; 1245 } 1246 } else { 1247 /* Either do not set times or use the client specified times */ 1248 error = vattr_to_sattr(vap, &args.saa_sa); 1249 } 1250 if (error) { 1251 /* req time field(s) overflow - return immediately */ 1252 return (error); 1253 } 1254 args.saa_fh = *VTOFH(vp); 1255 1256 va.va_mask = AT_MODE; 1257 error = nfsgetattr(vp, &va, cr); 1258 if (error) 1259 return (error); 1260 omode = va.va_mode; 1261 1262 mi = VTOMI(vp); 1263 1264 douprintf = 1; 1265 1266 t = gethrtime(); 1267 1268 error = rfs2call(mi, RFS_SETATTR, 1269 xdr_saargs, (caddr_t)&args, 1270 xdr_attrstat, (caddr_t)&ns, cr, 1271 &douprintf, &ns.ns_status, 0, NULL); 1272 1273 /* 1274 * Purge the access cache and ACL cache if changing either the 1275 * owner of the file, the group owner, or the mode. These may 1276 * change the access permissions of the file, so purge old 1277 * information and start over again. 1278 */ 1279 if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) { 1280 (void) nfs_access_purge_rp(rp); 1281 if (rp->r_secattr != NULL) { 1282 mutex_enter(&rp->r_statelock); 1283 vsp = rp->r_secattr; 1284 rp->r_secattr = NULL; 1285 mutex_exit(&rp->r_statelock); 1286 if (vsp != NULL) 1287 nfs_acl_free(vsp); 1288 } 1289 } 1290 1291 if (!error) { 1292 error = geterrno(ns.ns_status); 1293 if (!error) { 1294 /* 1295 * If changing the size of the file, invalidate 1296 * any local cached data which is no longer part 1297 * of the file. We also possibly invalidate the 1298 * last page in the file. We could use 1299 * pvn_vpzero(), but this would mark the page as 1300 * modified and require it to be written back to 1301 * the server for no particularly good reason. 1302 * This way, if we access it, then we bring it 1303 * back in. A read should be cheaper than a 1304 * write. 1305 */ 1306 if (mask & AT_SIZE) { 1307 nfs_invalidate_pages(vp, 1308 (vap->va_size & PAGEMASK), cr); 1309 } 1310 (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr); 1311 /* 1312 * If NFS_ACL is supported on the server, then the 1313 * attributes returned by server may have minimal 1314 * permissions sometimes denying access to users having 1315 * proper access. To get the proper attributes, mark 1316 * the attributes as expired so that they will be 1317 * regotten via the NFS_ACL GETATTR2 procedure. 1318 */ 1319 if (mi->mi_flags & MI_ACL) { 1320 PURGE_ATTRCACHE(vp); 1321 } 1322 /* 1323 * This next check attempts to deal with NFS 1324 * servers which can not handle increasing 1325 * the size of the file via setattr. Most 1326 * of these servers do not return an error, 1327 * but do not change the size of the file. 1328 * Hence, this check and then attempt to set 1329 * the file size by writing 1 byte at the 1330 * offset of the end of the file that we need. 1331 */ 1332 if ((mask & AT_SIZE) && 1333 ns.ns_attr.na_size < (uint32_t)vap->va_size) { 1334 char zb = '\0'; 1335 1336 error = nfswrite(vp, &zb, 1337 vap->va_size - sizeof (zb), 1338 sizeof (zb), cr); 1339 } 1340 /* 1341 * Some servers will change the mode to clear the setuid 1342 * and setgid bits when changing the uid or gid. The 1343 * client needs to compensate appropriately. 1344 */ 1345 if (mask & (AT_UID | AT_GID)) { 1346 int terror; 1347 1348 va.va_mask = AT_MODE; 1349 terror = nfsgetattr(vp, &va, cr); 1350 if (!terror && 1351 (((mask & AT_MODE) && 1352 va.va_mode != vap->va_mode) || 1353 (!(mask & AT_MODE) && 1354 va.va_mode != omode))) { 1355 va.va_mask = AT_MODE; 1356 if (mask & AT_MODE) 1357 va.va_mode = vap->va_mode; 1358 else 1359 va.va_mode = omode; 1360 (void) nfssetattr(vp, &va, 0, cr); 1361 } 1362 } 1363 } else { 1364 PURGE_ATTRCACHE(vp); 1365 PURGE_STALE_FH(error, vp, cr); 1366 } 1367 } else { 1368 PURGE_ATTRCACHE(vp); 1369 } 1370 1371 return (error); 1372 } 1373 1374 static int 1375 nfs_accessx(void *vp, int mode, cred_t *cr) 1376 { 1377 ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone); 1378 return (nfs_access(vp, mode, 0, cr, NULL)); 1379 } 1380 1381 /* ARGSUSED */ 1382 static int 1383 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 1384 { 1385 struct vattr va; 1386 int error; 1387 mntinfo_t *mi; 1388 int shift = 0; 1389 1390 mi = VTOMI(vp); 1391 1392 if (nfs_zone() != mi->mi_zone) 1393 return (EIO); 1394 if (mi->mi_flags & MI_ACL) { 1395 error = acl_access2(vp, mode, flags, cr); 1396 if (mi->mi_flags & MI_ACL) 1397 return (error); 1398 } 1399 1400 va.va_mask = AT_MODE | AT_UID | AT_GID; 1401 error = nfsgetattr(vp, &va, cr); 1402 if (error) 1403 return (error); 1404 1405 /* 1406 * Disallow write attempts on read-only 1407 * file systems, unless the file is a 1408 * device node. 1409 */ 1410 if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp)) 1411 return (EROFS); 1412 1413 /* 1414 * Disallow attempts to access mandatory lock files. 1415 */ 1416 if ((mode & (VWRITE | VREAD | VEXEC)) && 1417 MANDLOCK(vp, va.va_mode)) 1418 return (EACCES); 1419 1420 /* 1421 * Access check is based on only 1422 * one of owner, group, public. 1423 * If not owner, then check group. 1424 * If not a member of the group, 1425 * then check public access. 1426 */ 1427 if (crgetuid(cr) != va.va_uid) { 1428 shift += 3; 1429 if (!groupmember(va.va_gid, cr)) 1430 shift += 3; 1431 } 1432 found: 1433 mode &= ~(va.va_mode << shift); 1434 if (mode == 0) 1435 return (0); 1436 1437 return (secpolicy_vnode_access(cr, vp, va.va_uid, mode)); 1438 } 1439 1440 static int nfs_do_symlink_cache = 1; 1441 1442 /* ARGSUSED */ 1443 static int 1444 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 1445 { 1446 int error; 1447 struct nfsrdlnres rl; 1448 rnode_t *rp; 1449 int douprintf; 1450 failinfo_t fi; 1451 1452 /* 1453 * We want to be consistent with UFS semantics so we will return 1454 * EINVAL instead of ENXIO. This violates the XNFS spec and 1455 * the RFC 1094, which are wrong any way. BUGID 1138002. 1456 */ 1457 if (vp->v_type != VLNK) 1458 return (EINVAL); 1459 1460 if (nfs_zone() != VTOMI(vp)->mi_zone) 1461 return (EIO); 1462 1463 rp = VTOR(vp); 1464 if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) { 1465 error = nfs_validate_caches(vp, cr); 1466 if (error) 1467 return (error); 1468 mutex_enter(&rp->r_statelock); 1469 if (rp->r_symlink.contents != NULL) { 1470 error = uiomove(rp->r_symlink.contents, 1471 rp->r_symlink.len, UIO_READ, uiop); 1472 mutex_exit(&rp->r_statelock); 1473 return (error); 1474 } 1475 mutex_exit(&rp->r_statelock); 1476 } 1477 1478 1479 rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP); 1480 1481 fi.vp = vp; 1482 fi.fhp = NULL; /* no need to update, filehandle not copied */ 1483 fi.copyproc = nfscopyfh; 1484 fi.lookupproc = nfslookup; 1485 fi.xattrdirproc = acl_getxattrdir2; 1486 1487 douprintf = 1; 1488 1489 error = rfs2call(VTOMI(vp), RFS_READLINK, 1490 xdr_readlink, (caddr_t)VTOFH(vp), 1491 xdr_rdlnres, (caddr_t)&rl, cr, 1492 &douprintf, &rl.rl_status, 0, &fi); 1493 1494 if (error) { 1495 1496 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1497 return (error); 1498 } 1499 1500 error = geterrno(rl.rl_status); 1501 if (!error) { 1502 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop); 1503 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) { 1504 mutex_enter(&rp->r_statelock); 1505 if (rp->r_symlink.contents == NULL) { 1506 rp->r_symlink.contents = rl.rl_data; 1507 rp->r_symlink.len = (int)rl.rl_count; 1508 rp->r_symlink.size = NFS_MAXPATHLEN; 1509 mutex_exit(&rp->r_statelock); 1510 } else { 1511 mutex_exit(&rp->r_statelock); 1512 1513 kmem_free((void *)rl.rl_data, 1514 NFS_MAXPATHLEN); 1515 } 1516 } else { 1517 1518 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1519 } 1520 } else { 1521 PURGE_STALE_FH(error, vp, cr); 1522 1523 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1524 } 1525 1526 /* 1527 * Conform to UFS semantics (see comment above) 1528 */ 1529 return (error == ENXIO ? EINVAL : error); 1530 } 1531 1532 /* 1533 * Flush local dirty pages to stable storage on the server. 1534 * 1535 * If FNODSYNC is specified, then there is nothing to do because 1536 * metadata changes are not cached on the client before being 1537 * sent to the server. 1538 */ 1539 /* ARGSUSED */ 1540 static int 1541 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 1542 { 1543 int error; 1544 1545 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 1546 return (0); 1547 1548 if (nfs_zone() != VTOMI(vp)->mi_zone) 1549 return (EIO); 1550 1551 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 1552 if (!error) 1553 error = VTOR(vp)->r_error; 1554 return (error); 1555 } 1556 1557 1558 /* 1559 * Weirdness: if the file was removed or the target of a rename 1560 * operation while it was open, it got renamed instead. Here we 1561 * remove the renamed file. 1562 */ 1563 /* ARGSUSED */ 1564 static void 1565 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 1566 { 1567 rnode_t *rp; 1568 1569 ASSERT(vp != DNLC_NO_VNODE); 1570 1571 /* 1572 * If this is coming from the wrong zone, we let someone in the right 1573 * zone take care of it asynchronously. We can get here due to 1574 * VN_RELE() being called from pageout() or fsflush(). This call may 1575 * potentially turn into an expensive no-op if, for instance, v_count 1576 * gets incremented in the meantime, but it's still correct. 1577 */ 1578 if (nfs_zone() != VTOMI(vp)->mi_zone) { 1579 nfs_async_inactive(vp, cr, nfs_inactive); 1580 return; 1581 } 1582 1583 rp = VTOR(vp); 1584 redo: 1585 if (rp->r_unldvp != NULL) { 1586 /* 1587 * Save the vnode pointer for the directory where the 1588 * unlinked-open file got renamed, then set it to NULL 1589 * to prevent another thread from getting here before 1590 * we're done with the remove. While we have the 1591 * statelock, make local copies of the pertinent rnode 1592 * fields. If we weren't to do this in an atomic way, the 1593 * the unl* fields could become inconsistent with respect 1594 * to each other due to a race condition between this 1595 * code and nfs_remove(). See bug report 1034328. 1596 */ 1597 mutex_enter(&rp->r_statelock); 1598 if (rp->r_unldvp != NULL) { 1599 vnode_t *unldvp; 1600 char *unlname; 1601 cred_t *unlcred; 1602 struct nfsdiropargs da; 1603 enum nfsstat status; 1604 int douprintf; 1605 int error; 1606 1607 unldvp = rp->r_unldvp; 1608 rp->r_unldvp = NULL; 1609 unlname = rp->r_unlname; 1610 rp->r_unlname = NULL; 1611 unlcred = rp->r_unlcred; 1612 rp->r_unlcred = NULL; 1613 mutex_exit(&rp->r_statelock); 1614 1615 /* 1616 * If there are any dirty pages left, then flush 1617 * them. This is unfortunate because they just 1618 * may get thrown away during the remove operation, 1619 * but we have to do this for correctness. 1620 */ 1621 if (vn_has_cached_data(vp) && 1622 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 1623 ASSERT(vp->v_type != VCHR); 1624 error = nfs_putpage(vp, (offset_t)0, 0, 0, 1625 cr, ct); 1626 if (error) { 1627 mutex_enter(&rp->r_statelock); 1628 if (!rp->r_error) 1629 rp->r_error = error; 1630 mutex_exit(&rp->r_statelock); 1631 } 1632 } 1633 1634 /* 1635 * Do the remove operation on the renamed file 1636 */ 1637 setdiropargs(&da, unlname, unldvp); 1638 1639 douprintf = 1; 1640 1641 (void) rfs2call(VTOMI(unldvp), RFS_REMOVE, 1642 xdr_diropargs, (caddr_t)&da, 1643 xdr_enum, (caddr_t)&status, unlcred, 1644 &douprintf, &status, 0, NULL); 1645 1646 if (HAVE_RDDIR_CACHE(VTOR(unldvp))) 1647 nfs_purge_rddir_cache(unldvp); 1648 PURGE_ATTRCACHE(unldvp); 1649 1650 /* 1651 * Release stuff held for the remove 1652 */ 1653 VN_RELE(unldvp); 1654 kmem_free(unlname, MAXNAMELEN); 1655 crfree(unlcred); 1656 goto redo; 1657 } 1658 mutex_exit(&rp->r_statelock); 1659 } 1660 1661 rp_addfree(rp, cr); 1662 } 1663 1664 /* 1665 * Remote file system operations having to do with directory manipulation. 1666 */ 1667 1668 /* ARGSUSED */ 1669 static int 1670 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1671 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1672 int *direntflags, pathname_t *realpnp) 1673 { 1674 int error; 1675 vnode_t *vp; 1676 vnode_t *avp = NULL; 1677 rnode_t *drp; 1678 1679 if (nfs_zone() != VTOMI(dvp)->mi_zone) 1680 return (EPERM); 1681 1682 drp = VTOR(dvp); 1683 1684 /* 1685 * Are we looking up extended attributes? If so, "dvp" is 1686 * the file or directory for which we want attributes, and 1687 * we need a lookup of the hidden attribute directory 1688 * before we lookup the rest of the path. 1689 */ 1690 if (flags & LOOKUP_XATTR) { 1691 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0); 1692 mntinfo_t *mi; 1693 1694 mi = VTOMI(dvp); 1695 if (!(mi->mi_flags & MI_EXTATTR)) 1696 return (EINVAL); 1697 1698 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) 1699 return (EINTR); 1700 1701 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr); 1702 if (avp == NULL) 1703 error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0); 1704 else 1705 error = 0; 1706 1707 nfs_rw_exit(&drp->r_rwlock); 1708 1709 if (error) { 1710 if (mi->mi_flags & MI_EXTATTR) 1711 return (error); 1712 return (EINVAL); 1713 } 1714 dvp = avp; 1715 drp = VTOR(dvp); 1716 } 1717 1718 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) { 1719 error = EINTR; 1720 goto out; 1721 } 1722 1723 error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0); 1724 1725 nfs_rw_exit(&drp->r_rwlock); 1726 1727 /* 1728 * If vnode is a device, create special vnode. 1729 */ 1730 if (!error && IS_DEVVP(*vpp)) { 1731 vp = *vpp; 1732 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 1733 VN_RELE(vp); 1734 } 1735 1736 out: 1737 if (avp != NULL) 1738 VN_RELE(avp); 1739 1740 return (error); 1741 } 1742 1743 static int nfs_lookup_neg_cache = 1; 1744 1745 #ifdef DEBUG 1746 static int nfs_lookup_dnlc_hits = 0; 1747 static int nfs_lookup_dnlc_misses = 0; 1748 static int nfs_lookup_dnlc_neg_hits = 0; 1749 static int nfs_lookup_dnlc_disappears = 0; 1750 static int nfs_lookup_dnlc_lookups = 0; 1751 #endif 1752 1753 /* ARGSUSED */ 1754 int 1755 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1756 int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags) 1757 { 1758 int error; 1759 1760 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1761 1762 /* 1763 * If lookup is for "", just return dvp. Don't need 1764 * to send it over the wire, look it up in the dnlc, 1765 * or perform any access checks. 1766 */ 1767 if (*nm == '\0') { 1768 VN_HOLD(dvp); 1769 *vpp = dvp; 1770 return (0); 1771 } 1772 1773 /* 1774 * Can't do lookups in non-directories. 1775 */ 1776 if (dvp->v_type != VDIR) 1777 return (ENOTDIR); 1778 1779 /* 1780 * If we're called with RFSCALL_SOFT, it's important that 1781 * the only rfscall is one we make directly; if we permit 1782 * an access call because we're looking up "." or validating 1783 * a dnlc hit, we'll deadlock because that rfscall will not 1784 * have the RFSCALL_SOFT set. 1785 */ 1786 if (rfscall_flags & RFSCALL_SOFT) 1787 goto callit; 1788 1789 /* 1790 * If lookup is for ".", just return dvp. Don't need 1791 * to send it over the wire or look it up in the dnlc, 1792 * just need to check access. 1793 */ 1794 if (strcmp(nm, ".") == 0) { 1795 error = nfs_access(dvp, VEXEC, 0, cr, NULL); 1796 if (error) 1797 return (error); 1798 VN_HOLD(dvp); 1799 *vpp = dvp; 1800 return (0); 1801 } 1802 1803 /* 1804 * Lookup this name in the DNLC. If there was a valid entry, 1805 * then return the results of the lookup. 1806 */ 1807 error = nfslookup_dnlc(dvp, nm, vpp, cr); 1808 if (error || *vpp != NULL) 1809 return (error); 1810 1811 callit: 1812 error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags); 1813 1814 return (error); 1815 } 1816 1817 static int 1818 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 1819 { 1820 int error; 1821 vnode_t *vp; 1822 1823 ASSERT(*nm != '\0'); 1824 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1825 1826 /* 1827 * Lookup this name in the DNLC. If successful, then validate 1828 * the caches and then recheck the DNLC. The DNLC is rechecked 1829 * just in case this entry got invalidated during the call 1830 * to nfs_validate_caches. 1831 * 1832 * An assumption is being made that it is safe to say that a 1833 * file exists which may not on the server. Any operations to 1834 * the server will fail with ESTALE. 1835 */ 1836 #ifdef DEBUG 1837 nfs_lookup_dnlc_lookups++; 1838 #endif 1839 vp = dnlc_lookup(dvp, nm); 1840 if (vp != NULL) { 1841 VN_RELE(vp); 1842 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) { 1843 PURGE_ATTRCACHE(dvp); 1844 } 1845 error = nfs_validate_caches(dvp, cr); 1846 if (error) 1847 return (error); 1848 vp = dnlc_lookup(dvp, nm); 1849 if (vp != NULL) { 1850 error = nfs_access(dvp, VEXEC, 0, cr, NULL); 1851 if (error) { 1852 VN_RELE(vp); 1853 return (error); 1854 } 1855 if (vp == DNLC_NO_VNODE) { 1856 VN_RELE(vp); 1857 #ifdef DEBUG 1858 nfs_lookup_dnlc_neg_hits++; 1859 #endif 1860 return (ENOENT); 1861 } 1862 *vpp = vp; 1863 #ifdef DEBUG 1864 nfs_lookup_dnlc_hits++; 1865 #endif 1866 return (0); 1867 } 1868 #ifdef DEBUG 1869 nfs_lookup_dnlc_disappears++; 1870 #endif 1871 } 1872 #ifdef DEBUG 1873 else 1874 nfs_lookup_dnlc_misses++; 1875 #endif 1876 1877 *vpp = NULL; 1878 1879 return (0); 1880 } 1881 1882 static int 1883 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, 1884 int rfscall_flags) 1885 { 1886 int error; 1887 struct nfsdiropargs da; 1888 struct nfsdiropres dr; 1889 int douprintf; 1890 failinfo_t fi; 1891 hrtime_t t; 1892 1893 ASSERT(*nm != '\0'); 1894 ASSERT(dvp->v_type == VDIR); 1895 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1896 1897 setdiropargs(&da, nm, dvp); 1898 1899 fi.vp = dvp; 1900 fi.fhp = NULL; /* no need to update, filehandle not copied */ 1901 fi.copyproc = nfscopyfh; 1902 fi.lookupproc = nfslookup; 1903 fi.xattrdirproc = acl_getxattrdir2; 1904 1905 douprintf = 1; 1906 1907 t = gethrtime(); 1908 1909 error = rfs2call(VTOMI(dvp), RFS_LOOKUP, 1910 xdr_diropargs, (caddr_t)&da, 1911 xdr_diropres, (caddr_t)&dr, cr, 1912 &douprintf, &dr.dr_status, rfscall_flags, &fi); 1913 1914 if (!error) { 1915 error = geterrno(dr.dr_status); 1916 if (!error) { 1917 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 1918 dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); 1919 /* 1920 * If NFS_ACL is supported on the server, then the 1921 * attributes returned by server may have minimal 1922 * permissions sometimes denying access to users having 1923 * proper access. To get the proper attributes, mark 1924 * the attributes as expired so that they will be 1925 * regotten via the NFS_ACL GETATTR2 procedure. 1926 */ 1927 if (VTOMI(*vpp)->mi_flags & MI_ACL) { 1928 PURGE_ATTRCACHE(*vpp); 1929 } 1930 if (!(rfscall_flags & RFSCALL_SOFT)) 1931 dnlc_update(dvp, nm, *vpp); 1932 } else { 1933 PURGE_STALE_FH(error, dvp, cr); 1934 if (error == ENOENT && nfs_lookup_neg_cache) 1935 dnlc_enter(dvp, nm, DNLC_NO_VNODE); 1936 } 1937 } 1938 1939 return (error); 1940 } 1941 1942 /* ARGSUSED */ 1943 static int 1944 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 1945 int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct, 1946 vsecattr_t *vsecp) 1947 { 1948 int error; 1949 struct nfscreatargs args; 1950 struct nfsdiropres dr; 1951 int douprintf; 1952 vnode_t *vp; 1953 rnode_t *rp; 1954 struct vattr vattr; 1955 rnode_t *drp; 1956 vnode_t *tempvp; 1957 hrtime_t t; 1958 1959 drp = VTOR(dvp); 1960 1961 if (nfs_zone() != VTOMI(dvp)->mi_zone) 1962 return (EPERM); 1963 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 1964 return (EINTR); 1965 1966 /* 1967 * We make a copy of the attributes because the caller does not 1968 * expect us to change what va points to. 1969 */ 1970 vattr = *va; 1971 1972 /* 1973 * If the pathname is "", just use dvp. Don't need 1974 * to send it over the wire, look it up in the dnlc, 1975 * or perform any access checks. 1976 */ 1977 if (*nm == '\0') { 1978 error = 0; 1979 VN_HOLD(dvp); 1980 vp = dvp; 1981 /* 1982 * If the pathname is ".", just use dvp. Don't need 1983 * to send it over the wire or look it up in the dnlc, 1984 * just need to check access. 1985 */ 1986 } else if (strcmp(nm, ".") == 0) { 1987 error = nfs_access(dvp, VEXEC, 0, cr, ct); 1988 if (error) { 1989 nfs_rw_exit(&drp->r_rwlock); 1990 return (error); 1991 } 1992 VN_HOLD(dvp); 1993 vp = dvp; 1994 /* 1995 * We need to go over the wire, just to be sure whether the 1996 * file exists or not. Using the DNLC can be dangerous in 1997 * this case when making a decision regarding existence. 1998 */ 1999 } else { 2000 error = nfslookup_otw(dvp, nm, &vp, cr, 0); 2001 } 2002 if (!error) { 2003 if (exclusive == EXCL) 2004 error = EEXIST; 2005 else if (vp->v_type == VDIR && (mode & VWRITE)) 2006 error = EISDIR; 2007 else { 2008 /* 2009 * If vnode is a device, create special vnode. 2010 */ 2011 if (IS_DEVVP(vp)) { 2012 tempvp = vp; 2013 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2014 VN_RELE(tempvp); 2015 } 2016 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 2017 if ((vattr.va_mask & AT_SIZE) && 2018 vp->v_type == VREG) { 2019 vattr.va_mask = AT_SIZE; 2020 error = nfssetattr(vp, &vattr, 0, cr); 2021 } 2022 } 2023 } 2024 nfs_rw_exit(&drp->r_rwlock); 2025 if (error) { 2026 VN_RELE(vp); 2027 } else { 2028 /* 2029 * existing file got truncated, notify. 2030 */ 2031 vnevent_create(vp, ct); 2032 *vpp = vp; 2033 } 2034 return (error); 2035 } 2036 2037 ASSERT(vattr.va_mask & AT_TYPE); 2038 if (vattr.va_type == VREG) { 2039 ASSERT(vattr.va_mask & AT_MODE); 2040 if (MANDMODE(vattr.va_mode)) { 2041 nfs_rw_exit(&drp->r_rwlock); 2042 return (EACCES); 2043 } 2044 } 2045 2046 dnlc_remove(dvp, nm); 2047 2048 setdiropargs(&args.ca_da, nm, dvp); 2049 2050 /* 2051 * Decide what the group-id of the created file should be. 2052 * Set it in attribute list as advisory...then do a setattr 2053 * if the server didn't get it right the first time. 2054 */ 2055 error = setdirgid(dvp, &vattr.va_gid, cr); 2056 if (error) { 2057 nfs_rw_exit(&drp->r_rwlock); 2058 return (error); 2059 } 2060 vattr.va_mask |= AT_GID; 2061 2062 /* 2063 * This is a completely gross hack to make mknod 2064 * work over the wire until we can wack the protocol 2065 */ 2066 #define IFCHR 0020000 /* character special */ 2067 #define IFBLK 0060000 /* block special */ 2068 #define IFSOCK 0140000 /* socket */ 2069 2070 /* 2071 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x 2072 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18 2073 * bits in the minor number where 4.x supports 8 bits. If the 5.x 2074 * minor/major numbers <= 8 bits long, compress the device 2075 * number before sending it. Otherwise, the 4.x server will not 2076 * create the device with the correct device number and nothing can be 2077 * done about this. 2078 */ 2079 if (vattr.va_type == VCHR || vattr.va_type == VBLK) { 2080 dev_t d = vattr.va_rdev; 2081 dev32_t dev32; 2082 2083 if (vattr.va_type == VCHR) 2084 vattr.va_mode |= IFCHR; 2085 else 2086 vattr.va_mode |= IFBLK; 2087 2088 (void) cmpldev(&dev32, d); 2089 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN)) 2090 vattr.va_size = (u_offset_t)dev32; 2091 else 2092 vattr.va_size = (u_offset_t)nfsv2_cmpdev(d); 2093 2094 vattr.va_mask |= AT_MODE|AT_SIZE; 2095 } else if (vattr.va_type == VFIFO) { 2096 vattr.va_mode |= IFCHR; /* xtra kludge for namedpipe */ 2097 vattr.va_size = (u_offset_t)NFS_FIFO_DEV; /* blech */ 2098 vattr.va_mask |= AT_MODE|AT_SIZE; 2099 } else if (vattr.va_type == VSOCK) { 2100 vattr.va_mode |= IFSOCK; 2101 /* 2102 * To avoid triggering bugs in the servers set AT_SIZE 2103 * (all other RFS_CREATE calls set this). 2104 */ 2105 vattr.va_size = 0; 2106 vattr.va_mask |= AT_MODE|AT_SIZE; 2107 } 2108 2109 args.ca_sa = &args.ca_sa_buf; 2110 error = vattr_to_sattr(&vattr, args.ca_sa); 2111 if (error) { 2112 /* req time field(s) overflow - return immediately */ 2113 nfs_rw_exit(&drp->r_rwlock); 2114 return (error); 2115 } 2116 2117 douprintf = 1; 2118 2119 t = gethrtime(); 2120 2121 error = rfs2call(VTOMI(dvp), RFS_CREATE, 2122 xdr_creatargs, (caddr_t)&args, 2123 xdr_diropres, (caddr_t)&dr, cr, 2124 &douprintf, &dr.dr_status, 0, NULL); 2125 2126 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2127 2128 if (!error) { 2129 error = geterrno(dr.dr_status); 2130 if (!error) { 2131 if (HAVE_RDDIR_CACHE(drp)) 2132 nfs_purge_rddir_cache(dvp); 2133 vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2134 dvp->v_vfsp, t, cr, NULL, NULL); 2135 /* 2136 * If NFS_ACL is supported on the server, then the 2137 * attributes returned by server may have minimal 2138 * permissions sometimes denying access to users having 2139 * proper access. To get the proper attributes, mark 2140 * the attributes as expired so that they will be 2141 * regotten via the NFS_ACL GETATTR2 procedure. 2142 */ 2143 if (VTOMI(vp)->mi_flags & MI_ACL) { 2144 PURGE_ATTRCACHE(vp); 2145 } 2146 dnlc_update(dvp, nm, vp); 2147 rp = VTOR(vp); 2148 if (vattr.va_size == 0) { 2149 mutex_enter(&rp->r_statelock); 2150 rp->r_size = 0; 2151 mutex_exit(&rp->r_statelock); 2152 if (vn_has_cached_data(vp)) { 2153 ASSERT(vp->v_type != VCHR); 2154 nfs_invalidate_pages(vp, 2155 (u_offset_t)0, cr); 2156 } 2157 } 2158 2159 /* 2160 * Make sure the gid was set correctly. 2161 * If not, try to set it (but don't lose 2162 * any sleep over it). 2163 */ 2164 if (vattr.va_gid != rp->r_attr.va_gid) { 2165 vattr.va_mask = AT_GID; 2166 (void) nfssetattr(vp, &vattr, 0, cr); 2167 } 2168 2169 /* 2170 * If vnode is a device create special vnode 2171 */ 2172 if (IS_DEVVP(vp)) { 2173 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2174 VN_RELE(vp); 2175 } else 2176 *vpp = vp; 2177 } else { 2178 PURGE_STALE_FH(error, dvp, cr); 2179 } 2180 } 2181 2182 nfs_rw_exit(&drp->r_rwlock); 2183 2184 return (error); 2185 } 2186 2187 /* 2188 * Weirdness: if the vnode to be removed is open 2189 * we rename it instead of removing it and nfs_inactive 2190 * will remove the new name. 2191 */ 2192 /* ARGSUSED */ 2193 static int 2194 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 2195 { 2196 int error; 2197 struct nfsdiropargs da; 2198 enum nfsstat status; 2199 vnode_t *vp; 2200 char *tmpname; 2201 int douprintf; 2202 rnode_t *rp; 2203 rnode_t *drp; 2204 2205 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2206 return (EPERM); 2207 drp = VTOR(dvp); 2208 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2209 return (EINTR); 2210 2211 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2212 if (error) { 2213 nfs_rw_exit(&drp->r_rwlock); 2214 return (error); 2215 } 2216 2217 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2218 VN_RELE(vp); 2219 nfs_rw_exit(&drp->r_rwlock); 2220 return (EPERM); 2221 } 2222 2223 /* 2224 * First just remove the entry from the name cache, as it 2225 * is most likely the only entry for this vp. 2226 */ 2227 dnlc_remove(dvp, nm); 2228 2229 /* 2230 * If the file has a v_count > 1 then there may be more than one 2231 * entry in the name cache due multiple links or an open file, 2232 * but we don't have the real reference count so flush all 2233 * possible entries. 2234 */ 2235 if (vp->v_count > 1) 2236 dnlc_purge_vp(vp); 2237 2238 /* 2239 * Now we have the real reference count on the vnode 2240 */ 2241 rp = VTOR(vp); 2242 mutex_enter(&rp->r_statelock); 2243 if (vp->v_count > 1 && 2244 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2245 mutex_exit(&rp->r_statelock); 2246 tmpname = newname(); 2247 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct); 2248 if (error) 2249 kmem_free(tmpname, MAXNAMELEN); 2250 else { 2251 mutex_enter(&rp->r_statelock); 2252 if (rp->r_unldvp == NULL) { 2253 VN_HOLD(dvp); 2254 rp->r_unldvp = dvp; 2255 if (rp->r_unlcred != NULL) 2256 crfree(rp->r_unlcred); 2257 crhold(cr); 2258 rp->r_unlcred = cr; 2259 rp->r_unlname = tmpname; 2260 } else { 2261 kmem_free(rp->r_unlname, MAXNAMELEN); 2262 rp->r_unlname = tmpname; 2263 } 2264 mutex_exit(&rp->r_statelock); 2265 } 2266 } else { 2267 mutex_exit(&rp->r_statelock); 2268 /* 2269 * We need to flush any dirty pages which happen to 2270 * be hanging around before removing the file. This 2271 * shouldn't happen very often and mostly on file 2272 * systems mounted "nocto". 2273 */ 2274 if (vn_has_cached_data(vp) && 2275 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2276 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 2277 if (error && (error == ENOSPC || error == EDQUOT)) { 2278 mutex_enter(&rp->r_statelock); 2279 if (!rp->r_error) 2280 rp->r_error = error; 2281 mutex_exit(&rp->r_statelock); 2282 } 2283 } 2284 2285 setdiropargs(&da, nm, dvp); 2286 2287 douprintf = 1; 2288 2289 error = rfs2call(VTOMI(dvp), RFS_REMOVE, 2290 xdr_diropargs, (caddr_t)&da, 2291 xdr_enum, (caddr_t)&status, cr, 2292 &douprintf, &status, 0, NULL); 2293 2294 /* 2295 * The xattr dir may be gone after last attr is removed, 2296 * so flush it from dnlc. 2297 */ 2298 if (dvp->v_flag & V_XATTRDIR) 2299 dnlc_purge_vp(dvp); 2300 2301 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2302 PURGE_ATTRCACHE(vp); /* link count changed */ 2303 2304 if (!error) { 2305 error = geterrno(status); 2306 if (!error) { 2307 if (HAVE_RDDIR_CACHE(drp)) 2308 nfs_purge_rddir_cache(dvp); 2309 } else { 2310 PURGE_STALE_FH(error, dvp, cr); 2311 } 2312 } 2313 } 2314 2315 if (error == 0) { 2316 vnevent_remove(vp, dvp, nm, ct); 2317 } 2318 VN_RELE(vp); 2319 2320 nfs_rw_exit(&drp->r_rwlock); 2321 2322 return (error); 2323 } 2324 2325 /* ARGSUSED */ 2326 static int 2327 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 2328 caller_context_t *ct, int flags) 2329 { 2330 int error; 2331 struct nfslinkargs args; 2332 enum nfsstat status; 2333 vnode_t *realvp; 2334 int douprintf; 2335 rnode_t *tdrp; 2336 2337 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2338 return (EPERM); 2339 if (VOP_REALVP(svp, &realvp, ct) == 0) 2340 svp = realvp; 2341 2342 args.la_from = VTOFH(svp); 2343 setdiropargs(&args.la_to, tnm, tdvp); 2344 2345 tdrp = VTOR(tdvp); 2346 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2347 return (EINTR); 2348 2349 dnlc_remove(tdvp, tnm); 2350 2351 douprintf = 1; 2352 2353 error = rfs2call(VTOMI(svp), RFS_LINK, 2354 xdr_linkargs, (caddr_t)&args, 2355 xdr_enum, (caddr_t)&status, cr, 2356 &douprintf, &status, 0, NULL); 2357 2358 PURGE_ATTRCACHE(tdvp); /* mod time changed */ 2359 PURGE_ATTRCACHE(svp); /* link count changed */ 2360 2361 if (!error) { 2362 error = geterrno(status); 2363 if (!error) { 2364 if (HAVE_RDDIR_CACHE(tdrp)) 2365 nfs_purge_rddir_cache(tdvp); 2366 } 2367 } 2368 2369 nfs_rw_exit(&tdrp->r_rwlock); 2370 2371 if (!error) { 2372 /* 2373 * Notify the source file of this link operation. 2374 */ 2375 vnevent_link(svp, ct); 2376 } 2377 return (error); 2378 } 2379 2380 /* ARGSUSED */ 2381 static int 2382 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 2383 caller_context_t *ct, int flags) 2384 { 2385 vnode_t *realvp; 2386 2387 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2388 return (EPERM); 2389 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 2390 ndvp = realvp; 2391 2392 return (nfsrename(odvp, onm, ndvp, nnm, cr, ct)); 2393 } 2394 2395 /* 2396 * nfsrename does the real work of renaming in NFS Version 2. 2397 */ 2398 static int 2399 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 2400 caller_context_t *ct) 2401 { 2402 int error; 2403 enum nfsstat status; 2404 struct nfsrnmargs args; 2405 int douprintf; 2406 vnode_t *nvp = NULL; 2407 vnode_t *ovp = NULL; 2408 char *tmpname; 2409 rnode_t *rp; 2410 rnode_t *odrp; 2411 rnode_t *ndrp; 2412 2413 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2414 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2415 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2416 return (EINVAL); 2417 2418 odrp = VTOR(odvp); 2419 ndrp = VTOR(ndvp); 2420 if ((intptr_t)odrp < (intptr_t)ndrp) { 2421 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2422 return (EINTR); 2423 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2424 nfs_rw_exit(&odrp->r_rwlock); 2425 return (EINTR); 2426 } 2427 } else { 2428 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2429 return (EINTR); 2430 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2431 nfs_rw_exit(&ndrp->r_rwlock); 2432 return (EINTR); 2433 } 2434 } 2435 2436 /* 2437 * Lookup the target file. If it exists, it needs to be 2438 * checked to see whether it is a mount point and whether 2439 * it is active (open). 2440 */ 2441 error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 2442 if (!error) { 2443 /* 2444 * If this file has been mounted on, then just 2445 * return busy because renaming to it would remove 2446 * the mounted file system from the name space. 2447 */ 2448 if (vn_mountedvfs(nvp) != NULL) { 2449 VN_RELE(nvp); 2450 nfs_rw_exit(&odrp->r_rwlock); 2451 nfs_rw_exit(&ndrp->r_rwlock); 2452 return (EBUSY); 2453 } 2454 2455 /* 2456 * Purge the name cache of all references to this vnode 2457 * so that we can check the reference count to infer 2458 * whether it is active or not. 2459 */ 2460 /* 2461 * First just remove the entry from the name cache, as it 2462 * is most likely the only entry for this vp. 2463 */ 2464 dnlc_remove(ndvp, nnm); 2465 /* 2466 * If the file has a v_count > 1 then there may be more 2467 * than one entry in the name cache due multiple links 2468 * or an open file, but we don't have the real reference 2469 * count so flush all possible entries. 2470 */ 2471 if (nvp->v_count > 1) 2472 dnlc_purge_vp(nvp); 2473 2474 /* 2475 * If the vnode is active and is not a directory, 2476 * arrange to rename it to a 2477 * temporary file so that it will continue to be 2478 * accessible. This implements the "unlink-open-file" 2479 * semantics for the target of a rename operation. 2480 * Before doing this though, make sure that the 2481 * source and target files are not already the same. 2482 */ 2483 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 2484 /* 2485 * Lookup the source name. 2486 */ 2487 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, 2488 cr, 0); 2489 2490 /* 2491 * The source name *should* already exist. 2492 */ 2493 if (error) { 2494 VN_RELE(nvp); 2495 nfs_rw_exit(&odrp->r_rwlock); 2496 nfs_rw_exit(&ndrp->r_rwlock); 2497 return (error); 2498 } 2499 2500 /* 2501 * Compare the two vnodes. If they are the same, 2502 * just release all held vnodes and return success. 2503 */ 2504 if (ovp == nvp) { 2505 VN_RELE(ovp); 2506 VN_RELE(nvp); 2507 nfs_rw_exit(&odrp->r_rwlock); 2508 nfs_rw_exit(&ndrp->r_rwlock); 2509 return (0); 2510 } 2511 2512 /* 2513 * Can't mix and match directories and non- 2514 * directories in rename operations. We already 2515 * know that the target is not a directory. If 2516 * the source is a directory, return an error. 2517 */ 2518 if (ovp->v_type == VDIR) { 2519 VN_RELE(ovp); 2520 VN_RELE(nvp); 2521 nfs_rw_exit(&odrp->r_rwlock); 2522 nfs_rw_exit(&ndrp->r_rwlock); 2523 return (ENOTDIR); 2524 } 2525 2526 /* 2527 * The target file exists, is not the same as 2528 * the source file, and is active. Link it 2529 * to a temporary filename to avoid having 2530 * the server removing the file completely. 2531 */ 2532 tmpname = newname(); 2533 error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0); 2534 if (error == EOPNOTSUPP) { 2535 error = nfs_rename(ndvp, nnm, ndvp, tmpname, 2536 cr, NULL, 0); 2537 } 2538 if (error) { 2539 kmem_free(tmpname, MAXNAMELEN); 2540 VN_RELE(ovp); 2541 VN_RELE(nvp); 2542 nfs_rw_exit(&odrp->r_rwlock); 2543 nfs_rw_exit(&ndrp->r_rwlock); 2544 return (error); 2545 } 2546 rp = VTOR(nvp); 2547 mutex_enter(&rp->r_statelock); 2548 if (rp->r_unldvp == NULL) { 2549 VN_HOLD(ndvp); 2550 rp->r_unldvp = ndvp; 2551 if (rp->r_unlcred != NULL) 2552 crfree(rp->r_unlcred); 2553 crhold(cr); 2554 rp->r_unlcred = cr; 2555 rp->r_unlname = tmpname; 2556 } else { 2557 kmem_free(rp->r_unlname, MAXNAMELEN); 2558 rp->r_unlname = tmpname; 2559 } 2560 mutex_exit(&rp->r_statelock); 2561 } 2562 } 2563 2564 if (ovp == NULL) { 2565 /* 2566 * When renaming directories to be a subdirectory of a 2567 * different parent, the dnlc entry for ".." will no 2568 * longer be valid, so it must be removed. 2569 * 2570 * We do a lookup here to determine whether we are renaming 2571 * a directory and we need to check if we are renaming 2572 * an unlinked file. This might have already been done 2573 * in previous code, so we check ovp == NULL to avoid 2574 * doing it twice. 2575 */ 2576 2577 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 2578 2579 /* 2580 * The source name *should* already exist. 2581 */ 2582 if (error) { 2583 nfs_rw_exit(&odrp->r_rwlock); 2584 nfs_rw_exit(&ndrp->r_rwlock); 2585 if (nvp) { 2586 VN_RELE(nvp); 2587 } 2588 return (error); 2589 } 2590 ASSERT(ovp != NULL); 2591 } 2592 2593 dnlc_remove(odvp, onm); 2594 dnlc_remove(ndvp, nnm); 2595 2596 setdiropargs(&args.rna_from, onm, odvp); 2597 setdiropargs(&args.rna_to, nnm, ndvp); 2598 2599 douprintf = 1; 2600 2601 error = rfs2call(VTOMI(odvp), RFS_RENAME, 2602 xdr_rnmargs, (caddr_t)&args, 2603 xdr_enum, (caddr_t)&status, cr, 2604 &douprintf, &status, 0, NULL); 2605 2606 PURGE_ATTRCACHE(odvp); /* mod time changed */ 2607 PURGE_ATTRCACHE(ndvp); /* mod time changed */ 2608 2609 if (!error) { 2610 error = geterrno(status); 2611 if (!error) { 2612 if (HAVE_RDDIR_CACHE(odrp)) 2613 nfs_purge_rddir_cache(odvp); 2614 if (HAVE_RDDIR_CACHE(ndrp)) 2615 nfs_purge_rddir_cache(ndvp); 2616 /* 2617 * when renaming directories to be a subdirectory of a 2618 * different parent, the dnlc entry for ".." will no 2619 * longer be valid, so it must be removed 2620 */ 2621 rp = VTOR(ovp); 2622 if (ndvp != odvp) { 2623 if (ovp->v_type == VDIR) { 2624 dnlc_remove(ovp, ".."); 2625 if (HAVE_RDDIR_CACHE(rp)) 2626 nfs_purge_rddir_cache(ovp); 2627 } 2628 } 2629 2630 /* 2631 * If we are renaming the unlinked file, update the 2632 * r_unldvp and r_unlname as needed. 2633 */ 2634 mutex_enter(&rp->r_statelock); 2635 if (rp->r_unldvp != NULL) { 2636 if (strcmp(rp->r_unlname, onm) == 0) { 2637 (void) strncpy(rp->r_unlname, 2638 nnm, MAXNAMELEN); 2639 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 2640 2641 if (ndvp != rp->r_unldvp) { 2642 VN_RELE(rp->r_unldvp); 2643 rp->r_unldvp = ndvp; 2644 VN_HOLD(ndvp); 2645 } 2646 } 2647 } 2648 mutex_exit(&rp->r_statelock); 2649 } else { 2650 /* 2651 * System V defines rename to return EEXIST, not 2652 * ENOTEMPTY if the target directory is not empty. 2653 * Over the wire, the error is NFSERR_ENOTEMPTY 2654 * which geterrno maps to ENOTEMPTY. 2655 */ 2656 if (error == ENOTEMPTY) 2657 error = EEXIST; 2658 } 2659 } 2660 2661 if (error == 0) { 2662 if (nvp) 2663 vnevent_rename_dest(nvp, ndvp, nnm, ct); 2664 2665 if (odvp != ndvp) 2666 vnevent_rename_dest_dir(ndvp, ct); 2667 2668 ASSERT(ovp != NULL); 2669 vnevent_rename_src(ovp, odvp, onm, ct); 2670 } 2671 2672 if (nvp) { 2673 VN_RELE(nvp); 2674 } 2675 VN_RELE(ovp); 2676 2677 nfs_rw_exit(&odrp->r_rwlock); 2678 nfs_rw_exit(&ndrp->r_rwlock); 2679 2680 return (error); 2681 } 2682 2683 /* ARGSUSED */ 2684 static int 2685 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 2686 caller_context_t *ct, int flags, vsecattr_t *vsecp) 2687 { 2688 int error; 2689 struct nfscreatargs args; 2690 struct nfsdiropres dr; 2691 int douprintf; 2692 rnode_t *drp; 2693 hrtime_t t; 2694 2695 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2696 return (EPERM); 2697 2698 setdiropargs(&args.ca_da, nm, dvp); 2699 2700 /* 2701 * Decide what the group-id and set-gid bit of the created directory 2702 * should be. May have to do a setattr to get the gid right. 2703 */ 2704 error = setdirgid(dvp, &va->va_gid, cr); 2705 if (error) 2706 return (error); 2707 error = setdirmode(dvp, &va->va_mode, cr); 2708 if (error) 2709 return (error); 2710 va->va_mask |= AT_MODE|AT_GID; 2711 2712 args.ca_sa = &args.ca_sa_buf; 2713 error = vattr_to_sattr(va, args.ca_sa); 2714 if (error) { 2715 /* req time field(s) overflow - return immediately */ 2716 return (error); 2717 } 2718 2719 drp = VTOR(dvp); 2720 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2721 return (EINTR); 2722 2723 dnlc_remove(dvp, nm); 2724 2725 douprintf = 1; 2726 2727 t = gethrtime(); 2728 2729 error = rfs2call(VTOMI(dvp), RFS_MKDIR, 2730 xdr_creatargs, (caddr_t)&args, 2731 xdr_diropres, (caddr_t)&dr, cr, 2732 &douprintf, &dr.dr_status, 0, NULL); 2733 2734 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2735 2736 if (!error) { 2737 error = geterrno(dr.dr_status); 2738 if (!error) { 2739 if (HAVE_RDDIR_CACHE(drp)) 2740 nfs_purge_rddir_cache(dvp); 2741 /* 2742 * The attributes returned by RFS_MKDIR can not 2743 * be depended upon, so mark the attribute cache 2744 * as purged. A subsequent GETATTR will get the 2745 * correct attributes from the server. 2746 */ 2747 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2748 dvp->v_vfsp, t, cr, NULL, NULL); 2749 PURGE_ATTRCACHE(*vpp); 2750 dnlc_update(dvp, nm, *vpp); 2751 2752 /* 2753 * Make sure the gid was set correctly. 2754 * If not, try to set it (but don't lose 2755 * any sleep over it). 2756 */ 2757 if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) { 2758 va->va_mask = AT_GID; 2759 (void) nfssetattr(*vpp, va, 0, cr); 2760 } 2761 } else { 2762 PURGE_STALE_FH(error, dvp, cr); 2763 } 2764 } 2765 2766 nfs_rw_exit(&drp->r_rwlock); 2767 2768 return (error); 2769 } 2770 2771 /* ARGSUSED */ 2772 static int 2773 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 2774 caller_context_t *ct, int flags) 2775 { 2776 int error; 2777 enum nfsstat status; 2778 struct nfsdiropargs da; 2779 vnode_t *vp; 2780 int douprintf; 2781 rnode_t *drp; 2782 2783 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2784 return (EPERM); 2785 drp = VTOR(dvp); 2786 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2787 return (EINTR); 2788 2789 /* 2790 * Attempt to prevent a rmdir(".") from succeeding. 2791 */ 2792 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2793 if (error) { 2794 nfs_rw_exit(&drp->r_rwlock); 2795 return (error); 2796 } 2797 2798 if (vp == cdir) { 2799 VN_RELE(vp); 2800 nfs_rw_exit(&drp->r_rwlock); 2801 return (EINVAL); 2802 } 2803 2804 setdiropargs(&da, nm, dvp); 2805 2806 /* 2807 * First just remove the entry from the name cache, as it 2808 * is most likely an entry for this vp. 2809 */ 2810 dnlc_remove(dvp, nm); 2811 2812 /* 2813 * If there vnode reference count is greater than one, then 2814 * there may be additional references in the DNLC which will 2815 * need to be purged. First, trying removing the entry for 2816 * the parent directory and see if that removes the additional 2817 * reference(s). If that doesn't do it, then use dnlc_purge_vp 2818 * to completely remove any references to the directory which 2819 * might still exist in the DNLC. 2820 */ 2821 if (vp->v_count > 1) { 2822 dnlc_remove(vp, ".."); 2823 if (vp->v_count > 1) 2824 dnlc_purge_vp(vp); 2825 } 2826 2827 douprintf = 1; 2828 2829 error = rfs2call(VTOMI(dvp), RFS_RMDIR, 2830 xdr_diropargs, (caddr_t)&da, 2831 xdr_enum, (caddr_t)&status, cr, 2832 &douprintf, &status, 0, NULL); 2833 2834 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2835 2836 if (error) { 2837 VN_RELE(vp); 2838 nfs_rw_exit(&drp->r_rwlock); 2839 return (error); 2840 } 2841 2842 error = geterrno(status); 2843 if (!error) { 2844 if (HAVE_RDDIR_CACHE(drp)) 2845 nfs_purge_rddir_cache(dvp); 2846 if (HAVE_RDDIR_CACHE(VTOR(vp))) 2847 nfs_purge_rddir_cache(vp); 2848 } else { 2849 PURGE_STALE_FH(error, dvp, cr); 2850 /* 2851 * System V defines rmdir to return EEXIST, not 2852 * ENOTEMPTY if the directory is not empty. Over 2853 * the wire, the error is NFSERR_ENOTEMPTY which 2854 * geterrno maps to ENOTEMPTY. 2855 */ 2856 if (error == ENOTEMPTY) 2857 error = EEXIST; 2858 } 2859 2860 if (error == 0) { 2861 vnevent_rmdir(vp, dvp, nm, ct); 2862 } 2863 VN_RELE(vp); 2864 2865 nfs_rw_exit(&drp->r_rwlock); 2866 2867 return (error); 2868 } 2869 2870 /* ARGSUSED */ 2871 static int 2872 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 2873 caller_context_t *ct, int flags) 2874 { 2875 int error; 2876 struct nfsslargs args; 2877 enum nfsstat status; 2878 int douprintf; 2879 rnode_t *drp; 2880 2881 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2882 return (EPERM); 2883 setdiropargs(&args.sla_from, lnm, dvp); 2884 args.sla_sa = &args.sla_sa_buf; 2885 error = vattr_to_sattr(tva, args.sla_sa); 2886 if (error) { 2887 /* req time field(s) overflow - return immediately */ 2888 return (error); 2889 } 2890 args.sla_tnm = tnm; 2891 2892 drp = VTOR(dvp); 2893 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2894 return (EINTR); 2895 2896 dnlc_remove(dvp, lnm); 2897 2898 douprintf = 1; 2899 2900 error = rfs2call(VTOMI(dvp), RFS_SYMLINK, 2901 xdr_slargs, (caddr_t)&args, 2902 xdr_enum, (caddr_t)&status, cr, 2903 &douprintf, &status, 0, NULL); 2904 2905 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2906 2907 if (!error) { 2908 error = geterrno(status); 2909 if (!error) { 2910 if (HAVE_RDDIR_CACHE(drp)) 2911 nfs_purge_rddir_cache(dvp); 2912 } else { 2913 PURGE_STALE_FH(error, dvp, cr); 2914 } 2915 } 2916 2917 nfs_rw_exit(&drp->r_rwlock); 2918 2919 return (error); 2920 } 2921 2922 #ifdef DEBUG 2923 static int nfs_readdir_cache_hits = 0; 2924 static int nfs_readdir_cache_shorts = 0; 2925 static int nfs_readdir_cache_waits = 0; 2926 static int nfs_readdir_cache_misses = 0; 2927 static int nfs_readdir_readahead = 0; 2928 #endif 2929 2930 static int nfs_shrinkreaddir = 0; 2931 2932 /* 2933 * Read directory entries. 2934 * There are some weird things to look out for here. The uio_offset 2935 * field is either 0 or it is the offset returned from a previous 2936 * readdir. It is an opaque value used by the server to find the 2937 * correct directory block to read. The count field is the number 2938 * of blocks to read on the server. This is advisory only, the server 2939 * may return only one block's worth of entries. Entries may be compressed 2940 * on the server. 2941 */ 2942 /* ARGSUSED */ 2943 static int 2944 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 2945 caller_context_t *ct, int flags) 2946 { 2947 int error; 2948 size_t count; 2949 rnode_t *rp; 2950 rddir_cache *rdc; 2951 rddir_cache *nrdc; 2952 rddir_cache *rrdc; 2953 #ifdef DEBUG 2954 int missed; 2955 #endif 2956 rddir_cache srdc; 2957 avl_index_t where; 2958 2959 rp = VTOR(vp); 2960 2961 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2962 if (nfs_zone() != VTOMI(vp)->mi_zone) 2963 return (EIO); 2964 /* 2965 * Make sure that the directory cache is valid. 2966 */ 2967 if (HAVE_RDDIR_CACHE(rp)) { 2968 if (nfs_disable_rddir_cache) { 2969 /* 2970 * Setting nfs_disable_rddir_cache in /etc/system 2971 * allows interoperability with servers that do not 2972 * properly update the attributes of directories. 2973 * Any cached information gets purged before an 2974 * access is made to it. 2975 */ 2976 nfs_purge_rddir_cache(vp); 2977 } else { 2978 error = nfs_validate_caches(vp, cr); 2979 if (error) 2980 return (error); 2981 } 2982 } 2983 2984 /* 2985 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an 2986 * RFS_READDIR request with rda_count set to more than 0x400. So 2987 * we reduce the request size here purely for compatibility. 2988 * 2989 * In general, this is no longer required. However, if a server 2990 * is discovered which can not handle requests larger than 1024, 2991 * nfs_shrinkreaddir can be set to 1 to enable this backwards 2992 * compatibility. 2993 * 2994 * In any case, the request size is limited to NFS_MAXDATA bytes. 2995 */ 2996 count = MIN(uiop->uio_iov->iov_len, 2997 nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA); 2998 2999 nrdc = NULL; 3000 #ifdef DEBUG 3001 missed = 0; 3002 #endif 3003 top: 3004 /* 3005 * Short circuit last readdir which always returns 0 bytes. 3006 * This can be done after the directory has been read through 3007 * completely at least once. This will set r_direof which 3008 * can be used to find the value of the last cookie. 3009 */ 3010 mutex_enter(&rp->r_statelock); 3011 if (rp->r_direof != NULL && 3012 uiop->uio_offset == rp->r_direof->nfs_ncookie) { 3013 mutex_exit(&rp->r_statelock); 3014 #ifdef DEBUG 3015 nfs_readdir_cache_shorts++; 3016 #endif 3017 if (eofp) 3018 *eofp = 1; 3019 if (nrdc != NULL) 3020 rddir_cache_rele(nrdc); 3021 return (0); 3022 } 3023 /* 3024 * Look for a cache entry. Cache entries are identified 3025 * by the NFS cookie value and the byte count requested. 3026 */ 3027 srdc.nfs_cookie = uiop->uio_offset; 3028 srdc.buflen = count; 3029 rdc = avl_find(&rp->r_dir, &srdc, &where); 3030 if (rdc != NULL) { 3031 rddir_cache_hold(rdc); 3032 /* 3033 * If the cache entry is in the process of being 3034 * filled in, wait until this completes. The 3035 * RDDIRWAIT bit is set to indicate that someone 3036 * is waiting and then the thread currently 3037 * filling the entry is done, it should do a 3038 * cv_broadcast to wakeup all of the threads 3039 * waiting for it to finish. 3040 */ 3041 if (rdc->flags & RDDIR) { 3042 nfs_rw_exit(&rp->r_rwlock); 3043 rdc->flags |= RDDIRWAIT; 3044 #ifdef DEBUG 3045 nfs_readdir_cache_waits++; 3046 #endif 3047 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) { 3048 /* 3049 * We got interrupted, probably 3050 * the user typed ^C or an alarm 3051 * fired. We free the new entry 3052 * if we allocated one. 3053 */ 3054 mutex_exit(&rp->r_statelock); 3055 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3056 RW_READER, FALSE); 3057 rddir_cache_rele(rdc); 3058 if (nrdc != NULL) 3059 rddir_cache_rele(nrdc); 3060 return (EINTR); 3061 } 3062 mutex_exit(&rp->r_statelock); 3063 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3064 RW_READER, FALSE); 3065 rddir_cache_rele(rdc); 3066 goto top; 3067 } 3068 /* 3069 * Check to see if a readdir is required to 3070 * fill the entry. If so, mark this entry 3071 * as being filled, remove our reference, 3072 * and branch to the code to fill the entry. 3073 */ 3074 if (rdc->flags & RDDIRREQ) { 3075 rdc->flags &= ~RDDIRREQ; 3076 rdc->flags |= RDDIR; 3077 if (nrdc != NULL) 3078 rddir_cache_rele(nrdc); 3079 nrdc = rdc; 3080 mutex_exit(&rp->r_statelock); 3081 goto bottom; 3082 } 3083 #ifdef DEBUG 3084 if (!missed) 3085 nfs_readdir_cache_hits++; 3086 #endif 3087 /* 3088 * If an error occurred while attempting 3089 * to fill the cache entry, just return it. 3090 */ 3091 if (rdc->error) { 3092 error = rdc->error; 3093 mutex_exit(&rp->r_statelock); 3094 rddir_cache_rele(rdc); 3095 if (nrdc != NULL) 3096 rddir_cache_rele(nrdc); 3097 return (error); 3098 } 3099 3100 /* 3101 * The cache entry is complete and good, 3102 * copyout the dirent structs to the calling 3103 * thread. 3104 */ 3105 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 3106 3107 /* 3108 * If no error occurred during the copyout, 3109 * update the offset in the uio struct to 3110 * contain the value of the next cookie 3111 * and set the eof value appropriately. 3112 */ 3113 if (!error) { 3114 uiop->uio_offset = rdc->nfs_ncookie; 3115 if (eofp) 3116 *eofp = rdc->eof; 3117 } 3118 3119 /* 3120 * Decide whether to do readahead. Don't if 3121 * have already read to the end of directory. 3122 */ 3123 if (rdc->eof) { 3124 rp->r_direof = rdc; 3125 mutex_exit(&rp->r_statelock); 3126 rddir_cache_rele(rdc); 3127 if (nrdc != NULL) 3128 rddir_cache_rele(nrdc); 3129 return (error); 3130 } 3131 3132 /* 3133 * Check to see whether we found an entry 3134 * for the readahead. If so, we don't need 3135 * to do anything further, so free the new 3136 * entry if one was allocated. Otherwise, 3137 * allocate a new entry, add it to the cache, 3138 * and then initiate an asynchronous readdir 3139 * operation to fill it. 3140 */ 3141 srdc.nfs_cookie = rdc->nfs_ncookie; 3142 srdc.buflen = count; 3143 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3144 if (rrdc != NULL) { 3145 if (nrdc != NULL) 3146 rddir_cache_rele(nrdc); 3147 } else { 3148 if (nrdc != NULL) 3149 rrdc = nrdc; 3150 else { 3151 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3152 } 3153 if (rrdc != NULL) { 3154 rrdc->nfs_cookie = rdc->nfs_ncookie; 3155 rrdc->buflen = count; 3156 avl_insert(&rp->r_dir, rrdc, where); 3157 rddir_cache_hold(rrdc); 3158 mutex_exit(&rp->r_statelock); 3159 rddir_cache_rele(rdc); 3160 #ifdef DEBUG 3161 nfs_readdir_readahead++; 3162 #endif 3163 nfs_async_readdir(vp, rrdc, cr, nfsreaddir); 3164 return (error); 3165 } 3166 } 3167 3168 mutex_exit(&rp->r_statelock); 3169 rddir_cache_rele(rdc); 3170 return (error); 3171 } 3172 3173 /* 3174 * Didn't find an entry in the cache. Construct a new empty 3175 * entry and link it into the cache. Other processes attempting 3176 * to access this entry will need to wait until it is filled in. 3177 * 3178 * Since kmem_alloc may block, another pass through the cache 3179 * will need to be taken to make sure that another process 3180 * hasn't already added an entry to the cache for this request. 3181 */ 3182 if (nrdc == NULL) { 3183 mutex_exit(&rp->r_statelock); 3184 nrdc = rddir_cache_alloc(KM_SLEEP); 3185 nrdc->nfs_cookie = uiop->uio_offset; 3186 nrdc->buflen = count; 3187 goto top; 3188 } 3189 3190 /* 3191 * Add this entry to the cache. 3192 */ 3193 avl_insert(&rp->r_dir, nrdc, where); 3194 rddir_cache_hold(nrdc); 3195 mutex_exit(&rp->r_statelock); 3196 3197 bottom: 3198 #ifdef DEBUG 3199 missed = 1; 3200 nfs_readdir_cache_misses++; 3201 #endif 3202 /* 3203 * Do the readdir. 3204 */ 3205 error = nfsreaddir(vp, nrdc, cr); 3206 3207 /* 3208 * If this operation failed, just return the error which occurred. 3209 */ 3210 if (error != 0) 3211 return (error); 3212 3213 /* 3214 * Since the RPC operation will have taken sometime and blocked 3215 * this process, another pass through the cache will need to be 3216 * taken to find the correct cache entry. It is possible that 3217 * the correct cache entry will not be there (although one was 3218 * added) because the directory changed during the RPC operation 3219 * and the readdir cache was flushed. In this case, just start 3220 * over. It is hoped that this will not happen too often... :-) 3221 */ 3222 nrdc = NULL; 3223 goto top; 3224 /* NOTREACHED */ 3225 } 3226 3227 static int 3228 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3229 { 3230 int error; 3231 struct nfsrddirargs rda; 3232 struct nfsrddirres rd; 3233 rnode_t *rp; 3234 mntinfo_t *mi; 3235 uint_t count; 3236 int douprintf; 3237 failinfo_t fi, *fip; 3238 3239 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3240 count = rdc->buflen; 3241 3242 rp = VTOR(vp); 3243 mi = VTOMI(vp); 3244 3245 rda.rda_fh = *VTOFH(vp); 3246 rda.rda_offset = rdc->nfs_cookie; 3247 3248 /* 3249 * NFS client failover support 3250 * suppress failover unless we have a zero cookie 3251 */ 3252 if (rdc->nfs_cookie == (off_t)0) { 3253 fi.vp = vp; 3254 fi.fhp = (caddr_t)&rda.rda_fh; 3255 fi.copyproc = nfscopyfh; 3256 fi.lookupproc = nfslookup; 3257 fi.xattrdirproc = acl_getxattrdir2; 3258 fip = &fi; 3259 } else { 3260 fip = NULL; 3261 } 3262 3263 rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3264 rd.rd_size = count; 3265 rd.rd_offset = rda.rda_offset; 3266 3267 douprintf = 1; 3268 3269 if (mi->mi_io_kstats) { 3270 mutex_enter(&mi->mi_lock); 3271 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3272 mutex_exit(&mi->mi_lock); 3273 } 3274 3275 do { 3276 rda.rda_count = MIN(count, mi->mi_curread); 3277 error = rfs2call(mi, RFS_READDIR, 3278 xdr_rddirargs, (caddr_t)&rda, 3279 xdr_getrddirres, (caddr_t)&rd, cr, 3280 &douprintf, &rd.rd_status, 0, fip); 3281 } while (error == ENFS_TRYAGAIN); 3282 3283 if (mi->mi_io_kstats) { 3284 mutex_enter(&mi->mi_lock); 3285 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3286 mutex_exit(&mi->mi_lock); 3287 } 3288 3289 /* 3290 * Since we are actually doing a READDIR RPC, we must have 3291 * exclusive access to the cache entry being filled. Thus, 3292 * it is safe to update all fields except for the flags 3293 * field. The r_statelock in the rnode must be held to 3294 * prevent two different threads from simultaneously 3295 * attempting to update the flags field. This can happen 3296 * if we are turning off RDDIR and the other thread is 3297 * trying to set RDDIRWAIT. 3298 */ 3299 ASSERT(rdc->flags & RDDIR); 3300 if (!error) { 3301 error = geterrno(rd.rd_status); 3302 if (!error) { 3303 rdc->nfs_ncookie = rd.rd_offset; 3304 rdc->eof = rd.rd_eof ? 1 : 0; 3305 rdc->entlen = rd.rd_size; 3306 ASSERT(rdc->entlen <= rdc->buflen); 3307 #ifdef DEBUG 3308 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, 3309 KM_SLEEP); 3310 #else 3311 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3312 #endif 3313 bcopy(rd.rd_entries, rdc->entries, rdc->entlen); 3314 rdc->error = 0; 3315 if (mi->mi_io_kstats) { 3316 mutex_enter(&mi->mi_lock); 3317 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3318 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 3319 rd.rd_size; 3320 mutex_exit(&mi->mi_lock); 3321 } 3322 } else { 3323 PURGE_STALE_FH(error, vp, cr); 3324 } 3325 } 3326 if (error) { 3327 rdc->entries = NULL; 3328 rdc->error = error; 3329 } 3330 kmem_free(rd.rd_entries, rdc->buflen); 3331 3332 mutex_enter(&rp->r_statelock); 3333 rdc->flags &= ~RDDIR; 3334 if (rdc->flags & RDDIRWAIT) { 3335 rdc->flags &= ~RDDIRWAIT; 3336 cv_broadcast(&rdc->cv); 3337 } 3338 if (error) 3339 rdc->flags |= RDDIRREQ; 3340 mutex_exit(&rp->r_statelock); 3341 3342 rddir_cache_rele(rdc); 3343 3344 return (error); 3345 } 3346 3347 #ifdef DEBUG 3348 static int nfs_bio_do_stop = 0; 3349 #endif 3350 3351 static int 3352 nfs_bio(struct buf *bp, cred_t *cr) 3353 { 3354 rnode_t *rp = VTOR(bp->b_vp); 3355 int count; 3356 int error; 3357 cred_t *cred; 3358 uint_t offset; 3359 3360 DTRACE_IO1(start, struct buf *, bp); 3361 3362 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 3363 offset = dbtob(bp->b_blkno); 3364 3365 if (bp->b_flags & B_READ) { 3366 mutex_enter(&rp->r_statelock); 3367 if (rp->r_cred != NULL) { 3368 cred = rp->r_cred; 3369 crhold(cred); 3370 } else { 3371 rp->r_cred = cr; 3372 crhold(cr); 3373 cred = cr; 3374 crhold(cred); 3375 } 3376 mutex_exit(&rp->r_statelock); 3377 read_again: 3378 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr, 3379 offset, bp->b_bcount, &bp->b_resid, cred); 3380 3381 crfree(cred); 3382 if (!error) { 3383 if (bp->b_resid) { 3384 /* 3385 * Didn't get it all because we hit EOF, 3386 * zero all the memory beyond the EOF. 3387 */ 3388 /* bzero(rdaddr + */ 3389 bzero(bp->b_un.b_addr + 3390 bp->b_bcount - bp->b_resid, bp->b_resid); 3391 } 3392 mutex_enter(&rp->r_statelock); 3393 if (bp->b_resid == bp->b_bcount && 3394 offset >= rp->r_size) { 3395 /* 3396 * We didn't read anything at all as we are 3397 * past EOF. Return an error indicator back 3398 * but don't destroy the pages (yet). 3399 */ 3400 error = NFS_EOF; 3401 } 3402 mutex_exit(&rp->r_statelock); 3403 } else if (error == EACCES) { 3404 mutex_enter(&rp->r_statelock); 3405 if (cred != cr) { 3406 if (rp->r_cred != NULL) 3407 crfree(rp->r_cred); 3408 rp->r_cred = cr; 3409 crhold(cr); 3410 cred = cr; 3411 crhold(cred); 3412 mutex_exit(&rp->r_statelock); 3413 goto read_again; 3414 } 3415 mutex_exit(&rp->r_statelock); 3416 } 3417 } else { 3418 if (!(rp->r_flags & RSTALE)) { 3419 mutex_enter(&rp->r_statelock); 3420 if (rp->r_cred != NULL) { 3421 cred = rp->r_cred; 3422 crhold(cred); 3423 } else { 3424 rp->r_cred = cr; 3425 crhold(cr); 3426 cred = cr; 3427 crhold(cred); 3428 } 3429 mutex_exit(&rp->r_statelock); 3430 write_again: 3431 mutex_enter(&rp->r_statelock); 3432 count = MIN(bp->b_bcount, rp->r_size - offset); 3433 mutex_exit(&rp->r_statelock); 3434 if (count < 0) 3435 cmn_err(CE_PANIC, "nfs_bio: write count < 0"); 3436 #ifdef DEBUG 3437 if (count == 0) { 3438 zcmn_err(getzoneid(), CE_WARN, 3439 "nfs_bio: zero length write at %d", 3440 offset); 3441 nfs_printfhandle(&rp->r_fh); 3442 if (nfs_bio_do_stop) 3443 debug_enter("nfs_bio"); 3444 } 3445 #endif 3446 error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset, 3447 count, cred); 3448 if (error == EACCES) { 3449 mutex_enter(&rp->r_statelock); 3450 if (cred != cr) { 3451 if (rp->r_cred != NULL) 3452 crfree(rp->r_cred); 3453 rp->r_cred = cr; 3454 crhold(cr); 3455 crfree(cred); 3456 cred = cr; 3457 crhold(cred); 3458 mutex_exit(&rp->r_statelock); 3459 goto write_again; 3460 } 3461 mutex_exit(&rp->r_statelock); 3462 } 3463 bp->b_error = error; 3464 if (error && error != EINTR) { 3465 /* 3466 * Don't print EDQUOT errors on the console. 3467 * Don't print asynchronous EACCES errors. 3468 * Don't print EFBIG errors. 3469 * Print all other write errors. 3470 */ 3471 if (error != EDQUOT && error != EFBIG && 3472 (error != EACCES || 3473 !(bp->b_flags & B_ASYNC))) 3474 nfs_write_error(bp->b_vp, error, cred); 3475 /* 3476 * Update r_error and r_flags as appropriate. 3477 * If the error was ESTALE, then mark the 3478 * rnode as not being writeable and save 3479 * the error status. Otherwise, save any 3480 * errors which occur from asynchronous 3481 * page invalidations. Any errors occurring 3482 * from other operations should be saved 3483 * by the caller. 3484 */ 3485 mutex_enter(&rp->r_statelock); 3486 if (error == ESTALE) { 3487 rp->r_flags |= RSTALE; 3488 if (!rp->r_error) 3489 rp->r_error = error; 3490 } else if (!rp->r_error && 3491 (bp->b_flags & 3492 (B_INVAL|B_FORCE|B_ASYNC)) == 3493 (B_INVAL|B_FORCE|B_ASYNC)) { 3494 rp->r_error = error; 3495 } 3496 mutex_exit(&rp->r_statelock); 3497 } 3498 crfree(cred); 3499 } else { 3500 error = rp->r_error; 3501 /* 3502 * A close may have cleared r_error, if so, 3503 * propagate ESTALE error return properly 3504 */ 3505 if (error == 0) 3506 error = ESTALE; 3507 } 3508 } 3509 3510 if (error != 0 && error != NFS_EOF) 3511 bp->b_flags |= B_ERROR; 3512 3513 DTRACE_IO1(done, struct buf *, bp); 3514 3515 return (error); 3516 } 3517 3518 /* ARGSUSED */ 3519 static int 3520 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 3521 { 3522 struct nfs_fid *fp; 3523 rnode_t *rp; 3524 3525 rp = VTOR(vp); 3526 3527 if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) { 3528 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short); 3529 return (ENOSPC); 3530 } 3531 fp = (struct nfs_fid *)fidp; 3532 fp->nf_pad = 0; 3533 fp->nf_len = sizeof (struct nfs_fid) - sizeof (short); 3534 bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE); 3535 return (0); 3536 } 3537 3538 /* ARGSUSED2 */ 3539 static int 3540 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3541 { 3542 rnode_t *rp = VTOR(vp); 3543 3544 if (!write_lock) { 3545 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3546 return (V_WRITELOCK_FALSE); 3547 } 3548 3549 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) { 3550 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3551 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp)) 3552 return (V_WRITELOCK_FALSE); 3553 nfs_rw_exit(&rp->r_rwlock); 3554 } 3555 3556 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 3557 return (V_WRITELOCK_TRUE); 3558 } 3559 3560 /* ARGSUSED */ 3561 static void 3562 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3563 { 3564 rnode_t *rp = VTOR(vp); 3565 3566 nfs_rw_exit(&rp->r_rwlock); 3567 } 3568 3569 /* ARGSUSED */ 3570 static int 3571 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 3572 { 3573 3574 /* 3575 * Because we stuff the readdir cookie into the offset field 3576 * someone may attempt to do an lseek with the cookie which 3577 * we want to succeed. 3578 */ 3579 if (vp->v_type == VDIR) 3580 return (0); 3581 if (*noffp < 0 || *noffp > MAXOFF32_T) 3582 return (EINVAL); 3583 return (0); 3584 } 3585 3586 /* 3587 * number of NFS_MAXDATA blocks to read ahead 3588 * optimized for 100 base-T. 3589 */ 3590 static int nfs_nra = 4; 3591 3592 #ifdef DEBUG 3593 static int nfs_lostpage = 0; /* number of times we lost original page */ 3594 #endif 3595 3596 /* 3597 * Return all the pages from [off..off+len) in file 3598 */ 3599 /* ARGSUSED */ 3600 static int 3601 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 3602 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3603 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 3604 { 3605 rnode_t *rp; 3606 int error; 3607 mntinfo_t *mi; 3608 3609 if (vp->v_flag & VNOMAP) 3610 return (ENOSYS); 3611 3612 ASSERT(off <= MAXOFF32_T); 3613 if (nfs_zone() != VTOMI(vp)->mi_zone) 3614 return (EIO); 3615 if (protp != NULL) 3616 *protp = PROT_ALL; 3617 3618 /* 3619 * Now valididate that the caches are up to date. 3620 */ 3621 error = nfs_validate_caches(vp, cr); 3622 if (error) 3623 return (error); 3624 3625 rp = VTOR(vp); 3626 mi = VTOMI(vp); 3627 retry: 3628 mutex_enter(&rp->r_statelock); 3629 3630 /* 3631 * Don't create dirty pages faster than they 3632 * can be cleaned so that the system doesn't 3633 * get imbalanced. If the async queue is 3634 * maxed out, then wait for it to drain before 3635 * creating more dirty pages. Also, wait for 3636 * any threads doing pagewalks in the vop_getattr 3637 * entry points so that they don't block for 3638 * long periods. 3639 */ 3640 if (rw == S_CREATE) { 3641 while ((mi->mi_max_threads != 0 && 3642 rp->r_awcount > 2 * mi->mi_max_threads) || 3643 rp->r_gcount > 0) 3644 cv_wait(&rp->r_cv, &rp->r_statelock); 3645 } 3646 3647 /* 3648 * If we are getting called as a side effect of an nfs_write() 3649 * operation the local file size might not be extended yet. 3650 * In this case we want to be able to return pages of zeroes. 3651 */ 3652 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 3653 mutex_exit(&rp->r_statelock); 3654 return (EFAULT); /* beyond EOF */ 3655 } 3656 3657 mutex_exit(&rp->r_statelock); 3658 3659 if (len <= PAGESIZE) { 3660 error = nfs_getapage(vp, off, len, protp, pl, plsz, 3661 seg, addr, rw, cr); 3662 } else { 3663 error = pvn_getpages(nfs_getapage, vp, off, len, protp, 3664 pl, plsz, seg, addr, rw, cr); 3665 } 3666 3667 switch (error) { 3668 case NFS_EOF: 3669 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 3670 goto retry; 3671 case ESTALE: 3672 PURGE_STALE_FH(error, vp, cr); 3673 } 3674 3675 return (error); 3676 } 3677 3678 /* 3679 * Called from pvn_getpages or nfs_getpage to get a particular page. 3680 */ 3681 /* ARGSUSED */ 3682 static int 3683 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 3684 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3685 enum seg_rw rw, cred_t *cr) 3686 { 3687 rnode_t *rp; 3688 uint_t bsize; 3689 struct buf *bp; 3690 page_t *pp; 3691 u_offset_t lbn; 3692 u_offset_t io_off; 3693 u_offset_t blkoff; 3694 u_offset_t rablkoff; 3695 size_t io_len; 3696 uint_t blksize; 3697 int error; 3698 int readahead; 3699 int readahead_issued = 0; 3700 int ra_window; /* readahead window */ 3701 page_t *pagefound; 3702 3703 if (nfs_zone() != VTOMI(vp)->mi_zone) 3704 return (EIO); 3705 rp = VTOR(vp); 3706 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3707 3708 reread: 3709 bp = NULL; 3710 pp = NULL; 3711 pagefound = NULL; 3712 3713 if (pl != NULL) 3714 pl[0] = NULL; 3715 3716 error = 0; 3717 lbn = off / bsize; 3718 blkoff = lbn * bsize; 3719 3720 /* 3721 * Queueing up the readahead before doing the synchronous read 3722 * results in a significant increase in read throughput because 3723 * of the increased parallelism between the async threads and 3724 * the process context. 3725 */ 3726 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 3727 rw != S_CREATE && 3728 !(vp->v_flag & VNOCACHE)) { 3729 mutex_enter(&rp->r_statelock); 3730 3731 /* 3732 * Calculate the number of readaheads to do. 3733 * a) No readaheads at offset = 0. 3734 * b) Do maximum(nfs_nra) readaheads when the readahead 3735 * window is closed. 3736 * c) Do readaheads between 1 to (nfs_nra - 1) depending 3737 * upon how far the readahead window is open or close. 3738 * d) No readaheads if rp->r_nextr is not within the scope 3739 * of the readahead window (random i/o). 3740 */ 3741 3742 if (off == 0) 3743 readahead = 0; 3744 else if (blkoff == rp->r_nextr) 3745 readahead = nfs_nra; 3746 else if (rp->r_nextr > blkoff && 3747 ((ra_window = (rp->r_nextr - blkoff) / bsize) 3748 <= (nfs_nra - 1))) 3749 readahead = nfs_nra - ra_window; 3750 else 3751 readahead = 0; 3752 3753 rablkoff = rp->r_nextr; 3754 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 3755 mutex_exit(&rp->r_statelock); 3756 if (nfs_async_readahead(vp, rablkoff + bsize, 3757 addr + (rablkoff + bsize - off), seg, cr, 3758 nfs_readahead) < 0) { 3759 mutex_enter(&rp->r_statelock); 3760 break; 3761 } 3762 readahead--; 3763 rablkoff += bsize; 3764 /* 3765 * Indicate that we did a readahead so 3766 * readahead offset is not updated 3767 * by the synchronous read below. 3768 */ 3769 readahead_issued = 1; 3770 mutex_enter(&rp->r_statelock); 3771 /* 3772 * set readahead offset to 3773 * offset of last async readahead 3774 * request. 3775 */ 3776 rp->r_nextr = rablkoff; 3777 } 3778 mutex_exit(&rp->r_statelock); 3779 } 3780 3781 again: 3782 if ((pagefound = page_exists(vp, off)) == NULL) { 3783 if (pl == NULL) { 3784 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 3785 nfs_readahead); 3786 } else if (rw == S_CREATE) { 3787 /* 3788 * Block for this page is not allocated, or the offset 3789 * is beyond the current allocation size, or we're 3790 * allocating a swap slot and the page was not found, 3791 * so allocate it and return a zero page. 3792 */ 3793 if ((pp = page_create_va(vp, off, 3794 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 3795 cmn_err(CE_PANIC, "nfs_getapage: page_create"); 3796 io_len = PAGESIZE; 3797 mutex_enter(&rp->r_statelock); 3798 rp->r_nextr = off + PAGESIZE; 3799 mutex_exit(&rp->r_statelock); 3800 } else { 3801 /* 3802 * Need to go to server to get a BLOCK, exception to 3803 * that being while reading at offset = 0 or doing 3804 * random i/o, in that case read only a PAGE. 3805 */ 3806 mutex_enter(&rp->r_statelock); 3807 if (blkoff < rp->r_size && 3808 blkoff + bsize >= rp->r_size) { 3809 /* 3810 * If only a block or less is left in 3811 * the file, read all that is remaining. 3812 */ 3813 if (rp->r_size <= off) { 3814 /* 3815 * Trying to access beyond EOF, 3816 * set up to get at least one page. 3817 */ 3818 blksize = off + PAGESIZE - blkoff; 3819 } else 3820 blksize = rp->r_size - blkoff; 3821 } else if ((off == 0) || 3822 (off != rp->r_nextr && !readahead_issued)) { 3823 blksize = PAGESIZE; 3824 blkoff = off; /* block = page here */ 3825 } else 3826 blksize = bsize; 3827 mutex_exit(&rp->r_statelock); 3828 3829 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3830 &io_len, blkoff, blksize, 0); 3831 3832 /* 3833 * Some other thread has entered the page, 3834 * so just use it. 3835 */ 3836 if (pp == NULL) 3837 goto again; 3838 3839 /* 3840 * Now round the request size up to page boundaries. 3841 * This ensures that the entire page will be 3842 * initialized to zeroes if EOF is encountered. 3843 */ 3844 io_len = ptob(btopr(io_len)); 3845 3846 bp = pageio_setup(pp, io_len, vp, B_READ); 3847 ASSERT(bp != NULL); 3848 3849 /* 3850 * pageio_setup should have set b_addr to 0. This 3851 * is correct since we want to do I/O on a page 3852 * boundary. bp_mapin will use this addr to calculate 3853 * an offset, and then set b_addr to the kernel virtual 3854 * address it allocated for us. 3855 */ 3856 ASSERT(bp->b_un.b_addr == 0); 3857 3858 bp->b_edev = 0; 3859 bp->b_dev = 0; 3860 bp->b_lblkno = lbtodb(io_off); 3861 bp->b_file = vp; 3862 bp->b_offset = (offset_t)off; 3863 bp_mapin(bp); 3864 3865 /* 3866 * If doing a write beyond what we believe is EOF, 3867 * don't bother trying to read the pages from the 3868 * server, we'll just zero the pages here. We 3869 * don't check that the rw flag is S_WRITE here 3870 * because some implementations may attempt a 3871 * read access to the buffer before copying data. 3872 */ 3873 mutex_enter(&rp->r_statelock); 3874 if (io_off >= rp->r_size && seg == segkmap) { 3875 mutex_exit(&rp->r_statelock); 3876 bzero(bp->b_un.b_addr, io_len); 3877 } else { 3878 mutex_exit(&rp->r_statelock); 3879 error = nfs_bio(bp, cr); 3880 } 3881 3882 /* 3883 * Unmap the buffer before freeing it. 3884 */ 3885 bp_mapout(bp); 3886 pageio_done(bp); 3887 3888 if (error == NFS_EOF) { 3889 /* 3890 * If doing a write system call just return 3891 * zeroed pages, else user tried to get pages 3892 * beyond EOF, return error. We don't check 3893 * that the rw flag is S_WRITE here because 3894 * some implementations may attempt a read 3895 * access to the buffer before copying data. 3896 */ 3897 if (seg == segkmap) 3898 error = 0; 3899 else 3900 error = EFAULT; 3901 } 3902 3903 if (!readahead_issued && !error) { 3904 mutex_enter(&rp->r_statelock); 3905 rp->r_nextr = io_off + io_len; 3906 mutex_exit(&rp->r_statelock); 3907 } 3908 } 3909 } 3910 3911 out: 3912 if (pl == NULL) 3913 return (error); 3914 3915 if (error) { 3916 if (pp != NULL) 3917 pvn_read_done(pp, B_ERROR); 3918 return (error); 3919 } 3920 3921 if (pagefound) { 3922 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 3923 3924 /* 3925 * Page exists in the cache, acquire the appropriate lock. 3926 * If this fails, start all over again. 3927 */ 3928 if ((pp = page_lookup(vp, off, se)) == NULL) { 3929 #ifdef DEBUG 3930 nfs_lostpage++; 3931 #endif 3932 goto reread; 3933 } 3934 pl[0] = pp; 3935 pl[1] = NULL; 3936 return (0); 3937 } 3938 3939 if (pp != NULL) 3940 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3941 3942 return (error); 3943 } 3944 3945 static void 3946 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 3947 cred_t *cr) 3948 { 3949 int error; 3950 page_t *pp; 3951 u_offset_t io_off; 3952 size_t io_len; 3953 struct buf *bp; 3954 uint_t bsize, blksize; 3955 rnode_t *rp = VTOR(vp); 3956 3957 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3958 3959 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3960 3961 mutex_enter(&rp->r_statelock); 3962 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 3963 /* 3964 * If less than a block left in file read less 3965 * than a block. 3966 */ 3967 blksize = rp->r_size - blkoff; 3968 } else 3969 blksize = bsize; 3970 mutex_exit(&rp->r_statelock); 3971 3972 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 3973 &io_off, &io_len, blkoff, blksize, 1); 3974 /* 3975 * The isra flag passed to the kluster function is 1, we may have 3976 * gotten a return value of NULL for a variety of reasons (# of free 3977 * pages < minfree, someone entered the page on the vnode etc). In all 3978 * cases, we want to punt on the readahead. 3979 */ 3980 if (pp == NULL) 3981 return; 3982 3983 /* 3984 * Now round the request size up to page boundaries. 3985 * This ensures that the entire page will be 3986 * initialized to zeroes if EOF is encountered. 3987 */ 3988 io_len = ptob(btopr(io_len)); 3989 3990 bp = pageio_setup(pp, io_len, vp, B_READ); 3991 ASSERT(bp != NULL); 3992 3993 /* 3994 * pageio_setup should have set b_addr to 0. This is correct since 3995 * we want to do I/O on a page boundary. bp_mapin() will use this addr 3996 * to calculate an offset, and then set b_addr to the kernel virtual 3997 * address it allocated for us. 3998 */ 3999 ASSERT(bp->b_un.b_addr == 0); 4000 4001 bp->b_edev = 0; 4002 bp->b_dev = 0; 4003 bp->b_lblkno = lbtodb(io_off); 4004 bp->b_file = vp; 4005 bp->b_offset = (offset_t)blkoff; 4006 bp_mapin(bp); 4007 4008 /* 4009 * If doing a write beyond what we believe is EOF, don't bother trying 4010 * to read the pages from the server, we'll just zero the pages here. 4011 * We don't check that the rw flag is S_WRITE here because some 4012 * implementations may attempt a read access to the buffer before 4013 * copying data. 4014 */ 4015 mutex_enter(&rp->r_statelock); 4016 if (io_off >= rp->r_size && seg == segkmap) { 4017 mutex_exit(&rp->r_statelock); 4018 bzero(bp->b_un.b_addr, io_len); 4019 error = 0; 4020 } else { 4021 mutex_exit(&rp->r_statelock); 4022 error = nfs_bio(bp, cr); 4023 if (error == NFS_EOF) 4024 error = 0; 4025 } 4026 4027 /* 4028 * Unmap the buffer before freeing it. 4029 */ 4030 bp_mapout(bp); 4031 pageio_done(bp); 4032 4033 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 4034 4035 /* 4036 * In case of error set readahead offset 4037 * to the lowest offset. 4038 * pvn_read_done() calls VN_DISPOSE to destroy the pages 4039 */ 4040 if (error && rp->r_nextr > io_off) { 4041 mutex_enter(&rp->r_statelock); 4042 if (rp->r_nextr > io_off) 4043 rp->r_nextr = io_off; 4044 mutex_exit(&rp->r_statelock); 4045 } 4046 } 4047 4048 /* 4049 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 4050 * If len == 0, do from off to EOF. 4051 * 4052 * The normal cases should be len == 0 && off == 0 (entire vp list), 4053 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4054 * (from pageout). 4055 */ 4056 /* ARGSUSED */ 4057 static int 4058 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4059 caller_context_t *ct) 4060 { 4061 int error; 4062 rnode_t *rp; 4063 4064 ASSERT(cr != NULL); 4065 4066 /* 4067 * XXX - Why should this check be made here? 4068 */ 4069 if (vp->v_flag & VNOMAP) 4070 return (ENOSYS); 4071 4072 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 4073 return (0); 4074 4075 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 4076 return (EIO); 4077 ASSERT(off <= MAXOFF32_T); 4078 4079 rp = VTOR(vp); 4080 mutex_enter(&rp->r_statelock); 4081 rp->r_count++; 4082 mutex_exit(&rp->r_statelock); 4083 error = nfs_putpages(vp, off, len, flags, cr); 4084 mutex_enter(&rp->r_statelock); 4085 rp->r_count--; 4086 cv_broadcast(&rp->r_cv); 4087 mutex_exit(&rp->r_statelock); 4088 4089 return (error); 4090 } 4091 4092 /* 4093 * Write out a single page, possibly klustering adjacent dirty pages. 4094 */ 4095 int 4096 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 4097 int flags, cred_t *cr) 4098 { 4099 u_offset_t io_off; 4100 u_offset_t lbn_off; 4101 u_offset_t lbn; 4102 size_t io_len; 4103 uint_t bsize; 4104 int error; 4105 rnode_t *rp; 4106 4107 ASSERT(!vn_is_readonly(vp)); 4108 ASSERT(pp != NULL); 4109 ASSERT(cr != NULL); 4110 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 4111 4112 rp = VTOR(vp); 4113 ASSERT(rp->r_count > 0); 4114 4115 ASSERT(pp->p_offset <= MAXOFF32_T); 4116 4117 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4118 lbn = pp->p_offset / bsize; 4119 lbn_off = lbn * bsize; 4120 4121 /* 4122 * Find a kluster that fits in one block, or in 4123 * one page if pages are bigger than blocks. If 4124 * there is less file space allocated than a whole 4125 * page, we'll shorten the i/o request below. 4126 */ 4127 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4128 roundup(bsize, PAGESIZE), flags); 4129 4130 /* 4131 * pvn_write_kluster shouldn't have returned a page with offset 4132 * behind the original page we were given. Verify that. 4133 */ 4134 ASSERT((pp->p_offset / bsize) >= lbn); 4135 4136 /* 4137 * Now pp will have the list of kept dirty pages marked for 4138 * write back. It will also handle invalidation and freeing 4139 * of pages that are not dirty. Check for page length rounding 4140 * problems. 4141 */ 4142 if (io_off + io_len > lbn_off + bsize) { 4143 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4144 io_len = lbn_off + bsize - io_off; 4145 } 4146 /* 4147 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4148 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4149 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4150 * progress and the r_size has not been made consistent with the 4151 * new size of the file. When the uiomove() completes the r_size is 4152 * updated and the RMODINPROGRESS flag is cleared. 4153 * 4154 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4155 * consistent value of r_size. Without this handshaking, it is 4156 * possible that nfs(3)_bio() picks up the old value of r_size 4157 * before the uiomove() in writerp() completes. This will result 4158 * in the write through nfs(3)_bio() being dropped. 4159 * 4160 * More precisely, there is a window between the time the uiomove() 4161 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4162 * operation intervenes in this window, the page will be picked up, 4163 * because it is dirty (it will be unlocked, unless it was 4164 * pagecreate'd). When the page is picked up as dirty, the dirty 4165 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4166 * checked. This will still be the old size. Therefore the page will 4167 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4168 * the page will be found to be clean and the write will be dropped. 4169 */ 4170 if (rp->r_flags & RMODINPROGRESS) { 4171 mutex_enter(&rp->r_statelock); 4172 if ((rp->r_flags & RMODINPROGRESS) && 4173 rp->r_modaddr + MAXBSIZE > io_off && 4174 rp->r_modaddr < io_off + io_len) { 4175 page_t *plist; 4176 /* 4177 * A write is in progress for this region of the file. 4178 * If we did not detect RMODINPROGRESS here then this 4179 * path through nfs_putapage() would eventually go to 4180 * nfs(3)_bio() and may not write out all of the data 4181 * in the pages. We end up losing data. So we decide 4182 * to set the modified bit on each page in the page 4183 * list and mark the rnode with RDIRTY. This write 4184 * will be restarted at some later time. 4185 */ 4186 plist = pp; 4187 while (plist != NULL) { 4188 pp = plist; 4189 page_sub(&plist, pp); 4190 hat_setmod(pp); 4191 page_io_unlock(pp); 4192 page_unlock(pp); 4193 } 4194 rp->r_flags |= RDIRTY; 4195 mutex_exit(&rp->r_statelock); 4196 if (offp) 4197 *offp = io_off; 4198 if (lenp) 4199 *lenp = io_len; 4200 return (0); 4201 } 4202 mutex_exit(&rp->r_statelock); 4203 } 4204 4205 if (flags & B_ASYNC) { 4206 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr, 4207 nfs_sync_putapage); 4208 } else 4209 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr); 4210 4211 if (offp) 4212 *offp = io_off; 4213 if (lenp) 4214 *lenp = io_len; 4215 return (error); 4216 } 4217 4218 static int 4219 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4220 int flags, cred_t *cr) 4221 { 4222 int error; 4223 rnode_t *rp; 4224 4225 flags |= B_WRITE; 4226 4227 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4228 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4229 4230 rp = VTOR(vp); 4231 4232 if ((error == ENOSPC || error == EDQUOT || error == EACCES) && 4233 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 4234 if (!(rp->r_flags & ROUTOFSPACE)) { 4235 mutex_enter(&rp->r_statelock); 4236 rp->r_flags |= ROUTOFSPACE; 4237 mutex_exit(&rp->r_statelock); 4238 } 4239 flags |= B_ERROR; 4240 pvn_write_done(pp, flags); 4241 /* 4242 * If this was not an async thread, then try again to 4243 * write out the pages, but this time, also destroy 4244 * them whether or not the write is successful. This 4245 * will prevent memory from filling up with these 4246 * pages and destroying them is the only alternative 4247 * if they can't be written out. 4248 * 4249 * Don't do this if this is an async thread because 4250 * when the pages are unlocked in pvn_write_done, 4251 * some other thread could have come along, locked 4252 * them, and queued for an async thread. It would be 4253 * possible for all of the async threads to be tied 4254 * up waiting to lock the pages again and they would 4255 * all already be locked and waiting for an async 4256 * thread to handle them. Deadlock. 4257 */ 4258 if (!(flags & B_ASYNC)) { 4259 error = nfs_putpage(vp, io_off, io_len, 4260 B_INVAL | B_FORCE, cr, NULL); 4261 } 4262 } else { 4263 if (error) 4264 flags |= B_ERROR; 4265 else if (rp->r_flags & ROUTOFSPACE) { 4266 mutex_enter(&rp->r_statelock); 4267 rp->r_flags &= ~ROUTOFSPACE; 4268 mutex_exit(&rp->r_statelock); 4269 } 4270 pvn_write_done(pp, flags); 4271 } 4272 4273 return (error); 4274 } 4275 4276 /* ARGSUSED */ 4277 static int 4278 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4279 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4280 caller_context_t *ct) 4281 { 4282 struct segvn_crargs vn_a; 4283 int error; 4284 rnode_t *rp; 4285 struct vattr va; 4286 4287 if (nfs_zone() != VTOMI(vp)->mi_zone) 4288 return (EIO); 4289 4290 if (vp->v_flag & VNOMAP) 4291 return (ENOSYS); 4292 4293 if (off > MAXOFF32_T) 4294 return (EFBIG); 4295 4296 if (off < 0 || off + len < 0) 4297 return (ENXIO); 4298 4299 if (vp->v_type != VREG) 4300 return (ENODEV); 4301 4302 /* 4303 * If there is cached data and if close-to-open consistency 4304 * checking is not turned off and if the file system is not 4305 * mounted readonly, then force an over the wire getattr. 4306 * Otherwise, just invoke nfsgetattr to get a copy of the 4307 * attributes. The attribute cache will be used unless it 4308 * is timed out and if it is, then an over the wire getattr 4309 * will be issued. 4310 */ 4311 va.va_mask = AT_ALL; 4312 if (vn_has_cached_data(vp) && 4313 !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp)) 4314 error = nfs_getattr_otw(vp, &va, cr); 4315 else 4316 error = nfsgetattr(vp, &va, cr); 4317 if (error) 4318 return (error); 4319 4320 /* 4321 * Check to see if the vnode is currently marked as not cachable. 4322 * This means portions of the file are locked (through VOP_FRLOCK). 4323 * In this case the map request must be refused. We use 4324 * rp->r_lkserlock to avoid a race with concurrent lock requests. 4325 */ 4326 rp = VTOR(vp); 4327 4328 /* 4329 * Atomically increment r_inmap after acquiring r_rwlock. The 4330 * idea here is to acquire r_rwlock to block read/write and 4331 * not to protect r_inmap. r_inmap will inform nfs_read/write() 4332 * that we are in nfs_map(). Now, r_rwlock is acquired in order 4333 * and we can prevent the deadlock that would have occurred 4334 * when nfs_addmap() would have acquired it out of order. 4335 * 4336 * Since we are not protecting r_inmap by any lock, we do not 4337 * hold any lock when we decrement it. We atomically decrement 4338 * r_inmap after we release r_lkserlock. 4339 */ 4340 4341 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 4342 return (EINTR); 4343 atomic_add_int(&rp->r_inmap, 1); 4344 nfs_rw_exit(&rp->r_rwlock); 4345 4346 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) { 4347 atomic_add_int(&rp->r_inmap, -1); 4348 return (EINTR); 4349 } 4350 if (vp->v_flag & VNOCACHE) { 4351 error = EAGAIN; 4352 goto done; 4353 } 4354 4355 /* 4356 * Don't allow concurrent locks and mapping if mandatory locking is 4357 * enabled. 4358 */ 4359 if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) && 4360 MANDLOCK(vp, va.va_mode)) { 4361 error = EAGAIN; 4362 goto done; 4363 } 4364 4365 as_rangelock(as); 4366 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4367 if (error != 0) { 4368 as_rangeunlock(as); 4369 goto done; 4370 } 4371 4372 vn_a.vp = vp; 4373 vn_a.offset = off; 4374 vn_a.type = (flags & MAP_TYPE); 4375 vn_a.prot = (uchar_t)prot; 4376 vn_a.maxprot = (uchar_t)maxprot; 4377 vn_a.flags = (flags & ~MAP_TYPE); 4378 vn_a.cred = cr; 4379 vn_a.amp = NULL; 4380 vn_a.szc = 0; 4381 vn_a.lgrp_mem_policy_flags = 0; 4382 4383 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4384 as_rangeunlock(as); 4385 4386 done: 4387 nfs_rw_exit(&rp->r_lkserlock); 4388 atomic_add_int(&rp->r_inmap, -1); 4389 return (error); 4390 } 4391 4392 /* ARGSUSED */ 4393 static int 4394 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4395 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4396 caller_context_t *ct) 4397 { 4398 rnode_t *rp; 4399 4400 if (vp->v_flag & VNOMAP) 4401 return (ENOSYS); 4402 if (nfs_zone() != VTOMI(vp)->mi_zone) 4403 return (EIO); 4404 4405 rp = VTOR(vp); 4406 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 4407 4408 return (0); 4409 } 4410 4411 /* ARGSUSED */ 4412 static int 4413 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, 4414 struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) 4415 { 4416 netobj lm_fh; 4417 int rc; 4418 u_offset_t start, end; 4419 rnode_t *rp; 4420 int error = 0, intr = INTR(vp); 4421 4422 /* check for valid cmd parameter */ 4423 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 4424 return (EINVAL); 4425 if (nfs_zone() != VTOMI(vp)->mi_zone) 4426 return (EIO); 4427 4428 /* Verify l_type. */ 4429 switch (bfp->l_type) { 4430 case F_RDLCK: 4431 if (cmd != F_GETLK && !(flag & FREAD)) 4432 return (EBADF); 4433 break; 4434 case F_WRLCK: 4435 if (cmd != F_GETLK && !(flag & FWRITE)) 4436 return (EBADF); 4437 break; 4438 case F_UNLCK: 4439 intr = 0; 4440 break; 4441 4442 default: 4443 return (EINVAL); 4444 } 4445 4446 /* check the validity of the lock range */ 4447 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 4448 return (rc); 4449 if (rc = flk_check_lock_data(start, end, MAXOFF32_T)) 4450 return (rc); 4451 4452 /* 4453 * If the filesystem is mounted using local locking, pass the 4454 * request off to the local locking code. 4455 */ 4456 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 4457 if (offset > MAXOFF32_T) 4458 return (EFBIG); 4459 if (cmd == F_SETLK || cmd == F_SETLKW) { 4460 /* 4461 * For complete safety, we should be holding 4462 * r_lkserlock. However, we can't call 4463 * lm_safelock and then fs_frlock while 4464 * holding r_lkserlock, so just invoke 4465 * lm_safelock and expect that this will 4466 * catch enough of the cases. 4467 */ 4468 if (!lm_safelock(vp, bfp, cr)) 4469 return (EAGAIN); 4470 } 4471 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4472 } 4473 4474 rp = VTOR(vp); 4475 4476 /* 4477 * Check whether the given lock request can proceed, given the 4478 * current file mappings. 4479 */ 4480 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 4481 return (EINTR); 4482 if (cmd == F_SETLK || cmd == F_SETLKW) { 4483 if (!lm_safelock(vp, bfp, cr)) { 4484 rc = EAGAIN; 4485 goto done; 4486 } 4487 } 4488 4489 /* 4490 * Flush the cache after waiting for async I/O to finish. For new 4491 * locks, this is so that the process gets the latest bits from the 4492 * server. For unlocks, this is so that other clients see the 4493 * latest bits once the file has been unlocked. If currently dirty 4494 * pages can't be flushed, then don't allow a lock to be set. But 4495 * allow unlocks to succeed, to avoid having orphan locks on the 4496 * server. 4497 */ 4498 if (cmd != F_GETLK) { 4499 mutex_enter(&rp->r_statelock); 4500 while (rp->r_count > 0) { 4501 if (intr) { 4502 klwp_t *lwp = ttolwp(curthread); 4503 4504 if (lwp != NULL) 4505 lwp->lwp_nostop++; 4506 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) 4507 == 0) { 4508 if (lwp != NULL) 4509 lwp->lwp_nostop--; 4510 rc = EINTR; 4511 break; 4512 } 4513 if (lwp != NULL) 4514 lwp->lwp_nostop--; 4515 } else 4516 cv_wait(&rp->r_cv, &rp->r_statelock); 4517 } 4518 mutex_exit(&rp->r_statelock); 4519 if (rc != 0) 4520 goto done; 4521 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 4522 if (error) { 4523 if (error == ENOSPC || error == EDQUOT) { 4524 mutex_enter(&rp->r_statelock); 4525 if (!rp->r_error) 4526 rp->r_error = error; 4527 mutex_exit(&rp->r_statelock); 4528 } 4529 if (bfp->l_type != F_UNLCK) { 4530 rc = ENOLCK; 4531 goto done; 4532 } 4533 } 4534 } 4535 4536 lm_fh.n_len = sizeof (fhandle_t); 4537 lm_fh.n_bytes = (char *)VTOFH(vp); 4538 4539 /* 4540 * Call the lock manager to do the real work of contacting 4541 * the server and obtaining the lock. 4542 */ 4543 rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp); 4544 4545 if (rc == 0) 4546 nfs_lockcompletion(vp, cmd); 4547 4548 done: 4549 nfs_rw_exit(&rp->r_lkserlock); 4550 return (rc); 4551 } 4552 4553 /* 4554 * Free storage space associated with the specified vnode. The portion 4555 * to be freed is specified by bfp->l_start and bfp->l_len (already 4556 * normalized to a "whence" of 0). 4557 * 4558 * This is an experimental facility whose continued existence is not 4559 * guaranteed. Currently, we only support the special case 4560 * of l_len == 0, meaning free to end of file. 4561 */ 4562 /* ARGSUSED */ 4563 static int 4564 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 4565 offset_t offset, cred_t *cr, caller_context_t *ct) 4566 { 4567 int error; 4568 4569 ASSERT(vp->v_type == VREG); 4570 if (cmd != F_FREESP) 4571 return (EINVAL); 4572 4573 if (offset > MAXOFF32_T) 4574 return (EFBIG); 4575 4576 if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) || 4577 (bfp->l_len > MAXOFF32_T)) 4578 return (EFBIG); 4579 4580 if (nfs_zone() != VTOMI(vp)->mi_zone) 4581 return (EIO); 4582 4583 error = convoff(vp, bfp, 0, offset); 4584 if (!error) { 4585 ASSERT(bfp->l_start >= 0); 4586 if (bfp->l_len == 0) { 4587 struct vattr va; 4588 4589 /* 4590 * ftruncate should not change the ctime and 4591 * mtime if we truncate the file to its 4592 * previous size. 4593 */ 4594 va.va_mask = AT_SIZE; 4595 error = nfsgetattr(vp, &va, cr); 4596 if (error || va.va_size == bfp->l_start) 4597 return (error); 4598 va.va_mask = AT_SIZE; 4599 va.va_size = bfp->l_start; 4600 error = nfssetattr(vp, &va, 0, cr); 4601 } else 4602 error = EINVAL; 4603 } 4604 4605 return (error); 4606 } 4607 4608 /* ARGSUSED */ 4609 static int 4610 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 4611 { 4612 4613 return (EINVAL); 4614 } 4615 4616 /* 4617 * Setup and add an address space callback to do the work of the delmap call. 4618 * The callback will (and must be) deleted in the actual callback function. 4619 * 4620 * This is done in order to take care of the problem that we have with holding 4621 * the address space's a_lock for a long period of time (e.g. if the NFS server 4622 * is down). Callbacks will be executed in the address space code while the 4623 * a_lock is not held. Holding the address space's a_lock causes things such 4624 * as ps and fork to hang because they are trying to acquire this lock as well. 4625 */ 4626 /* ARGSUSED */ 4627 static int 4628 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4629 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4630 caller_context_t *ct) 4631 { 4632 int caller_found; 4633 int error; 4634 rnode_t *rp; 4635 nfs_delmap_args_t *dmapp; 4636 nfs_delmapcall_t *delmap_call; 4637 4638 if (vp->v_flag & VNOMAP) 4639 return (ENOSYS); 4640 /* 4641 * A process may not change zones if it has NFS pages mmap'ed 4642 * in, so we can't legitimately get here from the wrong zone. 4643 */ 4644 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4645 4646 rp = VTOR(vp); 4647 4648 /* 4649 * The way that the address space of this process deletes its mapping 4650 * of this file is via the following call chains: 4651 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4652 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4653 * 4654 * With the use of address space callbacks we are allowed to drop the 4655 * address space lock, a_lock, while executing the NFS operations that 4656 * need to go over the wire. Returning EAGAIN to the caller of this 4657 * function is what drives the execution of the callback that we add 4658 * below. The callback will be executed by the address space code 4659 * after dropping the a_lock. When the callback is finished, since 4660 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 4661 * is called again on the same segment to finish the rest of the work 4662 * that needs to happen during unmapping. 4663 * 4664 * This action of calling back into the segment driver causes 4665 * nfs_delmap() to get called again, but since the callback was 4666 * already executed at this point, it already did the work and there 4667 * is nothing left for us to do. 4668 * 4669 * To Summarize: 4670 * - The first time nfs_delmap is called by the current thread is when 4671 * we add the caller associated with this delmap to the delmap caller 4672 * list, add the callback, and return EAGAIN. 4673 * - The second time in this call chain when nfs_delmap is called we 4674 * will find this caller in the delmap caller list and realize there 4675 * is no more work to do thus removing this caller from the list and 4676 * returning the error that was set in the callback execution. 4677 */ 4678 caller_found = nfs_find_and_delete_delmapcall(rp, &error); 4679 if (caller_found) { 4680 /* 4681 * 'error' is from the actual delmap operations. To avoid 4682 * hangs, we need to handle the return of EAGAIN differently 4683 * since this is what drives the callback execution. 4684 * In this case, we don't want to return EAGAIN and do the 4685 * callback execution because there are none to execute. 4686 */ 4687 if (error == EAGAIN) 4688 return (0); 4689 else 4690 return (error); 4691 } 4692 4693 /* current caller was not in the list */ 4694 delmap_call = nfs_init_delmapcall(); 4695 4696 mutex_enter(&rp->r_statelock); 4697 list_insert_tail(&rp->r_indelmap, delmap_call); 4698 mutex_exit(&rp->r_statelock); 4699 4700 dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP); 4701 4702 dmapp->vp = vp; 4703 dmapp->off = off; 4704 dmapp->addr = addr; 4705 dmapp->len = len; 4706 dmapp->prot = prot; 4707 dmapp->maxprot = maxprot; 4708 dmapp->flags = flags; 4709 dmapp->cr = cr; 4710 dmapp->caller = delmap_call; 4711 4712 error = as_add_callback(as, nfs_delmap_callback, dmapp, 4713 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 4714 4715 return (error ? error : EAGAIN); 4716 } 4717 4718 /* 4719 * Remove some pages from an mmap'd vnode. Just update the 4720 * count of pages. If doing close-to-open, then flush all 4721 * of the pages associated with this file. Otherwise, start 4722 * an asynchronous page flush to write out any dirty pages. 4723 * This will also associate a credential with the rnode which 4724 * can be used to write the pages. 4725 */ 4726 /* ARGSUSED */ 4727 static void 4728 nfs_delmap_callback(struct as *as, void *arg, uint_t event) 4729 { 4730 int error; 4731 rnode_t *rp; 4732 mntinfo_t *mi; 4733 nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg; 4734 4735 rp = VTOR(dmapp->vp); 4736 mi = VTOMI(dmapp->vp); 4737 4738 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 4739 ASSERT(rp->r_mapcnt >= 0); 4740 4741 /* 4742 * Initiate a page flush if there are pages, the file system 4743 * was not mounted readonly, the segment was mapped shared, and 4744 * the pages themselves were writeable. 4745 */ 4746 if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) && 4747 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 4748 mutex_enter(&rp->r_statelock); 4749 rp->r_flags |= RDIRTY; 4750 mutex_exit(&rp->r_statelock); 4751 /* 4752 * If this is a cross-zone access a sync putpage won't work, so 4753 * the best we can do is try an async putpage. That seems 4754 * better than something more draconian such as discarding the 4755 * dirty pages. 4756 */ 4757 if ((mi->mi_flags & MI_NOCTO) || 4758 nfs_zone() != mi->mi_zone) 4759 error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4760 B_ASYNC, dmapp->cr, NULL); 4761 else 4762 error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4763 0, dmapp->cr, NULL); 4764 if (!error) { 4765 mutex_enter(&rp->r_statelock); 4766 error = rp->r_error; 4767 rp->r_error = 0; 4768 mutex_exit(&rp->r_statelock); 4769 } 4770 } else 4771 error = 0; 4772 4773 if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) 4774 (void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4775 B_INVAL, dmapp->cr, NULL); 4776 4777 dmapp->caller->error = error; 4778 (void) as_delete_callback(as, arg); 4779 kmem_free(dmapp, sizeof (nfs_delmap_args_t)); 4780 } 4781 4782 /* ARGSUSED */ 4783 static int 4784 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4785 caller_context_t *ct) 4786 { 4787 int error = 0; 4788 4789 if (nfs_zone() != VTOMI(vp)->mi_zone) 4790 return (EIO); 4791 /* 4792 * This looks a little weird because it's written in a general 4793 * manner but we make little use of cases. If cntl() ever gets 4794 * widely used, the outer switch will make more sense. 4795 */ 4796 4797 switch (cmd) { 4798 4799 /* 4800 * Large file spec - need to base answer new query with 4801 * hardcoded constant based on the protocol. 4802 */ 4803 case _PC_FILESIZEBITS: 4804 *valp = 32; 4805 return (0); 4806 4807 case _PC_LINK_MAX: 4808 case _PC_NAME_MAX: 4809 case _PC_PATH_MAX: 4810 case _PC_SYMLINK_MAX: 4811 case _PC_CHOWN_RESTRICTED: 4812 case _PC_NO_TRUNC: { 4813 mntinfo_t *mi; 4814 struct pathcnf *pc; 4815 4816 if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL) 4817 return (EINVAL); 4818 error = _PC_ISSET(cmd, pc->pc_mask); /* error or bool */ 4819 switch (cmd) { 4820 case _PC_LINK_MAX: 4821 *valp = pc->pc_link_max; 4822 break; 4823 case _PC_NAME_MAX: 4824 *valp = pc->pc_name_max; 4825 break; 4826 case _PC_PATH_MAX: 4827 case _PC_SYMLINK_MAX: 4828 *valp = pc->pc_path_max; 4829 break; 4830 case _PC_CHOWN_RESTRICTED: 4831 /* 4832 * if we got here, error is really a boolean which 4833 * indicates whether cmd is set or not. 4834 */ 4835 *valp = error ? 1 : 0; /* see above */ 4836 error = 0; 4837 break; 4838 case _PC_NO_TRUNC: 4839 /* 4840 * if we got here, error is really a boolean which 4841 * indicates whether cmd is set or not. 4842 */ 4843 *valp = error ? 1 : 0; /* see above */ 4844 error = 0; 4845 break; 4846 } 4847 return (error ? EINVAL : 0); 4848 } 4849 4850 case _PC_XATTR_EXISTS: 4851 *valp = 0; 4852 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 4853 vnode_t *avp; 4854 rnode_t *rp; 4855 mntinfo_t *mi = VTOMI(vp); 4856 4857 if (!(mi->mi_flags & MI_EXTATTR)) 4858 return (0); 4859 4860 rp = VTOR(vp); 4861 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, 4862 INTR(vp))) 4863 return (EINTR); 4864 4865 error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr); 4866 if (error || avp == NULL) 4867 error = acl_getxattrdir2(vp, &avp, 0, cr, 0); 4868 4869 nfs_rw_exit(&rp->r_rwlock); 4870 4871 if (error == 0 && avp != NULL) { 4872 error = do_xattr_exists_check(avp, valp, cr); 4873 VN_RELE(avp); 4874 } 4875 } 4876 return (error ? EINVAL : 0); 4877 4878 case _PC_ACL_ENABLED: 4879 *valp = _ACL_ACLENT_ENABLED; 4880 return (0); 4881 4882 default: 4883 return (EINVAL); 4884 } 4885 } 4886 4887 /* 4888 * Called by async thread to do synchronous pageio. Do the i/o, wait 4889 * for it to complete, and cleanup the page list when done. 4890 */ 4891 static int 4892 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4893 int flags, cred_t *cr) 4894 { 4895 int error; 4896 4897 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4898 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4899 if (flags & B_READ) 4900 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 4901 else 4902 pvn_write_done(pp, (error ? B_ERROR : 0) | flags); 4903 return (error); 4904 } 4905 4906 /* ARGSUSED */ 4907 static int 4908 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4909 int flags, cred_t *cr, caller_context_t *ct) 4910 { 4911 int error; 4912 rnode_t *rp; 4913 4914 if (pp == NULL) 4915 return (EINVAL); 4916 4917 if (io_off > MAXOFF32_T) 4918 return (EFBIG); 4919 if (nfs_zone() != VTOMI(vp)->mi_zone) 4920 return (EIO); 4921 rp = VTOR(vp); 4922 mutex_enter(&rp->r_statelock); 4923 rp->r_count++; 4924 mutex_exit(&rp->r_statelock); 4925 4926 if (flags & B_ASYNC) { 4927 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr, 4928 nfs_sync_pageio); 4929 } else 4930 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4931 mutex_enter(&rp->r_statelock); 4932 rp->r_count--; 4933 cv_broadcast(&rp->r_cv); 4934 mutex_exit(&rp->r_statelock); 4935 return (error); 4936 } 4937 4938 /* ARGSUSED */ 4939 static int 4940 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 4941 caller_context_t *ct) 4942 { 4943 int error; 4944 mntinfo_t *mi; 4945 4946 mi = VTOMI(vp); 4947 4948 if (nfs_zone() != mi->mi_zone) 4949 return (EIO); 4950 if (mi->mi_flags & MI_ACL) { 4951 error = acl_setacl2(vp, vsecattr, flag, cr); 4952 if (mi->mi_flags & MI_ACL) 4953 return (error); 4954 } 4955 4956 return (ENOSYS); 4957 } 4958 4959 /* ARGSUSED */ 4960 static int 4961 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 4962 caller_context_t *ct) 4963 { 4964 int error; 4965 mntinfo_t *mi; 4966 4967 mi = VTOMI(vp); 4968 4969 if (nfs_zone() != mi->mi_zone) 4970 return (EIO); 4971 if (mi->mi_flags & MI_ACL) { 4972 error = acl_getacl2(vp, vsecattr, flag, cr); 4973 if (mi->mi_flags & MI_ACL) 4974 return (error); 4975 } 4976 4977 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 4978 } 4979 4980 /* ARGSUSED */ 4981 static int 4982 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 4983 caller_context_t *ct) 4984 { 4985 int error; 4986 struct shrlock nshr; 4987 struct nfs_owner nfs_owner; 4988 netobj lm_fh; 4989 4990 if (nfs_zone() != VTOMI(vp)->mi_zone) 4991 return (EIO); 4992 4993 /* 4994 * check for valid cmd parameter 4995 */ 4996 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 4997 return (EINVAL); 4998 4999 /* 5000 * Check access permissions 5001 */ 5002 if (cmd == F_SHARE && 5003 (((shr->s_access & F_RDACC) && !(flag & FREAD)) || 5004 ((shr->s_access & F_WRACC) && !(flag & FWRITE)))) 5005 return (EBADF); 5006 5007 /* 5008 * If the filesystem is mounted using local locking, pass the 5009 * request off to the local share code. 5010 */ 5011 if (VTOMI(vp)->mi_flags & MI_LLOCK) 5012 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 5013 5014 switch (cmd) { 5015 case F_SHARE: 5016 case F_UNSHARE: 5017 lm_fh.n_len = sizeof (fhandle_t); 5018 lm_fh.n_bytes = (char *)VTOFH(vp); 5019 5020 /* 5021 * If passed an owner that is too large to fit in an 5022 * nfs_owner it is likely a recursive call from the 5023 * lock manager client and pass it straight through. If 5024 * it is not a nfs_owner then simply return an error. 5025 */ 5026 if (shr->s_own_len > sizeof (nfs_owner.lowner)) { 5027 if (((struct nfs_owner *)shr->s_owner)->magic != 5028 NFS_OWNER_MAGIC) 5029 return (EINVAL); 5030 5031 if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) { 5032 error = set_errno(error); 5033 } 5034 return (error); 5035 } 5036 /* 5037 * Remote share reservations owner is a combination of 5038 * a magic number, hostname, and the local owner 5039 */ 5040 bzero(&nfs_owner, sizeof (nfs_owner)); 5041 nfs_owner.magic = NFS_OWNER_MAGIC; 5042 (void) strncpy(nfs_owner.hname, uts_nodename(), 5043 sizeof (nfs_owner.hname)); 5044 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len); 5045 nshr.s_access = shr->s_access; 5046 nshr.s_deny = shr->s_deny; 5047 nshr.s_sysid = 0; 5048 nshr.s_pid = ttoproc(curthread)->p_pid; 5049 nshr.s_own_len = sizeof (nfs_owner); 5050 nshr.s_owner = (caddr_t)&nfs_owner; 5051 5052 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) { 5053 error = set_errno(error); 5054 } 5055 5056 break; 5057 5058 case F_HASREMOTELOCKS: 5059 /* 5060 * NFS client can't store remote locks itself 5061 */ 5062 shr->s_access = 0; 5063 error = 0; 5064 break; 5065 5066 default: 5067 error = EINVAL; 5068 break; 5069 } 5070 5071 return (error); 5072 } 5073