1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 26 * All rights reserved. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/types.h> 31 #include <sys/systm.h> 32 #include <sys/cred.h> 33 #include <sys/time.h> 34 #include <sys/vnode.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/file.h> 38 #include <sys/filio.h> 39 #include <sys/uio.h> 40 #include <sys/buf.h> 41 #include <sys/mman.h> 42 #include <sys/pathname.h> 43 #include <sys/dirent.h> 44 #include <sys/debug.h> 45 #include <sys/vmsystm.h> 46 #include <sys/fcntl.h> 47 #include <sys/flock.h> 48 #include <sys/swap.h> 49 #include <sys/errno.h> 50 #include <sys/strsubr.h> 51 #include <sys/sysmacros.h> 52 #include <sys/kmem.h> 53 #include <sys/cmn_err.h> 54 #include <sys/pathconf.h> 55 #include <sys/utsname.h> 56 #include <sys/dnlc.h> 57 #include <sys/acl.h> 58 #include <sys/atomic.h> 59 #include <sys/policy.h> 60 #include <sys/sdt.h> 61 62 #include <rpc/types.h> 63 #include <rpc/auth.h> 64 #include <rpc/clnt.h> 65 66 #include <nfs/nfs.h> 67 #include <nfs/nfs_clnt.h> 68 #include <nfs/rnode.h> 69 #include <nfs/nfs_acl.h> 70 #include <nfs/lm.h> 71 72 #include <vm/hat.h> 73 #include <vm/as.h> 74 #include <vm/page.h> 75 #include <vm/pvn.h> 76 #include <vm/seg.h> 77 #include <vm/seg_map.h> 78 #include <vm/seg_kpm.h> 79 #include <vm/seg_vn.h> 80 81 #include <fs/fs_subr.h> 82 83 #include <sys/ddi.h> 84 85 static int nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 86 cred_t *); 87 static int nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *); 88 static int nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *); 89 static int nfssetattr(vnode_t *, struct vattr *, int, cred_t *); 90 static int nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *); 91 static int nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int); 92 static int nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *, 93 caller_context_t *); 94 static int nfsreaddir(vnode_t *, rddir_cache *, cred_t *); 95 static int nfs_bio(struct buf *, cred_t *); 96 static int nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 97 page_t *[], size_t, struct seg *, caddr_t, 98 enum seg_rw, cred_t *); 99 static void nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 100 cred_t *); 101 static int nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 102 int, cred_t *); 103 static int nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 104 int, cred_t *); 105 static void nfs_delmap_callback(struct as *, void *, uint_t); 106 107 /* 108 * Error flags used to pass information 
about certain special errors 109 * which need to be handled specially. 110 */ 111 #define NFS_EOF -98 112 113 /* 114 * These are the vnode ops routines which implement the vnode interface to 115 * the networked file system. These routines just take their parameters, 116 * make them look networkish by putting the right info into interface structs, 117 * and then calling the appropriate remote routine(s) to do the work. 118 * 119 * Note on directory name lookup cacheing: If we detect a stale fhandle, 120 * we purge the directory cache relative to that vnode. This way, the 121 * user won't get burned by the cache repeatedly. See <nfs/rnode.h> for 122 * more details on rnode locking. 123 */ 124 125 static int nfs_open(vnode_t **, int, cred_t *, caller_context_t *); 126 static int nfs_close(vnode_t *, int, int, offset_t, cred_t *, 127 caller_context_t *); 128 static int nfs_read(vnode_t *, struct uio *, int, cred_t *, 129 caller_context_t *); 130 static int nfs_write(vnode_t *, struct uio *, int, cred_t *, 131 caller_context_t *); 132 static int nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 133 caller_context_t *); 134 static int nfs_getattr(vnode_t *, struct vattr *, int, cred_t *, 135 caller_context_t *); 136 static int nfs_setattr(vnode_t *, struct vattr *, int, cred_t *, 137 caller_context_t *); 138 static int nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *); 139 static int nfs_accessx(void *, int, cred_t *); 140 static int nfs_readlink(vnode_t *, struct uio *, cred_t *, 141 caller_context_t *); 142 static int nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *); 143 static void nfs_inactive(vnode_t *, cred_t *, caller_context_t *); 144 static int nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *, 145 int, vnode_t *, cred_t *, caller_context_t *, 146 int *, pathname_t *); 147 static int nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl, 148 int, vnode_t **, cred_t *, int, caller_context_t *, 149 vsecattr_t *); 150 static int nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *, 151 int); 152 static int nfs_link(vnode_t *, vnode_t *, char *, cred_t *, 153 caller_context_t *, int); 154 static int nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 155 caller_context_t *, int); 156 static int nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 157 cred_t *, caller_context_t *, int, vsecattr_t *); 158 static int nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 159 caller_context_t *, int); 160 static int nfs_symlink(vnode_t *, char *, struct vattr *, char *, 161 cred_t *, caller_context_t *, int); 162 static int nfs_readdir(vnode_t *, struct uio *, cred_t *, int *, 163 caller_context_t *, int); 164 static int nfs_fid(vnode_t *, fid_t *, caller_context_t *); 165 static int nfs_rwlock(vnode_t *, int, caller_context_t *); 166 static void nfs_rwunlock(vnode_t *, int, caller_context_t *); 167 static int nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 168 static int nfs_getpage(vnode_t *, offset_t, size_t, uint_t *, 169 page_t *[], size_t, struct seg *, caddr_t, 170 enum seg_rw, cred_t *, caller_context_t *); 171 static int nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 172 caller_context_t *); 173 static int nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 174 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 175 static int nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 176 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 177 static int nfs_frlock(vnode_t *, int, struct 
flock64 *, int, offset_t, 178 struct flk_callback *, cred_t *, caller_context_t *); 179 static int nfs_space(vnode_t *, int, struct flock64 *, int, offset_t, 180 cred_t *, caller_context_t *); 181 static int nfs_realvp(vnode_t *, vnode_t **, caller_context_t *); 182 static int nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 183 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 184 static int nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *, 185 caller_context_t *); 186 static int nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 187 cred_t *, caller_context_t *); 188 static int nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 189 caller_context_t *); 190 static int nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 191 caller_context_t *); 192 static int nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 193 caller_context_t *); 194 195 struct vnodeops *nfs_vnodeops; 196 197 const fs_operation_def_t nfs_vnodeops_template[] = { 198 VOPNAME_OPEN, { .vop_open = nfs_open }, 199 VOPNAME_CLOSE, { .vop_close = nfs_close }, 200 VOPNAME_READ, { .vop_read = nfs_read }, 201 VOPNAME_WRITE, { .vop_write = nfs_write }, 202 VOPNAME_IOCTL, { .vop_ioctl = nfs_ioctl }, 203 VOPNAME_GETATTR, { .vop_getattr = nfs_getattr }, 204 VOPNAME_SETATTR, { .vop_setattr = nfs_setattr }, 205 VOPNAME_ACCESS, { .vop_access = nfs_access }, 206 VOPNAME_LOOKUP, { .vop_lookup = nfs_lookup }, 207 VOPNAME_CREATE, { .vop_create = nfs_create }, 208 VOPNAME_REMOVE, { .vop_remove = nfs_remove }, 209 VOPNAME_LINK, { .vop_link = nfs_link }, 210 VOPNAME_RENAME, { .vop_rename = nfs_rename }, 211 VOPNAME_MKDIR, { .vop_mkdir = nfs_mkdir }, 212 VOPNAME_RMDIR, { .vop_rmdir = nfs_rmdir }, 213 VOPNAME_READDIR, { .vop_readdir = nfs_readdir }, 214 VOPNAME_SYMLINK, { .vop_symlink = nfs_symlink }, 215 VOPNAME_READLINK, { .vop_readlink = nfs_readlink }, 216 VOPNAME_FSYNC, { .vop_fsync = nfs_fsync }, 217 VOPNAME_INACTIVE, { .vop_inactive = nfs_inactive }, 218 VOPNAME_FID, { .vop_fid = nfs_fid }, 219 VOPNAME_RWLOCK, { .vop_rwlock = nfs_rwlock }, 220 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs_rwunlock }, 221 VOPNAME_SEEK, { .vop_seek = nfs_seek }, 222 VOPNAME_FRLOCK, { .vop_frlock = nfs_frlock }, 223 VOPNAME_SPACE, { .vop_space = nfs_space }, 224 VOPNAME_REALVP, { .vop_realvp = nfs_realvp }, 225 VOPNAME_GETPAGE, { .vop_getpage = nfs_getpage }, 226 VOPNAME_PUTPAGE, { .vop_putpage = nfs_putpage }, 227 VOPNAME_MAP, { .vop_map = nfs_map }, 228 VOPNAME_ADDMAP, { .vop_addmap = nfs_addmap }, 229 VOPNAME_DELMAP, { .vop_delmap = nfs_delmap }, 230 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 231 VOPNAME_PATHCONF, { .vop_pathconf = nfs_pathconf }, 232 VOPNAME_PAGEIO, { .vop_pageio = nfs_pageio }, 233 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs_setsecattr }, 234 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs_getsecattr }, 235 VOPNAME_SHRLOCK, { .vop_shrlock = nfs_shrlock }, 236 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 237 NULL, NULL 238 }; 239 240 /* 241 * XXX: This is referenced in modstubs.s 242 */ 243 struct vnodeops * 244 nfs_getvnodeops(void) 245 { 246 return (nfs_vnodeops); 247 } 248 249 /* ARGSUSED */ 250 static int 251 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 252 { 253 int error; 254 struct vattr va; 255 rnode_t *rp; 256 vnode_t *vp; 257 258 vp = *vpp; 259 rp = VTOR(vp); 260 if (nfs_zone() != VTOMI(vp)->mi_zone) 261 return (EIO); 262 mutex_enter(&rp->r_statelock); 263 if (rp->r_cred == NULL) { 264 crhold(cr); 265 rp->r_cred = cr; 266 } 267 mutex_exit(&rp->r_statelock); 268 269 /* 
270 * If there is no cached data or if close-to-open 271 * consistency checking is turned off, we can avoid 272 * the over the wire getattr. Otherwise, if the 273 * file system is mounted readonly, then just verify 274 * the caches are up to date using the normal mechanism. 275 * Else, if the file is not mmap'd, then just mark 276 * the attributes as timed out. They will be refreshed 277 * and the caches validated prior to being used. 278 * Else, the file system is mounted writeable so 279 * force an over the wire GETATTR in order to ensure 280 * that all cached data is valid. 281 */ 282 if (vp->v_count > 1 || 283 ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) && 284 !(VTOMI(vp)->mi_flags & MI_NOCTO))) { 285 if (vn_is_readonly(vp)) 286 error = nfs_validate_caches(vp, cr); 287 else if (rp->r_mapcnt == 0 && vp->v_count == 1) { 288 PURGE_ATTRCACHE(vp); 289 error = 0; 290 } else { 291 va.va_mask = AT_ALL; 292 error = nfs_getattr_otw(vp, &va, cr); 293 } 294 } else 295 error = 0; 296 297 return (error); 298 } 299 300 /* ARGSUSED */ 301 static int 302 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 303 caller_context_t *ct) 304 { 305 rnode_t *rp; 306 int error; 307 struct vattr va; 308 309 /* 310 * zone_enter(2) prevents processes from changing zones with NFS files 311 * open; if we happen to get here from the wrong zone we can't do 312 * anything over the wire. 313 */ 314 if (VTOMI(vp)->mi_zone != nfs_zone()) { 315 /* 316 * We could attempt to clean up locks, except we're sure 317 * that the current process didn't acquire any locks on 318 * the file: any attempt to lock a file belong to another zone 319 * will fail, and one can't lock an NFS file and then change 320 * zones, as that fails too. 321 * 322 * Returning an error here is the sane thing to do. A 323 * subsequent call to VN_RELE() which translates to a 324 * nfs_inactive() will clean up state: if the zone of the 325 * vnode's origin is still alive and kicking, an async worker 326 * thread will handle the request (from the correct zone), and 327 * everything (minus the final nfs_getattr_otw() call) should 328 * be OK. If the zone is going away nfs_async_inactive() will 329 * throw away cached pages inline. 330 */ 331 return (EIO); 332 } 333 334 /* 335 * If we are using local locking for this filesystem, then 336 * release all of the SYSV style record locks. Otherwise, 337 * we are doing network locking and we need to release all 338 * of the network locks. All of the locks held by this 339 * process on this file are released no matter what the 340 * incoming reference count is. 341 */ 342 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 343 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 344 cleanshares(vp, ttoproc(curthread)->p_pid); 345 } else 346 nfs_lockrelease(vp, flag, offset, cr); 347 348 if (count > 1) 349 return (0); 350 351 /* 352 * If the file has been `unlinked', then purge the 353 * DNLC so that this vnode will get reycled quicker 354 * and the .nfs* file on the server will get removed. 355 */ 356 rp = VTOR(vp); 357 if (rp->r_unldvp != NULL) 358 dnlc_purge_vp(vp); 359 360 /* 361 * If the file was open for write and there are pages, 362 * then if the file system was mounted using the "no-close- 363 * to-open" semantics, then start an asynchronous flush 364 * of the all of the pages in the file. 365 * else the file system was not mounted using the "no-close- 366 * to-open" semantics, then do a synchronous flush and 367 * commit of all of the dirty and uncommitted pages. 
368 * 369 * The asynchronous flush of the pages in the "nocto" path 370 * mostly just associates a cred pointer with the rnode so 371 * writes which happen later will have a better chance of 372 * working. It also starts the data being written to the 373 * server, but without unnecessarily delaying the application. 374 */ 375 if ((flag & FWRITE) && vn_has_cached_data(vp)) { 376 if ((VTOMI(vp)->mi_flags & MI_NOCTO)) { 377 error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, 378 cr, ct); 379 if (error == EAGAIN) 380 error = 0; 381 } else 382 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 383 if (!error) { 384 mutex_enter(&rp->r_statelock); 385 error = rp->r_error; 386 rp->r_error = 0; 387 mutex_exit(&rp->r_statelock); 388 } 389 } else { 390 mutex_enter(&rp->r_statelock); 391 error = rp->r_error; 392 rp->r_error = 0; 393 mutex_exit(&rp->r_statelock); 394 } 395 396 /* 397 * If RWRITEATTR is set, then issue an over the wire GETATTR to 398 * refresh the attribute cache with a set of attributes which 399 * weren't returned from a WRITE. This will enable the close- 400 * to-open processing to work. 401 */ 402 if (rp->r_flags & RWRITEATTR) 403 (void) nfs_getattr_otw(vp, &va, cr); 404 405 return (error); 406 } 407 408 /* ARGSUSED */ 409 static int 410 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 411 caller_context_t *ct) 412 { 413 rnode_t *rp; 414 u_offset_t off; 415 offset_t diff; 416 int on; 417 size_t n; 418 caddr_t base; 419 uint_t flags; 420 int error; 421 mntinfo_t *mi; 422 423 rp = VTOR(vp); 424 mi = VTOMI(vp); 425 426 if (nfs_zone() != mi->mi_zone) 427 return (EIO); 428 429 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 430 431 if (vp->v_type != VREG) 432 return (EISDIR); 433 434 if (uiop->uio_resid == 0) 435 return (0); 436 437 if (uiop->uio_loffset > MAXOFF32_T) 438 return (EFBIG); 439 440 if (uiop->uio_loffset < 0 || 441 uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T) 442 return (EINVAL); 443 444 /* 445 * Bypass VM if caching has been disabled (e.g., locking) or if 446 * using client-side direct I/O and the file is not mmap'd and 447 * there are no cached pages. 448 */ 449 if ((vp->v_flag & VNOCACHE) || 450 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && 451 rp->r_mapcnt == 0 && rp->r_inmap == 0 && 452 !vn_has_cached_data(vp))) { 453 size_t bufsize; 454 size_t resid = 0; 455 456 /* 457 * Let's try to do read in as large a chunk as we can 458 * (Filesystem (NFS client) bsize if possible/needed). 459 * For V3, this is 32K and for V2, this is 8K. 
460 */ 461 bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread); 462 base = kmem_alloc(bufsize, KM_SLEEP); 463 do { 464 n = MIN(uiop->uio_resid, bufsize); 465 error = nfsread(vp, base, uiop->uio_offset, n, 466 &resid, cr); 467 if (!error) { 468 n -= resid; 469 error = uiomove(base, n, UIO_READ, uiop); 470 } 471 } while (!error && uiop->uio_resid > 0 && n > 0); 472 kmem_free(base, bufsize); 473 return (error); 474 } 475 476 error = 0; 477 478 do { 479 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 480 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 481 n = MIN(MAXBSIZE - on, uiop->uio_resid); 482 483 error = nfs_validate_caches(vp, cr); 484 if (error) 485 break; 486 487 mutex_enter(&rp->r_statelock); 488 while (rp->r_flags & RINCACHEPURGE) { 489 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 490 mutex_exit(&rp->r_statelock); 491 return (EINTR); 492 } 493 } 494 diff = rp->r_size - uiop->uio_loffset; 495 mutex_exit(&rp->r_statelock); 496 if (diff <= 0) 497 break; 498 if (diff < n) 499 n = (size_t)diff; 500 501 if (vpm_enable) { 502 /* 503 * Copy data. 504 */ 505 error = vpm_data_copy(vp, off + on, n, uiop, 506 1, NULL, 0, S_READ); 507 } else { 508 base = segmap_getmapflt(segkmap, vp, off + on, n, 509 1, S_READ); 510 error = uiomove(base + on, n, UIO_READ, uiop); 511 } 512 513 if (!error) { 514 /* 515 * If read a whole block or read to eof, 516 * won't need this buffer again soon. 517 */ 518 mutex_enter(&rp->r_statelock); 519 if (n + on == MAXBSIZE || 520 uiop->uio_loffset == rp->r_size) 521 flags = SM_DONTNEED; 522 else 523 flags = 0; 524 mutex_exit(&rp->r_statelock); 525 if (vpm_enable) { 526 error = vpm_sync_pages(vp, off, n, flags); 527 } else { 528 error = segmap_release(segkmap, base, flags); 529 } 530 } else { 531 if (vpm_enable) { 532 (void) vpm_sync_pages(vp, off, n, 0); 533 } else { 534 (void) segmap_release(segkmap, base, 0); 535 } 536 } 537 } while (!error && uiop->uio_resid > 0); 538 539 return (error); 540 } 541 542 /* ARGSUSED */ 543 static int 544 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 545 caller_context_t *ct) 546 { 547 rnode_t *rp; 548 u_offset_t off; 549 caddr_t base; 550 uint_t flags; 551 int remainder; 552 size_t n; 553 int on; 554 int error; 555 int resid; 556 offset_t offset; 557 rlim_t limit; 558 mntinfo_t *mi; 559 560 rp = VTOR(vp); 561 562 mi = VTOMI(vp); 563 if (nfs_zone() != mi->mi_zone) 564 return (EIO); 565 if (vp->v_type != VREG) 566 return (EISDIR); 567 568 if (uiop->uio_resid == 0) 569 return (0); 570 571 if (ioflag & FAPPEND) { 572 struct vattr va; 573 574 /* 575 * Must serialize if appending. 576 */ 577 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 578 nfs_rw_exit(&rp->r_rwlock); 579 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 580 INTR(vp))) 581 return (EINTR); 582 } 583 584 va.va_mask = AT_SIZE; 585 error = nfsgetattr(vp, &va, cr); 586 if (error) 587 return (error); 588 uiop->uio_loffset = va.va_size; 589 } 590 591 if (uiop->uio_loffset > MAXOFF32_T) 592 return (EFBIG); 593 594 offset = uiop->uio_loffset + uiop->uio_resid; 595 596 if (uiop->uio_loffset < 0 || offset > MAXOFF32_T) 597 return (EINVAL); 598 599 if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) { 600 limit = MAXOFF32_T; 601 } else { 602 limit = (rlim_t)uiop->uio_llimit; 603 } 604 605 /* 606 * Check to make sure that the process will not exceed 607 * its limit on file size. It is okay to write up to 608 * the limit, but not beyond. Thus, the write which 609 * reaches the limit will be short and the next write 610 * will return an error. 
611 */ 612 remainder = 0; 613 if (offset > limit) { 614 remainder = offset - limit; 615 uiop->uio_resid = limit - uiop->uio_offset; 616 if (uiop->uio_resid <= 0) { 617 proc_t *p = ttoproc(curthread); 618 619 uiop->uio_resid += remainder; 620 mutex_enter(&p->p_lock); 621 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 622 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 623 mutex_exit(&p->p_lock); 624 return (EFBIG); 625 } 626 } 627 628 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) 629 return (EINTR); 630 631 /* 632 * Bypass VM if caching has been disabled (e.g., locking) or if 633 * using client-side direct I/O and the file is not mmap'd and 634 * there are no cached pages. 635 */ 636 if ((vp->v_flag & VNOCACHE) || 637 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && 638 rp->r_mapcnt == 0 && rp->r_inmap == 0 && 639 !vn_has_cached_data(vp))) { 640 size_t bufsize; 641 int count; 642 uint_t org_offset; 643 644 nfs_fwrite: 645 if (rp->r_flags & RSTALE) { 646 resid = uiop->uio_resid; 647 offset = uiop->uio_loffset; 648 error = rp->r_error; 649 goto bottom; 650 } 651 bufsize = MIN(uiop->uio_resid, mi->mi_curwrite); 652 base = kmem_alloc(bufsize, KM_SLEEP); 653 do { 654 resid = uiop->uio_resid; 655 offset = uiop->uio_loffset; 656 count = MIN(uiop->uio_resid, bufsize); 657 org_offset = uiop->uio_offset; 658 error = uiomove(base, count, UIO_WRITE, uiop); 659 if (!error) { 660 error = nfswrite(vp, base, org_offset, 661 count, cr); 662 } 663 } while (!error && uiop->uio_resid > 0); 664 kmem_free(base, bufsize); 665 goto bottom; 666 } 667 668 do { 669 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 670 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 671 n = MIN(MAXBSIZE - on, uiop->uio_resid); 672 673 resid = uiop->uio_resid; 674 offset = uiop->uio_loffset; 675 676 if (rp->r_flags & RSTALE) { 677 error = rp->r_error; 678 break; 679 } 680 681 /* 682 * Don't create dirty pages faster than they 683 * can be cleaned so that the system doesn't 684 * get imbalanced. If the async queue is 685 * maxed out, then wait for it to drain before 686 * creating more dirty pages. Also, wait for 687 * any threads doing pagewalks in the vop_getattr 688 * entry points so that they don't block for 689 * long periods. 690 */ 691 mutex_enter(&rp->r_statelock); 692 while ((mi->mi_max_threads != 0 && 693 rp->r_awcount > 2 * mi->mi_max_threads) || 694 rp->r_gcount > 0) 695 cv_wait(&rp->r_cv, &rp->r_statelock); 696 mutex_exit(&rp->r_statelock); 697 698 /* 699 * Touch the page and fault it in if it is not in core 700 * before segmap_getmapflt or vpm_data_copy can lock it. 701 * This is to avoid the deadlock if the buffer is mapped 702 * to the same file through mmap which we want to write. 703 */ 704 uio_prefaultpages((long)n, uiop); 705 706 if (vpm_enable) { 707 /* 708 * It will use kpm mappings, so no need to 709 * pass an address. 
710 */ 711 error = writerp(rp, NULL, n, uiop, 0); 712 } else { 713 if (segmap_kpm) { 714 int pon = uiop->uio_loffset & PAGEOFFSET; 715 size_t pn = MIN(PAGESIZE - pon, 716 uiop->uio_resid); 717 int pagecreate; 718 719 mutex_enter(&rp->r_statelock); 720 pagecreate = (pon == 0) && (pn == PAGESIZE || 721 uiop->uio_loffset + pn >= rp->r_size); 722 mutex_exit(&rp->r_statelock); 723 724 base = segmap_getmapflt(segkmap, vp, off + on, 725 pn, !pagecreate, S_WRITE); 726 727 error = writerp(rp, base + pon, n, uiop, 728 pagecreate); 729 730 } else { 731 base = segmap_getmapflt(segkmap, vp, off + on, 732 n, 0, S_READ); 733 error = writerp(rp, base + on, n, uiop, 0); 734 } 735 } 736 737 if (!error) { 738 if (mi->mi_flags & MI_NOAC) 739 flags = SM_WRITE; 740 else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) { 741 /* 742 * Have written a whole block. 743 * Start an asynchronous write 744 * and mark the buffer to 745 * indicate that it won't be 746 * needed again soon. 747 */ 748 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 749 } else 750 flags = 0; 751 if ((ioflag & (FSYNC|FDSYNC)) || 752 (rp->r_flags & ROUTOFSPACE)) { 753 flags &= ~SM_ASYNC; 754 flags |= SM_WRITE; 755 } 756 if (vpm_enable) { 757 error = vpm_sync_pages(vp, off, n, flags); 758 } else { 759 error = segmap_release(segkmap, base, flags); 760 } 761 } else { 762 if (vpm_enable) { 763 (void) vpm_sync_pages(vp, off, n, 0); 764 } else { 765 (void) segmap_release(segkmap, base, 0); 766 } 767 /* 768 * In the event that we got an access error while 769 * faulting in a page for a write-only file just 770 * force a write. 771 */ 772 if (error == EACCES) 773 goto nfs_fwrite; 774 } 775 } while (!error && uiop->uio_resid > 0); 776 777 bottom: 778 if (error) { 779 uiop->uio_resid = resid + remainder; 780 uiop->uio_loffset = offset; 781 } else 782 uiop->uio_resid += remainder; 783 784 nfs_rw_exit(&rp->r_lkserlock); 785 786 return (error); 787 } 788 789 /* 790 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 791 */ 792 static int 793 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 794 int flags, cred_t *cr) 795 { 796 struct buf *bp; 797 int error; 798 799 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 800 bp = pageio_setup(pp, len, vp, flags); 801 ASSERT(bp != NULL); 802 803 /* 804 * pageio_setup should have set b_addr to 0. This 805 * is correct since we want to do I/O on a page 806 * boundary. bp_mapin will use this addr to calculate 807 * an offset, and then set b_addr to the kernel virtual 808 * address it allocated for us. 809 */ 810 ASSERT(bp->b_un.b_addr == 0); 811 812 bp->b_edev = 0; 813 bp->b_dev = 0; 814 bp->b_lblkno = lbtodb(off); 815 bp->b_file = vp; 816 bp->b_offset = (offset_t)off; 817 bp_mapin(bp); 818 819 error = nfs_bio(bp, cr); 820 821 bp_mapout(bp); 822 pageio_done(bp); 823 824 return (error); 825 } 826 827 /* 828 * Write to file. Writes to remote server in largest size 829 * chunks that the server can handle. Write is synchronous. 
830 */ 831 static int 832 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr) 833 { 834 rnode_t *rp; 835 mntinfo_t *mi; 836 struct nfswriteargs wa; 837 struct nfsattrstat ns; 838 int error; 839 int tsize; 840 int douprintf; 841 842 douprintf = 1; 843 844 rp = VTOR(vp); 845 mi = VTOMI(vp); 846 847 ASSERT(nfs_zone() == mi->mi_zone); 848 849 wa.wa_args = &wa.wa_args_buf; 850 wa.wa_fhandle = *VTOFH(vp); 851 852 do { 853 tsize = MIN(mi->mi_curwrite, count); 854 wa.wa_data = base; 855 wa.wa_begoff = offset; 856 wa.wa_totcount = tsize; 857 wa.wa_count = tsize; 858 wa.wa_offset = offset; 859 860 if (mi->mi_io_kstats) { 861 mutex_enter(&mi->mi_lock); 862 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 863 mutex_exit(&mi->mi_lock); 864 } 865 wa.wa_mblk = NULL; 866 do { 867 error = rfs2call(mi, RFS_WRITE, 868 xdr_writeargs, (caddr_t)&wa, 869 xdr_attrstat, (caddr_t)&ns, cr, 870 &douprintf, &ns.ns_status, 0, NULL); 871 } while (error == ENFS_TRYAGAIN); 872 if (mi->mi_io_kstats) { 873 mutex_enter(&mi->mi_lock); 874 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 875 mutex_exit(&mi->mi_lock); 876 } 877 878 if (!error) { 879 error = geterrno(ns.ns_status); 880 /* 881 * Can't check for stale fhandle and purge caches 882 * here because pages are held by nfs_getpage. 883 * Just mark the attribute cache as timed out 884 * and set RWRITEATTR to indicate that the file 885 * was modified with a WRITE operation. 886 */ 887 if (!error) { 888 count -= tsize; 889 base += tsize; 890 offset += tsize; 891 if (mi->mi_io_kstats) { 892 mutex_enter(&mi->mi_lock); 893 KSTAT_IO_PTR(mi->mi_io_kstats)-> 894 writes++; 895 KSTAT_IO_PTR(mi->mi_io_kstats)-> 896 nwritten += tsize; 897 mutex_exit(&mi->mi_lock); 898 } 899 lwp_stat_update(LWP_STAT_OUBLK, 1); 900 mutex_enter(&rp->r_statelock); 901 PURGE_ATTRCACHE_LOCKED(rp); 902 rp->r_flags |= RWRITEATTR; 903 mutex_exit(&rp->r_statelock); 904 } 905 } 906 } while (!error && count); 907 908 return (error); 909 } 910 911 /* 912 * Read from a file. Reads data in largest chunks our interface can handle. 
913 */ 914 static int 915 nfsread(vnode_t *vp, caddr_t base, uint_t offset, 916 int count, size_t *residp, cred_t *cr) 917 { 918 mntinfo_t *mi; 919 struct nfsreadargs ra; 920 struct nfsrdresult rr; 921 int tsize; 922 int error; 923 int douprintf; 924 failinfo_t fi; 925 rnode_t *rp; 926 struct vattr va; 927 hrtime_t t; 928 929 rp = VTOR(vp); 930 mi = VTOMI(vp); 931 932 ASSERT(nfs_zone() == mi->mi_zone); 933 934 douprintf = 1; 935 936 ra.ra_fhandle = *VTOFH(vp); 937 938 fi.vp = vp; 939 fi.fhp = (caddr_t)&ra.ra_fhandle; 940 fi.copyproc = nfscopyfh; 941 fi.lookupproc = nfslookup; 942 fi.xattrdirproc = acl_getxattrdir2; 943 944 do { 945 if (mi->mi_io_kstats) { 946 mutex_enter(&mi->mi_lock); 947 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 948 mutex_exit(&mi->mi_lock); 949 } 950 951 do { 952 tsize = MIN(mi->mi_curread, count); 953 rr.rr_data = base; 954 ra.ra_offset = offset; 955 ra.ra_totcount = tsize; 956 ra.ra_count = tsize; 957 ra.ra_data = base; 958 t = gethrtime(); 959 error = rfs2call(mi, RFS_READ, 960 xdr_readargs, (caddr_t)&ra, 961 xdr_rdresult, (caddr_t)&rr, cr, 962 &douprintf, &rr.rr_status, 0, &fi); 963 } while (error == ENFS_TRYAGAIN); 964 965 if (mi->mi_io_kstats) { 966 mutex_enter(&mi->mi_lock); 967 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 968 mutex_exit(&mi->mi_lock); 969 } 970 971 if (!error) { 972 error = geterrno(rr.rr_status); 973 if (!error) { 974 count -= rr.rr_count; 975 base += rr.rr_count; 976 offset += rr.rr_count; 977 if (mi->mi_io_kstats) { 978 mutex_enter(&mi->mi_lock); 979 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 980 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 981 rr.rr_count; 982 mutex_exit(&mi->mi_lock); 983 } 984 lwp_stat_update(LWP_STAT_INBLK, 1); 985 } 986 } 987 } while (!error && count && rr.rr_count == tsize); 988 989 *residp = count; 990 991 if (!error) { 992 /* 993 * Since no error occurred, we have the current 994 * attributes and we need to do a cache check and then 995 * potentially update the cached attributes. We can't 996 * use the normal attribute check and cache mechanisms 997 * because they might cause a cache flush which would 998 * deadlock. Instead, we just check the cache to see 999 * if the attributes have changed. If it is, then we 1000 * just mark the attributes as out of date. The next 1001 * time that the attributes are checked, they will be 1002 * out of date, new attributes will be fetched, and 1003 * the page cache will be flushed. If the attributes 1004 * weren't changed, then we just update the cached 1005 * attributes with these attributes. 1006 */ 1007 /* 1008 * If NFS_ACL is supported on the server, then the 1009 * attributes returned by server may have minimal 1010 * permissions sometimes denying access to users having 1011 * proper access. To get the proper attributes, mark 1012 * the attributes as expired so that they will be 1013 * regotten via the NFS_ACL GETATTR2 procedure. 
1014 */ 1015 error = nattr_to_vattr(vp, &rr.rr_attr, &va); 1016 mutex_enter(&rp->r_statelock); 1017 if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) || 1018 (mi->mi_flags & MI_ACL)) { 1019 mutex_exit(&rp->r_statelock); 1020 PURGE_ATTRCACHE(vp); 1021 } else { 1022 if (rp->r_mtime <= t) { 1023 nfs_attrcache_va(vp, &va); 1024 } 1025 mutex_exit(&rp->r_statelock); 1026 } 1027 } 1028 1029 return (error); 1030 } 1031 1032 /* ARGSUSED */ 1033 static int 1034 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 1035 caller_context_t *ct) 1036 { 1037 1038 if (nfs_zone() != VTOMI(vp)->mi_zone) 1039 return (EIO); 1040 switch (cmd) { 1041 case _FIODIRECTIO: 1042 return (nfs_directio(vp, (int)arg, cr)); 1043 default: 1044 return (ENOTTY); 1045 } 1046 } 1047 1048 /* ARGSUSED */ 1049 static int 1050 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 1051 caller_context_t *ct) 1052 { 1053 int error; 1054 rnode_t *rp; 1055 1056 if (nfs_zone() != VTOMI(vp)->mi_zone) 1057 return (EIO); 1058 /* 1059 * If it has been specified that the return value will 1060 * just be used as a hint, and we are only being asked 1061 * for size, fsid or rdevid, then return the client's 1062 * notion of these values without checking to make sure 1063 * that the attribute cache is up to date. 1064 * The whole point is to avoid an over the wire GETATTR 1065 * call. 1066 */ 1067 rp = VTOR(vp); 1068 if (flags & ATTR_HINT) { 1069 if (vap->va_mask == 1070 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 1071 mutex_enter(&rp->r_statelock); 1072 if (vap->va_mask | AT_SIZE) 1073 vap->va_size = rp->r_size; 1074 if (vap->va_mask | AT_FSID) 1075 vap->va_fsid = rp->r_attr.va_fsid; 1076 if (vap->va_mask | AT_RDEV) 1077 vap->va_rdev = rp->r_attr.va_rdev; 1078 mutex_exit(&rp->r_statelock); 1079 return (0); 1080 } 1081 } 1082 1083 /* 1084 * Only need to flush pages if asking for the mtime 1085 * and if there any dirty pages or any outstanding 1086 * asynchronous (write) requests for this file. 
1087 */ 1088 if (vap->va_mask & AT_MTIME) { 1089 if (vn_has_cached_data(vp) && 1090 ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) { 1091 mutex_enter(&rp->r_statelock); 1092 rp->r_gcount++; 1093 mutex_exit(&rp->r_statelock); 1094 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 1095 mutex_enter(&rp->r_statelock); 1096 if (error && (error == ENOSPC || error == EDQUOT)) { 1097 if (!rp->r_error) 1098 rp->r_error = error; 1099 } 1100 if (--rp->r_gcount == 0) 1101 cv_broadcast(&rp->r_cv); 1102 mutex_exit(&rp->r_statelock); 1103 } 1104 } 1105 1106 return (nfsgetattr(vp, vap, cr)); 1107 } 1108 1109 /*ARGSUSED4*/ 1110 static int 1111 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 1112 caller_context_t *ct) 1113 { 1114 int error; 1115 uint_t mask; 1116 struct vattr va; 1117 1118 mask = vap->va_mask; 1119 1120 if (mask & AT_NOSET) 1121 return (EINVAL); 1122 1123 if ((mask & AT_SIZE) && 1124 vap->va_type == VREG && 1125 vap->va_size > MAXOFF32_T) 1126 return (EFBIG); 1127 1128 if (nfs_zone() != VTOMI(vp)->mi_zone) 1129 return (EIO); 1130 1131 va.va_mask = AT_UID | AT_MODE; 1132 1133 error = nfsgetattr(vp, &va, cr); 1134 if (error) 1135 return (error); 1136 1137 error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx, 1138 vp); 1139 1140 if (error) 1141 return (error); 1142 1143 return (nfssetattr(vp, vap, flags, cr)); 1144 } 1145 1146 static int 1147 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr) 1148 { 1149 int error; 1150 uint_t mask; 1151 struct nfssaargs args; 1152 struct nfsattrstat ns; 1153 int douprintf; 1154 rnode_t *rp; 1155 struct vattr va; 1156 mode_t omode; 1157 mntinfo_t *mi; 1158 vsecattr_t *vsp; 1159 hrtime_t t; 1160 1161 mask = vap->va_mask; 1162 1163 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 1164 1165 rp = VTOR(vp); 1166 1167 /* 1168 * Only need to flush pages if there are any pages and 1169 * if the file is marked as dirty in some fashion. The 1170 * file must be flushed so that we can accurately 1171 * determine the size of the file and the cached data 1172 * after the SETATTR returns. A file is considered to 1173 * be dirty if it is either marked with RDIRTY, has 1174 * outstanding i/o's active, or is mmap'd. In this 1175 * last case, we can't tell whether there are dirty 1176 * pages, so we flush just to be sure. 1177 */ 1178 if (vn_has_cached_data(vp) && 1179 ((rp->r_flags & RDIRTY) || 1180 rp->r_count > 0 || 1181 rp->r_mapcnt > 0)) { 1182 ASSERT(vp->v_type != VCHR); 1183 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 1184 if (error && (error == ENOSPC || error == EDQUOT)) { 1185 mutex_enter(&rp->r_statelock); 1186 if (!rp->r_error) 1187 rp->r_error = error; 1188 mutex_exit(&rp->r_statelock); 1189 } 1190 } 1191 1192 /* 1193 * If the system call was utime(2) or utimes(2) and the 1194 * application did not specify the times, then set the 1195 * mtime nanosecond field to 1 billion. This will get 1196 * translated from 1 billion nanoseconds to 1 million 1197 * microseconds in the over the wire request. The 1198 * server will use 1 million in the microsecond field 1199 * to tell whether both the mtime and atime should be 1200 * set to the server's current time. 1201 * 1202 * This is an overload of the protocol and should be 1203 * documented in the NFS Version 2 protocol specification. 
1204 */ 1205 if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) { 1206 vap->va_mtime.tv_nsec = 1000000000; 1207 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) && 1208 NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1209 error = vattr_to_sattr(vap, &args.saa_sa); 1210 } else { 1211 /* 1212 * Use server times. vap time values will not be used. 1213 * To ensure no time overflow, make sure vap has 1214 * valid values, but retain the original values. 1215 */ 1216 timestruc_t mtime = vap->va_mtime; 1217 timestruc_t atime = vap->va_atime; 1218 time_t now; 1219 1220 now = gethrestime_sec(); 1221 if (NFS_TIME_T_OK(now)) { 1222 /* Just in case server does not know of this */ 1223 vap->va_mtime.tv_sec = now; 1224 vap->va_atime.tv_sec = now; 1225 } else { 1226 vap->va_mtime.tv_sec = 0; 1227 vap->va_atime.tv_sec = 0; 1228 } 1229 error = vattr_to_sattr(vap, &args.saa_sa); 1230 /* set vap times back on */ 1231 vap->va_mtime = mtime; 1232 vap->va_atime = atime; 1233 } 1234 } else { 1235 /* Either do not set times or use the client specified times */ 1236 error = vattr_to_sattr(vap, &args.saa_sa); 1237 } 1238 if (error) { 1239 /* req time field(s) overflow - return immediately */ 1240 return (error); 1241 } 1242 args.saa_fh = *VTOFH(vp); 1243 1244 va.va_mask = AT_MODE; 1245 error = nfsgetattr(vp, &va, cr); 1246 if (error) 1247 return (error); 1248 omode = va.va_mode; 1249 1250 mi = VTOMI(vp); 1251 1252 douprintf = 1; 1253 1254 t = gethrtime(); 1255 1256 error = rfs2call(mi, RFS_SETATTR, 1257 xdr_saargs, (caddr_t)&args, 1258 xdr_attrstat, (caddr_t)&ns, cr, 1259 &douprintf, &ns.ns_status, 0, NULL); 1260 1261 /* 1262 * Purge the access cache and ACL cache if changing either the 1263 * owner of the file, the group owner, or the mode. These may 1264 * change the access permissions of the file, so purge old 1265 * information and start over again. 1266 */ 1267 if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) { 1268 (void) nfs_access_purge_rp(rp); 1269 if (rp->r_secattr != NULL) { 1270 mutex_enter(&rp->r_statelock); 1271 vsp = rp->r_secattr; 1272 rp->r_secattr = NULL; 1273 mutex_exit(&rp->r_statelock); 1274 if (vsp != NULL) 1275 nfs_acl_free(vsp); 1276 } 1277 } 1278 1279 if (!error) { 1280 error = geterrno(ns.ns_status); 1281 if (!error) { 1282 /* 1283 * If changing the size of the file, invalidate 1284 * any local cached data which is no longer part 1285 * of the file. We also possibly invalidate the 1286 * last page in the file. We could use 1287 * pvn_vpzero(), but this would mark the page as 1288 * modified and require it to be written back to 1289 * the server for no particularly good reason. 1290 * This way, if we access it, then we bring it 1291 * back in. A read should be cheaper than a 1292 * write. 1293 */ 1294 if (mask & AT_SIZE) { 1295 nfs_invalidate_pages(vp, 1296 (vap->va_size & PAGEMASK), cr); 1297 } 1298 (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr); 1299 /* 1300 * If NFS_ACL is supported on the server, then the 1301 * attributes returned by server may have minimal 1302 * permissions sometimes denying access to users having 1303 * proper access. To get the proper attributes, mark 1304 * the attributes as expired so that they will be 1305 * regotten via the NFS_ACL GETATTR2 procedure. 1306 */ 1307 if (mi->mi_flags & MI_ACL) { 1308 PURGE_ATTRCACHE(vp); 1309 } 1310 /* 1311 * This next check attempts to deal with NFS 1312 * servers which can not handle increasing 1313 * the size of the file via setattr. 
Most 1314 * of these servers do not return an error, 1315 * but do not change the size of the file. 1316 * Hence, this check and then attempt to set 1317 * the file size by writing 1 byte at the 1318 * offset of the end of the file that we need. 1319 */ 1320 if ((mask & AT_SIZE) && 1321 ns.ns_attr.na_size < (uint32_t)vap->va_size) { 1322 char zb = '\0'; 1323 1324 error = nfswrite(vp, &zb, 1325 vap->va_size - sizeof (zb), 1326 sizeof (zb), cr); 1327 } 1328 /* 1329 * Some servers will change the mode to clear the setuid 1330 * and setgid bits when changing the uid or gid. The 1331 * client needs to compensate appropriately. 1332 */ 1333 if (mask & (AT_UID | AT_GID)) { 1334 int terror; 1335 1336 va.va_mask = AT_MODE; 1337 terror = nfsgetattr(vp, &va, cr); 1338 if (!terror && 1339 (((mask & AT_MODE) && 1340 va.va_mode != vap->va_mode) || 1341 (!(mask & AT_MODE) && 1342 va.va_mode != omode))) { 1343 va.va_mask = AT_MODE; 1344 if (mask & AT_MODE) 1345 va.va_mode = vap->va_mode; 1346 else 1347 va.va_mode = omode; 1348 (void) nfssetattr(vp, &va, 0, cr); 1349 } 1350 } 1351 } else { 1352 PURGE_ATTRCACHE(vp); 1353 PURGE_STALE_FH(error, vp, cr); 1354 } 1355 } else { 1356 PURGE_ATTRCACHE(vp); 1357 } 1358 1359 return (error); 1360 } 1361 1362 static int 1363 nfs_accessx(void *vp, int mode, cred_t *cr) 1364 { 1365 ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone); 1366 return (nfs_access(vp, mode, 0, cr, NULL)); 1367 } 1368 1369 /* ARGSUSED */ 1370 static int 1371 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 1372 { 1373 struct vattr va; 1374 int error; 1375 mntinfo_t *mi; 1376 int shift = 0; 1377 1378 mi = VTOMI(vp); 1379 1380 if (nfs_zone() != mi->mi_zone) 1381 return (EIO); 1382 if (mi->mi_flags & MI_ACL) { 1383 error = acl_access2(vp, mode, flags, cr); 1384 if (mi->mi_flags & MI_ACL) 1385 return (error); 1386 } 1387 1388 va.va_mask = AT_MODE | AT_UID | AT_GID; 1389 error = nfsgetattr(vp, &va, cr); 1390 if (error) 1391 return (error); 1392 1393 /* 1394 * Disallow write attempts on read-only 1395 * file systems, unless the file is a 1396 * device node. 1397 */ 1398 if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp)) 1399 return (EROFS); 1400 1401 /* 1402 * Disallow attempts to access mandatory lock files. 1403 */ 1404 if ((mode & (VWRITE | VREAD | VEXEC)) && 1405 MANDLOCK(vp, va.va_mode)) 1406 return (EACCES); 1407 1408 /* 1409 * Access check is based on only 1410 * one of owner, group, public. 1411 * If not owner, then check group. 1412 * If not a member of the group, 1413 * then check public access. 1414 */ 1415 if (crgetuid(cr) != va.va_uid) { 1416 shift += 3; 1417 if (!groupmember(va.va_gid, cr)) 1418 shift += 3; 1419 } 1420 found: 1421 mode &= ~(va.va_mode << shift); 1422 if (mode == 0) 1423 return (0); 1424 1425 return (secpolicy_vnode_access(cr, vp, va.va_uid, mode)); 1426 } 1427 1428 static int nfs_do_symlink_cache = 1; 1429 1430 /* ARGSUSED */ 1431 static int 1432 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 1433 { 1434 int error; 1435 struct nfsrdlnres rl; 1436 rnode_t *rp; 1437 int douprintf; 1438 failinfo_t fi; 1439 1440 /* 1441 * We want to be consistent with UFS semantics so we will return 1442 * EINVAL instead of ENXIO. This violates the XNFS spec and 1443 * the RFC 1094, which are wrong any way. BUGID 1138002. 
1444 */ 1445 if (vp->v_type != VLNK) 1446 return (EINVAL); 1447 1448 if (nfs_zone() != VTOMI(vp)->mi_zone) 1449 return (EIO); 1450 1451 rp = VTOR(vp); 1452 if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) { 1453 error = nfs_validate_caches(vp, cr); 1454 if (error) 1455 return (error); 1456 mutex_enter(&rp->r_statelock); 1457 if (rp->r_symlink.contents != NULL) { 1458 error = uiomove(rp->r_symlink.contents, 1459 rp->r_symlink.len, UIO_READ, uiop); 1460 mutex_exit(&rp->r_statelock); 1461 return (error); 1462 } 1463 mutex_exit(&rp->r_statelock); 1464 } 1465 1466 1467 rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP); 1468 1469 fi.vp = vp; 1470 fi.fhp = NULL; /* no need to update, filehandle not copied */ 1471 fi.copyproc = nfscopyfh; 1472 fi.lookupproc = nfslookup; 1473 fi.xattrdirproc = acl_getxattrdir2; 1474 1475 douprintf = 1; 1476 1477 error = rfs2call(VTOMI(vp), RFS_READLINK, 1478 xdr_readlink, (caddr_t)VTOFH(vp), 1479 xdr_rdlnres, (caddr_t)&rl, cr, 1480 &douprintf, &rl.rl_status, 0, &fi); 1481 1482 if (error) { 1483 1484 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1485 return (error); 1486 } 1487 1488 error = geterrno(rl.rl_status); 1489 if (!error) { 1490 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop); 1491 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) { 1492 mutex_enter(&rp->r_statelock); 1493 if (rp->r_symlink.contents == NULL) { 1494 rp->r_symlink.contents = rl.rl_data; 1495 rp->r_symlink.len = (int)rl.rl_count; 1496 rp->r_symlink.size = NFS_MAXPATHLEN; 1497 mutex_exit(&rp->r_statelock); 1498 } else { 1499 mutex_exit(&rp->r_statelock); 1500 1501 kmem_free((void *)rl.rl_data, 1502 NFS_MAXPATHLEN); 1503 } 1504 } else { 1505 1506 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1507 } 1508 } else { 1509 PURGE_STALE_FH(error, vp, cr); 1510 1511 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN); 1512 } 1513 1514 /* 1515 * Conform to UFS semantics (see comment above) 1516 */ 1517 return (error == ENXIO ? EINVAL : error); 1518 } 1519 1520 /* 1521 * Flush local dirty pages to stable storage on the server. 1522 * 1523 * If FNODSYNC is specified, then there is nothing to do because 1524 * metadata changes are not cached on the client before being 1525 * sent to the server. 1526 */ 1527 /* ARGSUSED */ 1528 static int 1529 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 1530 { 1531 int error; 1532 1533 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 1534 return (0); 1535 1536 if (nfs_zone() != VTOMI(vp)->mi_zone) 1537 return (EIO); 1538 1539 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 1540 if (!error) 1541 error = VTOR(vp)->r_error; 1542 return (error); 1543 } 1544 1545 1546 /* 1547 * Weirdness: if the file was removed or the target of a rename 1548 * operation while it was open, it got renamed instead. Here we 1549 * remove the renamed file. 1550 */ 1551 /* ARGSUSED */ 1552 static void 1553 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 1554 { 1555 rnode_t *rp; 1556 1557 ASSERT(vp != DNLC_NO_VNODE); 1558 1559 /* 1560 * If this is coming from the wrong zone, we let someone in the right 1561 * zone take care of it asynchronously. We can get here due to 1562 * VN_RELE() being called from pageout() or fsflush(). This call may 1563 * potentially turn into an expensive no-op if, for instance, v_count 1564 * gets incremented in the meantime, but it's still correct. 
1565 */ 1566 if (nfs_zone() != VTOMI(vp)->mi_zone) { 1567 nfs_async_inactive(vp, cr, nfs_inactive); 1568 return; 1569 } 1570 1571 rp = VTOR(vp); 1572 redo: 1573 if (rp->r_unldvp != NULL) { 1574 /* 1575 * Save the vnode pointer for the directory where the 1576 * unlinked-open file got renamed, then set it to NULL 1577 * to prevent another thread from getting here before 1578 * we're done with the remove. While we have the 1579 * statelock, make local copies of the pertinent rnode 1580 * fields. If we weren't to do this in an atomic way, the 1581 * the unl* fields could become inconsistent with respect 1582 * to each other due to a race condition between this 1583 * code and nfs_remove(). See bug report 1034328. 1584 */ 1585 mutex_enter(&rp->r_statelock); 1586 if (rp->r_unldvp != NULL) { 1587 vnode_t *unldvp; 1588 char *unlname; 1589 cred_t *unlcred; 1590 struct nfsdiropargs da; 1591 enum nfsstat status; 1592 int douprintf; 1593 int error; 1594 1595 unldvp = rp->r_unldvp; 1596 rp->r_unldvp = NULL; 1597 unlname = rp->r_unlname; 1598 rp->r_unlname = NULL; 1599 unlcred = rp->r_unlcred; 1600 rp->r_unlcred = NULL; 1601 mutex_exit(&rp->r_statelock); 1602 1603 /* 1604 * If there are any dirty pages left, then flush 1605 * them. This is unfortunate because they just 1606 * may get thrown away during the remove operation, 1607 * but we have to do this for correctness. 1608 */ 1609 if (vn_has_cached_data(vp) && 1610 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 1611 ASSERT(vp->v_type != VCHR); 1612 error = nfs_putpage(vp, (offset_t)0, 0, 0, 1613 cr, ct); 1614 if (error) { 1615 mutex_enter(&rp->r_statelock); 1616 if (!rp->r_error) 1617 rp->r_error = error; 1618 mutex_exit(&rp->r_statelock); 1619 } 1620 } 1621 1622 /* 1623 * Do the remove operation on the renamed file 1624 */ 1625 setdiropargs(&da, unlname, unldvp); 1626 1627 douprintf = 1; 1628 1629 (void) rfs2call(VTOMI(unldvp), RFS_REMOVE, 1630 xdr_diropargs, (caddr_t)&da, 1631 xdr_enum, (caddr_t)&status, unlcred, 1632 &douprintf, &status, 0, NULL); 1633 1634 if (HAVE_RDDIR_CACHE(VTOR(unldvp))) 1635 nfs_purge_rddir_cache(unldvp); 1636 PURGE_ATTRCACHE(unldvp); 1637 1638 /* 1639 * Release stuff held for the remove 1640 */ 1641 VN_RELE(unldvp); 1642 kmem_free(unlname, MAXNAMELEN); 1643 crfree(unlcred); 1644 goto redo; 1645 } 1646 mutex_exit(&rp->r_statelock); 1647 } 1648 1649 rp_addfree(rp, cr); 1650 } 1651 1652 /* 1653 * Remote file system operations having to do with directory manipulation. 1654 */ 1655 1656 /* ARGSUSED */ 1657 static int 1658 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1659 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1660 int *direntflags, pathname_t *realpnp) 1661 { 1662 int error; 1663 vnode_t *vp; 1664 vnode_t *avp = NULL; 1665 rnode_t *drp; 1666 1667 if (nfs_zone() != VTOMI(dvp)->mi_zone) 1668 return (EPERM); 1669 1670 drp = VTOR(dvp); 1671 1672 /* 1673 * Are we looking up extended attributes? If so, "dvp" is 1674 * the file or directory for which we want attributes, and 1675 * we need a lookup of the hidden attribute directory 1676 * before we lookup the rest of the path. 
1677 */ 1678 if (flags & LOOKUP_XATTR) { 1679 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0); 1680 mntinfo_t *mi; 1681 1682 mi = VTOMI(dvp); 1683 if (!(mi->mi_flags & MI_EXTATTR)) 1684 return (EINVAL); 1685 1686 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) 1687 return (EINTR); 1688 1689 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr); 1690 if (avp == NULL) 1691 error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0); 1692 else 1693 error = 0; 1694 1695 nfs_rw_exit(&drp->r_rwlock); 1696 1697 if (error) { 1698 if (mi->mi_flags & MI_EXTATTR) 1699 return (error); 1700 return (EINVAL); 1701 } 1702 dvp = avp; 1703 drp = VTOR(dvp); 1704 } 1705 1706 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) { 1707 error = EINTR; 1708 goto out; 1709 } 1710 1711 error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0); 1712 1713 nfs_rw_exit(&drp->r_rwlock); 1714 1715 /* 1716 * If vnode is a device, create special vnode. 1717 */ 1718 if (!error && IS_DEVVP(*vpp)) { 1719 vp = *vpp; 1720 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 1721 VN_RELE(vp); 1722 } 1723 1724 out: 1725 if (avp != NULL) 1726 VN_RELE(avp); 1727 1728 return (error); 1729 } 1730 1731 static int nfs_lookup_neg_cache = 1; 1732 1733 #ifdef DEBUG 1734 static int nfs_lookup_dnlc_hits = 0; 1735 static int nfs_lookup_dnlc_misses = 0; 1736 static int nfs_lookup_dnlc_neg_hits = 0; 1737 static int nfs_lookup_dnlc_disappears = 0; 1738 static int nfs_lookup_dnlc_lookups = 0; 1739 #endif 1740 1741 /* ARGSUSED */ 1742 int 1743 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1744 int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags) 1745 { 1746 int error; 1747 1748 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1749 1750 /* 1751 * If lookup is for "", just return dvp. Don't need 1752 * to send it over the wire, look it up in the dnlc, 1753 * or perform any access checks. 1754 */ 1755 if (*nm == '\0') { 1756 VN_HOLD(dvp); 1757 *vpp = dvp; 1758 return (0); 1759 } 1760 1761 /* 1762 * Can't do lookups in non-directories. 1763 */ 1764 if (dvp->v_type != VDIR) 1765 return (ENOTDIR); 1766 1767 /* 1768 * If we're called with RFSCALL_SOFT, it's important that 1769 * the only rfscall is one we make directly; if we permit 1770 * an access call because we're looking up "." or validating 1771 * a dnlc hit, we'll deadlock because that rfscall will not 1772 * have the RFSCALL_SOFT set. 1773 */ 1774 if (rfscall_flags & RFSCALL_SOFT) 1775 goto callit; 1776 1777 /* 1778 * If lookup is for ".", just return dvp. Don't need 1779 * to send it over the wire or look it up in the dnlc, 1780 * just need to check access. 1781 */ 1782 if (strcmp(nm, ".") == 0) { 1783 error = nfs_access(dvp, VEXEC, 0, cr, NULL); 1784 if (error) 1785 return (error); 1786 VN_HOLD(dvp); 1787 *vpp = dvp; 1788 return (0); 1789 } 1790 1791 /* 1792 * Lookup this name in the DNLC. If there was a valid entry, 1793 * then return the results of the lookup. 1794 */ 1795 error = nfslookup_dnlc(dvp, nm, vpp, cr); 1796 if (error || *vpp != NULL) 1797 return (error); 1798 1799 callit: 1800 error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags); 1801 1802 return (error); 1803 } 1804 1805 static int 1806 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 1807 { 1808 int error; 1809 vnode_t *vp; 1810 1811 ASSERT(*nm != '\0'); 1812 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1813 1814 /* 1815 * Lookup this name in the DNLC. If successful, then validate 1816 * the caches and then recheck the DNLC. 
The DNLC is rechecked 1817 * just in case this entry got invalidated during the call 1818 * to nfs_validate_caches. 1819 * 1820 * An assumption is being made that it is safe to say that a 1821 * file exists which may not on the server. Any operations to 1822 * the server will fail with ESTALE. 1823 */ 1824 #ifdef DEBUG 1825 nfs_lookup_dnlc_lookups++; 1826 #endif 1827 vp = dnlc_lookup(dvp, nm); 1828 if (vp != NULL) { 1829 VN_RELE(vp); 1830 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) { 1831 PURGE_ATTRCACHE(dvp); 1832 } 1833 error = nfs_validate_caches(dvp, cr); 1834 if (error) 1835 return (error); 1836 vp = dnlc_lookup(dvp, nm); 1837 if (vp != NULL) { 1838 error = nfs_access(dvp, VEXEC, 0, cr, NULL); 1839 if (error) { 1840 VN_RELE(vp); 1841 return (error); 1842 } 1843 if (vp == DNLC_NO_VNODE) { 1844 VN_RELE(vp); 1845 #ifdef DEBUG 1846 nfs_lookup_dnlc_neg_hits++; 1847 #endif 1848 return (ENOENT); 1849 } 1850 *vpp = vp; 1851 #ifdef DEBUG 1852 nfs_lookup_dnlc_hits++; 1853 #endif 1854 return (0); 1855 } 1856 #ifdef DEBUG 1857 nfs_lookup_dnlc_disappears++; 1858 #endif 1859 } 1860 #ifdef DEBUG 1861 else 1862 nfs_lookup_dnlc_misses++; 1863 #endif 1864 1865 *vpp = NULL; 1866 1867 return (0); 1868 } 1869 1870 static int 1871 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, 1872 int rfscall_flags) 1873 { 1874 int error; 1875 struct nfsdiropargs da; 1876 struct nfsdiropres dr; 1877 int douprintf; 1878 failinfo_t fi; 1879 hrtime_t t; 1880 1881 ASSERT(*nm != '\0'); 1882 ASSERT(dvp->v_type == VDIR); 1883 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1884 1885 setdiropargs(&da, nm, dvp); 1886 1887 fi.vp = dvp; 1888 fi.fhp = NULL; /* no need to update, filehandle not copied */ 1889 fi.copyproc = nfscopyfh; 1890 fi.lookupproc = nfslookup; 1891 fi.xattrdirproc = acl_getxattrdir2; 1892 1893 douprintf = 1; 1894 1895 t = gethrtime(); 1896 1897 error = rfs2call(VTOMI(dvp), RFS_LOOKUP, 1898 xdr_diropargs, (caddr_t)&da, 1899 xdr_diropres, (caddr_t)&dr, cr, 1900 &douprintf, &dr.dr_status, rfscall_flags, &fi); 1901 1902 if (!error) { 1903 error = geterrno(dr.dr_status); 1904 if (!error) { 1905 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 1906 dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); 1907 /* 1908 * If NFS_ACL is supported on the server, then the 1909 * attributes returned by server may have minimal 1910 * permissions sometimes denying access to users having 1911 * proper access. To get the proper attributes, mark 1912 * the attributes as expired so that they will be 1913 * regotten via the NFS_ACL GETATTR2 procedure. 
1914 */ 1915 if (VTOMI(*vpp)->mi_flags & MI_ACL) { 1916 PURGE_ATTRCACHE(*vpp); 1917 } 1918 if (!(rfscall_flags & RFSCALL_SOFT)) 1919 dnlc_update(dvp, nm, *vpp); 1920 } else { 1921 PURGE_STALE_FH(error, dvp, cr); 1922 if (error == ENOENT && nfs_lookup_neg_cache) 1923 dnlc_enter(dvp, nm, DNLC_NO_VNODE); 1924 } 1925 } 1926 1927 return (error); 1928 } 1929 1930 /* ARGSUSED */ 1931 static int 1932 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 1933 int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct, 1934 vsecattr_t *vsecp) 1935 { 1936 int error; 1937 struct nfscreatargs args; 1938 struct nfsdiropres dr; 1939 int douprintf; 1940 vnode_t *vp; 1941 rnode_t *rp; 1942 struct vattr vattr; 1943 rnode_t *drp; 1944 vnode_t *tempvp; 1945 hrtime_t t; 1946 1947 drp = VTOR(dvp); 1948 1949 if (nfs_zone() != VTOMI(dvp)->mi_zone) 1950 return (EPERM); 1951 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 1952 return (EINTR); 1953 1954 /* 1955 * We make a copy of the attributes because the caller does not 1956 * expect us to change what va points to. 1957 */ 1958 vattr = *va; 1959 1960 /* 1961 * If the pathname is "", just use dvp. Don't need 1962 * to send it over the wire, look it up in the dnlc, 1963 * or perform any access checks. 1964 */ 1965 if (*nm == '\0') { 1966 error = 0; 1967 VN_HOLD(dvp); 1968 vp = dvp; 1969 /* 1970 * If the pathname is ".", just use dvp. Don't need 1971 * to send it over the wire or look it up in the dnlc, 1972 * just need to check access. 1973 */ 1974 } else if (strcmp(nm, ".") == 0) { 1975 error = nfs_access(dvp, VEXEC, 0, cr, ct); 1976 if (error) { 1977 nfs_rw_exit(&drp->r_rwlock); 1978 return (error); 1979 } 1980 VN_HOLD(dvp); 1981 vp = dvp; 1982 /* 1983 * We need to go over the wire, just to be sure whether the 1984 * file exists or not. Using the DNLC can be dangerous in 1985 * this case when making a decision regarding existence. 1986 */ 1987 } else { 1988 error = nfslookup_otw(dvp, nm, &vp, cr, 0); 1989 } 1990 if (!error) { 1991 if (exclusive == EXCL) 1992 error = EEXIST; 1993 else if (vp->v_type == VDIR && (mode & VWRITE)) 1994 error = EISDIR; 1995 else { 1996 /* 1997 * If vnode is a device, create special vnode. 1998 */ 1999 if (IS_DEVVP(vp)) { 2000 tempvp = vp; 2001 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2002 VN_RELE(tempvp); 2003 } 2004 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 2005 if ((vattr.va_mask & AT_SIZE) && 2006 vp->v_type == VREG) { 2007 vattr.va_mask = AT_SIZE; 2008 error = nfssetattr(vp, &vattr, 0, cr); 2009 } 2010 } 2011 } 2012 nfs_rw_exit(&drp->r_rwlock); 2013 if (error) { 2014 VN_RELE(vp); 2015 } else { 2016 /* 2017 * existing file got truncated, notify. 2018 */ 2019 vnevent_create(vp, ct); 2020 *vpp = vp; 2021 } 2022 return (error); 2023 } 2024 2025 ASSERT(vattr.va_mask & AT_TYPE); 2026 if (vattr.va_type == VREG) { 2027 ASSERT(vattr.va_mask & AT_MODE); 2028 if (MANDMODE(vattr.va_mode)) { 2029 nfs_rw_exit(&drp->r_rwlock); 2030 return (EACCES); 2031 } 2032 } 2033 2034 dnlc_remove(dvp, nm); 2035 2036 setdiropargs(&args.ca_da, nm, dvp); 2037 2038 /* 2039 * Decide what the group-id of the created file should be. 2040 * Set it in attribute list as advisory...then do a setattr 2041 * if the server didn't get it right the first time. 
2042 */ 2043 error = setdirgid(dvp, &vattr.va_gid, cr); 2044 if (error) { 2045 nfs_rw_exit(&drp->r_rwlock); 2046 return (error); 2047 } 2048 vattr.va_mask |= AT_GID; 2049 2050 /* 2051 * This is a completely gross hack to make mknod 2052 * work over the wire until we can whack the protocol 2053 */ 2054 #define IFCHR 0020000 /* character special */ 2055 #define IFBLK 0060000 /* block special */ 2056 #define IFSOCK 0140000 /* socket */ 2057 2058 /* 2059 * dev_t is uint_t in 5.x and short in 4.x. 4.x 2060 * supports 8 bit majors while 5.x supports 14 bit majors. 5.x supports 18 2061 * bits in the minor number where 4.x supports 8 bits. If the 5.x 2062 * minor/major numbers are <= 8 bits long, compress the device 2063 * number before sending it. Otherwise, the 4.x server will not 2064 * create the device with the correct device number and nothing can be 2065 * done about this. 2066 */ 2067 if (vattr.va_type == VCHR || vattr.va_type == VBLK) { 2068 dev_t d = vattr.va_rdev; 2069 dev32_t dev32; 2070 2071 if (vattr.va_type == VCHR) 2072 vattr.va_mode |= IFCHR; 2073 else 2074 vattr.va_mode |= IFBLK; 2075 2076 (void) cmpldev(&dev32, d); 2077 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN)) 2078 vattr.va_size = (u_offset_t)dev32; 2079 else 2080 vattr.va_size = (u_offset_t)nfsv2_cmpdev(d); 2081 2082 vattr.va_mask |= AT_MODE|AT_SIZE; 2083 } else if (vattr.va_type == VFIFO) { 2084 vattr.va_mode |= IFCHR; /* xtra kludge for namedpipe */ 2085 vattr.va_size = (u_offset_t)NFS_FIFO_DEV; /* blech */ 2086 vattr.va_mask |= AT_MODE|AT_SIZE; 2087 } else if (vattr.va_type == VSOCK) { 2088 vattr.va_mode |= IFSOCK; 2089 /* 2090 * To avoid triggering bugs in the servers, set AT_SIZE 2091 * (all other RFS_CREATE calls set this). 2092 */ 2093 vattr.va_size = 0; 2094 vattr.va_mask |= AT_MODE|AT_SIZE; 2095 } 2096 2097 args.ca_sa = &args.ca_sa_buf; 2098 error = vattr_to_sattr(&vattr, args.ca_sa); 2099 if (error) { 2100 /* req time field(s) overflow - return immediately */ 2101 nfs_rw_exit(&drp->r_rwlock); 2102 return (error); 2103 } 2104 2105 douprintf = 1; 2106 2107 t = gethrtime(); 2108 2109 error = rfs2call(VTOMI(dvp), RFS_CREATE, 2110 xdr_creatargs, (caddr_t)&args, 2111 xdr_diropres, (caddr_t)&dr, cr, 2112 &douprintf, &dr.dr_status, 0, NULL); 2113 2114 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2115 2116 if (!error) { 2117 error = geterrno(dr.dr_status); 2118 if (!error) { 2119 if (HAVE_RDDIR_CACHE(drp)) 2120 nfs_purge_rddir_cache(dvp); 2121 vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2122 dvp->v_vfsp, t, cr, NULL, NULL); 2123 /* 2124 * If NFS_ACL is supported on the server, then the 2125 * attributes returned by the server may have minimal 2126 * permissions sometimes denying access to users having 2127 * proper access. To get the proper attributes, mark 2128 * the attributes as expired so that they will be 2129 * refetched via the NFS_ACL GETATTR2 procedure. 2130 */ 2131 if (VTOMI(vp)->mi_flags & MI_ACL) { 2132 PURGE_ATTRCACHE(vp); 2133 } 2134 dnlc_update(dvp, nm, vp); 2135 rp = VTOR(vp); 2136 if (vattr.va_size == 0) { 2137 mutex_enter(&rp->r_statelock); 2138 rp->r_size = 0; 2139 mutex_exit(&rp->r_statelock); 2140 if (vn_has_cached_data(vp)) { 2141 ASSERT(vp->v_type != VCHR); 2142 nfs_invalidate_pages(vp, 2143 (u_offset_t)0, cr); 2144 } 2145 } 2146 2147 /* 2148 * Make sure the gid was set correctly. 2149 * If not, try to set it (but don't lose 2150 * any sleep over it).
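* The nfssetattr() return value below is deliberately discarded; a gid mismatch is not treated as a create failure.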
2151 */ 2152 if (vattr.va_gid != rp->r_attr.va_gid) { 2153 vattr.va_mask = AT_GID; 2154 (void) nfssetattr(vp, &vattr, 0, cr); 2155 } 2156 2157 /* 2158 * If vnode is a device create special vnode 2159 */ 2160 if (IS_DEVVP(vp)) { 2161 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2162 VN_RELE(vp); 2163 } else 2164 *vpp = vp; 2165 } else { 2166 PURGE_STALE_FH(error, dvp, cr); 2167 } 2168 } 2169 2170 nfs_rw_exit(&drp->r_rwlock); 2171 2172 return (error); 2173 } 2174 2175 /* 2176 * Weirdness: if the vnode to be removed is open 2177 * we rename it instead of removing it and nfs_inactive 2178 * will remove the new name. 2179 */ 2180 /* ARGSUSED */ 2181 static int 2182 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 2183 { 2184 int error; 2185 struct nfsdiropargs da; 2186 enum nfsstat status; 2187 vnode_t *vp; 2188 char *tmpname; 2189 int douprintf; 2190 rnode_t *rp; 2191 rnode_t *drp; 2192 2193 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2194 return (EPERM); 2195 drp = VTOR(dvp); 2196 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2197 return (EINTR); 2198 2199 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2200 if (error) { 2201 nfs_rw_exit(&drp->r_rwlock); 2202 return (error); 2203 } 2204 2205 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2206 VN_RELE(vp); 2207 nfs_rw_exit(&drp->r_rwlock); 2208 return (EPERM); 2209 } 2210 2211 /* 2212 * First just remove the entry from the name cache, as it 2213 * is most likely the only entry for this vp. 2214 */ 2215 dnlc_remove(dvp, nm); 2216 2217 /* 2218 * If the file has a v_count > 1 then there may be more than one 2219 * entry in the name cache due multiple links or an open file, 2220 * but we don't have the real reference count so flush all 2221 * possible entries. 2222 */ 2223 if (vp->v_count > 1) 2224 dnlc_purge_vp(vp); 2225 2226 /* 2227 * Now we have the real reference count on the vnode 2228 */ 2229 rp = VTOR(vp); 2230 mutex_enter(&rp->r_statelock); 2231 if (vp->v_count > 1 && 2232 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2233 mutex_exit(&rp->r_statelock); 2234 tmpname = newname(); 2235 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct); 2236 if (error) 2237 kmem_free(tmpname, MAXNAMELEN); 2238 else { 2239 mutex_enter(&rp->r_statelock); 2240 if (rp->r_unldvp == NULL) { 2241 VN_HOLD(dvp); 2242 rp->r_unldvp = dvp; 2243 if (rp->r_unlcred != NULL) 2244 crfree(rp->r_unlcred); 2245 crhold(cr); 2246 rp->r_unlcred = cr; 2247 rp->r_unlname = tmpname; 2248 } else { 2249 kmem_free(rp->r_unlname, MAXNAMELEN); 2250 rp->r_unlname = tmpname; 2251 } 2252 mutex_exit(&rp->r_statelock); 2253 } 2254 } else { 2255 mutex_exit(&rp->r_statelock); 2256 /* 2257 * We need to flush any dirty pages which happen to 2258 * be hanging around before removing the file. This 2259 * shouldn't happen very often and mostly on file 2260 * systems mounted "nocto". 
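* ("nocto" disables close-to-open consistency, so dirty pages from an earlier write may still be cached when the remove is issued.)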
2261 */ 2262 if (vn_has_cached_data(vp) && 2263 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2264 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct); 2265 if (error && (error == ENOSPC || error == EDQUOT)) { 2266 mutex_enter(&rp->r_statelock); 2267 if (!rp->r_error) 2268 rp->r_error = error; 2269 mutex_exit(&rp->r_statelock); 2270 } 2271 } 2272 2273 setdiropargs(&da, nm, dvp); 2274 2275 douprintf = 1; 2276 2277 error = rfs2call(VTOMI(dvp), RFS_REMOVE, 2278 xdr_diropargs, (caddr_t)&da, 2279 xdr_enum, (caddr_t)&status, cr, 2280 &douprintf, &status, 0, NULL); 2281 2282 /* 2283 * The xattr dir may be gone after last attr is removed, 2284 * so flush it from dnlc. 2285 */ 2286 if (dvp->v_flag & V_XATTRDIR) 2287 dnlc_purge_vp(dvp); 2288 2289 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2290 PURGE_ATTRCACHE(vp); /* link count changed */ 2291 2292 if (!error) { 2293 error = geterrno(status); 2294 if (!error) { 2295 if (HAVE_RDDIR_CACHE(drp)) 2296 nfs_purge_rddir_cache(dvp); 2297 } else { 2298 PURGE_STALE_FH(error, dvp, cr); 2299 } 2300 } 2301 } 2302 2303 if (error == 0) { 2304 vnevent_remove(vp, dvp, nm, ct); 2305 } 2306 VN_RELE(vp); 2307 2308 nfs_rw_exit(&drp->r_rwlock); 2309 2310 return (error); 2311 } 2312 2313 /* ARGSUSED */ 2314 static int 2315 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 2316 caller_context_t *ct, int flags) 2317 { 2318 int error; 2319 struct nfslinkargs args; 2320 enum nfsstat status; 2321 vnode_t *realvp; 2322 int douprintf; 2323 rnode_t *tdrp; 2324 2325 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2326 return (EPERM); 2327 if (VOP_REALVP(svp, &realvp, ct) == 0) 2328 svp = realvp; 2329 2330 args.la_from = VTOFH(svp); 2331 setdiropargs(&args.la_to, tnm, tdvp); 2332 2333 tdrp = VTOR(tdvp); 2334 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2335 return (EINTR); 2336 2337 dnlc_remove(tdvp, tnm); 2338 2339 douprintf = 1; 2340 2341 error = rfs2call(VTOMI(svp), RFS_LINK, 2342 xdr_linkargs, (caddr_t)&args, 2343 xdr_enum, (caddr_t)&status, cr, 2344 &douprintf, &status, 0, NULL); 2345 2346 PURGE_ATTRCACHE(tdvp); /* mod time changed */ 2347 PURGE_ATTRCACHE(svp); /* link count changed */ 2348 2349 if (!error) { 2350 error = geterrno(status); 2351 if (!error) { 2352 if (HAVE_RDDIR_CACHE(tdrp)) 2353 nfs_purge_rddir_cache(tdvp); 2354 } 2355 } 2356 2357 nfs_rw_exit(&tdrp->r_rwlock); 2358 2359 if (!error) { 2360 /* 2361 * Notify the source file of this link operation. 2362 */ 2363 vnevent_link(svp, ct); 2364 } 2365 return (error); 2366 } 2367 2368 /* ARGSUSED */ 2369 static int 2370 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 2371 caller_context_t *ct, int flags) 2372 { 2373 vnode_t *realvp; 2374 2375 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2376 return (EPERM); 2377 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 2378 ndvp = realvp; 2379 2380 return (nfsrename(odvp, onm, ndvp, nnm, cr, ct)); 2381 } 2382 2383 /* 2384 * nfsrename does the real work of renaming in NFS Version 2. 
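* The two directory rnodes are write-locked in address order below so that concurrent renames in opposite directions cannot deadlock against each other.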
2385 */ 2386 static int 2387 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 2388 caller_context_t *ct) 2389 { 2390 int error; 2391 enum nfsstat status; 2392 struct nfsrnmargs args; 2393 int douprintf; 2394 vnode_t *nvp = NULL; 2395 vnode_t *ovp = NULL; 2396 char *tmpname; 2397 rnode_t *rp; 2398 rnode_t *odrp; 2399 rnode_t *ndrp; 2400 2401 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2402 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2403 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2404 return (EINVAL); 2405 2406 odrp = VTOR(odvp); 2407 ndrp = VTOR(ndvp); 2408 if ((intptr_t)odrp < (intptr_t)ndrp) { 2409 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2410 return (EINTR); 2411 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2412 nfs_rw_exit(&odrp->r_rwlock); 2413 return (EINTR); 2414 } 2415 } else { 2416 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2417 return (EINTR); 2418 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2419 nfs_rw_exit(&ndrp->r_rwlock); 2420 return (EINTR); 2421 } 2422 } 2423 2424 /* 2425 * Lookup the target file. If it exists, it needs to be 2426 * checked to see whether it is a mount point and whether 2427 * it is active (open). 2428 */ 2429 error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 2430 if (!error) { 2431 /* 2432 * If this file has been mounted on, then just 2433 * return busy because renaming to it would remove 2434 * the mounted file system from the name space. 2435 */ 2436 if (vn_mountedvfs(nvp) != NULL) { 2437 VN_RELE(nvp); 2438 nfs_rw_exit(&odrp->r_rwlock); 2439 nfs_rw_exit(&ndrp->r_rwlock); 2440 return (EBUSY); 2441 } 2442 2443 /* 2444 * Purge the name cache of all references to this vnode 2445 * so that we can check the reference count to infer 2446 * whether it is active or not. 2447 */ 2448 /* 2449 * First just remove the entry from the name cache, as it 2450 * is most likely the only entry for this vp. 2451 */ 2452 dnlc_remove(ndvp, nnm); 2453 /* 2454 * If the file has a v_count > 1 then there may be more 2455 * than one entry in the name cache due multiple links 2456 * or an open file, but we don't have the real reference 2457 * count so flush all possible entries. 2458 */ 2459 if (nvp->v_count > 1) 2460 dnlc_purge_vp(nvp); 2461 2462 /* 2463 * If the vnode is active and is not a directory, 2464 * arrange to rename it to a 2465 * temporary file so that it will continue to be 2466 * accessible. This implements the "unlink-open-file" 2467 * semantics for the target of a rename operation. 2468 * Before doing this though, make sure that the 2469 * source and target files are not already the same. 2470 */ 2471 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 2472 /* 2473 * Lookup the source name. 2474 */ 2475 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, 2476 cr, 0); 2477 2478 /* 2479 * The source name *should* already exist. 2480 */ 2481 if (error) { 2482 VN_RELE(nvp); 2483 nfs_rw_exit(&odrp->r_rwlock); 2484 nfs_rw_exit(&ndrp->r_rwlock); 2485 return (error); 2486 } 2487 2488 /* 2489 * Compare the two vnodes. If they are the same, 2490 * just release all held vnodes and return success. 2491 */ 2492 if (ovp == nvp) { 2493 VN_RELE(ovp); 2494 VN_RELE(nvp); 2495 nfs_rw_exit(&odrp->r_rwlock); 2496 nfs_rw_exit(&ndrp->r_rwlock); 2497 return (0); 2498 } 2499 2500 /* 2501 * Can't mix and match directories and non- 2502 * directories in rename operations. We already 2503 * know that the target is not a directory. 
If 2504 * the source is a directory, return an error. 2505 */ 2506 if (ovp->v_type == VDIR) { 2507 VN_RELE(ovp); 2508 VN_RELE(nvp); 2509 nfs_rw_exit(&odrp->r_rwlock); 2510 nfs_rw_exit(&ndrp->r_rwlock); 2511 return (ENOTDIR); 2512 } 2513 2514 /* 2515 * The target file exists, is not the same as 2516 * the source file, and is active. Link it 2517 * to a temporary filename to avoid having 2518 * the server removing the file completely. 2519 */ 2520 tmpname = newname(); 2521 error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0); 2522 if (error == EOPNOTSUPP) { 2523 error = nfs_rename(ndvp, nnm, ndvp, tmpname, 2524 cr, NULL, 0); 2525 } 2526 if (error) { 2527 kmem_free(tmpname, MAXNAMELEN); 2528 VN_RELE(ovp); 2529 VN_RELE(nvp); 2530 nfs_rw_exit(&odrp->r_rwlock); 2531 nfs_rw_exit(&ndrp->r_rwlock); 2532 return (error); 2533 } 2534 rp = VTOR(nvp); 2535 mutex_enter(&rp->r_statelock); 2536 if (rp->r_unldvp == NULL) { 2537 VN_HOLD(ndvp); 2538 rp->r_unldvp = ndvp; 2539 if (rp->r_unlcred != NULL) 2540 crfree(rp->r_unlcred); 2541 crhold(cr); 2542 rp->r_unlcred = cr; 2543 rp->r_unlname = tmpname; 2544 } else { 2545 kmem_free(rp->r_unlname, MAXNAMELEN); 2546 rp->r_unlname = tmpname; 2547 } 2548 mutex_exit(&rp->r_statelock); 2549 } 2550 } 2551 2552 if (ovp == NULL) { 2553 /* 2554 * When renaming directories to be a subdirectory of a 2555 * different parent, the dnlc entry for ".." will no 2556 * longer be valid, so it must be removed. 2557 * 2558 * We do a lookup here to determine whether we are renaming 2559 * a directory and we need to check if we are renaming 2560 * an unlinked file. This might have already been done 2561 * in previous code, so we check ovp == NULL to avoid 2562 * doing it twice. 2563 */ 2564 2565 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 2566 2567 /* 2568 * The source name *should* already exist. 2569 */ 2570 if (error) { 2571 nfs_rw_exit(&odrp->r_rwlock); 2572 nfs_rw_exit(&ndrp->r_rwlock); 2573 if (nvp) { 2574 VN_RELE(nvp); 2575 } 2576 return (error); 2577 } 2578 ASSERT(ovp != NULL); 2579 } 2580 2581 dnlc_remove(odvp, onm); 2582 dnlc_remove(ndvp, nnm); 2583 2584 setdiropargs(&args.rna_from, onm, odvp); 2585 setdiropargs(&args.rna_to, nnm, ndvp); 2586 2587 douprintf = 1; 2588 2589 error = rfs2call(VTOMI(odvp), RFS_RENAME, 2590 xdr_rnmargs, (caddr_t)&args, 2591 xdr_enum, (caddr_t)&status, cr, 2592 &douprintf, &status, 0, NULL); 2593 2594 PURGE_ATTRCACHE(odvp); /* mod time changed */ 2595 PURGE_ATTRCACHE(ndvp); /* mod time changed */ 2596 2597 if (!error) { 2598 error = geterrno(status); 2599 if (!error) { 2600 if (HAVE_RDDIR_CACHE(odrp)) 2601 nfs_purge_rddir_cache(odvp); 2602 if (HAVE_RDDIR_CACHE(ndrp)) 2603 nfs_purge_rddir_cache(ndvp); 2604 /* 2605 * when renaming directories to be a subdirectory of a 2606 * different parent, the dnlc entry for ".." will no 2607 * longer be valid, so it must be removed 2608 */ 2609 rp = VTOR(ovp); 2610 if (ndvp != odvp) { 2611 if (ovp->v_type == VDIR) { 2612 dnlc_remove(ovp, ".."); 2613 if (HAVE_RDDIR_CACHE(rp)) 2614 nfs_purge_rddir_cache(ovp); 2615 } 2616 } 2617 2618 /* 2619 * If we are renaming the unlinked file, update the 2620 * r_unldvp and r_unlname as needed. 
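* (r_unlname records the temporary name given to an open-but-unlinked file, so it must follow the rename for the eventual cleanup remove to target the right entry.)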
2621 */ 2622 mutex_enter(&rp->r_statelock); 2623 if (rp->r_unldvp != NULL) { 2624 if (strcmp(rp->r_unlname, onm) == 0) { 2625 (void) strncpy(rp->r_unlname, 2626 nnm, MAXNAMELEN); 2627 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 2628 2629 if (ndvp != rp->r_unldvp) { 2630 VN_RELE(rp->r_unldvp); 2631 rp->r_unldvp = ndvp; 2632 VN_HOLD(ndvp); 2633 } 2634 } 2635 } 2636 mutex_exit(&rp->r_statelock); 2637 } else { 2638 /* 2639 * System V defines rename to return EEXIST, not 2640 * ENOTEMPTY if the target directory is not empty. 2641 * Over the wire, the error is NFSERR_ENOTEMPTY 2642 * which geterrno maps to ENOTEMPTY. 2643 */ 2644 if (error == ENOTEMPTY) 2645 error = EEXIST; 2646 } 2647 } 2648 2649 if (error == 0) { 2650 if (nvp) 2651 vnevent_rename_dest(nvp, ndvp, nnm, ct); 2652 2653 if (odvp != ndvp) 2654 vnevent_rename_dest_dir(ndvp, ct); 2655 2656 ASSERT(ovp != NULL); 2657 vnevent_rename_src(ovp, odvp, onm, ct); 2658 } 2659 2660 if (nvp) { 2661 VN_RELE(nvp); 2662 } 2663 VN_RELE(ovp); 2664 2665 nfs_rw_exit(&odrp->r_rwlock); 2666 nfs_rw_exit(&ndrp->r_rwlock); 2667 2668 return (error); 2669 } 2670 2671 /* ARGSUSED */ 2672 static int 2673 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 2674 caller_context_t *ct, int flags, vsecattr_t *vsecp) 2675 { 2676 int error; 2677 struct nfscreatargs args; 2678 struct nfsdiropres dr; 2679 int douprintf; 2680 rnode_t *drp; 2681 hrtime_t t; 2682 2683 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2684 return (EPERM); 2685 2686 setdiropargs(&args.ca_da, nm, dvp); 2687 2688 /* 2689 * Decide what the group-id and set-gid bit of the created directory 2690 * should be. May have to do a setattr to get the gid right. 2691 */ 2692 error = setdirgid(dvp, &va->va_gid, cr); 2693 if (error) 2694 return (error); 2695 error = setdirmode(dvp, &va->va_mode, cr); 2696 if (error) 2697 return (error); 2698 va->va_mask |= AT_MODE|AT_GID; 2699 2700 args.ca_sa = &args.ca_sa_buf; 2701 error = vattr_to_sattr(va, args.ca_sa); 2702 if (error) { 2703 /* req time field(s) overflow - return immediately */ 2704 return (error); 2705 } 2706 2707 drp = VTOR(dvp); 2708 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2709 return (EINTR); 2710 2711 dnlc_remove(dvp, nm); 2712 2713 douprintf = 1; 2714 2715 t = gethrtime(); 2716 2717 error = rfs2call(VTOMI(dvp), RFS_MKDIR, 2718 xdr_creatargs, (caddr_t)&args, 2719 xdr_diropres, (caddr_t)&dr, cr, 2720 &douprintf, &dr.dr_status, 0, NULL); 2721 2722 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2723 2724 if (!error) { 2725 error = geterrno(dr.dr_status); 2726 if (!error) { 2727 if (HAVE_RDDIR_CACHE(drp)) 2728 nfs_purge_rddir_cache(dvp); 2729 /* 2730 * The attributes returned by RFS_MKDIR can not 2731 * be depended upon, so mark the attribute cache 2732 * as purged. A subsequent GETATTR will get the 2733 * correct attributes from the server. 2734 */ 2735 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr, 2736 dvp->v_vfsp, t, cr, NULL, NULL); 2737 PURGE_ATTRCACHE(*vpp); 2738 dnlc_update(dvp, nm, *vpp); 2739 2740 /* 2741 * Make sure the gid was set correctly. 2742 * If not, try to set it (but don't lose 2743 * any sleep over it). 
2744 */ 2745 if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) { 2746 va->va_mask = AT_GID; 2747 (void) nfssetattr(*vpp, va, 0, cr); 2748 } 2749 } else { 2750 PURGE_STALE_FH(error, dvp, cr); 2751 } 2752 } 2753 2754 nfs_rw_exit(&drp->r_rwlock); 2755 2756 return (error); 2757 } 2758 2759 /* ARGSUSED */ 2760 static int 2761 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 2762 caller_context_t *ct, int flags) 2763 { 2764 int error; 2765 enum nfsstat status; 2766 struct nfsdiropargs da; 2767 vnode_t *vp; 2768 int douprintf; 2769 rnode_t *drp; 2770 2771 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2772 return (EPERM); 2773 drp = VTOR(dvp); 2774 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2775 return (EINTR); 2776 2777 /* 2778 * Attempt to prevent a rmdir(".") from succeeding. 2779 */ 2780 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2781 if (error) { 2782 nfs_rw_exit(&drp->r_rwlock); 2783 return (error); 2784 } 2785 2786 if (vp == cdir) { 2787 VN_RELE(vp); 2788 nfs_rw_exit(&drp->r_rwlock); 2789 return (EINVAL); 2790 } 2791 2792 setdiropargs(&da, nm, dvp); 2793 2794 /* 2795 * First just remove the entry from the name cache, as it 2796 * is most likely an entry for this vp. 2797 */ 2798 dnlc_remove(dvp, nm); 2799 2800 /* 2801 * If the vnode reference count is greater than one, then 2802 * there may be additional references in the DNLC which will 2803 * need to be purged. First, try removing the entry for 2804 * the parent directory and see if that removes the additional 2805 * reference(s). If that doesn't do it, then use dnlc_purge_vp 2806 * to completely remove any references to the directory which 2807 * might still exist in the DNLC. 2808 */ 2809 if (vp->v_count > 1) { 2810 dnlc_remove(vp, ".."); 2811 if (vp->v_count > 1) 2812 dnlc_purge_vp(vp); 2813 } 2814 2815 douprintf = 1; 2816 2817 error = rfs2call(VTOMI(dvp), RFS_RMDIR, 2818 xdr_diropargs, (caddr_t)&da, 2819 xdr_enum, (caddr_t)&status, cr, 2820 &douprintf, &status, 0, NULL); 2821 2822 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2823 2824 if (error) { 2825 VN_RELE(vp); 2826 nfs_rw_exit(&drp->r_rwlock); 2827 return (error); 2828 } 2829 2830 error = geterrno(status); 2831 if (!error) { 2832 if (HAVE_RDDIR_CACHE(drp)) 2833 nfs_purge_rddir_cache(dvp); 2834 if (HAVE_RDDIR_CACHE(VTOR(vp))) 2835 nfs_purge_rddir_cache(vp); 2836 } else { 2837 PURGE_STALE_FH(error, dvp, cr); 2838 /* 2839 * System V defines rmdir to return EEXIST, not 2840 * ENOTEMPTY, if the directory is not empty. Over 2841 * the wire, the error is NFSERR_ENOTEMPTY which 2842 * geterrno maps to ENOTEMPTY.
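* The same ENOTEMPTY to EEXIST remapping is done for RFS_RENAME in nfsrename() above.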
2843 */ 2844 if (error == ENOTEMPTY) 2845 error = EEXIST; 2846 } 2847 2848 if (error == 0) { 2849 vnevent_rmdir(vp, dvp, nm, ct); 2850 } 2851 VN_RELE(vp); 2852 2853 nfs_rw_exit(&drp->r_rwlock); 2854 2855 return (error); 2856 } 2857 2858 /* ARGSUSED */ 2859 static int 2860 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 2861 caller_context_t *ct, int flags) 2862 { 2863 int error; 2864 struct nfsslargs args; 2865 enum nfsstat status; 2866 int douprintf; 2867 rnode_t *drp; 2868 2869 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2870 return (EPERM); 2871 setdiropargs(&args.sla_from, lnm, dvp); 2872 args.sla_sa = &args.sla_sa_buf; 2873 error = vattr_to_sattr(tva, args.sla_sa); 2874 if (error) { 2875 /* req time field(s) overflow - return immediately */ 2876 return (error); 2877 } 2878 args.sla_tnm = tnm; 2879 2880 drp = VTOR(dvp); 2881 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2882 return (EINTR); 2883 2884 dnlc_remove(dvp, lnm); 2885 2886 douprintf = 1; 2887 2888 error = rfs2call(VTOMI(dvp), RFS_SYMLINK, 2889 xdr_slargs, (caddr_t)&args, 2890 xdr_enum, (caddr_t)&status, cr, 2891 &douprintf, &status, 0, NULL); 2892 2893 PURGE_ATTRCACHE(dvp); /* mod time changed */ 2894 2895 if (!error) { 2896 error = geterrno(status); 2897 if (!error) { 2898 if (HAVE_RDDIR_CACHE(drp)) 2899 nfs_purge_rddir_cache(dvp); 2900 } else { 2901 PURGE_STALE_FH(error, dvp, cr); 2902 } 2903 } 2904 2905 nfs_rw_exit(&drp->r_rwlock); 2906 2907 return (error); 2908 } 2909 2910 #ifdef DEBUG 2911 static int nfs_readdir_cache_hits = 0; 2912 static int nfs_readdir_cache_shorts = 0; 2913 static int nfs_readdir_cache_waits = 0; 2914 static int nfs_readdir_cache_misses = 0; 2915 static int nfs_readdir_readahead = 0; 2916 #endif 2917 2918 static int nfs_shrinkreaddir = 0; 2919 2920 /* 2921 * Read directory entries. 2922 * There are some weird things to look out for here. The uio_offset 2923 * field is either 0 or it is the offset returned from a previous 2924 * readdir. It is an opaque value used by the server to find the 2925 * correct directory block to read. The count field is the number 2926 * of blocks to read on the server. This is advisory only, the server 2927 * may return only one block's worth of entries. Entries may be compressed 2928 * on the server. 2929 */ 2930 /* ARGSUSED */ 2931 static int 2932 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 2933 caller_context_t *ct, int flags) 2934 { 2935 int error; 2936 size_t count; 2937 rnode_t *rp; 2938 rddir_cache *rdc; 2939 rddir_cache *nrdc; 2940 rddir_cache *rrdc; 2941 #ifdef DEBUG 2942 int missed; 2943 #endif 2944 rddir_cache srdc; 2945 avl_index_t where; 2946 2947 rp = VTOR(vp); 2948 2949 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2950 if (nfs_zone() != VTOMI(vp)->mi_zone) 2951 return (EIO); 2952 /* 2953 * Make sure that the directory cache is valid. 2954 */ 2955 if (HAVE_RDDIR_CACHE(rp)) { 2956 if (nfs_disable_rddir_cache) { 2957 /* 2958 * Setting nfs_disable_rddir_cache in /etc/system 2959 * allows interoperability with servers that do not 2960 * properly update the attributes of directories. 2961 * Any cached information gets purged before an 2962 * access is made to it. 2963 */ 2964 nfs_purge_rddir_cache(vp); 2965 } else { 2966 error = nfs_validate_caches(vp, cr); 2967 if (error) 2968 return (error); 2969 } 2970 } 2971 2972 /* 2973 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an 2974 * RFS_READDIR request with rda_count set to more than 0x400. 
So 2975 * we reduce the request size here purely for compatibility. 2976 * 2977 * In general, this is no longer required. However, if a server 2978 * is discovered which cannot handle requests larger than 1024, 2979 * nfs_shrinkreaddir can be set to 1 to enable this backwards 2980 * compatibility. 2981 * 2982 * In any case, the request size is limited to NFS_MAXDATA bytes. 2983 */ 2984 count = MIN(uiop->uio_iov->iov_len, 2985 nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA); 2986 2987 nrdc = NULL; 2988 #ifdef DEBUG 2989 missed = 0; 2990 #endif 2991 top: 2992 /* 2993 * Short circuit the last readdir, which always returns 0 bytes. 2994 * This can be done after the directory has been read through 2995 * completely at least once. This will set r_direof which 2996 * can be used to find the value of the last cookie. 2997 */ 2998 mutex_enter(&rp->r_statelock); 2999 if (rp->r_direof != NULL && 3000 uiop->uio_offset == rp->r_direof->nfs_ncookie) { 3001 mutex_exit(&rp->r_statelock); 3002 #ifdef DEBUG 3003 nfs_readdir_cache_shorts++; 3004 #endif 3005 if (eofp) 3006 *eofp = 1; 3007 if (nrdc != NULL) 3008 rddir_cache_rele(nrdc); 3009 return (0); 3010 } 3011 /* 3012 * Look for a cache entry. Cache entries are identified 3013 * by the NFS cookie value and the byte count requested. 3014 */ 3015 srdc.nfs_cookie = uiop->uio_offset; 3016 srdc.buflen = count; 3017 rdc = avl_find(&rp->r_dir, &srdc, &where); 3018 if (rdc != NULL) { 3019 rddir_cache_hold(rdc); 3020 /* 3021 * If the cache entry is in the process of being 3022 * filled in, wait until this completes. The 3023 * RDDIRWAIT bit is set to indicate that someone 3024 * is waiting, and when the thread currently 3025 * filling the entry is done, it should do a 3026 * cv_broadcast to wake up all of the threads 3027 * waiting for it to finish. 3028 */ 3029 if (rdc->flags & RDDIR) { 3030 nfs_rw_exit(&rp->r_rwlock); 3031 rdc->flags |= RDDIRWAIT; 3032 #ifdef DEBUG 3033 nfs_readdir_cache_waits++; 3034 #endif 3035 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) { 3036 /* 3037 * We got interrupted, probably 3038 * the user typed ^C or an alarm 3039 * fired. We free the new entry 3040 * if we allocated one. 3041 */ 3042 mutex_exit(&rp->r_statelock); 3043 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3044 RW_READER, FALSE); 3045 rddir_cache_rele(rdc); 3046 if (nrdc != NULL) 3047 rddir_cache_rele(nrdc); 3048 return (EINTR); 3049 } 3050 mutex_exit(&rp->r_statelock); 3051 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3052 RW_READER, FALSE); 3053 rddir_cache_rele(rdc); 3054 goto top; 3055 } 3056 /* 3057 * Check to see if a readdir is required to 3058 * fill the entry. If so, mark this entry 3059 * as being filled, remove our reference, 3060 * and branch to the code to fill the entry. 3061 */ 3062 if (rdc->flags & RDDIRREQ) { 3063 rdc->flags &= ~RDDIRREQ; 3064 rdc->flags |= RDDIR; 3065 if (nrdc != NULL) 3066 rddir_cache_rele(nrdc); 3067 nrdc = rdc; 3068 mutex_exit(&rp->r_statelock); 3069 goto bottom; 3070 } 3071 #ifdef DEBUG 3072 if (!missed) 3073 nfs_readdir_cache_hits++; 3074 #endif 3075 /* 3076 * If an error occurred while attempting 3077 * to fill the cache entry, just return it. 3078 */ 3079 if (rdc->error) { 3080 error = rdc->error; 3081 mutex_exit(&rp->r_statelock); 3082 rddir_cache_rele(rdc); 3083 if (nrdc != NULL) 3084 rddir_cache_rele(nrdc); 3085 return (error); 3086 } 3087 3088 /* 3089 * The cache entry is complete and good, 3090 * so copy the dirent structs out to the calling 3091 * thread.
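* (uiomove() below copies rdc->entlen bytes of already decoded directory entries straight from the cache entry into the caller's buffer.)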
3092 */ 3093 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 3094 3095 /* 3096 * If no error occurred during the copyout, 3097 * update the offset in the uio struct to 3098 * contain the value of the next cookie 3099 * and set the eof value appropriately. 3100 */ 3101 if (!error) { 3102 uiop->uio_offset = rdc->nfs_ncookie; 3103 if (eofp) 3104 *eofp = rdc->eof; 3105 } 3106 3107 /* 3108 * Decide whether to do readahead. Don't if 3109 * have already read to the end of directory. 3110 */ 3111 if (rdc->eof) { 3112 rp->r_direof = rdc; 3113 mutex_exit(&rp->r_statelock); 3114 rddir_cache_rele(rdc); 3115 if (nrdc != NULL) 3116 rddir_cache_rele(nrdc); 3117 return (error); 3118 } 3119 3120 /* 3121 * Check to see whether we found an entry 3122 * for the readahead. If so, we don't need 3123 * to do anything further, so free the new 3124 * entry if one was allocated. Otherwise, 3125 * allocate a new entry, add it to the cache, 3126 * and then initiate an asynchronous readdir 3127 * operation to fill it. 3128 */ 3129 srdc.nfs_cookie = rdc->nfs_ncookie; 3130 srdc.buflen = count; 3131 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3132 if (rrdc != NULL) { 3133 if (nrdc != NULL) 3134 rddir_cache_rele(nrdc); 3135 } else { 3136 if (nrdc != NULL) 3137 rrdc = nrdc; 3138 else { 3139 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3140 } 3141 if (rrdc != NULL) { 3142 rrdc->nfs_cookie = rdc->nfs_ncookie; 3143 rrdc->buflen = count; 3144 avl_insert(&rp->r_dir, rrdc, where); 3145 rddir_cache_hold(rrdc); 3146 mutex_exit(&rp->r_statelock); 3147 rddir_cache_rele(rdc); 3148 #ifdef DEBUG 3149 nfs_readdir_readahead++; 3150 #endif 3151 nfs_async_readdir(vp, rrdc, cr, nfsreaddir); 3152 return (error); 3153 } 3154 } 3155 3156 mutex_exit(&rp->r_statelock); 3157 rddir_cache_rele(rdc); 3158 return (error); 3159 } 3160 3161 /* 3162 * Didn't find an entry in the cache. Construct a new empty 3163 * entry and link it into the cache. Other processes attempting 3164 * to access this entry will need to wait until it is filled in. 3165 * 3166 * Since kmem_alloc may block, another pass through the cache 3167 * will need to be taken to make sure that another process 3168 * hasn't already added an entry to the cache for this request. 3169 */ 3170 if (nrdc == NULL) { 3171 mutex_exit(&rp->r_statelock); 3172 nrdc = rddir_cache_alloc(KM_SLEEP); 3173 nrdc->nfs_cookie = uiop->uio_offset; 3174 nrdc->buflen = count; 3175 goto top; 3176 } 3177 3178 /* 3179 * Add this entry to the cache. 3180 */ 3181 avl_insert(&rp->r_dir, nrdc, where); 3182 rddir_cache_hold(nrdc); 3183 mutex_exit(&rp->r_statelock); 3184 3185 bottom: 3186 #ifdef DEBUG 3187 missed = 1; 3188 nfs_readdir_cache_misses++; 3189 #endif 3190 /* 3191 * Do the readdir. 3192 */ 3193 error = nfsreaddir(vp, nrdc, cr); 3194 3195 /* 3196 * If this operation failed, just return the error which occurred. 3197 */ 3198 if (error != 0) 3199 return (error); 3200 3201 /* 3202 * Since the RPC operation will have taken sometime and blocked 3203 * this process, another pass through the cache will need to be 3204 * taken to find the correct cache entry. It is possible that 3205 * the correct cache entry will not be there (although one was 3206 * added) because the directory changed during the RPC operation 3207 * and the readdir cache was flushed. In this case, just start 3208 * over. It is hoped that this will not happen too often... 
:-) 3209 */ 3210 nrdc = NULL; 3211 goto top; 3212 /* NOTREACHED */ 3213 } 3214 3215 static int 3216 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3217 { 3218 int error; 3219 struct nfsrddirargs rda; 3220 struct nfsrddirres rd; 3221 rnode_t *rp; 3222 mntinfo_t *mi; 3223 uint_t count; 3224 int douprintf; 3225 failinfo_t fi, *fip; 3226 3227 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3228 count = rdc->buflen; 3229 3230 rp = VTOR(vp); 3231 mi = VTOMI(vp); 3232 3233 rda.rda_fh = *VTOFH(vp); 3234 rda.rda_offset = rdc->nfs_cookie; 3235 3236 /* 3237 * NFS client failover support 3238 * suppress failover unless we have a zero cookie 3239 */ 3240 if (rdc->nfs_cookie == (off_t)0) { 3241 fi.vp = vp; 3242 fi.fhp = (caddr_t)&rda.rda_fh; 3243 fi.copyproc = nfscopyfh; 3244 fi.lookupproc = nfslookup; 3245 fi.xattrdirproc = acl_getxattrdir2; 3246 fip = &fi; 3247 } else { 3248 fip = NULL; 3249 } 3250 3251 rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3252 rd.rd_size = count; 3253 rd.rd_offset = rda.rda_offset; 3254 3255 douprintf = 1; 3256 3257 if (mi->mi_io_kstats) { 3258 mutex_enter(&mi->mi_lock); 3259 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3260 mutex_exit(&mi->mi_lock); 3261 } 3262 3263 do { 3264 rda.rda_count = MIN(count, mi->mi_curread); 3265 error = rfs2call(mi, RFS_READDIR, 3266 xdr_rddirargs, (caddr_t)&rda, 3267 xdr_getrddirres, (caddr_t)&rd, cr, 3268 &douprintf, &rd.rd_status, 0, fip); 3269 } while (error == ENFS_TRYAGAIN); 3270 3271 if (mi->mi_io_kstats) { 3272 mutex_enter(&mi->mi_lock); 3273 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3274 mutex_exit(&mi->mi_lock); 3275 } 3276 3277 /* 3278 * Since we are actually doing a READDIR RPC, we must have 3279 * exclusive access to the cache entry being filled. Thus, 3280 * it is safe to update all fields except for the flags 3281 * field. The r_statelock in the rnode must be held to 3282 * prevent two different threads from simultaneously 3283 * attempting to update the flags field. This can happen 3284 * if we are turning off RDDIR and the other thread is 3285 * trying to set RDDIRWAIT. 3286 */ 3287 ASSERT(rdc->flags & RDDIR); 3288 if (!error) { 3289 error = geterrno(rd.rd_status); 3290 if (!error) { 3291 rdc->nfs_ncookie = rd.rd_offset; 3292 rdc->eof = rd.rd_eof ? 
1 : 0; 3293 rdc->entlen = rd.rd_size; 3294 ASSERT(rdc->entlen <= rdc->buflen); 3295 #ifdef DEBUG 3296 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, 3297 KM_SLEEP); 3298 #else 3299 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3300 #endif 3301 bcopy(rd.rd_entries, rdc->entries, rdc->entlen); 3302 rdc->error = 0; 3303 if (mi->mi_io_kstats) { 3304 mutex_enter(&mi->mi_lock); 3305 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3306 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += 3307 rd.rd_size; 3308 mutex_exit(&mi->mi_lock); 3309 } 3310 } else { 3311 PURGE_STALE_FH(error, vp, cr); 3312 } 3313 } 3314 if (error) { 3315 rdc->entries = NULL; 3316 rdc->error = error; 3317 } 3318 kmem_free(rd.rd_entries, rdc->buflen); 3319 3320 mutex_enter(&rp->r_statelock); 3321 rdc->flags &= ~RDDIR; 3322 if (rdc->flags & RDDIRWAIT) { 3323 rdc->flags &= ~RDDIRWAIT; 3324 cv_broadcast(&rdc->cv); 3325 } 3326 if (error) 3327 rdc->flags |= RDDIRREQ; 3328 mutex_exit(&rp->r_statelock); 3329 3330 rddir_cache_rele(rdc); 3331 3332 return (error); 3333 } 3334 3335 #ifdef DEBUG 3336 static int nfs_bio_do_stop = 0; 3337 #endif 3338 3339 static int 3340 nfs_bio(struct buf *bp, cred_t *cr) 3341 { 3342 rnode_t *rp = VTOR(bp->b_vp); 3343 int count; 3344 int error; 3345 cred_t *cred; 3346 uint_t offset; 3347 3348 DTRACE_IO1(start, struct buf *, bp); 3349 3350 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 3351 offset = dbtob(bp->b_blkno); 3352 3353 if (bp->b_flags & B_READ) { 3354 mutex_enter(&rp->r_statelock); 3355 if (rp->r_cred != NULL) { 3356 cred = rp->r_cred; 3357 crhold(cred); 3358 } else { 3359 rp->r_cred = cr; 3360 crhold(cr); 3361 cred = cr; 3362 crhold(cred); 3363 } 3364 mutex_exit(&rp->r_statelock); 3365 read_again: 3366 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr, 3367 offset, bp->b_bcount, &bp->b_resid, cred); 3368 3369 crfree(cred); 3370 if (!error) { 3371 if (bp->b_resid) { 3372 /* 3373 * Didn't get it all because we hit EOF, 3374 * zero all the memory beyond the EOF. 3375 */ 3376 /* bzero(rdaddr + */ 3377 bzero(bp->b_un.b_addr + 3378 bp->b_bcount - bp->b_resid, bp->b_resid); 3379 } 3380 mutex_enter(&rp->r_statelock); 3381 if (bp->b_resid == bp->b_bcount && 3382 offset >= rp->r_size) { 3383 /* 3384 * We didn't read anything at all as we are 3385 * past EOF. Return an error indicator back 3386 * but don't destroy the pages (yet). 
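* NFS_EOF is a private, in-kernel indicator; nfs_getapage() later turns it into zero-filled pages for writes through segkmap and into EFAULT for other accesses.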
3387 */ 3388 error = NFS_EOF; 3389 } 3390 mutex_exit(&rp->r_statelock); 3391 } else if (error == EACCES) { 3392 mutex_enter(&rp->r_statelock); 3393 if (cred != cr) { 3394 if (rp->r_cred != NULL) 3395 crfree(rp->r_cred); 3396 rp->r_cred = cr; 3397 crhold(cr); 3398 cred = cr; 3399 crhold(cred); 3400 mutex_exit(&rp->r_statelock); 3401 goto read_again; 3402 } 3403 mutex_exit(&rp->r_statelock); 3404 } 3405 } else { 3406 if (!(rp->r_flags & RSTALE)) { 3407 mutex_enter(&rp->r_statelock); 3408 if (rp->r_cred != NULL) { 3409 cred = rp->r_cred; 3410 crhold(cred); 3411 } else { 3412 rp->r_cred = cr; 3413 crhold(cr); 3414 cred = cr; 3415 crhold(cred); 3416 } 3417 mutex_exit(&rp->r_statelock); 3418 write_again: 3419 mutex_enter(&rp->r_statelock); 3420 count = MIN(bp->b_bcount, rp->r_size - offset); 3421 mutex_exit(&rp->r_statelock); 3422 if (count < 0) 3423 cmn_err(CE_PANIC, "nfs_bio: write count < 0"); 3424 #ifdef DEBUG 3425 if (count == 0) { 3426 zcmn_err(getzoneid(), CE_WARN, 3427 "nfs_bio: zero length write at %d", 3428 offset); 3429 nfs_printfhandle(&rp->r_fh); 3430 if (nfs_bio_do_stop) 3431 debug_enter("nfs_bio"); 3432 } 3433 #endif 3434 error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset, 3435 count, cred); 3436 if (error == EACCES) { 3437 mutex_enter(&rp->r_statelock); 3438 if (cred != cr) { 3439 if (rp->r_cred != NULL) 3440 crfree(rp->r_cred); 3441 rp->r_cred = cr; 3442 crhold(cr); 3443 crfree(cred); 3444 cred = cr; 3445 crhold(cred); 3446 mutex_exit(&rp->r_statelock); 3447 goto write_again; 3448 } 3449 mutex_exit(&rp->r_statelock); 3450 } 3451 bp->b_error = error; 3452 if (error && error != EINTR) { 3453 /* 3454 * Don't print EDQUOT errors on the console. 3455 * Don't print asynchronous EACCES errors. 3456 * Don't print EFBIG errors. 3457 * Print all other write errors. 3458 */ 3459 if (error != EDQUOT && error != EFBIG && 3460 (error != EACCES || 3461 !(bp->b_flags & B_ASYNC))) 3462 nfs_write_error(bp->b_vp, error, cred); 3463 /* 3464 * Update r_error and r_flags as appropriate. 3465 * If the error was ESTALE, then mark the 3466 * rnode as not being writeable and save 3467 * the error status. Otherwise, save any 3468 * errors which occur from asynchronous 3469 * page invalidations. Any errors occurring 3470 * from other operations should be saved 3471 * by the caller. 
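* (Once RSTALE is set, later writes through nfs_bio() skip the over-the-wire write entirely and simply return the saved r_error.)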
3472 */ 3473 mutex_enter(&rp->r_statelock); 3474 if (error == ESTALE) { 3475 rp->r_flags |= RSTALE; 3476 if (!rp->r_error) 3477 rp->r_error = error; 3478 } else if (!rp->r_error && 3479 (bp->b_flags & 3480 (B_INVAL|B_FORCE|B_ASYNC)) == 3481 (B_INVAL|B_FORCE|B_ASYNC)) { 3482 rp->r_error = error; 3483 } 3484 mutex_exit(&rp->r_statelock); 3485 } 3486 crfree(cred); 3487 } else 3488 error = rp->r_error; 3489 } 3490 3491 if (error != 0 && error != NFS_EOF) 3492 bp->b_flags |= B_ERROR; 3493 3494 DTRACE_IO1(done, struct buf *, bp); 3495 3496 return (error); 3497 } 3498 3499 /* ARGSUSED */ 3500 static int 3501 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 3502 { 3503 struct nfs_fid *fp; 3504 rnode_t *rp; 3505 3506 rp = VTOR(vp); 3507 3508 if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) { 3509 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short); 3510 return (ENOSPC); 3511 } 3512 fp = (struct nfs_fid *)fidp; 3513 fp->nf_pad = 0; 3514 fp->nf_len = sizeof (struct nfs_fid) - sizeof (short); 3515 bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE); 3516 return (0); 3517 } 3518 3519 /* ARGSUSED2 */ 3520 static int 3521 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3522 { 3523 rnode_t *rp = VTOR(vp); 3524 3525 if (!write_lock) { 3526 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3527 return (V_WRITELOCK_FALSE); 3528 } 3529 3530 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) { 3531 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 3532 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp)) 3533 return (V_WRITELOCK_FALSE); 3534 nfs_rw_exit(&rp->r_rwlock); 3535 } 3536 3537 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 3538 return (V_WRITELOCK_TRUE); 3539 } 3540 3541 /* ARGSUSED */ 3542 static void 3543 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 3544 { 3545 rnode_t *rp = VTOR(vp); 3546 3547 nfs_rw_exit(&rp->r_rwlock); 3548 } 3549 3550 /* ARGSUSED */ 3551 static int 3552 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 3553 { 3554 3555 /* 3556 * Because we stuff the readdir cookie into the offset field 3557 * someone may attempt to do an lseek with the cookie which 3558 * we want to succeed. 3559 */ 3560 if (vp->v_type == VDIR) 3561 return (0); 3562 if (*noffp < 0 || *noffp > MAXOFF32_T) 3563 return (EINVAL); 3564 return (0); 3565 } 3566 3567 /* 3568 * number of NFS_MAXDATA blocks to read ahead 3569 * optimized for 100 base-T. 3570 */ 3571 static int nfs_nra = 4; 3572 3573 #ifdef DEBUG 3574 static int nfs_lostpage = 0; /* number of times we lost original page */ 3575 #endif 3576 3577 /* 3578 * Return all the pages from [off..off+len) in file 3579 */ 3580 /* ARGSUSED */ 3581 static int 3582 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 3583 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3584 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 3585 { 3586 rnode_t *rp; 3587 int error; 3588 mntinfo_t *mi; 3589 3590 if (vp->v_flag & VNOMAP) 3591 return (ENOSYS); 3592 3593 ASSERT(off <= MAXOFF32_T); 3594 if (nfs_zone() != VTOMI(vp)->mi_zone) 3595 return (EIO); 3596 if (protp != NULL) 3597 *protp = PROT_ALL; 3598 3599 /* 3600 * Now valididate that the caches are up to date. 
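* (If the file has changed on the server, nfs_validate_caches() purges the stale cached pages so that they are re-read below instead of being handed back to the fault path.)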
3601 */ 3602 error = nfs_validate_caches(vp, cr); 3603 if (error) 3604 return (error); 3605 3606 rp = VTOR(vp); 3607 mi = VTOMI(vp); 3608 retry: 3609 mutex_enter(&rp->r_statelock); 3610 3611 /* 3612 * Don't create dirty pages faster than they 3613 * can be cleaned so that the system doesn't 3614 * get imbalanced. If the async queue is 3615 * maxed out, then wait for it to drain before 3616 * creating more dirty pages. Also, wait for 3617 * any threads doing pagewalks in the vop_getattr 3618 * entry points so that they don't block for 3619 * long periods. 3620 */ 3621 if (rw == S_CREATE) { 3622 while ((mi->mi_max_threads != 0 && 3623 rp->r_awcount > 2 * mi->mi_max_threads) || 3624 rp->r_gcount > 0) 3625 cv_wait(&rp->r_cv, &rp->r_statelock); 3626 } 3627 3628 /* 3629 * If we are getting called as a side effect of an nfs_write() 3630 * operation the local file size might not be extended yet. 3631 * In this case we want to be able to return pages of zeroes. 3632 */ 3633 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 3634 mutex_exit(&rp->r_statelock); 3635 return (EFAULT); /* beyond EOF */ 3636 } 3637 3638 mutex_exit(&rp->r_statelock); 3639 3640 if (len <= PAGESIZE) { 3641 error = nfs_getapage(vp, off, len, protp, pl, plsz, 3642 seg, addr, rw, cr); 3643 } else { 3644 error = pvn_getpages(nfs_getapage, vp, off, len, protp, 3645 pl, plsz, seg, addr, rw, cr); 3646 } 3647 3648 switch (error) { 3649 case NFS_EOF: 3650 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 3651 goto retry; 3652 case ESTALE: 3653 PURGE_STALE_FH(error, vp, cr); 3654 } 3655 3656 return (error); 3657 } 3658 3659 /* 3660 * Called from pvn_getpages or nfs_getpage to get a particular page. 3661 */ 3662 /* ARGSUSED */ 3663 static int 3664 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 3665 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3666 enum seg_rw rw, cred_t *cr) 3667 { 3668 rnode_t *rp; 3669 uint_t bsize; 3670 struct buf *bp; 3671 page_t *pp; 3672 u_offset_t lbn; 3673 u_offset_t io_off; 3674 u_offset_t blkoff; 3675 u_offset_t rablkoff; 3676 size_t io_len; 3677 uint_t blksize; 3678 int error; 3679 int readahead; 3680 int readahead_issued = 0; 3681 int ra_window; /* readahead window */ 3682 page_t *pagefound; 3683 3684 if (nfs_zone() != VTOMI(vp)->mi_zone) 3685 return (EIO); 3686 rp = VTOR(vp); 3687 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3688 3689 reread: 3690 bp = NULL; 3691 pp = NULL; 3692 pagefound = NULL; 3693 3694 if (pl != NULL) 3695 pl[0] = NULL; 3696 3697 error = 0; 3698 lbn = off / bsize; 3699 blkoff = lbn * bsize; 3700 3701 /* 3702 * Queueing up the readahead before doing the synchronous read 3703 * results in a significant increase in read throughput because 3704 * of the increased parallelism between the async threads and 3705 * the process context. 3706 */ 3707 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 3708 rw != S_CREATE && 3709 !(vp->v_flag & VNOCACHE)) { 3710 mutex_enter(&rp->r_statelock); 3711 3712 /* 3713 * Calculate the number of readaheads to do. 3714 * a) No readaheads at offset = 0. 3715 * b) Do maximum(nfs_nra) readaheads when the readahead 3716 * window is closed. 3717 * c) Do readaheads between 1 to (nfs_nra - 1) depending 3718 * upon how far the readahead window is open or close. 3719 * d) No readaheads if rp->r_nextr is not within the scope 3720 * of the readahead window (random i/o). 
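* For example, assuming bsize = 32K and nfs_nra = 4: with blkoff == r_nextr all 4 readaheads are issued, while with r_nextr = 96K and blkoff = 64K the window is (96K - 64K) / 32K = 1, so nfs_nra - 1 = 3 readaheads are issued.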
3721 */ 3722 3723 if (off == 0) 3724 readahead = 0; 3725 else if (blkoff == rp->r_nextr) 3726 readahead = nfs_nra; 3727 else if (rp->r_nextr > blkoff && 3728 ((ra_window = (rp->r_nextr - blkoff) / bsize) 3729 <= (nfs_nra - 1))) 3730 readahead = nfs_nra - ra_window; 3731 else 3732 readahead = 0; 3733 3734 rablkoff = rp->r_nextr; 3735 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 3736 mutex_exit(&rp->r_statelock); 3737 if (nfs_async_readahead(vp, rablkoff + bsize, 3738 addr + (rablkoff + bsize - off), seg, cr, 3739 nfs_readahead) < 0) { 3740 mutex_enter(&rp->r_statelock); 3741 break; 3742 } 3743 readahead--; 3744 rablkoff += bsize; 3745 /* 3746 * Indicate that we did a readahead so 3747 * readahead offset is not updated 3748 * by the synchronous read below. 3749 */ 3750 readahead_issued = 1; 3751 mutex_enter(&rp->r_statelock); 3752 /* 3753 * set readahead offset to 3754 * offset of last async readahead 3755 * request. 3756 */ 3757 rp->r_nextr = rablkoff; 3758 } 3759 mutex_exit(&rp->r_statelock); 3760 } 3761 3762 again: 3763 if ((pagefound = page_exists(vp, off)) == NULL) { 3764 if (pl == NULL) { 3765 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 3766 nfs_readahead); 3767 } else if (rw == S_CREATE) { 3768 /* 3769 * Block for this page is not allocated, or the offset 3770 * is beyond the current allocation size, or we're 3771 * allocating a swap slot and the page was not found, 3772 * so allocate it and return a zero page. 3773 */ 3774 if ((pp = page_create_va(vp, off, 3775 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 3776 cmn_err(CE_PANIC, "nfs_getapage: page_create"); 3777 io_len = PAGESIZE; 3778 mutex_enter(&rp->r_statelock); 3779 rp->r_nextr = off + PAGESIZE; 3780 mutex_exit(&rp->r_statelock); 3781 } else { 3782 /* 3783 * Need to go to server to get a BLOCK, exception to 3784 * that being while reading at offset = 0 or doing 3785 * random i/o, in that case read only a PAGE. 3786 */ 3787 mutex_enter(&rp->r_statelock); 3788 if (blkoff < rp->r_size && 3789 blkoff + bsize >= rp->r_size) { 3790 /* 3791 * If only a block or less is left in 3792 * the file, read all that is remaining. 3793 */ 3794 if (rp->r_size <= off) { 3795 /* 3796 * Trying to access beyond EOF, 3797 * set up to get at least one page. 3798 */ 3799 blksize = off + PAGESIZE - blkoff; 3800 } else 3801 blksize = rp->r_size - blkoff; 3802 } else if ((off == 0) || 3803 (off != rp->r_nextr && !readahead_issued)) { 3804 blksize = PAGESIZE; 3805 blkoff = off; /* block = page here */ 3806 } else 3807 blksize = bsize; 3808 mutex_exit(&rp->r_statelock); 3809 3810 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3811 &io_len, blkoff, blksize, 0); 3812 3813 /* 3814 * Some other thread has entered the page, 3815 * so just use it. 3816 */ 3817 if (pp == NULL) 3818 goto again; 3819 3820 /* 3821 * Now round the request size up to page boundaries. 3822 * This ensures that the entire page will be 3823 * initialized to zeroes if EOF is encountered. 3824 */ 3825 io_len = ptob(btopr(io_len)); 3826 3827 bp = pageio_setup(pp, io_len, vp, B_READ); 3828 ASSERT(bp != NULL); 3829 3830 /* 3831 * pageio_setup should have set b_addr to 0. This 3832 * is correct since we want to do I/O on a page 3833 * boundary. bp_mapin will use this addr to calculate 3834 * an offset, and then set b_addr to the kernel virtual 3835 * address it allocated for us. 
3836 */ 3837 ASSERT(bp->b_un.b_addr == 0); 3838 3839 bp->b_edev = 0; 3840 bp->b_dev = 0; 3841 bp->b_lblkno = lbtodb(io_off); 3842 bp->b_file = vp; 3843 bp->b_offset = (offset_t)off; 3844 bp_mapin(bp); 3845 3846 /* 3847 * If doing a write beyond what we believe is EOF, 3848 * don't bother trying to read the pages from the 3849 * server, we'll just zero the pages here. We 3850 * don't check that the rw flag is S_WRITE here 3851 * because some implementations may attempt a 3852 * read access to the buffer before copying data. 3853 */ 3854 mutex_enter(&rp->r_statelock); 3855 if (io_off >= rp->r_size && seg == segkmap) { 3856 mutex_exit(&rp->r_statelock); 3857 bzero(bp->b_un.b_addr, io_len); 3858 } else { 3859 mutex_exit(&rp->r_statelock); 3860 error = nfs_bio(bp, cr); 3861 } 3862 3863 /* 3864 * Unmap the buffer before freeing it. 3865 */ 3866 bp_mapout(bp); 3867 pageio_done(bp); 3868 3869 if (error == NFS_EOF) { 3870 /* 3871 * If doing a write system call just return 3872 * zeroed pages, else user tried to get pages 3873 * beyond EOF, return error. We don't check 3874 * that the rw flag is S_WRITE here because 3875 * some implementations may attempt a read 3876 * access to the buffer before copying data. 3877 */ 3878 if (seg == segkmap) 3879 error = 0; 3880 else 3881 error = EFAULT; 3882 } 3883 3884 if (!readahead_issued && !error) { 3885 mutex_enter(&rp->r_statelock); 3886 rp->r_nextr = io_off + io_len; 3887 mutex_exit(&rp->r_statelock); 3888 } 3889 } 3890 } 3891 3892 out: 3893 if (pl == NULL) 3894 return (error); 3895 3896 if (error) { 3897 if (pp != NULL) 3898 pvn_read_done(pp, B_ERROR); 3899 return (error); 3900 } 3901 3902 if (pagefound) { 3903 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 3904 3905 /* 3906 * Page exists in the cache, acquire the appropriate lock. 3907 * If this fails, start all over again. 3908 */ 3909 if ((pp = page_lookup(vp, off, se)) == NULL) { 3910 #ifdef DEBUG 3911 nfs_lostpage++; 3912 #endif 3913 goto reread; 3914 } 3915 pl[0] = pp; 3916 pl[1] = NULL; 3917 return (0); 3918 } 3919 3920 if (pp != NULL) 3921 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3922 3923 return (error); 3924 } 3925 3926 static void 3927 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 3928 cred_t *cr) 3929 { 3930 int error; 3931 page_t *pp; 3932 u_offset_t io_off; 3933 size_t io_len; 3934 struct buf *bp; 3935 uint_t bsize, blksize; 3936 rnode_t *rp = VTOR(vp); 3937 3938 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 3939 3940 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 3941 3942 mutex_enter(&rp->r_statelock); 3943 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 3944 /* 3945 * If less than a block left in file read less 3946 * than a block. 3947 */ 3948 blksize = rp->r_size - blkoff; 3949 } else 3950 blksize = bsize; 3951 mutex_exit(&rp->r_statelock); 3952 3953 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 3954 &io_off, &io_len, blkoff, blksize, 1); 3955 /* 3956 * The isra flag passed to the kluster function is 1, we may have 3957 * gotten a return value of NULL for a variety of reasons (# of free 3958 * pages < minfree, someone entered the page on the vnode etc). In all 3959 * cases, we want to punt on the readahead. 3960 */ 3961 if (pp == NULL) 3962 return; 3963 3964 /* 3965 * Now round the request size up to page boundaries. 3966 * This ensures that the entire page will be 3967 * initialized to zeroes if EOF is encountered. 
3968 */ 3969 io_len = ptob(btopr(io_len)); 3970 3971 bp = pageio_setup(pp, io_len, vp, B_READ); 3972 ASSERT(bp != NULL); 3973 3974 /* 3975 * pageio_setup should have set b_addr to 0. This is correct since 3976 * we want to do I/O on a page boundary. bp_mapin() will use this addr 3977 * to calculate an offset, and then set b_addr to the kernel virtual 3978 * address it allocated for us. 3979 */ 3980 ASSERT(bp->b_un.b_addr == 0); 3981 3982 bp->b_edev = 0; 3983 bp->b_dev = 0; 3984 bp->b_lblkno = lbtodb(io_off); 3985 bp->b_file = vp; 3986 bp->b_offset = (offset_t)blkoff; 3987 bp_mapin(bp); 3988 3989 /* 3990 * If doing a write beyond what we believe is EOF, don't bother trying 3991 * to read the pages from the server, we'll just zero the pages here. 3992 * We don't check that the rw flag is S_WRITE here because some 3993 * implementations may attempt a read access to the buffer before 3994 * copying data. 3995 */ 3996 mutex_enter(&rp->r_statelock); 3997 if (io_off >= rp->r_size && seg == segkmap) { 3998 mutex_exit(&rp->r_statelock); 3999 bzero(bp->b_un.b_addr, io_len); 4000 error = 0; 4001 } else { 4002 mutex_exit(&rp->r_statelock); 4003 error = nfs_bio(bp, cr); 4004 if (error == NFS_EOF) 4005 error = 0; 4006 } 4007 4008 /* 4009 * Unmap the buffer before freeing it. 4010 */ 4011 bp_mapout(bp); 4012 pageio_done(bp); 4013 4014 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 4015 4016 /* 4017 * In case of error set readahead offset 4018 * to the lowest offset. 4019 * pvn_read_done() calls VN_DISPOSE to destroy the pages 4020 */ 4021 if (error && rp->r_nextr > io_off) { 4022 mutex_enter(&rp->r_statelock); 4023 if (rp->r_nextr > io_off) 4024 rp->r_nextr = io_off; 4025 mutex_exit(&rp->r_statelock); 4026 } 4027 } 4028 4029 /* 4030 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 4031 * If len == 0, do from off to EOF. 4032 * 4033 * The normal cases should be len == 0 && off == 0 (entire vp list), 4034 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4035 * (from pageout). 4036 */ 4037 /* ARGSUSED */ 4038 static int 4039 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4040 caller_context_t *ct) 4041 { 4042 int error; 4043 rnode_t *rp; 4044 4045 ASSERT(cr != NULL); 4046 4047 /* 4048 * XXX - Why should this check be made here? 4049 */ 4050 if (vp->v_flag & VNOMAP) 4051 return (ENOSYS); 4052 4053 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 4054 return (0); 4055 4056 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 4057 return (EIO); 4058 ASSERT(off <= MAXOFF32_T); 4059 4060 rp = VTOR(vp); 4061 mutex_enter(&rp->r_statelock); 4062 rp->r_count++; 4063 mutex_exit(&rp->r_statelock); 4064 error = nfs_putpages(vp, off, len, flags, cr); 4065 mutex_enter(&rp->r_statelock); 4066 rp->r_count--; 4067 cv_broadcast(&rp->r_cv); 4068 mutex_exit(&rp->r_statelock); 4069 4070 return (error); 4071 } 4072 4073 /* 4074 * Write out a single page, possibly klustering adjacent dirty pages. 
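* The offset and length of the range actually pushed are returned through *offp and *lenp.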
4075 */ 4076 int 4077 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 4078 int flags, cred_t *cr) 4079 { 4080 u_offset_t io_off; 4081 u_offset_t lbn_off; 4082 u_offset_t lbn; 4083 size_t io_len; 4084 uint_t bsize; 4085 int error; 4086 rnode_t *rp; 4087 4088 ASSERT(!vn_is_readonly(vp)); 4089 ASSERT(pp != NULL); 4090 ASSERT(cr != NULL); 4091 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 4092 4093 rp = VTOR(vp); 4094 ASSERT(rp->r_count > 0); 4095 4096 ASSERT(pp->p_offset <= MAXOFF32_T); 4097 4098 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4099 lbn = pp->p_offset / bsize; 4100 lbn_off = lbn * bsize; 4101 4102 /* 4103 * Find a kluster that fits in one block, or in 4104 * one page if pages are bigger than blocks. If 4105 * there is less file space allocated than a whole 4106 * page, we'll shorten the i/o request below. 4107 */ 4108 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4109 roundup(bsize, PAGESIZE), flags); 4110 4111 /* 4112 * pvn_write_kluster shouldn't have returned a page with offset 4113 * behind the original page we were given. Verify that. 4114 */ 4115 ASSERT((pp->p_offset / bsize) >= lbn); 4116 4117 /* 4118 * Now pp will have the list of kept dirty pages marked for 4119 * write back. It will also handle invalidation and freeing 4120 * of pages that are not dirty. Check for page length rounding 4121 * problems. 4122 */ 4123 if (io_off + io_len > lbn_off + bsize) { 4124 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4125 io_len = lbn_off + bsize - io_off; 4126 } 4127 /* 4128 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4129 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4130 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4131 * progress and the r_size has not been made consistent with the 4132 * new size of the file. When the uiomove() completes the r_size is 4133 * updated and the RMODINPROGRESS flag is cleared. 4134 * 4135 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4136 * consistent value of r_size. Without this handshaking, it is 4137 * possible that nfs(3)_bio() picks up the old value of r_size 4138 * before the uiomove() in writerp() completes. This will result 4139 * in the write through nfs(3)_bio() being dropped. 4140 * 4141 * More precisely, there is a window between the time the uiomove() 4142 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4143 * operation intervenes in this window, the page will be picked up, 4144 * because it is dirty (it will be unlocked, unless it was 4145 * pagecreate'd). When the page is picked up as dirty, the dirty 4146 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4147 * checked. This will still be the old size. Therefore the page will 4148 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4149 * the page will be found to be clean and the write will be dropped. 4150 */ 4151 if (rp->r_flags & RMODINPROGRESS) { 4152 mutex_enter(&rp->r_statelock); 4153 if ((rp->r_flags & RMODINPROGRESS) && 4154 rp->r_modaddr + MAXBSIZE > io_off && 4155 rp->r_modaddr < io_off + io_len) { 4156 page_t *plist; 4157 /* 4158 * A write is in progress for this region of the file. 4159 * If we did not detect RMODINPROGRESS here then this 4160 * path through nfs_putapage() would eventually go to 4161 * nfs(3)_bio() and may not write out all of the data 4162 * in the pages. We end up losing data. 
So we decide 4163 * to set the modified bit on each page in the page 4164 * list and mark the rnode with RDIRTY. This write 4165 * will be restarted at some later time. 4166 */ 4167 plist = pp; 4168 while (plist != NULL) { 4169 pp = plist; 4170 page_sub(&plist, pp); 4171 hat_setmod(pp); 4172 page_io_unlock(pp); 4173 page_unlock(pp); 4174 } 4175 rp->r_flags |= RDIRTY; 4176 mutex_exit(&rp->r_statelock); 4177 if (offp) 4178 *offp = io_off; 4179 if (lenp) 4180 *lenp = io_len; 4181 return (0); 4182 } 4183 mutex_exit(&rp->r_statelock); 4184 } 4185 4186 if (flags & B_ASYNC) { 4187 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr, 4188 nfs_sync_putapage); 4189 } else 4190 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr); 4191 4192 if (offp) 4193 *offp = io_off; 4194 if (lenp) 4195 *lenp = io_len; 4196 return (error); 4197 } 4198 4199 static int 4200 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4201 int flags, cred_t *cr) 4202 { 4203 int error; 4204 rnode_t *rp; 4205 4206 flags |= B_WRITE; 4207 4208 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4209 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4210 4211 rp = VTOR(vp); 4212 4213 if ((error == ENOSPC || error == EDQUOT || error == EACCES) && 4214 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 4215 if (!(rp->r_flags & ROUTOFSPACE)) { 4216 mutex_enter(&rp->r_statelock); 4217 rp->r_flags |= ROUTOFSPACE; 4218 mutex_exit(&rp->r_statelock); 4219 } 4220 flags |= B_ERROR; 4221 pvn_write_done(pp, flags); 4222 /* 4223 * If this was not an async thread, then try again to 4224 * write out the pages, but this time, also destroy 4225 * them whether or not the write is successful. This 4226 * will prevent memory from filling up with these 4227 * pages and destroying them is the only alternative 4228 * if they can't be written out. 4229 * 4230 * Don't do this if this is an async thread because 4231 * when the pages are unlocked in pvn_write_done, 4232 * some other thread could have come along, locked 4233 * them, and queued for an async thread. It would be 4234 * possible for all of the async threads to be tied 4235 * up waiting to lock the pages again and they would 4236 * all already be locked and waiting for an async 4237 * thread to handle them. Deadlock. 4238 */ 4239 if (!(flags & B_ASYNC)) { 4240 error = nfs_putpage(vp, io_off, io_len, 4241 B_INVAL | B_FORCE, cr, NULL); 4242 } 4243 } else { 4244 if (error) 4245 flags |= B_ERROR; 4246 else if (rp->r_flags & ROUTOFSPACE) { 4247 mutex_enter(&rp->r_statelock); 4248 rp->r_flags &= ~ROUTOFSPACE; 4249 mutex_exit(&rp->r_statelock); 4250 } 4251 pvn_write_done(pp, flags); 4252 } 4253 4254 return (error); 4255 } 4256 4257 /* ARGSUSED */ 4258 static int 4259 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4260 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4261 caller_context_t *ct) 4262 { 4263 struct segvn_crargs vn_a; 4264 int error; 4265 rnode_t *rp; 4266 struct vattr va; 4267 4268 if (nfs_zone() != VTOMI(vp)->mi_zone) 4269 return (EIO); 4270 4271 if (vp->v_flag & VNOMAP) 4272 return (ENOSYS); 4273 4274 if (off > MAXOFF32_T) 4275 return (EFBIG); 4276 4277 if (off < 0 || off + len < 0) 4278 return (ENXIO); 4279 4280 if (vp->v_type != VREG) 4281 return (ENODEV); 4282 4283 /* 4284 * If there is cached data and if close-to-open consistency 4285 * checking is not turned off and if the file system is not 4286 * mounted readonly, then force an over the wire getattr. 
4287 * Otherwise, just invoke nfsgetattr to get a copy of the
4288 * attributes. The attribute cache will be used unless it has
4289 * timed out, in which case an over the wire getattr will be
4290 * issued.
4291 */
4292 va.va_mask = AT_ALL;
4293 if (vn_has_cached_data(vp) &&
4294 !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4295 error = nfs_getattr_otw(vp, &va, cr);
4296 else
4297 error = nfsgetattr(vp, &va, cr);
4298 if (error)
4299 return (error);
4300
4301 /*
4302 * Check to see if the vnode is currently marked as not cacheable.
4303 * This means portions of the file are locked (through VOP_FRLOCK).
4304 * In this case the map request must be refused. We use
4305 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4306 */
4307 rp = VTOR(vp);
4308
4309 /*
4310 * Atomically increment r_inmap after acquiring r_rwlock. The
4311 * idea here is to acquire r_rwlock to block read/write and
4312 * not to protect r_inmap. r_inmap will inform nfs_read/write()
4313 * that we are in nfs_map(). Now r_rwlock is acquired in the
4314 * correct order, preventing the deadlock that would have
4315 * occurred had nfs_addmap() acquired it out of order.
4316 *
4317 * Since we are not protecting r_inmap by any lock, we do not
4318 * hold any lock when we decrement it. We atomically decrement
4319 * r_inmap after we release r_lkserlock.
4320 */
4321
4322 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4323 return (EINTR);
4324 atomic_add_int(&rp->r_inmap, 1);
4325 nfs_rw_exit(&rp->r_rwlock);
4326
4327 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4328 atomic_add_int(&rp->r_inmap, -1);
4329 return (EINTR);
4330 }
4331 if (vp->v_flag & VNOCACHE) {
4332 error = EAGAIN;
4333 goto done;
4334 }
4335
4336 /*
4337 * Don't allow concurrent locks and mapping if mandatory locking is
4338 * enabled. (See the mmap() sketch following nfs_shrlock() for the userland view of this check.)
4339 */ 4340 if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) && 4341 MANDLOCK(vp, va.va_mode)) { 4342 error = EAGAIN; 4343 goto done; 4344 } 4345 4346 as_rangelock(as); 4347 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4348 if (error != 0) { 4349 as_rangeunlock(as); 4350 goto done; 4351 } 4352 4353 vn_a.vp = vp; 4354 vn_a.offset = off; 4355 vn_a.type = (flags & MAP_TYPE); 4356 vn_a.prot = (uchar_t)prot; 4357 vn_a.maxprot = (uchar_t)maxprot; 4358 vn_a.flags = (flags & ~MAP_TYPE); 4359 vn_a.cred = cr; 4360 vn_a.amp = NULL; 4361 vn_a.szc = 0; 4362 vn_a.lgrp_mem_policy_flags = 0; 4363 4364 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4365 as_rangeunlock(as); 4366 4367 done: 4368 nfs_rw_exit(&rp->r_lkserlock); 4369 atomic_add_int(&rp->r_inmap, -1); 4370 return (error); 4371 } 4372 4373 /* ARGSUSED */ 4374 static int 4375 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4376 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4377 caller_context_t *ct) 4378 { 4379 rnode_t *rp; 4380 4381 if (vp->v_flag & VNOMAP) 4382 return (ENOSYS); 4383 if (nfs_zone() != VTOMI(vp)->mi_zone) 4384 return (EIO); 4385 4386 rp = VTOR(vp); 4387 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 4388 4389 return (0); 4390 } 4391 4392 /* ARGSUSED */ 4393 static int 4394 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, 4395 struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) 4396 { 4397 netobj lm_fh; 4398 int rc; 4399 u_offset_t start, end; 4400 rnode_t *rp; 4401 int error = 0, intr = INTR(vp); 4402 4403 /* check for valid cmd parameter */ 4404 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 4405 return (EINVAL); 4406 if (nfs_zone() != VTOMI(vp)->mi_zone) 4407 return (EIO); 4408 4409 /* Verify l_type. */ 4410 switch (bfp->l_type) { 4411 case F_RDLCK: 4412 if (cmd != F_GETLK && !(flag & FREAD)) 4413 return (EBADF); 4414 break; 4415 case F_WRLCK: 4416 if (cmd != F_GETLK && !(flag & FWRITE)) 4417 return (EBADF); 4418 break; 4419 case F_UNLCK: 4420 intr = 0; 4421 break; 4422 4423 default: 4424 return (EINVAL); 4425 } 4426 4427 /* check the validity of the lock range */ 4428 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 4429 return (rc); 4430 if (rc = flk_check_lock_data(start, end, MAXOFF32_T)) 4431 return (rc); 4432 4433 /* 4434 * If the filesystem is mounted using local locking, pass the 4435 * request off to the local locking code. 4436 */ 4437 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 4438 if (offset > MAXOFF32_T) 4439 return (EFBIG); 4440 if (cmd == F_SETLK || cmd == F_SETLKW) { 4441 /* 4442 * For complete safety, we should be holding 4443 * r_lkserlock. However, we can't call 4444 * lm_safelock and then fs_frlock while 4445 * holding r_lkserlock, so just invoke 4446 * lm_safelock and expect that this will 4447 * catch enough of the cases. 4448 */ 4449 if (!lm_safelock(vp, bfp, cr)) 4450 return (EAGAIN); 4451 } 4452 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4453 } 4454 4455 rp = VTOR(vp); 4456 4457 /* 4458 * Check whether the given lock request can proceed, given the 4459 * current file mappings. 4460 */ 4461 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 4462 return (EINTR); 4463 if (cmd == F_SETLK || cmd == F_SETLKW) { 4464 if (!lm_safelock(vp, bfp, cr)) { 4465 rc = EAGAIN; 4466 goto done; 4467 } 4468 } 4469 4470 /* 4471 * Flush the cache after waiting for async I/O to finish. 
For new 4472 * locks, this is so that the process gets the latest bits from the 4473 * server. For unlocks, this is so that other clients see the 4474 * latest bits once the file has been unlocked. If currently dirty 4475 * pages can't be flushed, then don't allow a lock to be set. But 4476 * allow unlocks to succeed, to avoid having orphan locks on the 4477 * server. 4478 */ 4479 if (cmd != F_GETLK) { 4480 mutex_enter(&rp->r_statelock); 4481 while (rp->r_count > 0) { 4482 if (intr) { 4483 klwp_t *lwp = ttolwp(curthread); 4484 4485 if (lwp != NULL) 4486 lwp->lwp_nostop++; 4487 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) 4488 == 0) { 4489 if (lwp != NULL) 4490 lwp->lwp_nostop--; 4491 rc = EINTR; 4492 break; 4493 } 4494 if (lwp != NULL) 4495 lwp->lwp_nostop--; 4496 } else 4497 cv_wait(&rp->r_cv, &rp->r_statelock); 4498 } 4499 mutex_exit(&rp->r_statelock); 4500 if (rc != 0) 4501 goto done; 4502 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 4503 if (error) { 4504 if (error == ENOSPC || error == EDQUOT) { 4505 mutex_enter(&rp->r_statelock); 4506 if (!rp->r_error) 4507 rp->r_error = error; 4508 mutex_exit(&rp->r_statelock); 4509 } 4510 if (bfp->l_type != F_UNLCK) { 4511 rc = ENOLCK; 4512 goto done; 4513 } 4514 } 4515 } 4516 4517 lm_fh.n_len = sizeof (fhandle_t); 4518 lm_fh.n_bytes = (char *)VTOFH(vp); 4519 4520 /* 4521 * Call the lock manager to do the real work of contacting 4522 * the server and obtaining the lock. 4523 */ 4524 rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp); 4525 4526 if (rc == 0) 4527 nfs_lockcompletion(vp, cmd); 4528 4529 done: 4530 nfs_rw_exit(&rp->r_lkserlock); 4531 return (rc); 4532 } 4533 4534 /* 4535 * Free storage space associated with the specified vnode. The portion 4536 * to be freed is specified by bfp->l_start and bfp->l_len (already 4537 * normalized to a "whence" of 0). 4538 * 4539 * This is an experimental facility whose continued existence is not 4540 * guaranteed. Currently, we only support the special case 4541 * of l_len == 0, meaning free to end of file. 4542 */ 4543 /* ARGSUSED */ 4544 static int 4545 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 4546 offset_t offset, cred_t *cr, caller_context_t *ct) 4547 { 4548 int error; 4549 4550 ASSERT(vp->v_type == VREG); 4551 if (cmd != F_FREESP) 4552 return (EINVAL); 4553 4554 if (offset > MAXOFF32_T) 4555 return (EFBIG); 4556 4557 if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) || 4558 (bfp->l_len > MAXOFF32_T)) 4559 return (EFBIG); 4560 4561 if (nfs_zone() != VTOMI(vp)->mi_zone) 4562 return (EIO); 4563 4564 error = convoff(vp, bfp, 0, offset); 4565 if (!error) { 4566 ASSERT(bfp->l_start >= 0); 4567 if (bfp->l_len == 0) { 4568 struct vattr va; 4569 4570 /* 4571 * ftruncate should not change the ctime and 4572 * mtime if we truncate the file to its 4573 * previous size. 4574 */ 4575 va.va_mask = AT_SIZE; 4576 error = nfsgetattr(vp, &va, cr); 4577 if (error || va.va_size == bfp->l_start) 4578 return (error); 4579 va.va_mask = AT_SIZE; 4580 va.va_size = bfp->l_start; 4581 error = nfssetattr(vp, &va, 0, cr); 4582 } else 4583 error = EINVAL; 4584 } 4585 4586 return (error); 4587 } 4588 4589 /* ARGSUSED */ 4590 static int 4591 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 4592 { 4593 4594 return (EINVAL); 4595 } 4596 4597 /* 4598 * Setup and add an address space callback to do the work of the delmap call. 4599 * The callback will (and must be) deleted in the actual callback function. 
4600 * 4601 * This is done in order to take care of the problem that we have with holding 4602 * the address space's a_lock for a long period of time (e.g. if the NFS server 4603 * is down). Callbacks will be executed in the address space code while the 4604 * a_lock is not held. Holding the address space's a_lock causes things such 4605 * as ps and fork to hang because they are trying to acquire this lock as well. 4606 */ 4607 /* ARGSUSED */ 4608 static int 4609 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4610 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4611 caller_context_t *ct) 4612 { 4613 int caller_found; 4614 int error; 4615 rnode_t *rp; 4616 nfs_delmap_args_t *dmapp; 4617 nfs_delmapcall_t *delmap_call; 4618 4619 if (vp->v_flag & VNOMAP) 4620 return (ENOSYS); 4621 /* 4622 * A process may not change zones if it has NFS pages mmap'ed 4623 * in, so we can't legitimately get here from the wrong zone. 4624 */ 4625 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4626 4627 rp = VTOR(vp); 4628 4629 /* 4630 * The way that the address space of this process deletes its mapping 4631 * of this file is via the following call chains: 4632 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4633 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap() 4634 * 4635 * With the use of address space callbacks we are allowed to drop the 4636 * address space lock, a_lock, while executing the NFS operations that 4637 * need to go over the wire. Returning EAGAIN to the caller of this 4638 * function is what drives the execution of the callback that we add 4639 * below. The callback will be executed by the address space code 4640 * after dropping the a_lock. When the callback is finished, since 4641 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 4642 * is called again on the same segment to finish the rest of the work 4643 * that needs to happen during unmapping. 4644 * 4645 * This action of calling back into the segment driver causes 4646 * nfs_delmap() to get called again, but since the callback was 4647 * already executed at this point, it already did the work and there 4648 * is nothing left for us to do. 4649 * 4650 * To Summarize: 4651 * - The first time nfs_delmap is called by the current thread is when 4652 * we add the caller associated with this delmap to the delmap caller 4653 * list, add the callback, and return EAGAIN. 4654 * - The second time in this call chain when nfs_delmap is called we 4655 * will find this caller in the delmap caller list and realize there 4656 * is no more work to do thus removing this caller from the list and 4657 * returning the error that was set in the callback execution. 4658 */ 4659 caller_found = nfs_find_and_delete_delmapcall(rp, &error); 4660 if (caller_found) { 4661 /* 4662 * 'error' is from the actual delmap operations. To avoid 4663 * hangs, we need to handle the return of EAGAIN differently 4664 * since this is what drives the callback execution. 4665 * In this case, we don't want to return EAGAIN and do the 4666 * callback execution because there are none to execute. 
4667 */ 4668 if (error == EAGAIN) 4669 return (0); 4670 else 4671 return (error); 4672 } 4673 4674 /* current caller was not in the list */ 4675 delmap_call = nfs_init_delmapcall(); 4676 4677 mutex_enter(&rp->r_statelock); 4678 list_insert_tail(&rp->r_indelmap, delmap_call); 4679 mutex_exit(&rp->r_statelock); 4680 4681 dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP); 4682 4683 dmapp->vp = vp; 4684 dmapp->off = off; 4685 dmapp->addr = addr; 4686 dmapp->len = len; 4687 dmapp->prot = prot; 4688 dmapp->maxprot = maxprot; 4689 dmapp->flags = flags; 4690 dmapp->cr = cr; 4691 dmapp->caller = delmap_call; 4692 4693 error = as_add_callback(as, nfs_delmap_callback, dmapp, 4694 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 4695 4696 return (error ? error : EAGAIN); 4697 } 4698 4699 /* 4700 * Remove some pages from an mmap'd vnode. Just update the 4701 * count of pages. If doing close-to-open, then flush all 4702 * of the pages associated with this file. Otherwise, start 4703 * an asynchronous page flush to write out any dirty pages. 4704 * This will also associate a credential with the rnode which 4705 * can be used to write the pages. 4706 */ 4707 /* ARGSUSED */ 4708 static void 4709 nfs_delmap_callback(struct as *as, void *arg, uint_t event) 4710 { 4711 int error; 4712 rnode_t *rp; 4713 mntinfo_t *mi; 4714 nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg; 4715 4716 rp = VTOR(dmapp->vp); 4717 mi = VTOMI(dmapp->vp); 4718 4719 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 4720 ASSERT(rp->r_mapcnt >= 0); 4721 4722 /* 4723 * Initiate a page flush if there are pages, the file system 4724 * was not mounted readonly, the segment was mapped shared, and 4725 * the pages themselves were writeable. 4726 */ 4727 if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) && 4728 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 4729 mutex_enter(&rp->r_statelock); 4730 rp->r_flags |= RDIRTY; 4731 mutex_exit(&rp->r_statelock); 4732 /* 4733 * If this is a cross-zone access a sync putpage won't work, so 4734 * the best we can do is try an async putpage. That seems 4735 * better than something more draconian such as discarding the 4736 * dirty pages. 4737 */ 4738 if ((mi->mi_flags & MI_NOCTO) || 4739 nfs_zone() != mi->mi_zone) 4740 error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4741 B_ASYNC, dmapp->cr, NULL); 4742 else 4743 error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4744 0, dmapp->cr, NULL); 4745 if (!error) { 4746 mutex_enter(&rp->r_statelock); 4747 error = rp->r_error; 4748 rp->r_error = 0; 4749 mutex_exit(&rp->r_statelock); 4750 } 4751 } else 4752 error = 0; 4753 4754 if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) 4755 (void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len, 4756 B_INVAL, dmapp->cr, NULL); 4757 4758 dmapp->caller->error = error; 4759 (void) as_delete_callback(as, arg); 4760 kmem_free(dmapp, sizeof (nfs_delmap_args_t)); 4761 } 4762 4763 /* ARGSUSED */ 4764 static int 4765 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4766 caller_context_t *ct) 4767 { 4768 int error = 0; 4769 4770 if (nfs_zone() != VTOMI(vp)->mi_zone) 4771 return (EIO); 4772 /* 4773 * This looks a little weird because it's written in a general 4774 * manner but we make little use of cases. If cntl() ever gets 4775 * widely used, the outer switch will make more sense. 4776 */ 4777 4778 switch (cmd) { 4779 4780 /* 4781 * Large file spec - need to base answer new query with 4782 * hardcoded constant based on the protocol. 
4783 */ 4784 case _PC_FILESIZEBITS: 4785 *valp = 32; 4786 return (0); 4787 4788 case _PC_LINK_MAX: 4789 case _PC_NAME_MAX: 4790 case _PC_PATH_MAX: 4791 case _PC_SYMLINK_MAX: 4792 case _PC_CHOWN_RESTRICTED: 4793 case _PC_NO_TRUNC: { 4794 mntinfo_t *mi; 4795 struct pathcnf *pc; 4796 4797 if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL) 4798 return (EINVAL); 4799 error = _PC_ISSET(cmd, pc->pc_mask); /* error or bool */ 4800 switch (cmd) { 4801 case _PC_LINK_MAX: 4802 *valp = pc->pc_link_max; 4803 break; 4804 case _PC_NAME_MAX: 4805 *valp = pc->pc_name_max; 4806 break; 4807 case _PC_PATH_MAX: 4808 case _PC_SYMLINK_MAX: 4809 *valp = pc->pc_path_max; 4810 break; 4811 case _PC_CHOWN_RESTRICTED: 4812 /* 4813 * if we got here, error is really a boolean which 4814 * indicates whether cmd is set or not. 4815 */ 4816 *valp = error ? 1 : 0; /* see above */ 4817 error = 0; 4818 break; 4819 case _PC_NO_TRUNC: 4820 /* 4821 * if we got here, error is really a boolean which 4822 * indicates whether cmd is set or not. 4823 */ 4824 *valp = error ? 1 : 0; /* see above */ 4825 error = 0; 4826 break; 4827 } 4828 return (error ? EINVAL : 0); 4829 } 4830 4831 case _PC_XATTR_EXISTS: 4832 *valp = 0; 4833 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 4834 vnode_t *avp; 4835 rnode_t *rp; 4836 mntinfo_t *mi = VTOMI(vp); 4837 4838 if (!(mi->mi_flags & MI_EXTATTR)) 4839 return (0); 4840 4841 rp = VTOR(vp); 4842 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, 4843 INTR(vp))) 4844 return (EINTR); 4845 4846 error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr); 4847 if (error || avp == NULL) 4848 error = acl_getxattrdir2(vp, &avp, 0, cr, 0); 4849 4850 nfs_rw_exit(&rp->r_rwlock); 4851 4852 if (error == 0 && avp != NULL) { 4853 error = do_xattr_exists_check(avp, valp, cr); 4854 VN_RELE(avp); 4855 } 4856 } 4857 return (error ? EINVAL : 0); 4858 4859 case _PC_ACL_ENABLED: 4860 *valp = _ACL_ACLENT_ENABLED; 4861 return (0); 4862 4863 default: 4864 return (EINVAL); 4865 } 4866 } 4867 4868 /* 4869 * Called by async thread to do synchronous pageio. Do the i/o, wait 4870 * for it to complete, and cleanup the page list when done. 4871 */ 4872 static int 4873 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4874 int flags, cred_t *cr) 4875 { 4876 int error; 4877 4878 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4879 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4880 if (flags & B_READ) 4881 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 4882 else 4883 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 4884 return (error); 4885 } 4886 4887 /* ARGSUSED */ 4888 static int 4889 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 4890 int flags, cred_t *cr, caller_context_t *ct) 4891 { 4892 int error; 4893 rnode_t *rp; 4894 4895 if (pp == NULL) 4896 return (EINVAL); 4897 4898 if (io_off > MAXOFF32_T) 4899 return (EFBIG); 4900 if (nfs_zone() != VTOMI(vp)->mi_zone) 4901 return (EIO); 4902 rp = VTOR(vp); 4903 mutex_enter(&rp->r_statelock); 4904 rp->r_count++; 4905 mutex_exit(&rp->r_statelock); 4906 4907 if (flags & B_ASYNC) { 4908 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr, 4909 nfs_sync_pageio); 4910 } else 4911 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 4912 mutex_enter(&rp->r_statelock); 4913 rp->r_count--; 4914 cv_broadcast(&rp->r_cv); 4915 mutex_exit(&rp->r_statelock); 4916 return (error); 4917 } 4918 4919 /* ARGSUSED */ 4920 static int 4921 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 4922 caller_context_t *ct) 4923 { 4924 int error; 4925 mntinfo_t *mi; 4926 4927 mi = VTOMI(vp); 4928 4929 if (nfs_zone() != mi->mi_zone) 4930 return (EIO); 4931 if (mi->mi_flags & MI_ACL) { 4932 error = acl_setacl2(vp, vsecattr, flag, cr); 4933 if (mi->mi_flags & MI_ACL) 4934 return (error); 4935 } 4936 4937 return (ENOSYS); 4938 } 4939 4940 /* ARGSUSED */ 4941 static int 4942 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 4943 caller_context_t *ct) 4944 { 4945 int error; 4946 mntinfo_t *mi; 4947 4948 mi = VTOMI(vp); 4949 4950 if (nfs_zone() != mi->mi_zone) 4951 return (EIO); 4952 if (mi->mi_flags & MI_ACL) { 4953 error = acl_getacl2(vp, vsecattr, flag, cr); 4954 if (mi->mi_flags & MI_ACL) 4955 return (error); 4956 } 4957 4958 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 4959 } 4960 4961 /* ARGSUSED */ 4962 static int 4963 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 4964 caller_context_t *ct) 4965 { 4966 int error; 4967 struct shrlock nshr; 4968 struct nfs_owner nfs_owner; 4969 netobj lm_fh; 4970 4971 if (nfs_zone() != VTOMI(vp)->mi_zone) 4972 return (EIO); 4973 4974 /* 4975 * check for valid cmd parameter 4976 */ 4977 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 4978 return (EINVAL); 4979 4980 /* 4981 * Check access permissions 4982 */ 4983 if (cmd == F_SHARE && 4984 (((shr->s_access & F_RDACC) && !(flag & FREAD)) || 4985 ((shr->s_access & F_WRACC) && !(flag & FWRITE)))) 4986 return (EBADF); 4987 4988 /* 4989 * If the filesystem is mounted using local locking, pass the 4990 * request off to the local share code. 4991 */ 4992 if (VTOMI(vp)->mi_flags & MI_LLOCK) 4993 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 4994 4995 switch (cmd) { 4996 case F_SHARE: 4997 case F_UNSHARE: 4998 lm_fh.n_len = sizeof (fhandle_t); 4999 lm_fh.n_bytes = (char *)VTOFH(vp); 5000 5001 /* 5002 * If passed an owner that is too large to fit in an 5003 * nfs_owner it is likely a recursive call from the 5004 * lock manager client and pass it straight through. If 5005 * it is not a nfs_owner then simply return an error. 
5006 */ 5007 if (shr->s_own_len > sizeof (nfs_owner.lowner)) { 5008 if (((struct nfs_owner *)shr->s_owner)->magic != 5009 NFS_OWNER_MAGIC) 5010 return (EINVAL); 5011 5012 if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) { 5013 error = set_errno(error); 5014 } 5015 return (error); 5016 } 5017 /* 5018 * Remote share reservations owner is a combination of 5019 * a magic number, hostname, and the local owner 5020 */ 5021 bzero(&nfs_owner, sizeof (nfs_owner)); 5022 nfs_owner.magic = NFS_OWNER_MAGIC; 5023 (void) strncpy(nfs_owner.hname, uts_nodename(), 5024 sizeof (nfs_owner.hname)); 5025 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len); 5026 nshr.s_access = shr->s_access; 5027 nshr.s_deny = shr->s_deny; 5028 nshr.s_sysid = 0; 5029 nshr.s_pid = ttoproc(curthread)->p_pid; 5030 nshr.s_own_len = sizeof (nfs_owner); 5031 nshr.s_owner = (caddr_t)&nfs_owner; 5032 5033 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) { 5034 error = set_errno(error); 5035 } 5036 5037 break; 5038 5039 case F_HASREMOTELOCKS: 5040 /* 5041 * NFS client can't store remote locks itself 5042 */ 5043 shr->s_access = 0; 5044 error = 0; 5045 break; 5046 5047 default: 5048 error = EINVAL; 5049 break; 5050 } 5051 5052 return (error); 5053 } 5054
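
/*
 * Illustrative sketch of the writer side of the RMODINPROGRESS handshake
 * that nfs_putapage() tests for above.  This is not the actual writerp()
 * implementation; example_write_region() and uiomove_pages() are
 * hypothetical stand-ins, shown only to make the ordering of the flag,
 * the data copy, and the r_size update concrete.
 */
#if 0
static int
example_write_region(rnode_t *rp, struct uio *uiop, u_offset_t off, size_t len)
{
	int error;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= RMODINPROGRESS;	/* nfs(3)_bio() must not trust r_size */
	rp->r_modaddr = off;		/* region being modified */
	mutex_exit(&rp->r_statelock);

	error = uiomove_pages(uiop, len);	/* hypothetical copy step */

	mutex_enter(&rp->r_statelock);
	if (error == 0 && off + len > rp->r_size)
		rp->r_size = off + len;	/* r_size now matches the new data */
	rp->r_flags &= ~RMODINPROGRESS;	/* nfs(3)_bio() may use r_size again */
	mutex_exit(&rp->r_statelock);

	return (error);
}
#endif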
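
/*
 * Illustrative userland sketch of the mandatory-locking check in nfs_map()
 * above: if the file has mandatory locking enabled and a record lock is
 * held (or being waited for), the client refuses the mapping and mmap(2)
 * fails with EAGAIN.  The path /mnt/locked is a hypothetical file on an
 * NFS mount.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	int fd = open("/mnt/locked", O_RDWR);
	void *p;

	if (fd < 0)
		return (1);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED && errno == EAGAIN)
		(void) printf("file is locked with mandatory locking; "
		    "mapping refused\n");
	else if (p != MAP_FAILED)
		(void) munmap(p, 4096);
	(void) close(fd);
	return (0);
}
#endif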
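
/*
 * Illustrative userland sketch of what nfs_frlock() above implies for
 * applications: before a record lock is granted, cached pages are flushed
 * and invalidated, so data read while the lock is held reflects the
 * server's copy.  lock_first_kilobyte() is a hypothetical helper.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
lock_first_kilobyte(int fd)
{
	struct flock fl;

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 1024;

	/* Blocks until granted; the cache flush happens before this returns. */
	return (fcntl(fd, F_SETLKW, &fl));
}
#endif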
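
/*
 * Illustrative userland sketch of the one case nfs_space() above supports:
 * F_FREESP with l_len == 0, i.e. free from l_start to end of file, which
 * is how ftruncate() is typically implemented on this system.  A non-zero
 * l_len is rejected with EINVAL.  free_to_eof() is a hypothetical helper.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
free_to_eof(int fd, off_t new_size)
{
	struct flock fl;

	fl.l_whence = SEEK_SET;
	fl.l_start = new_size;	/* everything from this offset on is freed */
	fl.l_len = 0;		/* 0 means "to end of file" */

	return (fcntl(fd, F_FREESP, &fl));
}
#endif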
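
/*
 * Illustrative sketch of the two-pass delmap protocol described above,
 * seen from the caller's side.  The real driver of these calls is
 * segvn_unmap() through the address space layer; unmap_region() and its
 * argument values are hypothetical and only show the EAGAIN round trip.
 */
#if 0
static int
unmap_region(struct as *as, vnode_t *vp, caddr_t addr, size_t len, cred_t *cr)
{
	int error;

	/*
	 * First call: nfs_delmap() records this caller on r_indelmap,
	 * registers nfs_delmap_callback() with as_add_callback(), and
	 * returns EAGAIN so the caller drops a_lock and lets the
	 * callback do its over-the-wire work outside that lock.
	 */
	error = VOP_DELMAP(vp, (offset_t)0, as, addr, len, PROT_ALL,
	    PROT_ALL, MAP_SHARED, cr, NULL);
	if (error != EAGAIN)
		return (error);

	/* ... a_lock dropped, nfs_delmap_callback() runs, a_lock retaken ... */

	/*
	 * Second call: the caller is found on r_indelmap, so nfs_delmap()
	 * simply returns whatever error the callback recorded.
	 */
	return (VOP_DELMAP(vp, (offset_t)0, as, addr, len, PROT_ALL,
	    PROT_ALL, MAP_SHARED, cr, NULL));
}
#endif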
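
/*
 * Illustrative userland sketch of the _PC_FILESIZEBITS answer hardcoded in
 * nfs_pathconf() above: NFS version 2 carries 32-bit file offsets, so
 * pathconf() reports 32.  The path /mnt/file is hypothetical.
 */
#if 0
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	long bits = pathconf("/mnt/file", _PC_FILESIZEBITS);

	if (bits > 0)
		(void) printf("file offsets limited to %ld bits\n", bits);
	return (0);
}
#endif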