/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int	nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how *);
static int	nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
static int	nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs3_accessx(void *, int, cred_t *);
static int	nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
static int	nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
static int	nfs3_bio(struct buf *, stable_how *, cred_t *);
static int	nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_commit(vnode_t *, offset3, count3, cred_t *);
static void	nfs3_set_mod(vnode_t *);
static void	nfs3_get_commit(vnode_t *);
static void	nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
#if 0 /* unused */
#ifdef DEBUG
static int	nfs3_no_uncommitted_pages(vnode_t *);
#endif
#endif /* unused */
static int	nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs3_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *);
static int	nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_delmap_callback(struct as *, void *, uint_t);

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bit */
#define	ALIGN64(x, ptr, sz)					\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
	if (x) {						\
		x = sizeof (uint64_t) - (x);			\
		sz -= (x);					\
		ptr += (x);					\
	}
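/*
 * A minimal usage sketch for ALIGN64, kept out of the build: before
 * overlaying data with 64-bit fields onto a raw byte buffer, the pointer
 * is rounded up to an 8-byte boundary and the usable length is reduced by
 * the bytes skipped.  The names "buf" and "buflen" below are hypothetical
 * and only serve to illustrate the macro's contract.
 */
#if 0
static void
align64_example(caddr_t buf, size_t buflen)
{
	long x;

	ALIGN64(x, buf, buflen);
	/* buf is now 64-bit aligned; buflen excludes the skipped bytes */
}
#endif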
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then calling the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int	nfs3_open(vnode_t **, int, cred_t *);
static int	nfs3_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs3_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs3_access(vnode_t *, int, int, cred_t *);
static int	nfs3_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs3_fsync(vnode_t *, int, cred_t *);
static void	nfs3_inactive(vnode_t *, cred_t *);
static int	nfs3_lookup(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *);
static int	nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs3_remove(vnode_t *, char *, cred_t *);
static int	nfs3_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs3_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs3_fid(vnode_t *, fid_t *);
static int	nfs3_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs3_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs3_seek(vnode_t *, offset_t, offset_t *);
static int	nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs3_realvp(vnode_t *, vnode_t **);
static int	nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static void	nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *);
static int	nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);

struct vnodeops *nfs3_vnodeops;

const fs_operation_def_t nfs3_vnodeops_template[] = {
	VOPNAME_OPEN, nfs3_open,
	VOPNAME_CLOSE, nfs3_close,
	VOPNAME_READ, nfs3_read,
	VOPNAME_WRITE, nfs3_write,
	VOPNAME_IOCTL, nfs3_ioctl,
	VOPNAME_GETATTR, nfs3_getattr,
	VOPNAME_SETATTR, nfs3_setattr,
	VOPNAME_ACCESS, nfs3_access,
	VOPNAME_LOOKUP, nfs3_lookup,
	VOPNAME_CREATE, nfs3_create,
	VOPNAME_REMOVE, nfs3_remove,
	VOPNAME_LINK, nfs3_link,
	VOPNAME_RENAME, nfs3_rename,
	VOPNAME_MKDIR, nfs3_mkdir,
	VOPNAME_RMDIR, nfs3_rmdir,
	VOPNAME_READDIR, nfs3_readdir,
	VOPNAME_SYMLINK, nfs3_symlink,
	VOPNAME_READLINK, nfs3_readlink,
	VOPNAME_FSYNC, nfs3_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) nfs3_inactive,
	VOPNAME_FID, nfs3_fid,
	VOPNAME_RWLOCK, nfs3_rwlock,
	VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs3_rwunlock,
	VOPNAME_SEEK, nfs3_seek,
	VOPNAME_FRLOCK, nfs3_frlock,
	VOPNAME_SPACE, nfs3_space,
	VOPNAME_REALVP, nfs3_realvp,
	VOPNAME_GETPAGE, nfs3_getpage,
	VOPNAME_PUTPAGE, nfs3_putpage,
	VOPNAME_MAP, (fs_generic_func_p) nfs3_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) nfs3_addmap,
	VOPNAME_DELMAP, nfs3_delmap,
	VOPNAME_DUMP, nfs_dump,		/* there is no separate nfs3_dump */
	VOPNAME_PATHCONF, nfs3_pathconf,
	VOPNAME_PAGEIO, nfs3_pageio,
	VOPNAME_DISPOSE, (fs_generic_func_p) nfs3_dispose,
	VOPNAME_SETSECATTR, nfs3_setsecattr,
	VOPNAME_GETSECATTR, nfs3_getsecattr,
	VOPNAME_SHRLOCK, nfs3_shrlock,
	NULL, NULL
};
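/*
 * A minimal sketch, kept out of the build, of how a template like the one
 * above is typically turned into a working vnodeops structure via the
 * standard vn_make_ops() interface at file system initialization time.
 * The caller name "nfs3_example_init" is hypothetical; the real
 * registration for this module happens elsewhere.
 */
#if 0
static int
nfs3_example_init(void)
{
	/* builds nfs3_vnodeops from nfs3_vnodeops_template */
	return (vn_make_ops("nfs3", nfs3_vnodeops_template, &nfs3_vnodeops));
}
#endif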
/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs3_getvnodeops(void)
{
	return (nfs3_vnodeops);
}

/* ARGSUSED */
static int
nfs3_open(vnode_t **vpp, int flag, cred_t *cr)
{
	int error;
	struct vattr va;
	rnode_t *rp;
	vnode_t *vp;

	vp = *vpp;
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (rp->r_cred == NULL) {
		crhold(cr);
		rp->r_cred = cr;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If there is no cached data or if close-to-open
	 * consistency checking is turned off, we can avoid
	 * the over the wire getattr.  Otherwise, if the
	 * file system is mounted readonly, then just verify
	 * the caches are up to date using the normal mechanism.
	 * Else, if the file is not mmap'd, then just mark
	 * the attributes as timed out.  They will be refreshed
	 * and the caches validated prior to being used.
	 * Else, the file system is mounted writeable so
	 * force an over the wire GETATTR in order to ensure
	 * that all cached data is valid.
	 */
	if (vp->v_count > 1 ||
	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
		if (vn_is_readonly(vp))
			error = nfs3_validate_caches(vp, cr);
		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
			PURGE_ATTRCACHE(vp);
			error = 0;
		} else {
			va.va_mask = AT_ALL;
			error = nfs3_getattr_otw(vp, &va, cr);
		}
	} else
		error = 0;

	return (error);
}
static int
nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	rnode_t *rp;
	int error;
	struct vattr va;

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to
		 * another zone will fail, and one can't lock an NFS file
		 * and then change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs3_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the commit and final nfs3_getattr_otw()
		 * call) should be OK. If the zone is going away
		 * nfs_async_inactive() will throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled quicker
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages, then:
	 * if the file system was mounted using the "no-close-to-open"
	 * semantics, start an asynchronous flush of all of the pages
	 * in the file; otherwise, do a synchronous flush and commit of
	 * all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if (VTOMI(vp)->mi_flags & MI_NOCTO) {
			error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs3_getattr_otw(vp, &va, cr);

	return (error);
}
/* ARGSUSED */
static int
nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	mntinfo_t *mi;
	READ3args args;
	READ3uiores res;
	int tsize;
	offset_t offset;
	ssize_t count;
	int error;
	int douprintf;
	failinfo_t fi;
	char *sv_hostname;

	mi = VTOMI(vp);
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	sv_hostname = VTOR(vp)->r_server->sv_hostname;

	douprintf = 1;
	args.file = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.file;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.uiop = uiop;

	offset = uiop->uio_loffset;
	count = uiop->uio_resid;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			tsize = MIN(mi->mi_tsize, count);
			args.offset = (offset3)offset;
			args.count = (count3)tsize;
			res.size = (uint_t)tsize;
			error = rfs3call(mi, NFSPROC3_READ,
			    xdr_READ3args, (caddr_t)&args,
			    xdr_READ3uiores, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);

		error = geterrno3(res.status);
		if (error)
			return (error);

		if (res.count != res.size) {
			zcmn_err(getzoneid(), CE_WARN,
			    "nfs3_directio_read: server %s returned incorrect amount",
			    sv_hostname);
			return (EIO);
		}
		count -= res.count;
		offset += res.count;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
	} while (count && !res.eof);

	return (0);
}
/* ARGSUSED */
static int
nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error = 0;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		return (nfs3_directio_read(vp, uiop, cr));
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs3_validate_caches(vp, cr);
		if (error)
			break;

		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
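/*
 * A worked example of the block arithmetic used by nfs3_read() above and
 * nfs3_write() below, assuming the usual MAXBSIZE of 8192 (so MAXBOFFSET
 * is 0x1fff and MAXBMASK is ~0x1fff):  for uio_loffset == 0x2345,
 * off = 0x2345 & MAXBMASK = 0x2000 (the start of the enclosing 8K block),
 * on = 0x2345 & MAXBOFFSET = 0x345 (the offset within that block), and
 * n = MIN(0x2000 - 0x345, uio_resid), so a single iteration never
 * crosses a MAXBSIZE boundary.
 */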
/* ARGSUSED */
static int
nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rlim64_t limit = uiop->uio_llimit;
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	mntinfo_t *mi;
	uint_t bsize;

	rp = VTOR(vp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		va.va_mask = AT_SIZE;
		error = nfs3getattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset < 0)
		return (EINVAL);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_loffset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		u_offset_t org_offset;
		stable_how stab_comm;

nfs3_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			if (ioflag & FDSYNC)
				stab_comm = DATA_SYNC;
			else
				stab_comm = FILE_SYNC;
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_loffset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfs3write(vp, base, org_offset,
				    count, cr, &stab_comm);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}


	bsize = vp->v_vfsp->vfs_bsize;
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if ((uiop->uio_loffset % bsize) == 0 ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs3_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr)
{
	struct buf *bp;
	int error;
	page_t *savepp;
	uchar_t fsdata;
	stable_how stab_comm;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	/*
	 * Calculate the desired level of stability to write data
	 * on the server and then mark all of the pages to reflect
	 * this.
	 */
	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
	    freemem > desfree) {
		stab_comm = UNSTABLE;
		fsdata = C_DELAYCOMMIT;
	} else {
		stab_comm = FILE_SYNC;
		fsdata = C_NOCOMMIT;
	}

	savepp = pp;
	do {
		pp->p_fsdata = fsdata;
	} while ((pp = pp->p_next) != savepp);

	error = nfs3_bio(bp, &stab_comm, cr);

	bp_mapout(bp);
	pageio_done(bp);

	/*
	 * If the server wrote pages in a more stable fashion than
	 * was requested, then clear all of the marks in the pages
	 * indicating that COMMIT operations were required.
	 */
	if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
		do {
			pp->p_fsdata = C_NOCOMMIT;
		} while ((pp = pp->p_next) != savepp);
	}

	return (error);
}
/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 */
static int
nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
	stable_how *stab_comm)
{
	mntinfo_t *mi;
	WRITE3args args;
	WRITE3res res;
	int error;
	int tsize;
	rnode_t *rp;
	int douprintf;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	args.file = *VTOFH3(vp);
	args.stable = *stab_comm;

	*stab_comm = FILE_SYNC;

	douprintf = 1;

	do {
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & RDIRECTIO) ||
		    (mi->mi_flags & MI_DIRECTIO))
			tsize = MIN(mi->mi_stsize, count);
		else
			tsize = MIN(mi->mi_curwrite, count);
		args.offset = (offset3)offset;
		args.count = (count3)tsize;
		args.data.data_len = (uint_t)tsize;
		args.data.data_val = base;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}
		args.mblk = NULL;
		do {
			error = rfs3call(mi, NFSPROC3_WRITE,
			    xdr_WRITE3args, (caddr_t)&args,
			    xdr_WRITE3res, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, NULL);
		} while (error == ENFS_TRYAGAIN);
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);
		error = geterrno3(res.status);
		if (!error) {
			if (res.resok.count > args.count) {
				zcmn_err(getzoneid(), CE_WARN,
				    "nfs3write: server %s wrote %u, "
				    "requested was %u",
				    rp->r_server->sv_hostname,
				    res.resok.count, args.count);
				return (EIO);
			}
			if (res.resok.committed == UNSTABLE) {
				*stab_comm = UNSTABLE;
				if (args.stable == DATA_SYNC ||
				    args.stable == FILE_SYNC) {
					zcmn_err(getzoneid(), CE_WARN,
					    "nfs3write: server %s did not commit to stable storage",
					    rp->r_server->sv_hostname);
					return (EIO);
				}
			}
			tsize = (int)res.resok.count;
			count -= tsize;
			base += tsize;
			offset += tsize;
			if (mi->mi_io_kstats) {
				mutex_enter(&mi->mi_lock);
				KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
				KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
				    tsize;
				mutex_exit(&mi->mi_lock);
			}
			lwp_stat_update(LWP_STAT_OUBLK, 1);
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & RHAVEVERF) {
				if (rp->r_verf != res.resok.verf) {
					nfs3_set_mod(vp);
					rp->r_verf = res.resok.verf;
					/*
					 * If the data was written UNSTABLE,
					 * then might as well stop because
					 * the whole block will have to get
					 * rewritten anyway.
					 */
					if (*stab_comm == UNSTABLE) {
						mutex_exit(&rp->r_statelock);
						break;
					}
				}
			} else {
				rp->r_verf = res.resok.verf;
				rp->r_flags |= RHAVEVERF;
			}
			/*
			 * Mark the attribute cache as timed out and
			 * set RWRITEATTR to indicate that the file
			 * was modified with a WRITE operation and
			 * that the attributes can not be trusted.
			 */
			PURGE_ATTRCACHE_LOCKED(rp);
			rp->r_flags |= RWRITEATTR;
			mutex_exit(&rp->r_statelock);
		}
	} while (!error && count);

	return (error);
}
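/*
 * A note on the verifier handling above, following the NFS version 3
 * protocol (RFC 1813): the verf returned by WRITE changes whenever the
 * server loses its uncommitted-data state (typically across a reboot).
 * When a mismatch is detected, nfs3write() re-marks the file's pages as
 * modified via nfs3_set_mod() so that any data written UNSTABLE under
 * the old verifier will be transmitted again and later committed.
 */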
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 */
static int
nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
	size_t *residp, cred_t *cr)
{
	mntinfo_t *mi;
	READ3args args;
	READ3vres res;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);
	ASSERT(nfs_zone() == mi->mi_zone);
	douprintf = 1;

	args.file = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.file;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.pov.fres.vp = vp;
	res.pov.fres.vap = &va;

	*residp = count;
	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			if ((vp->v_flag & VNOCACHE) ||
			    (rp->r_flags & RDIRECTIO) ||
			    (mi->mi_flags & MI_DIRECTIO))
				tsize = MIN(mi->mi_tsize, count);
			else
				tsize = MIN(mi->mi_curread, count);
			res.data.data_val = base;
			res.data.data_len = tsize;
			args.offset = (offset3)offset;
			args.count = (count3)tsize;
			t = gethrtime();
			error = rfs3call(mi, NFSPROC3_READ,
			    xdr_READ3args, (caddr_t)&args,
			    xdr_READ3vres, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);

		error = geterrno3(res.status);
		if (error)
			return (error);

		if (res.count != res.data.data_len) {
			zcmn_err(getzoneid(), CE_WARN,
			    "nfs3read: server %s returned incorrect amount",
			    rp->r_server->sv_hostname);
			return (EIO);
		}

		count -= res.count;
		*residp = count;
		base += res.count;
		offset += res.count;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
	} while (count && !res.eof);

	if (res.pov.attributes) {
		mutex_enter(&rp->r_statelock);
		if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			if (rp->r_mtime <= t)
				nfs_attrcache_va(vp, &va);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (0);
}
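/*
 * A note on the hrtime_t handling in nfs3read() above: t is sampled just
 * before the READ call goes over the wire, so the post-op attributes in
 * the reply describe the file no earlier than t.  They are folded into
 * the attribute cache only when the cached attributes are no newer than
 * t (rp->r_mtime <= t); otherwise the cache is left alone or purged.
 */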
/* ARGSUSED */
static int
nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	switch (cmd) {
	case _FIODIRECTIO:
		return (nfs_directio(vp, (int)arg, cr));
	default:
		return (ENOTTY);
	}
}

static int
nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	rp = VTOR(vp);
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
			mutex_enter(&rp->r_statelock);
			rp->r_gcount++;
			mutex_exit(&rp->r_statelock);
			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
			mutex_enter(&rp->r_statelock);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				if (!rp->r_error)
					rp->r_error = error;
			}
			if (--rp->r_gcount == 0)
				cv_broadcast(&rp->r_cv);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (nfs3getattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	struct vattr va;

	if (vap->va_mask & AT_NOSET)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	va.va_mask = AT_UID | AT_MODE;
	error = nfs3getattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
	    vp);
	if (error)
		return (error);

	return (nfs3setattr(vp, vap, flags, cr));
}

static int
nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	SETATTR3args args;
	SETATTR3res res;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	vsecattr_t *vsp;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	mask = vap->va_mask;

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	args.object = *RTOFH3(rp);
	/*
	 * If the intent is for the server to set the times,
	 * there is no point in having the mask indicate set mtime
	 * or atime, because the vap values may be junk, and so
	 * result in an overflow error.  Remove these flags from the
	 * vap mask before calling in this case, and restore them
	 * afterwards.
	 */
	if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
		/* Use server times, so don't set the args time fields */
		vap->va_mask &= ~(AT_ATIME | AT_MTIME);
		error = vattr_to_sattr3(vap, &args.new_attributes);
		vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
		if (mask & AT_ATIME) {
			args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
		}
		if (mask & AT_MTIME) {
			args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr3(vap, &args.new_attributes);
	}

	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}

	va.va_mask = AT_MODE | AT_CTIME;
	error = nfs3getattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

tryagain:
	if (mask & AT_SIZE) {
		args.guard.check = TRUE;
		args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
		args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
	} else
		args.guard.check = FALSE;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
	    xdr_SETATTR3args, (caddr_t)&args,
	    xdr_SETATTR3res, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if (mask & (AT_UID | AT_GID | AT_MODE)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (error) {
		PURGE_ATTRCACHE(vp);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		/*
		 * If changing the size of the file, invalidate
		 * any local cached data which is no longer part
		 * of the file.  We also possibly invalidate the
		 * last page in the file.  We could use
		 * pvn_vpzero(), but this would mark the page as
		 * modified and require it to be written back to
		 * the server for no particularly good reason.
		 * This way, if we access it, then we bring it
		 * back in.  A read should be cheaper than a
		 * write.
		 */
		if (mask & AT_SIZE) {
			nfs_invalidate_pages(vp,
			    (vap->va_size & PAGEMASK), cr);
		}
		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
		/*
		 * Some servers will change the mode to clear the setuid
		 * and setgid bits when changing the uid or gid.  The
		 * client needs to compensate appropriately.
		 */
		if (mask & (AT_UID | AT_GID)) {
			int terror;

			va.va_mask = AT_MODE;
			terror = nfs3getattr(vp, &va, cr);
			if (!terror &&
			    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
			    (!(mask & AT_MODE) && va.va_mode != omode))) {
				va.va_mask = AT_MODE;
				if (mask & AT_MODE)
					va.va_mode = vap->va_mode;
				else
					va.va_mode = omode;
				(void) nfs3setattr(vp, &va, 0, cr);
			}
		}
	} else {
		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
		/*
		 * If we got back a "not synchronized" error, then
		 * we need to retry with a new guard value.  The
		 * guard value used is the change time.  If the
		 * server returned post_op_attr, then we can just
		 * retry because we have the latest attributes.
		 * Otherwise, we issue a GETATTR to get the latest
		 * attributes and then retry.  If we couldn't get
		 * the attributes this way either, then we give
		 * up because we can't complete the operation as
		 * required.
		 */
		if (res.status == NFS3ERR_NOT_SYNC) {
			va.va_mask = AT_CTIME;
			if (nfs3getattr(vp, &va, cr) == 0)
				goto tryagain;
		}
		PURGE_STALE_FH(error, vp, cr);
	}

	return (error);
}
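/*
 * Access checking below may be tried with two different credentials: the
 * caller's cred and, if that one is denied, a "network adjusted" cred
 * derived from it by crnetadjust().  The ncr/ncrfree pair in nfs3_access()
 * tracks this retry: ncr is consumed by the retry (set to NULL once used),
 * while ncrfree remembers whether the adjusted cred still needs to be
 * released before returning.
 */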
static int
nfs3_accessx(void *vp, int mode, cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
	return (nfs3_access(vp, mode, 0, cr));
}

/* ARGSUSED */
static int
nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	int error;
	ACCESS3args args;
	ACCESS3res res;
	int douprintf;
	uint32 acc;
	rnode_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	failinfo_t fi;
	nfs_access_type_t cacc;
	hrtime_t t;

	acc = 0;
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	if (mode & VREAD)
		acc |= ACCESS3_READ;
	if (mode & VWRITE) {
		if (vn_is_readonly(vp) && !IS_DEVVP(vp))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS3_DELETE;
		acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS3_LOOKUP;
		else
			acc |= ACCESS3_EXECUTE;
	}

	rp = VTOR(vp);
	args.object = *VTOFH3(vp);
	if (vp->v_type == VDIR) {
		args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
		    ACCESS3_EXTEND | ACCESS3_LOOKUP;
	} else {
		args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
		    ACCESS3_EXECUTE;
	}
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;
tryagain:
	if (rp->r_acache != NULL) {
		cacc = nfs_access_check(rp, acc, cred);
		if (cacc == NFS_ACCESS_ALLOWED) {
			if (ncrfree != NULL)
				crfree(ncrfree);
			return (0);
		}
		if (cacc == NFS_ACCESS_DENIED) {
			/*
			 * If the cred can be adjusted, try again
			 * with the new cred.
			 */
			if (ncr != NULL) {
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			if (ncrfree != NULL)
				crfree(ncrfree);
			return (EACCES);
		}
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
	    xdr_ACCESS3args, (caddr_t)&args,
	    xdr_ACCESS3res, (caddr_t)&res, cred,
	    &douprintf, &res.status, 0, &fi);

	if (error) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
		nfs_access_cache(rp, args.access, res.resok.access, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		if ((acc & res.resok.access) != acc) {
			/*
			 * If the cred can be adjusted, try again
			 * with the new cred.
			 */
			if (ncr != NULL) {
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			error = EACCES;
		}
	} else {
		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
		PURGE_STALE_FH(error, vp, cr);
	}

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (error);
}

static int nfs3_do_symlink_cache = 1;
static int
nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	int error;
	READLINK3args args;
	READLINK3res res;
	nfspath3 resdata_backup;
	rnode_t *rp;
	int douprintf;
	int len;
	failinfo_t fi;
	hrtime_t t;

	/*
	 * Can't readlink anything other than a symbolic link.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs3_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}

	args.symlink = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.symlink;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	resdata_backup = res.resok.data;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_READLINK3res, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	if (res.resok.data == nfs3nametoolong)
		error = EINVAL;

	if (error) {
		kmem_free(resdata_backup, MAXPATHLEN);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
		    cr);
		len = strlen(res.resok.data);
		error = uiomove(res.resok.data, len, UIO_READ, uiop);
		if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = res.resok.data;
				rp->r_symlink.len = len;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);

				kmem_free((void *)res.resok.data, MAXPATHLEN);
			}
		} else {
			kmem_free((void *)res.resok.data, MAXPATHLEN);
		}
	} else {
		nfs3_cache_post_op_attr(vp,
		    &res.resfail.symlink_attributes, t, cr);
		PURGE_STALE_FH(error, vp, cr);

		kmem_free((void *)res.resok.data, MAXPATHLEN);

	}

	/*
	 * The over the wire error for attempting to readlink something
	 * other than a symbolic link is ENXIO.  However, we need to
	 * return EINVAL instead of ENXIO, so we map it here.
	 */
	return (error == ENXIO ? EINVAL : error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
static int
nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
	if (!error)
		error = VTOR(vp)->r_error;
	return (error);
}
/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
static void
nfs3_inactive(vnode_t *vp, cred_t *cr)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs3_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way,
		 * the unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			REMOVE3args args;
			REMOVE3res res;
			int douprintf;
			int error;
			hrtime_t t;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs3(&args.object, unlname, unldvp);

			douprintf = 1;

			t = gethrtime();

			error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
			    xdr_diropargs3, (caddr_t)&args,
			    xdr_REMOVE3res, (caddr_t)&res, unlcred,
			    &douprintf, &res.status, 0, NULL);

			if (error) {
				PURGE_ATTRCACHE(unldvp);
			} else {
				error = geterrno3(res.status);
				if (!error) {
					nfs3_cache_wcc_data(unldvp,
					    &res.resok.dir_wcc, t, cr);
					if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
						nfs_purge_rddir_cache(unldvp);
				} else {
					nfs3_cache_wcc_data(unldvp,
					    &res.resfail.dir_wcc, t, cr);
					PURGE_STALE_FH(error, unldvp, cr);
				}
			}

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}

/*
 * Remote file system operations having to do with directory manipulation.
 */
static int
nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		(void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}

static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
static int nfs3_lookup_dnlc_hits = 0;
static int nfs3_lookup_dnlc_misses = 0;
static int nfs3_lookup_dnlc_neg_hits = 0;
static int nfs3_lookup_dnlc_disappears = 0;
static int nfs3_lookup_dnlc_lookups = 0;
#endif
/* ARGSUSED */
int
nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;
	rnode_t *drp;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have the RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs3_access(dvp, VEXEC, 0, cr);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	drp = VTOR(dvp);
	if (!(drp->r_flags & RLOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= RLOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
	error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
	if (error || *vpp != NULL)
		return (error);

callit:
	error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);

	return (error);
}

static int
nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.  The DNLC is rechecked
	 * just in case this entry got invalidated during the call
	 * to nfs3_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not exist on the server.  Any
	 * operations to the server will fail with ESTALE.
	 */
#ifdef DEBUG
	nfs3_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		VN_RELE(vp);
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs3_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			error = nfs3_access(dvp, VEXEC, 0, cr);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs3_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			*vpp = vp;
#ifdef DEBUG
			nfs3_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs3_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs3_lookup_dnlc_misses++;
#endif

	*vpp = NULL;

	return (0);
}
return (error); 2114 } 2115 vp->v_type = vattr.va_type; 2116 } 2117 } 2118 2119 if (!(rfscall_flags & RFSCALL_SOFT)) 2120 dnlc_update(dvp, nm, vp); 2121 2122 *vpp = vp; 2123 2124 return (error); 2125 } 2126 2127 #ifdef DEBUG 2128 static int nfs3_create_misses = 0; 2129 #endif 2130 2131 /* ARGSUSED */ 2132 static int 2133 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2134 int mode, vnode_t **vpp, cred_t *cr, int lfaware) 2135 { 2136 int error; 2137 vnode_t *vp; 2138 rnode_t *rp; 2139 struct vattr vattr; 2140 rnode_t *drp; 2141 vnode_t *tempvp; 2142 2143 drp = VTOR(dvp); 2144 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2145 return (EPERM); 2146 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2147 return (EINTR); 2148 2149 top: 2150 /* 2151 * We make a copy of the attributes because the caller does not 2152 * expect us to change what va points to. 2153 */ 2154 vattr = *va; 2155 2156 /* 2157 * If the pathname is "", just use dvp. Don't need 2158 * to send it over the wire, look it up in the dnlc, 2159 * or perform any access checks. 2160 */ 2161 if (*nm == '\0') { 2162 error = 0; 2163 VN_HOLD(dvp); 2164 vp = dvp; 2165 /* 2166 * If the pathname is ".", just use dvp. Don't need 2167 * to send it over the wire or look it up in the dnlc, 2168 * just need to check access. 2169 */ 2170 } else if (strcmp(nm, ".") == 0) { 2171 error = nfs3_access(dvp, VEXEC, 0, cr); 2172 if (error) { 2173 nfs_rw_exit(&drp->r_rwlock); 2174 return (error); 2175 } 2176 VN_HOLD(dvp); 2177 vp = dvp; 2178 /* 2179 * We need to go over the wire, just to be sure whether the 2180 * file exists or not. Using the DNLC can be dangerous in 2181 * this case when making a decision regarding existence. 2182 */ 2183 } else { 2184 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0); 2185 } 2186 if (!error) { 2187 if (exclusive == EXCL) 2188 error = EEXIST; 2189 else if (vp->v_type == VDIR && (mode & VWRITE)) 2190 error = EISDIR; 2191 else { 2192 /* 2193 * If vnode is a device, create special vnode. 2194 */ 2195 if (IS_DEVVP(vp)) { 2196 tempvp = vp; 2197 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2198 VN_RELE(tempvp); 2199 } 2200 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) { 2201 if ((vattr.va_mask & AT_SIZE) && 2202 vp->v_type == VREG) { 2203 rp = VTOR(vp); 2204 /* 2205 * Check here for large file handled 2206 * by LF-unaware process (as 2207 * ufs_create() does) 2208 */ 2209 if (!(lfaware & FOFFMAX)) { 2210 mutex_enter(&rp->r_statelock); 2211 if (rp->r_size > MAXOFF32_T) 2212 error = EOVERFLOW; 2213 mutex_exit(&rp->r_statelock); 2214 } 2215 if (!error) { 2216 vattr.va_mask = AT_SIZE; 2217 error = nfs3setattr(vp, 2218 &vattr, 0, cr); 2219 } 2220 } 2221 } 2222 } 2223 nfs_rw_exit(&drp->r_rwlock); 2224 if (error) { 2225 VN_RELE(vp); 2226 } else 2227 *vpp = vp; 2228 return (error); 2229 } 2230 2231 dnlc_remove(dvp, nm); 2232 2233 /* 2234 * Decide what the group-id of the created file should be. 2235 * Set it in attribute list as advisory... 
*/ 2237 error = setdirgid(dvp, &vattr.va_gid, cr); 2238 if (error) { 2239 nfs_rw_exit(&drp->r_rwlock); 2240 return (error); 2241 } 2242 vattr.va_mask |= AT_GID; 2243 2244 ASSERT(vattr.va_mask & AT_TYPE); 2245 if (vattr.va_type == VREG) { 2246 ASSERT(vattr.va_mask & AT_MODE); 2247 if (MANDMODE(vattr.va_mode)) { 2248 nfs_rw_exit(&drp->r_rwlock); 2249 return (EACCES); 2250 } 2251 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr, 2252 lfaware); 2253 /* 2254 * If this is not an exclusive create, then the CREATE 2255 * request will be made with the GUARDED mode set. This 2256 * means that the server will return EEXIST if the file 2257 * exists. The file could exist because of a retransmitted 2258 * request. In this case, we recover by starting over and 2259 * checking to see whether the file exists. This second 2260 * time through it should exist and a CREATE request will not be 2261 * sent. 2262 * 2263 * This handles the problem of a dangling CREATE request 2264 * which contains attributes which indicate that the file 2265 * should be truncated. This retransmitted request could 2266 * possibly truncate valid data in the file if not caught 2267 * by the duplicate request mechanism on the server or if 2268 * not caught by other means. The scenario is: 2269 * 2270 * Client transmits CREATE request with size = 0 2271 * Client times out, retransmits request. 2272 * Response to the first request arrives from the server 2273 * and the client proceeds on. 2274 * Client writes data to the file. 2275 * The server now processes retransmitted CREATE request 2276 * and truncates file. 2277 * 2278 * The use of the GUARDED CREATE request prevents this from 2279 * happening because the retransmitted CREATE would fail 2280 * with EEXIST and would not truncate the file. 2281 */ 2282 if (error == EEXIST && exclusive == NONEXCL) { 2283 #ifdef DEBUG 2284 nfs3_create_misses++; 2285 #endif 2286 goto top; 2287 } 2288 nfs_rw_exit(&drp->r_rwlock); 2289 return (error); 2290 } 2291 error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 2292 nfs_rw_exit(&drp->r_rwlock); 2293 return (error); 2294 } 2295 2296 /* ARGSUSED */ 2297 static int 2298 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2299 int mode, vnode_t **vpp, cred_t *cr, int lfaware) 2300 { 2301 int error; 2302 CREATE3args args; 2303 CREATE3res res; 2304 int douprintf; 2305 vnode_t *vp; 2306 struct vattr vattr; 2307 nfstime3 *verfp; 2308 rnode_t *rp; 2309 timestruc_t now; 2310 hrtime_t t; 2311 2312 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 2313 setdiropargs3(&args.where, nm, dvp); 2314 if (exclusive == EXCL) { 2315 args.how.mode = EXCLUSIVE; 2316 /* 2317 * Construct the create verifier. This verifier needs 2318 * to be unique between different clients. It also needs 2319 * to vary for each exclusive create request generated 2320 * from the client to the server. 2321 * 2322 * The first attempt is made to use the hostid and a 2323 * unique number on the client. If the hostid has not 2324 * been set, the high resolution time that the exclusive 2325 * create request is being made is used. This will work 2326 * unless two different clients, both with the hostid 2327 * not set, attempt an exclusive create request on the 2328 * same file, at exactly the same clock time. The 2329 * chances of this happening seem small enough to be 2330 * reasonable.
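 *
 * As an illustrative sketch only (the hostid value here is
 * hypothetical), a client whose hostid is set builds the verifier
 * in the code below as:
 *
 *	verfp->seconds = nfs_atoi(hw_serial);	e.g. 10565555
 *	verfp->nseconds = newnum();		unique per request
 *
 * while a client with no hostid falls back to gethrestime() and
 * uses the current seconds/nanoseconds pair instead.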
*/ 2332 verfp = (nfstime3 *)&args.how.createhow3_u.verf; 2333 verfp->seconds = nfs_atoi(hw_serial); 2334 if (verfp->seconds != 0) 2335 verfp->nseconds = newnum(); 2336 else { 2337 gethrestime(&now); 2338 verfp->seconds = now.tv_sec; 2339 verfp->nseconds = now.tv_nsec; 2340 } 2341 /* 2342 * Since the server will use this value for the mtime, 2343 * make sure that it can't overflow. Zero out the MSB. 2344 * The actual value does not matter here, only its uniqueness. 2345 */ 2346 verfp->seconds %= INT32_MAX; 2347 } else { 2348 /* 2349 * Issue the non-exclusive create in guarded mode. This 2350 * may result in some false EEXIST responses for 2351 * retransmitted requests, but these will be handled at 2352 * a higher level. By using GUARDED, duplicate requests 2353 * to do file truncation and possible access problems 2354 * can be avoided. 2355 */ 2356 args.how.mode = GUARDED; 2357 error = vattr_to_sattr3(va, 2358 &args.how.createhow3_u.obj_attributes); 2359 if (error) { 2360 /* req time field(s) overflow - return immediately */ 2361 return (error); 2362 } 2363 } 2364 2365 douprintf = 1; 2366 2367 t = gethrtime(); 2368 2369 error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE, 2370 xdr_CREATE3args, (caddr_t)&args, 2371 xdr_CREATE3res, (caddr_t)&res, cr, 2372 &douprintf, &res.status, 0, NULL); 2373 2374 if (error) { 2375 PURGE_ATTRCACHE(dvp); 2376 return (error); 2377 } 2378 2379 error = geterrno3(res.status); 2380 if (!error) { 2381 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 2382 if (HAVE_RDDIR_CACHE(VTOR(dvp))) 2383 nfs_purge_rddir_cache(dvp); 2384 2385 /* 2386 * On exclusive create the times need to be explicitly 2387 * set to clear any potential verifier that may be stored 2388 * in one of these fields (see comment below). This 2389 * is done here to cover the case where no post op attrs 2390 * were returned or an 'invalid' time was returned in 2391 * the attributes. 2392 */ 2393 if (exclusive == EXCL) 2394 va->va_mask |= (AT_MTIME | AT_ATIME); 2395 2396 if (!res.resok.obj.handle_follows) { 2397 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2398 if (error) 2399 return (error); 2400 } else { 2401 if (res.resok.obj_attributes.attributes) { 2402 vp = makenfs3node(&res.resok.obj.handle, 2403 &res.resok.obj_attributes.attr, 2404 dvp->v_vfsp, t, cr, NULL, NULL); 2405 } else { 2406 vp = makenfs3node(&res.resok.obj.handle, NULL, 2407 dvp->v_vfsp, t, cr, NULL, NULL); 2408 2409 /* 2410 * On an exclusive create, it is possible 2411 * that attributes were returned but those 2412 * postop attributes failed to decode 2413 * properly. If this is the case, 2414 * then most likely the atime or mtime 2415 * were invalid for our client; this 2416 * is caused by the server storing the 2417 * create verifier in one of the time 2418 * fields (most likely mtime). 2419 * So... we are going to setattr just the 2420 * atime/mtime to clear things up. 2421 */ 2422 if (exclusive == EXCL) { 2423 if (error = 2424 nfs3excl_create_settimes(vp, 2425 va, cr)) { 2426 /* 2427 * Setting the times failed. 2428 * Remove the file and return 2429 * the error. 2430 */ 2431 VN_RELE(vp); 2432 (void) nfs3_remove(dvp, 2433 nm, cr); 2434 return (error); 2435 } 2436 } 2437 2438 /* 2439 * This handles the non-exclusive case 2440 * and the exclusive case where no post op 2441 * attrs were returned.
*/ 2443 if (vp->v_type == VNON) { 2444 vattr.va_mask = AT_TYPE; 2445 error = nfs3getattr(vp, &vattr, cr); 2446 if (error) { 2447 VN_RELE(vp); 2448 return (error); 2449 } 2450 vp->v_type = vattr.va_type; 2451 } 2452 } 2453 dnlc_update(dvp, nm, vp); 2454 } 2455 2456 rp = VTOR(vp); 2457 2458 /* 2459 * Check here for large file handled by 2460 * LF-unaware process (as ufs_create() does) 2461 */ 2462 if ((va->va_mask & AT_SIZE) && vp->v_type == VREG && 2463 !(lfaware & FOFFMAX)) { 2464 mutex_enter(&rp->r_statelock); 2465 if (rp->r_size > MAXOFF32_T) { 2466 mutex_exit(&rp->r_statelock); 2467 VN_RELE(vp); 2468 return (EOVERFLOW); 2469 } 2470 mutex_exit(&rp->r_statelock); 2471 } 2472 2473 if (exclusive == EXCL && 2474 (va->va_mask & ~(AT_GID | AT_SIZE))) { 2475 /* 2476 * If doing an exclusive create, then generate 2477 * a SETATTR to set the initial attributes. 2478 * Try to set the mtime and the atime to the 2479 * server's current time. It is somewhat 2480 * expected that these fields will be used to 2481 * store the exclusive create cookie. If not, 2482 * server implementors will need to know that 2483 * a SETATTR will follow an exclusive create 2484 * and the cookie should be destroyed if 2485 * appropriate. This work may have been done 2486 * earlier in this function if post op attrs 2487 * were not available. 2488 * 2489 * The AT_GID and AT_SIZE bits are turned off 2490 * so that the SETATTR request will not attempt 2491 * to process these. The gid will be set 2492 * separately if appropriate. The size is turned 2493 * off because it is assumed that a new file will 2494 * be created empty and if the file wasn't empty, 2495 * then the exclusive create will have failed 2496 * because the file must have existed already. 2497 * Therefore, no truncate operation is needed. 2498 */ 2499 va->va_mask &= ~(AT_GID | AT_SIZE); 2500 error = nfs3setattr(vp, va, 0, cr); 2501 if (error) { 2502 /* 2503 * Couldn't correct the attributes of 2504 * the newly created file and the 2505 * attributes are wrong. Remove the 2506 * file and return an error to the 2507 * application. 2508 */ 2509 VN_RELE(vp); 2510 (void) nfs3_remove(dvp, nm, cr); 2511 return (error); 2512 } 2513 } 2514 2515 if (va->va_gid != rp->r_attr.va_gid) { 2516 /* 2517 * If the gid on the file isn't right, then 2518 * generate a SETATTR to attempt to change 2519 * it. This may or may not work, depending 2520 * upon the server's semantics for allowing 2521 * file ownership changes. 2522 */ 2523 va->va_mask = AT_GID; 2524 (void) nfs3setattr(vp, va, 0, cr); 2525 } 2526 2527 /* 2528 * If vnode is a device create special vnode 2529 */ 2530 if (IS_DEVVP(vp)) { 2531 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2532 VN_RELE(vp); 2533 } else 2534 *vpp = vp; 2535 } else { 2536 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 2537 PURGE_STALE_FH(error, dvp, cr); 2538 } 2539 2540 return (error); 2541 } 2542 2543 /* 2544 * Special setattr function to take care of rest of atime/mtime 2545 * after successful exclusive create. This function exists to avoid 2546 * handling attributes from the server; after an exclusive create the 2547 * atime/mtime fields may be 'invalid' in the client's view and cannot be trusted.
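 *
 * In terms of what goes over the wire, the sketch below (drawn
 * directly from the code that follows) is the whole request: a
 * SETATTR whose new_attributes ask for nothing except
 *
 *	args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
 *	args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
 *
 * so any verifier the server stored in those time fields is
 * replaced with the server's own current time.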
2548 */ 2549 static int 2550 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr) 2551 { 2552 int error; 2553 uint_t mask; 2554 SETATTR3args args; 2555 SETATTR3res res; 2556 int douprintf; 2557 rnode_t *rp; 2558 hrtime_t t; 2559 2560 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 2561 /* save the caller's mask so that it can be reset later */ 2562 mask = vap->va_mask; 2563 2564 rp = VTOR(vp); 2565 2566 args.object = *RTOFH3(rp); 2567 args.guard.check = FALSE; 2568 2569 /* Use the mask to initialize the arguments */ 2570 vap->va_mask = 0; 2571 error = vattr_to_sattr3(vap, &args.new_attributes); 2572 2573 /* We want to set just atime/mtime on this request */ 2574 args.new_attributes.atime.set_it = SET_TO_SERVER_TIME; 2575 args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME; 2576 2577 douprintf = 1; 2578 2579 t = gethrtime(); 2580 2581 error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR, 2582 xdr_SETATTR3args, (caddr_t)&args, 2583 xdr_SETATTR3res, (caddr_t)&res, cr, 2584 &douprintf, &res.status, 0, NULL); 2585 2586 if (error) { 2587 vap->va_mask = mask; 2588 return (error); 2589 } 2590 2591 error = geterrno3(res.status); 2592 if (!error) { 2593 /* 2594 * It is important to pick up the attributes. 2595 * Since this is the exclusive create path, the 2596 * attributes on the initial create were ignored 2597 * and we need these to have the correct info. 2598 */ 2599 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr); 2600 /* 2601 * No need to do the atime/mtime work again so clear 2602 * the bits. 2603 */ 2604 mask &= ~(AT_ATIME | AT_MTIME); 2605 } else { 2606 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr); 2607 } 2608 2609 vap->va_mask = mask; 2610 2611 return (error); 2612 } 2613 2614 /* ARGSUSED */ 2615 static int 2616 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2617 int mode, vnode_t **vpp, cred_t *cr) 2618 { 2619 int error; 2620 MKNOD3args args; 2621 MKNOD3res res; 2622 int douprintf; 2623 vnode_t *vp; 2624 struct vattr vattr; 2625 hrtime_t t; 2626 2627 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 2628 switch (va->va_type) { 2629 case VCHR: 2630 case VBLK: 2631 setdiropargs3(&args.where, nm, dvp); 2632 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK; 2633 error = vattr_to_sattr3(va, 2634 &args.what.mknoddata3_u.device.dev_attributes); 2635 if (error) { 2636 /* req time field(s) overflow - return immediately */ 2637 return (error); 2638 } 2639 args.what.mknoddata3_u.device.spec.specdata1 = 2640 getmajor(va->va_rdev); 2641 args.what.mknoddata3_u.device.spec.specdata2 = 2642 getminor(va->va_rdev); 2643 break; 2644 2645 case VFIFO: 2646 case VSOCK: 2647 setdiropargs3(&args.where, nm, dvp); 2648 args.what.type = (va->va_type == VFIFO) ? 
NF3FIFO : NF3SOCK; 2649 error = vattr_to_sattr3(va, 2650 &args.what.mknoddata3_u.pipe_attributes); 2651 if (error) { 2652 /* req time field(s) overflow - return immediately */ 2653 return (error); 2654 } 2655 break; 2656 2657 default: 2658 return (EINVAL); 2659 } 2660 2661 douprintf = 1; 2662 2663 t = gethrtime(); 2664 2665 error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD, 2666 xdr_MKNOD3args, (caddr_t)&args, 2667 xdr_MKNOD3res, (caddr_t)&res, cr, 2668 &douprintf, &res.status, 0, NULL); 2669 2670 if (error) { 2671 PURGE_ATTRCACHE(dvp); 2672 return (error); 2673 } 2674 2675 error = geterrno3(res.status); 2676 if (!error) { 2677 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 2678 if (HAVE_RDDIR_CACHE(VTOR(dvp))) 2679 nfs_purge_rddir_cache(dvp); 2680 2681 if (!res.resok.obj.handle_follows) { 2682 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2683 if (error) 2684 return (error); 2685 } else { 2686 if (res.resok.obj_attributes.attributes) { 2687 vp = makenfs3node(&res.resok.obj.handle, 2688 &res.resok.obj_attributes.attr, 2689 dvp->v_vfsp, t, cr, NULL, NULL); 2690 } else { 2691 vp = makenfs3node(&res.resok.obj.handle, NULL, 2692 dvp->v_vfsp, t, cr, NULL, NULL); 2693 if (vp->v_type == VNON) { 2694 vattr.va_mask = AT_TYPE; 2695 error = nfs3getattr(vp, &vattr, cr); 2696 if (error) { 2697 VN_RELE(vp); 2698 return (error); 2699 } 2700 vp->v_type = vattr.va_type; 2701 } 2702 2703 } 2704 dnlc_update(dvp, nm, vp); 2705 } 2706 2707 if (va->va_gid != VTOR(vp)->r_attr.va_gid) { 2708 va->va_mask = AT_GID; 2709 (void) nfs3setattr(vp, va, 0, cr); 2710 } 2711 2712 /* 2713 * If vnode is a device create special vnode 2714 */ 2715 if (IS_DEVVP(vp)) { 2716 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2717 VN_RELE(vp); 2718 } else 2719 *vpp = vp; 2720 } else { 2721 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 2722 PURGE_STALE_FH(error, dvp, cr); 2723 } 2724 return (error); 2725 } 2726 2727 /* 2728 * Weirdness: if the vnode to be removed is open 2729 * we rename it instead of removing it and nfs_inactive 2730 * will remove the new name. 2731 */ 2732 static int 2733 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr) 2734 { 2735 int error; 2736 REMOVE3args args; 2737 REMOVE3res res; 2738 vnode_t *vp; 2739 char *tmpname; 2740 int douprintf; 2741 rnode_t *rp; 2742 rnode_t *drp; 2743 hrtime_t t; 2744 2745 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2746 return (EPERM); 2747 drp = VTOR(dvp); 2748 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2749 return (EINTR); 2750 2751 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2752 if (error) { 2753 nfs_rw_exit(&drp->r_rwlock); 2754 return (error); 2755 } 2756 2757 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2758 VN_RELE(vp); 2759 nfs_rw_exit(&drp->r_rwlock); 2760 return (EPERM); 2761 } 2762 2763 /* 2764 * First just remove the entry from the name cache, as it 2765 * is most likely the only entry for this vp. 2766 */ 2767 dnlc_remove(dvp, nm); 2768 2769 /* 2770 * If the file has a v_count > 1 then there may be more than one 2771 * entry in the name cache due to multiple links or an open file, 2772 * but we don't have the real reference count so flush all 2773 * possible entries.
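 *
 * (This is the familiar NFS "silly rename" behavior: removing a
 * file that is still open renames it over the wire to a temporary
 * name obtained from newname(), and nfs_inactive() later removes
 * that temporary name when the last reference goes away; the exact
 * form of the temporary name is private to newname().)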
2774 */ 2775 if (vp->v_count > 1) 2776 dnlc_purge_vp(vp); 2777 2778 /* 2779 * Now we have the real reference count on the vnode 2780 */ 2781 rp = VTOR(vp); 2782 mutex_enter(&rp->r_statelock); 2783 if (vp->v_count > 1 && 2784 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2785 mutex_exit(&rp->r_statelock); 2786 tmpname = newname(); 2787 error = nfs3rename(dvp, nm, dvp, tmpname, cr); 2788 if (error) 2789 kmem_free(tmpname, MAXNAMELEN); 2790 else { 2791 mutex_enter(&rp->r_statelock); 2792 if (rp->r_unldvp == NULL) { 2793 VN_HOLD(dvp); 2794 rp->r_unldvp = dvp; 2795 if (rp->r_unlcred != NULL) 2796 crfree(rp->r_unlcred); 2797 crhold(cr); 2798 rp->r_unlcred = cr; 2799 rp->r_unlname = tmpname; 2800 } else { 2801 kmem_free(rp->r_unlname, MAXNAMELEN); 2802 rp->r_unlname = tmpname; 2803 } 2804 mutex_exit(&rp->r_statelock); 2805 } 2806 } else { 2807 mutex_exit(&rp->r_statelock); 2808 /* 2809 * We need to flush any dirty pages which happen to 2810 * be hanging around before removing the file. This 2811 * shouldn't happen very often and mostly on file 2812 * systems mounted "nocto". 2813 */ 2814 if (vn_has_cached_data(vp) && 2815 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2816 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr); 2817 if (error && (error == ENOSPC || error == EDQUOT)) { 2818 mutex_enter(&rp->r_statelock); 2819 if (!rp->r_error) 2820 rp->r_error = error; 2821 mutex_exit(&rp->r_statelock); 2822 } 2823 } 2824 2825 setdiropargs3(&args.object, nm, dvp); 2826 2827 douprintf = 1; 2828 2829 t = gethrtime(); 2830 2831 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE, 2832 xdr_diropargs3, (caddr_t)&args, 2833 xdr_REMOVE3res, (caddr_t)&res, cr, 2834 &douprintf, &res.status, 0, NULL); 2835 2836 /* 2837 * The xattr dir may be gone after last attr is removed, 2838 * so flush it from dnlc. 
2839 */ 2840 if (dvp->v_flag & V_XATTRDIR) 2841 dnlc_purge_vp(dvp); 2842 2843 PURGE_ATTRCACHE(vp); 2844 2845 if (error) { 2846 PURGE_ATTRCACHE(dvp); 2847 } else { 2848 error = geterrno3(res.status); 2849 if (!error) { 2850 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, 2851 cr); 2852 if (HAVE_RDDIR_CACHE(drp)) 2853 nfs_purge_rddir_cache(dvp); 2854 } else { 2855 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, 2856 t, cr); 2857 PURGE_STALE_FH(error, dvp, cr); 2858 } 2859 } 2860 } 2861 2862 VN_RELE(vp); 2863 2864 nfs_rw_exit(&drp->r_rwlock); 2865 2866 return (error); 2867 } 2868 2869 static int 2870 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 2871 { 2872 int error; 2873 LINK3args args; 2874 LINK3res res; 2875 vnode_t *realvp; 2876 int douprintf; 2877 mntinfo_t *mi; 2878 rnode_t *tdrp; 2879 hrtime_t t; 2880 2881 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2882 return (EPERM); 2883 if (VOP_REALVP(svp, &realvp) == 0) 2884 svp = realvp; 2885 2886 mi = VTOMI(svp); 2887 2888 if (!(mi->mi_flags & MI_LINK)) 2889 return (EOPNOTSUPP); 2890 2891 args.file = *VTOFH3(svp); 2892 setdiropargs3(&args.link, tnm, tdvp); 2893 2894 tdrp = VTOR(tdvp); 2895 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2896 return (EINTR); 2897 2898 dnlc_remove(tdvp, tnm); 2899 2900 douprintf = 1; 2901 2902 t = gethrtime(); 2903 2904 error = rfs3call(mi, NFSPROC3_LINK, 2905 xdr_LINK3args, (caddr_t)&args, 2906 xdr_LINK3res, (caddr_t)&res, cr, 2907 &douprintf, &res.status, 0, NULL); 2908 2909 if (error) { 2910 PURGE_ATTRCACHE(tdvp); 2911 PURGE_ATTRCACHE(svp); 2912 nfs_rw_exit(&tdrp->r_rwlock); 2913 return (error); 2914 } 2915 2916 error = geterrno3(res.status); 2917 2918 if (!error) { 2919 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr); 2920 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr); 2921 if (HAVE_RDDIR_CACHE(tdrp)) 2922 nfs_purge_rddir_cache(tdvp); 2923 dnlc_update(tdvp, tnm, svp); 2924 } else { 2925 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t, 2926 cr); 2927 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr); 2928 if (error == EOPNOTSUPP) { 2929 mutex_enter(&mi->mi_lock); 2930 mi->mi_flags &= ~MI_LINK; 2931 mutex_exit(&mi->mi_lock); 2932 } 2933 } 2934 2935 nfs_rw_exit(&tdrp->r_rwlock); 2936 2937 return (error); 2938 } 2939 2940 static int 2941 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2942 { 2943 vnode_t *realvp; 2944 2945 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2946 return (EPERM); 2947 if (VOP_REALVP(ndvp, &realvp) == 0) 2948 ndvp = realvp; 2949 2950 return (nfs3rename(odvp, onm, ndvp, nnm, cr)); 2951 } 2952 2953 /* 2954 * nfs3rename does the real work of renaming in NFS Version 3. 
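 *
 * Both directory rnodes are write locked for the duration of the
 * rename. To avoid deadlocking against a concurrent rename running
 * in the opposite direction, the two r_rwlocks are always acquired
 * in ascending rnode address order (the (intptr_t) comparison
 * below).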
*/ 2956 static int 2957 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2958 { 2959 int error; 2960 RENAME3args args; 2961 RENAME3res res; 2962 int douprintf; 2963 vnode_t *nvp; 2964 vnode_t *ovp = NULL; 2965 char *tmpname; 2966 rnode_t *rp; 2967 rnode_t *odrp; 2968 rnode_t *ndrp; 2969 hrtime_t t; 2970 2971 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2972 2973 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2974 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2975 return (EINVAL); 2976 2977 odrp = VTOR(odvp); 2978 ndrp = VTOR(ndvp); 2979 if ((intptr_t)odrp < (intptr_t)ndrp) { 2980 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2981 return (EINTR); 2982 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2983 nfs_rw_exit(&odrp->r_rwlock); 2984 return (EINTR); 2985 } 2986 } else { 2987 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2988 return (EINTR); 2989 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2990 nfs_rw_exit(&ndrp->r_rwlock); 2991 return (EINTR); 2992 } 2993 } 2994 2995 /* 2996 * Lookup the target file. If it exists, it needs to be 2997 * checked to see whether it is a mount point and whether 2998 * it is active (open). 2999 */ 3000 error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 3001 if (!error) { 3002 /* 3003 * If this file has been mounted on, then just 3004 * return busy because renaming to it would remove 3005 * the mounted file system from the name space. 3006 */ 3007 if (vn_mountedvfs(nvp) != NULL) { 3008 VN_RELE(nvp); 3009 nfs_rw_exit(&odrp->r_rwlock); 3010 nfs_rw_exit(&ndrp->r_rwlock); 3011 return (EBUSY); 3012 } 3013 3014 /* 3015 * Purge the name cache of all references to this vnode 3016 * so that we can check the reference count to infer 3017 * whether it is active or not. 3018 */ 3019 /* 3020 * First just remove the entry from the name cache, as it 3021 * is most likely the only entry for this vp. 3022 */ 3023 dnlc_remove(ndvp, nnm); 3024 /* 3025 * If the file has a v_count > 1 then there may be more 3026 * than one entry in the name cache due to multiple links 3027 * or an open file, but we don't have the real reference 3028 * count so flush all possible entries. 3029 */ 3030 if (nvp->v_count > 1) 3031 dnlc_purge_vp(nvp); 3032 3033 /* 3034 * If the vnode is active and is not a directory, 3035 * arrange to rename it to a 3036 * temporary file so that it will continue to be 3037 * accessible. This implements the "unlink-open-file" 3038 * semantics for the target of a rename operation. 3039 * Before doing this though, make sure that the 3040 * source and target files are not already the same. 3041 */ 3042 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 3043 /* 3044 * Lookup the source name. 3045 */ 3046 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, 3047 cr, 0); 3048 3049 /* 3050 * The source name *should* already exist. 3051 */ 3052 if (error) { 3053 VN_RELE(nvp); 3054 nfs_rw_exit(&odrp->r_rwlock); 3055 nfs_rw_exit(&ndrp->r_rwlock); 3056 return (error); 3057 } 3058 3059 /* 3060 * Compare the two vnodes. If they are the same, 3061 * just release all held vnodes and return success. 3062 */ 3063 if (ovp == nvp) { 3064 VN_RELE(ovp); 3065 VN_RELE(nvp); 3066 nfs_rw_exit(&odrp->r_rwlock); 3067 nfs_rw_exit(&ndrp->r_rwlock); 3068 return (0); 3069 } 3070 3071 /* 3072 * Can't mix and match directories and non- 3073 * directories in rename operations. We already 3074 * know that the target is not a directory. If 3075 * the source is a directory, return an error.
*/ 3077 if (ovp->v_type == VDIR) { 3078 VN_RELE(ovp); 3079 VN_RELE(nvp); 3080 nfs_rw_exit(&odrp->r_rwlock); 3081 nfs_rw_exit(&ndrp->r_rwlock); 3082 return (ENOTDIR); 3083 } 3084 3085 /* 3086 * The target file exists, is not the same as 3087 * the source file, and is active. Link it 3088 * to a temporary filename to avoid having 3089 * the server remove the file completely. 3090 */ 3091 tmpname = newname(); 3092 error = nfs3_link(ndvp, nvp, tmpname, cr); 3093 if (error == EOPNOTSUPP) { 3094 error = nfs3_rename(ndvp, nnm, ndvp, tmpname, 3095 cr); 3096 } 3097 if (error) { 3098 kmem_free(tmpname, MAXNAMELEN); 3099 VN_RELE(ovp); 3100 VN_RELE(nvp); 3101 nfs_rw_exit(&odrp->r_rwlock); 3102 nfs_rw_exit(&ndrp->r_rwlock); 3103 return (error); 3104 } 3105 rp = VTOR(nvp); 3106 mutex_enter(&rp->r_statelock); 3107 if (rp->r_unldvp == NULL) { 3108 VN_HOLD(ndvp); 3109 rp->r_unldvp = ndvp; 3110 if (rp->r_unlcred != NULL) 3111 crfree(rp->r_unlcred); 3112 crhold(cr); 3113 rp->r_unlcred = cr; 3114 rp->r_unlname = tmpname; 3115 } else { 3116 kmem_free(rp->r_unlname, MAXNAMELEN); 3117 rp->r_unlname = tmpname; 3118 } 3119 mutex_exit(&rp->r_statelock); 3120 } 3121 3122 VN_RELE(nvp); 3123 } 3124 3125 if (ovp == NULL) { 3126 /* 3127 * When renaming directories to be a subdirectory of a 3128 * different parent, the dnlc entry for ".." will no 3129 * longer be valid, so it must be removed. 3130 * 3131 * We do a lookup here to determine whether we are renaming 3132 * a directory and we need to check if we are renaming 3133 * an unlinked file. This might have already been done 3134 * in previous code, so we check ovp == NULL to avoid 3135 * doing it twice. 3136 */ 3137 3138 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 3139 /* 3140 * The source name *should* already exist. 3141 */ 3142 if (error) { 3143 nfs_rw_exit(&odrp->r_rwlock); 3144 nfs_rw_exit(&ndrp->r_rwlock); 3145 return (error); 3146 } 3147 ASSERT(ovp != NULL); 3148 } 3149 3150 dnlc_remove(odvp, onm); 3151 dnlc_remove(ndvp, nnm); 3152 3153 setdiropargs3(&args.from, onm, odvp); 3154 setdiropargs3(&args.to, nnm, ndvp); 3155 3156 douprintf = 1; 3157 3158 t = gethrtime(); 3159 3160 error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME, 3161 xdr_RENAME3args, (caddr_t)&args, 3162 xdr_RENAME3res, (caddr_t)&res, cr, 3163 &douprintf, &res.status, 0, NULL); 3164 3165 if (error) { 3166 PURGE_ATTRCACHE(odvp); 3167 PURGE_ATTRCACHE(ndvp); 3168 VN_RELE(ovp); 3169 nfs_rw_exit(&odrp->r_rwlock); 3170 nfs_rw_exit(&ndrp->r_rwlock); 3171 return (error); 3172 } 3173 3174 error = geterrno3(res.status); 3175 3176 if (!error) { 3177 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr); 3178 if (HAVE_RDDIR_CACHE(odrp)) 3179 nfs_purge_rddir_cache(odvp); 3180 if (ndvp != odvp) { 3181 nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr); 3182 if (HAVE_RDDIR_CACHE(ndrp)) 3183 nfs_purge_rddir_cache(ndvp); 3184 } 3185 /* 3186 * when renaming directories to be a subdirectory of a 3187 * different parent, the dnlc entry for ".." will no 3188 * longer be valid, so it must be removed 3189 */ 3190 rp = VTOR(ovp); 3191 if (ndvp != odvp) { 3192 if (ovp->v_type == VDIR) { 3193 dnlc_remove(ovp, ".."); 3194 if (HAVE_RDDIR_CACHE(rp)) 3195 nfs_purge_rddir_cache(ovp); 3196 } 3197 } 3198 3199 /* 3200 * If we are renaming the unlinked file, update the 3201 * r_unldvp and r_unlname as needed.
3202 */ 3203 mutex_enter(&rp->r_statelock); 3204 if (rp->r_unldvp != NULL) { 3205 if (strcmp(rp->r_unlname, onm) == 0) { 3206 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 3207 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 3208 3209 if (ndvp != rp->r_unldvp) { 3210 VN_RELE(rp->r_unldvp); 3211 rp->r_unldvp = ndvp; 3212 VN_HOLD(ndvp); 3213 } 3214 } 3215 } 3216 mutex_exit(&rp->r_statelock); 3217 } else { 3218 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr); 3219 if (ndvp != odvp) { 3220 nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t, 3221 cr); 3222 } 3223 /* 3224 * System V defines rename to return EEXIST, not 3225 * ENOTEMPTY if the target directory is not empty. 3226 * Over the wire, the error is NFSERR_ENOTEMPTY 3227 * which geterrno maps to ENOTEMPTY. 3228 */ 3229 if (error == ENOTEMPTY) 3230 error = EEXIST; 3231 } 3232 3233 VN_RELE(ovp); 3234 3235 nfs_rw_exit(&odrp->r_rwlock); 3236 nfs_rw_exit(&ndrp->r_rwlock); 3237 3238 return (error); 3239 } 3240 3241 static int 3242 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 3243 { 3244 int error; 3245 MKDIR3args args; 3246 MKDIR3res res; 3247 int douprintf; 3248 struct vattr vattr; 3249 vnode_t *vp; 3250 rnode_t *drp; 3251 hrtime_t t; 3252 3253 if (nfs_zone() != VTOMI(dvp)->mi_zone) 3254 return (EPERM); 3255 setdiropargs3(&args.where, nm, dvp); 3256 3257 /* 3258 * Decide what the group-id and set-gid bit of the created directory 3259 * should be. May have to do a setattr to get the gid right. 3260 */ 3261 error = setdirgid(dvp, &va->va_gid, cr); 3262 if (error) 3263 return (error); 3264 error = setdirmode(dvp, &va->va_mode, cr); 3265 if (error) 3266 return (error); 3267 va->va_mask |= AT_MODE|AT_GID; 3268 3269 error = vattr_to_sattr3(va, &args.attributes); 3270 if (error) { 3271 /* req time field(s) overflow - return immediately */ 3272 return (error); 3273 } 3274 3275 drp = VTOR(dvp); 3276 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3277 return (EINTR); 3278 3279 dnlc_remove(dvp, nm); 3280 3281 douprintf = 1; 3282 3283 t = gethrtime(); 3284 3285 error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR, 3286 xdr_MKDIR3args, (caddr_t)&args, 3287 xdr_MKDIR3res, (caddr_t)&res, cr, 3288 &douprintf, &res.status, 0, NULL); 3289 3290 if (error) { 3291 PURGE_ATTRCACHE(dvp); 3292 nfs_rw_exit(&drp->r_rwlock); 3293 return (error); 3294 } 3295 3296 error = geterrno3(res.status); 3297 if (!error) { 3298 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3299 if (HAVE_RDDIR_CACHE(drp)) 3300 nfs_purge_rddir_cache(dvp); 3301 3302 if (!res.resok.obj.handle_follows) { 3303 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 3304 if (error) { 3305 nfs_rw_exit(&drp->r_rwlock); 3306 return (error); 3307 } 3308 } else { 3309 if (res.resok.obj_attributes.attributes) { 3310 vp = makenfs3node(&res.resok.obj.handle, 3311 &res.resok.obj_attributes.attr, 3312 dvp->v_vfsp, t, cr, NULL, NULL); 3313 } else { 3314 vp = makenfs3node(&res.resok.obj.handle, NULL, 3315 dvp->v_vfsp, t, cr, NULL, NULL); 3316 if (vp->v_type == VNON) { 3317 vattr.va_mask = AT_TYPE; 3318 error = nfs3getattr(vp, &vattr, cr); 3319 if (error) { 3320 VN_RELE(vp); 3321 nfs_rw_exit(&drp->r_rwlock); 3322 return (error); 3323 } 3324 vp->v_type = vattr.va_type; 3325 } 3326 } 3327 dnlc_update(dvp, nm, vp); 3328 } 3329 if (va->va_gid != VTOR(vp)->r_attr.va_gid) { 3330 va->va_mask = AT_GID; 3331 (void) nfs3setattr(vp, va, 0, cr); 3332 } 3333 *vpp = vp; 3334 } else { 3335 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3336 PURGE_STALE_FH(error, dvp, cr); 
3337 } 3338 3339 nfs_rw_exit(&drp->r_rwlock); 3340 3341 return (error); 3342 } 3343 3344 static int 3345 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 3346 { 3347 int error; 3348 RMDIR3args args; 3349 RMDIR3res res; 3350 vnode_t *vp; 3351 int douprintf; 3352 rnode_t *drp; 3353 hrtime_t t; 3354 3355 if (nfs_zone() != VTOMI(dvp)->mi_zone) 3356 return (EPERM); 3357 drp = VTOR(dvp); 3358 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3359 return (EINTR); 3360 3361 /* 3362 * Attempt to prevent a rmdir(".") from succeeding. 3363 */ 3364 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 3365 if (error) { 3366 nfs_rw_exit(&drp->r_rwlock); 3367 return (error); 3368 } 3369 3370 if (vp == cdir) { 3371 VN_RELE(vp); 3372 nfs_rw_exit(&drp->r_rwlock); 3373 return (EINVAL); 3374 } 3375 3376 setdiropargs3(&args.object, nm, dvp); 3377 3378 /* 3379 * First just remove the entry from the name cache, as it 3380 * is most likely an entry for this vp. 3381 */ 3382 dnlc_remove(dvp, nm); 3383 3384 /* 3385 * If there vnode reference count is greater than one, then 3386 * there may be additional references in the DNLC which will 3387 * need to be purged. First, trying removing the entry for 3388 * the parent directory and see if that removes the additional 3389 * reference(s). If that doesn't do it, then use dnlc_purge_vp 3390 * to completely remove any references to the directory which 3391 * might still exist in the DNLC. 3392 */ 3393 if (vp->v_count > 1) { 3394 dnlc_remove(vp, ".."); 3395 if (vp->v_count > 1) 3396 dnlc_purge_vp(vp); 3397 } 3398 3399 douprintf = 1; 3400 3401 t = gethrtime(); 3402 3403 error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR, 3404 xdr_diropargs3, (caddr_t)&args, 3405 xdr_RMDIR3res, (caddr_t)&res, cr, 3406 &douprintf, &res.status, 0, NULL); 3407 3408 PURGE_ATTRCACHE(vp); 3409 3410 if (error) { 3411 PURGE_ATTRCACHE(dvp); 3412 VN_RELE(vp); 3413 nfs_rw_exit(&drp->r_rwlock); 3414 return (error); 3415 } 3416 3417 error = geterrno3(res.status); 3418 if (!error) { 3419 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3420 if (HAVE_RDDIR_CACHE(drp)) 3421 nfs_purge_rddir_cache(dvp); 3422 if (HAVE_RDDIR_CACHE(VTOR(vp))) 3423 nfs_purge_rddir_cache(vp); 3424 } else { 3425 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3426 PURGE_STALE_FH(error, dvp, cr); 3427 /* 3428 * System V defines rmdir to return EEXIST, not 3429 * ENOTEMPTY if the directory is not empty. Over 3430 * the wire, the error is NFSERR_ENOTEMPTY which 3431 * geterrno maps to ENOTEMPTY. 
3432 */ 3433 if (error == ENOTEMPTY) 3434 error = EEXIST; 3435 } 3436 3437 VN_RELE(vp); 3438 3439 nfs_rw_exit(&drp->r_rwlock); 3440 3441 return (error); 3442 } 3443 3444 static int 3445 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 3446 { 3447 int error; 3448 SYMLINK3args args; 3449 SYMLINK3res res; 3450 int douprintf; 3451 mntinfo_t *mi; 3452 vnode_t *vp; 3453 rnode_t *rp; 3454 char *contents; 3455 rnode_t *drp; 3456 hrtime_t t; 3457 3458 mi = VTOMI(dvp); 3459 3460 if (nfs_zone() != mi->mi_zone) 3461 return (EPERM); 3462 if (!(mi->mi_flags & MI_SYMLINK)) 3463 return (EOPNOTSUPP); 3464 3465 setdiropargs3(&args.where, lnm, dvp); 3466 error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes); 3467 if (error) { 3468 /* req time field(s) overflow - return immediately */ 3469 return (error); 3470 } 3471 args.symlink.symlink_data = tnm; 3472 3473 drp = VTOR(dvp); 3474 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3475 return (EINTR); 3476 3477 dnlc_remove(dvp, lnm); 3478 3479 douprintf = 1; 3480 3481 t = gethrtime(); 3482 3483 error = rfs3call(mi, NFSPROC3_SYMLINK, 3484 xdr_SYMLINK3args, (caddr_t)&args, 3485 xdr_SYMLINK3res, (caddr_t)&res, cr, 3486 &douprintf, &res.status, 0, NULL); 3487 3488 if (error) { 3489 PURGE_ATTRCACHE(dvp); 3490 nfs_rw_exit(&drp->r_rwlock); 3491 return (error); 3492 } 3493 3494 error = geterrno3(res.status); 3495 if (!error) { 3496 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3497 if (HAVE_RDDIR_CACHE(drp)) 3498 nfs_purge_rddir_cache(dvp); 3499 3500 if (res.resok.obj.handle_follows) { 3501 if (res.resok.obj_attributes.attributes) { 3502 vp = makenfs3node(&res.resok.obj.handle, 3503 &res.resok.obj_attributes.attr, 3504 dvp->v_vfsp, t, cr, NULL, NULL); 3505 } else { 3506 vp = makenfs3node(&res.resok.obj.handle, NULL, 3507 dvp->v_vfsp, t, cr, NULL, NULL); 3508 vp->v_type = VLNK; 3509 vp->v_rdev = 0; 3510 } 3511 dnlc_update(dvp, lnm, vp); 3512 rp = VTOR(vp); 3513 if (nfs3_do_symlink_cache && 3514 rp->r_symlink.contents == NULL) { 3515 3516 contents = kmem_alloc(MAXPATHLEN, 3517 KM_NOSLEEP); 3518 3519 if (contents != NULL) { 3520 mutex_enter(&rp->r_statelock); 3521 if (rp->r_symlink.contents == NULL) { 3522 rp->r_symlink.len = strlen(tnm); 3523 bcopy(tnm, contents, 3524 rp->r_symlink.len); 3525 rp->r_symlink.contents = 3526 contents; 3527 rp->r_symlink.size = MAXPATHLEN; 3528 mutex_exit(&rp->r_statelock); 3529 } else { 3530 mutex_exit(&rp->r_statelock); 3531 kmem_free((void *)contents, 3532 MAXPATHLEN); 3533 } 3534 } 3535 } 3536 VN_RELE(vp); 3537 } 3538 } else { 3539 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3540 PURGE_STALE_FH(error, dvp, cr); 3541 if (error == EOPNOTSUPP) { 3542 mutex_enter(&mi->mi_lock); 3543 mi->mi_flags &= ~MI_SYMLINK; 3544 mutex_exit(&mi->mi_lock); 3545 } 3546 } 3547 3548 nfs_rw_exit(&drp->r_rwlock); 3549 3550 return (error); 3551 } 3552 3553 #ifdef DEBUG 3554 static int nfs3_readdir_cache_hits = 0; 3555 static int nfs3_readdir_cache_shorts = 0; 3556 static int nfs3_readdir_cache_waits = 0; 3557 static int nfs3_readdir_cache_misses = 0; 3558 static int nfs3_readdir_readahead = 0; 3559 #endif 3560 3561 static int nfs3_shrinkreaddir = 0; 3562 3563 /* 3564 * Read directory entries. 3565 * There are some weird things to look out for here. The uio_loffset 3566 * field is either 0 or it is the offset returned from a previous 3567 * readdir. It is an opaque value used by the server to find the 3568 * correct directory block to read. 
The count field is the number 3569 * of blocks to read on the server. This is advisory only, the server 3570 * may return only one block's worth of entries. Entries may be compressed 3571 * on the server. 3572 */ 3573 static int 3574 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 3575 { 3576 int error; 3577 size_t count; 3578 rnode_t *rp; 3579 rddir_cache *rdc; 3580 rddir_cache *nrdc; 3581 rddir_cache *rrdc; 3582 #ifdef DEBUG 3583 int missed; 3584 #endif 3585 int doreadahead; 3586 rddir_cache srdc; 3587 avl_index_t where; 3588 3589 if (nfs_zone() != VTOMI(vp)->mi_zone) 3590 return (EIO); 3591 rp = VTOR(vp); 3592 3593 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 3594 3595 /* 3596 * Make sure that the directory cache is valid. 3597 */ 3598 if (HAVE_RDDIR_CACHE(rp)) { 3599 if (nfs_disable_rddir_cache) { 3600 /* 3601 * Setting nfs_disable_rddir_cache in /etc/system 3602 * allows interoperability with servers that do not 3603 * properly update the attributes of directories. 3604 * Any cached information gets purged before an 3605 * access is made to it. 3606 */ 3607 nfs_purge_rddir_cache(vp); 3608 } else { 3609 error = nfs3_validate_caches(vp, cr); 3610 if (error) 3611 return (error); 3612 } 3613 } 3614 3615 /* 3616 * It is possible that some servers may not be able to correctly 3617 * handle a large READDIR or READDIRPLUS request due to bugs in 3618 * their implementation. In order to continue to interoperate 3619 * with them, this workaround is provided to limit the maximum 3620 * size of a READDIRPLUS request to 1024. In any case, the request 3621 * size is limited to MAXBSIZE. 3622 */ 3623 count = MIN(uiop->uio_iov->iov_len, 3624 nfs3_shrinkreaddir ? 1024 : MAXBSIZE); 3625 3626 nrdc = NULL; 3627 #ifdef DEBUG 3628 missed = 0; 3629 #endif 3630 top: 3631 /* 3632 * Short circuit last readdir which always returns 0 bytes. 3633 * This can be done after the directory has been read through 3634 * completely at least once. This will set r_direof which 3635 * can be used to find the value of the last cookie. 3636 */ 3637 mutex_enter(&rp->r_statelock); 3638 if (rp->r_direof != NULL && 3639 uiop->uio_loffset == rp->r_direof->nfs3_ncookie) { 3640 mutex_exit(&rp->r_statelock); 3641 #ifdef DEBUG 3642 nfs3_readdir_cache_shorts++; 3643 #endif 3644 if (eofp) 3645 *eofp = 1; 3646 if (nrdc != NULL) 3647 rddir_cache_rele(nrdc); 3648 return (0); 3649 } 3650 /* 3651 * Look for a cache entry. Cache entries are identified 3652 * by the NFS cookie value and the byte count requested. 3653 */ 3654 srdc.nfs3_cookie = uiop->uio_loffset; 3655 srdc.buflen = count; 3656 rdc = avl_find(&rp->r_dir, &srdc, &where); 3657 if (rdc != NULL) { 3658 rddir_cache_hold(rdc); 3659 /* 3660 * If the cache entry is in the process of being 3661 * filled in, wait until this completes. The 3662 * RDDIRWAIT bit is set to indicate that someone 3663 * is waiting, and when the thread currently 3664 * filling the entry is done, it should do a 3665 * cv_broadcast to wake up all of the threads 3666 * waiting for it to finish. 3667 */ 3668 if (rdc->flags & RDDIR) { 3669 nfs_rw_exit(&rp->r_rwlock); 3670 rdc->flags |= RDDIRWAIT; 3671 #ifdef DEBUG 3672 nfs3_readdir_cache_waits++; 3673 #endif 3674 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) { 3675 /* 3676 * We got interrupted, probably 3677 * the user typed ^C or an alarm 3678 * fired. We free the new entry 3679 * if we allocated one.
*/ 3681 mutex_exit(&rp->r_statelock); 3682 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3683 RW_READER, FALSE); 3684 rddir_cache_rele(rdc); 3685 if (nrdc != NULL) 3686 rddir_cache_rele(nrdc); 3687 return (EINTR); 3688 } 3689 mutex_exit(&rp->r_statelock); 3690 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3691 RW_READER, FALSE); 3692 rddir_cache_rele(rdc); 3693 goto top; 3694 } 3695 /* 3696 * Check to see if a readdir is required to 3697 * fill the entry. If so, mark this entry 3698 * as being filled, remove our reference, 3699 * and branch to the code to fill the entry. 3700 */ 3701 if (rdc->flags & RDDIRREQ) { 3702 rdc->flags &= ~RDDIRREQ; 3703 rdc->flags |= RDDIR; 3704 if (nrdc != NULL) 3705 rddir_cache_rele(nrdc); 3706 nrdc = rdc; 3707 mutex_exit(&rp->r_statelock); 3708 goto bottom; 3709 } 3710 #ifdef DEBUG 3711 if (!missed) 3712 nfs3_readdir_cache_hits++; 3713 #endif 3714 /* 3715 * If an error occurred while attempting 3716 * to fill the cache entry, just return it. 3717 */ 3718 if (rdc->error) { 3719 error = rdc->error; 3720 mutex_exit(&rp->r_statelock); 3721 rddir_cache_rele(rdc); 3722 if (nrdc != NULL) 3723 rddir_cache_rele(nrdc); 3724 return (error); 3725 } 3726 3727 /* 3728 * The cache entry is complete and good, 3729 * copyout the dirent structs to the calling 3730 * thread. 3731 */ 3732 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 3733 3734 /* 3735 * If no error occurred during the copyout, 3736 * update the offset in the uio struct to 3737 * contain the value of the next cookie 3738 * and set the eof value appropriately. 3739 */ 3740 if (!error) { 3741 uiop->uio_loffset = rdc->nfs3_ncookie; 3742 if (eofp) 3743 *eofp = rdc->eof; 3744 } 3745 3746 /* 3747 * Decide whether to do readahead. 3748 * 3749 * Don't if we have already read to the end of 3750 * the directory. There is nothing more to read. 3751 * 3752 * Don't if the application is not doing 3753 * lookups in the directory. The readahead 3754 * is only effective if the application can 3755 * be doing work while an async thread is 3756 * handling the over the wire request. 3757 */ 3758 if (rdc->eof) { 3759 rp->r_direof = rdc; 3760 doreadahead = FALSE; 3761 } else if (!(rp->r_flags & RLOOKUP)) 3762 doreadahead = FALSE; 3763 else 3764 doreadahead = TRUE; 3765 3766 if (!doreadahead) { 3767 mutex_exit(&rp->r_statelock); 3768 rddir_cache_rele(rdc); 3769 if (nrdc != NULL) 3770 rddir_cache_rele(nrdc); 3771 return (error); 3772 } 3773 3774 /* 3775 * Check to see whether we found an entry 3776 * for the readahead. If so, we don't need 3777 * to do anything further, so free the new 3778 * entry if one was allocated. Otherwise, 3779 * allocate a new entry, add it to the cache, 3780 * and then initiate an asynchronous readdir 3781 * operation to fill it.
*/ 3783 srdc.nfs3_cookie = rdc->nfs3_ncookie; 3784 srdc.buflen = count; 3785 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3786 if (rrdc != NULL) { 3787 if (nrdc != NULL) 3788 rddir_cache_rele(nrdc); 3789 } else { 3790 if (nrdc != NULL) 3791 rrdc = nrdc; 3792 else { 3793 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3794 } 3795 if (rrdc != NULL) { 3796 rrdc->nfs3_cookie = rdc->nfs3_ncookie; 3797 rrdc->buflen = count; 3798 avl_insert(&rp->r_dir, rrdc, where); 3799 rddir_cache_hold(rrdc); 3800 mutex_exit(&rp->r_statelock); 3801 rddir_cache_rele(rdc); 3802 #ifdef DEBUG 3803 nfs3_readdir_readahead++; 3804 #endif 3805 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir); 3806 return (error); 3807 } 3808 } 3809 3810 mutex_exit(&rp->r_statelock); 3811 rddir_cache_rele(rdc); 3812 return (error); 3813 } 3814 3815 /* 3816 * Didn't find an entry in the cache. Construct a new empty 3817 * entry and link it into the cache. Other processes attempting 3818 * to access this entry will need to wait until it is filled in. 3819 * 3820 * Since kmem_alloc may block, another pass through the cache 3821 * will need to be taken to make sure that another process 3822 * hasn't already added an entry to the cache for this request. 3823 */ 3824 if (nrdc == NULL) { 3825 mutex_exit(&rp->r_statelock); 3826 nrdc = rddir_cache_alloc(KM_SLEEP); 3827 nrdc->nfs3_cookie = uiop->uio_loffset; 3828 nrdc->buflen = count; 3829 goto top; 3830 } 3831 3832 /* 3833 * Add this entry to the cache. 3834 */ 3835 avl_insert(&rp->r_dir, nrdc, where); 3836 rddir_cache_hold(nrdc); 3837 mutex_exit(&rp->r_statelock); 3838 3839 bottom: 3840 #ifdef DEBUG 3841 missed = 1; 3842 nfs3_readdir_cache_misses++; 3843 #endif 3844 /* 3845 * Do the readdir. This routine decides whether to use 3846 * READDIR or READDIRPLUS. 3847 */ 3848 error = do_nfs3readdir(vp, nrdc, cr); 3849 3850 /* 3851 * If this operation failed, just return the error which occurred. 3852 */ 3853 if (error != 0) 3854 return (error); 3855 3856 /* 3857 * Since the RPC operation will have taken some time and blocked 3858 * this process, another pass through the cache will need to be 3859 * taken to find the correct cache entry. It is possible that 3860 * the correct cache entry will not be there (although one was 3861 * added) because the directory changed during the RPC operation 3862 * and the readdir cache was flushed. In this case, just start 3863 * over. It is hoped that this will not happen too often... :-) 3864 */ 3865 nrdc = NULL; 3866 goto top; 3867 /* NOTREACHED */ 3868 } 3869 3870 static int 3871 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3872 { 3873 int error; 3874 rnode_t *rp; 3875 mntinfo_t *mi; 3876 3877 rp = VTOR(vp); 3878 mi = VTOMI(vp); 3879 ASSERT(nfs_zone() == mi->mi_zone); 3880 /* 3881 * Issue the proper request. 3882 * 3883 * If the server does not support READDIRPLUS, then use READDIR. 3884 * 3885 * Otherwise -- 3886 * Issue a READDIRPLUS if reading to fill an empty cache or if 3887 * an application has performed a lookup in the directory which 3888 * required an over the wire lookup. The use of READDIRPLUS 3889 * will help to (re)populate the DNLC.
3890 */ 3891 if (!(mi->mi_flags & MI_READDIRONLY) && 3892 (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) { 3893 if (rp->r_flags & RREADDIRPLUS) { 3894 mutex_enter(&rp->r_statelock); 3895 rp->r_flags &= ~RREADDIRPLUS; 3896 mutex_exit(&rp->r_statelock); 3897 } 3898 nfs3readdirplus(vp, rdc, cr); 3899 if (rdc->error == EOPNOTSUPP) 3900 nfs3readdir(vp, rdc, cr); 3901 } else 3902 nfs3readdir(vp, rdc, cr); 3903 3904 mutex_enter(&rp->r_statelock); 3905 rdc->flags &= ~RDDIR; 3906 if (rdc->flags & RDDIRWAIT) { 3907 rdc->flags &= ~RDDIRWAIT; 3908 cv_broadcast(&rdc->cv); 3909 } 3910 error = rdc->error; 3911 if (error) 3912 rdc->flags |= RDDIRREQ; 3913 mutex_exit(&rp->r_statelock); 3914 3915 rddir_cache_rele(rdc); 3916 3917 return (error); 3918 } 3919 3920 static void 3921 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3922 { 3923 int error; 3924 READDIR3args args; 3925 READDIR3vres res; 3926 vattr_t dva; 3927 rnode_t *rp; 3928 int douprintf; 3929 failinfo_t fi, *fip = NULL; 3930 mntinfo_t *mi; 3931 hrtime_t t; 3932 3933 rp = VTOR(vp); 3934 mi = VTOMI(vp); 3935 ASSERT(nfs_zone() == mi->mi_zone); 3936 3937 args.dir = *RTOFH3(rp); 3938 args.cookie = (cookie3)rdc->nfs3_cookie; 3939 args.cookieverf = rp->r_cookieverf; 3940 args.count = rdc->buflen; 3941 3942 /* 3943 * NFS client failover support 3944 * suppress failover unless we have a zero cookie 3945 */ 3946 if (args.cookie == (cookie3) 0) { 3947 fi.vp = vp; 3948 fi.fhp = (caddr_t)&args.dir; 3949 fi.copyproc = nfs3copyfh; 3950 fi.lookupproc = nfs3lookup; 3951 fi.xattrdirproc = acl_getxattrdir3; 3952 fip = &fi; 3953 } 3954 3955 #ifdef DEBUG 3956 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP); 3957 #else 3958 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3959 #endif 3960 3961 res.entries = (dirent64_t *)rdc->entries; 3962 res.entries_size = rdc->buflen; 3963 res.dir_attributes.fres.vap = &dva; 3964 res.dir_attributes.fres.vp = vp; 3965 res.loff = rdc->nfs3_cookie; 3966 3967 douprintf = 1; 3968 3969 if (mi->mi_io_kstats) { 3970 mutex_enter(&mi->mi_lock); 3971 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3972 mutex_exit(&mi->mi_lock); 3973 } 3974 3975 t = gethrtime(); 3976 3977 error = rfs3call(VTOMI(vp), NFSPROC3_READDIR, 3978 xdr_READDIR3args, (caddr_t)&args, 3979 xdr_READDIR3vres, (caddr_t)&res, cr, 3980 &douprintf, &res.status, 0, fip); 3981 3982 if (mi->mi_io_kstats) { 3983 mutex_enter(&mi->mi_lock); 3984 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3985 mutex_exit(&mi->mi_lock); 3986 } 3987 3988 if (error) 3989 goto err; 3990 3991 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr); 3992 3993 error = geterrno3(res.status); 3994 if (error) { 3995 PURGE_STALE_FH(error, vp, cr); 3996 goto err; 3997 } 3998 3999 if (mi->mi_io_kstats) { 4000 mutex_enter(&mi->mi_lock); 4001 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 4002 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size; 4003 mutex_exit(&mi->mi_lock); 4004 } 4005 4006 rdc->nfs3_ncookie = res.loff; 4007 rp->r_cookieverf = res.cookieverf; 4008 rdc->eof = res.eof ? 1 : 0; 4009 rdc->entlen = res.size; 4010 ASSERT(rdc->entlen <= rdc->buflen); 4011 rdc->error = 0; 4012 return; 4013 4014 err: 4015 kmem_free(rdc->entries, rdc->buflen); 4016 rdc->entries = NULL; 4017 rdc->error = error; 4018 } 4019 4020 /* 4021 * Read directory entries. 4022 * There are some weird things to look out for here. The uio_loffset 4023 * field is either 0 or it is the offset returned from a previous 4024 * readdir. It is an opaque value used by the server to find the 4025 * correct directory block to read. 
The count field is the number 4026 * of blocks to read on the server. This is advisory only, the server 4027 * may return only one block's worth of entries. Entries may be compressed 4028 * on the server. 4029 */ 4030 static void 4031 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 4032 { 4033 int error; 4034 READDIRPLUS3args args; 4035 READDIRPLUS3vres res; 4036 vattr_t dva; 4037 rnode_t *rp; 4038 mntinfo_t *mi; 4039 int douprintf; 4040 failinfo_t fi, *fip = NULL; 4041 4042 rp = VTOR(vp); 4043 mi = VTOMI(vp); 4044 ASSERT(nfs_zone() == mi->mi_zone); 4045 4046 args.dir = *RTOFH3(rp); 4047 args.cookie = (cookie3)rdc->nfs3_cookie; 4048 args.cookieverf = rp->r_cookieverf; 4049 args.dircount = rdc->buflen; 4050 args.maxcount = mi->mi_tsize; 4051 4052 /* 4053 * NFS client failover support 4054 * suppress failover unless we have a zero cookie 4055 */ 4056 if (args.cookie == (cookie3)0) { 4057 fi.vp = vp; 4058 fi.fhp = (caddr_t)&args.dir; 4059 fi.copyproc = nfs3copyfh; 4060 fi.lookupproc = nfs3lookup; 4061 fi.xattrdirproc = acl_getxattrdir3; 4062 fip = &fi; 4063 } 4064 4065 #ifdef DEBUG 4066 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP); 4067 #else 4068 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 4069 #endif 4070 4071 res.entries = (dirent64_t *)rdc->entries; 4072 res.entries_size = rdc->buflen; 4073 res.dir_attributes.fres.vap = &dva; 4074 res.dir_attributes.fres.vp = vp; 4075 res.loff = rdc->nfs3_cookie; 4076 res.credentials = cr; 4077 4078 douprintf = 1; 4079 4080 if (mi->mi_io_kstats) { 4081 mutex_enter(&mi->mi_lock); 4082 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 4083 mutex_exit(&mi->mi_lock); 4084 } 4085 4086 res.time = gethrtime(); 4087 4088 error = rfs3call(mi, NFSPROC3_READDIRPLUS, 4089 xdr_READDIRPLUS3args, (caddr_t)&args, 4090 xdr_READDIRPLUS3vres, (caddr_t)&res, cr, 4091 &douprintf, &res.status, 0, fip); 4092 4093 if (mi->mi_io_kstats) { 4094 mutex_enter(&mi->mi_lock); 4095 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 4096 mutex_exit(&mi->mi_lock); 4097 } 4098 4099 if (error) { 4100 goto err; 4101 } 4102 4103 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr); 4104 4105 error = geterrno3(res.status); 4106 if (error) { 4107 PURGE_STALE_FH(error, vp, cr); 4108 if (error == EOPNOTSUPP) { 4109 mutex_enter(&mi->mi_lock); 4110 mi->mi_flags |= MI_READDIRONLY; 4111 mutex_exit(&mi->mi_lock); 4112 } 4113 goto err; 4114 } 4115 4116 if (mi->mi_io_kstats) { 4117 mutex_enter(&mi->mi_lock); 4118 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 4119 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size; 4120 mutex_exit(&mi->mi_lock); 4121 } 4122 4123 rdc->nfs3_ncookie = res.loff; 4124 rp->r_cookieverf = res.cookieverf; 4125 rdc->eof = res.eof ? 
1 : 0; 4126 rdc->entlen = res.size; 4127 ASSERT(rdc->entlen <= rdc->buflen); 4128 rdc->error = 0; 4129 4130 return; 4131 4132 err: 4133 kmem_free(rdc->entries, rdc->buflen); 4134 rdc->entries = NULL; 4135 rdc->error = error; 4136 } 4137 4138 #ifdef DEBUG 4139 static int nfs3_bio_do_stop = 0; 4140 #endif 4141 4142 static int 4143 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr) 4144 { 4145 rnode_t *rp = VTOR(bp->b_vp); 4146 int count; 4147 int error; 4148 cred_t *cred; 4149 offset_t offset; 4150 4151 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 4152 offset = ldbtob(bp->b_lblkno); 4153 4154 DTRACE_IO1(start, struct buf *, bp); 4155 4156 if (bp->b_flags & B_READ) { 4157 mutex_enter(&rp->r_statelock); 4158 if (rp->r_cred != NULL) { 4159 cred = rp->r_cred; 4160 crhold(cred); 4161 } else { 4162 rp->r_cred = cr; 4163 crhold(cr); 4164 cred = cr; 4165 crhold(cred); 4166 } 4167 mutex_exit(&rp->r_statelock); 4168 read_again: 4169 error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr, 4170 offset, bp->b_bcount, &bp->b_resid, cred); 4171 crfree(cred); 4172 if (!error) { 4173 if (bp->b_resid) { 4174 /* 4175 * Didn't get it all because we hit EOF, 4176 * zero all the memory beyond the EOF. 4177 */ 4178 /* bzero(rdaddr + */ 4179 bzero(bp->b_un.b_addr + 4180 bp->b_bcount - bp->b_resid, bp->b_resid); 4181 } 4182 mutex_enter(&rp->r_statelock); 4183 if (bp->b_resid == bp->b_bcount && 4184 offset >= rp->r_size) { 4185 /* 4186 * We didn't read anything at all as we are 4187 * past EOF. Return an error indicator back 4188 * but don't destroy the pages (yet). 4189 */ 4190 error = NFS_EOF; 4191 } 4192 mutex_exit(&rp->r_statelock); 4193 } else if (error == EACCES) { 4194 mutex_enter(&rp->r_statelock); 4195 if (cred != cr) { 4196 if (rp->r_cred != NULL) 4197 crfree(rp->r_cred); 4198 rp->r_cred = cr; 4199 crhold(cr); 4200 cred = cr; 4201 crhold(cred); 4202 mutex_exit(&rp->r_statelock); 4203 goto read_again; 4204 } 4205 mutex_exit(&rp->r_statelock); 4206 } 4207 } else { 4208 if (!(rp->r_flags & RSTALE)) { 4209 mutex_enter(&rp->r_statelock); 4210 if (rp->r_cred != NULL) { 4211 cred = rp->r_cred; 4212 crhold(cred); 4213 } else { 4214 rp->r_cred = cr; 4215 crhold(cr); 4216 cred = cr; 4217 crhold(cred); 4218 } 4219 mutex_exit(&rp->r_statelock); 4220 write_again: 4221 mutex_enter(&rp->r_statelock); 4222 count = MIN(bp->b_bcount, rp->r_size - offset); 4223 mutex_exit(&rp->r_statelock); 4224 if (count < 0) 4225 cmn_err(CE_PANIC, "nfs3_bio: write count < 0"); 4226 #ifdef DEBUG 4227 if (count == 0) { 4228 zcmn_err(getzoneid(), CE_WARN, 4229 "nfs3_bio: zero length write at %lld", 4230 offset); 4231 nfs_printfhandle(&rp->r_fh); 4232 if (nfs3_bio_do_stop) 4233 debug_enter("nfs3_bio"); 4234 } 4235 #endif 4236 error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset, 4237 count, cred, stab_comm); 4238 if (error == EACCES) { 4239 mutex_enter(&rp->r_statelock); 4240 if (cred != cr) { 4241 if (rp->r_cred != NULL) 4242 crfree(rp->r_cred); 4243 rp->r_cred = cr; 4244 crhold(cr); 4245 crfree(cred); 4246 cred = cr; 4247 crhold(cred); 4248 mutex_exit(&rp->r_statelock); 4249 goto write_again; 4250 } 4251 mutex_exit(&rp->r_statelock); 4252 } 4253 bp->b_error = error; 4254 if (error && error != EINTR) { 4255 /* 4256 * Don't print EDQUOT errors on the console. 4257 * Don't print asynchronous EACCES errors. 4258 * Don't print EFBIG errors. 4259 * Print all other write errors. 
4260 */ 4261 if (error != EDQUOT && error != EFBIG && 4262 (error != EACCES || 4263 !(bp->b_flags & B_ASYNC))) 4264 nfs_write_error(bp->b_vp, error, cred); 4265 /* 4266 * Update r_error and r_flags as appropriate. 4267 * If the error was ESTALE, then mark the 4268 * rnode as not being writeable and save 4269 * the error status. Otherwise, save any 4270 * errors which occur from asynchronous 4271 * page invalidations. Any errors occurring 4272 * from other operations should be saved 4273 * by the caller. 4274 */ 4275 mutex_enter(&rp->r_statelock); 4276 if (error == ESTALE) { 4277 rp->r_flags |= RSTALE; 4278 if (!rp->r_error) 4279 rp->r_error = error; 4280 } else if (!rp->r_error && 4281 (bp->b_flags & 4282 (B_INVAL|B_FORCE|B_ASYNC)) == 4283 (B_INVAL|B_FORCE|B_ASYNC)) { 4284 rp->r_error = error; 4285 } 4286 mutex_exit(&rp->r_statelock); 4287 } 4288 crfree(cred); 4289 } else 4290 error = rp->r_error; 4291 } 4292 4293 if (error != 0 && error != NFS_EOF) 4294 bp->b_flags |= B_ERROR; 4295 4296 DTRACE_IO1(done, struct buf *, bp); 4297 4298 return (error); 4299 } 4300 4301 static int 4302 nfs3_fid(vnode_t *vp, fid_t *fidp) 4303 { 4304 rnode_t *rp; 4305 4306 if (nfs_zone() != VTOMI(vp)->mi_zone) 4307 return (EIO); 4308 rp = VTOR(vp); 4309 4310 if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) { 4311 fidp->fid_len = rp->r_fh.fh_len; 4312 return (ENOSPC); 4313 } 4314 fidp->fid_len = rp->r_fh.fh_len; 4315 bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len); 4316 return (0); 4317 } 4318 4319 /* ARGSUSED2 */ 4320 static int 4321 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 4322 { 4323 rnode_t *rp = VTOR(vp); 4324 4325 if (!write_lock) { 4326 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 4327 return (V_WRITELOCK_FALSE); 4328 } 4329 4330 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) { 4331 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 4332 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp)) 4333 return (V_WRITELOCK_FALSE); 4334 nfs_rw_exit(&rp->r_rwlock); 4335 } 4336 4337 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 4338 return (V_WRITELOCK_TRUE); 4339 } 4340 4341 /* ARGSUSED */ 4342 static void 4343 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 4344 { 4345 rnode_t *rp = VTOR(vp); 4346 4347 nfs_rw_exit(&rp->r_rwlock); 4348 } 4349 4350 /* ARGSUSED */ 4351 static int 4352 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 4353 { 4354 4355 /* 4356 * Because we stuff the readdir cookie into the offset field, 4357 * someone may attempt to do an lseek with the cookie, which 4358 * we want to succeed. 4359 */ 4360 if (vp->v_type == VDIR) 4361 return (0); 4362 if (*noffp < 0) 4363 return (EINVAL); 4364 return (0); 4365 } 4366 4367 /* 4368 * Number of nfs3_bsize blocks to read ahead. 4369 */ 4370 static int nfs3_nra = 4; 4371 4372 #ifdef DEBUG 4373 static int nfs3_lostpage = 0; /* number of times we lost original page */ 4374 #endif 4375 4376 /* 4377 * Return all the pages from [off..off+len) in the file. 4378 */ 4379 static int 4380 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4381 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4382 enum seg_rw rw, cred_t *cr) 4383 { 4384 rnode_t *rp; 4385 int error; 4386 mntinfo_t *mi; 4387 4388 if (vp->v_flag & VNOMAP) 4389 return (ENOSYS); 4390 4391 if (nfs_zone() != VTOMI(vp)->mi_zone) 4392 return (EIO); 4393 if (protp != NULL) 4394 *protp = PROT_ALL; 4395 4396 /* 4397 * Now validate that the caches are up to date.
4398 */ 4399 error = nfs3_validate_caches(vp, cr); 4400 if (error) 4401 return (error); 4402 4403 rp = VTOR(vp); 4404 mi = VTOMI(vp); 4405 retry: 4406 mutex_enter(&rp->r_statelock); 4407 4408 /* 4409 * Don't create dirty pages faster than they 4410 * can be cleaned so that the system doesn't 4411 * get imbalanced. If the async queue is 4412 * maxed out, then wait for it to drain before 4413 * creating more dirty pages. Also, wait for 4414 * any threads doing pagewalks in the vop_getattr 4415 * entry points so that they don't block for 4416 * long periods. 4417 */ 4418 if (rw == S_CREATE) { 4419 while ((mi->mi_max_threads != 0 && 4420 rp->r_awcount > 2 * mi->mi_max_threads) || 4421 rp->r_gcount > 0) 4422 cv_wait(&rp->r_cv, &rp->r_statelock); 4423 } 4424 4425 /* 4426 * If we are getting called as a side effect of an nfs_write() 4427 * operation, the local file size might not be extended yet. 4428 * In this case we want to be able to return pages of zeroes. 4429 */ 4430 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 4431 mutex_exit(&rp->r_statelock); 4432 return (EFAULT); /* beyond EOF */ 4433 } 4434 4435 mutex_exit(&rp->r_statelock); 4436 4437 if (len <= PAGESIZE) { 4438 error = nfs3_getapage(vp, off, len, protp, pl, plsz, 4439 seg, addr, rw, cr); 4440 } else { 4441 error = pvn_getpages(nfs3_getapage, vp, off, len, protp, 4442 pl, plsz, seg, addr, rw, cr); 4443 } 4444 4445 switch (error) { 4446 case NFS_EOF: 4447 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 4448 goto retry; 4449 case ESTALE: 4450 PURGE_STALE_FH(error, vp, cr); 4451 } 4452 4453 return (error); 4454 } 4455 4456 /* 4457 * Called from pvn_getpages or nfs3_getpage to get a particular page. 4458 */ 4459 /* ARGSUSED */ 4460 static int 4461 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 4462 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4463 enum seg_rw rw, cred_t *cr) 4464 { 4465 rnode_t *rp; 4466 uint_t bsize; 4467 struct buf *bp; 4468 page_t *pp; 4469 u_offset_t lbn; 4470 u_offset_t io_off; 4471 u_offset_t blkoff; 4472 u_offset_t rablkoff; 4473 size_t io_len; 4474 uint_t blksize; 4475 int error; 4476 int readahead; 4477 int readahead_issued = 0; 4478 int ra_window; /* readahead window */ 4479 page_t *pagefound; 4480 page_t *savepp; 4481 4482 if (nfs_zone() != VTOMI(vp)->mi_zone) 4483 return (EIO); 4484 rp = VTOR(vp); 4485 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4486 4487 reread: 4488 bp = NULL; 4489 pp = NULL; 4490 pagefound = NULL; 4491 4492 if (pl != NULL) 4493 pl[0] = NULL; 4494 4495 error = 0; 4496 lbn = off / bsize; 4497 blkoff = lbn * bsize; 4498 4499 /* 4500 * Queueing up the readahead before doing the synchronous read 4501 * results in a significant increase in read throughput because 4502 * of the increased parallelism between the async threads and 4503 * the process context. 4504 */ 4505 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 4506 rw != S_CREATE && 4507 !(vp->v_flag & VNOCACHE)) { 4508 mutex_enter(&rp->r_statelock); 4509 4510 /* 4511 * Calculate the number of readaheads to do. 4512 * a) No readaheads at offset = 0. 4513 * b) Do maximum(nfs3_nra) readaheads when the readahead 4514 * window is closed. 4515 * c) Do readaheads between 1 and (nfs3_nra - 1) depending 4516 * upon how far the readahead window is open or closed. 4517 * d) No readaheads if rp->r_nextr is not within the scope 4518 * of the readahead window (random i/o).
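 *
 * For example (with the default nfs3_nra of 4): if rp->r_nextr is
 * two blocks ahead of blkoff, then ra_window is 2 and
 * nfs3_nra - 2 == 2 readaheads are issued; if rp->r_nextr is more
 * than (nfs3_nra - 1) blocks ahead, or anywhere behind blkoff, the
 * i/o looks random and no readahead is issued.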
4519 */ 4520 4521 if (off == 0) 4522 readahead = 0; 4523 else if (blkoff == rp->r_nextr) 4524 readahead = nfs3_nra; 4525 else if (rp->r_nextr > blkoff && 4526 ((ra_window = (rp->r_nextr - blkoff) / bsize) 4527 <= (nfs3_nra - 1))) 4528 readahead = nfs3_nra - ra_window; 4529 else 4530 readahead = 0; 4531 4532 rablkoff = rp->r_nextr; 4533 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 4534 mutex_exit(&rp->r_statelock); 4535 if (nfs_async_readahead(vp, rablkoff + bsize, 4536 addr + (rablkoff + bsize - off), seg, cr, 4537 nfs3_readahead) < 0) { 4538 mutex_enter(&rp->r_statelock); 4539 break; 4540 } 4541 readahead--; 4542 rablkoff += bsize; 4543 /* 4544 * Indicate that we did a readahead so 4545 * readahead offset is not updated 4546 * by the synchronous read below. 4547 */ 4548 readahead_issued = 1; 4549 mutex_enter(&rp->r_statelock); 4550 /* 4551 * set readahead offset to 4552 * offset of last async readahead 4553 * request. 4554 */ 4555 rp->r_nextr = rablkoff; 4556 } 4557 mutex_exit(&rp->r_statelock); 4558 } 4559 4560 again: 4561 if ((pagefound = page_exists(vp, off)) == NULL) { 4562 if (pl == NULL) { 4563 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 4564 nfs3_readahead); 4565 } else if (rw == S_CREATE) { 4566 /* 4567 * Block for this page is not allocated, or the offset 4568 * is beyond the current allocation size, or we're 4569 * allocating a swap slot and the page was not found, 4570 * so allocate it and return a zero page. 4571 */ 4572 if ((pp = page_create_va(vp, off, 4573 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 4574 cmn_err(CE_PANIC, "nfs3_getapage: page_create"); 4575 io_len = PAGESIZE; 4576 mutex_enter(&rp->r_statelock); 4577 rp->r_nextr = off + PAGESIZE; 4578 mutex_exit(&rp->r_statelock); 4579 } else { 4580 /* 4581 * Need to go to server to get a BLOCK, exception to 4582 * that being while reading at offset = 0 or doing 4583 * random i/o, in that case read only a PAGE. 4584 */ 4585 mutex_enter(&rp->r_statelock); 4586 if (blkoff < rp->r_size && 4587 blkoff + bsize >= rp->r_size) { 4588 /* 4589 * If only a block or less is left in 4590 * the file, read all that is remaining. 4591 */ 4592 if (rp->r_size <= off) { 4593 /* 4594 * Trying to access beyond EOF, 4595 * set up to get at least one page. 4596 */ 4597 blksize = off + PAGESIZE - blkoff; 4598 } else 4599 blksize = rp->r_size - blkoff; 4600 } else if ((off == 0) || 4601 (off != rp->r_nextr && !readahead_issued)) { 4602 blksize = PAGESIZE; 4603 blkoff = off; /* block = page here */ 4604 } else 4605 blksize = bsize; 4606 mutex_exit(&rp->r_statelock); 4607 4608 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4609 &io_len, blkoff, blksize, 0); 4610 4611 /* 4612 * Some other thread has entered the page, 4613 * so just use it. 4614 */ 4615 if (pp == NULL) 4616 goto again; 4617 4618 /* 4619 * Now round the request size up to page boundaries. 4620 * This ensures that the entire page will be 4621 * initialized to zeroes if EOF is encountered. 4622 */ 4623 io_len = ptob(btopr(io_len)); 4624 4625 bp = pageio_setup(pp, io_len, vp, B_READ); 4626 ASSERT(bp != NULL); 4627 4628 /* 4629 * pageio_setup should have set b_addr to 0. This 4630 * is correct since we want to do I/O on a page 4631 * boundary. bp_mapin will use this addr to calculate 4632 * an offset, and then set b_addr to the kernel virtual 4633 * address it allocated for us. 
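 * (Because b_addr starts at 0, the kernel address that bp_mapin()
 * establishes is page aligned, which is what lets the code below use
 * b_un.b_addr directly for the bzero() or the read.)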
4634 */ 4635 ASSERT(bp->b_un.b_addr == 0); 4636 4637 bp->b_edev = 0; 4638 bp->b_dev = 0; 4639 bp->b_lblkno = lbtodb(io_off); 4640 bp->b_file = vp; 4641 bp->b_offset = (offset_t)off; 4642 bp_mapin(bp); 4643 4644 /* 4645 * If doing a write beyond what we believe is EOF, 4646 * don't bother trying to read the pages from the 4647 * server, we'll just zero the pages here. We 4648 * don't check that the rw flag is S_WRITE here 4649 * because some implementations may attempt a 4650 * read access to the buffer before copying data. 4651 */ 4652 mutex_enter(&rp->r_statelock); 4653 if (io_off >= rp->r_size && seg == segkmap) { 4654 mutex_exit(&rp->r_statelock); 4655 bzero(bp->b_un.b_addr, io_len); 4656 } else { 4657 mutex_exit(&rp->r_statelock); 4658 error = nfs3_bio(bp, NULL, cr); 4659 } 4660 4661 /* 4662 * Unmap the buffer before freeing it. 4663 */ 4664 bp_mapout(bp); 4665 pageio_done(bp); 4666 4667 savepp = pp; 4668 do { 4669 pp->p_fsdata = C_NOCOMMIT; 4670 } while ((pp = pp->p_next) != savepp); 4671 4672 if (error == NFS_EOF) { 4673 /* 4674 * If doing a write system call just return 4675 * zeroed pages, else user tried to get pages 4676 * beyond EOF, return error. We don't check 4677 * that the rw flag is S_WRITE here because 4678 * some implementations may attempt a read 4679 * access to the buffer before copying data. 4680 */ 4681 if (seg == segkmap) 4682 error = 0; 4683 else 4684 error = EFAULT; 4685 } 4686 4687 if (!readahead_issued && !error) { 4688 mutex_enter(&rp->r_statelock); 4689 rp->r_nextr = io_off + io_len; 4690 mutex_exit(&rp->r_statelock); 4691 } 4692 } 4693 } 4694 4695 out: 4696 if (pl == NULL) 4697 return (error); 4698 4699 if (error) { 4700 if (pp != NULL) 4701 pvn_read_done(pp, B_ERROR); 4702 return (error); 4703 } 4704 4705 if (pagefound) { 4706 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 4707 4708 /* 4709 * Page exists in the cache, acquire the appropriate lock. 4710 * If this fails, start all over again. 4711 */ 4712 if ((pp = page_lookup(vp, off, se)) == NULL) { 4713 #ifdef DEBUG 4714 nfs3_lostpage++; 4715 #endif 4716 goto reread; 4717 } 4718 pl[0] = pp; 4719 pl[1] = NULL; 4720 return (0); 4721 } 4722 4723 if (pp != NULL) 4724 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4725 4726 return (error); 4727 } 4728 4729 static void 4730 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 4731 cred_t *cr) 4732 { 4733 int error; 4734 page_t *pp; 4735 u_offset_t io_off; 4736 size_t io_len; 4737 struct buf *bp; 4738 uint_t bsize, blksize; 4739 rnode_t *rp = VTOR(vp); 4740 page_t *savepp; 4741 4742 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4743 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4744 4745 mutex_enter(&rp->r_statelock); 4746 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 4747 /* 4748 * If less than a block left in file read less 4749 * than a block. 4750 */ 4751 blksize = rp->r_size - blkoff; 4752 } else 4753 blksize = bsize; 4754 mutex_exit(&rp->r_statelock); 4755 4756 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 4757 &io_off, &io_len, blkoff, blksize, 1); 4758 /* 4759 * The isra flag passed to the kluster function is 1, we may have 4760 * gotten a return value of NULL for a variety of reasons (# of free 4761 * pages < minfree, someone entered the page on the vnode etc). In all 4762 * cases, we want to punt on the readahead. 4763 */ 4764 if (pp == NULL) 4765 return; 4766 4767 /* 4768 * Now round the request size up to page boundaries. 
4769 * This ensures that the entire page will be 4770 * initialized to zeroes if EOF is encountered. 4771 */ 4772 io_len = ptob(btopr(io_len)); 4773 4774 bp = pageio_setup(pp, io_len, vp, B_READ); 4775 ASSERT(bp != NULL); 4776 4777 /* 4778 * pageio_setup should have set b_addr to 0. This is correct since 4779 * we want to do I/O on a page boundary. bp_mapin() will use this addr 4780 * to calculate an offset, and then set b_addr to the kernel virtual 4781 * address it allocated for us. 4782 */ 4783 ASSERT(bp->b_un.b_addr == 0); 4784 4785 bp->b_edev = 0; 4786 bp->b_dev = 0; 4787 bp->b_lblkno = lbtodb(io_off); 4788 bp->b_file = vp; 4789 bp->b_offset = (offset_t)blkoff; 4790 bp_mapin(bp); 4791 4792 /* 4793 * If doing a write beyond what we believe is EOF, don't bother trying 4794 * to read the pages from the server, we'll just zero the pages here. 4795 * We don't check that the rw flag is S_WRITE here because some 4796 * implementations may attempt a read access to the buffer before 4797 * copying data. 4798 */ 4799 mutex_enter(&rp->r_statelock); 4800 if (io_off >= rp->r_size && seg == segkmap) { 4801 mutex_exit(&rp->r_statelock); 4802 bzero(bp->b_un.b_addr, io_len); 4803 error = 0; 4804 } else { 4805 mutex_exit(&rp->r_statelock); 4806 error = nfs3_bio(bp, NULL, cr); 4807 if (error == NFS_EOF) 4808 error = 0; 4809 } 4810 4811 /* 4812 * Unmap the buffer before freeing it. 4813 */ 4814 bp_mapout(bp); 4815 pageio_done(bp); 4816 4817 savepp = pp; 4818 do { 4819 pp->p_fsdata = C_NOCOMMIT; 4820 } while ((pp = pp->p_next) != savepp); 4821 4822 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 4823 4824 /* 4825 * In case of error set readahead offset 4826 * to the lowest offset. 4827 * pvn_read_done() calls VN_DISPOSE to destroy the pages 4828 */ 4829 if (error && rp->r_nextr > io_off) { 4830 mutex_enter(&rp->r_statelock); 4831 if (rp->r_nextr > io_off) 4832 rp->r_nextr = io_off; 4833 mutex_exit(&rp->r_statelock); 4834 } 4835 } 4836 4837 /* 4838 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 4839 * If len == 0, do from off to EOF. 4840 * 4841 * The normal cases should be len == 0 && off == 0 (entire vp list), 4842 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4843 * (from pageout). 4844 */ 4845 static int 4846 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 4847 { 4848 int error; 4849 rnode_t *rp; 4850 4851 ASSERT(cr != NULL); 4852 4853 /* 4854 * XXX - Why should this check be made here? 4855 */ 4856 if (vp->v_flag & VNOMAP) 4857 return (ENOSYS); 4858 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 4859 return (0); 4860 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 4861 return (EIO); 4862 4863 rp = VTOR(vp); 4864 mutex_enter(&rp->r_statelock); 4865 rp->r_count++; 4866 mutex_exit(&rp->r_statelock); 4867 error = nfs_putpages(vp, off, len, flags, cr); 4868 mutex_enter(&rp->r_statelock); 4869 rp->r_count--; 4870 cv_broadcast(&rp->r_cv); 4871 mutex_exit(&rp->r_statelock); 4872 4873 return (error); 4874 } 4875 4876 /* 4877 * Write out a single page, possibly klustering adjacent dirty pages. 
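 * ("Klustering" here means that pvn_write_kluster() gathers dirty
 * pages adjacent to the one we were handed into a single i/o
 * request, up to one block's worth, so that fewer and larger writes
 * go to the server.)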
4878 */ 4879 int 4880 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 4881 int flags, cred_t *cr) 4882 { 4883 u_offset_t io_off; 4884 u_offset_t lbn_off; 4885 u_offset_t lbn; 4886 size_t io_len; 4887 uint_t bsize; 4888 int error; 4889 rnode_t *rp; 4890 4891 ASSERT(!vn_is_readonly(vp)); 4892 ASSERT(pp != NULL); 4893 ASSERT(cr != NULL); 4894 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 4895 4896 rp = VTOR(vp); 4897 ASSERT(rp->r_count > 0); 4898 4899 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4900 lbn = pp->p_offset / bsize; 4901 lbn_off = lbn * bsize; 4902 4903 /* 4904 * Find a kluster that fits in one block, or in 4905 * one page if pages are bigger than blocks. If 4906 * there is less file space allocated than a whole 4907 * page, we'll shorten the i/o request below. 4908 */ 4909 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4910 roundup(bsize, PAGESIZE), flags); 4911 4912 /* 4913 * pvn_write_kluster shouldn't have returned a page with offset 4914 * behind the original page we were given. Verify that. 4915 */ 4916 ASSERT((pp->p_offset / bsize) >= lbn); 4917 4918 /* 4919 * Now pp will have the list of kept dirty pages marked for 4920 * write back. It will also handle invalidation and freeing 4921 * of pages that are not dirty. Check for page length rounding 4922 * problems. 4923 */ 4924 if (io_off + io_len > lbn_off + bsize) { 4925 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4926 io_len = lbn_off + bsize - io_off; 4927 } 4928 /* 4929 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4930 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4931 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4932 * progress and the r_size has not been made consistent with the 4933 * new size of the file. When the uiomove() completes the r_size is 4934 * updated and the RMODINPROGRESS flag is cleared. 4935 * 4936 * Without this handshaking, it is possible 4937 * that nfs(3)_bio() picks up the old value of r_size 4938 * before the uiomove() in writerp() completes. This will 4939 * result in the write through nfs(3)_bio() 4940 * being dropped. 4941 * 4942 * More precisely, there is a window between the time the uiomove() 4943 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4944 * operation intervenes in this window, the page will be picked up, 4945 * because it is dirty (it will be unlocked, unless it was 4946 * pagecreate'd). When the page is picked up as dirty, the dirty 4947 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4948 * checked. This will still be the old size. Therefore the page will 4949 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4950 * the page will be found to be clean and the write will be dropped. 4951 */ 4952 if (rp->r_flags & RMODINPROGRESS) { 4953 mutex_enter(&rp->r_statelock); 4954 if ((rp->r_flags & RMODINPROGRESS) && 4955 rp->r_modaddr + MAXBSIZE > io_off && 4956 rp->r_modaddr < io_off + io_len) { 4957 page_t *plist; 4958 /* 4959 * A write is in progress for this region of the file. 4960 * If we did not detect RMODINPROGRESS here then this 4961 * path through nfs_putapage() would eventually go to 4962 * nfs(3)_bio() and may not write out all of the data 4963 * in the pages. We end up losing data. So we decide 4964 * to set the modified bit on each page in the page 4965 * list and mark the rnode with RDIRTY.
This write 4966 * will be restarted at some later time. 4967 */ 4968 plist = pp; 4969 while (plist != NULL) { 4970 pp = plist; 4971 page_sub(&plist, pp); 4972 hat_setmod(pp); 4973 page_io_unlock(pp); 4974 page_unlock(pp); 4975 } 4976 rp->r_flags |= RDIRTY; 4977 mutex_exit(&rp->r_statelock); 4978 if (offp) 4979 *offp = io_off; 4980 if (lenp) 4981 *lenp = io_len; 4982 return (0); 4983 } 4984 mutex_exit(&rp->r_statelock); 4985 } 4986 4987 if (flags & B_ASYNC) { 4988 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr, 4989 nfs3_sync_putapage); 4990 } else 4991 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr); 4992 4993 if (offp) 4994 *offp = io_off; 4995 if (lenp) 4996 *lenp = io_len; 4997 return (error); 4998 } 4999 5000 static int 5001 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5002 int flags, cred_t *cr) 5003 { 5004 int error; 5005 rnode_t *rp; 5006 5007 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5008 5009 flags |= B_WRITE; 5010 5011 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5012 5013 rp = VTOR(vp); 5014 5015 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 5016 error == EACCES) && 5017 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 5018 if (!(rp->r_flags & ROUTOFSPACE)) { 5019 mutex_enter(&rp->r_statelock); 5020 rp->r_flags |= ROUTOFSPACE; 5021 mutex_exit(&rp->r_statelock); 5022 } 5023 flags |= B_ERROR; 5024 pvn_write_done(pp, flags); 5025 /* 5026 * If this was not an async thread, then try again to 5027 * write out the pages, but this time, also destroy 5028 * them whether or not the write is successful. This 5029 * will prevent memory from filling up with these 5030 * pages and destroying them is the only alternative 5031 * if they can't be written out. 5032 * 5033 * Don't do this if this is an async thread because 5034 * when the pages are unlocked in pvn_write_done, 5035 * some other thread could have come along, locked 5036 * them, and queued for an async thread. It would be 5037 * possible for all of the async threads to be tied 5038 * up waiting to lock the pages again and they would 5039 * all already be locked and waiting for an async 5040 * thread to handle them. Deadlock. 5041 */ 5042 if (!(flags & B_ASYNC)) { 5043 error = nfs3_putpage(vp, io_off, io_len, 5044 B_INVAL | B_FORCE, cr); 5045 } 5046 } else { 5047 if (error) 5048 flags |= B_ERROR; 5049 else if (rp->r_flags & ROUTOFSPACE) { 5050 mutex_enter(&rp->r_statelock); 5051 rp->r_flags &= ~ROUTOFSPACE; 5052 mutex_exit(&rp->r_statelock); 5053 } 5054 pvn_write_done(pp, flags); 5055 if (freemem < desfree) 5056 (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr); 5057 } 5058 5059 return (error); 5060 } 5061 5062 static int 5063 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 5064 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 5065 { 5066 struct segvn_crargs vn_a; 5067 int error; 5068 rnode_t *rp; 5069 struct vattr va; 5070 5071 if (nfs_zone() != VTOMI(vp)->mi_zone) 5072 return (EIO); 5073 5074 if (vp->v_flag & VNOMAP) 5075 return (ENOSYS); 5076 5077 if (off < 0 || off + len < 0) 5078 return (ENXIO); 5079 5080 if (vp->v_type != VREG) 5081 return (ENODEV); 5082 5083 /* 5084 * If there is cached data and if close-to-open consistency 5085 * checking is not turned off and if the file system is not 5086 * mounted readonly, then force an over the wire getattr. 5087 * Otherwise, just invoke nfs3getattr to get a copy of the 5088 * attributes. 
The attribute cache will be used unless it 5089 * is timed out and if it is, then an over the wire getattr 5090 * will be issued. 5091 */ 5092 va.va_mask = AT_ALL; 5093 if (vn_has_cached_data(vp) && 5094 !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp)) 5095 error = nfs3_getattr_otw(vp, &va, cr); 5096 else 5097 error = nfs3getattr(vp, &va, cr); 5098 if (error) 5099 return (error); 5100 5101 /* 5102 * Check to see if the vnode is currently marked as not cachable. 5103 * This means portions of the file are locked (through VOP_FRLOCK). 5104 * In this case the map request must be refused. We use 5105 * rp->r_lkserlock to avoid a race with concurrent lock requests. 5106 */ 5107 rp = VTOR(vp); 5108 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) 5109 return (EINTR); 5110 5111 if (vp->v_flag & VNOCACHE) { 5112 error = EAGAIN; 5113 goto done; 5114 } 5115 5116 /* 5117 * Don't allow concurrent locks and mapping if mandatory locking is 5118 * enabled. 5119 */ 5120 if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) && 5121 MANDLOCK(vp, va.va_mode)) { 5122 error = EAGAIN; 5123 goto done; 5124 } 5125 5126 as_rangelock(as); 5127 if (!(flags & MAP_FIXED)) { 5128 map_addr(addrp, len, off, 1, flags); 5129 if (*addrp == NULL) { 5130 as_rangeunlock(as); 5131 error = ENOMEM; 5132 goto done; 5133 } 5134 } else { 5135 /* 5136 * User specified address - blow away any previous mappings 5137 */ 5138 (void) as_unmap(as, *addrp, len); 5139 } 5140 5141 vn_a.vp = vp; 5142 vn_a.offset = off; 5143 vn_a.type = (flags & MAP_TYPE); 5144 vn_a.prot = (uchar_t)prot; 5145 vn_a.maxprot = (uchar_t)maxprot; 5146 vn_a.flags = (flags & ~MAP_TYPE); 5147 vn_a.cred = cr; 5148 vn_a.amp = NULL; 5149 vn_a.szc = 0; 5150 vn_a.lgrp_mem_policy_flags = 0; 5151 5152 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5153 as_rangeunlock(as); 5154 5155 done: 5156 nfs_rw_exit(&rp->r_lkserlock); 5157 return (error); 5158 } 5159 5160 /* ARGSUSED */ 5161 static int 5162 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5163 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 5164 { 5165 rnode_t *rp; 5166 5167 if (vp->v_flag & VNOMAP) 5168 return (ENOSYS); 5169 if (nfs_zone() != VTOMI(vp)->mi_zone) 5170 return (EIO); 5171 5172 /* 5173 * Need to hold rwlock while incrementing the mapcnt so that 5174 * mmap'ing can be serialized with writes so that the caching 5175 * can be handled correctly. 5176 */ 5177 rp = VTOR(vp); 5178 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 5179 return (EINTR); 5180 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 5181 nfs_rw_exit(&rp->r_rwlock); 5182 5183 return (0); 5184 } 5185 5186 static int 5187 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 5188 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 5189 { 5190 netobj lm_fh3; 5191 int rc; 5192 u_offset_t start, end; 5193 rnode_t *rp; 5194 int error = 0, intr = INTR(vp); 5195 5196 if (nfs_zone() != VTOMI(vp)->mi_zone) 5197 return (EIO); 5198 /* check for valid cmd parameter */ 5199 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 5200 return (EINVAL); 5201 5202 /* Verify l_type. 
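 * A read lock requires the file to be open for reading and a write
 * lock requires it to be open for writing, except for F_GETLK
 * queries. Unlock requests are made non-interruptible (intr = 0),
 * presumably so that a signal cannot leave an orphaned lock behind
 * on the server.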
*/ 5203 switch (bfp->l_type) { 5204 case F_RDLCK: 5205 if (cmd != F_GETLK && !(flag & FREAD)) 5206 return (EBADF); 5207 break; 5208 case F_WRLCK: 5209 if (cmd != F_GETLK && !(flag & FWRITE)) 5210 return (EBADF); 5211 break; 5212 case F_UNLCK: 5213 intr = 0; 5214 break; 5215 5216 default: 5217 return (EINVAL); 5218 } 5219 5220 /* check the validity of the lock range */ 5221 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 5222 return (rc); 5223 if (rc = flk_check_lock_data(start, end, MAXEND)) 5224 return (rc); 5225 5226 /* 5227 * If the filesystem is mounted using local locking, pass the 5228 * request off to the local locking code. 5229 */ 5230 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 5231 if (cmd == F_SETLK || cmd == F_SETLKW) { 5232 /* 5233 * For complete safety, we should be holding 5234 * r_lkserlock. However, we can't call 5235 * lm_safelock and then fs_frlock while 5236 * holding r_lkserlock, so just invoke 5237 * lm_safelock and expect that this will 5238 * catch enough of the cases. 5239 */ 5240 if (!lm_safelock(vp, bfp, cr)) 5241 return (EAGAIN); 5242 } 5243 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 5244 } 5245 5246 rp = VTOR(vp); 5247 5248 /* 5249 * Check whether the given lock request can proceed, given the 5250 * current file mappings. 5251 */ 5252 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 5253 return (EINTR); 5254 if (cmd == F_SETLK || cmd == F_SETLKW) { 5255 if (!lm_safelock(vp, bfp, cr)) { 5256 rc = EAGAIN; 5257 goto done; 5258 } 5259 } 5260 5261 /* 5262 * Flush the cache after waiting for async I/O to finish. For new 5263 * locks, this is so that the process gets the latest bits from the 5264 * server. For unlocks, this is so that other clients see the 5265 * latest bits once the file has been unlocked. If currently dirty 5266 * pages can't be flushed, then don't allow a lock to be set. But 5267 * allow unlocks to succeed, to avoid having orphan locks on the 5268 * server. 5269 */ 5270 if (cmd != F_GETLK) { 5271 mutex_enter(&rp->r_statelock); 5272 while (rp->r_count > 0) { 5273 if (intr) { 5274 klwp_t *lwp = ttolwp(curthread); 5275 5276 if (lwp != NULL) 5277 lwp->lwp_nostop++; 5278 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 5279 if (lwp != NULL) 5280 lwp->lwp_nostop--; 5281 rc = EINTR; 5282 break; 5283 } 5284 if (lwp != NULL) 5285 lwp->lwp_nostop--; 5286 } else 5287 cv_wait(&rp->r_cv, &rp->r_statelock); 5288 } 5289 mutex_exit(&rp->r_statelock); 5290 if (rc != 0) 5291 goto done; 5292 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 5293 if (error) { 5294 if (error == ENOSPC || error == EDQUOT) { 5295 mutex_enter(&rp->r_statelock); 5296 if (!rp->r_error) 5297 rp->r_error = error; 5298 mutex_exit(&rp->r_statelock); 5299 } 5300 if (bfp->l_type != F_UNLCK) { 5301 rc = ENOLCK; 5302 goto done; 5303 } 5304 } 5305 } 5306 5307 lm_fh3.n_len = VTOFH3(vp)->fh3_length; 5308 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data); 5309 5310 /* 5311 * Call the lock manager to do the real work of contacting 5312 * the server and obtaining the lock. 5313 */ 5314 rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp); 5315 5316 if (rc == 0) 5317 nfs_lockcompletion(vp, cmd); 5318 5319 done: 5320 nfs_rw_exit(&rp->r_lkserlock); 5321 return (rc); 5322 } 5323 5324 /* 5325 * Free storage space associated with the specified vnode. The portion 5326 * to be freed is specified by bfp->l_start and bfp->l_len (already 5327 * normalized to a "whence" of 0). 
5328 * 5329 * This is an experimental facility whose continued existence is not 5330 * guaranteed. Currently, we only support the special case 5331 * of l_len == 0, meaning free to end of file. 5332 */ 5333 /* ARGSUSED */ 5334 static int 5335 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 5336 offset_t offset, cred_t *cr, caller_context_t *ct) 5337 { 5338 int error; 5339 5340 ASSERT(vp->v_type == VREG); 5341 if (cmd != F_FREESP) 5342 return (EINVAL); 5343 if (nfs_zone() != VTOMI(vp)->mi_zone) 5344 return (EIO); 5345 5346 error = convoff(vp, bfp, 0, offset); 5347 if (!error) { 5348 ASSERT(bfp->l_start >= 0); 5349 if (bfp->l_len == 0) { 5350 struct vattr va; 5351 5352 /* 5353 * ftruncate should not change the ctime and 5354 * mtime if we truncate the file to its 5355 * previous size. 5356 */ 5357 va.va_mask = AT_SIZE; 5358 error = nfs3getattr(vp, &va, cr); 5359 if (error || va.va_size == bfp->l_start) 5360 return (error); 5361 va.va_mask = AT_SIZE; 5362 va.va_size = bfp->l_start; 5363 error = nfs3setattr(vp, &va, 0, cr); 5364 } else 5365 error = EINVAL; 5366 } 5367 5368 return (error); 5369 } 5370 5371 /* ARGSUSED */ 5372 static int 5373 nfs3_realvp(vnode_t *vp, vnode_t **vpp) 5374 { 5375 5376 return (EINVAL); 5377 } 5378 5379 /* 5380 * Setup and add an address space callback to do the work of the delmap call. 5381 * The callback will (and must be) deleted in the actual callback function. 5382 * 5383 * This is done in order to take care of the problem that we have with holding 5384 * the address space's a_lock for a long period of time (e.g. if the NFS server 5385 * is down). Callbacks will be executed in the address space code while the 5386 * a_lock is not held. Holding the address space's a_lock causes things such 5387 * as ps and fork to hang because they are trying to acquire this lock as well. 5388 */ 5389 /* ARGSUSED */ 5390 static int 5391 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5392 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 5393 { 5394 int caller_found; 5395 int error; 5396 rnode_t *rp; 5397 nfs_delmap_args_t *dmapp; 5398 nfs_delmapcall_t *delmap_call; 5399 5400 if (vp->v_flag & VNOMAP) 5401 return (ENOSYS); 5402 /* 5403 * A process may not change zones if it has NFS pages mmap'ed 5404 * in, so we can't legitimately get here from the wrong zone. 5405 */ 5406 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5407 5408 rp = VTOR(vp); 5409 5410 /* 5411 * The way that the address space of this process deletes its mapping 5412 * of this file is via the following call chains: 5413 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap() 5414 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap() 5415 * 5416 * With the use of address space callbacks we are allowed to drop the 5417 * address space lock, a_lock, while executing the NFS operations that 5418 * need to go over the wire. Returning EAGAIN to the caller of this 5419 * function is what drives the execution of the callback that we add 5420 * below. The callback will be executed by the address space code 5421 * after dropping the a_lock. When the callback is finished, since 5422 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 5423 * is called again on the same segment to finish the rest of the work 5424 * that needs to happen during unmapping. 
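 *
 * In rough outline (a sketch of the flow described above):
 * 1) segvn_unmap() -> nfs3_delmap(): registers the callback and
 * returns EAGAIN.
 * 2) The address space code drops a_lock and executes
 * nfs3_delmap_callback(), which performs the over-the-wire work.
 * 3) segvn_unmap() -> nfs3_delmap(): finds the completed caller in
 * the delmap caller list and returns the error saved by the
 * callback.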
5425 * 5426 * This action of calling back into the segment driver causes 5427 * nfs3_delmap() to get called again, but since the callback was 5428 * already executed at this point, it already did the work and there 5429 * is nothing left for us to do. 5430 * 5431 * To Summarize: 5432 * - The first time nfs3_delmap is called by the current thread is when 5433 * we add the caller associated with this delmap to the delmap caller 5434 * list, add the callback, and return EAGAIN. 5435 * - The second time in this call chain when nfs3_delmap is called we 5436 * will find this caller in the delmap caller list and realize there 5437 * is no more work to do thus removing this caller from the list and 5438 * returning the error that was set in the callback execution. 5439 */ 5440 caller_found = nfs_find_and_delete_delmapcall(rp, &error); 5441 if (caller_found) { 5442 /* 5443 * 'error' is from the actual delmap operations. To avoid 5444 * hangs, we need to handle the return of EAGAIN differently 5445 * since this is what drives the callback execution. 5446 * In this case, we don't want to return EAGAIN and do the 5447 * callback execution because there are none to execute. 5448 */ 5449 if (error == EAGAIN) 5450 return (0); 5451 else 5452 return (error); 5453 } 5454 5455 /* current caller was not in the list */ 5456 delmap_call = nfs_init_delmapcall(); 5457 5458 mutex_enter(&rp->r_statelock); 5459 list_insert_tail(&rp->r_indelmap, delmap_call); 5460 mutex_exit(&rp->r_statelock); 5461 5462 dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP); 5463 5464 dmapp->vp = vp; 5465 dmapp->off = off; 5466 dmapp->addr = addr; 5467 dmapp->len = len; 5468 dmapp->prot = prot; 5469 dmapp->maxprot = maxprot; 5470 dmapp->flags = flags; 5471 dmapp->cr = cr; 5472 dmapp->caller = delmap_call; 5473 5474 error = as_add_callback(as, nfs3_delmap_callback, dmapp, 5475 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 5476 5477 return (error ? error : EAGAIN); 5478 } 5479 5480 /* 5481 * Remove some pages from an mmap'd vnode. Just update the 5482 * count of pages. If doing close-to-open, then flush and 5483 * commit all of the pages associated with this file. 5484 * Otherwise, start an asynchronous page flush to write out 5485 * any dirty pages. This will also associate a credential 5486 * with the rnode which can be used to write the pages. 5487 */ 5488 /* ARGSUSED */ 5489 static void 5490 nfs3_delmap_callback(struct as *as, void *arg, uint_t event) 5491 { 5492 int error; 5493 rnode_t *rp; 5494 mntinfo_t *mi; 5495 nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg; 5496 5497 rp = VTOR(dmapp->vp); 5498 mi = VTOMI(dmapp->vp); 5499 5500 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 5501 ASSERT(rp->r_mapcnt >= 0); 5502 5503 /* 5504 * Initiate a page flush and potential commit if there are 5505 * pages, the file system was not mounted readonly, the segment 5506 * was mapped shared, and the pages themselves were writeable. 5507 */ 5508 if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) && 5509 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 5510 mutex_enter(&rp->r_statelock); 5511 rp->r_flags |= RDIRTY; 5512 mutex_exit(&rp->r_statelock); 5513 /* 5514 * If this is a cross-zone access a sync putpage won't work, so 5515 * the best we can do is try an async putpage. That seems 5516 * better than something more draconian such as discarding the 5517 * dirty pages. 
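 * (A synchronous nfs3_putpage() issued from a foreign zone would
 * simply fail with EIO, so the async flush is the only option that
 * can make progress there.)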
5518 */ 5519 if ((mi->mi_flags & MI_NOCTO) || 5520 nfs_zone() != mi->mi_zone) 5521 error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len, 5522 B_ASYNC, dmapp->cr); 5523 else 5524 error = nfs3_putpage_commit(dmapp->vp, dmapp->off, 5525 dmapp->len, dmapp->cr); 5526 if (!error) { 5527 mutex_enter(&rp->r_statelock); 5528 error = rp->r_error; 5529 rp->r_error = 0; 5530 mutex_exit(&rp->r_statelock); 5531 } 5532 } else 5533 error = 0; 5534 5535 if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) 5536 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len, 5537 B_INVAL, dmapp->cr); 5538 5539 dmapp->caller->error = error; 5540 (void) as_delete_callback(as, arg); 5541 kmem_free(dmapp, sizeof (nfs_delmap_args_t)); 5542 } 5543 5544 static int nfs3_pathconf_disable_cache = 0; 5545 5546 #ifdef DEBUG 5547 static int nfs3_pathconf_cache_hits = 0; 5548 static int nfs3_pathconf_cache_misses = 0; 5549 #endif 5550 5551 static int 5552 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 5553 { 5554 int error; 5555 PATHCONF3args args; 5556 PATHCONF3res res; 5557 int douprintf; 5558 failinfo_t fi; 5559 rnode_t *rp; 5560 hrtime_t t; 5561 5562 if (nfs_zone() != VTOMI(vp)->mi_zone) 5563 return (EIO); 5564 /* 5565 * Large file spec - need to base answer on info stored 5566 * on original FSINFO response. 5567 */ 5568 if (cmd == _PC_FILESIZEBITS) { 5569 unsigned long long ll; 5570 long l = 1; 5571 5572 ll = VTOMI(vp)->mi_maxfilesize; 5573 5574 if (ll == 0) { 5575 *valp = 0; 5576 return (0); 5577 } 5578 5579 if (ll & 0xffffffff00000000) { 5580 l += 32; ll >>= 32; 5581 } 5582 if (ll & 0xffff0000) { 5583 l += 16; ll >>= 16; 5584 } 5585 if (ll & 0xff00) { 5586 l += 8; ll >>= 8; 5587 } 5588 if (ll & 0xf0) { 5589 l += 4; ll >>= 4; 5590 } 5591 if (ll & 0xc) { 5592 l += 2; ll >>= 2; 5593 } 5594 if (ll & 0x2) 5595 l += 2; 5596 else if (ll & 0x1) 5597 l += 1; 5598 *valp = l; 5599 return (0); 5600 } 5601 5602 if (cmd == _PC_ACL_ENABLED) { 5603 *valp = _ACL_ACLENT_ENABLED; 5604 return (0); 5605 } 5606 5607 if (cmd == _PC_XATTR_EXISTS) { 5608 error = 0; 5609 *valp = 0; 5610 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5611 vnode_t *avp; 5612 rnode_t *rp; 5613 int error = 0; 5614 mntinfo_t *mi = VTOMI(vp); 5615 5616 if (!(mi->mi_flags & MI_EXTATTR)) 5617 return (0); 5618 5619 rp = VTOR(vp); 5620 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, 5621 INTR(vp))) 5622 return (EINTR); 5623 5624 error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr); 5625 if (error || avp == NULL) 5626 error = acl_getxattrdir3(vp, &avp, 0, cr, 0); 5627 5628 nfs_rw_exit(&rp->r_rwlock); 5629 5630 if (error == 0 && avp != NULL) { 5631 VN_RELE(avp); 5632 *valp = 1; 5633 } else if (error == ENOENT) 5634 error = 0; 5635 } 5636 return (error); 5637 } 5638 5639 rp = VTOR(vp); 5640 if (rp->r_pathconf != NULL) { 5641 mutex_enter(&rp->r_statelock); 5642 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) { 5643 kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf)); 5644 rp->r_pathconf = NULL; 5645 } 5646 if (rp->r_pathconf != NULL) { 5647 error = 0; 5648 switch (cmd) { 5649 case _PC_LINK_MAX: 5650 *valp = rp->r_pathconf->link_max; 5651 break; 5652 case _PC_NAME_MAX: 5653 *valp = rp->r_pathconf->name_max; 5654 break; 5655 case _PC_PATH_MAX: 5656 case _PC_SYMLINK_MAX: 5657 *valp = MAXPATHLEN; 5658 break; 5659 case _PC_CHOWN_RESTRICTED: 5660 *valp = rp->r_pathconf->chown_restricted; 5661 break; 5662 case _PC_NO_TRUNC: 5663 *valp = rp->r_pathconf->no_trunc; 5664 break; 5665 default: 5666 error = EINVAL; 5667 break; 5668 } 5669 
mutex_exit(&rp->r_statelock); 5670 #ifdef DEBUG 5671 nfs3_pathconf_cache_hits++; 5672 #endif 5673 return (error); 5674 } 5675 mutex_exit(&rp->r_statelock); 5676 } 5677 #ifdef DEBUG 5678 nfs3_pathconf_cache_misses++; 5679 #endif 5680 5681 args.object = *VTOFH3(vp); 5682 fi.vp = vp; 5683 fi.fhp = (caddr_t)&args.object; 5684 fi.copyproc = nfs3copyfh; 5685 fi.lookupproc = nfs3lookup; 5686 fi.xattrdirproc = acl_getxattrdir3; 5687 5688 douprintf = 1; 5689 5690 t = gethrtime(); 5691 5692 error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF, 5693 xdr_nfs_fh3, (caddr_t)&args, 5694 xdr_PATHCONF3res, (caddr_t)&res, cr, 5695 &douprintf, &res.status, 0, &fi); 5696 5697 if (error) 5698 return (error); 5699 5700 error = geterrno3(res.status); 5701 5702 if (!error) { 5703 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr); 5704 if (!nfs3_pathconf_disable_cache) { 5705 mutex_enter(&rp->r_statelock); 5706 if (rp->r_pathconf == NULL) { 5707 rp->r_pathconf = kmem_alloc( 5708 sizeof (*rp->r_pathconf), KM_NOSLEEP); 5709 if (rp->r_pathconf != NULL) 5710 *rp->r_pathconf = res.resok.info; 5711 } 5712 mutex_exit(&rp->r_statelock); 5713 } 5714 switch (cmd) { 5715 case _PC_LINK_MAX: 5716 *valp = res.resok.info.link_max; 5717 break; 5718 case _PC_NAME_MAX: 5719 *valp = res.resok.info.name_max; 5720 break; 5721 case _PC_PATH_MAX: 5722 case _PC_SYMLINK_MAX: 5723 *valp = MAXPATHLEN; 5724 break; 5725 case _PC_CHOWN_RESTRICTED: 5726 *valp = res.resok.info.chown_restricted; 5727 break; 5728 case _PC_NO_TRUNC: 5729 *valp = res.resok.info.no_trunc; 5730 break; 5731 default: 5732 return (EINVAL); 5733 } 5734 } else { 5735 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr); 5736 PURGE_STALE_FH(error, vp, cr); 5737 } 5738 5739 return (error); 5740 } 5741 5742 /* 5743 * Called by async thread to do synchronous pageio. Do the i/o, wait 5744 * for it to complete, and cleanup the page list when done. 5745 */ 5746 static int 5747 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5748 int flags, cred_t *cr) 5749 { 5750 int error; 5751 5752 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5753 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5754 if (flags & B_READ) 5755 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 5756 else 5757 pvn_write_done(pp, (error ? B_ERROR : 0) | flags); 5758 return (error); 5759 } 5760 5761 static int 5762 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5763 int flags, cred_t *cr) 5764 { 5765 int error; 5766 rnode_t *rp; 5767 5768 if (pp == NULL) 5769 return (EINVAL); 5770 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 5771 return (EIO); 5772 5773 rp = VTOR(vp); 5774 mutex_enter(&rp->r_statelock); 5775 rp->r_count++; 5776 mutex_exit(&rp->r_statelock); 5777 5778 if (flags & B_ASYNC) { 5779 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr, 5780 nfs3_sync_pageio); 5781 } else 5782 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5783 mutex_enter(&rp->r_statelock); 5784 rp->r_count--; 5785 cv_broadcast(&rp->r_cv); 5786 mutex_exit(&rp->r_statelock); 5787 return (error); 5788 } 5789 5790 static void 5791 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) 5792 { 5793 int error; 5794 rnode_t *rp; 5795 page_t *plist; 5796 page_t *pptr; 5797 offset3 offset; 5798 count3 len; 5799 k_sigset_t smask; 5800 5801 /* 5802 * We should get called with fl equal to either B_FREE or 5803 * B_INVAL. Any other value is illegal. 
5804 * 5805 * The page that we are either supposed to free or destroy 5806 * should be exclusively locked and its io lock should not 5807 * be held. 5808 */ 5809 ASSERT(fl == B_FREE || fl == B_INVAL); 5810 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 5811 rp = VTOR(vp); 5812 5813 /* 5814 * If the page doesn't need to be committed or we shouldn't 5815 * even bother attempting to commit it, then just make sure 5816 * that the p_fsdata byte is clear and then either free or 5817 * destroy the page as appropriate. 5818 */ 5819 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) { 5820 pp->p_fsdata = C_NOCOMMIT; 5821 if (fl == B_FREE) 5822 page_free(pp, dn); 5823 else 5824 page_destroy(pp, dn); 5825 return; 5826 } 5827 5828 /* 5829 * If there is a page invalidation operation going on, then 5830 * if this is one of the pages being destroyed, then just 5831 * clear the p_fsdata byte and then either free or destroy 5832 * the page as appropriate. 5833 */ 5834 mutex_enter(&rp->r_statelock); 5835 if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 5836 mutex_exit(&rp->r_statelock); 5837 pp->p_fsdata = C_NOCOMMIT; 5838 if (fl == B_FREE) 5839 page_free(pp, dn); 5840 else 5841 page_destroy(pp, dn); 5842 return; 5843 } 5844 5845 /* 5846 * If we are freeing this page and someone else is already 5847 * waiting to do a commit, then just unlock the page and 5848 * return. That other thread will take care of committing 5849 * this page. The page can be freed sometime after the 5850 * commit has finished. Otherwise, if the page is marked 5851 * as delay commit, then we may be getting called from 5852 * pvn_write_done, one page at a time. This could result 5853 * in one commit per page, so we end up doing lots of small 5854 * commits instead of fewer larger commits. This is bad; 5855 * we want to do as few commits as possible. 5856 */ 5857 if (fl == B_FREE) { 5858 if (rp->r_flags & RCOMMITWAIT) { 5859 page_unlock(pp); 5860 mutex_exit(&rp->r_statelock); 5861 return; 5862 } 5863 if (pp->p_fsdata == C_DELAYCOMMIT) { 5864 pp->p_fsdata = C_COMMIT; 5865 page_unlock(pp); 5866 mutex_exit(&rp->r_statelock); 5867 return; 5868 } 5869 } 5870 5871 /* 5872 * Check to see if there is a signal which would prevent an 5873 * attempt to commit the pages from being successful. If so, 5874 * then don't bother with all of the work to gather pages and 5875 * generate the unsuccessful RPC. Just return from here and 5876 * let the page be committed at some later time. 5877 */ 5878 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 5879 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 5880 sigunintr(&smask); 5881 page_unlock(pp); 5882 mutex_exit(&rp->r_statelock); 5883 return; 5884 } 5885 sigunintr(&smask); 5886 5887 /* 5888 * We are starting to need to commit pages, so let's try 5889 * to commit as many as possible at once to reduce the 5890 * overhead. 5891 * 5892 * Set the `commit inprogress' state bit. We must 5893 * first wait until any current one finishes. Then 5894 * we initialize the c_pages list with this page.
5895 */ 5896 while (rp->r_flags & RCOMMIT) { 5897 rp->r_flags |= RCOMMITWAIT; 5898 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 5899 rp->r_flags &= ~RCOMMITWAIT; 5900 } 5901 rp->r_flags |= RCOMMIT; 5902 mutex_exit(&rp->r_statelock); 5903 ASSERT(rp->r_commit.c_pages == NULL); 5904 rp->r_commit.c_pages = pp; 5905 rp->r_commit.c_commbase = (offset3)pp->p_offset; 5906 rp->r_commit.c_commlen = PAGESIZE; 5907 5908 /* 5909 * Gather together all other pages which can be committed. 5910 * They will all be chained off r_commit.c_pages. 5911 */ 5912 nfs3_get_commit(vp); 5913 5914 /* 5915 * Clear the `commit inprogress' status and disconnect 5916 * the list of pages to be committed from the rnode. 5917 * At this same time, we also save the starting offset 5918 * and length of data to be committed on the server. 5919 */ 5920 plist = rp->r_commit.c_pages; 5921 rp->r_commit.c_pages = NULL; 5922 offset = rp->r_commit.c_commbase; 5923 len = rp->r_commit.c_commlen; 5924 mutex_enter(&rp->r_statelock); 5925 rp->r_flags &= ~RCOMMIT; 5926 cv_broadcast(&rp->r_commit.c_cv); 5927 mutex_exit(&rp->r_statelock); 5928 5929 if (curproc == proc_pageout || curproc == proc_fsflush || 5930 nfs_zone() != VTOMI(vp)->mi_zone) { 5931 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit); 5932 return; 5933 } 5934 5935 /* 5936 * Actually generate the COMMIT3 over the wire operation. 5937 */ 5938 error = nfs3_commit(vp, offset, len, cr); 5939 5940 /* 5941 * If we got an error during the commit, just unlock all 5942 * of the pages. The pages will get retransmitted to the 5943 * server during a putpage operation. 5944 */ 5945 if (error) { 5946 while (plist != NULL) { 5947 pptr = plist; 5948 page_sub(&plist, pptr); 5949 page_unlock(pptr); 5950 } 5951 return; 5952 } 5953 5954 /* 5955 * We've tried as hard as we can to commit the data to stable 5956 * storage on the server. We release the rest of the pages 5957 * and clear the commit required state. They will be put 5958 * onto the tail of the cachelist if they are no longer 5959 * mapped. 5960 */ 5961 while (plist != pp) { 5962 pptr = plist; 5963 page_sub(&plist, pptr); 5964 pptr->p_fsdata = C_NOCOMMIT; 5965 (void) page_release(pptr, 1); 5966 } 5967 5968 /* 5969 * It is possible that nfs3_commit didn't return an error, but 5970 * some other thread has modified the page we are going 5971 * to free/destroy. 5972 * In this case we need to rewrite the page. Do an explicit check 5973 * before attempting to free/destroy the page. If it is modified, it 5974 * needs to be rewritten, so unlock the page and return. 5975 */ 5976 if (hat_ismod(pp)) { 5977 pp->p_fsdata = C_NOCOMMIT; 5978 page_unlock(pp); 5979 return; 5980 } 5981 5982 /* 5983 * Now, as appropriate, either free or destroy the page 5984 * that we were called with.
5985 */ 5986 pp->p_fsdata = C_NOCOMMIT; 5987 if (fl == B_FREE) 5988 page_free(pp, dn); 5989 else 5990 page_destroy(pp, dn); 5991 } 5992 5993 static int 5994 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr) 5995 { 5996 int error; 5997 rnode_t *rp; 5998 COMMIT3args args; 5999 COMMIT3res res; 6000 int douprintf; 6001 cred_t *cred; 6002 6003 rp = VTOR(vp); 6004 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6005 6006 mutex_enter(&rp->r_statelock); 6007 if (rp->r_cred != NULL) { 6008 cred = rp->r_cred; 6009 crhold(cred); 6010 } else { 6011 rp->r_cred = cr; 6012 crhold(cr); 6013 cred = cr; 6014 crhold(cred); 6015 } 6016 mutex_exit(&rp->r_statelock); 6017 6018 args.file = *VTOFH3(vp); 6019 args.offset = offset; 6020 args.count = count; 6021 6022 doitagain: 6023 douprintf = 1; 6024 error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT, 6025 xdr_COMMIT3args, (caddr_t)&args, 6026 xdr_COMMIT3res, (caddr_t)&res, cred, 6027 &douprintf, &res.status, 0, NULL); 6028 6029 crfree(cred); 6030 6031 if (error) 6032 return (error); 6033 6034 error = geterrno3(res.status); 6035 if (!error) { 6036 ASSERT(rp->r_flags & RHAVEVERF); 6037 mutex_enter(&rp->r_statelock); 6038 if (rp->r_verf == res.resok.verf) { 6039 mutex_exit(&rp->r_statelock); 6040 return (0); 6041 } 6042 nfs3_set_mod(vp); 6043 rp->r_verf = res.resok.verf; 6044 mutex_exit(&rp->r_statelock); 6045 error = NFS_VERF_MISMATCH; 6046 } else { 6047 if (error == EACCES) { 6048 mutex_enter(&rp->r_statelock); 6049 if (cred != cr) { 6050 if (rp->r_cred != NULL) 6051 crfree(rp->r_cred); 6052 rp->r_cred = cr; 6053 crhold(cr); 6054 cred = cr; 6055 crhold(cred); 6056 mutex_exit(&rp->r_statelock); 6057 goto doitagain; 6058 } 6059 mutex_exit(&rp->r_statelock); 6060 } 6061 /* 6062 * Can't do a PURGE_STALE_FH here because this 6063 * can cause a deadlock. nfs3_commit can 6064 * be called from nfs3_dispose which can be called 6065 * indirectly via pvn_vplist_dirty. PURGE_STALE_FH 6066 * can call back to pvn_vplist_dirty. 6067 */ 6068 if (error == ESTALE) { 6069 mutex_enter(&rp->r_statelock); 6070 rp->r_flags |= RSTALE; 6071 if (!rp->r_error) 6072 rp->r_error = error; 6073 mutex_exit(&rp->r_statelock); 6074 PURGE_ATTRCACHE(vp); 6075 } else { 6076 mutex_enter(&rp->r_statelock); 6077 if (!rp->r_error) 6078 rp->r_error = error; 6079 mutex_exit(&rp->r_statelock); 6080 } 6081 } 6082 6083 return (error); 6084 } 6085 6086 static void 6087 nfs3_set_mod(vnode_t *vp) 6088 { 6089 page_t *pp; 6090 kmutex_t *vphm; 6091 6092 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6093 vphm = page_vnode_mutex(vp); 6094 mutex_enter(vphm); 6095 if ((pp = vp->v_pages) != NULL) { 6096 do { 6097 if (pp->p_fsdata != C_NOCOMMIT) { 6098 hat_setmod(pp); 6099 pp->p_fsdata = C_NOCOMMIT; 6100 } 6101 } while ((pp = pp->p_vpnext) != vp->v_pages); 6102 } 6103 mutex_exit(vphm); 6104 } 6105 6106 6107 /* 6108 * This routine is used to gather together a page list of the pages 6109 * which are to be committed on the server. This routine must not 6110 * be called if the calling thread holds any locked pages. 6111 * 6112 * The calling thread must have set RCOMMIT. This bit is used to 6113 * serialize access to the commit structure in the rnode. As long 6114 * as the thread has set RCOMMIT, then it can manipulate the commit 6115 * structure without requiring any other locks. 
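 *
 * Note that c_commbase and c_commlen describe a single bounding
 * interval, not an exact set of pages. For example (assuming 8K
 * pages): adding pages at offsets 16K, then 8K, then 32K grows the
 * range from [16K, 24K) to [8K, 24K) to [8K, 40K), even though the
 * page at 24K may not itself need to be committed.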
6116 */ 6117 static void 6118 nfs3_get_commit(vnode_t *vp) 6119 { 6120 rnode_t *rp; 6121 page_t *pp; 6122 kmutex_t *vphm; 6123 6124 rp = VTOR(vp); 6125 6126 ASSERT(rp->r_flags & RCOMMIT); 6127 6128 vphm = page_vnode_mutex(vp); 6129 mutex_enter(vphm); 6130 6131 /* 6132 * If there are no pages associated with this vnode, then 6133 * just return. 6134 */ 6135 if ((pp = vp->v_pages) == NULL) { 6136 mutex_exit(vphm); 6137 return; 6138 } 6139 6140 /* 6141 * Step through all of the pages associated with this vnode 6142 * looking for pages which need to be committed. 6143 */ 6144 do { 6145 /* 6146 * If this page does not need to be committed or is 6147 * modified, then just skip it. 6148 */ 6149 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 6150 continue; 6151 6152 /* 6153 * Attempt to lock the page. If we can't, then 6154 * someone else is messing with it and we will 6155 * just skip it. 6156 */ 6157 if (!page_trylock(pp, SE_EXCL)) 6158 continue; 6159 6160 /* 6161 * If this page does not need to be committed or is 6162 * modified, then just skip it. Recheck now that 6163 * the page is locked. 6164 */ 6165 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 6166 page_unlock(pp); 6167 continue; 6168 } 6169 6170 if (PP_ISFREE(pp)) { 6171 cmn_err(CE_PANIC, "nfs3_get_commit: %p is free", 6172 (void *)pp); 6173 } 6174 6175 /* 6176 * The page needs to be committed and we locked it. 6177 * Update the base and length parameters and add it 6178 * to r_pages. 6179 */ 6180 if (rp->r_commit.c_pages == NULL) { 6181 rp->r_commit.c_commbase = (offset3)pp->p_offset; 6182 rp->r_commit.c_commlen = PAGESIZE; 6183 } else if (pp->p_offset < rp->r_commit.c_commbase) { 6184 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 6185 (offset3)pp->p_offset + rp->r_commit.c_commlen; 6186 rp->r_commit.c_commbase = (offset3)pp->p_offset; 6187 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 6188 <= pp->p_offset) { 6189 rp->r_commit.c_commlen = (offset3)pp->p_offset - 6190 rp->r_commit.c_commbase + PAGESIZE; 6191 } 6192 page_add(&rp->r_commit.c_pages, pp); 6193 } while ((pp = pp->p_vpnext) != vp->v_pages); 6194 6195 mutex_exit(vphm); 6196 } 6197 6198 /* 6199 * This routine is used to gather together a page list of the pages 6200 * which are to be committed on the server. This routine must not 6201 * be called if the calling thread holds any locked pages. 6202 * 6203 * The calling thread must have set RCOMMIT. This bit is used to 6204 * serialize access to the commit structure in the rnode. As long 6205 * as the thread has set RCOMMIT, then it can manipulate the commit 6206 * structure without requiring any other locks. 6207 */ 6208 static void 6209 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 6210 { 6211 6212 rnode_t *rp; 6213 page_t *pp; 6214 u_offset_t end; 6215 u_offset_t off; 6216 6217 ASSERT(len != 0); 6218 6219 rp = VTOR(vp); 6220 6221 ASSERT(rp->r_flags & RCOMMIT); 6222 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6223 6224 /* 6225 * If there are no pages associated with this vnode, then 6226 * just return. 6227 */ 6228 if ((pp = vp->v_pages) == NULL) 6229 return; 6230 6231 /* 6232 * Calculate the ending offset. 6233 */ 6234 end = soff + len; 6235 6236 for (off = soff; off < end; off += PAGESIZE) { 6237 /* 6238 * Lookup each page by vp, offset. 6239 */ 6240 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 6241 continue; 6242 6243 /* 6244 * If this page does not need to be committed or is 6245 * modified, then just skip it. 
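 * (Committing a page that hat_ismod() reports as modified would be
 * wasted effort; its new contents will have to be written to the
 * server again anyway.)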

/*
 * This routine is used to gather together a page list of the pages
 * which are to be committed on the server, limited to pages in the
 * range [soff, soff + len).  This routine must not be called if the
 * calling thread holds any locked pages.
 *
 * The calling thread must have set RCOMMIT.  This bit is used to
 * serialize access to the commit structure in the rnode.  As long
 * as the thread has set RCOMMIT, then it can manipulate the commit
 * structure without requiring any other locks.
 */
static void
nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{
	rnode_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;

	ASSERT(len != 0);

	rp = VTOR(vp);

	ASSERT(rp->r_flags & RCOMMIT);
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;

	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;

	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;

		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}

#if 0 /* unused */
#ifdef DEBUG
static int
nfs3_no_uncommitted_pages(vnode_t *vp)
{
	page_t *pp;
	kmutex_t *vphm;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if ((pp = vp->v_pages) != NULL) {
		do {
			if (pp->p_fsdata != C_NOCOMMIT) {
				mutex_exit(vphm);
				return (0);
			}
		} while ((pp = pp->p_vpnext) != vp->v_pages);
	}
	mutex_exit(vphm);

	return (1);
}
#endif
#endif /* unused */

static int
nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	writeverf3 write_verf;
	rnode_t *rp = VTOR(vp);

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed.  This may need to
	 * be done twice if the server has changed state since
	 * data was last written.  The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments.  This
	 * will asynchronously write out any modified pages in the
	 * range specified by (poff, plen).  This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs3_putpage.
	 */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_verf;
	mutex_exit(&rp->r_statelock);

	error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr);
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments.  This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs3_putpage(vp, poff, plen, 0, cr);

	if (error)
		return (error);

	mutex_enter(&rp->r_statelock);
	if (rp->r_verf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs3_commit_vp(vp, poff, plen, cr);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
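
/*
 * Commit the given range of the file, or the whole file if plen is
 * zero.  Serializes with any commit already in progress via the
 * RCOMMIT flag, gathers the pages which still need to be committed,
 * and, if any were found, hands the list to nfs3_sync_commit, which
 * unlocks the pages when it is done.  Returns zero, an errno value,
 * or NFS_VERF_MISMATCH if the server's write verifier changed.
 */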
static int
nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
{
	rnode_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	rp = VTOR(vp);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & RCOMMIT) {
		rp->r_flags |= RCOMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~RCOMMITWAIT;
	}
	rp->r_flags |= RCOMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather together all of the pages which need to be
	 * committed.
	 */
	if (plen == 0)
		nfs3_get_commit(vp);
	else
		nfs3_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered together in nfs3_get_commit.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RCOMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist != NULL) {
		/*
		 * No error occurred during the flush portion
		 * of this operation, so now attempt to commit
		 * the data to stable storage on the server.
		 *
		 * This will unlock all of the pages on the list.
		 */
		return (nfs3_sync_commit(vp, plist, offset, len, cr));
	}
	return (0);
}
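
/*
 * Send a COMMIT for the given page list and then unlock every page
 * on the list.  On success the pages are also marked C_NOCOMMIT,
 * since the data they hold is now on stable storage on the server;
 * on failure they are simply unlocked, leaving their commit state
 * alone so that a later pass can retry.
 */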
static int
nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{
	int error;
	page_t *pp;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs3_commit(vp, offset, count, cr);

	/*
	 * If we got an error, then just unlock all of the pages
	 * on the list.
	 */
	if (error) {
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			page_unlock(pp);
		}
		return (error);
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the pages and clear
	 * the commit required state.  They will get freed later.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
	}

	return (error);
}

static void
nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	(void) nfs3_sync_commit(vp, plist, offset, count, cr);
}

static int
nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (mi->mi_flags & MI_ACL) {
		error = acl_setacl3(vp, vsecattr, flag, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (ENOSYS);
}

static int
nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (mi->mi_flags & MI_ACL) {
		error = acl_getacl3(vp, vsecattr, flag, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (fs_fab_acl(vp, vsecattr, flag, cr));
}
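
/*
 * Note on the double MI_ACL test in nfs3_setsecattr/nfs3_getsecattr
 * above: the flag is rechecked after the ACL RPC because the RPC
 * path may clear MI_ACL in the mntinfo when it discovers that the
 * server does not support the NFS_ACL side protocol.  In that case
 * the result of the RPC is discarded and we fall back to ENOSYS
 * (setsecattr) or to an ACL fabricated from the mode bits by
 * fs_fab_acl (getsecattr).
 */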
static int
nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
{
	int error;
	struct shrlock nshr;
	struct nfs_owner nfs_owner;
	netobj lm_fh3;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 */
	if (cmd == F_SHARE &&
	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		lm_fh3.n_len = VTOFH3(vp)->fh3_length;
		lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);

		/*
		 * If passed an owner that is too large to fit in an
		 * nfs_owner, it is likely a recursive call from the
		 * lock manager client, so pass it straight through.
		 * If it is not an nfs_owner, simply return an error.
		 */
		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
			if (((struct nfs_owner *)shr->s_owner)->magic !=
			    NFS_OWNER_MAGIC)
				return (EINVAL);

			if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
				error = set_errno(error);
			}
			return (error);
		}

		/*
		 * The remote share reservation owner is a combination
		 * of a magic number, the hostname, and the local owner.
		 */
		bzero(&nfs_owner, sizeof (nfs_owner));
		nfs_owner.magic = NFS_OWNER_MAGIC;
		(void) strncpy(nfs_owner.hname, uts_nodename(),
		    sizeof (nfs_owner.hname));
		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
		nshr.s_access = shr->s_access;
		nshr.s_deny = shr->s_deny;
		nshr.s_sysid = 0;
		nshr.s_pid = ttoproc(curthread)->p_pid;
		nshr.s_own_len = sizeof (nfs_owner);
		nshr.s_owner = (caddr_t)&nfs_owner;

		if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
			error = set_errno(error);
		}
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
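
/*
 * Sketch of the wrapped owner sent to the lock manager for F_SHARE /
 * F_UNSHARE above (field names as used in the code; the actual
 * declaration lives in the NFS client headers):
 *
 *	nfs_owner.magic   -- NFS_OWNER_MAGIC, marks this as a wrapped
 *			     owner so recursive calls are recognized
 *	nfs_owner.hname   -- this client's node name, so the server's
 *			     lock manager can tell reservations made
 *			     by different client hosts apart
 *	nfs_owner.lowner  -- the caller's opaque local owner, copied
 *			     in verbatim
 */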