/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int	nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how *);
static int	nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *,
			cred_t *);
static int	nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs3_accessx(void *, int, cred_t *);
static int	nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
static int	nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
static int	nfs3_bio(struct buf *, stable_how *, cred_t *);
static int	nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_commit(vnode_t *, offset3, count3, cred_t *);
static void	nfs3_set_mod(vnode_t *);
static void	nfs3_get_commit(vnode_t *);
static void	nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
#if 0 /* unused */
#ifdef DEBUG
static int	nfs3_no_uncommitted_pages(vnode_t *);
#endif
#endif /* unused */
static int	nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs3_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *);
static int	nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_delmap_callback(struct as *, void *, uint_t);

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bits */
#define	ALIGN64(x, ptr, sz)					\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
	if (x) {						\
		x = sizeof (uint64_t) - (x);			\
		sz -= (x);					\
		ptr += (x);					\
	}
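
/*
 * Illustrative use of ALIGN64 (a sketch, not taken from this file): given
 * a possibly misaligned buffer, a caller can round it up to the next
 * 64-bit boundary before storing uint64_t values into it:
 *
 *	uint_t skew;
 *
 *	ALIGN64(skew, ptr, buflen)
 *	ASSERT(((uintptr_t)ptr & (sizeof (uint64_t) - 1)) == 0);
 *
 * After the macro, ptr has advanced by skew bytes (0..7) and buflen has
 * shrunk by the same amount.
 */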

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface
 * structs, and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int	nfs3_open(vnode_t **, int, cred_t *);
static int	nfs3_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs3_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs3_access(vnode_t *, int, int, cred_t *);
static int	nfs3_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs3_fsync(vnode_t *, int, cred_t *);
static void	nfs3_inactive(vnode_t *, cred_t *);
static int	nfs3_lookup(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *);
static int	nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs3_remove(vnode_t *, char *, cred_t *);
static int	nfs3_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs3_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs3_fid(vnode_t *, fid_t *);
static int	nfs3_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs3_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs3_seek(vnode_t *, offset_t, offset_t *);
static int	nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs3_realvp(vnode_t *, vnode_t **);
static int	nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static void	nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *);
static int	nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);

struct vnodeops *nfs3_vnodeops;

const fs_operation_def_t nfs3_vnodeops_template[] = {
	VOPNAME_OPEN, { .vop_open = nfs3_open },
	VOPNAME_CLOSE, { .vop_close = nfs3_close },
	VOPNAME_READ, { .vop_read = nfs3_read },
	VOPNAME_WRITE, { .vop_write = nfs3_write },
	VOPNAME_IOCTL, { .vop_ioctl = nfs3_ioctl },
	VOPNAME_GETATTR, { .vop_getattr = nfs3_getattr },
	VOPNAME_SETATTR, { .vop_setattr = nfs3_setattr },
	VOPNAME_ACCESS, { .vop_access = nfs3_access },
	VOPNAME_LOOKUP, { .vop_lookup = nfs3_lookup },
	VOPNAME_CREATE, { .vop_create = nfs3_create },
	VOPNAME_REMOVE, { .vop_remove = nfs3_remove },
	VOPNAME_LINK, { .vop_link = nfs3_link },
	VOPNAME_RENAME, { .vop_rename = nfs3_rename },
	VOPNAME_MKDIR, { .vop_mkdir = nfs3_mkdir },
	VOPNAME_RMDIR, { .vop_rmdir = nfs3_rmdir },
	VOPNAME_READDIR, { .vop_readdir = nfs3_readdir },
	VOPNAME_SYMLINK, { .vop_symlink = nfs3_symlink },
	VOPNAME_READLINK, { .vop_readlink = nfs3_readlink },
	VOPNAME_FSYNC, { .vop_fsync = nfs3_fsync },
	VOPNAME_INACTIVE, { .vop_inactive = nfs3_inactive },
	VOPNAME_FID, { .vop_fid = nfs3_fid },
	VOPNAME_RWLOCK, { .vop_rwlock = nfs3_rwlock },
	VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs3_rwunlock },
	VOPNAME_SEEK, { .vop_seek = nfs3_seek },
	VOPNAME_FRLOCK, { .vop_frlock = nfs3_frlock },
	VOPNAME_SPACE, { .vop_space = nfs3_space },
	VOPNAME_REALVP, { .vop_realvp = nfs3_realvp },
	VOPNAME_GETPAGE, { .vop_getpage = nfs3_getpage },
	VOPNAME_PUTPAGE, { .vop_putpage = nfs3_putpage },
	VOPNAME_MAP, { .vop_map = nfs3_map },
	VOPNAME_ADDMAP, { .vop_addmap = nfs3_addmap },
	VOPNAME_DELMAP, { .vop_delmap = nfs3_delmap },
	/* no separate nfs3_dump */
	VOPNAME_DUMP, { .vop_dump = nfs_dump },
	VOPNAME_PATHCONF, { .vop_pathconf = nfs3_pathconf },
	VOPNAME_PAGEIO, { .vop_pageio = nfs3_pageio },
	VOPNAME_DISPOSE, { .vop_dispose = nfs3_dispose },
	VOPNAME_SETSECATTR, { .vop_setsecattr = nfs3_setsecattr },
	VOPNAME_GETSECATTR, { .vop_getsecattr = nfs3_getsecattr },
	VOPNAME_SHRLOCK, { .vop_shrlock = nfs3_shrlock },
	NULL, NULL
};

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs3_getvnodeops(void)
{
	return (nfs3_vnodeops);
}
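
/*
 * A sketch (assumed; the registration itself is not in this file) of how
 * the template above is typically turned into the live nfs3_vnodeops
 * during client initialization, using the standard vn_make_ops()
 * interface:
 *
 *	if (vn_make_ops("nfs3", nfs3_vnodeops_template,
 *	    &nfs3_vnodeops) != 0)
 *		... fail module initialization ...
 *
 * Operations not named in the template fall back to the vnode
 * framework's defaults.
 */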

/* ARGSUSED */
static int
nfs3_open(vnode_t **vpp, int flag, cred_t *cr)
{
	int error;
	struct vattr va;
	rnode_t *rp;
	vnode_t *vp;

	vp = *vpp;
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (rp->r_cred == NULL) {
		crhold(cr);
		rp->r_cred = cr;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If there is no cached data or if close-to-open
	 * consistency checking is turned off, we can avoid
	 * the over the wire getattr.  Otherwise, if the
	 * file system is mounted readonly, then just verify
	 * the caches are up to date using the normal mechanism.
	 * Else, if the file is not mmap'd, then just mark
	 * the attributes as timed out.  They will be refreshed
	 * and the caches validated prior to being used.
	 * Else, the file system is mounted writeable so
	 * force an over the wire GETATTR in order to ensure
	 * that all cached data is valid.
	 */
	if (vp->v_count > 1 ||
	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
		if (vn_is_readonly(vp))
			error = nfs3_validate_caches(vp, cr);
		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
			PURGE_ATTRCACHE(vp);
			error = 0;
		} else {
			va.va_mask = AT_ALL;
			error = nfs3_getattr_otw(vp, &va, cr);
		}
	} else
		error = 0;

	return (error);
}

static int
nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	rnode_t *rp;
	int error;
	struct vattr va;

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to another
		 * zone will fail, and one can't lock an NFS file and then
		 * change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs3_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the commit and final nfs3_getattr_otw()
		 * call) should be OK.  If the zone is going away
		 * nfs_async_inactive() will throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled more quickly
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 *	to-open" semantics, then start an asynchronous flush
	 *	of all of the pages in the file.
	 * else the file system was not mounted using the "no-close-
	 *	to-open" semantics, then do a synchronous flush and
	 *	commit of all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if (VTOMI(vp)->mi_flags & MI_NOCTO) {
			error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs3_getattr_otw(vp, &va, cr);

	return (error);
}

/* ARGSUSED */
static int
nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	mntinfo_t *mi;
	READ3args args;
	READ3uiores res;
	int tsize;
	offset_t offset;
	ssize_t count;
	int error;
	int douprintf;
	failinfo_t fi;
	char *sv_hostname;

	mi = VTOMI(vp);
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	sv_hostname = VTOR(vp)->r_server->sv_hostname;

	douprintf = 1;
	args.file = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.file;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.uiop = uiop;

	offset = uiop->uio_loffset;
	count = uiop->uio_resid;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			tsize = MIN(mi->mi_tsize, count);
			args.offset = (offset3)offset;
			args.count = (count3)tsize;
			res.size = (uint_t)tsize;
			error = rfs3call(mi, NFSPROC3_READ,
			    xdr_READ3args, (caddr_t)&args,
			    xdr_READ3uiores, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);

		error = geterrno3(res.status);
		if (error)
			return (error);

		if (res.count != res.size) {
			zcmn_err(getzoneid(), CE_WARN,
			    "nfs3_directio_read: server %s returned "
			    "incorrect amount", sv_hostname);
			return (EIO);
		}
		count -= res.count;
		offset += res.count;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
	} while (count && !res.eof);

	return (0);
}
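
/*
 * A worked example of the block-window arithmetic used by nfs3_read()
 * and nfs3_write() below (illustrative numbers, assuming the usual
 * MAXBSIZE of 8192): for uio_loffset == 12000,
 *
 *	off = 12000 & MAXBMASK   == 8192	(window start)
 *	on  = 12000 & MAXBOFFSET == 3808	(offset within window)
 *	n   = MIN(MAXBSIZE - on, uio_resid)	(at most 4384 here)
 *
 * so each loop iteration maps one MAXBSIZE-aligned window through
 * segmap/vpm and copies at most the remainder of that window.
 */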

/* ARGSUSED */
static int
nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error = 0;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		return (nfs3_directio_read(vp, uiop, cr));
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs3_validate_caches(vp, cr);
		if (error)
			break;

		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}

/* ARGSUSED */
static int
nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rlim64_t limit = uiop->uio_llimit;
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	mntinfo_t *mi;
	uint_t bsize;

	rp = VTOR(vp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		va.va_mask = AT_SIZE;
		error = nfs3getattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset < 0)
		return (EINVAL);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
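	/*
	 * Worked example (illustrative values): with limit == 1000,
	 * uio_loffset == 990 and uio_resid == 50, offset is 1040, so
	 * remainder becomes 40 and the write is trimmed to 10 bytes.
	 * A subsequent write starting at offset 1000 trims to zero,
	 * posts the RLIMIT_FSIZE rctl action, and fails with EFBIG.
	 */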
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_loffset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		u_offset_t org_offset;
		stable_how stab_comm;

nfs3_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			if (ioflag & FDSYNC)
				stab_comm = DATA_SYNC;
			else
				stab_comm = FILE_SYNC;
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_loffset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfs3write(vp, base, org_offset,
				    count, cr, &stab_comm);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}


	bsize = vp->v_vfsp->vfs_bsize;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if ((uiop->uio_loffset % bsize) == 0 ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs3_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr)
{
	struct buf *bp;
	int error;
	page_t *savepp;
	uchar_t fsdata;
	stable_how stab_comm;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	/*
	 * Calculate the desired level of stability to write data
	 * on the server and then mark all of the pages to reflect
	 * this.
	 */
	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
	    freemem > desfree) {
		stab_comm = UNSTABLE;
		fsdata = C_DELAYCOMMIT;
	} else {
		stab_comm = FILE_SYNC;
		fsdata = C_NOCOMMIT;
	}

	savepp = pp;
	do {
		pp->p_fsdata = fsdata;
	} while ((pp = pp->p_next) != savepp);

	error = nfs3_bio(bp, &stab_comm, cr);

	bp_mapout(bp);
	pageio_done(bp);
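	/*
	 * Note on the stable_how choice above (a summary, not new
	 * behavior): asynchronous writes with ample free memory go out
	 * UNSTABLE with their pages marked C_DELAYCOMMIT, deferring
	 * durability to a later COMMIT; otherwise FILE_SYNC is requested
	 * and the pages are marked C_NOCOMMIT since no COMMIT will be
	 * needed.
	 */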
	/*
	 * If the server wrote pages in a more stable fashion than
	 * was requested, then clear all of the marks in the pages
	 * indicating that COMMIT operations were required.
	 */
	if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
		do {
			pp->p_fsdata = C_NOCOMMIT;
		} while ((pp = pp->p_next) != savepp);
	}

	return (error);
}

/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 */
static int
nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
	stable_how *stab_comm)
{
	mntinfo_t *mi;
	WRITE3args args;
	WRITE3res res;
	int error;
	int tsize;
	rnode_t *rp;
	int douprintf;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	args.file = *VTOFH3(vp);
	args.stable = *stab_comm;

	*stab_comm = FILE_SYNC;

	douprintf = 1;

	do {
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & RDIRECTIO) ||
		    (mi->mi_flags & MI_DIRECTIO))
			tsize = MIN(mi->mi_stsize, count);
		else
			tsize = MIN(mi->mi_curwrite, count);
		args.offset = (offset3)offset;
		args.count = (count3)tsize;
		args.data.data_len = (uint_t)tsize;
		args.data.data_val = base;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}
		args.mblk = NULL;
		do {
			error = rfs3call(mi, NFSPROC3_WRITE,
			    xdr_WRITE3args, (caddr_t)&args,
			    xdr_WRITE3res, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, NULL);
		} while (error == ENFS_TRYAGAIN);
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);
		error = geterrno3(res.status);
		if (!error) {
			if (res.resok.count > args.count) {
				zcmn_err(getzoneid(), CE_WARN,
				    "nfs3write: server %s wrote %u, "
				    "requested was %u",
				    rp->r_server->sv_hostname,
				    res.resok.count, args.count);
				return (EIO);
			}
			if (res.resok.committed == UNSTABLE) {
				*stab_comm = UNSTABLE;
				if (args.stable == DATA_SYNC ||
				    args.stable == FILE_SYNC) {
					zcmn_err(getzoneid(), CE_WARN,
					    "nfs3write: server %s did not "
					    "commit to stable storage",
					    rp->r_server->sv_hostname);
					return (EIO);
				}
			}
			tsize = (int)res.resok.count;
			count -= tsize;
			base += tsize;
			offset += tsize;
			if (mi->mi_io_kstats) {
				mutex_enter(&mi->mi_lock);
				KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
				KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
				    tsize;
				mutex_exit(&mi->mi_lock);
			}
			lwp_stat_update(LWP_STAT_OUBLK, 1);
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & RHAVEVERF) {
				if (rp->r_verf != res.resok.verf) {
					nfs3_set_mod(vp);
					rp->r_verf = res.resok.verf;
					/*
					 * If the data was written UNSTABLE,
					 * then might as well stop because
					 * the whole block will have to get
					 * rewritten anyway.
					 */
					if (*stab_comm == UNSTABLE) {
						mutex_exit(&rp->r_statelock);
						break;
					}
				}
			} else {
				rp->r_verf = res.resok.verf;
				rp->r_flags |= RHAVEVERF;
			}
			/*
			 * Mark the attribute cache as timed out and
			 * set RWRITEATTR to indicate that the file
			 * was modified with a WRITE operation and
			 * that the attributes can not be trusted.
			 */
			PURGE_ATTRCACHE_LOCKED(rp);
			rp->r_flags |= RWRITEATTR;
			mutex_exit(&rp->r_statelock);
		}
	} while (!error && count);

	return (error);
}
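
/*
 * Example of the verifier handling above (an illustrative scenario):
 * if the server reboots between an UNSTABLE WRITE and the matching
 * COMMIT, it returns a new write verifier.  The r_verf comparison
 * catches this, nfs3_set_mod() re-dirties the file's pages so the data
 * is retransmitted, and the loop stops early because the whole block
 * will have to be rewritten anyway.
 */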

/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 */
static int
nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
	size_t *residp, cred_t *cr)
{
	mntinfo_t *mi;
	READ3args args;
	READ3vres res;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);
	ASSERT(nfs_zone() == mi->mi_zone);
	douprintf = 1;

	args.file = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.file;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.pov.fres.vp = vp;
	res.pov.fres.vap = &va;

	*residp = count;
	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			if ((vp->v_flag & VNOCACHE) ||
			    (rp->r_flags & RDIRECTIO) ||
			    (mi->mi_flags & MI_DIRECTIO))
				tsize = MIN(mi->mi_tsize, count);
			else
				tsize = MIN(mi->mi_curread, count);
			res.data.data_val = base;
			res.data.data_len = tsize;
			args.offset = (offset3)offset;
			args.count = (count3)tsize;
			t = gethrtime();
			error = rfs3call(mi, NFSPROC3_READ,
			    xdr_READ3args, (caddr_t)&args,
			    xdr_READ3vres, (caddr_t)&res, cr,
			    &douprintf, &res.status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (error)
			return (error);

		error = geterrno3(res.status);
		if (error)
			return (error);

		if (res.count != res.data.data_len) {
			zcmn_err(getzoneid(), CE_WARN,
			    "nfs3read: server %s returned incorrect amount",
			    rp->r_server->sv_hostname);
			return (EIO);
		}

		count -= res.count;
		*residp = count;
		base += res.count;
		offset += res.count;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
	} while (count && !res.eof);

	if (res.pov.attributes) {
		mutex_enter(&rp->r_statelock);
		if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			if (rp->r_mtime <= t)
				nfs_attrcache_va(vp, &va);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (0);
}

/* ARGSUSED */
static int
nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
	int *rvalp)
{

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	switch (cmd) {
	case _FIODIRECTIO:
		return (nfs_directio(vp, (int)arg, cr));
	default:
		return (ENOTTY);
	}
}
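
/*
 * For illustration (an assumption about the userland path, not shown in
 * this file): directio(3C) reduces to roughly
 *
 *	ioctl(fd, _FIODIRECTIO, DIRECTIO_ON);	(or DIRECTIO_OFF)
 *
 * which arrives at nfs3_ioctl() above and is handed to nfs_directio()
 * to set or clear the per-rnode direct-I/O state.
 */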

static int
nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	rp = VTOR(vp);
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
			mutex_enter(&rp->r_statelock);
			rp->r_gcount++;
			mutex_exit(&rp->r_statelock);
			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
			mutex_enter(&rp->r_statelock);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				if (!rp->r_error)
					rp->r_error = error;
			}
			if (--rp->r_gcount == 0)
				cv_broadcast(&rp->r_cv);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (nfs3getattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	struct vattr va;

	if (vap->va_mask & AT_NOSET)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	va.va_mask = AT_UID | AT_MODE;
	error = nfs3getattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
	    vp);
	if (error)
		return (error);

	return (nfs3setattr(vp, vap, flags, cr));
}

static int
nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	SETATTR3args args;
	SETATTR3res res;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	vsecattr_t *vsp;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	mask = vap->va_mask;

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	args.object = *RTOFH3(rp);
	/*
	 * If the intent is for the server to set the times,
	 * there is no point in having the mask indicate set mtime or
	 * atime, because the vap values may be junk, and so result
	 * in an overflow error.  Remove these flags from the vap mask
	 * before calling in this case, and restore them afterwards.
	 */
	if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
		/* Use server times, so don't set the args time fields */
		vap->va_mask &= ~(AT_ATIME | AT_MTIME);
		error = vattr_to_sattr3(vap, &args.new_attributes);
		vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
		if (mask & AT_ATIME) {
			args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
		}
		if (mask & AT_MTIME) {
			args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr3(vap, &args.new_attributes);
	}

	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}

	va.va_mask = AT_MODE | AT_CTIME;
	error = nfs3getattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

tryagain:
	if (mask & AT_SIZE) {
		args.guard.check = TRUE;
		args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
		args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
	} else
		args.guard.check = FALSE;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
	    xdr_SETATTR3args, (caddr_t)&args,
	    xdr_SETATTR3res, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if (mask & (AT_UID | AT_GID | AT_MODE)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (error) {
		PURGE_ATTRCACHE(vp);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		/*
		 * If changing the size of the file, invalidate
		 * any local cached data which is no longer part
		 * of the file.  We also possibly invalidate the
		 * last page in the file.  We could use
		 * pvn_vpzero(), but this would mark the page as
		 * modified and require it to be written back to
		 * the server for no particularly good reason.
		 * This way, if we access it, then we bring it
		 * back in.  A read should be cheaper than a
		 * write.
		 */
		if (mask & AT_SIZE) {
			nfs_invalidate_pages(vp,
			    (vap->va_size & PAGEMASK), cr);
		}
		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
		/*
		 * Some servers will change the mode to clear the setuid
		 * and setgid bits when changing the uid or gid.  The
		 * client needs to compensate appropriately.
		 */
		if (mask & (AT_UID | AT_GID)) {
			int terror;

			va.va_mask = AT_MODE;
			terror = nfs3getattr(vp, &va, cr);
			if (!terror &&
			    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
			    (!(mask & AT_MODE) && va.va_mode != omode))) {
				va.va_mask = AT_MODE;
				if (mask & AT_MODE)
					va.va_mode = vap->va_mode;
				else
					va.va_mode = omode;
				(void) nfs3setattr(vp, &va, 0, cr);
			}
		}
	} else {
		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
		/*
		 * If we got back a "not synchronized" error, then
		 * we need to retry with a new guard value.  The
		 * guard value used is the change time.  If the
		 * server returned post_op_attr, then we can just
		 * retry because we have the latest attributes.
		 * Otherwise, we issue a GETATTR to get the latest
		 * attributes and then retry.  If we couldn't get
		 * the attributes this way either, then we give
		 * up because we can't complete the operation as
		 * required.
		 */
		if (res.status == NFS3ERR_NOT_SYNC) {
			va.va_mask = AT_CTIME;
			if (nfs3getattr(vp, &va, cr) == 0)
				goto tryagain;
		}
		PURGE_STALE_FH(error, vp, cr);
	}

	return (error);
}

static int
nfs3_accessx(void *vp, int mode, cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
	return (nfs3_access(vp, mode, 0, cr));
}
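
/*
 * For reference, the mode-to-ACCESS3 mapping performed by nfs3_access()
 * below (illustrative): a VREAD|VEXEC check on a directory asks whether
 * ACCESS3_READ|ACCESS3_LOOKUP are granted, while VWRITE on a directory
 * adds ACCESS3_DELETE to ACCESS3_MODIFY|ACCESS3_EXTEND.  The full bit
 * set for the vnode type is requested in args.access so the server's
 * answer can be cached and reused for later checks.
 */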

/* ARGSUSED */
static int
nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	int error;
	ACCESS3args args;
	ACCESS3res res;
	int douprintf;
	uint32 acc;
	rnode_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	failinfo_t fi;
	nfs_access_type_t cacc;
	hrtime_t t;

	acc = 0;
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	if (mode & VREAD)
		acc |= ACCESS3_READ;
	if (mode & VWRITE) {
		if (vn_is_readonly(vp) && !IS_DEVVP(vp))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS3_DELETE;
		acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS3_LOOKUP;
		else
			acc |= ACCESS3_EXECUTE;
	}

	rp = VTOR(vp);
	args.object = *VTOFH3(vp);
	if (vp->v_type == VDIR) {
		args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
		    ACCESS3_EXTEND | ACCESS3_LOOKUP;
	} else {
		args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
		    ACCESS3_EXECUTE;
	}
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	cred = cr;
	/*
	 * ncr and ncrfree both initially point to the memory area
	 * returned by crnetadjust(); if ncrfree is not NULL when
	 * exiting, we need to release it.
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;
tryagain:
	if (rp->r_acache != NULL) {
		cacc = nfs_access_check(rp, acc, cred);
		if (cacc == NFS_ACCESS_ALLOWED) {
			if (ncrfree != NULL)
				crfree(ncrfree);
			return (0);
		}
		if (cacc == NFS_ACCESS_DENIED) {
			/*
			 * If the cred can be adjusted, try again
			 * with the new cred.
			 */
			if (ncr != NULL) {
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			if (ncrfree != NULL)
				crfree(ncrfree);
			return (EACCES);
		}
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
	    xdr_ACCESS3args, (caddr_t)&args,
	    xdr_ACCESS3res, (caddr_t)&res, cred,
	    &douprintf, &res.status, 0, &fi);

	if (error) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
		nfs_access_cache(rp, args.access, res.resok.access, cred);
		/*
		 * We just cached results with cred; if cred holds the
		 * adjusted credentials from crnetadjust(), we do not want
		 * to release them before exiting, hence we set ncrfree
		 * to NULL.
		 */
		if (cred != cr)
			ncrfree = NULL;
		if ((acc & res.resok.access) != acc) {
			/*
			 * If the cred can be adjusted, try again
			 * with the new cred.
			 */
			if (ncr != NULL) {
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			error = EACCES;
		}
	} else {
		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t,
		    cr);
		PURGE_STALE_FH(error, vp, cr);
	}

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (error);
}

static int nfs3_do_symlink_cache = 1;

static int
nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	int error;
	READLINK3args args;
	READLINK3res res;
	nfspath3 resdata_backup;
	rnode_t *rp;
	int douprintf;
	int len;
	failinfo_t fi;
	hrtime_t t;

	/*
	 * Can't readlink anything other than a symbolic link.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs3_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}

	args.symlink = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.symlink;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;

	res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	resdata_backup = res.resok.data;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_READLINK3res, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	if (res.resok.data == nfs3nametoolong)
		error = EINVAL;

	if (error) {
		kmem_free(resdata_backup, MAXPATHLEN);
		return (error);
	}

	error = geterrno3(res.status);
	if (!error) {
		nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
		    cr);
		len = strlen(res.resok.data);
		error = uiomove(res.resok.data, len, UIO_READ, uiop);
		if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = res.resok.data;
				rp->r_symlink.len = len;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);

				kmem_free((void *)res.resok.data, MAXPATHLEN);
			}
		} else {
			kmem_free((void *)res.resok.data, MAXPATHLEN);
		}
	} else {
		nfs3_cache_post_op_attr(vp,
		    &res.resfail.symlink_attributes, t, cr);
		PURGE_STALE_FH(error, vp, cr);

		kmem_free((void *)res.resok.data, MAXPATHLEN);

	}

	/*
	 * The over the wire error for attempting to readlink something
	 * other than a symbolic link is ENXIO.  However, we need to
	 * return EINVAL instead of ENXIO, so we map it here.
	 */
	return (error == ENXIO ? EINVAL : error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
static int
nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
	if (!error)
		error = VTOR(vp)->r_error;
	return (error);
}

/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
static void
nfs3_inactive(vnode_t *vp, cred_t *cr)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs3_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way,
		 * the unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			REMOVE3args args;
			REMOVE3res res;
			int douprintf;
			int error;
			hrtime_t t;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs3_putpage(vp, (offset_t)0, 0, 0,
				    cr);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs3(&args.object, unlname, unldvp);

			douprintf = 1;

			t = gethrtime();

			error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
			    xdr_diropargs3, (caddr_t)&args,
			    xdr_REMOVE3res, (caddr_t)&res, unlcred,
			    &douprintf, &res.status, 0, NULL);

			if (error) {
				PURGE_ATTRCACHE(unldvp);
			} else {
				error = geterrno3(res.status);
				if (!error) {
					nfs3_cache_wcc_data(unldvp,
					    &res.resok.dir_wcc, t, cr);
					if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
						nfs_purge_rddir_cache(unldvp);
				} else {
					nfs3_cache_wcc_data(unldvp,
					    &res.resfail.dir_wcc, t, cr);
					PURGE_STALE_FH(error, unldvp, cr);
				}
			}

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}
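
/*
 * Life cycle of an "unlinked open" file, for illustration: a process
 * still holds the file open when it is removed, so the remove is turned
 * into a rename to a .nfs* name on the server and r_unldvp, r_unlname
 * and r_unlcred are recorded on the rnode.  When the last hold is
 * released, nfs3_inactive() above issues the deferred REMOVE of the
 * .nfs* name using the saved directory vnode, name and credentials.
 */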

/*
 * Remote file system operations having to do with directory manipulation.
 */

static int
nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		(void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}

static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
static int nfs3_lookup_dnlc_hits = 0;
static int nfs3_lookup_dnlc_misses = 0;
static int nfs3_lookup_dnlc_neg_hits = 0;
static int nfs3_lookup_dnlc_disappears = 0;
static int nfs3_lookup_dnlc_lookups = 0;
#endif

/* ARGSUSED */
int
nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;
	rnode_t *drp;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs3_access(dvp, VEXEC, 0, cr);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	drp = VTOR(dvp);
	if (!(drp->r_flags & RLOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= RLOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
1977 */ 1978 error = nfs3lookup_dnlc(dvp, nm, vpp, cr); 1979 if (error || *vpp != NULL) 1980 return (error); 1981 1982 callit: 1983 error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags); 1984 1985 return (error); 1986 } 1987 1988 static int 1989 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 1990 { 1991 int error; 1992 vnode_t *vp; 1993 1994 ASSERT(*nm != '\0'); 1995 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 1996 /* 1997 * Lookup this name in the DNLC. If successful, then validate 1998 * the caches and then recheck the DNLC. The DNLC is rechecked 1999 * just in case this entry got invalidated during the call 2000 * to nfs3_validate_caches. 2001 * 2002 * An assumption is being made that it is safe to say that a 2003 * file exists which may not exist on the server. Any operations to 2004 * the server will fail with ESTALE. 2005 */ 2006 #ifdef DEBUG 2007 nfs3_lookup_dnlc_lookups++; 2008 #endif 2009 vp = dnlc_lookup(dvp, nm); 2010 if (vp != NULL) { 2011 VN_RELE(vp); 2012 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) { 2013 PURGE_ATTRCACHE(dvp); 2014 } 2015 error = nfs3_validate_caches(dvp, cr); 2016 if (error) 2017 return (error); 2018 vp = dnlc_lookup(dvp, nm); 2019 if (vp != NULL) { 2020 error = nfs3_access(dvp, VEXEC, 0, cr); 2021 if (error) { 2022 VN_RELE(vp); 2023 return (error); 2024 } 2025 if (vp == DNLC_NO_VNODE) { 2026 VN_RELE(vp); 2027 #ifdef DEBUG 2028 nfs3_lookup_dnlc_neg_hits++; 2029 #endif 2030 return (ENOENT); 2031 } 2032 *vpp = vp; 2033 #ifdef DEBUG 2034 nfs3_lookup_dnlc_hits++; 2035 #endif 2036 return (0); 2037 } 2038 #ifdef DEBUG 2039 nfs3_lookup_dnlc_disappears++; 2040 #endif 2041 } 2042 #ifdef DEBUG 2043 else 2044 nfs3_lookup_dnlc_misses++; 2045 #endif 2046 2047 *vpp = NULL; 2048 2049 return (0); 2050 } 2051 2052 static int 2053 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, 2054 int rfscall_flags) 2055 { 2056 int error; 2057 LOOKUP3args args; 2058 LOOKUP3vres res; 2059 int douprintf; 2060 struct vattr vattr; 2061 struct vattr dvattr; 2062 vnode_t *vp; 2063 failinfo_t fi; 2064 hrtime_t t; 2065 2066 ASSERT(*nm != '\0'); 2067 ASSERT(dvp->v_type == VDIR); 2068 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 2069 2070 setdiropargs3(&args.what, nm, dvp); 2071 2072 fi.vp = dvp; 2073 fi.fhp = (caddr_t)&args.what.dir; 2074 fi.copyproc = nfs3copyfh; 2075 fi.lookupproc = nfs3lookup; 2076 fi.xattrdirproc = acl_getxattrdir3; 2077 res.obj_attributes.fres.vp = dvp; 2078 res.obj_attributes.fres.vap = &vattr; 2079 res.dir_attributes.fres.vp = dvp; 2080 res.dir_attributes.fres.vap = &dvattr; 2081 2082 douprintf = 1; 2083 2084 t = gethrtime(); 2085 2086 error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP, 2087 xdr_diropargs3, (caddr_t)&args, 2088 xdr_LOOKUP3vres, (caddr_t)&res, cr, 2089 &douprintf, &res.status, rfscall_flags, &fi); 2090 2091 if (error) 2092 return (error); 2093 2094 nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr); 2095 2096 error = geterrno3(res.status); 2097 if (error) { 2098 PURGE_STALE_FH(error, dvp, cr); 2099 if (error == ENOENT && nfs3_lookup_neg_cache) 2100 dnlc_enter(dvp, nm, DNLC_NO_VNODE); 2101 return (error); 2102 } 2103 2104 if (res.obj_attributes.attributes) { 2105 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap, 2106 dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); 2107 } else { 2108 vp = makenfs3node_va(&res.object, NULL, 2109 dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); 2110 if (vp->v_type == VNON) { 2111 vattr.va_mask = AT_TYPE; 2112 error = nfs3getattr(vp, &vattr, cr); 2113 if (error) { 2114 VN_RELE(vp); 2115
return (error); 2116 } 2117 vp->v_type = vattr.va_type; 2118 } 2119 } 2120 2121 if (!(rfscall_flags & RFSCALL_SOFT)) 2122 dnlc_update(dvp, nm, vp); 2123 2124 *vpp = vp; 2125 2126 return (error); 2127 } 2128 2129 #ifdef DEBUG 2130 static int nfs3_create_misses = 0; 2131 #endif 2132 2133 /* ARGSUSED */ 2134 static int 2135 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2136 int mode, vnode_t **vpp, cred_t *cr, int lfaware) 2137 { 2138 int error; 2139 vnode_t *vp; 2140 rnode_t *rp; 2141 struct vattr vattr; 2142 rnode_t *drp; 2143 vnode_t *tempvp; 2144 2145 drp = VTOR(dvp); 2146 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2147 return (EPERM); 2148 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2149 return (EINTR); 2150 2151 top: 2152 /* 2153 * We make a copy of the attributes because the caller does not 2154 * expect us to change what va points to. 2155 */ 2156 vattr = *va; 2157 2158 /* 2159 * If the pathname is "", just use dvp. Don't need 2160 * to send it over the wire, look it up in the dnlc, 2161 * or perform any access checks. 2162 */ 2163 if (*nm == '\0') { 2164 error = 0; 2165 VN_HOLD(dvp); 2166 vp = dvp; 2167 /* 2168 * If the pathname is ".", just use dvp. Don't need 2169 * to send it over the wire or look it up in the dnlc, 2170 * just need to check access. 2171 */ 2172 } else if (strcmp(nm, ".") == 0) { 2173 error = nfs3_access(dvp, VEXEC, 0, cr); 2174 if (error) { 2175 nfs_rw_exit(&drp->r_rwlock); 2176 return (error); 2177 } 2178 VN_HOLD(dvp); 2179 vp = dvp; 2180 /* 2181 * We need to go over the wire, just to be sure whether the 2182 * file exists or not. Using the DNLC can be dangerous in 2183 * this case when making a decision regarding existence. 2184 */ 2185 } else { 2186 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0); 2187 } 2188 if (!error) { 2189 if (exclusive == EXCL) 2190 error = EEXIST; 2191 else if (vp->v_type == VDIR && (mode & VWRITE)) 2192 error = EISDIR; 2193 else { 2194 /* 2195 * If vnode is a device, create special vnode. 2196 */ 2197 if (IS_DEVVP(vp)) { 2198 tempvp = vp; 2199 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2200 VN_RELE(tempvp); 2201 } 2202 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) { 2203 if ((vattr.va_mask & AT_SIZE) && 2204 vp->v_type == VREG) { 2205 rp = VTOR(vp); 2206 /* 2207 * Check here for large file handled 2208 * by LF-unaware process (as 2209 * ufs_create() does) 2210 */ 2211 if (!(lfaware & FOFFMAX)) { 2212 mutex_enter(&rp->r_statelock); 2213 if (rp->r_size > MAXOFF32_T) 2214 error = EOVERFLOW; 2215 mutex_exit(&rp->r_statelock); 2216 } 2217 if (!error) { 2218 vattr.va_mask = AT_SIZE; 2219 error = nfs3setattr(vp, 2220 &vattr, 0, cr); 2221 } 2222 } 2223 } 2224 } 2225 nfs_rw_exit(&drp->r_rwlock); 2226 if (error) { 2227 VN_RELE(vp); 2228 } else 2229 *vpp = vp; 2230 return (error); 2231 } 2232 2233 dnlc_remove(dvp, nm); 2234 2235 /* 2236 * Decide what the group-id of the created file should be. 2237 * Set it in attribute list as advisory... 
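 *
 * (setdirgid() applies the usual System V versus BSD grouping rules:
 * the new file normally gets the caller's gid, but inherits the
 * parent directory's gid when the parent has the set-gid bit set or
 * BSD semantics are in effect. This is a summary, not a contract;
 * setdirgid() itself is authoritative.)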
2238 */ 2239 error = setdirgid(dvp, &vattr.va_gid, cr); 2240 if (error) { 2241 nfs_rw_exit(&drp->r_rwlock); 2242 return (error); 2243 } 2244 vattr.va_mask |= AT_GID; 2245 2246 ASSERT(vattr.va_mask & AT_TYPE); 2247 if (vattr.va_type == VREG) { 2248 ASSERT(vattr.va_mask & AT_MODE); 2249 if (MANDMODE(vattr.va_mode)) { 2250 nfs_rw_exit(&drp->r_rwlock); 2251 return (EACCES); 2252 } 2253 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr, 2254 lfaware); 2255 /* 2256 * If this is not an exclusive create, then the CREATE 2257 * request will be made with the GUARDED mode set. This 2258 * means that the server will return EEXIST if the file 2259 * exists. The file could exist because of a retransmitted 2260 * request. In this case, we recover by starting over and 2261 * checking to see whether the file exists. This second 2262 * time through it should exist, and a CREATE request will not be 2263 * sent. 2264 * 2265 * This handles the problem of a dangling CREATE request 2266 * which contains attributes which indicate that the file 2267 * should be truncated. This retransmitted request could 2268 * possibly truncate valid data in the file if not caught 2269 * by the duplicate request mechanism on the server or if 2270 * not caught by other means. The scenario is: 2271 * 2272 * Client transmits CREATE request with size = 0 2273 * Client times out, retransmits request. 2274 * Response to the first request arrives from the server 2275 * and the client proceeds on. 2276 * Client writes data to the file. 2277 * The server now processes retransmitted CREATE request 2278 * and truncates file. 2279 * 2280 * The use of the GUARDED CREATE request prevents this from 2281 * happening because the retransmitted CREATE would fail 2282 * with EEXIST and would not truncate the file. 2283 */ 2284 if (error == EEXIST && exclusive == NONEXCL) { 2285 #ifdef DEBUG 2286 nfs3_create_misses++; 2287 #endif 2288 goto top; 2289 } 2290 nfs_rw_exit(&drp->r_rwlock); 2291 return (error); 2292 } 2293 error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 2294 nfs_rw_exit(&drp->r_rwlock); 2295 return (error); 2296 } 2297 2298 /* ARGSUSED */ 2299 static int 2300 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2301 int mode, vnode_t **vpp, cred_t *cr, int lfaware) 2302 { 2303 int error; 2304 CREATE3args args; 2305 CREATE3res res; 2306 int douprintf; 2307 vnode_t *vp; 2308 struct vattr vattr; 2309 nfstime3 *verfp; 2310 rnode_t *rp; 2311 timestruc_t now; 2312 hrtime_t t; 2313 2314 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 2315 setdiropargs3(&args.where, nm, dvp); 2316 if (exclusive == EXCL) { 2317 args.how.mode = EXCLUSIVE; 2318 /* 2319 * Construct the create verifier. This verifier needs 2320 * to be unique between different clients. It also needs 2321 * to vary for each exclusive create request generated 2322 * from the client to the server. 2323 * 2324 * The first attempt is made to use the hostid and a 2325 * unique number on the client. If the hostid has not 2326 * been set, the high resolution time that the exclusive 2327 * create request is being made is used. This will work 2328 * unless two different clients, both with the hostid 2329 * not set, attempt an exclusive create request on the 2330 * same file, at exactly the same clock time. The 2331 * chances of this happening seem small enough to be 2332 * reasonable.
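 *
 * Illustrative layout of the verifier built below, which overlays
 * the 8-byte create verifier with an nfstime3:
 *
 *	verfp->seconds  = hostid	(or current seconds if unset)
 *	verfp->nseconds = newnum()	(or current nanoseconds)
 *
 * The pair only has to be unique; the server may stash it in the
 * new file's time fields until the follow-up SETATTR.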
2333 */ 2334 verfp = (nfstime3 *)&args.how.createhow3_u.verf; 2335 verfp->seconds = nfs_atoi(hw_serial); 2336 if (verfp->seconds != 0) 2337 verfp->nseconds = newnum(); 2338 else { 2339 gethrestime(&now); 2340 verfp->seconds = now.tv_sec; 2341 verfp->nseconds = now.tv_nsec; 2342 } 2343 /* 2344 * Since the server will use this value for the mtime, 2345 * make sure that it can't overflow. Zero out the MSB. 2346 * The actual value does not matter here, only its uniqueness. 2347 */ 2348 verfp->seconds %= INT32_MAX; 2349 } else { 2350 /* 2351 * Issue the non-exclusive create in guarded mode. This 2352 * may result in some false EEXIST responses for 2353 * retransmitted requests, but these will be handled at 2354 * a higher level. By using GUARDED, duplicate requests 2355 * to do file truncation and possible access problems 2356 * can be avoided. 2357 */ 2358 args.how.mode = GUARDED; 2359 error = vattr_to_sattr3(va, 2360 &args.how.createhow3_u.obj_attributes); 2361 if (error) { 2362 /* req time field(s) overflow - return immediately */ 2363 return (error); 2364 } 2365 } 2366 2367 douprintf = 1; 2368 2369 t = gethrtime(); 2370 2371 error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE, 2372 xdr_CREATE3args, (caddr_t)&args, 2373 xdr_CREATE3res, (caddr_t)&res, cr, 2374 &douprintf, &res.status, 0, NULL); 2375 2376 if (error) { 2377 PURGE_ATTRCACHE(dvp); 2378 return (error); 2379 } 2380 2381 error = geterrno3(res.status); 2382 if (!error) { 2383 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 2384 if (HAVE_RDDIR_CACHE(VTOR(dvp))) 2385 nfs_purge_rddir_cache(dvp); 2386 2387 /* 2388 * On exclusive create the times need to be explicitly 2389 * set to clear any potential verifier that may be stored 2390 * in one of these fields (see comment below). This 2391 * is done here to cover the case where no post op attrs 2392 * were returned or an 'invalid' time was returned in 2393 * the attributes. 2394 */ 2395 if (exclusive == EXCL) 2396 va->va_mask |= (AT_MTIME | AT_ATIME); 2397 2398 if (!res.resok.obj.handle_follows) { 2399 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2400 if (error) 2401 return (error); 2402 } else { 2403 if (res.resok.obj_attributes.attributes) { 2404 vp = makenfs3node(&res.resok.obj.handle, 2405 &res.resok.obj_attributes.attr, 2406 dvp->v_vfsp, t, cr, NULL, NULL); 2407 } else { 2408 vp = makenfs3node(&res.resok.obj.handle, NULL, 2409 dvp->v_vfsp, t, cr, NULL, NULL); 2410 2411 /* 2412 * On an exclusive create, it is possible 2413 * that attributes were returned but those 2414 * postop attributes failed to decode 2415 * properly. If this is the case, 2416 * then most likely the atime or mtime 2417 * were invalid for our client; this 2418 * is caused by the server storing the 2419 * create verifier in one of the time 2420 * fields (most likely mtime). 2421 * So... we are going to setattr just the 2422 * atime/mtime to clear things up. 2423 */ 2424 if (exclusive == EXCL) { 2425 if (error = 2426 nfs3excl_create_settimes(vp, 2427 va, cr)) { 2428 /* 2429 * Setting the times failed. 2430 * Remove the file and return 2431 * the error. 2432 */ 2433 VN_RELE(vp); 2434 (void) nfs3_remove(dvp, 2435 nm, cr); 2436 return (error); 2437 } 2438 } 2439 2440 /* 2441 * This handles the non-exclusive case 2442 * and the exclusive case where no post op 2443 * attrs were returned.
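 *
 * (A node made without attributes starts life with
 * v_type == VNON; the explicit nfs3getattr() below fetches the
 * real type so that upper layers never see VNON.)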
2444 */ 2445 if (vp->v_type == VNON) { 2446 vattr.va_mask = AT_TYPE; 2447 error = nfs3getattr(vp, &vattr, cr); 2448 if (error) { 2449 VN_RELE(vp); 2450 return (error); 2451 } 2452 vp->v_type = vattr.va_type; 2453 } 2454 } 2455 dnlc_update(dvp, nm, vp); 2456 } 2457 2458 rp = VTOR(vp); 2459 2460 /* 2461 * Check here for large file handled by 2462 * LF-unaware process (as ufs_create() does) 2463 */ 2464 if ((va->va_mask & AT_SIZE) && vp->v_type == VREG && 2465 !(lfaware & FOFFMAX)) { 2466 mutex_enter(&rp->r_statelock); 2467 if (rp->r_size > MAXOFF32_T) { 2468 mutex_exit(&rp->r_statelock); 2469 VN_RELE(vp); 2470 return (EOVERFLOW); 2471 } 2472 mutex_exit(&rp->r_statelock); 2473 } 2474 2475 if (exclusive == EXCL && 2476 (va->va_mask & ~(AT_GID | AT_SIZE))) { 2477 /* 2478 * If doing an exclusive create, then generate 2479 * a SETATTR to set the initial attributes. 2480 * Try to set the mtime and the atime to the 2481 * server's current time. It is somewhat 2482 * expected that these fields will be used to 2483 * store the exclusive create cookie. If not, 2484 * server implementors will need to know that 2485 * a SETATTR will follow an exclusive create 2486 * and the cookie should be destroyed if 2487 * appropriate. This work may have been done 2488 * earlier in this function if post op attrs 2489 * were not available. 2490 * 2491 * The AT_GID and AT_SIZE bits are turned off 2492 * so that the SETATTR request will not attempt 2493 * to process these. The gid will be set 2494 * separately if appropriate. The size is turned 2495 * off because it is assumed that a new file will 2496 * be created empty and if the file wasn't empty, 2497 * then the exclusive create will have failed 2498 * because the file must have existed already. 2499 * Therefore, no truncate operation is needed. 2500 */ 2501 va->va_mask &= ~(AT_GID | AT_SIZE); 2502 error = nfs3setattr(vp, va, 0, cr); 2503 if (error) { 2504 /* 2505 * Couldn't correct the attributes of 2506 * the newly created file and the 2507 * attributes are wrong. Remove the 2508 * file and return an error to the 2509 * application. 2510 */ 2511 VN_RELE(vp); 2512 (void) nfs3_remove(dvp, nm, cr); 2513 return (error); 2514 } 2515 } 2516 2517 if (va->va_gid != rp->r_attr.va_gid) { 2518 /* 2519 * If the gid on the file isn't right, then 2520 * generate a SETATTR to attempt to change 2521 * it. This may or may not work, depending 2522 * upon the server's semantics for allowing 2523 * file ownership changes. 2524 */ 2525 va->va_mask = AT_GID; 2526 (void) nfs3setattr(vp, va, 0, cr); 2527 } 2528 2529 /* 2530 * If vnode is a device create special vnode 2531 */ 2532 if (IS_DEVVP(vp)) { 2533 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2534 VN_RELE(vp); 2535 } else 2536 *vpp = vp; 2537 } else { 2538 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 2539 PURGE_STALE_FH(error, dvp, cr); 2540 } 2541 2542 return (error); 2543 } 2544 2545 /* 2546 * Special setattr function to take care of rest of atime/mtime 2547 * after successful exclusive create. This function exists to avoid 2548 * handling attributes from the server; after an exclusive create the atime/mtime 2549 * fields may be 'invalid' in the client's view and therefore cannot be trusted.
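 *
 * In effect the function sends a SETATTR with everything masked off
 * except (sketch, args as built below):
 *
 *	args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
 *	args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
 *
 * so the server replaces any verifier it stored in those fields with
 * its own current time.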
2550 */ 2551 static int 2552 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr) 2553 { 2554 int error; 2555 uint_t mask; 2556 SETATTR3args args; 2557 SETATTR3res res; 2558 int douprintf; 2559 rnode_t *rp; 2560 hrtime_t t; 2561 2562 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 2563 /* save the caller's mask so that it can be reset later */ 2564 mask = vap->va_mask; 2565 2566 rp = VTOR(vp); 2567 2568 args.object = *RTOFH3(rp); 2569 args.guard.check = FALSE; 2570 2571 /* Use the mask to initialize the arguments */ 2572 vap->va_mask = 0; 2573 error = vattr_to_sattr3(vap, &args.new_attributes); 2574 2575 /* We want to set just atime/mtime on this request */ 2576 args.new_attributes.atime.set_it = SET_TO_SERVER_TIME; 2577 args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME; 2578 2579 douprintf = 1; 2580 2581 t = gethrtime(); 2582 2583 error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR, 2584 xdr_SETATTR3args, (caddr_t)&args, 2585 xdr_SETATTR3res, (caddr_t)&res, cr, 2586 &douprintf, &res.status, 0, NULL); 2587 2588 if (error) { 2589 vap->va_mask = mask; 2590 return (error); 2591 } 2592 2593 error = geterrno3(res.status); 2594 if (!error) { 2595 /* 2596 * It is important to pick up the attributes. 2597 * Since this is the exclusive create path, the 2598 * attributes on the initial create were ignored 2599 * and we need these to have the correct info. 2600 */ 2601 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr); 2602 /* 2603 * No need to do the atime/mtime work again so clear 2604 * the bits. 2605 */ 2606 mask &= ~(AT_ATIME | AT_MTIME); 2607 } else { 2608 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr); 2609 } 2610 2611 vap->va_mask = mask; 2612 2613 return (error); 2614 } 2615 2616 /* ARGSUSED */ 2617 static int 2618 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 2619 int mode, vnode_t **vpp, cred_t *cr) 2620 { 2621 int error; 2622 MKNOD3args args; 2623 MKNOD3res res; 2624 int douprintf; 2625 vnode_t *vp; 2626 struct vattr vattr; 2627 hrtime_t t; 2628 2629 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); 2630 switch (va->va_type) { 2631 case VCHR: 2632 case VBLK: 2633 setdiropargs3(&args.where, nm, dvp); 2634 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK; 2635 error = vattr_to_sattr3(va, 2636 &args.what.mknoddata3_u.device.dev_attributes); 2637 if (error) { 2638 /* req time field(s) overflow - return immediately */ 2639 return (error); 2640 } 2641 args.what.mknoddata3_u.device.spec.specdata1 = 2642 getmajor(va->va_rdev); 2643 args.what.mknoddata3_u.device.spec.specdata2 = 2644 getminor(va->va_rdev); 2645 break; 2646 2647 case VFIFO: 2648 case VSOCK: 2649 setdiropargs3(&args.where, nm, dvp); 2650 args.what.type = (va->va_type == VFIFO) ? 
NF3FIFO : NF3SOCK; 2651 error = vattr_to_sattr3(va, 2652 &args.what.mknoddata3_u.pipe_attributes); 2653 if (error) { 2654 /* req time field(s) overflow - return immediately */ 2655 return (error); 2656 } 2657 break; 2658 2659 default: 2660 return (EINVAL); 2661 } 2662 2663 douprintf = 1; 2664 2665 t = gethrtime(); 2666 2667 error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD, 2668 xdr_MKNOD3args, (caddr_t)&args, 2669 xdr_MKNOD3res, (caddr_t)&res, cr, 2670 &douprintf, &res.status, 0, NULL); 2671 2672 if (error) { 2673 PURGE_ATTRCACHE(dvp); 2674 return (error); 2675 } 2676 2677 error = geterrno3(res.status); 2678 if (!error) { 2679 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 2680 if (HAVE_RDDIR_CACHE(VTOR(dvp))) 2681 nfs_purge_rddir_cache(dvp); 2682 2683 if (!res.resok.obj.handle_follows) { 2684 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2685 if (error) 2686 return (error); 2687 } else { 2688 if (res.resok.obj_attributes.attributes) { 2689 vp = makenfs3node(&res.resok.obj.handle, 2690 &res.resok.obj_attributes.attr, 2691 dvp->v_vfsp, t, cr, NULL, NULL); 2692 } else { 2693 vp = makenfs3node(&res.resok.obj.handle, NULL, 2694 dvp->v_vfsp, t, cr, NULL, NULL); 2695 if (vp->v_type == VNON) { 2696 vattr.va_mask = AT_TYPE; 2697 error = nfs3getattr(vp, &vattr, cr); 2698 if (error) { 2699 VN_RELE(vp); 2700 return (error); 2701 } 2702 vp->v_type = vattr.va_type; 2703 } 2704 2705 } 2706 dnlc_update(dvp, nm, vp); 2707 } 2708 2709 if (va->va_gid != VTOR(vp)->r_attr.va_gid) { 2710 va->va_mask = AT_GID; 2711 (void) nfs3setattr(vp, va, 0, cr); 2712 } 2713 2714 /* 2715 * If vnode is a device create special vnode 2716 */ 2717 if (IS_DEVVP(vp)) { 2718 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 2719 VN_RELE(vp); 2720 } else 2721 *vpp = vp; 2722 } else { 2723 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 2724 PURGE_STALE_FH(error, dvp, cr); 2725 } 2726 return (error); 2727 } 2728 2729 /* 2730 * Weirdness: if the vnode to be removed is open 2731 * we rename it instead of removing it and nfs_inactive 2732 * will remove the new name. 2733 */ 2734 static int 2735 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr) 2736 { 2737 int error; 2738 REMOVE3args args; 2739 REMOVE3res res; 2740 vnode_t *vp; 2741 char *tmpname; 2742 int douprintf; 2743 rnode_t *rp; 2744 rnode_t *drp; 2745 hrtime_t t; 2746 2747 if (nfs_zone() != VTOMI(dvp)->mi_zone) 2748 return (EPERM); 2749 drp = VTOR(dvp); 2750 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 2751 return (EINTR); 2752 2753 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 2754 if (error) { 2755 nfs_rw_exit(&drp->r_rwlock); 2756 return (error); 2757 } 2758 2759 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) { 2760 VN_RELE(vp); 2761 nfs_rw_exit(&drp->r_rwlock); 2762 return (EPERM); 2763 } 2764 2765 /* 2766 * First just remove the entry from the name cache, as it 2767 * is most likely the only entry for this vp. 2768 */ 2769 dnlc_remove(dvp, nm); 2770 2771 /* 2772 * If the file has a v_count > 1 then there may be more than one 2773 * entry in the name cache due to multiple links or an open file, 2774 * but we don't have the real reference count so flush all 2775 * possible entries.
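 *
 * (The purge also makes the v_count test below meaningful: once the
 * DNLC holds are dropped, v_count > 1 implies a real user such as an
 * open file descriptor, and the rename-to-temporary-name path is
 * taken instead of an over-the-wire REMOVE.)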
2776 */ 2777 if (vp->v_count > 1) 2778 dnlc_purge_vp(vp); 2779 2780 /* 2781 * Now we have the real reference count on the vnode 2782 */ 2783 rp = VTOR(vp); 2784 mutex_enter(&rp->r_statelock); 2785 if (vp->v_count > 1 && 2786 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 2787 mutex_exit(&rp->r_statelock); 2788 tmpname = newname(); 2789 error = nfs3rename(dvp, nm, dvp, tmpname, cr); 2790 if (error) 2791 kmem_free(tmpname, MAXNAMELEN); 2792 else { 2793 mutex_enter(&rp->r_statelock); 2794 if (rp->r_unldvp == NULL) { 2795 VN_HOLD(dvp); 2796 rp->r_unldvp = dvp; 2797 if (rp->r_unlcred != NULL) 2798 crfree(rp->r_unlcred); 2799 crhold(cr); 2800 rp->r_unlcred = cr; 2801 rp->r_unlname = tmpname; 2802 } else { 2803 kmem_free(rp->r_unlname, MAXNAMELEN); 2804 rp->r_unlname = tmpname; 2805 } 2806 mutex_exit(&rp->r_statelock); 2807 } 2808 } else { 2809 mutex_exit(&rp->r_statelock); 2810 /* 2811 * We need to flush any dirty pages which happen to 2812 * be hanging around before removing the file. This 2813 * shouldn't happen very often and mostly on file 2814 * systems mounted "nocto". 2815 */ 2816 if (vn_has_cached_data(vp) && 2817 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { 2818 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr); 2819 if (error && (error == ENOSPC || error == EDQUOT)) { 2820 mutex_enter(&rp->r_statelock); 2821 if (!rp->r_error) 2822 rp->r_error = error; 2823 mutex_exit(&rp->r_statelock); 2824 } 2825 } 2826 2827 setdiropargs3(&args.object, nm, dvp); 2828 2829 douprintf = 1; 2830 2831 t = gethrtime(); 2832 2833 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE, 2834 xdr_diropargs3, (caddr_t)&args, 2835 xdr_REMOVE3res, (caddr_t)&res, cr, 2836 &douprintf, &res.status, 0, NULL); 2837 2838 /* 2839 * The xattr dir may be gone after last attr is removed, 2840 * so flush it from dnlc. 
2841 */ 2842 if (dvp->v_flag & V_XATTRDIR) 2843 dnlc_purge_vp(dvp); 2844 2845 PURGE_ATTRCACHE(vp); 2846 2847 if (error) { 2848 PURGE_ATTRCACHE(dvp); 2849 } else { 2850 error = geterrno3(res.status); 2851 if (!error) { 2852 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, 2853 cr); 2854 if (HAVE_RDDIR_CACHE(drp)) 2855 nfs_purge_rddir_cache(dvp); 2856 } else { 2857 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, 2858 t, cr); 2859 PURGE_STALE_FH(error, dvp, cr); 2860 } 2861 } 2862 } 2863 2864 VN_RELE(vp); 2865 2866 nfs_rw_exit(&drp->r_rwlock); 2867 2868 return (error); 2869 } 2870 2871 static int 2872 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 2873 { 2874 int error; 2875 LINK3args args; 2876 LINK3res res; 2877 vnode_t *realvp; 2878 int douprintf; 2879 mntinfo_t *mi; 2880 rnode_t *tdrp; 2881 hrtime_t t; 2882 2883 if (nfs_zone() != VTOMI(tdvp)->mi_zone) 2884 return (EPERM); 2885 if (VOP_REALVP(svp, &realvp) == 0) 2886 svp = realvp; 2887 2888 mi = VTOMI(svp); 2889 2890 if (!(mi->mi_flags & MI_LINK)) 2891 return (EOPNOTSUPP); 2892 2893 args.file = *VTOFH3(svp); 2894 setdiropargs3(&args.link, tnm, tdvp); 2895 2896 tdrp = VTOR(tdvp); 2897 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp))) 2898 return (EINTR); 2899 2900 dnlc_remove(tdvp, tnm); 2901 2902 douprintf = 1; 2903 2904 t = gethrtime(); 2905 2906 error = rfs3call(mi, NFSPROC3_LINK, 2907 xdr_LINK3args, (caddr_t)&args, 2908 xdr_LINK3res, (caddr_t)&res, cr, 2909 &douprintf, &res.status, 0, NULL); 2910 2911 if (error) { 2912 PURGE_ATTRCACHE(tdvp); 2913 PURGE_ATTRCACHE(svp); 2914 nfs_rw_exit(&tdrp->r_rwlock); 2915 return (error); 2916 } 2917 2918 error = geterrno3(res.status); 2919 2920 if (!error) { 2921 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr); 2922 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr); 2923 if (HAVE_RDDIR_CACHE(tdrp)) 2924 nfs_purge_rddir_cache(tdvp); 2925 dnlc_update(tdvp, tnm, svp); 2926 } else { 2927 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t, 2928 cr); 2929 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr); 2930 if (error == EOPNOTSUPP) { 2931 mutex_enter(&mi->mi_lock); 2932 mi->mi_flags &= ~MI_LINK; 2933 mutex_exit(&mi->mi_lock); 2934 } 2935 } 2936 2937 nfs_rw_exit(&tdrp->r_rwlock); 2938 2939 return (error); 2940 } 2941 2942 static int 2943 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2944 { 2945 vnode_t *realvp; 2946 2947 if (nfs_zone() != VTOMI(odvp)->mi_zone) 2948 return (EPERM); 2949 if (VOP_REALVP(ndvp, &realvp) == 0) 2950 ndvp = realvp; 2951 2952 return (nfs3rename(odvp, onm, ndvp, nnm, cr)); 2953 } 2954 2955 /* 2956 * nfs3rename does the real work of renaming in NFS Version 3. 
2957 */ 2958 static int 2959 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 2960 { 2961 int error; 2962 RENAME3args args; 2963 RENAME3res res; 2964 int douprintf; 2965 vnode_t *nvp; 2966 vnode_t *ovp = NULL; 2967 char *tmpname; 2968 rnode_t *rp; 2969 rnode_t *odrp; 2970 rnode_t *ndrp; 2971 hrtime_t t; 2972 2973 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone); 2974 2975 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 || 2976 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) 2977 return (EINVAL); 2978 2979 odrp = VTOR(odvp); 2980 ndrp = VTOR(ndvp); 2981 if ((intptr_t)odrp < (intptr_t)ndrp) { 2982 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) 2983 return (EINTR); 2984 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) { 2985 nfs_rw_exit(&odrp->r_rwlock); 2986 return (EINTR); 2987 } 2988 } else { 2989 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) 2990 return (EINTR); 2991 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) { 2992 nfs_rw_exit(&ndrp->r_rwlock); 2993 return (EINTR); 2994 } 2995 } 2996 2997 /* 2998 * Lookup the target file. If it exists, it needs to be 2999 * checked to see whether it is a mount point and whether 3000 * it is active (open). 3001 */ 3002 error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0); 3003 if (!error) { 3004 /* 3005 * If this file has been mounted on, then just 3006 * return busy because renaming to it would remove 3007 * the mounted file system from the name space. 3008 */ 3009 if (vn_mountedvfs(nvp) != NULL) { 3010 VN_RELE(nvp); 3011 nfs_rw_exit(&odrp->r_rwlock); 3012 nfs_rw_exit(&ndrp->r_rwlock); 3013 return (EBUSY); 3014 } 3015 3016 /* 3017 * Purge the name cache of all references to this vnode 3018 * so that we can check the reference count to infer 3019 * whether it is active or not. 3020 */ 3021 /* 3022 * First just remove the entry from the name cache, as it 3023 * is most likely the only entry for this vp. 3024 */ 3025 dnlc_remove(ndvp, nnm); 3026 /* 3027 * If the file has a v_count > 1 then there may be more 3028 * than one entry in the name cache due to multiple links 3029 * or an open file, but we don't have the real reference 3030 * count so flush all possible entries. 3031 */ 3032 if (nvp->v_count > 1) 3033 dnlc_purge_vp(nvp); 3034 3035 /* 3036 * If the vnode is active and is not a directory, 3037 * arrange to rename it to a 3038 * temporary file so that it will continue to be 3039 * accessible. This implements the "unlink-open-file" 3040 * semantics for the target of a rename operation. 3041 * Before doing this though, make sure that the 3042 * source and target files are not already the same. 3043 */ 3044 if (nvp->v_count > 1 && nvp->v_type != VDIR) { 3045 /* 3046 * Lookup the source name. 3047 */ 3048 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, 3049 cr, 0); 3050 3051 /* 3052 * The source name *should* already exist. 3053 */ 3054 if (error) { 3055 VN_RELE(nvp); 3056 nfs_rw_exit(&odrp->r_rwlock); 3057 nfs_rw_exit(&ndrp->r_rwlock); 3058 return (error); 3059 } 3060 3061 /* 3062 * Compare the two vnodes. If they are the same, 3063 * just release all held vnodes and return success. 3064 */ 3065 if (ovp == nvp) { 3066 VN_RELE(ovp); 3067 VN_RELE(nvp); 3068 nfs_rw_exit(&odrp->r_rwlock); 3069 nfs_rw_exit(&ndrp->r_rwlock); 3070 return (0); 3071 } 3072 3073 /* 3074 * Can't mix and match directories and non- 3075 * directories in rename operations. We already 3076 * know that the target is not a directory. If 3077 * the source is a directory, return an error.
3078 */ 3079 if (ovp->v_type == VDIR) { 3080 VN_RELE(ovp); 3081 VN_RELE(nvp); 3082 nfs_rw_exit(&odrp->r_rwlock); 3083 nfs_rw_exit(&ndrp->r_rwlock); 3084 return (ENOTDIR); 3085 } 3086 3087 /* 3088 * The target file exists, is not the same as 3089 * the source file, and is active. Link it 3090 * to a temporary filename to avoid having 3091 * the server removing the file completely. 3092 */ 3093 tmpname = newname(); 3094 error = nfs3_link(ndvp, nvp, tmpname, cr); 3095 if (error == EOPNOTSUPP) { 3096 error = nfs3_rename(ndvp, nnm, ndvp, tmpname, 3097 cr); 3098 } 3099 if (error) { 3100 kmem_free(tmpname, MAXNAMELEN); 3101 VN_RELE(ovp); 3102 VN_RELE(nvp); 3103 nfs_rw_exit(&odrp->r_rwlock); 3104 nfs_rw_exit(&ndrp->r_rwlock); 3105 return (error); 3106 } 3107 rp = VTOR(nvp); 3108 mutex_enter(&rp->r_statelock); 3109 if (rp->r_unldvp == NULL) { 3110 VN_HOLD(ndvp); 3111 rp->r_unldvp = ndvp; 3112 if (rp->r_unlcred != NULL) 3113 crfree(rp->r_unlcred); 3114 crhold(cr); 3115 rp->r_unlcred = cr; 3116 rp->r_unlname = tmpname; 3117 } else { 3118 kmem_free(rp->r_unlname, MAXNAMELEN); 3119 rp->r_unlname = tmpname; 3120 } 3121 mutex_exit(&rp->r_statelock); 3122 } 3123 3124 VN_RELE(nvp); 3125 } 3126 3127 if (ovp == NULL) { 3128 /* 3129 * When renaming directories to be a subdirectory of a 3130 * different parent, the dnlc entry for ".." will no 3131 * longer be valid, so it must be removed. 3132 * 3133 * We do a lookup here to determine whether we are renaming 3134 * a directory and we need to check if we are renaming 3135 * an unlinked file. This might have already been done 3136 * in previous code, so we check ovp == NULL to avoid 3137 * doing it twice. 3138 */ 3139 3140 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0); 3141 /* 3142 * The source name *should* already exist. 3143 */ 3144 if (error) { 3145 nfs_rw_exit(&odrp->r_rwlock); 3146 nfs_rw_exit(&ndrp->r_rwlock); 3147 return (error); 3148 } 3149 ASSERT(ovp != NULL); 3150 } 3151 3152 dnlc_remove(odvp, onm); 3153 dnlc_remove(ndvp, nnm); 3154 3155 setdiropargs3(&args.from, onm, odvp); 3156 setdiropargs3(&args.to, nnm, ndvp); 3157 3158 douprintf = 1; 3159 3160 t = gethrtime(); 3161 3162 error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME, 3163 xdr_RENAME3args, (caddr_t)&args, 3164 xdr_RENAME3res, (caddr_t)&res, cr, 3165 &douprintf, &res.status, 0, NULL); 3166 3167 if (error) { 3168 PURGE_ATTRCACHE(odvp); 3169 PURGE_ATTRCACHE(ndvp); 3170 VN_RELE(ovp); 3171 nfs_rw_exit(&odrp->r_rwlock); 3172 nfs_rw_exit(&ndrp->r_rwlock); 3173 return (error); 3174 } 3175 3176 error = geterrno3(res.status); 3177 3178 if (!error) { 3179 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr); 3180 if (HAVE_RDDIR_CACHE(odrp)) 3181 nfs_purge_rddir_cache(odvp); 3182 if (ndvp != odvp) { 3183 nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr); 3184 if (HAVE_RDDIR_CACHE(ndrp)) 3185 nfs_purge_rddir_cache(ndvp); 3186 } 3187 /* 3188 * when renaming directories to be a subdirectory of a 3189 * different parent, the dnlc entry for ".." will no 3190 * longer be valid, so it must be removed 3191 */ 3192 rp = VTOR(ovp); 3193 if (ndvp != odvp) { 3194 if (ovp->v_type == VDIR) { 3195 dnlc_remove(ovp, ".."); 3196 if (HAVE_RDDIR_CACHE(rp)) 3197 nfs_purge_rddir_cache(ovp); 3198 } 3199 } 3200 3201 /* 3202 * If we are renaming the unlinked file, update the 3203 * r_unldvp and r_unlname as needed. 
3204 */ 3205 mutex_enter(&rp->r_statelock); 3206 if (rp->r_unldvp != NULL) { 3207 if (strcmp(rp->r_unlname, onm) == 0) { 3208 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 3209 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 3210 3211 if (ndvp != rp->r_unldvp) { 3212 VN_RELE(rp->r_unldvp); 3213 rp->r_unldvp = ndvp; 3214 VN_HOLD(ndvp); 3215 } 3216 } 3217 } 3218 mutex_exit(&rp->r_statelock); 3219 } else { 3220 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr); 3221 if (ndvp != odvp) { 3222 nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t, 3223 cr); 3224 } 3225 /* 3226 * System V defines rename to return EEXIST, not 3227 * ENOTEMPTY if the target directory is not empty. 3228 * Over the wire, the error is NFSERR_ENOTEMPTY 3229 * which geterrno maps to ENOTEMPTY. 3230 */ 3231 if (error == ENOTEMPTY) 3232 error = EEXIST; 3233 } 3234 3235 VN_RELE(ovp); 3236 3237 nfs_rw_exit(&odrp->r_rwlock); 3238 nfs_rw_exit(&ndrp->r_rwlock); 3239 3240 return (error); 3241 } 3242 3243 static int 3244 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 3245 { 3246 int error; 3247 MKDIR3args args; 3248 MKDIR3res res; 3249 int douprintf; 3250 struct vattr vattr; 3251 vnode_t *vp; 3252 rnode_t *drp; 3253 hrtime_t t; 3254 3255 if (nfs_zone() != VTOMI(dvp)->mi_zone) 3256 return (EPERM); 3257 setdiropargs3(&args.where, nm, dvp); 3258 3259 /* 3260 * Decide what the group-id and set-gid bit of the created directory 3261 * should be. May have to do a setattr to get the gid right. 3262 */ 3263 error = setdirgid(dvp, &va->va_gid, cr); 3264 if (error) 3265 return (error); 3266 error = setdirmode(dvp, &va->va_mode, cr); 3267 if (error) 3268 return (error); 3269 va->va_mask |= AT_MODE|AT_GID; 3270 3271 error = vattr_to_sattr3(va, &args.attributes); 3272 if (error) { 3273 /* req time field(s) overflow - return immediately */ 3274 return (error); 3275 } 3276 3277 drp = VTOR(dvp); 3278 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3279 return (EINTR); 3280 3281 dnlc_remove(dvp, nm); 3282 3283 douprintf = 1; 3284 3285 t = gethrtime(); 3286 3287 error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR, 3288 xdr_MKDIR3args, (caddr_t)&args, 3289 xdr_MKDIR3res, (caddr_t)&res, cr, 3290 &douprintf, &res.status, 0, NULL); 3291 3292 if (error) { 3293 PURGE_ATTRCACHE(dvp); 3294 nfs_rw_exit(&drp->r_rwlock); 3295 return (error); 3296 } 3297 3298 error = geterrno3(res.status); 3299 if (!error) { 3300 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3301 if (HAVE_RDDIR_CACHE(drp)) 3302 nfs_purge_rddir_cache(dvp); 3303 3304 if (!res.resok.obj.handle_follows) { 3305 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 3306 if (error) { 3307 nfs_rw_exit(&drp->r_rwlock); 3308 return (error); 3309 } 3310 } else { 3311 if (res.resok.obj_attributes.attributes) { 3312 vp = makenfs3node(&res.resok.obj.handle, 3313 &res.resok.obj_attributes.attr, 3314 dvp->v_vfsp, t, cr, NULL, NULL); 3315 } else { 3316 vp = makenfs3node(&res.resok.obj.handle, NULL, 3317 dvp->v_vfsp, t, cr, NULL, NULL); 3318 if (vp->v_type == VNON) { 3319 vattr.va_mask = AT_TYPE; 3320 error = nfs3getattr(vp, &vattr, cr); 3321 if (error) { 3322 VN_RELE(vp); 3323 nfs_rw_exit(&drp->r_rwlock); 3324 return (error); 3325 } 3326 vp->v_type = vattr.va_type; 3327 } 3328 } 3329 dnlc_update(dvp, nm, vp); 3330 } 3331 if (va->va_gid != VTOR(vp)->r_attr.va_gid) { 3332 va->va_mask = AT_GID; 3333 (void) nfs3setattr(vp, va, 0, cr); 3334 } 3335 *vpp = vp; 3336 } else { 3337 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3338 PURGE_STALE_FH(error, dvp, cr); 
3339 } 3340 3341 nfs_rw_exit(&drp->r_rwlock); 3342 3343 return (error); 3344 } 3345 3346 static int 3347 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 3348 { 3349 int error; 3350 RMDIR3args args; 3351 RMDIR3res res; 3352 vnode_t *vp; 3353 int douprintf; 3354 rnode_t *drp; 3355 hrtime_t t; 3356 3357 if (nfs_zone() != VTOMI(dvp)->mi_zone) 3358 return (EPERM); 3359 drp = VTOR(dvp); 3360 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3361 return (EINTR); 3362 3363 /* 3364 * Attempt to prevent a rmdir(".") from succeeding. 3365 */ 3366 error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); 3367 if (error) { 3368 nfs_rw_exit(&drp->r_rwlock); 3369 return (error); 3370 } 3371 3372 if (vp == cdir) { 3373 VN_RELE(vp); 3374 nfs_rw_exit(&drp->r_rwlock); 3375 return (EINVAL); 3376 } 3377 3378 setdiropargs3(&args.object, nm, dvp); 3379 3380 /* 3381 * First just remove the entry from the name cache, as it 3382 * is most likely an entry for this vp. 3383 */ 3384 dnlc_remove(dvp, nm); 3385 3386 /* 3387 * If there vnode reference count is greater than one, then 3388 * there may be additional references in the DNLC which will 3389 * need to be purged. First, trying removing the entry for 3390 * the parent directory and see if that removes the additional 3391 * reference(s). If that doesn't do it, then use dnlc_purge_vp 3392 * to completely remove any references to the directory which 3393 * might still exist in the DNLC. 3394 */ 3395 if (vp->v_count > 1) { 3396 dnlc_remove(vp, ".."); 3397 if (vp->v_count > 1) 3398 dnlc_purge_vp(vp); 3399 } 3400 3401 douprintf = 1; 3402 3403 t = gethrtime(); 3404 3405 error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR, 3406 xdr_diropargs3, (caddr_t)&args, 3407 xdr_RMDIR3res, (caddr_t)&res, cr, 3408 &douprintf, &res.status, 0, NULL); 3409 3410 PURGE_ATTRCACHE(vp); 3411 3412 if (error) { 3413 PURGE_ATTRCACHE(dvp); 3414 VN_RELE(vp); 3415 nfs_rw_exit(&drp->r_rwlock); 3416 return (error); 3417 } 3418 3419 error = geterrno3(res.status); 3420 if (!error) { 3421 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3422 if (HAVE_RDDIR_CACHE(drp)) 3423 nfs_purge_rddir_cache(dvp); 3424 if (HAVE_RDDIR_CACHE(VTOR(vp))) 3425 nfs_purge_rddir_cache(vp); 3426 } else { 3427 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3428 PURGE_STALE_FH(error, dvp, cr); 3429 /* 3430 * System V defines rmdir to return EEXIST, not 3431 * ENOTEMPTY if the directory is not empty. Over 3432 * the wire, the error is NFSERR_ENOTEMPTY which 3433 * geterrno maps to ENOTEMPTY. 
3434 */ 3435 if (error == ENOTEMPTY) 3436 error = EEXIST; 3437 } 3438 3439 VN_RELE(vp); 3440 3441 nfs_rw_exit(&drp->r_rwlock); 3442 3443 return (error); 3444 } 3445 3446 static int 3447 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 3448 { 3449 int error; 3450 SYMLINK3args args; 3451 SYMLINK3res res; 3452 int douprintf; 3453 mntinfo_t *mi; 3454 vnode_t *vp; 3455 rnode_t *rp; 3456 char *contents; 3457 rnode_t *drp; 3458 hrtime_t t; 3459 3460 mi = VTOMI(dvp); 3461 3462 if (nfs_zone() != mi->mi_zone) 3463 return (EPERM); 3464 if (!(mi->mi_flags & MI_SYMLINK)) 3465 return (EOPNOTSUPP); 3466 3467 setdiropargs3(&args.where, lnm, dvp); 3468 error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes); 3469 if (error) { 3470 /* req time field(s) overflow - return immediately */ 3471 return (error); 3472 } 3473 args.symlink.symlink_data = tnm; 3474 3475 drp = VTOR(dvp); 3476 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) 3477 return (EINTR); 3478 3479 dnlc_remove(dvp, lnm); 3480 3481 douprintf = 1; 3482 3483 t = gethrtime(); 3484 3485 error = rfs3call(mi, NFSPROC3_SYMLINK, 3486 xdr_SYMLINK3args, (caddr_t)&args, 3487 xdr_SYMLINK3res, (caddr_t)&res, cr, 3488 &douprintf, &res.status, 0, NULL); 3489 3490 if (error) { 3491 PURGE_ATTRCACHE(dvp); 3492 nfs_rw_exit(&drp->r_rwlock); 3493 return (error); 3494 } 3495 3496 error = geterrno3(res.status); 3497 if (!error) { 3498 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); 3499 if (HAVE_RDDIR_CACHE(drp)) 3500 nfs_purge_rddir_cache(dvp); 3501 3502 if (res.resok.obj.handle_follows) { 3503 if (res.resok.obj_attributes.attributes) { 3504 vp = makenfs3node(&res.resok.obj.handle, 3505 &res.resok.obj_attributes.attr, 3506 dvp->v_vfsp, t, cr, NULL, NULL); 3507 } else { 3508 vp = makenfs3node(&res.resok.obj.handle, NULL, 3509 dvp->v_vfsp, t, cr, NULL, NULL); 3510 vp->v_type = VLNK; 3511 vp->v_rdev = 0; 3512 } 3513 dnlc_update(dvp, lnm, vp); 3514 rp = VTOR(vp); 3515 if (nfs3_do_symlink_cache && 3516 rp->r_symlink.contents == NULL) { 3517 3518 contents = kmem_alloc(MAXPATHLEN, 3519 KM_NOSLEEP); 3520 3521 if (contents != NULL) { 3522 mutex_enter(&rp->r_statelock); 3523 if (rp->r_symlink.contents == NULL) { 3524 rp->r_symlink.len = strlen(tnm); 3525 bcopy(tnm, contents, 3526 rp->r_symlink.len); 3527 rp->r_symlink.contents = 3528 contents; 3529 rp->r_symlink.size = MAXPATHLEN; 3530 mutex_exit(&rp->r_statelock); 3531 } else { 3532 mutex_exit(&rp->r_statelock); 3533 kmem_free((void *)contents, 3534 MAXPATHLEN); 3535 } 3536 } 3537 } 3538 VN_RELE(vp); 3539 } 3540 } else { 3541 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); 3542 PURGE_STALE_FH(error, dvp, cr); 3543 if (error == EOPNOTSUPP) { 3544 mutex_enter(&mi->mi_lock); 3545 mi->mi_flags &= ~MI_SYMLINK; 3546 mutex_exit(&mi->mi_lock); 3547 } 3548 } 3549 3550 nfs_rw_exit(&drp->r_rwlock); 3551 3552 return (error); 3553 } 3554 3555 #ifdef DEBUG 3556 static int nfs3_readdir_cache_hits = 0; 3557 static int nfs3_readdir_cache_shorts = 0; 3558 static int nfs3_readdir_cache_waits = 0; 3559 static int nfs3_readdir_cache_misses = 0; 3560 static int nfs3_readdir_readahead = 0; 3561 #endif 3562 3563 static int nfs3_shrinkreaddir = 0; 3564 3565 /* 3566 * Read directory entries. 3567 * There are some weird things to look out for here. The uio_loffset 3568 * field is either 0 or it is the offset returned from a previous 3569 * readdir. It is an opaque value used by the server to find the 3570 * correct directory block to read. 
The count field is the number 3571 * of blocks to read on the server. This is advisory only, the server 3572 * may return only one block's worth of entries. Entries may be compressed 3573 * on the server. 3574 */ 3575 static int 3576 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 3577 { 3578 int error; 3579 size_t count; 3580 rnode_t *rp; 3581 rddir_cache *rdc; 3582 rddir_cache *nrdc; 3583 rddir_cache *rrdc; 3584 #ifdef DEBUG 3585 int missed; 3586 #endif 3587 int doreadahead; 3588 rddir_cache srdc; 3589 avl_index_t where; 3590 3591 if (nfs_zone() != VTOMI(vp)->mi_zone) 3592 return (EIO); 3593 rp = VTOR(vp); 3594 3595 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 3596 3597 /* 3598 * Make sure that the directory cache is valid. 3599 */ 3600 if (HAVE_RDDIR_CACHE(rp)) { 3601 if (nfs_disable_rddir_cache) { 3602 /* 3603 * Setting nfs_disable_rddir_cache in /etc/system 3604 * allows interoperability with servers that do not 3605 * properly update the attributes of directories. 3606 * Any cached information gets purged before an 3607 * access is made to it. 3608 */ 3609 nfs_purge_rddir_cache(vp); 3610 } else { 3611 error = nfs3_validate_caches(vp, cr); 3612 if (error) 3613 return (error); 3614 } 3615 } 3616 3617 /* 3618 * It is possible that some servers may not be able to correctly 3619 * handle a large READDIR or READDIRPLUS request due to bugs in 3620 * their implementation. In order to continue to interoperate 3621 * with them, this workaround is provided to limit the maximum 3622 * size of a READDIRPLUS request to 1024. In any case, the request 3623 * size is limited to MAXBSIZE. 3624 */ 3625 count = MIN(uiop->uio_iov->iov_len, 3626 nfs3_shrinkreaddir ? 1024 : MAXBSIZE); 3627 3628 nrdc = NULL; 3629 #ifdef DEBUG 3630 missed = 0; 3631 #endif 3632 top: 3633 /* 3634 * Short circuit last readdir which always returns 0 bytes. 3635 * This can be done after the directory has been read through 3636 * completely at least once. This will set r_direof which 3637 * can be used to find the value of the last cookie. 3638 */ 3639 mutex_enter(&rp->r_statelock); 3640 if (rp->r_direof != NULL && 3641 uiop->uio_loffset == rp->r_direof->nfs3_ncookie) { 3642 mutex_exit(&rp->r_statelock); 3643 #ifdef DEBUG 3644 nfs3_readdir_cache_shorts++; 3645 #endif 3646 if (eofp) 3647 *eofp = 1; 3648 if (nrdc != NULL) 3649 rddir_cache_rele(nrdc); 3650 return (0); 3651 } 3652 /* 3653 * Look for a cache entry. Cache entries are identified 3654 * by the NFS cookie value and the byte count requested. 3655 */ 3656 srdc.nfs3_cookie = uiop->uio_loffset; 3657 srdc.buflen = count; 3658 rdc = avl_find(&rp->r_dir, &srdc, &where); 3659 if (rdc != NULL) { 3660 rddir_cache_hold(rdc); 3661 /* 3662 * If the cache entry is in the process of being 3663 * filled in, wait until this completes. The 3664 * RDDIRWAIT bit is set to indicate that someone 3665 * is waiting, and when the thread currently 3666 * filling the entry is done, it should do a 3667 * cv_broadcast to wake up all of the threads 3668 * waiting for it to finish. 3669 */ 3670 if (rdc->flags & RDDIR) { 3671 nfs_rw_exit(&rp->r_rwlock); 3672 rdc->flags |= RDDIRWAIT; 3673 #ifdef DEBUG 3674 nfs3_readdir_cache_waits++; 3675 #endif 3676 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) { 3677 /* 3678 * We got interrupted, probably 3679 * the user typed ^C or an alarm 3680 * fired. We free the new entry 3681 * if we allocated one.
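 *
 * (cv_wait_sig() returns 0 when the sleep is broken by a signal,
 * so the failed wait above maps directly to the EINTR return
 * below.)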
3682 */ 3683 mutex_exit(&rp->r_statelock); 3684 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3685 RW_READER, FALSE); 3686 rddir_cache_rele(rdc); 3687 if (nrdc != NULL) 3688 rddir_cache_rele(nrdc); 3689 return (EINTR); 3690 } 3691 mutex_exit(&rp->r_statelock); 3692 (void) nfs_rw_enter_sig(&rp->r_rwlock, 3693 RW_READER, FALSE); 3694 rddir_cache_rele(rdc); 3695 goto top; 3696 } 3697 /* 3698 * Check to see if a readdir is required to 3699 * fill the entry. If so, mark this entry 3700 * as being filled, remove our reference, 3701 * and branch to the code to fill the entry. 3702 */ 3703 if (rdc->flags & RDDIRREQ) { 3704 rdc->flags &= ~RDDIRREQ; 3705 rdc->flags |= RDDIR; 3706 if (nrdc != NULL) 3707 rddir_cache_rele(nrdc); 3708 nrdc = rdc; 3709 mutex_exit(&rp->r_statelock); 3710 goto bottom; 3711 } 3712 #ifdef DEBUG 3713 if (!missed) 3714 nfs3_readdir_cache_hits++; 3715 #endif 3716 /* 3717 * If an error occurred while attempting 3718 * to fill the cache entry, just return it. 3719 */ 3720 if (rdc->error) { 3721 error = rdc->error; 3722 mutex_exit(&rp->r_statelock); 3723 rddir_cache_rele(rdc); 3724 if (nrdc != NULL) 3725 rddir_cache_rele(nrdc); 3726 return (error); 3727 } 3728 3729 /* 3730 * The cache entry is complete and good, 3731 * copyout the dirent structs to the calling 3732 * thread. 3733 */ 3734 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop); 3735 3736 /* 3737 * If no error occurred during the copyout, 3738 * update the offset in the uio struct to 3739 * contain the value of the next cookie 3740 * and set the eof value appropriately. 3741 */ 3742 if (!error) { 3743 uiop->uio_loffset = rdc->nfs3_ncookie; 3744 if (eofp) 3745 *eofp = rdc->eof; 3746 } 3747 3748 /* 3749 * Decide whether to do readahead. 3750 * 3751 * Don't if have already read to the end of 3752 * directory. There is nothing more to read. 3753 * 3754 * Don't if the application is not doing 3755 * lookups in the directory. The readahead 3756 * is only effective if the application can 3757 * be doing work while an async thread is 3758 * handling the over the wire request. 3759 */ 3760 if (rdc->eof) { 3761 rp->r_direof = rdc; 3762 doreadahead = FALSE; 3763 } else if (!(rp->r_flags & RLOOKUP)) 3764 doreadahead = FALSE; 3765 else 3766 doreadahead = TRUE; 3767 3768 if (!doreadahead) { 3769 mutex_exit(&rp->r_statelock); 3770 rddir_cache_rele(rdc); 3771 if (nrdc != NULL) 3772 rddir_cache_rele(nrdc); 3773 return (error); 3774 } 3775 3776 /* 3777 * Check to see whether we found an entry 3778 * for the readahead. If so, we don't need 3779 * to do anything further, so free the new 3780 * entry if one was allocated. Otherwise, 3781 * allocate a new entry, add it to the cache, 3782 * and then initiate an asynchronous readdir 3783 * operation to fill it. 
3784 */ 3785 srdc.nfs3_cookie = rdc->nfs3_ncookie; 3786 srdc.buflen = count; 3787 rrdc = avl_find(&rp->r_dir, &srdc, &where); 3788 if (rrdc != NULL) { 3789 if (nrdc != NULL) 3790 rddir_cache_rele(nrdc); 3791 } else { 3792 if (nrdc != NULL) 3793 rrdc = nrdc; 3794 else { 3795 rrdc = rddir_cache_alloc(KM_NOSLEEP); 3796 } 3797 if (rrdc != NULL) { 3798 rrdc->nfs3_cookie = rdc->nfs3_ncookie; 3799 rrdc->buflen = count; 3800 avl_insert(&rp->r_dir, rrdc, where); 3801 rddir_cache_hold(rrdc); 3802 mutex_exit(&rp->r_statelock); 3803 rddir_cache_rele(rdc); 3804 #ifdef DEBUG 3805 nfs3_readdir_readahead++; 3806 #endif 3807 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir); 3808 return (error); 3809 } 3810 } 3811 3812 mutex_exit(&rp->r_statelock); 3813 rddir_cache_rele(rdc); 3814 return (error); 3815 } 3816 3817 /* 3818 * Didn't find an entry in the cache. Construct a new empty 3819 * entry and link it into the cache. Other processes attempting 3820 * to access this entry will need to wait until it is filled in. 3821 * 3822 * Since kmem_alloc may block, another pass through the cache 3823 * will need to be taken to make sure that another process 3824 * hasn't already added an entry to the cache for this request. 3825 */ 3826 if (nrdc == NULL) { 3827 mutex_exit(&rp->r_statelock); 3828 nrdc = rddir_cache_alloc(KM_SLEEP); 3829 nrdc->nfs3_cookie = uiop->uio_loffset; 3830 nrdc->buflen = count; 3831 goto top; 3832 } 3833 3834 /* 3835 * Add this entry to the cache. 3836 */ 3837 avl_insert(&rp->r_dir, nrdc, where); 3838 rddir_cache_hold(nrdc); 3839 mutex_exit(&rp->r_statelock); 3840 3841 bottom: 3842 #ifdef DEBUG 3843 missed = 1; 3844 nfs3_readdir_cache_misses++; 3845 #endif 3846 /* 3847 * Do the readdir. This routine decides whether to use 3848 * READDIR or READDIRPLUS. 3849 */ 3850 error = do_nfs3readdir(vp, nrdc, cr); 3851 3852 /* 3853 * If this operation failed, just return the error which occurred. 3854 */ 3855 if (error != 0) 3856 return (error); 3857 3858 /* 3859 * Since the RPC operation will have taken sometime and blocked 3860 * this process, another pass through the cache will need to be 3861 * taken to find the correct cache entry. It is possible that 3862 * the correct cache entry will not be there (although one was 3863 * added) because the directory changed during the RPC operation 3864 * and the readdir cache was flushed. In this case, just start 3865 * over. It is hoped that this will not happen too often... :-) 3866 */ 3867 nrdc = NULL; 3868 goto top; 3869 /* NOTREACHED */ 3870 } 3871 3872 static int 3873 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3874 { 3875 int error; 3876 rnode_t *rp; 3877 mntinfo_t *mi; 3878 3879 rp = VTOR(vp); 3880 mi = VTOMI(vp); 3881 ASSERT(nfs_zone() == mi->mi_zone); 3882 /* 3883 * Issue the proper request. 3884 * 3885 * If the server does not support READDIRPLUS, then use READDIR. 3886 * 3887 * Otherwise -- 3888 * Issue a READDIRPLUS if reading to fill an empty cache or if 3889 * an application has performed a lookup in the directory which 3890 * required an over the wire lookup. The use of READDIRPLUS 3891 * will help to (re)populate the DNLC. 
3892 */ 3893 if (!(mi->mi_flags & MI_READDIRONLY) && 3894 (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) { 3895 if (rp->r_flags & RREADDIRPLUS) { 3896 mutex_enter(&rp->r_statelock); 3897 rp->r_flags &= ~RREADDIRPLUS; 3898 mutex_exit(&rp->r_statelock); 3899 } 3900 nfs3readdirplus(vp, rdc, cr); 3901 if (rdc->error == EOPNOTSUPP) 3902 nfs3readdir(vp, rdc, cr); 3903 } else 3904 nfs3readdir(vp, rdc, cr); 3905 3906 mutex_enter(&rp->r_statelock); 3907 rdc->flags &= ~RDDIR; 3908 if (rdc->flags & RDDIRWAIT) { 3909 rdc->flags &= ~RDDIRWAIT; 3910 cv_broadcast(&rdc->cv); 3911 } 3912 error = rdc->error; 3913 if (error) 3914 rdc->flags |= RDDIRREQ; 3915 mutex_exit(&rp->r_statelock); 3916 3917 rddir_cache_rele(rdc); 3918 3919 return (error); 3920 } 3921 3922 static void 3923 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 3924 { 3925 int error; 3926 READDIR3args args; 3927 READDIR3vres res; 3928 vattr_t dva; 3929 rnode_t *rp; 3930 int douprintf; 3931 failinfo_t fi, *fip = NULL; 3932 mntinfo_t *mi; 3933 hrtime_t t; 3934 3935 rp = VTOR(vp); 3936 mi = VTOMI(vp); 3937 ASSERT(nfs_zone() == mi->mi_zone); 3938 3939 args.dir = *RTOFH3(rp); 3940 args.cookie = (cookie3)rdc->nfs3_cookie; 3941 args.cookieverf = rp->r_cookieverf; 3942 args.count = rdc->buflen; 3943 3944 /* 3945 * NFS client failover support 3946 * suppress failover unless we have a zero cookie 3947 */ 3948 if (args.cookie == (cookie3) 0) { 3949 fi.vp = vp; 3950 fi.fhp = (caddr_t)&args.dir; 3951 fi.copyproc = nfs3copyfh; 3952 fi.lookupproc = nfs3lookup; 3953 fi.xattrdirproc = acl_getxattrdir3; 3954 fip = &fi; 3955 } 3956 3957 #ifdef DEBUG 3958 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP); 3959 #else 3960 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 3961 #endif 3962 3963 res.entries = (dirent64_t *)rdc->entries; 3964 res.entries_size = rdc->buflen; 3965 res.dir_attributes.fres.vap = &dva; 3966 res.dir_attributes.fres.vp = vp; 3967 res.loff = rdc->nfs3_cookie; 3968 3969 douprintf = 1; 3970 3971 if (mi->mi_io_kstats) { 3972 mutex_enter(&mi->mi_lock); 3973 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3974 mutex_exit(&mi->mi_lock); 3975 } 3976 3977 t = gethrtime(); 3978 3979 error = rfs3call(VTOMI(vp), NFSPROC3_READDIR, 3980 xdr_READDIR3args, (caddr_t)&args, 3981 xdr_READDIR3vres, (caddr_t)&res, cr, 3982 &douprintf, &res.status, 0, fip); 3983 3984 if (mi->mi_io_kstats) { 3985 mutex_enter(&mi->mi_lock); 3986 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3987 mutex_exit(&mi->mi_lock); 3988 } 3989 3990 if (error) 3991 goto err; 3992 3993 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr); 3994 3995 error = geterrno3(res.status); 3996 if (error) { 3997 PURGE_STALE_FH(error, vp, cr); 3998 goto err; 3999 } 4000 4001 if (mi->mi_io_kstats) { 4002 mutex_enter(&mi->mi_lock); 4003 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 4004 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size; 4005 mutex_exit(&mi->mi_lock); 4006 } 4007 4008 rdc->nfs3_ncookie = res.loff; 4009 rp->r_cookieverf = res.cookieverf; 4010 rdc->eof = res.eof ? 1 : 0; 4011 rdc->entlen = res.size; 4012 ASSERT(rdc->entlen <= rdc->buflen); 4013 rdc->error = 0; 4014 return; 4015 4016 err: 4017 kmem_free(rdc->entries, rdc->buflen); 4018 rdc->entries = NULL; 4019 rdc->error = error; 4020 } 4021 4022 /* 4023 * Read directory entries. 4024 * There are some weird things to look out for here. The uio_loffset 4025 * field is either 0 or it is the offset returned from a previous 4026 * readdir. It is an opaque value used by the server to find the 4027 * correct directory block to read. 
The count field is the number 4028 * of blocks to read on the server. This is advisory only, the server 4029 * may return only one block's worth of entries. Entries may be compressed 4030 * on the server. 4031 */ 4032 static void 4033 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr) 4034 { 4035 int error; 4036 READDIRPLUS3args args; 4037 READDIRPLUS3vres res; 4038 vattr_t dva; 4039 rnode_t *rp; 4040 mntinfo_t *mi; 4041 int douprintf; 4042 failinfo_t fi, *fip = NULL; 4043 4044 rp = VTOR(vp); 4045 mi = VTOMI(vp); 4046 ASSERT(nfs_zone() == mi->mi_zone); 4047 4048 args.dir = *RTOFH3(rp); 4049 args.cookie = (cookie3)rdc->nfs3_cookie; 4050 args.cookieverf = rp->r_cookieverf; 4051 args.dircount = rdc->buflen; 4052 args.maxcount = mi->mi_tsize; 4053 4054 /* 4055 * NFS client failover support 4056 * suppress failover unless we have a zero cookie 4057 */ 4058 if (args.cookie == (cookie3)0) { 4059 fi.vp = vp; 4060 fi.fhp = (caddr_t)&args.dir; 4061 fi.copyproc = nfs3copyfh; 4062 fi.lookupproc = nfs3lookup; 4063 fi.xattrdirproc = acl_getxattrdir3; 4064 fip = &fi; 4065 } 4066 4067 #ifdef DEBUG 4068 rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP); 4069 #else 4070 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP); 4071 #endif 4072 4073 res.entries = (dirent64_t *)rdc->entries; 4074 res.entries_size = rdc->buflen; 4075 res.dir_attributes.fres.vap = &dva; 4076 res.dir_attributes.fres.vp = vp; 4077 res.loff = rdc->nfs3_cookie; 4078 res.credentials = cr; 4079 4080 douprintf = 1; 4081 4082 if (mi->mi_io_kstats) { 4083 mutex_enter(&mi->mi_lock); 4084 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 4085 mutex_exit(&mi->mi_lock); 4086 } 4087 4088 res.time = gethrtime(); 4089 4090 error = rfs3call(mi, NFSPROC3_READDIRPLUS, 4091 xdr_READDIRPLUS3args, (caddr_t)&args, 4092 xdr_READDIRPLUS3vres, (caddr_t)&res, cr, 4093 &douprintf, &res.status, 0, fip); 4094 4095 if (mi->mi_io_kstats) { 4096 mutex_enter(&mi->mi_lock); 4097 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 4098 mutex_exit(&mi->mi_lock); 4099 } 4100 4101 if (error) { 4102 goto err; 4103 } 4104 4105 nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr); 4106 4107 error = geterrno3(res.status); 4108 if (error) { 4109 PURGE_STALE_FH(error, vp, cr); 4110 if (error == EOPNOTSUPP) { 4111 mutex_enter(&mi->mi_lock); 4112 mi->mi_flags |= MI_READDIRONLY; 4113 mutex_exit(&mi->mi_lock); 4114 } 4115 goto err; 4116 } 4117 4118 if (mi->mi_io_kstats) { 4119 mutex_enter(&mi->mi_lock); 4120 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 4121 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size; 4122 mutex_exit(&mi->mi_lock); 4123 } 4124 4125 rdc->nfs3_ncookie = res.loff; 4126 rp->r_cookieverf = res.cookieverf; 4127 rdc->eof = res.eof ? 
1 : 0; 4128 rdc->entlen = res.size; 4129 ASSERT(rdc->entlen <= rdc->buflen); 4130 rdc->error = 0; 4131 4132 return; 4133 4134 err: 4135 kmem_free(rdc->entries, rdc->buflen); 4136 rdc->entries = NULL; 4137 rdc->error = error; 4138 } 4139 4140 #ifdef DEBUG 4141 static int nfs3_bio_do_stop = 0; 4142 #endif 4143 4144 static int 4145 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr) 4146 { 4147 rnode_t *rp = VTOR(bp->b_vp); 4148 int count; 4149 int error; 4150 cred_t *cred; 4151 offset_t offset; 4152 4153 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone); 4154 offset = ldbtob(bp->b_lblkno); 4155 4156 DTRACE_IO1(start, struct buf *, bp); 4157 4158 if (bp->b_flags & B_READ) { 4159 mutex_enter(&rp->r_statelock); 4160 if (rp->r_cred != NULL) { 4161 cred = rp->r_cred; 4162 crhold(cred); 4163 } else { 4164 rp->r_cred = cr; 4165 crhold(cr); 4166 cred = cr; 4167 crhold(cred); 4168 } 4169 mutex_exit(&rp->r_statelock); 4170 read_again: 4171 error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr, 4172 offset, bp->b_bcount, &bp->b_resid, cred); 4173 crfree(cred); 4174 if (!error) { 4175 if (bp->b_resid) { 4176 /* 4177 * Didn't get it all because we hit EOF, 4178 * zero all the memory beyond the EOF. 4179 */ 4180 /* bzero(rdaddr + */ 4181 bzero(bp->b_un.b_addr + 4182 bp->b_bcount - bp->b_resid, bp->b_resid); 4183 } 4184 mutex_enter(&rp->r_statelock); 4185 if (bp->b_resid == bp->b_bcount && 4186 offset >= rp->r_size) { 4187 /* 4188 * We didn't read anything at all as we are 4189 * past EOF. Return an error indicator back 4190 * but don't destroy the pages (yet). 4191 */ 4192 error = NFS_EOF; 4193 } 4194 mutex_exit(&rp->r_statelock); 4195 } else if (error == EACCES) { 4196 mutex_enter(&rp->r_statelock); 4197 if (cred != cr) { 4198 if (rp->r_cred != NULL) 4199 crfree(rp->r_cred); 4200 rp->r_cred = cr; 4201 crhold(cr); 4202 cred = cr; 4203 crhold(cred); 4204 mutex_exit(&rp->r_statelock); 4205 goto read_again; 4206 } 4207 mutex_exit(&rp->r_statelock); 4208 } 4209 } else { 4210 if (!(rp->r_flags & RSTALE)) { 4211 mutex_enter(&rp->r_statelock); 4212 if (rp->r_cred != NULL) { 4213 cred = rp->r_cred; 4214 crhold(cred); 4215 } else { 4216 rp->r_cred = cr; 4217 crhold(cr); 4218 cred = cr; 4219 crhold(cred); 4220 } 4221 mutex_exit(&rp->r_statelock); 4222 write_again: 4223 mutex_enter(&rp->r_statelock); 4224 count = MIN(bp->b_bcount, rp->r_size - offset); 4225 mutex_exit(&rp->r_statelock); 4226 if (count < 0) 4227 cmn_err(CE_PANIC, "nfs3_bio: write count < 0"); 4228 #ifdef DEBUG 4229 if (count == 0) { 4230 zcmn_err(getzoneid(), CE_WARN, 4231 "nfs3_bio: zero length write at %lld", 4232 offset); 4233 nfs_printfhandle(&rp->r_fh); 4234 if (nfs3_bio_do_stop) 4235 debug_enter("nfs3_bio"); 4236 } 4237 #endif 4238 error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset, 4239 count, cred, stab_comm); 4240 if (error == EACCES) { 4241 mutex_enter(&rp->r_statelock); 4242 if (cred != cr) { 4243 if (rp->r_cred != NULL) 4244 crfree(rp->r_cred); 4245 rp->r_cred = cr; 4246 crhold(cr); 4247 crfree(cred); 4248 cred = cr; 4249 crhold(cred); 4250 mutex_exit(&rp->r_statelock); 4251 goto write_again; 4252 } 4253 mutex_exit(&rp->r_statelock); 4254 } 4255 bp->b_error = error; 4256 if (error && error != EINTR) { 4257 /* 4258 * Don't print EDQUOT errors on the console. 4259 * Don't print asynchronous EACCES errors. 4260 * Don't print EFBIG errors. 4261 * Print all other write errors. 
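* (Presumably because the application already sees EDQUOT/EFBIG directly
* on its write, while an asynchronous EACCES arrives with no process
* context left to act on the message.)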
4262 */ 4263 if (error != EDQUOT && error != EFBIG && 4264 (error != EACCES || 4265 !(bp->b_flags & B_ASYNC))) 4266 nfs_write_error(bp->b_vp, error, cred); 4267 /* 4268 * Update r_error and r_flags as appropriate. 4269 * If the error was ESTALE, then mark the 4270 * rnode as not being writeable and save 4271 * the error status. Otherwise, save any 4272 * errors which occur from asynchronous 4273 * page invalidations. Any errors occurring 4274 * from other operations should be saved 4275 * by the caller. 4276 */ 4277 mutex_enter(&rp->r_statelock); 4278 if (error == ESTALE) { 4279 rp->r_flags |= RSTALE; 4280 if (!rp->r_error) 4281 rp->r_error = error; 4282 } else if (!rp->r_error && 4283 (bp->b_flags & 4284 (B_INVAL|B_FORCE|B_ASYNC)) == 4285 (B_INVAL|B_FORCE|B_ASYNC)) { 4286 rp->r_error = error; 4287 } 4288 mutex_exit(&rp->r_statelock); 4289 } 4290 crfree(cred); 4291 } else 4292 error = rp->r_error; 4293 } 4294 4295 if (error != 0 && error != NFS_EOF) 4296 bp->b_flags |= B_ERROR; 4297 4298 DTRACE_IO1(done, struct buf *, bp); 4299 4300 return (error); 4301 } 4302 4303 static int 4304 nfs3_fid(vnode_t *vp, fid_t *fidp) 4305 { 4306 rnode_t *rp; 4307 4308 if (nfs_zone() != VTOMI(vp)->mi_zone) 4309 return (EIO); 4310 rp = VTOR(vp); 4311 4312 if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) { 4313 fidp->fid_len = rp->r_fh.fh_len; 4314 return (ENOSPC); 4315 } 4316 fidp->fid_len = rp->r_fh.fh_len; 4317 bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len); 4318 return (0); 4319 } 4320 4321 /* ARGSUSED2 */ 4322 static int 4323 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 4324 { 4325 rnode_t *rp = VTOR(vp); 4326 4327 if (!write_lock) { 4328 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 4329 return (V_WRITELOCK_FALSE); 4330 } 4331 4332 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) { 4333 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 4334 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp)) 4335 return (V_WRITELOCK_FALSE); 4336 nfs_rw_exit(&rp->r_rwlock); 4337 } 4338 4339 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 4340 return (V_WRITELOCK_TRUE); 4341 } 4342 4343 /* ARGSUSED */ 4344 static void 4345 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 4346 { 4347 rnode_t *rp = VTOR(vp); 4348 4349 nfs_rw_exit(&rp->r_rwlock); 4350 } 4351 4352 /* ARGSUSED */ 4353 static int 4354 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 4355 { 4356 4357 /* 4358 * Because we stuff the readdir cookie into the offset field, 4359 * someone may attempt an lseek with the cookie, and we want 4360 * that to succeed. 4361 */ 4362 if (vp->v_type == VDIR) 4363 return (0); 4364 if (*noffp < 0) 4365 return (EINVAL); 4366 return (0); 4367 } 4368 4369 /* 4370 * Number of nfs3_bsize blocks to read ahead. 4371 */ 4372 static int nfs3_nra = 4; 4373 4374 #ifdef DEBUG 4375 static int nfs3_lostpage = 0; /* number of times we lost original page */ 4376 #endif 4377 4378 /* 4379 * Return all the pages from [off..off+len) in the file. 4380 */ 4381 static int 4382 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4383 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4384 enum seg_rw rw, cred_t *cr) 4385 { 4386 rnode_t *rp; 4387 int error; 4388 mntinfo_t *mi; 4389 4390 if (vp->v_flag & VNOMAP) 4391 return (ENOSYS); 4392 4393 if (nfs_zone() != VTOMI(vp)->mi_zone) 4394 return (EIO); 4395 if (protp != NULL) 4396 *protp = PROT_ALL; 4397 4398 /* 4399 * Now validate that the caches are up to date.
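* (Roughly: nfs3_validate_caches() refreshes the attributes from the
* server when the attribute cache has timed out and purges cached data
* if the file has changed.)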
4400 */ 4401 error = nfs3_validate_caches(vp, cr); 4402 if (error) 4403 return (error); 4404 4405 rp = VTOR(vp); 4406 mi = VTOMI(vp); 4407 retry: 4408 mutex_enter(&rp->r_statelock); 4409 4410 /* 4411 * Don't create dirty pages faster than they 4412 * can be cleaned so that the system doesn't 4413 * get imbalanced. If the async queue is 4414 * maxed out, then wait for it to drain before 4415 * creating more dirty pages. Also, wait for 4416 * any threads doing pagewalks in the vop_getattr 4417 * entry points so that they don't block for 4418 * long periods. 4419 */ 4420 if (rw == S_CREATE) { 4421 while ((mi->mi_max_threads != 0 && 4422 rp->r_awcount > 2 * mi->mi_max_threads) || 4423 rp->r_gcount > 0) 4424 cv_wait(&rp->r_cv, &rp->r_statelock); 4425 } 4426 4427 /* 4428 * If we are getting called as a side effect of an nfs_write() 4429 * operation the local file size might not be extended yet. 4430 * In this case we want to be able to return pages of zeroes. 4431 */ 4432 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 4433 mutex_exit(&rp->r_statelock); 4434 return (EFAULT); /* beyond EOF */ 4435 } 4436 4437 mutex_exit(&rp->r_statelock); 4438 4439 if (len <= PAGESIZE) { 4440 error = nfs3_getapage(vp, off, len, protp, pl, plsz, 4441 seg, addr, rw, cr); 4442 } else { 4443 error = pvn_getpages(nfs3_getapage, vp, off, len, protp, 4444 pl, plsz, seg, addr, rw, cr); 4445 } 4446 4447 switch (error) { 4448 case NFS_EOF: 4449 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 4450 goto retry; 4451 case ESTALE: 4452 PURGE_STALE_FH(error, vp, cr); 4453 } 4454 4455 return (error); 4456 } 4457 4458 /* 4459 * Called from pvn_getpages or nfs3_getpage to get a particular page. 4460 */ 4461 /* ARGSUSED */ 4462 static int 4463 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 4464 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4465 enum seg_rw rw, cred_t *cr) 4466 { 4467 rnode_t *rp; 4468 uint_t bsize; 4469 struct buf *bp; 4470 page_t *pp; 4471 u_offset_t lbn; 4472 u_offset_t io_off; 4473 u_offset_t blkoff; 4474 u_offset_t rablkoff; 4475 size_t io_len; 4476 uint_t blksize; 4477 int error; 4478 int readahead; 4479 int readahead_issued = 0; 4480 int ra_window; /* readahead window */ 4481 page_t *pagefound; 4482 page_t *savepp; 4483 4484 if (nfs_zone() != VTOMI(vp)->mi_zone) 4485 return (EIO); 4486 rp = VTOR(vp); 4487 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4488 4489 reread: 4490 bp = NULL; 4491 pp = NULL; 4492 pagefound = NULL; 4493 4494 if (pl != NULL) 4495 pl[0] = NULL; 4496 4497 error = 0; 4498 lbn = off / bsize; 4499 blkoff = lbn * bsize; 4500 4501 /* 4502 * Queueing up the readahead before doing the synchronous read 4503 * results in a significant increase in read throughput because 4504 * of the increased parallelism between the async threads and 4505 * the process context. 4506 */ 4507 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 4508 rw != S_CREATE && 4509 !(vp->v_flag & VNOCACHE)) { 4510 mutex_enter(&rp->r_statelock); 4511 4512 /* 4513 * Calculate the number of readaheads to do. 4514 * a) No readaheads at offset = 0. 4515 * b) Do maximum(nfs3_nra) readaheads when the readahead 4516 * window is closed. 4517 * c) Do between 1 and (nfs3_nra - 1) readaheads, depending 4518 * upon how far open or closed the readahead window is. 4519 * d) No readaheads if rp->r_nextr is not within the scope 4520 * of the readahead window (random i/o).
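* Worked example, assuming nfs3_nra == 4 and bsize == 32K: with
* r_nextr == 64K and blkoff == 32K, ra_window == (64K - 32K) / 32K == 1,
* so nfs3_nra - ra_window == 3 readaheads are queued below.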
4521 */ 4522 4523 if (off == 0) 4524 readahead = 0; 4525 else if (blkoff == rp->r_nextr) 4526 readahead = nfs3_nra; 4527 else if (rp->r_nextr > blkoff && 4528 ((ra_window = (rp->r_nextr - blkoff) / bsize) 4529 <= (nfs3_nra - 1))) 4530 readahead = nfs3_nra - ra_window; 4531 else 4532 readahead = 0; 4533 4534 rablkoff = rp->r_nextr; 4535 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 4536 mutex_exit(&rp->r_statelock); 4537 if (nfs_async_readahead(vp, rablkoff + bsize, 4538 addr + (rablkoff + bsize - off), seg, cr, 4539 nfs3_readahead) < 0) { 4540 mutex_enter(&rp->r_statelock); 4541 break; 4542 } 4543 readahead--; 4544 rablkoff += bsize; 4545 /* 4546 * Indicate that we did a readahead so 4547 * readahead offset is not updated 4548 * by the synchronous read below. 4549 */ 4550 readahead_issued = 1; 4551 mutex_enter(&rp->r_statelock); 4552 /* 4553 * set readahead offset to 4554 * offset of last async readahead 4555 * request. 4556 */ 4557 rp->r_nextr = rablkoff; 4558 } 4559 mutex_exit(&rp->r_statelock); 4560 } 4561 4562 again: 4563 if ((pagefound = page_exists(vp, off)) == NULL) { 4564 if (pl == NULL) { 4565 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr, 4566 nfs3_readahead); 4567 } else if (rw == S_CREATE) { 4568 /* 4569 * Block for this page is not allocated, or the offset 4570 * is beyond the current allocation size, or we're 4571 * allocating a swap slot and the page was not found, 4572 * so allocate it and return a zero page. 4573 */ 4574 if ((pp = page_create_va(vp, off, 4575 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 4576 cmn_err(CE_PANIC, "nfs3_getapage: page_create"); 4577 io_len = PAGESIZE; 4578 mutex_enter(&rp->r_statelock); 4579 rp->r_nextr = off + PAGESIZE; 4580 mutex_exit(&rp->r_statelock); 4581 } else { 4582 /* 4583 * Need to go to server to get a BLOCK, exception to 4584 * that being while reading at offset = 0 or doing 4585 * random i/o, in that case read only a PAGE. 4586 */ 4587 mutex_enter(&rp->r_statelock); 4588 if (blkoff < rp->r_size && 4589 blkoff + bsize >= rp->r_size) { 4590 /* 4591 * If only a block or less is left in 4592 * the file, read all that is remaining. 4593 */ 4594 if (rp->r_size <= off) { 4595 /* 4596 * Trying to access beyond EOF, 4597 * set up to get at least one page. 4598 */ 4599 blksize = off + PAGESIZE - blkoff; 4600 } else 4601 blksize = rp->r_size - blkoff; 4602 } else if ((off == 0) || 4603 (off != rp->r_nextr && !readahead_issued)) { 4604 blksize = PAGESIZE; 4605 blkoff = off; /* block = page here */ 4606 } else 4607 blksize = bsize; 4608 mutex_exit(&rp->r_statelock); 4609 4610 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4611 &io_len, blkoff, blksize, 0); 4612 4613 /* 4614 * Some other thread has entered the page, 4615 * so just use it. 4616 */ 4617 if (pp == NULL) 4618 goto again; 4619 4620 /* 4621 * Now round the request size up to page boundaries. 4622 * This ensures that the entire page will be 4623 * initialized to zeroes if EOF is encountered. 4624 */ 4625 io_len = ptob(btopr(io_len)); 4626 4627 bp = pageio_setup(pp, io_len, vp, B_READ); 4628 ASSERT(bp != NULL); 4629 4630 /* 4631 * pageio_setup should have set b_addr to 0. This 4632 * is correct since we want to do I/O on a page 4633 * boundary. bp_mapin will use this addr to calculate 4634 * an offset, and then set b_addr to the kernel virtual 4635 * address it allocated for us. 
4636 */ 4637 ASSERT(bp->b_un.b_addr == 0); 4638 4639 bp->b_edev = 0; 4640 bp->b_dev = 0; 4641 bp->b_lblkno = lbtodb(io_off); 4642 bp->b_file = vp; 4643 bp->b_offset = (offset_t)off; 4644 bp_mapin(bp); 4645 4646 /* 4647 * If doing a write beyond what we believe is EOF, 4648 * don't bother trying to read the pages from the 4649 * server, we'll just zero the pages here. We 4650 * don't check that the rw flag is S_WRITE here 4651 * because some implementations may attempt a 4652 * read access to the buffer before copying data. 4653 */ 4654 mutex_enter(&rp->r_statelock); 4655 if (io_off >= rp->r_size && seg == segkmap) { 4656 mutex_exit(&rp->r_statelock); 4657 bzero(bp->b_un.b_addr, io_len); 4658 } else { 4659 mutex_exit(&rp->r_statelock); 4660 error = nfs3_bio(bp, NULL, cr); 4661 } 4662 4663 /* 4664 * Unmap the buffer before freeing it. 4665 */ 4666 bp_mapout(bp); 4667 pageio_done(bp); 4668 4669 savepp = pp; 4670 do { 4671 pp->p_fsdata = C_NOCOMMIT; 4672 } while ((pp = pp->p_next) != savepp); 4673 4674 if (error == NFS_EOF) { 4675 /* 4676 * If doing a write system call just return 4677 * zeroed pages, else user tried to get pages 4678 * beyond EOF, return error. We don't check 4679 * that the rw flag is S_WRITE here because 4680 * some implementations may attempt a read 4681 * access to the buffer before copying data. 4682 */ 4683 if (seg == segkmap) 4684 error = 0; 4685 else 4686 error = EFAULT; 4687 } 4688 4689 if (!readahead_issued && !error) { 4690 mutex_enter(&rp->r_statelock); 4691 rp->r_nextr = io_off + io_len; 4692 mutex_exit(&rp->r_statelock); 4693 } 4694 } 4695 } 4696 4697 out: 4698 if (pl == NULL) 4699 return (error); 4700 4701 if (error) { 4702 if (pp != NULL) 4703 pvn_read_done(pp, B_ERROR); 4704 return (error); 4705 } 4706 4707 if (pagefound) { 4708 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 4709 4710 /* 4711 * Page exists in the cache, acquire the appropriate lock. 4712 * If this fails, start all over again. 4713 */ 4714 if ((pp = page_lookup(vp, off, se)) == NULL) { 4715 #ifdef DEBUG 4716 nfs3_lostpage++; 4717 #endif 4718 goto reread; 4719 } 4720 pl[0] = pp; 4721 pl[1] = NULL; 4722 return (0); 4723 } 4724 4725 if (pp != NULL) 4726 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4727 4728 return (error); 4729 } 4730 4731 static void 4732 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 4733 cred_t *cr) 4734 { 4735 int error; 4736 page_t *pp; 4737 u_offset_t io_off; 4738 size_t io_len; 4739 struct buf *bp; 4740 uint_t bsize, blksize; 4741 rnode_t *rp = VTOR(vp); 4742 page_t *savepp; 4743 4744 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 4745 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4746 4747 mutex_enter(&rp->r_statelock); 4748 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 4749 /* 4750 * If less than a block left in file read less 4751 * than a block. 4752 */ 4753 blksize = rp->r_size - blkoff; 4754 } else 4755 blksize = bsize; 4756 mutex_exit(&rp->r_statelock); 4757 4758 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 4759 &io_off, &io_len, blkoff, blksize, 1); 4760 /* 4761 * The isra flag passed to the kluster function is 1, we may have 4762 * gotten a return value of NULL for a variety of reasons (# of free 4763 * pages < minfree, someone entered the page on the vnode etc). In all 4764 * cases, we want to punt on the readahead. 4765 */ 4766 if (pp == NULL) 4767 return; 4768 4769 /* 4770 * Now round the request size up to page boundaries. 
4771 * This ensures that the entire page will be 4772 * initialized to zeroes if EOF is encountered. 4773 */ 4774 io_len = ptob(btopr(io_len)); 4775 4776 bp = pageio_setup(pp, io_len, vp, B_READ); 4777 ASSERT(bp != NULL); 4778 4779 /* 4780 * pageio_setup should have set b_addr to 0. This is correct since 4781 * we want to do I/O on a page boundary. bp_mapin() will use this addr 4782 * to calculate an offset, and then set b_addr to the kernel virtual 4783 * address it allocated for us. 4784 */ 4785 ASSERT(bp->b_un.b_addr == 0); 4786 4787 bp->b_edev = 0; 4788 bp->b_dev = 0; 4789 bp->b_lblkno = lbtodb(io_off); 4790 bp->b_file = vp; 4791 bp->b_offset = (offset_t)blkoff; 4792 bp_mapin(bp); 4793 4794 /* 4795 * If doing a write beyond what we believe is EOF, don't bother trying 4796 * to read the pages from the server, we'll just zero the pages here. 4797 * We don't check that the rw flag is S_WRITE here because some 4798 * implementations may attempt a read access to the buffer before 4799 * copying data. 4800 */ 4801 mutex_enter(&rp->r_statelock); 4802 if (io_off >= rp->r_size && seg == segkmap) { 4803 mutex_exit(&rp->r_statelock); 4804 bzero(bp->b_un.b_addr, io_len); 4805 error = 0; 4806 } else { 4807 mutex_exit(&rp->r_statelock); 4808 error = nfs3_bio(bp, NULL, cr); 4809 if (error == NFS_EOF) 4810 error = 0; 4811 } 4812 4813 /* 4814 * Unmap the buffer before freeing it. 4815 */ 4816 bp_mapout(bp); 4817 pageio_done(bp); 4818 4819 savepp = pp; 4820 do { 4821 pp->p_fsdata = C_NOCOMMIT; 4822 } while ((pp = pp->p_next) != savepp); 4823 4824 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 4825 4826 /* 4827 * In case of error set readahead offset 4828 * to the lowest offset. 4829 * pvn_read_done() calls VN_DISPOSE to destroy the pages 4830 */ 4831 if (error && rp->r_nextr > io_off) { 4832 mutex_enter(&rp->r_statelock); 4833 if (rp->r_nextr > io_off) 4834 rp->r_nextr = io_off; 4835 mutex_exit(&rp->r_statelock); 4836 } 4837 } 4838 4839 /* 4840 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 4841 * If len == 0, do from off to EOF. 4842 * 4843 * The normal cases should be len == 0 && off == 0 (entire vp list), 4844 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 4845 * (from pageout). 4846 */ 4847 static int 4848 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 4849 { 4850 int error; 4851 rnode_t *rp; 4852 4853 ASSERT(cr != NULL); 4854 4855 /* 4856 * XXX - Why should this check be made here? 4857 */ 4858 if (vp->v_flag & VNOMAP) 4859 return (ENOSYS); 4860 if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp)) 4861 return (0); 4862 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 4863 return (EIO); 4864 4865 rp = VTOR(vp); 4866 mutex_enter(&rp->r_statelock); 4867 rp->r_count++; 4868 mutex_exit(&rp->r_statelock); 4869 error = nfs_putpages(vp, off, len, flags, cr); 4870 mutex_enter(&rp->r_statelock); 4871 rp->r_count--; 4872 cv_broadcast(&rp->r_cv); 4873 mutex_exit(&rp->r_statelock); 4874 4875 return (error); 4876 } 4877 4878 /* 4879 * Write out a single page, possibly klustering adjacent dirty pages. 
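* ("Klustering" here means coalescing the page with adjacent dirty pages
* into a single larger i/o; pvn_write_kluster() builds the cluster and
* the length is then trimmed back to one block below.)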
4880 */ 4881 int 4882 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 4883 int flags, cred_t *cr) 4884 { 4885 u_offset_t io_off; 4886 u_offset_t lbn_off; 4887 u_offset_t lbn; 4888 size_t io_len; 4889 uint_t bsize; 4890 int error; 4891 rnode_t *rp; 4892 4893 ASSERT(!vn_is_readonly(vp)); 4894 ASSERT(pp != NULL); 4895 ASSERT(cr != NULL); 4896 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone); 4897 4898 rp = VTOR(vp); 4899 ASSERT(rp->r_count > 0); 4900 4901 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 4902 lbn = pp->p_offset / bsize; 4903 lbn_off = lbn * bsize; 4904 4905 /* 4906 * Find a kluster that fits in one block, or in 4907 * one page if pages are bigger than blocks. If 4908 * there is less file space allocated than a whole 4909 * page, we'll shorten the i/o request below. 4910 */ 4911 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 4912 roundup(bsize, PAGESIZE), flags); 4913 4914 /* 4915 * pvn_write_kluster shouldn't have returned a page with offset 4916 * behind the original page we were given. Verify that. 4917 */ 4918 ASSERT((pp->p_offset / bsize) >= lbn); 4919 4920 /* 4921 * Now pp will have the list of kept dirty pages marked for 4922 * write back. It will also handle invalidation and freeing 4923 * of pages that are not dirty. Check for page length rounding 4924 * problems. 4925 */ 4926 if (io_off + io_len > lbn_off + bsize) { 4927 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 4928 io_len = lbn_off + bsize - io_off; 4929 } 4930 /* 4931 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4932 * consistent value of r_size. RMODINPROGRESS is set in writerp(). 4933 * When RMODINPROGRESS is set it indicates that a uiomove() is in 4934 * progress and the r_size has not been made consistent with the 4935 * new size of the file. When the uiomove() completes the r_size is 4936 * updated and the RMODINPROGRESS flag is cleared. 4937 * 4938 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a 4939 * consistent value of r_size. Without this handshaking, it is 4940 * possible that nfs(3)_bio() picks up the old value of r_size 4941 * before the uiomove() in writerp() completes. This will result 4942 * in the write through nfs(3)_bio() being dropped. 4943 * 4944 * More precisely, there is a window between the time the uiomove() 4945 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 4946 * operation intervenes in this window, the page will be picked up, 4947 * because it is dirty (it will be unlocked, unless it was 4948 * pagecreate'd). When the page is picked up as dirty, the dirty 4949 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is 4950 * checked. This will still be the old size. Therefore the page will 4951 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 4952 * the page will be found to be clean and the write will be dropped. 4953 */ 4954 if (rp->r_flags & RMODINPROGRESS) { 4955 mutex_enter(&rp->r_statelock); 4956 if ((rp->r_flags & RMODINPROGRESS) && 4957 rp->r_modaddr + MAXBSIZE > io_off && 4958 rp->r_modaddr < io_off + io_len) { 4959 page_t *plist; 4960 /* 4961 * A write is in progress for this region of the file. 4962 * If we did not detect RMODINPROGRESS here then this 4963 * path through nfs_putapage() would eventually go to 4964 * nfs(3)_bio() and may not write out all of the data 4965 * in the pages. We end up losing data. So we decide 4966 * to set the modified bit on each page in the page 4967 * list and mark the rnode with RDIRTY. 
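* (hat_setmod() below re-marks each page as dirty in the hat layer so
* that a later putpage pass will find and rewrite it.)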
This write 4968 * will be restarted at some later time. 4969 */ 4970 plist = pp; 4971 while (plist != NULL) { 4972 pp = plist; 4973 page_sub(&plist, pp); 4974 hat_setmod(pp); 4975 page_io_unlock(pp); 4976 page_unlock(pp); 4977 } 4978 rp->r_flags |= RDIRTY; 4979 mutex_exit(&rp->r_statelock); 4980 if (offp) 4981 *offp = io_off; 4982 if (lenp) 4983 *lenp = io_len; 4984 return (0); 4985 } 4986 mutex_exit(&rp->r_statelock); 4987 } 4988 4989 if (flags & B_ASYNC) { 4990 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr, 4991 nfs3_sync_putapage); 4992 } else 4993 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr); 4994 4995 if (offp) 4996 *offp = io_off; 4997 if (lenp) 4998 *lenp = io_len; 4999 return (error); 5000 } 5001 5002 static int 5003 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5004 int flags, cred_t *cr) 5005 { 5006 int error; 5007 rnode_t *rp; 5008 5009 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5010 5011 flags |= B_WRITE; 5012 5013 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5014 5015 rp = VTOR(vp); 5016 5017 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 5018 error == EACCES) && 5019 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 5020 if (!(rp->r_flags & ROUTOFSPACE)) { 5021 mutex_enter(&rp->r_statelock); 5022 rp->r_flags |= ROUTOFSPACE; 5023 mutex_exit(&rp->r_statelock); 5024 } 5025 flags |= B_ERROR; 5026 pvn_write_done(pp, flags); 5027 /* 5028 * If this was not an async thread, then try again to 5029 * write out the pages, but this time, also destroy 5030 * them whether or not the write is successful. This 5031 * will prevent memory from filling up with these 5032 * pages and destroying them is the only alternative 5033 * if they can't be written out. 5034 * 5035 * Don't do this if this is an async thread because 5036 * when the pages are unlocked in pvn_write_done, 5037 * some other thread could have come along, locked 5038 * them, and queued for an async thread. It would be 5039 * possible for all of the async threads to be tied 5040 * up waiting to lock the pages again and they would 5041 * all already be locked and waiting for an async 5042 * thread to handle them. Deadlock. 5043 */ 5044 if (!(flags & B_ASYNC)) { 5045 error = nfs3_putpage(vp, io_off, io_len, 5046 B_INVAL | B_FORCE, cr); 5047 } 5048 } else { 5049 if (error) 5050 flags |= B_ERROR; 5051 else if (rp->r_flags & ROUTOFSPACE) { 5052 mutex_enter(&rp->r_statelock); 5053 rp->r_flags &= ~ROUTOFSPACE; 5054 mutex_exit(&rp->r_statelock); 5055 } 5056 pvn_write_done(pp, flags); 5057 if (freemem < desfree) 5058 (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr); 5059 } 5060 5061 return (error); 5062 } 5063 5064 static int 5065 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 5066 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 5067 { 5068 struct segvn_crargs vn_a; 5069 int error; 5070 rnode_t *rp; 5071 struct vattr va; 5072 5073 if (nfs_zone() != VTOMI(vp)->mi_zone) 5074 return (EIO); 5075 5076 if (vp->v_flag & VNOMAP) 5077 return (ENOSYS); 5078 5079 if (off < 0 || off + len < 0) 5080 return (ENXIO); 5081 5082 if (vp->v_type != VREG) 5083 return (ENODEV); 5084 5085 /* 5086 * If there is cached data and if close-to-open consistency 5087 * checking is not turned off and if the file system is not 5088 * mounted readonly, then force an over the wire getattr. 5089 * Otherwise, just invoke nfs3getattr to get a copy of the 5090 * attributes. 
The attribute cache will be used unless it 5091 * has timed out, in which case an over the wire getattr 5092 * will be issued. 5093 */ 5094 va.va_mask = AT_ALL; 5095 if (vn_has_cached_data(vp) && 5096 !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp)) 5097 error = nfs3_getattr_otw(vp, &va, cr); 5098 else 5099 error = nfs3getattr(vp, &va, cr); 5100 if (error) 5101 return (error); 5102 5103 /* 5104 * Check to see if the vnode is currently marked as not cachable. 5105 * This means portions of the file are locked (through VOP_FRLOCK). 5106 * In this case the map request must be refused. We use 5107 * rp->r_lkserlock to avoid a race with concurrent lock requests. 5108 */ 5109 rp = VTOR(vp); 5110 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) 5111 return (EINTR); 5112 5113 if (vp->v_flag & VNOCACHE) { 5114 error = EAGAIN; 5115 goto done; 5116 } 5117 5118 /* 5119 * Don't allow concurrent locks and mapping if mandatory locking is 5120 * enabled. 5121 */ 5122 if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) && 5123 MANDLOCK(vp, va.va_mode)) { 5124 error = EAGAIN; 5125 goto done; 5126 } 5127 5128 as_rangelock(as); 5129 if (!(flags & MAP_FIXED)) { 5130 map_addr(addrp, len, off, 1, flags); 5131 if (*addrp == NULL) { 5132 as_rangeunlock(as); 5133 error = ENOMEM; 5134 goto done; 5135 } 5136 } else { 5137 /* 5138 * User-specified address; blow away any previous mappings. 5139 */ 5140 (void) as_unmap(as, *addrp, len); 5141 } 5142 5143 vn_a.vp = vp; 5144 vn_a.offset = off; 5145 vn_a.type = (flags & MAP_TYPE); 5146 vn_a.prot = (uchar_t)prot; 5147 vn_a.maxprot = (uchar_t)maxprot; 5148 vn_a.flags = (flags & ~MAP_TYPE); 5149 vn_a.cred = cr; 5150 vn_a.amp = NULL; 5151 vn_a.szc = 0; 5152 vn_a.lgrp_mem_policy_flags = 0; 5153 5154 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5155 as_rangeunlock(as); 5156 5157 done: 5158 nfs_rw_exit(&rp->r_lkserlock); 5159 return (error); 5160 } 5161 5162 /* ARGSUSED */ 5163 static int 5164 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5165 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 5166 { 5167 rnode_t *rp; 5168 5169 if (vp->v_flag & VNOMAP) 5170 return (ENOSYS); 5171 if (nfs_zone() != VTOMI(vp)->mi_zone) 5172 return (EIO); 5173 5174 /* 5175 * Need to hold rwlock while incrementing the mapcnt so that 5176 * mmap'ing can be serialized with writes, which allows the caching 5177 * to be handled correctly. 5178 */ 5179 rp = VTOR(vp); 5180 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 5181 return (EINTR); 5182 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 5183 nfs_rw_exit(&rp->r_rwlock); 5184 5185 return (0); 5186 } 5187 5188 static int 5189 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 5190 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 5191 { 5192 netobj lm_fh3; 5193 int rc; 5194 u_offset_t start, end; 5195 rnode_t *rp; 5196 int error = 0, intr = INTR(vp); 5197 5198 if (nfs_zone() != VTOMI(vp)->mi_zone) 5199 return (EIO); 5200 /* check for valid cmd parameter */ 5201 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 5202 return (EINVAL); 5203 5204 /* Verify l_type.
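* (F_UNLCK also clears intr below: an unlock is made non-interruptible,
* presumably so that a half-completed unlock cannot leave an orphaned
* lock on the server.)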
*/ 5205 switch (bfp->l_type) { 5206 case F_RDLCK: 5207 if (cmd != F_GETLK && !(flag & FREAD)) 5208 return (EBADF); 5209 break; 5210 case F_WRLCK: 5211 if (cmd != F_GETLK && !(flag & FWRITE)) 5212 return (EBADF); 5213 break; 5214 case F_UNLCK: 5215 intr = 0; 5216 break; 5217 5218 default: 5219 return (EINVAL); 5220 } 5221 5222 /* check the validity of the lock range */ 5223 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 5224 return (rc); 5225 if (rc = flk_check_lock_data(start, end, MAXEND)) 5226 return (rc); 5227 5228 /* 5229 * If the filesystem is mounted using local locking, pass the 5230 * request off to the local locking code. 5231 */ 5232 if (VTOMI(vp)->mi_flags & MI_LLOCK) { 5233 if (cmd == F_SETLK || cmd == F_SETLKW) { 5234 /* 5235 * For complete safety, we should be holding 5236 * r_lkserlock. However, we can't call 5237 * lm_safelock and then fs_frlock while 5238 * holding r_lkserlock, so just invoke 5239 * lm_safelock and expect that this will 5240 * catch enough of the cases. 5241 */ 5242 if (!lm_safelock(vp, bfp, cr)) 5243 return (EAGAIN); 5244 } 5245 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 5246 } 5247 5248 rp = VTOR(vp); 5249 5250 /* 5251 * Check whether the given lock request can proceed, given the 5252 * current file mappings. 5253 */ 5254 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 5255 return (EINTR); 5256 if (cmd == F_SETLK || cmd == F_SETLKW) { 5257 if (!lm_safelock(vp, bfp, cr)) { 5258 rc = EAGAIN; 5259 goto done; 5260 } 5261 } 5262 5263 /* 5264 * Flush the cache after waiting for async I/O to finish. For new 5265 * locks, this is so that the process gets the latest bits from the 5266 * server. For unlocks, this is so that other clients see the 5267 * latest bits once the file has been unlocked. If currently dirty 5268 * pages can't be flushed, then don't allow a lock to be set. But 5269 * allow unlocks to succeed, to avoid having orphan locks on the 5270 * server. 5271 */ 5272 if (cmd != F_GETLK) { 5273 mutex_enter(&rp->r_statelock); 5274 while (rp->r_count > 0) { 5275 if (intr) { 5276 klwp_t *lwp = ttolwp(curthread); 5277 5278 if (lwp != NULL) 5279 lwp->lwp_nostop++; 5280 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 5281 if (lwp != NULL) 5282 lwp->lwp_nostop--; 5283 rc = EINTR; 5284 break; 5285 } 5286 if (lwp != NULL) 5287 lwp->lwp_nostop--; 5288 } else 5289 cv_wait(&rp->r_cv, &rp->r_statelock); 5290 } 5291 mutex_exit(&rp->r_statelock); 5292 if (rc != 0) 5293 goto done; 5294 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 5295 if (error) { 5296 if (error == ENOSPC || error == EDQUOT) { 5297 mutex_enter(&rp->r_statelock); 5298 if (!rp->r_error) 5299 rp->r_error = error; 5300 mutex_exit(&rp->r_statelock); 5301 } 5302 if (bfp->l_type != F_UNLCK) { 5303 rc = ENOLCK; 5304 goto done; 5305 } 5306 } 5307 } 5308 5309 lm_fh3.n_len = VTOFH3(vp)->fh3_length; 5310 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data); 5311 5312 /* 5313 * Call the lock manager to do the real work of contacting 5314 * the server and obtaining the lock. 5315 */ 5316 rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp); 5317 5318 if (rc == 0) 5319 nfs_lockcompletion(vp, cmd); 5320 5321 done: 5322 nfs_rw_exit(&rp->r_lkserlock); 5323 return (rc); 5324 } 5325 5326 /* 5327 * Free storage space associated with the specified vnode. The portion 5328 * to be freed is specified by bfp->l_start and bfp->l_len (already 5329 * normalized to a "whence" of 0). 
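* For example, an ftruncate() to 8K arrives here as F_FREESP with
* l_start == 8192 and l_len == 0, and is carried out below as a
* setattr of the file size.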
5330 * 5331 * This is an experimental facility whose continued existence is not 5332 * guaranteed. Currently, we only support the special case 5333 * of l_len == 0, meaning free to end of file. 5334 */ 5335 /* ARGSUSED */ 5336 static int 5337 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 5338 offset_t offset, cred_t *cr, caller_context_t *ct) 5339 { 5340 int error; 5341 5342 ASSERT(vp->v_type == VREG); 5343 if (cmd != F_FREESP) 5344 return (EINVAL); 5345 if (nfs_zone() != VTOMI(vp)->mi_zone) 5346 return (EIO); 5347 5348 error = convoff(vp, bfp, 0, offset); 5349 if (!error) { 5350 ASSERT(bfp->l_start >= 0); 5351 if (bfp->l_len == 0) { 5352 struct vattr va; 5353 5354 /* 5355 * ftruncate should not change the ctime and 5356 * mtime if we truncate the file to its 5357 * previous size. 5358 */ 5359 va.va_mask = AT_SIZE; 5360 error = nfs3getattr(vp, &va, cr); 5361 if (error || va.va_size == bfp->l_start) 5362 return (error); 5363 va.va_mask = AT_SIZE; 5364 va.va_size = bfp->l_start; 5365 error = nfs3setattr(vp, &va, 0, cr); 5366 } else 5367 error = EINVAL; 5368 } 5369 5370 return (error); 5371 } 5372 5373 /* ARGSUSED */ 5374 static int 5375 nfs3_realvp(vnode_t *vp, vnode_t **vpp) 5376 { 5377 5378 return (EINVAL); 5379 } 5380 5381 /* 5382 * Setup and add an address space callback to do the work of the delmap call. 5383 * The callback will (and must be) deleted in the actual callback function. 5384 * 5385 * This is done in order to take care of the problem that we have with holding 5386 * the address space's a_lock for a long period of time (e.g. if the NFS server 5387 * is down). Callbacks will be executed in the address space code while the 5388 * a_lock is not held. Holding the address space's a_lock causes things such 5389 * as ps and fork to hang because they are trying to acquire this lock as well. 5390 */ 5391 /* ARGSUSED */ 5392 static int 5393 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5394 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 5395 { 5396 int caller_found; 5397 int error; 5398 rnode_t *rp; 5399 nfs_delmap_args_t *dmapp; 5400 nfs_delmapcall_t *delmap_call; 5401 5402 if (vp->v_flag & VNOMAP) 5403 return (ENOSYS); 5404 /* 5405 * A process may not change zones if it has NFS pages mmap'ed 5406 * in, so we can't legitimately get here from the wrong zone. 5407 */ 5408 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5409 5410 rp = VTOR(vp); 5411 5412 /* 5413 * The way that the address space of this process deletes its mapping 5414 * of this file is via the following call chains: 5415 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap() 5416 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap() 5417 * 5418 * With the use of address space callbacks we are allowed to drop the 5419 * address space lock, a_lock, while executing the NFS operations that 5420 * need to go over the wire. Returning EAGAIN to the caller of this 5421 * function is what drives the execution of the callback that we add 5422 * below. The callback will be executed by the address space code 5423 * after dropping the a_lock. When the callback is finished, since 5424 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 5425 * is called again on the same segment to finish the rest of the work 5426 * that needs to happen during unmapping. 
5427 * 5428 * This action of calling back into the segment driver causes 5429 * nfs3_delmap() to get called again, but since the callback was 5430 * already executed at this point, it already did the work and there 5431 * is nothing left for us to do. 5432 * 5433 * To Summarize: 5434 * - The first time nfs3_delmap is called by the current thread is when 5435 * we add the caller associated with this delmap to the delmap caller 5436 * list, add the callback, and return EAGAIN. 5437 * - The second time in this call chain when nfs3_delmap is called we 5438 * will find this caller in the delmap caller list and realize there 5439 * is no more work to do thus removing this caller from the list and 5440 * returning the error that was set in the callback execution. 5441 */ 5442 caller_found = nfs_find_and_delete_delmapcall(rp, &error); 5443 if (caller_found) { 5444 /* 5445 * 'error' is from the actual delmap operations. To avoid 5446 * hangs, we need to handle the return of EAGAIN differently 5447 * since this is what drives the callback execution. 5448 * In this case, we don't want to return EAGAIN and do the 5449 * callback execution because there are none to execute. 5450 */ 5451 if (error == EAGAIN) 5452 return (0); 5453 else 5454 return (error); 5455 } 5456 5457 /* current caller was not in the list */ 5458 delmap_call = nfs_init_delmapcall(); 5459 5460 mutex_enter(&rp->r_statelock); 5461 list_insert_tail(&rp->r_indelmap, delmap_call); 5462 mutex_exit(&rp->r_statelock); 5463 5464 dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP); 5465 5466 dmapp->vp = vp; 5467 dmapp->off = off; 5468 dmapp->addr = addr; 5469 dmapp->len = len; 5470 dmapp->prot = prot; 5471 dmapp->maxprot = maxprot; 5472 dmapp->flags = flags; 5473 dmapp->cr = cr; 5474 dmapp->caller = delmap_call; 5475 5476 error = as_add_callback(as, nfs3_delmap_callback, dmapp, 5477 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 5478 5479 return (error ? error : EAGAIN); 5480 } 5481 5482 /* 5483 * Remove some pages from an mmap'd vnode. Just update the 5484 * count of pages. If doing close-to-open, then flush and 5485 * commit all of the pages associated with this file. 5486 * Otherwise, start an asynchronous page flush to write out 5487 * any dirty pages. This will also associate a credential 5488 * with the rnode which can be used to write the pages. 5489 */ 5490 /* ARGSUSED */ 5491 static void 5492 nfs3_delmap_callback(struct as *as, void *arg, uint_t event) 5493 { 5494 int error; 5495 rnode_t *rp; 5496 mntinfo_t *mi; 5497 nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg; 5498 5499 rp = VTOR(dmapp->vp); 5500 mi = VTOMI(dmapp->vp); 5501 5502 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 5503 ASSERT(rp->r_mapcnt >= 0); 5504 5505 /* 5506 * Initiate a page flush and potential commit if there are 5507 * pages, the file system was not mounted readonly, the segment 5508 * was mapped shared, and the pages themselves were writeable. 5509 */ 5510 if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) && 5511 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 5512 mutex_enter(&rp->r_statelock); 5513 rp->r_flags |= RDIRTY; 5514 mutex_exit(&rp->r_statelock); 5515 /* 5516 * If this is a cross-zone access a sync putpage won't work, so 5517 * the best we can do is try an async putpage. That seems 5518 * better than something more draconian such as discarding the 5519 * dirty pages. 
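* The async path is also taken when close-to-open consistency is
* disabled (MI_NOCTO), since no flush-and-commit is required on
* unmap in that case.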
5520 */ 5521 if ((mi->mi_flags & MI_NOCTO) || 5522 nfs_zone() != mi->mi_zone) 5523 error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len, 5524 B_ASYNC, dmapp->cr); 5525 else 5526 error = nfs3_putpage_commit(dmapp->vp, dmapp->off, 5527 dmapp->len, dmapp->cr); 5528 if (!error) { 5529 mutex_enter(&rp->r_statelock); 5530 error = rp->r_error; 5531 rp->r_error = 0; 5532 mutex_exit(&rp->r_statelock); 5533 } 5534 } else 5535 error = 0; 5536 5537 if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) 5538 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len, 5539 B_INVAL, dmapp->cr); 5540 5541 dmapp->caller->error = error; 5542 (void) as_delete_callback(as, arg); 5543 kmem_free(dmapp, sizeof (nfs_delmap_args_t)); 5544 } 5545 5546 static int nfs3_pathconf_disable_cache = 0; 5547 5548 #ifdef DEBUG 5549 static int nfs3_pathconf_cache_hits = 0; 5550 static int nfs3_pathconf_cache_misses = 0; 5551 #endif 5552 5553 static int 5554 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 5555 { 5556 int error; 5557 PATHCONF3args args; 5558 PATHCONF3res res; 5559 int douprintf; 5560 failinfo_t fi; 5561 rnode_t *rp; 5562 hrtime_t t; 5563 5564 if (nfs_zone() != VTOMI(vp)->mi_zone) 5565 return (EIO); 5566 /* 5567 * Large file spec - need to base answer on info stored 5568 * on original FSINFO response. 5569 */ 5570 if (cmd == _PC_FILESIZEBITS) { 5571 unsigned long long ll; 5572 long l = 1; 5573 5574 ll = VTOMI(vp)->mi_maxfilesize; 5575 5576 if (ll == 0) { 5577 *valp = 0; 5578 return (0); 5579 } 5580 5581 if (ll & 0xffffffff00000000) { 5582 l += 32; ll >>= 32; 5583 } 5584 if (ll & 0xffff0000) { 5585 l += 16; ll >>= 16; 5586 } 5587 if (ll & 0xff00) { 5588 l += 8; ll >>= 8; 5589 } 5590 if (ll & 0xf0) { 5591 l += 4; ll >>= 4; 5592 } 5593 if (ll & 0xc) { 5594 l += 2; ll >>= 2; 5595 } 5596 if (ll & 0x2) 5597 l += 2; 5598 else if (ll & 0x1) 5599 l += 1; 5600 *valp = l; 5601 return (0); 5602 } 5603 5604 if (cmd == _PC_ACL_ENABLED) { 5605 *valp = _ACL_ACLENT_ENABLED; 5606 return (0); 5607 } 5608 5609 if (cmd == _PC_XATTR_EXISTS) { 5610 error = 0; 5611 *valp = 0; 5612 if (vp->v_vfsp->vfs_flag & VFS_XATTR) { 5613 vnode_t *avp; 5614 rnode_t *rp; 5615 int error = 0; 5616 mntinfo_t *mi = VTOMI(vp); 5617 5618 if (!(mi->mi_flags & MI_EXTATTR)) 5619 return (0); 5620 5621 rp = VTOR(vp); 5622 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, 5623 INTR(vp))) 5624 return (EINTR); 5625 5626 error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr); 5627 if (error || avp == NULL) 5628 error = acl_getxattrdir3(vp, &avp, 0, cr, 0); 5629 5630 nfs_rw_exit(&rp->r_rwlock); 5631 5632 if (error == 0 && avp != NULL) { 5633 VN_RELE(avp); 5634 *valp = 1; 5635 } else if (error == ENOENT) 5636 error = 0; 5637 } 5638 return (error); 5639 } 5640 5641 rp = VTOR(vp); 5642 if (rp->r_pathconf != NULL) { 5643 mutex_enter(&rp->r_statelock); 5644 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) { 5645 kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf)); 5646 rp->r_pathconf = NULL; 5647 } 5648 if (rp->r_pathconf != NULL) { 5649 error = 0; 5650 switch (cmd) { 5651 case _PC_LINK_MAX: 5652 *valp = rp->r_pathconf->link_max; 5653 break; 5654 case _PC_NAME_MAX: 5655 *valp = rp->r_pathconf->name_max; 5656 break; 5657 case _PC_PATH_MAX: 5658 case _PC_SYMLINK_MAX: 5659 *valp = MAXPATHLEN; 5660 break; 5661 case _PC_CHOWN_RESTRICTED: 5662 *valp = rp->r_pathconf->chown_restricted; 5663 break; 5664 case _PC_NO_TRUNC: 5665 *valp = rp->r_pathconf->no_trunc; 5666 break; 5667 default: 5668 error = EINVAL; 5669 break; 5670 } 5671 
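/* Cache hit: answered from r_pathconf without an over the wire call. */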
mutex_exit(&rp->r_statelock); 5672 #ifdef DEBUG 5673 nfs3_pathconf_cache_hits++; 5674 #endif 5675 return (error); 5676 } 5677 mutex_exit(&rp->r_statelock); 5678 } 5679 #ifdef DEBUG 5680 nfs3_pathconf_cache_misses++; 5681 #endif 5682 5683 args.object = *VTOFH3(vp); 5684 fi.vp = vp; 5685 fi.fhp = (caddr_t)&args.object; 5686 fi.copyproc = nfs3copyfh; 5687 fi.lookupproc = nfs3lookup; 5688 fi.xattrdirproc = acl_getxattrdir3; 5689 5690 douprintf = 1; 5691 5692 t = gethrtime(); 5693 5694 error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF, 5695 xdr_nfs_fh3, (caddr_t)&args, 5696 xdr_PATHCONF3res, (caddr_t)&res, cr, 5697 &douprintf, &res.status, 0, &fi); 5698 5699 if (error) 5700 return (error); 5701 5702 error = geterrno3(res.status); 5703 5704 if (!error) { 5705 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr); 5706 if (!nfs3_pathconf_disable_cache) { 5707 mutex_enter(&rp->r_statelock); 5708 if (rp->r_pathconf == NULL) { 5709 rp->r_pathconf = kmem_alloc( 5710 sizeof (*rp->r_pathconf), KM_NOSLEEP); 5711 if (rp->r_pathconf != NULL) 5712 *rp->r_pathconf = res.resok.info; 5713 } 5714 mutex_exit(&rp->r_statelock); 5715 } 5716 switch (cmd) { 5717 case _PC_LINK_MAX: 5718 *valp = res.resok.info.link_max; 5719 break; 5720 case _PC_NAME_MAX: 5721 *valp = res.resok.info.name_max; 5722 break; 5723 case _PC_PATH_MAX: 5724 case _PC_SYMLINK_MAX: 5725 *valp = MAXPATHLEN; 5726 break; 5727 case _PC_CHOWN_RESTRICTED: 5728 *valp = res.resok.info.chown_restricted; 5729 break; 5730 case _PC_NO_TRUNC: 5731 *valp = res.resok.info.no_trunc; 5732 break; 5733 default: 5734 return (EINVAL); 5735 } 5736 } else { 5737 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr); 5738 PURGE_STALE_FH(error, vp, cr); 5739 } 5740 5741 return (error); 5742 } 5743 5744 /* 5745 * Called by async thread to do synchronous pageio. Do the i/o, wait 5746 * for it to complete, and cleanup the page list when done. 5747 */ 5748 static int 5749 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5750 int flags, cred_t *cr) 5751 { 5752 int error; 5753 5754 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 5755 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5756 if (flags & B_READ) 5757 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 5758 else 5759 pvn_write_done(pp, (error ? B_ERROR : 0) | flags); 5760 return (error); 5761 } 5762 5763 static int 5764 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 5765 int flags, cred_t *cr) 5766 { 5767 int error; 5768 rnode_t *rp; 5769 5770 if (pp == NULL) 5771 return (EINVAL); 5772 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone) 5773 return (EIO); 5774 5775 rp = VTOR(vp); 5776 mutex_enter(&rp->r_statelock); 5777 rp->r_count++; 5778 mutex_exit(&rp->r_statelock); 5779 5780 if (flags & B_ASYNC) { 5781 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr, 5782 nfs3_sync_pageio); 5783 } else 5784 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 5785 mutex_enter(&rp->r_statelock); 5786 rp->r_count--; 5787 cv_broadcast(&rp->r_cv); 5788 mutex_exit(&rp->r_statelock); 5789 return (error); 5790 } 5791 5792 static void 5793 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) 5794 { 5795 int error; 5796 rnode_t *rp; 5797 page_t *plist; 5798 page_t *pptr; 5799 offset3 offset; 5800 count3 len; 5801 k_sigset_t smask; 5802 5803 /* 5804 * We should get called with fl equal to either B_FREE or 5805 * B_INVAL. Any other value is illegal. 
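* (fl comes from the VM layer: B_FREE when a clean page is headed for
* the freelist, B_INVAL when the page is being destroyed outright.)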
5806 * 5807 * The page that we are either supposed to free or destroy 5808 * should be exclusively locked and its io lock should not 5809 * be held. 5810 */ 5811 ASSERT(fl == B_FREE || fl == B_INVAL); 5812 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 5813 rp = VTOR(vp); 5814 5815 /* 5816 * If the page doesn't need to be committed or we shouldn't 5817 * even bother attempting to commit it, then just make sure 5818 * that the p_fsdata byte is clear and then either free or 5819 * destroy the page as appropriate. 5820 */ 5821 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) { 5822 pp->p_fsdata = C_NOCOMMIT; 5823 if (fl == B_FREE) 5824 page_free(pp, dn); 5825 else 5826 page_destroy(pp, dn); 5827 return; 5828 } 5829 5830 /* 5831 * If there is a page invalidation operation going on, then 5832 * if this is one of the pages being destroyed, then just 5833 * clear the p_fsdata byte and then either free or destroy 5834 * the page as appropriate. 5835 */ 5836 mutex_enter(&rp->r_statelock); 5837 if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 5838 mutex_exit(&rp->r_statelock); 5839 pp->p_fsdata = C_NOCOMMIT; 5840 if (fl == B_FREE) 5841 page_free(pp, dn); 5842 else 5843 page_destroy(pp, dn); 5844 return; 5845 } 5846 5847 /* 5848 * If we are freeing this page and someone else is already 5849 * waiting to do a commit, then just unlock the page and 5850 * return. That other thread will take care of committing 5851 * this page. The page can be freed sometime after the 5852 * commit has finished. Otherwise, if the page is marked 5853 * as delay commit, then we may be getting called from 5854 * pvn_write_done, one page at a time. This could result 5855 * in one commit per page, so we end up doing lots of small 5856 * commits instead of fewer larger commits. This is bad; 5857 * we want to do as few commits as possible. 5858 */ 5859 if (fl == B_FREE) { 5860 if (rp->r_flags & RCOMMITWAIT) { 5861 page_unlock(pp); 5862 mutex_exit(&rp->r_statelock); 5863 return; 5864 } 5865 if (pp->p_fsdata == C_DELAYCOMMIT) { 5866 pp->p_fsdata = C_COMMIT; 5867 page_unlock(pp); 5868 mutex_exit(&rp->r_statelock); 5869 return; 5870 } 5871 } 5872 5873 /* 5874 * Check to see if there is a signal which would prevent an 5875 * attempt to commit the pages from being successful. If so, 5876 * then don't bother with all of the work to gather pages and 5877 * generate the unsuccessful RPC. Just return from here and 5878 * let the page be committed at some later time. 5879 */ 5880 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 5881 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 5882 sigunintr(&smask); 5883 page_unlock(pp); 5884 mutex_exit(&rp->r_statelock); 5885 return; 5886 } 5887 sigunintr(&smask); 5888 5889 /* 5890 * We are starting to need to commit pages, so let's try 5891 * to commit as many as possible at once to reduce the 5892 * overhead. 5893 * 5894 * Set the `commit inprogress' state bit. We must 5895 * first wait until any current one finishes. Then 5896 * we initialize the c_pages list with this page.
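* (c_commbase/c_commlen start out covering just this page, PAGESIZE
* bytes at pp->p_offset; nfs3_get_commit() below grows them to span
* every page it adds to the list.)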
5897 */ 5898 while (rp->r_flags & RCOMMIT) { 5899 rp->r_flags |= RCOMMITWAIT; 5900 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 5901 rp->r_flags &= ~RCOMMITWAIT; 5902 } 5903 rp->r_flags |= RCOMMIT; 5904 mutex_exit(&rp->r_statelock); 5905 ASSERT(rp->r_commit.c_pages == NULL); 5906 rp->r_commit.c_pages = pp; 5907 rp->r_commit.c_commbase = (offset3)pp->p_offset; 5908 rp->r_commit.c_commlen = PAGESIZE; 5909 5910 /* 5911 * Gather together all other pages which can be committed. 5912 * They will all be chained off r_commit.c_pages. 5913 */ 5914 nfs3_get_commit(vp); 5915 5916 /* 5917 * Clear the `commit inprogress' status and disconnect 5918 * the list of pages to be committed from the rnode. 5919 * At this same time, we also save the starting offset 5920 * and length of data to be committed on the server. 5921 */ 5922 plist = rp->r_commit.c_pages; 5923 rp->r_commit.c_pages = NULL; 5924 offset = rp->r_commit.c_commbase; 5925 len = rp->r_commit.c_commlen; 5926 mutex_enter(&rp->r_statelock); 5927 rp->r_flags &= ~RCOMMIT; 5928 cv_broadcast(&rp->r_commit.c_cv); 5929 mutex_exit(&rp->r_statelock); 5930 5931 if (curproc == proc_pageout || curproc == proc_fsflush || 5932 nfs_zone() != VTOMI(vp)->mi_zone) { 5933 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit); 5934 return; 5935 } 5936 5937 /* 5938 * Actually generate the COMMIT3 over the wire operation. 5939 */ 5940 error = nfs3_commit(vp, offset, len, cr); 5941 5942 /* 5943 * If we got an error during the commit, just unlock all 5944 * of the pages. The pages will get retransmitted to the 5945 * server during a putpage operation. 5946 */ 5947 if (error) { 5948 while (plist != NULL) { 5949 pptr = plist; 5950 page_sub(&plist, pptr); 5951 page_unlock(pptr); 5952 } 5953 return; 5954 } 5955 5956 /* 5957 * We've tried as hard as we can to commit the data to stable 5958 * storage on the server. We release the rest of the pages 5959 * and clear the commit required state. They will be put 5960 * onto the tail of the cachelist if they are no longer 5961 * mapped. 5962 */ 5963 while (plist != pp) { 5964 pptr = plist; 5965 page_sub(&plist, pptr); 5966 pptr->p_fsdata = C_NOCOMMIT; 5967 (void) page_release(pptr, 1); 5968 } 5969 5970 /* 5971 * It is possible that nfs3_commit didn't return an error, but 5972 * some other thread has modified the page we are going 5973 * to free/destroy. 5974 * In this case we need to rewrite the page. Do an explicit check 5975 * before attempting to free/destroy the page; if it has been 5976 * modified, it needs to be rewritten, so unlock the page and return. 5977 */ 5978 if (hat_ismod(pp)) { 5979 pp->p_fsdata = C_NOCOMMIT; 5980 page_unlock(pp); 5981 return; 5982 } 5983 5984 /* 5985 * Now, as appropriate, either free or destroy the page 5986 * that we were called with.
5987 */ 5988 pp->p_fsdata = C_NOCOMMIT; 5989 if (fl == B_FREE) 5990 page_free(pp, dn); 5991 else 5992 page_destroy(pp, dn); 5993 } 5994 5995 static int 5996 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr) 5997 { 5998 int error; 5999 rnode_t *rp; 6000 COMMIT3args args; 6001 COMMIT3res res; 6002 int douprintf; 6003 cred_t *cred; 6004 6005 rp = VTOR(vp); 6006 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6007 6008 mutex_enter(&rp->r_statelock); 6009 if (rp->r_cred != NULL) { 6010 cred = rp->r_cred; 6011 crhold(cred); 6012 } else { 6013 rp->r_cred = cr; 6014 crhold(cr); 6015 cred = cr; 6016 crhold(cred); 6017 } 6018 mutex_exit(&rp->r_statelock); 6019 6020 args.file = *VTOFH3(vp); 6021 args.offset = offset; 6022 args.count = count; 6023 6024 doitagain: 6025 douprintf = 1; 6026 error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT, 6027 xdr_COMMIT3args, (caddr_t)&args, 6028 xdr_COMMIT3res, (caddr_t)&res, cred, 6029 &douprintf, &res.status, 0, NULL); 6030 6031 crfree(cred); 6032 6033 if (error) 6034 return (error); 6035 6036 error = geterrno3(res.status); 6037 if (!error) { 6038 ASSERT(rp->r_flags & RHAVEVERF); 6039 mutex_enter(&rp->r_statelock); 6040 if (rp->r_verf == res.resok.verf) { 6041 mutex_exit(&rp->r_statelock); 6042 return (0); 6043 } 6044 nfs3_set_mod(vp); 6045 rp->r_verf = res.resok.verf; 6046 mutex_exit(&rp->r_statelock); 6047 error = NFS_VERF_MISMATCH; 6048 } else { 6049 if (error == EACCES) { 6050 mutex_enter(&rp->r_statelock); 6051 if (cred != cr) { 6052 if (rp->r_cred != NULL) 6053 crfree(rp->r_cred); 6054 rp->r_cred = cr; 6055 crhold(cr); 6056 cred = cr; 6057 crhold(cred); 6058 mutex_exit(&rp->r_statelock); 6059 goto doitagain; 6060 } 6061 mutex_exit(&rp->r_statelock); 6062 } 6063 /* 6064 * Can't do a PURGE_STALE_FH here because this 6065 * can cause a deadlock. nfs3_commit can 6066 * be called from nfs3_dispose which can be called 6067 * indirectly via pvn_vplist_dirty. PURGE_STALE_FH 6068 * can call back to pvn_vplist_dirty. 6069 */ 6070 if (error == ESTALE) { 6071 mutex_enter(&rp->r_statelock); 6072 rp->r_flags |= RSTALE; 6073 if (!rp->r_error) 6074 rp->r_error = error; 6075 mutex_exit(&rp->r_statelock); 6076 PURGE_ATTRCACHE(vp); 6077 } else { 6078 mutex_enter(&rp->r_statelock); 6079 if (!rp->r_error) 6080 rp->r_error = error; 6081 mutex_exit(&rp->r_statelock); 6082 } 6083 } 6084 6085 return (error); 6086 } 6087 6088 static void 6089 nfs3_set_mod(vnode_t *vp) 6090 { 6091 page_t *pp; 6092 kmutex_t *vphm; 6093 6094 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6095 vphm = page_vnode_mutex(vp); 6096 mutex_enter(vphm); 6097 if ((pp = vp->v_pages) != NULL) { 6098 do { 6099 if (pp->p_fsdata != C_NOCOMMIT) { 6100 hat_setmod(pp); 6101 pp->p_fsdata = C_NOCOMMIT; 6102 } 6103 } while ((pp = pp->p_vpnext) != vp->v_pages); 6104 } 6105 mutex_exit(vphm); 6106 } 6107 6108 6109 /* 6110 * This routine is used to gather together a page list of the pages 6111 * which are to be committed on the server. This routine must not 6112 * be called if the calling thread holds any locked pages. 6113 * 6114 * The calling thread must have set RCOMMIT. This bit is used to 6115 * serialize access to the commit structure in the rnode. As long 6116 * as the thread has set RCOMMIT, then it can manipulate the commit 6117 * structure without requiring any other locks. 
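* (Pages still marked modified in the hat are deliberately skipped:
* they will have to be rewritten anyway, so committing their current
* contents would be wasted effort.)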
6118 */ 6119 static void 6120 nfs3_get_commit(vnode_t *vp) 6121 { 6122 rnode_t *rp; 6123 page_t *pp; 6124 kmutex_t *vphm; 6125 6126 rp = VTOR(vp); 6127 6128 ASSERT(rp->r_flags & RCOMMIT); 6129 6130 vphm = page_vnode_mutex(vp); 6131 mutex_enter(vphm); 6132 6133 /* 6134 * If there are no pages associated with this vnode, then 6135 * just return. 6136 */ 6137 if ((pp = vp->v_pages) == NULL) { 6138 mutex_exit(vphm); 6139 return; 6140 } 6141 6142 /* 6143 * Step through all of the pages associated with this vnode 6144 * looking for pages which need to be committed. 6145 */ 6146 do { 6147 /* 6148 * If this page does not need to be committed or is 6149 * modified, then just skip it. 6150 */ 6151 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 6152 continue; 6153 6154 /* 6155 * Attempt to lock the page. If we can't, then 6156 * someone else is messing with it and we will 6157 * just skip it. 6158 */ 6159 if (!page_trylock(pp, SE_EXCL)) 6160 continue; 6161 6162 /* 6163 * If this page does not need to be committed or is 6164 * modified, then just skip it. Recheck now that 6165 * the page is locked. 6166 */ 6167 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 6168 page_unlock(pp); 6169 continue; 6170 } 6171 6172 if (PP_ISFREE(pp)) { 6173 cmn_err(CE_PANIC, "nfs3_get_commit: %p is free", 6174 (void *)pp); 6175 } 6176 6177 /* 6178 * The page needs to be committed and we locked it. 6179 * Update the base and length parameters and add it 6180 * to r_pages. 6181 */ 6182 if (rp->r_commit.c_pages == NULL) { 6183 rp->r_commit.c_commbase = (offset3)pp->p_offset; 6184 rp->r_commit.c_commlen = PAGESIZE; 6185 } else if (pp->p_offset < rp->r_commit.c_commbase) { 6186 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 6187 (offset3)pp->p_offset + rp->r_commit.c_commlen; 6188 rp->r_commit.c_commbase = (offset3)pp->p_offset; 6189 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 6190 <= pp->p_offset) { 6191 rp->r_commit.c_commlen = (offset3)pp->p_offset - 6192 rp->r_commit.c_commbase + PAGESIZE; 6193 } 6194 page_add(&rp->r_commit.c_pages, pp); 6195 } while ((pp = pp->p_vpnext) != vp->v_pages); 6196 6197 mutex_exit(vphm); 6198 } 6199 6200 /* 6201 * This routine is used to gather together a page list of the pages 6202 * which are to be committed on the server. This routine must not 6203 * be called if the calling thread holds any locked pages. 6204 * 6205 * The calling thread must have set RCOMMIT. This bit is used to 6206 * serialize access to the commit structure in the rnode. As long 6207 * as the thread has set RCOMMIT, then it can manipulate the commit 6208 * structure without requiring any other locks. 6209 */ 6210 static void 6211 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 6212 { 6213 6214 rnode_t *rp; 6215 page_t *pp; 6216 u_offset_t end; 6217 u_offset_t off; 6218 6219 ASSERT(len != 0); 6220 6221 rp = VTOR(vp); 6222 6223 ASSERT(rp->r_flags & RCOMMIT); 6224 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); 6225 6226 /* 6227 * If there are no pages associated with this vnode, then 6228 * just return. 6229 */ 6230 if ((pp = vp->v_pages) == NULL) 6231 return; 6232 6233 /* 6234 * Calculate the ending offset. 6235 */ 6236 end = soff + len; 6237 6238 for (off = soff; off < end; off += PAGESIZE) { 6239 /* 6240 * Lookup each page by vp, offset. 6241 */ 6242 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 6243 continue; 6244 6245 /* 6246 * If this page does not need to be committed or is 6247 * modified, then just skip it. 

/*
 * Like nfs3_get_commit(), but gather only pages within the byte
 * range [soff, soff + len).  The same rules apply: the calling
 * thread must hold no locked pages and must have set RCOMMIT, which
 * serializes access to the commit structure in the rnode.
 */
static void
nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{
	rnode_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;

	ASSERT(len != 0);

	rp = VTOR(vp);

	ASSERT(rp->r_flags & RCOMMIT);
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;

	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;

	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;

		/*
		 * If this page does not need to be committed, or is
		 * modified (and so will have to be rewritten anyway),
		 * then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters of the commit
		 * window and add the page to the commit list.  Since
		 * the offsets are walked in increasing order, the
		 * window only ever grows at the top.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}

#if 0 /* unused */
#ifdef DEBUG
static int
nfs3_no_uncommitted_pages(vnode_t *vp)
{
	page_t *pp;
	kmutex_t *vphm;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if ((pp = vp->v_pages) != NULL) {
		do {
			if (pp->p_fsdata != C_NOCOMMIT) {
				mutex_exit(vphm);
				return (0);
			}
		} while ((pp = pp->p_vpnext) != vp->v_pages);
	}
	mutex_exit(vphm);

	return (1);
}
#endif
#endif

static int
nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	writeverf3 write_verf;
	rnode_t *rp = VTOR(vp);

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed.  This may need to
	 * be done twice if the server has changed state since
	 * data was last written.  The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments.  This
	 * will asynchronously write out any modified pages in the
	 * range specified by (poff, plen).  This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs3_putpage.
	 */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_verf;
	mutex_exit(&rp->r_statelock);

	error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr);
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments.  This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs3_putpage(vp, poff, plen, 0, cr);

	if (error)
		return (error);

	/*
	 * If the write verifier changed while the data was being
	 * flushed, the server lost state (e.g. rebooted) in the
	 * meantime; start over.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_verf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If NFS_VERF_MISMATCH is returned, then start over with
	 * the flush operation.
	 */
	error = nfs3_commit_vp(vp, poff, plen, cr);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
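
/*
 * Commit the pages marked for commit in the given range (the whole
 * file when plen is zero) to stable storage on the server.  RCOMMIT
 * acts as a single-holder `commit inprogress' bit: any other thread
 * wanting to commit on this rnode sleeps on r_commit.c_cv until the
 * holder clears the bit and broadcasts.  Returns zero, an errno, or
 * NFS_VERF_MISMATCH, which nfs3_putpage_commit() above treats as
 * "rewrite the data and try again".
 */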
static int
nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
{
	rnode_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	rp = VTOR(vp);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & RCOMMIT) {
		rp->r_flags |= RCOMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~RCOMMITWAIT;
	}
	rp->r_flags |= RCOMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather together all of the pages which need to be
	 * committed.
	 */
	if (plen == 0)
		nfs3_get_commit(vp);
	else
		nfs3_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered together above.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RCOMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist != NULL) {
		/*
		 * No error occurred during the flush portion
		 * of this operation, so now attempt to commit
		 * the data to stable storage on the server.
		 *
		 * This will unlock all of the pages on the list.
		 */
		return (nfs3_sync_commit(vp, plist, offset, len, cr));
	}
	return (0);
}

static int
nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
	cred_t *cr)
{
	int error;
	page_t *pp;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs3_commit(vp, offset, count, cr);

	/*
	 * If we got an error, then just unlock all of the pages
	 * on the list.
	 */
	if (error) {
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			page_unlock(pp);
		}
		return (error);
	}
	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the pages and clear
	 * the commit required state.  They will get freed later.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
	}

	return (error);
}
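
/*
 * Asynchronous-path wrapper around nfs3_sync_commit(): callers on
 * this path have nowhere to return an error to, so the result is
 * deliberately discarded.  The pages on plist are still unlocked
 * (and, on success, untagged) exactly as in the synchronous path.
 */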
static void
nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
	cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	(void) nfs3_sync_commit(vp, plist, offset, count, cr);
}

static int
nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (mi->mi_flags & MI_ACL) {
		error = acl_setacl3(vp, vsecattr, flag, cr);
		/*
		 * The second MI_ACL test is not redundant: the ACL
		 * call can evidently clear MI_ACL when it discovers
		 * that the server does not support the ACL protocol,
		 * in which case we fall through to ENOSYS.
		 */
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (ENOSYS);
}

static int
nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (mi->mi_flags & MI_ACL) {
		error = acl_getacl3(vp, vsecattr, flag, cr);
		/*
		 * As in nfs3_setsecattr(), recheck MI_ACL in case the
		 * ACL call cleared it; if so, fall through and
		 * fabricate an ACL from the file's attributes instead.
		 */
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (fs_fab_acl(vp, vsecattr, flag, cr));
}
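
/*
 * For remote share reservations, nfs3_shrlock() below re-encodes
 * the caller's owner as a struct nfs_owner before handing it to the
 * lock manager.  Conceptually (illustrative layout; for instance an
 * owner that is a 4-byte pid):
 *
 *	nfs_owner.magic  = NFS_OWNER_MAGIC
 *	nfs_owner.hname  = uts_nodename()	(this client's name)
 *	nfs_owner.lowner = <the original owner bytes>
 *
 * The magic number is what lets the routine recognize an
 * already-encoded owner coming back in through the lock manager
 * and pass it straight through rather than wrapping it twice.
 */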
static int
nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
{
	int error;
	struct shrlock nshr;
	struct nfs_owner nfs_owner;
	netobj lm_fh3;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * Check for a valid cmd parameter.
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions: a share request must not ask for
	 * an access mode the file was not opened with.
	 */
	if (cmd == F_SHARE &&
	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		lm_fh3.n_len = VTOFH3(vp)->fh3_length;
		lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);

		/*
		 * If we were passed an owner that is too large to fit
		 * in an nfs_owner, it is likely a recursive call from
		 * the lock manager client, so pass it straight
		 * through.  If it does not carry the nfs_owner magic
		 * number, simply return an error.
		 */
		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
			if (((struct nfs_owner *)shr->s_owner)->magic !=
			    NFS_OWNER_MAGIC)
				return (EINVAL);

			if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
				error = set_errno(error);
			}
			return (error);
		}

		/*
		 * A remote share reservation's owner is a combination
		 * of a magic number, the hostname, and the local
		 * owner.
		 */
		bzero(&nfs_owner, sizeof (nfs_owner));
		nfs_owner.magic = NFS_OWNER_MAGIC;
		(void) strncpy(nfs_owner.hname, uts_nodename(),
		    sizeof (nfs_owner.hname));
		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
		nshr.s_access = shr->s_access;
		nshr.s_deny = shr->s_deny;
		nshr.s_sysid = 0;
		nshr.s_pid = ttoproc(curthread)->p_pid;
		nshr.s_own_len = sizeof (nfs_owner);
		nshr.s_owner = (caddr_t)&nfs_owner;

		if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
			error = set_errno(error);
		}

		break;

	case F_HASREMOTELOCKS:
		/*
		 * An NFS client can't store remote locks itself.
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}