1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/cred.h> 37 #include <sys/time.h> 38 #include <sys/vnode.h> 39 #include <sys/vfs.h> 40 #include <sys/file.h> 41 #include <sys/filio.h> 42 #include <sys/uio.h> 43 #include <sys/buf.h> 44 #include <sys/mman.h> 45 #include <sys/pathname.h> 46 #include <sys/dirent.h> 47 #include <sys/debug.h> 48 #include <sys/vmsystm.h> 49 #include <sys/fcntl.h> 50 #include <sys/flock.h> 51 #include <sys/swap.h> 52 #include <sys/errno.h> 53 #include <sys/strsubr.h> 54 #include <sys/sysmacros.h> 55 #include <sys/kmem.h> 56 #include <sys/cmn_err.h> 57 #include <sys/pathconf.h> 58 #include <sys/utsname.h> 59 #include <sys/dnlc.h> 60 #include <sys/acl.h> 61 #include <sys/systeminfo.h> 62 #include <sys/policy.h> 63 #include <sys/sdt.h> 64 #include <sys/list.h> 65 #include <sys/stat.h> 66 67 #include <rpc/types.h> 68 #include <rpc/auth.h> 69 #include <rpc/clnt.h> 70 71 #include <nfs/nfs.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/nfs_acl.h> 74 #include <nfs/lm.h> 75 #include <nfs/nfs4.h> 76 #include <nfs/nfs4_kprot.h> 77 #include <nfs/rnode4.h> 78 #include <nfs/nfs4_clnt.h> 79 80 #include <vm/hat.h> 81 #include <vm/as.h> 82 #include <vm/page.h> 83 #include <vm/pvn.h> 84 #include <vm/seg.h> 85 #include <vm/seg_map.h> 86 #include <vm/seg_kpm.h> 87 #include <vm/seg_vn.h> 88 89 #include <fs/fs_subr.h> 90 91 #include <sys/ddi.h> 92 #include <sys/int_fmtio.h> 93 94 typedef struct { 95 nfs4_ga_res_t *di_garp; 96 cred_t *di_cred; 97 hrtime_t di_time_call; 98 } dirattr_info_t; 99 100 typedef enum nfs4_acl_op { 101 NFS4_ACL_GET, 102 NFS4_ACL_SET 103 } nfs4_acl_op_t; 104 105 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 106 char *, dirattr_info_t *); 107 108 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 109 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 110 nfs4_error_t *, 
int *); 111 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 112 cred_t *); 113 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 114 stable_how4 *); 115 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 116 cred_t *, bool_t, struct uio *); 117 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 118 vsecattr_t *); 119 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 120 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 121 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 122 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 123 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 124 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 125 int, vnode_t **, cred_t *); 126 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 127 cred_t *, int, int, enum createmode4, int); 128 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 129 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 130 vnode_t *, char *, cred_t *, nfsstat4 *); 131 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 132 vnode_t *, char *, cred_t *, nfsstat4 *); 133 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 134 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 135 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 136 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 137 page_t *[], size_t, struct seg *, caddr_t, 138 enum seg_rw, cred_t *); 139 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 140 cred_t *); 141 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 142 int, cred_t *); 143 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 144 int, cred_t *); 145 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 146 static 
void nfs4_set_mod(vnode_t *); 147 static void nfs4_get_commit(vnode_t *); 148 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 149 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 150 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 151 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 152 cred_t *); 153 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 154 cred_t *); 155 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 156 hrtime_t, vnode_t *, cred_t *); 157 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 158 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 159 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 160 u_offset_t); 161 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 162 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 163 static cred_t *state_to_cred(nfs4_open_stream_t *); 164 static int vtoname(vnode_t *, char *, ssize_t); 165 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 166 static pid_t lo_to_pid(lock_owner4 *); 167 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 168 cred_t *, nfs4_lock_owner_t *); 169 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 170 nfs4_lock_owner_t *); 171 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 172 static void nfs4_delmap_callback(struct as *, void *, uint_t); 173 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 174 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 175 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 176 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 177 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 178 uid_t, gid_t, int); 179 180 /* 181 * Routines that implement the setting of v4 args for the misc. 
ops 182 */ 183 static void nfs4args_lock_free(nfs_argop4 *); 184 static void nfs4args_lockt_free(nfs_argop4 *); 185 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 186 int, rnode4_t *, cred_t *, bitmap4, int *, 187 nfs4_stateid_types_t *); 188 static void nfs4args_setattr_free(nfs_argop4 *); 189 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 190 bitmap4); 191 static void nfs4args_verify_free(nfs_argop4 *); 192 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 193 WRITE4args **, nfs4_stateid_types_t *); 194 195 /* 196 * These are the vnode ops functions that implement the vnode interface to 197 * the networked file system. See more comments below at nfs4_vnodeops. 198 */ 199 static int nfs4_open(vnode_t **, int, cred_t *); 200 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *); 201 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 202 caller_context_t *); 203 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 204 caller_context_t *); 205 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *); 206 static int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *); 207 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 208 caller_context_t *); 209 static int nfs4_access(vnode_t *, int, int, cred_t *); 210 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *); 211 static int nfs4_fsync(vnode_t *, int, cred_t *); 212 static void nfs4_inactive(vnode_t *, cred_t *); 213 static int nfs4_lookup(vnode_t *, char *, vnode_t **, 214 struct pathname *, int, vnode_t *, cred_t *); 215 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 216 int, vnode_t **, cred_t *, int); 217 static int nfs4_remove(vnode_t *, char *, cred_t *); 218 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *); 219 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 220 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, 
221 vnode_t **, cred_t *); 222 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *); 223 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 224 cred_t *); 225 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *); 226 static int nfs4_fid(vnode_t *, fid_t *); 227 static int nfs4_rwlock(vnode_t *, int, caller_context_t *); 228 static void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 229 static int nfs4_seek(vnode_t *, offset_t, offset_t *); 230 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 231 page_t *[], size_t, struct seg *, caddr_t, 232 enum seg_rw, cred_t *); 233 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *); 234 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, 235 size_t, uchar_t, uchar_t, uint_t, cred_t *); 236 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, 237 size_t, uchar_t, uchar_t, uint_t, cred_t *); 238 static int nfs4_cmp(vnode_t *, vnode_t *); 239 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 240 struct flk_callback *, cred_t *); 241 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 242 cred_t *, caller_context_t *); 243 static int nfs4_realvp(vnode_t *, vnode_t **); 244 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, 245 size_t, uint_t, uint_t, uint_t, cred_t *); 246 static int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *); 247 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 248 cred_t *); 249 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *); 250 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 251 static int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 252 static int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *); 253 254 /* 255 * Used for nfs4_commit_vp() to indicate if we should 256 * wait on pending writes. 
257 */ 258 #define NFS4_WRITE_NOWAIT 0 259 #define NFS4_WRITE_WAIT 1 260 261 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 262 263 /* 264 * Error flags used to pass information about certain special errors 265 * which need to be handled specially. 266 */ 267 #define NFS_EOF -98 268 #define NFS_VERF_MISMATCH -97 269 270 /* 271 * Flags used to differentiate between which operation drove the 272 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary) 273 */ 274 #define NFS4_CLOSE_OP 0x1 275 #define NFS4_DELMAP_OP 0x2 276 #define NFS4_INACTIVE_OP 0x3 277 278 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 279 280 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 281 #define ALIGN64(x, ptr, sz) \ 282 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 283 if (x) { \ 284 x = sizeof (uint64_t) - (x); \ 285 sz -= (x); \ 286 ptr += (x); \ 287 } 288 289 #ifdef DEBUG 290 int nfs4_client_attr_debug = 0; 291 int nfs4_client_state_debug = 0; 292 int nfs4_client_shadow_debug = 0; 293 int nfs4_client_lock_debug = 0; 294 int nfs4_seqid_sync = 0; 295 int nfs4_client_map_debug = 0; 296 static int nfs4_pageio_debug = 0; 297 int nfs4_client_inactive_debug = 0; 298 int nfs4_client_recov_debug = 0; 299 int nfs4_client_recov_stub_debug = 0; 300 int nfs4_client_failover_debug = 0; 301 int nfs4_client_call_debug = 0; 302 int nfs4_client_lookup_debug = 0; 303 int nfs4_client_zone_debug = 0; 304 int nfs4_lost_rqst_debug = 0; 305 int nfs4_rdattrerr_debug = 0; 306 int nfs4_open_stream_debug = 0; 307 308 int nfs4read_error_inject; 309 310 static int nfs4_create_misses = 0; 311 312 static int nfs4_readdir_cache_shorts = 0; 313 static int nfs4_readdir_readahead = 0; 314 315 static int nfs4_bio_do_stop = 0; 316 317 static int nfs4_lostpage = 0; /* number of times we lost original page */ 318 319 int nfs4_mmap_debug = 0; 320 321 static int nfs4_pathconf_cache_hits = 0; 322 static int nfs4_pathconf_cache_misses = 0; 323 324 int nfs4close_all_cnt; 325 int 
nfs4close_one_debug = 0; 326 int nfs4close_notw_debug = 0; 327 328 int denied_to_flk_debug = 0; 329 void *lockt_denied_debug; 330 331 #endif 332 333 /* 334 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 335 * or NFS4ERR_RESOURCE. 336 */ 337 static int confirm_retry_sec = 30; 338 339 static int nfs4_lookup_neg_cache = 1; 340 341 /* 342 * number of pages to read ahead 343 * optimized for 100 base-T. 344 */ 345 static int nfs4_nra = 4; 346 347 static int nfs4_do_symlink_cache = 1; 348 349 static int nfs4_pathconf_disable_cache = 0; 350 351 /* 352 * These are the vnode ops routines which implement the vnode interface to 353 * the networked file system. These routines just take their parameters, 354 * make them look networkish by putting the right info into interface structs, 355 * and then calling the appropriate remote routine(s) to do the work. 356 * 357 * Note on directory name lookup cacheing: If we detect a stale fhandle, 358 * we purge the directory cache relative to that vnode. This way, the 359 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 360 * more details on rnode locking. 
361 */ 362 363 struct vnodeops *nfs4_vnodeops; 364 365 const fs_operation_def_t nfs4_vnodeops_template[] = { 366 VOPNAME_OPEN, nfs4_open, 367 VOPNAME_CLOSE, nfs4_close, 368 VOPNAME_READ, nfs4_read, 369 VOPNAME_WRITE, nfs4_write, 370 VOPNAME_IOCTL, nfs4_ioctl, 371 VOPNAME_GETATTR, nfs4_getattr, 372 VOPNAME_SETATTR, nfs4_setattr, 373 VOPNAME_ACCESS, nfs4_access, 374 VOPNAME_LOOKUP, nfs4_lookup, 375 VOPNAME_CREATE, nfs4_create, 376 VOPNAME_REMOVE, nfs4_remove, 377 VOPNAME_LINK, nfs4_link, 378 VOPNAME_RENAME, nfs4_rename, 379 VOPNAME_MKDIR, nfs4_mkdir, 380 VOPNAME_RMDIR, nfs4_rmdir, 381 VOPNAME_READDIR, nfs4_readdir, 382 VOPNAME_SYMLINK, nfs4_symlink, 383 VOPNAME_READLINK, nfs4_readlink, 384 VOPNAME_FSYNC, nfs4_fsync, 385 VOPNAME_INACTIVE, (fs_generic_func_p) nfs4_inactive, 386 VOPNAME_FID, nfs4_fid, 387 VOPNAME_RWLOCK, nfs4_rwlock, 388 VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs4_rwunlock, 389 VOPNAME_SEEK, nfs4_seek, 390 VOPNAME_FRLOCK, nfs4_frlock, 391 VOPNAME_SPACE, nfs4_space, 392 VOPNAME_REALVP, nfs4_realvp, 393 VOPNAME_GETPAGE, nfs4_getpage, 394 VOPNAME_PUTPAGE, nfs4_putpage, 395 VOPNAME_MAP, (fs_generic_func_p) nfs4_map, 396 VOPNAME_ADDMAP, (fs_generic_func_p) nfs4_addmap, 397 VOPNAME_DELMAP, nfs4_delmap, 398 VOPNAME_DUMP, nfs_dump, /* there is no separate nfs4_dump */ 399 VOPNAME_PATHCONF, nfs4_pathconf, 400 VOPNAME_PAGEIO, nfs4_pageio, 401 VOPNAME_DISPOSE, (fs_generic_func_p) nfs4_dispose, 402 VOPNAME_SETSECATTR, nfs4_setsecattr, 403 VOPNAME_GETSECATTR, nfs4_getsecattr, 404 VOPNAME_SHRLOCK, nfs4_shrlock, 405 NULL, NULL 406 }; 407 408 /* 409 * The following are subroutines and definitions to set args or get res 410 * for the different nfsv4 ops 411 */ 412 413 void 414 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 415 { 416 int i; 417 418 for (i = 0; i < arglen; i++) { 419 if (argop[i].argop == OP_LOOKUP) 420 kmem_free( 421 argop[i].nfs_argop4_u.oplookup.objname.utf8string_val, 422 argop[i].nfs_argop4_u.oplookup.objname.utf8string_len); 423 } 424 } 425 426 
static void 427 nfs4args_lock_free(nfs_argop4 *argop) 428 { 429 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 430 431 if (locker->new_lock_owner == TRUE) { 432 open_to_lock_owner4 *open_owner; 433 434 open_owner = &locker->locker4_u.open_owner; 435 if (open_owner->lock_owner.owner_val != NULL) { 436 kmem_free(open_owner->lock_owner.owner_val, 437 open_owner->lock_owner.owner_len); 438 } 439 } 440 } 441 442 static void 443 nfs4args_lockt_free(nfs_argop4 *argop) 444 { 445 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 446 447 if (lowner->owner_val != NULL) { 448 kmem_free(lowner->owner_val, lowner->owner_len); 449 } 450 } 451 452 static void 453 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 454 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 455 nfs4_stateid_types_t *sid_types) 456 { 457 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 458 mntinfo4_t *mi; 459 460 argop->argop = OP_SETATTR; 461 /* 462 * The stateid is set to 0 if client is not modifying the size 463 * and otherwise to whatever nfs4_get_stateid() returns. 464 * 465 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 466 * state struct could be found for the process/file pair. We may 467 * want to change this in the future (by OPENing the file). See 468 * bug # 4474852. 
469 */ 470 if (vap->va_mask & AT_SIZE) { 471 472 ASSERT(rp != NULL); 473 mi = VTOMI4(RTOV4(rp)); 474 475 argop->nfs_argop4_u.opsetattr.stateid = 476 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 477 OP_SETATTR, sid_types, FALSE); 478 } else { 479 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 480 sizeof (stateid4)); 481 } 482 483 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 484 if (*error) 485 bzero(attr, sizeof (*attr)); 486 } 487 488 static void 489 nfs4args_setattr_free(nfs_argop4 *argop) 490 { 491 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 492 } 493 494 static int 495 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 496 bitmap4 supp) 497 { 498 fattr4 *attr; 499 int error = 0; 500 501 argop->argop = op; 502 switch (op) { 503 case OP_VERIFY: 504 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 505 break; 506 case OP_NVERIFY: 507 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 508 break; 509 default: 510 return (EINVAL); 511 } 512 if (!error) 513 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 514 if (error) 515 bzero(attr, sizeof (*attr)); 516 return (error); 517 } 518 519 static void 520 nfs4args_verify_free(nfs_argop4 *argop) 521 { 522 switch (argop->argop) { 523 case OP_VERIFY: 524 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 525 break; 526 case OP_NVERIFY: 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 528 break; 529 default: 530 break; 531 } 532 } 533 534 static void 535 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 536 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 537 { 538 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 539 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 540 541 argop->argop = OP_WRITE; 542 wargs->stable = stable; 543 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 544 mi, OP_WRITE, sid_tp); 545 wargs->mblk = NULL; 546 *wargs_pp = wargs; 547 } 548 549 void 550 
nfs4args_copen_free(OPEN4cargs *open_args) 551 { 552 if (open_args->owner.owner_val) { 553 kmem_free(open_args->owner.owner_val, 554 open_args->owner.owner_len); 555 } 556 if ((open_args->opentype == OPEN4_CREATE) && 557 (open_args->mode != EXCLUSIVE4)) { 558 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 559 } 560 } 561 562 /* 563 * XXX: This is referenced in modstubs.s 564 */ 565 struct vnodeops * 566 nfs4_getvnodeops(void) 567 { 568 return (nfs4_vnodeops); 569 } 570 571 /* 572 * The OPEN operation opens a regular file. 573 * 574 * ARGSUSED 575 */ 576 static int 577 nfs4_open(vnode_t **vpp, int flag, cred_t *cr) 578 { 579 vnode_t *dvp = NULL; 580 rnode4_t *rp, *drp; 581 int error; 582 int just_been_created; 583 char fn[MAXNAMELEN]; 584 585 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 586 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 587 return (EIO); 588 rp = VTOR4(*vpp); 589 590 /* 591 * Check to see if opening something besides a regular file; 592 * if so skip the OTW call 593 */ 594 if ((*vpp)->v_type != VREG) { 595 error = nfs4_open_non_reg_file(vpp, flag, cr); 596 return (error); 597 } 598 599 /* 600 * XXX - would like a check right here to know if the file is 601 * executable or not, so as to skip OTW 602 */ 603 604 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 605 return (error); 606 607 drp = VTOR4(dvp); 608 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 609 return (EINTR); 610 611 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 612 nfs_rw_exit(&drp->r_rwlock); 613 return (error); 614 } 615 616 /* 617 * See if this file has just been CREATEd. 618 * If so, clear the flag and update the dnlc, which was previously 619 * skipped in nfs4_create. 620 * XXX need better serilization on this. 621 * XXX move this into the nf4open_otw call, after we have 622 * XXX acquired the open owner seqid sync. 
623 */ 624 mutex_enter(&rp->r_statev4_lock); 625 if (rp->created_v4) { 626 rp->created_v4 = 0; 627 mutex_exit(&rp->r_statev4_lock); 628 629 dnlc_update(dvp, fn, *vpp); 630 /* This is needed so we don't bump the open ref count */ 631 just_been_created = 1; 632 } else { 633 mutex_exit(&rp->r_statev4_lock); 634 just_been_created = 0; 635 } 636 637 /* 638 * If caller specified O_TRUNC/FTRUNC, then be sure to set 639 * FWRITE (to drive successful setattr(size=0) after open) 640 */ 641 if (flag & FTRUNC) 642 flag |= FWRITE; 643 644 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 645 just_been_created); 646 647 if (!error && !((*vpp)->v_flag & VROOT)) 648 dnlc_update(dvp, fn, *vpp); 649 650 nfs_rw_exit(&drp->r_rwlock); 651 652 /* release the hold from vtodv */ 653 VN_RELE(dvp); 654 655 /* exchange the shadow for the master vnode, if needed */ 656 657 if (error == 0 && IS_SHADOW(*vpp, rp)) 658 sv_exchange(vpp); 659 660 return (error); 661 } 662 663 /* 664 * See if there's a "lost open" request to be saved and recovered. 665 */ 666 static void 667 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 668 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 669 vnode_t *dvp, OPEN4cargs *open_args) 670 { 671 vfs_t *vfsp; 672 char *srccfp; 673 674 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 675 676 if (error != ETIMEDOUT && error != EINTR && 677 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 678 lost_rqstp->lr_op = 0; 679 return; 680 } 681 682 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 683 "nfs4open_save_lost_rqst: error %d", error)); 684 685 lost_rqstp->lr_op = OP_OPEN; 686 /* 687 * The vp (if it is not NULL) and dvp are held and rele'd via 688 * the recovery code. See nfs4_save_lost_rqst. 
689 */ 690 lost_rqstp->lr_vp = vp; 691 lost_rqstp->lr_dvp = dvp; 692 lost_rqstp->lr_oop = oop; 693 lost_rqstp->lr_osp = NULL; 694 lost_rqstp->lr_lop = NULL; 695 lost_rqstp->lr_cr = cr; 696 lost_rqstp->lr_flk = NULL; 697 lost_rqstp->lr_oacc = open_args->share_access; 698 lost_rqstp->lr_odeny = open_args->share_deny; 699 lost_rqstp->lr_oclaim = open_args->claim; 700 if (open_args->claim == CLAIM_DELEGATE_CUR) { 701 lost_rqstp->lr_ostateid = 702 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 703 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 704 } else { 705 srccfp = open_args->open_claim4_u.cfile; 706 } 707 lost_rqstp->lr_ofile.utf8string_len = 0; 708 lost_rqstp->lr_ofile.utf8string_val = NULL; 709 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 710 lost_rqstp->lr_putfirst = FALSE; 711 } 712 713 struct nfs4_excl_time { 714 uint32 seconds; 715 uint32 nseconds; 716 }; 717 718 /* 719 * The OPEN operation creates and/or opens a regular file 720 * 721 * ARGSUSED 722 */ 723 static int 724 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 725 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 726 enum createmode4 createmode, int file_just_been_created) 727 { 728 rnode4_t *rp; 729 rnode4_t *drp = VTOR4(dvp); 730 vnode_t *vp = NULL; 731 vnode_t *vpi = *vpp; 732 bool_t needrecov = FALSE; 733 734 int doqueue = 1; 735 736 COMPOUND4args_clnt args; 737 COMPOUND4res_clnt res; 738 nfs_argop4 *argop; 739 nfs_resop4 *resop; 740 int argoplist_size; 741 int idx_open, idx_fattr; 742 743 GETFH4res *gf_res = NULL; 744 OPEN4res *op_res = NULL; 745 nfs4_ga_res_t *garp; 746 fattr4 *attr = NULL; 747 struct nfs4_excl_time verf; 748 bool_t did_excl_setup = FALSE; 749 int created_osp; 750 751 OPEN4cargs *open_args; 752 nfs4_open_owner_t *oop = NULL; 753 nfs4_open_stream_t *osp = NULL; 754 seqid4 seqid = 0; 755 bool_t retry_open = FALSE; 756 nfs4_recov_state_t recov_state; 757 nfs4_lost_rqst_t lost_rqst; 758 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 759 hrtime_t t; 760 int acc = 0; 761 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 762 cred_t *ncr = NULL; 763 764 nfs4_sharedfh_t *otw_sfh; 765 nfs4_sharedfh_t *orig_sfh; 766 int fh_differs = 0; 767 int numops, setgid_flag; 768 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 769 770 /* 771 * Make sure we properly deal with setting the right gid on 772 * a newly created file to reflect the parent's setgid bit 773 */ 774 setgid_flag = 0; 775 if (create_flag && in_va) { 776 777 /* 778 * If the parent's directory has the setgid bit set 779 * _and_ the client was able to get a valid mapping 780 * for the parent dir's owner_group, we want to 781 * append NVERIFY(owner_group == dva.va_gid) and 782 * SETATTR to the CREATE compound. 783 */ 784 mutex_enter(&drp->r_statelock); 785 if (drp->r_attr.va_mode & VSGID && 786 drp->r_attr.va_gid != GID_NOBODY) { 787 in_va->va_gid = drp->r_attr.va_gid; 788 setgid_flag = 1; 789 } 790 mutex_exit(&drp->r_statelock); 791 } 792 793 /* 794 * Normal/non-create compound: 795 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 796 * 797 * Open(create) compound no setgid: 798 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 799 * RESTOREFH + GETATTR 800 * 801 * Open(create) setgid: 802 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 803 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 804 * NVERIFY(grp) + SETATTR 805 */ 806 if (setgid_flag) { 807 numops = 10; 808 idx_open = 1; 809 idx_fattr = 3; 810 } else if (create_flag) { 811 numops = 7; 812 idx_open = 2; 813 idx_fattr = 4; 814 } else { 815 numops = 4; 816 idx_open = 1; 817 idx_fattr = 3; 818 } 819 820 args.array_len = numops; 821 argoplist_size = numops * sizeof (nfs_argop4); 822 argop = kmem_alloc(argoplist_size, KM_SLEEP); 823 824 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 825 "open %s open flag 0x%x cred %p", file_name, open_flag, 826 (void *)cr)); 827 828 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 829 if 
(create_flag) { 830 /* 831 * We are to create a file. Initialize the passed in vnode 832 * pointer. 833 */ 834 vpi = NULL; 835 } else { 836 /* 837 * Check to see if the client owns a read delegation and is 838 * trying to open for write. If so, then return the delegation 839 * to avoid the server doing a cb_recall and returning DELAY. 840 * NB - we don't use the statev4_lock here because we'd have 841 * to drop the lock anyway and the result would be stale. 842 */ 843 if ((open_flag & FWRITE) && 844 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 845 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 846 847 /* 848 * If the file has a delegation, then do an access check up 849 * front. This avoids having to an access check later after 850 * we've already done start_op, which could deadlock. 851 */ 852 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 853 if (open_flag & FREAD && 854 nfs4_access(vpi, VREAD, 0, cr) == 0) 855 acc |= VREAD; 856 if (open_flag & FWRITE && 857 nfs4_access(vpi, VWRITE, 0, cr) == 0) 858 acc |= VWRITE; 859 } 860 } 861 862 drp = VTOR4(dvp); 863 864 recov_state.rs_flags = 0; 865 recov_state.rs_num_retry_despite_err = 0; 866 cred_otw = cr; 867 868 recov_retry: 869 fh_differs = 0; 870 nfs4_error_zinit(&e); 871 872 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 873 if (e.error) { 874 if (ncr != NULL) 875 crfree(ncr); 876 kmem_free(argop, argoplist_size); 877 return (e.error); 878 } 879 880 args.ctag = TAG_OPEN; 881 args.array_len = numops; 882 args.array = argop; 883 884 /* putfh directory fh */ 885 argop[0].argop = OP_CPUTFH; 886 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 887 888 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 889 argop[idx_open].argop = OP_COPEN; 890 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 891 open_args->claim = CLAIM_NULL; 892 893 /* name of file */ 894 open_args->open_claim4_u.cfile = file_name; 895 open_args->owner.owner_len = 0; 896 open_args->owner.owner_val = NULL; 
897 898 if (create_flag) { 899 /* CREATE a file */ 900 open_args->opentype = OPEN4_CREATE; 901 open_args->mode = createmode; 902 if (createmode == EXCLUSIVE4) { 903 if (did_excl_setup == FALSE) { 904 verf.seconds = nfs_atoi(hw_serial); 905 if (verf.seconds != 0) 906 verf.nseconds = newnum(); 907 else { 908 timestruc_t now; 909 910 gethrestime(&now); 911 verf.seconds = now.tv_sec; 912 verf.nseconds = now.tv_nsec; 913 } 914 /* 915 * Since the server will use this value for the 916 * mtime, make sure that it can't overflow. Zero 917 * out the MSB. The actual value does not matter 918 * here, only its uniqeness. 919 */ 920 verf.seconds &= INT32_MAX; 921 did_excl_setup = TRUE; 922 } 923 924 /* Now copy over verifier to OPEN4args. */ 925 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 926 } else { 927 int v_error; 928 bitmap4 supp_attrs; 929 servinfo4_t *svp; 930 931 attr = &open_args->createhow4_u.createattrs; 932 933 svp = drp->r_server; 934 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 935 supp_attrs = svp->sv_supp_attrs; 936 nfs_rw_exit(&svp->sv_lock); 937 938 /* GUARDED4 or UNCHECKED4 */ 939 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 940 supp_attrs); 941 if (v_error) { 942 bzero(attr, sizeof (*attr)); 943 nfs4args_copen_free(open_args); 944 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 945 &recov_state, FALSE); 946 if (ncr != NULL) 947 crfree(ncr); 948 kmem_free(argop, argoplist_size); 949 return (v_error); 950 } 951 } 952 } else { 953 /* NO CREATE */ 954 open_args->opentype = OPEN4_NOCREATE; 955 } 956 957 if (recov_state.rs_sp != NULL) { 958 mutex_enter(&recov_state.rs_sp->s_lock); 959 open_args->owner.clientid = recov_state.rs_sp->clientid; 960 mutex_exit(&recov_state.rs_sp->s_lock); 961 } else { 962 /* XXX should we just fail here? 
*/ 963 open_args->owner.clientid = 0; 964 } 965 966 /* 967 * This increments oop's ref count or creates a temporary 'just_created' 968 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 969 * completes. 970 */ 971 mutex_enter(&VTOMI4(dvp)->mi_lock); 972 973 /* See if a permanent or just created open owner exists */ 974 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 975 if (!oop) { 976 /* 977 * This open owner does not exist so create a temporary 978 * just created one. 979 */ 980 oop = create_open_owner(cr, VTOMI4(dvp)); 981 ASSERT(oop != NULL); 982 } 983 mutex_exit(&VTOMI4(dvp)->mi_lock); 984 985 /* this length never changes, do alloc before seqid sync */ 986 open_args->owner.owner_len = sizeof (oop->oo_name); 987 open_args->owner.owner_val = 988 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 989 990 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 991 if (e.error == EAGAIN) { 992 open_owner_rele(oop); 993 nfs4args_copen_free(open_args); 994 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 995 if (ncr != NULL) { 996 crfree(ncr); 997 ncr = NULL; 998 } 999 goto recov_retry; 1000 } 1001 1002 /* Check to see if we need to do the OTW call */ 1003 if (!create_flag) { 1004 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1005 file_just_been_created, &e.error, acc, &recov_state)) { 1006 1007 /* 1008 * The OTW open is not necessary. Either 1009 * the open can succeed without it (eg. 1010 * delegation, error == 0) or the open 1011 * must fail due to an access failure 1012 * (error != 0). In either case, tidy 1013 * up and return. 
1014 */ 1015 1016 nfs4_end_open_seqid_sync(oop); 1017 open_owner_rele(oop); 1018 nfs4args_copen_free(open_args); 1019 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1020 if (ncr != NULL) 1021 crfree(ncr); 1022 kmem_free(argop, argoplist_size); 1023 return (e.error); 1024 } 1025 } 1026 1027 bcopy(&oop->oo_name, open_args->owner.owner_val, 1028 open_args->owner.owner_len); 1029 1030 seqid = nfs4_get_open_seqid(oop) + 1; 1031 open_args->seqid = seqid; 1032 open_args->share_access = 0; 1033 if (open_flag & FREAD) 1034 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1035 if (open_flag & FWRITE) 1036 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1037 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1038 1039 1040 1041 /* 1042 * getfh w/sanity check for idx_open/idx_fattr 1043 */ 1044 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1045 argop[idx_open + 1].argop = OP_GETFH; 1046 1047 /* getattr */ 1048 argop[idx_fattr].argop = OP_GETATTR; 1049 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1050 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1051 1052 if (setgid_flag) { 1053 vattr_t _v; 1054 servinfo4_t *svp; 1055 bitmap4 supp_attrs; 1056 1057 svp = drp->r_server; 1058 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1059 supp_attrs = svp->sv_supp_attrs; 1060 nfs_rw_exit(&svp->sv_lock); 1061 1062 /* 1063 * For setgid case, we need to: 1064 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1065 */ 1066 argop[4].argop = OP_SAVEFH; 1067 1068 argop[5].argop = OP_CPUTFH; 1069 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1070 1071 argop[6].argop = OP_GETATTR; 1072 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1073 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1074 1075 argop[7].argop = OP_RESTOREFH; 1076 1077 /* 1078 * nverify 1079 */ 1080 _v.va_mask = AT_GID; 1081 _v.va_gid = in_va->va_gid; 1082 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1083 supp_attrs))) { 1084 1085 /* 
1086 * setattr 1087 * 1088 * We _know_ we're not messing with AT_SIZE or 1089 * AT_XTIME, so no need for stateid or flags. 1090 * Also we specify NULL rp since we're only 1091 * interested in setting owner_group attributes. 1092 */ 1093 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1094 supp_attrs, &e.error, 0); 1095 if (e.error) 1096 nfs4args_verify_free(&argop[8]); 1097 } 1098 1099 if (e.error) { 1100 /* 1101 * XXX - Revisit the last argument to nfs4_end_op() 1102 * once 5020486 is fixed. 1103 */ 1104 nfs4_end_open_seqid_sync(oop); 1105 open_owner_rele(oop); 1106 nfs4args_copen_free(open_args); 1107 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1108 if (ncr != NULL) 1109 crfree(ncr); 1110 kmem_free(argop, argoplist_size); 1111 return (e.error); 1112 } 1113 } else if (create_flag) { 1114 /* 1115 * For setgid case, we need to: 1116 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1117 */ 1118 argop[1].argop = OP_SAVEFH; 1119 1120 argop[5].argop = OP_RESTOREFH; 1121 1122 argop[6].argop = OP_GETATTR; 1123 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1124 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1125 } 1126 1127 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1128 "nfs4open_otw: %s call, nm %s, rp %s", 1129 needrecov ? 
"recov" : "first", file_name, 1130 rnode4info(VTOR4(dvp)))); 1131 1132 t = gethrtime(); 1133 1134 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1135 1136 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1137 nfs4_set_open_seqid(seqid, oop, args.ctag); 1138 1139 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1140 1141 if (e.error || needrecov) { 1142 bool_t abort = FALSE; 1143 1144 if (needrecov) { 1145 nfs4_bseqid_entry_t *bsep = NULL; 1146 1147 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1148 cred_otw, vpi, dvp, open_args); 1149 1150 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1151 bsep = nfs4_create_bseqid_entry(oop, NULL, 1152 vpi, 0, args.ctag, open_args->seqid); 1153 num_bseqid_retry--; 1154 } 1155 1156 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1157 NULL, lost_rqst.lr_op == OP_OPEN ? 1158 &lost_rqst : NULL, OP_OPEN, bsep); 1159 1160 if (bsep) 1161 kmem_free(bsep, sizeof (*bsep)); 1162 /* give up if we keep getting BAD_SEQID */ 1163 if (num_bseqid_retry == 0) 1164 abort = TRUE; 1165 if (abort == TRUE && e.error == 0) 1166 e.error = geterrno4(res.status); 1167 } 1168 nfs4_end_open_seqid_sync(oop); 1169 open_owner_rele(oop); 1170 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1171 nfs4args_copen_free(open_args); 1172 if (setgid_flag) { 1173 nfs4args_verify_free(&argop[8]); 1174 nfs4args_setattr_free(&argop[9]); 1175 } 1176 if (!e.error) 1177 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1178 if (ncr != NULL) { 1179 crfree(ncr); 1180 ncr = NULL; 1181 } 1182 if (!needrecov || abort == TRUE || e.error == EINTR || 1183 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1184 kmem_free(argop, argoplist_size); 1185 return (e.error); 1186 } 1187 goto recov_retry; 1188 } 1189 1190 /* 1191 * Will check and update lease after checking the rflag for 1192 * OPEN_CONFIRM in the successful OPEN call. 
1193 */ 1194 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1195 1196 /* 1197 * XXX what if we're crossing mount points from server1:/drp 1198 * to server2:/drp/rp. 1199 */ 1200 1201 /* Signal our end of use of the open seqid */ 1202 nfs4_end_open_seqid_sync(oop); 1203 1204 /* 1205 * This will destroy the open owner if it was just created, 1206 * and no one else has put a reference on it. 1207 */ 1208 open_owner_rele(oop); 1209 if (create_flag && (createmode != EXCLUSIVE4) && 1210 res.status == NFS4ERR_BADOWNER) 1211 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1212 1213 e.error = geterrno4(res.status); 1214 nfs4args_copen_free(open_args); 1215 if (setgid_flag) { 1216 nfs4args_verify_free(&argop[8]); 1217 nfs4args_setattr_free(&argop[9]); 1218 } 1219 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1220 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1221 /* 1222 * If the reply is NFS4ERR_ACCESS, it may be because 1223 * we are root (no root net access). If the real uid 1224 * is not root, then retry with the real uid instead. 
1225 */ 1226 if (ncr != NULL) { 1227 crfree(ncr); 1228 ncr = NULL; 1229 } 1230 if (res.status == NFS4ERR_ACCESS && 1231 (ncr = crnetadjust(cred_otw)) != NULL) { 1232 cred_otw = ncr; 1233 goto recov_retry; 1234 } 1235 kmem_free(argop, argoplist_size); 1236 return (e.error); 1237 } 1238 1239 resop = &res.array[idx_open]; /* open res */ 1240 op_res = &resop->nfs_resop4_u.opopen; 1241 1242 #ifdef DEBUG 1243 /* 1244 * verify attrset bitmap 1245 */ 1246 if (create_flag && 1247 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1248 /* make sure attrset returned is what we asked for */ 1249 /* XXX Ignore this 'error' for now */ 1250 if (attr->attrmask != op_res->attrset) 1251 /* EMPTY */; 1252 } 1253 #endif 1254 1255 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1256 mutex_enter(&VTOMI4(dvp)->mi_lock); 1257 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1258 mutex_exit(&VTOMI4(dvp)->mi_lock); 1259 } 1260 1261 resop = &res.array[idx_open + 1]; /* getfh res */ 1262 gf_res = &resop->nfs_resop4_u.opgetfh; 1263 1264 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1265 1266 /* 1267 * The open stateid has been updated on the server but not 1268 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1269 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1270 * WRITE call. That, however, will use the old stateid, so go ahead 1271 * and upate the open stateid now, before any call to makenfs4node. 1272 */ 1273 if (vpi) { 1274 nfs4_open_stream_t *tmp_osp; 1275 rnode4_t *tmp_rp = VTOR4(vpi); 1276 1277 tmp_osp = find_open_stream(oop, tmp_rp); 1278 if (tmp_osp) { 1279 tmp_osp->open_stateid = op_res->stateid; 1280 mutex_exit(&tmp_osp->os_sync_lock); 1281 open_stream_rele(tmp_osp, tmp_rp); 1282 } 1283 1284 /* 1285 * We must determine if the file handle given by the otw open 1286 * is the same as the file handle which was passed in with 1287 * *vpp. 
This case can be reached if the file we are trying 1288 * to open has been removed and another file has been created 1289 * having the same file name. The passed in vnode is released 1290 * later. 1291 */ 1292 orig_sfh = VTOR4(vpi)->r_fh; 1293 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1294 } 1295 1296 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1297 1298 if (create_flag || fh_differs) { 1299 int rnode_err = 0; 1300 1301 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1302 dvp, fn_get(VTOSV(dvp)->sv_name, file_name)); 1303 1304 if (e.error) 1305 PURGE_ATTRCACHE4(vp); 1306 /* 1307 * For the newly created vp case, make sure the rnode 1308 * isn't bad before using it. 1309 */ 1310 mutex_enter(&(VTOR4(vp))->r_statelock); 1311 if (VTOR4(vp)->r_flags & R4RECOVERR) 1312 rnode_err = EIO; 1313 mutex_exit(&(VTOR4(vp))->r_statelock); 1314 1315 if (rnode_err) { 1316 nfs4_end_open_seqid_sync(oop); 1317 nfs4args_copen_free(open_args); 1318 if (setgid_flag) { 1319 nfs4args_verify_free(&argop[8]); 1320 nfs4args_setattr_free(&argop[9]); 1321 } 1322 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1323 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1324 needrecov); 1325 open_owner_rele(oop); 1326 VN_RELE(vp); 1327 if (ncr != NULL) 1328 crfree(ncr); 1329 sfh4_rele(&otw_sfh); 1330 kmem_free(argop, argoplist_size); 1331 return (EIO); 1332 } 1333 } else { 1334 vp = vpi; 1335 } 1336 sfh4_rele(&otw_sfh); 1337 1338 /* 1339 * It seems odd to get a full set of attrs and then not update 1340 * the object's attrcache in the non-create case. Create case uses 1341 * the attrs since makenfs4node checks to see if the attrs need to 1342 * be updated (and then updates them). The non-create case should 1343 * update attrs also. 1344 */ 1345 if (! create_flag && ! 
fh_differs && !e.error) { 1346 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1347 } 1348 1349 nfs4_error_zinit(&e); 1350 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1351 /* This does not do recovery for vp explicitly. */ 1352 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1353 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1354 1355 if (e.error || e.stat) { 1356 nfs4_end_open_seqid_sync(oop); 1357 nfs4args_copen_free(open_args); 1358 if (setgid_flag) { 1359 nfs4args_verify_free(&argop[8]); 1360 nfs4args_setattr_free(&argop[9]); 1361 } 1362 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1363 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1364 needrecov); 1365 open_owner_rele(oop); 1366 if (create_flag || fh_differs) { 1367 /* rele the makenfs4node */ 1368 VN_RELE(vp); 1369 } 1370 if (ncr != NULL) { 1371 crfree(ncr); 1372 ncr = NULL; 1373 } 1374 if (retry_open == TRUE) { 1375 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1376 "nfs4open_otw: retry the open since OPEN " 1377 "CONFIRM failed with error %d stat %d", 1378 e.error, e.stat)); 1379 if (create_flag && createmode == GUARDED4) { 1380 NFS4_DEBUG(nfs4_client_recov_debug, 1381 (CE_NOTE, "nfs4open_otw: switch " 1382 "createmode from GUARDED4 to " 1383 "UNCHECKED4")); 1384 createmode = UNCHECKED4; 1385 } 1386 goto recov_retry; 1387 } 1388 if (!e.error) { 1389 if (create_flag && (createmode != EXCLUSIVE4) && 1390 e.stat == NFS4ERR_BADOWNER) 1391 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1392 1393 e.error = geterrno4(e.stat); 1394 } 1395 kmem_free(argop, argoplist_size); 1396 return (e.error); 1397 } 1398 } 1399 1400 rp = VTOR4(vp); 1401 1402 mutex_enter(&rp->r_statev4_lock); 1403 if (create_flag) 1404 rp->created_v4 = 1; 1405 mutex_exit(&rp->r_statev4_lock); 1406 1407 mutex_enter(&oop->oo_lock); 1408 /* Doesn't matter if 'oo_just_created' already was set as this */ 1409 oop->oo_just_created = NFS4_PERM_CREATED; 1410 if (oop->oo_cred_otw) 1411 crfree(oop->oo_cred_otw); 1412 
oop->oo_cred_otw = cred_otw; 1413 crhold(oop->oo_cred_otw); 1414 mutex_exit(&oop->oo_lock); 1415 1416 /* returns with 'os_sync_lock' held */ 1417 osp = find_or_create_open_stream(oop, rp, &created_osp); 1418 if (!osp) { 1419 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1420 "nfs4open_otw: failed to create an open stream")); 1421 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1422 "signal our end of use of the open seqid")); 1423 1424 nfs4_end_open_seqid_sync(oop); 1425 open_owner_rele(oop); 1426 nfs4args_copen_free(open_args); 1427 if (setgid_flag) { 1428 nfs4args_verify_free(&argop[8]); 1429 nfs4args_setattr_free(&argop[9]); 1430 } 1431 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1432 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1433 if (create_flag || fh_differs) 1434 VN_RELE(vp); 1435 if (ncr != NULL) 1436 crfree(ncr); 1437 1438 kmem_free(argop, argoplist_size); 1439 return (EINVAL); 1440 1441 } 1442 1443 osp->open_stateid = op_res->stateid; 1444 1445 if (open_flag & FREAD) 1446 osp->os_share_acc_read++; 1447 if (open_flag & FWRITE) 1448 osp->os_share_acc_write++; 1449 osp->os_share_deny_none++; 1450 1451 /* 1452 * Need to reset this bitfield for the possible case where we were 1453 * going to OTW CLOSE the file, got a non-recoverable error, and before 1454 * we could retry the CLOSE, OPENed the file again. 
1455 */ 1456 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1457 osp->os_final_close = 0; 1458 osp->os_force_close = 0; 1459 #ifdef DEBUG 1460 if (osp->os_failed_reopen) 1461 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1462 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1463 (void *)osp, (void *)cr, rnode4info(rp))); 1464 #endif 1465 osp->os_failed_reopen = 0; 1466 1467 mutex_exit(&osp->os_sync_lock); 1468 1469 nfs4_end_open_seqid_sync(oop); 1470 1471 if (created_osp && recov_state.rs_sp != NULL) { 1472 mutex_enter(&recov_state.rs_sp->s_lock); 1473 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1474 mutex_exit(&recov_state.rs_sp->s_lock); 1475 } 1476 1477 /* get rid of our reference to find oop */ 1478 open_owner_rele(oop); 1479 1480 open_stream_rele(osp, rp); 1481 1482 /* accept delegation, if any */ 1483 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1484 1485 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1486 1487 if (createmode == EXCLUSIVE4 && 1488 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1489 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1490 " EXCLUSIVE4: sending a SETATTR")); 1491 /* 1492 * If doing an exclusive create, then generate 1493 * a SETATTR to set the initial attributes. 1494 * Try to set the mtime and the atime to the 1495 * server's current time. It is somewhat 1496 * expected that these fields will be used to 1497 * store the exclusive create cookie. If not, 1498 * server implementors will need to know that 1499 * a SETATTR will follow an exclusive create 1500 * and the cookie should be destroyed if 1501 * appropriate. 1502 * 1503 * The AT_GID and AT_SIZE bits are turned off 1504 * so that the SETATTR request will not attempt 1505 * to process these. The gid will be set 1506 * separately if appropriate. 
The size is turned 1507 * off because it is assumed that a new file will 1508 * be created empty and if the file wasn't empty, 1509 * then the exclusive create will have failed 1510 * because the file must have existed already. 1511 * Therefore, no truncate operation is needed. 1512 */ 1513 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1514 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1515 1516 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1517 if (e.error) { 1518 /* 1519 * Couldn't correct the attributes of 1520 * the newly created file and the 1521 * attributes are wrong. Remove the 1522 * file and return an error to the 1523 * application. 1524 */ 1525 /* XXX will this take care of client state ? */ 1526 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1527 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1528 " remove file", e.error)); 1529 VN_RELE(vp); 1530 (void) nfs4_remove(dvp, file_name, cr); 1531 goto skip_update_dircaches; 1532 } 1533 } 1534 1535 /* 1536 * If we created or found the correct vnode, due to create_flag or 1537 * fh_differs being set, then update directory cache attribute, readdir 1538 * and dnlc caches. 1539 */ 1540 if (create_flag || fh_differs) { 1541 dirattr_info_t dinfo, *dinfop; 1542 1543 /* 1544 * Make sure getattr succeeded before using results. 1545 * note: op 7 is getattr(dir) for both flavors of 1546 * open(create). 1547 */ 1548 if (create_flag && res.status == NFS4_OK) { 1549 dinfo.di_time_call = t; 1550 dinfo.di_cred = cr; 1551 dinfo.di_garp = 1552 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1553 dinfop = &dinfo; 1554 } else { 1555 dinfop = NULL; 1556 } 1557 1558 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1559 dinfop); 1560 } 1561 skip_update_dircaches: 1562 1563 /* 1564 * If the page cache for this file was flushed from actions 1565 * above, it was done asynchronously and if that is true, 1566 * there is a need to wait here for it to complete. This must 1567 * be done outside of start_fop/end_fop. 
1568 */ 1569 (void) nfs4_waitfor_purge_complete(vp); 1570 1571 /* 1572 * It is implicit that we are in the open case (create_flag == 0) since 1573 * fh_differs can only be set to a non-zero value in the open case. 1574 */ 1575 if (fh_differs != 0 && vpi != NULL) 1576 VN_RELE(vpi); 1577 1578 /* 1579 * Be sure to set *vpp to the correct value before returning. 1580 */ 1581 *vpp = vp; 1582 1583 nfs4args_copen_free(open_args); 1584 if (setgid_flag) { 1585 nfs4args_verify_free(&argop[8]); 1586 nfs4args_setattr_free(&argop[9]); 1587 } 1588 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1589 1590 if (ncr) 1591 crfree(ncr); 1592 kmem_free(argop, argoplist_size); 1593 return (e.error); 1594 } 1595 1596 /* 1597 * Reopen an open instance. cf. nfs4open_otw(). 1598 * 1599 * Errors are returned by the nfs4_error_t parameter. 1600 * - ep->error contains an errno value or zero. 1601 * - if it is zero, ep->stat is set to an NFS status code, if any. 1602 * If the file could not be reopened, but the caller should continue, the 1603 * file is marked dead and no error values are returned. If the caller 1604 * should stop recovering open files and start over, either the ep->error 1605 * value or ep->stat will indicate an error (either something that requires 1606 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile 1607 * filehandles) may be handled silently by this routine. 1608 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1609 * will be started, so the caller should not do it. 1610 * 1611 * Gotos: 1612 * - kill_file : reopen failed in such a fashion to constitute marking the 1613 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1614 * is for cases where recovery is not possible. 1615 * - failed_reopen : same as above, except that the file has already been 1616 * marked dead, so no need to do it again. 
1617 * - bailout : reopen failed but we are able to recover and retry the reopen - 1618 * either within this function immediately or via the calling function. 1619 */ 1620 1621 void 1622 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1623 open_claim_type4 claim, bool_t frc_use_claim_previous, 1624 bool_t is_recov) 1625 { 1626 COMPOUND4args_clnt args; 1627 COMPOUND4res_clnt res; 1628 nfs_argop4 argop[4]; 1629 nfs_resop4 *resop; 1630 OPEN4res *op_res = NULL; 1631 OPEN4cargs *open_args; 1632 GETFH4res *gf_res; 1633 rnode4_t *rp = VTOR4(vp); 1634 int doqueue = 1; 1635 cred_t *cr = NULL, *cred_otw = NULL; 1636 nfs4_open_owner_t *oop = NULL; 1637 seqid4 seqid; 1638 nfs4_ga_res_t *garp; 1639 char fn[MAXNAMELEN]; 1640 nfs4_recov_state_t recov = {NULL, 0}; 1641 nfs4_lost_rqst_t lost_rqst; 1642 mntinfo4_t *mi = VTOMI4(vp); 1643 bool_t abort; 1644 char *failed_msg = ""; 1645 int fh_different; 1646 hrtime_t t; 1647 nfs4_bseqid_entry_t *bsep = NULL; 1648 1649 ASSERT(nfs4_consistent_type(vp)); 1650 ASSERT(nfs_zone() == mi->mi_zone); 1651 1652 nfs4_error_zinit(ep); 1653 1654 /* this is the cred used to find the open owner */ 1655 cr = state_to_cred(osp); 1656 if (cr == NULL) { 1657 failed_msg = "Couldn't reopen: no cred"; 1658 goto kill_file; 1659 } 1660 /* use this cred for OTW operations */ 1661 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1662 1663 top: 1664 nfs4_error_zinit(ep); 1665 1666 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1667 /* File system has been unmounted, quit */ 1668 ep->error = EIO; 1669 failed_msg = "Couldn't reopen: file system has been unmounted"; 1670 goto kill_file; 1671 } 1672 1673 oop = osp->os_open_owner; 1674 1675 ASSERT(oop != NULL); 1676 if (oop == NULL) { /* be defensive in non-DEBUG */ 1677 failed_msg = "can't reopen: no open owner"; 1678 goto kill_file; 1679 } 1680 open_owner_hold(oop); 1681 1682 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1683 if (ep->error) { 1684 open_owner_rele(oop); 1685 oop = NULL; 
1686 goto bailout; 1687 } 1688 1689 /* 1690 * If the rnode has a delegation and the delegation has been 1691 * recovered and the server didn't request a recall and the caller 1692 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1693 * recovery) and the rnode hasn't been marked dead, then install 1694 * the delegation stateid in the open stream. Otherwise, proceed 1695 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1696 */ 1697 mutex_enter(&rp->r_statev4_lock); 1698 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1699 !rp->r_deleg_return_pending && 1700 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1701 !rp->r_deleg_needs_recall && 1702 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1703 !(rp->r_flags & R4RECOVERR)) { 1704 mutex_enter(&osp->os_sync_lock); 1705 osp->os_delegation = 1; 1706 osp->open_stateid = rp->r_deleg_stateid; 1707 mutex_exit(&osp->os_sync_lock); 1708 mutex_exit(&rp->r_statev4_lock); 1709 goto bailout; 1710 } 1711 mutex_exit(&rp->r_statev4_lock); 1712 1713 /* 1714 * If the file failed recovery, just quit. This failure need not 1715 * affect other reopens, so don't return an error. 1716 */ 1717 mutex_enter(&rp->r_statelock); 1718 if (rp->r_flags & R4RECOVERR) { 1719 mutex_exit(&rp->r_statelock); 1720 ep->error = 0; 1721 goto failed_reopen; 1722 } 1723 mutex_exit(&rp->r_statelock); 1724 1725 /* 1726 * argop is empty here 1727 * 1728 * PUTFH, OPEN, GETATTR 1729 */ 1730 args.ctag = TAG_REOPEN; 1731 args.array_len = 4; 1732 args.array = argop; 1733 1734 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1735 "nfs4_reopen: file is type %d, id %s", 1736 vp->v_type, rnode4info(VTOR4(vp)))); 1737 1738 argop[0].argop = OP_CPUTFH; 1739 1740 if (claim != CLAIM_PREVIOUS) { 1741 /* 1742 * if this is a file mount then 1743 * use the mntinfo parentfh 1744 */ 1745 argop[0].nfs_argop4_u.opcputfh.sfh = 1746 (vp->v_flag & VROOT) ? 
mi->mi_srvparentfh : 1747 VTOSV(vp)->sv_dfh; 1748 } else { 1749 /* putfh fh to reopen */ 1750 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1751 } 1752 1753 argop[1].argop = OP_COPEN; 1754 open_args = &argop[1].nfs_argop4_u.opcopen; 1755 open_args->claim = claim; 1756 1757 if (claim == CLAIM_NULL) { 1758 1759 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1760 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1761 "failed for vp 0x%p for CLAIM_NULL with %m", 1762 (void *)vp); 1763 failed_msg = "Couldn't reopen: vtoname failed for " 1764 "CLAIM_NULL"; 1765 /* nothing allocated yet */ 1766 goto kill_file; 1767 } 1768 1769 open_args->open_claim4_u.cfile = fn; 1770 } else if (claim == CLAIM_PREVIOUS) { 1771 1772 /* 1773 * We have two cases to deal with here: 1774 * 1) We're being called to reopen files in order to satisfy 1775 * a lock operation request which requires us to explicitly 1776 * reopen files which were opened under a delegation. If 1777 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1778 * that case, frc_use_claim_previous is TRUE and we must 1779 * use the rnode's current delegation type (r_deleg_type). 1780 * 2) We're reopening files during some form of recovery. 1781 * In this case, frc_use_claim_previous is FALSE and we 1782 * use the delegation type appropriate for recovery 1783 * (r_deleg_needs_recovery). 1784 */ 1785 mutex_enter(&rp->r_statev4_lock); 1786 open_args->open_claim4_u.delegate_type = 1787 frc_use_claim_previous ? 
1788 rp->r_deleg_type : 1789 rp->r_deleg_needs_recovery; 1790 mutex_exit(&rp->r_statev4_lock); 1791 1792 } else if (claim == CLAIM_DELEGATE_CUR) { 1793 1794 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1795 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1796 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1797 "with %m", (void *)vp); 1798 failed_msg = "Couldn't reopen: vtoname failed for " 1799 "CLAIM_DELEGATE_CUR"; 1800 /* nothing allocated yet */ 1801 goto kill_file; 1802 } 1803 1804 mutex_enter(&rp->r_statev4_lock); 1805 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1806 rp->r_deleg_stateid; 1807 mutex_exit(&rp->r_statev4_lock); 1808 1809 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1810 } 1811 open_args->opentype = OPEN4_NOCREATE; 1812 open_args->owner.clientid = mi2clientid(mi); 1813 open_args->owner.owner_len = sizeof (oop->oo_name); 1814 open_args->owner.owner_val = 1815 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1816 bcopy(&oop->oo_name, open_args->owner.owner_val, 1817 open_args->owner.owner_len); 1818 open_args->share_access = 0; 1819 open_args->share_deny = 0; 1820 1821 mutex_enter(&osp->os_sync_lock); 1822 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1823 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1824 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1825 (void *)osp, (void *)rp, osp->os_share_acc_read, 1826 osp->os_share_acc_write, osp->os_open_ref_count, 1827 osp->os_mmap_read, osp->os_mmap_write, claim)); 1828 1829 if (osp->os_share_acc_read || osp->os_mmap_read) 1830 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1831 if (osp->os_share_acc_write || osp->os_mmap_write) 1832 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1833 if (osp->os_share_deny_read) 1834 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1835 if (osp->os_share_deny_write) 1836 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1837 mutex_exit(&osp->os_sync_lock); 1838 1839 
seqid = nfs4_get_open_seqid(oop) + 1; 1840 open_args->seqid = seqid; 1841 1842 /* Construct the getfh part of the compound */ 1843 argop[2].argop = OP_GETFH; 1844 1845 /* Construct the getattr part of the compound */ 1846 argop[3].argop = OP_GETATTR; 1847 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1848 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1849 1850 t = gethrtime(); 1851 1852 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1853 1854 if (ep->error) { 1855 if (!is_recov && !frc_use_claim_previous && 1856 (ep->error == EINTR || ep->error == ETIMEDOUT || 1857 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1858 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1859 cred_otw, vp, NULL, open_args); 1860 abort = nfs4_start_recovery(ep, 1861 VTOMI4(vp), vp, NULL, NULL, 1862 lost_rqst.lr_op == OP_OPEN ? 1863 &lost_rqst : NULL, OP_OPEN, NULL); 1864 nfs4args_copen_free(open_args); 1865 goto bailout; 1866 } 1867 1868 nfs4args_copen_free(open_args); 1869 1870 if (ep->error == EACCES && cred_otw != cr) { 1871 crfree(cred_otw); 1872 cred_otw = cr; 1873 crhold(cred_otw); 1874 nfs4_end_open_seqid_sync(oop); 1875 open_owner_rele(oop); 1876 oop = NULL; 1877 goto top; 1878 } 1879 if (ep->error == ETIMEDOUT) 1880 goto bailout; 1881 failed_msg = "Couldn't reopen: rpc error"; 1882 goto kill_file; 1883 } 1884 1885 if (nfs4_need_to_bump_seqid(&res)) 1886 nfs4_set_open_seqid(seqid, oop, args.ctag); 1887 1888 switch (res.status) { 1889 case NFS4_OK: 1890 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1891 mutex_enter(&rp->r_statelock); 1892 rp->r_delay_interval = 0; 1893 mutex_exit(&rp->r_statelock); 1894 } 1895 break; 1896 case NFS4ERR_BAD_SEQID: 1897 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1898 args.ctag, open_args->seqid); 1899 1900 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1901 NULL, lost_rqst.lr_op == OP_OPEN ? 
&lost_rqst : 1902 NULL, OP_OPEN, bsep); 1903 1904 nfs4args_copen_free(open_args); 1905 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1906 nfs4_end_open_seqid_sync(oop); 1907 open_owner_rele(oop); 1908 oop = NULL; 1909 kmem_free(bsep, sizeof (*bsep)); 1910 1911 goto kill_file; 1912 case NFS4ERR_NO_GRACE: 1913 nfs4args_copen_free(open_args); 1914 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1915 nfs4_end_open_seqid_sync(oop); 1916 open_owner_rele(oop); 1917 oop = NULL; 1918 if (claim == CLAIM_PREVIOUS) { 1919 /* 1920 * Retry as a plain open. We don't need to worry about 1921 * checking the changeinfo: it is acceptable for a 1922 * client to re-open a file and continue processing 1923 * (in the absence of locks). 1924 */ 1925 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1926 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1927 "will retry as CLAIM_NULL")); 1928 claim = CLAIM_NULL; 1929 nfs4_mi_kstat_inc_no_grace(mi); 1930 goto top; 1931 } 1932 failed_msg = 1933 "Couldn't reopen: tried reclaim outside grace period. 
"; 1934 goto kill_file; 1935 case NFS4ERR_GRACE: 1936 nfs4_set_grace_wait(mi); 1937 nfs4args_copen_free(open_args); 1938 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1939 nfs4_end_open_seqid_sync(oop); 1940 open_owner_rele(oop); 1941 oop = NULL; 1942 ep->error = nfs4_wait_for_grace(mi, &recov); 1943 if (ep->error != 0) 1944 goto bailout; 1945 goto top; 1946 case NFS4ERR_DELAY: 1947 nfs4_set_delay_wait(vp); 1948 nfs4args_copen_free(open_args); 1949 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1950 nfs4_end_open_seqid_sync(oop); 1951 open_owner_rele(oop); 1952 oop = NULL; 1953 ep->error = nfs4_wait_for_delay(vp, &recov); 1954 nfs4_mi_kstat_inc_delay(mi); 1955 if (ep->error != 0) 1956 goto bailout; 1957 goto top; 1958 case NFS4ERR_FHEXPIRED: 1959 /* recover filehandle and retry */ 1960 abort = nfs4_start_recovery(ep, 1961 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 1962 nfs4args_copen_free(open_args); 1963 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1964 nfs4_end_open_seqid_sync(oop); 1965 open_owner_rele(oop); 1966 oop = NULL; 1967 if (abort == FALSE) 1968 goto top; 1969 failed_msg = "Couldn't reopen: recovery aborted"; 1970 goto kill_file; 1971 case NFS4ERR_RESOURCE: 1972 case NFS4ERR_STALE_CLIENTID: 1973 case NFS4ERR_WRONGSEC: 1974 case NFS4ERR_EXPIRED: 1975 /* 1976 * Do not mark the file dead and let the calling 1977 * function initiate recovery. 
1978 */ 1979 nfs4args_copen_free(open_args); 1980 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1981 nfs4_end_open_seqid_sync(oop); 1982 open_owner_rele(oop); 1983 oop = NULL; 1984 goto bailout; 1985 case NFS4ERR_ACCESS: 1986 if (cred_otw != cr) { 1987 crfree(cred_otw); 1988 cred_otw = cr; 1989 crhold(cred_otw); 1990 nfs4args_copen_free(open_args); 1991 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1992 nfs4_end_open_seqid_sync(oop); 1993 open_owner_rele(oop); 1994 oop = NULL; 1995 goto top; 1996 } 1997 /* fall through */ 1998 default: 1999 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2000 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2001 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2002 rnode4info(VTOR4(vp)))); 2003 failed_msg = "Couldn't reopen: NFSv4 error"; 2004 nfs4args_copen_free(open_args); 2005 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2006 goto kill_file; 2007 } 2008 2009 resop = &res.array[1]; /* open res */ 2010 op_res = &resop->nfs_resop4_u.opopen; 2011 2012 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2013 2014 /* 2015 * Check if the path we reopened really is the same 2016 * file. We could end up in a situation where the file 2017 * was removed and a new file created with the same name. 
2018 */ 2019 resop = &res.array[2]; 2020 gf_res = &resop->nfs_resop4_u.opgetfh; 2021 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2022 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2023 if (fh_different) { 2024 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2025 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2026 /* Oops, we don't have the same file */ 2027 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2028 failed_msg = "Couldn't reopen: Persistent " 2029 "file handle changed"; 2030 else 2031 failed_msg = "Couldn't reopen: Volatile " 2032 "(no expire on open) file handle changed"; 2033 2034 nfs4args_copen_free(open_args); 2035 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2036 nfs_rw_exit(&mi->mi_fh_lock); 2037 goto kill_file; 2038 2039 } else { 2040 /* 2041 * We have volatile file handles that don't compare. 2042 * If the fids are the same then we assume that the 2043 * file handle expired but the rnode still refers to 2044 * the same file object. 2045 * 2046 * First check that we have fids or not. 2047 * If we don't we have a dumb server so we will 2048 * just assume every thing is ok for now. 2049 */ 2050 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2051 rp->r_attr.va_mask & AT_NODEID && 2052 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2053 /* 2054 * We have fids, but they don't 2055 * compare. So kill the file. 2056 */ 2057 failed_msg = 2058 "Couldn't reopen: file handle changed" 2059 " due to mismatched fids"; 2060 nfs4args_copen_free(open_args); 2061 (void) xdr_free(xdr_COMPOUND4res_clnt, 2062 (caddr_t)&res); 2063 nfs_rw_exit(&mi->mi_fh_lock); 2064 goto kill_file; 2065 } else { 2066 /* 2067 * We have volatile file handles that refers 2068 * to the same file (at least they have the 2069 * same fid) or we don't have fids so we 2070 * can't tell. :(. We'll be a kind and accepting 2071 * client so we'll update the rnode's file 2072 * handle with the otw handle. 
2073 * 2074 * We need to drop mi->mi_fh_lock since 2075 * sh4_update acquires it. Since there is 2076 * only one recovery thread there is no 2077 * race. 2078 */ 2079 nfs_rw_exit(&mi->mi_fh_lock); 2080 sfh4_update(rp->r_fh, &gf_res->object); 2081 } 2082 } 2083 } else { 2084 nfs_rw_exit(&mi->mi_fh_lock); 2085 } 2086 2087 ASSERT(nfs4_consistent_type(vp)); 2088 2089 /* 2090 * If the server wanted an OPEN_CONFIRM but that fails, just start 2091 * over. Presumably if there is a persistent error it will show up 2092 * when we resend the OPEN. 2093 */ 2094 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2095 bool_t retry_open = FALSE; 2096 2097 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2098 cred_otw, is_recov, &retry_open, 2099 oop, FALSE, ep, NULL); 2100 if (ep->error || ep->stat) { 2101 nfs4args_copen_free(open_args); 2102 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2103 nfs4_end_open_seqid_sync(oop); 2104 open_owner_rele(oop); 2105 oop = NULL; 2106 goto top; 2107 } 2108 } 2109 2110 mutex_enter(&osp->os_sync_lock); 2111 osp->open_stateid = op_res->stateid; 2112 osp->os_delegation = 0; 2113 /* 2114 * Need to reset this bitfield for the possible case where we were 2115 * going to OTW CLOSE the file, got a non-recoverable error, and before 2116 * we could retry the CLOSE, OPENed the file again. 
2117 */ 2118 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2119 osp->os_final_close = 0; 2120 osp->os_force_close = 0; 2121 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2122 osp->os_dc_openacc = open_args->share_access; 2123 mutex_exit(&osp->os_sync_lock); 2124 2125 nfs4_end_open_seqid_sync(oop); 2126 2127 /* accept delegation, if any */ 2128 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2129 2130 nfs4args_copen_free(open_args); 2131 2132 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2133 2134 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2135 2136 ASSERT(nfs4_consistent_type(vp)); 2137 2138 open_owner_rele(oop); 2139 crfree(cr); 2140 crfree(cred_otw); 2141 return; 2142 2143 kill_file: 2144 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2145 failed_reopen: 2146 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2147 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2148 (void *)osp, (void *)cr, rnode4info(rp))); 2149 mutex_enter(&osp->os_sync_lock); 2150 osp->os_failed_reopen = 1; 2151 mutex_exit(&osp->os_sync_lock); 2152 bailout: 2153 if (oop != NULL) { 2154 nfs4_end_open_seqid_sync(oop); 2155 open_owner_rele(oop); 2156 } 2157 if (cr != NULL) 2158 crfree(cr); 2159 if (cred_otw != NULL) 2160 crfree(cred_otw); 2161 } 2162 2163 /* for . and .. OPENs */ 2164 /* ARGSUSED */ 2165 static int 2166 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2167 { 2168 rnode4_t *rp; 2169 nfs4_ga_res_t gar; 2170 2171 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2172 2173 /* 2174 * If close-to-open consistency checking is turned off or 2175 * if there is no cached data, we can avoid 2176 * the over the wire getattr. Otherwise, force a 2177 * call to the server to get fresh attributes and to 2178 * check caches. This is required for close-to-open 2179 * consistency. 
2180 */ 2181 rp = VTOR4(*vpp); 2182 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2183 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2184 return (0); 2185 2186 gar.n4g_va.va_mask = AT_ALL; 2187 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2188 } 2189 2190 /* 2191 * CLOSE a file 2192 */ 2193 static int 2194 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) 2195 { 2196 rnode4_t *rp; 2197 int error = 0; 2198 int r_error = 0; 2199 int n4error = 0; 2200 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2201 2202 /* 2203 * Remove client state for this (lockowner, file) pair. 2204 * Issue otw v4 call to have the server do the same. 2205 */ 2206 2207 rp = VTOR4(vp); 2208 2209 /* 2210 * zone_enter(2) prevents processes from changing zones with NFS files 2211 * open; if we happen to get here from the wrong zone we can't do 2212 * anything over the wire. 2213 */ 2214 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2215 /* 2216 * We could attempt to clean up locks, except we're sure 2217 * that the current process didn't acquire any locks on 2218 * the file: any attempt to lock a file belong to another zone 2219 * will fail, and one can't lock an NFS file and then change 2220 * zones, as that fails too. 2221 * 2222 * Returning an error here is the sane thing to do. A 2223 * subsequent call to VN_RELE() which translates to a 2224 * nfs4_inactive() will clean up state: if the zone of the 2225 * vnode's origin is still alive and kicking, the inactive 2226 * thread will handle the request (from the correct zone), and 2227 * everything (minus the OTW close call) should be OK. If the 2228 * zone is going away nfs4_async_inactive() will throw away 2229 * delegations, open streams and cached pages inline. 2230 */ 2231 return (EIO); 2232 } 2233 2234 /* 2235 * If we are using local locking for this filesystem, then 2236 * release all of the SYSV style record locks. Otherwise, 2237 * we are doing network locking and we need to release all 2238 * of the network locks. 
All of the locks held by this 2239 * process on this file are released no matter what the 2240 * incoming reference count is. 2241 */ 2242 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2243 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2244 cleanshares(vp, ttoproc(curthread)->p_pid); 2245 } else 2246 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2247 2248 if (e.error) 2249 return (e.error); 2250 2251 if (count > 1) 2252 return (0); 2253 2254 /* 2255 * If the file has been `unlinked', then purge the 2256 * DNLC so that this vnode will get reycled quicker 2257 * and the .nfs* file on the server will get removed. 2258 */ 2259 if (rp->r_unldvp != NULL) 2260 dnlc_purge_vp(vp); 2261 2262 /* 2263 * If the file was open for write and there are pages, 2264 * do a synchronous flush and commit of all of the 2265 * dirty and uncommitted pages. 2266 */ 2267 ASSERT(!e.error); 2268 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2269 error = nfs4_putpage_commit(vp, 0, 0, cr); 2270 2271 mutex_enter(&rp->r_statelock); 2272 r_error = rp->r_error; 2273 rp->r_error = 0; 2274 mutex_exit(&rp->r_statelock); 2275 2276 /* 2277 * If this file type is one for which no explicit 'open' was 2278 * done, then bail now (ie. no need for protocol 'close'). If 2279 * there was an error w/the vm subsystem, return _that_ error, 2280 * otherwise, return any errors that may've been reported via 2281 * the rnode. 2282 */ 2283 if (vp->v_type != VREG) 2284 return (error ? error : r_error); 2285 2286 /* 2287 * The sync putpage commit may have failed above, but since 2288 * we're working w/a regular file, we need to do the protocol 2289 * 'close' (nfs4close_one will figure out if an otw close is 2290 * needed or not). Report any errors _after_ doing the protocol 2291 * 'close'. 2292 */ 2293 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2294 n4error = e.error ? 
e.error : geterrno4(e.stat); 2295 2296 /* 2297 * Error reporting prio (Hi -> Lo) 2298 * 2299 * i) nfs4_putpage_commit (error) 2300 * ii) rnode's (r_error) 2301 * iii) nfs4close_one (n4error) 2302 */ 2303 return (error ? error : (r_error ? r_error : n4error)); 2304 } 2305 2306 /* 2307 * Initialize *lost_rqstp. 2308 */ 2309 2310 static void 2311 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2312 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2313 vnode_t *vp) 2314 { 2315 if (error != ETIMEDOUT && error != EINTR && 2316 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2317 lost_rqstp->lr_op = 0; 2318 return; 2319 } 2320 2321 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2322 "nfs4close_save_lost_rqst: error %d", error)); 2323 2324 lost_rqstp->lr_op = OP_CLOSE; 2325 /* 2326 * The vp is held and rele'd via the recovery code. 2327 * See nfs4_save_lost_rqst. 2328 */ 2329 lost_rqstp->lr_vp = vp; 2330 lost_rqstp->lr_dvp = NULL; 2331 lost_rqstp->lr_oop = oop; 2332 lost_rqstp->lr_osp = osp; 2333 ASSERT(osp != NULL); 2334 ASSERT(mutex_owned(&osp->os_sync_lock)); 2335 osp->os_pending_close = 1; 2336 lost_rqstp->lr_lop = NULL; 2337 lost_rqstp->lr_cr = cr; 2338 lost_rqstp->lr_flk = NULL; 2339 lost_rqstp->lr_putfirst = FALSE; 2340 } 2341 2342 /* 2343 * Assumes you already have the open seqid sync grabbed as well as the 2344 * 'os_sync_lock'. Note: this will release the open seqid sync and 2345 * 'os_sync_lock' if client recovery starts. Calling functions have to 2346 * be prepared to handle this. 2347 * 2348 * 'recov' is returned as 1 if the CLOSE operation detected client recovery 2349 * was needed and was started, and that the calling function should retry 2350 * this function; otherwise it is returned as 0. 2351 * 2352 * Errors are returned via the nfs4_error_t parameter. 
2353 */ 2354 static void 2355 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2356 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2357 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2358 { 2359 COMPOUND4args_clnt args; 2360 COMPOUND4res_clnt res; 2361 CLOSE4args *close_args; 2362 nfs_resop4 *resop; 2363 nfs_argop4 argop[3]; 2364 int doqueue = 1; 2365 mntinfo4_t *mi; 2366 seqid4 seqid; 2367 vnode_t *vp; 2368 bool_t needrecov = FALSE; 2369 nfs4_lost_rqst_t lost_rqst; 2370 hrtime_t t; 2371 2372 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2373 2374 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2375 2376 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2377 2378 /* Only set this to 1 if recovery is started */ 2379 *recov = 0; 2380 2381 /* do the OTW call to close the file */ 2382 2383 if (close_type == CLOSE_RESEND) 2384 args.ctag = TAG_CLOSE_LOST; 2385 else if (close_type == CLOSE_AFTER_RESEND) 2386 args.ctag = TAG_CLOSE_UNDO; 2387 else 2388 args.ctag = TAG_CLOSE; 2389 2390 args.array_len = 3; 2391 args.array = argop; 2392 2393 vp = RTOV4(rp); 2394 2395 mi = VTOMI4(vp); 2396 2397 /* putfh target fh */ 2398 argop[0].argop = OP_CPUTFH; 2399 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2400 2401 argop[1].argop = OP_GETATTR; 2402 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2403 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2404 2405 argop[2].argop = OP_CLOSE; 2406 close_args = &argop[2].nfs_argop4_u.opclose; 2407 2408 seqid = nfs4_get_open_seqid(oop) + 1; 2409 2410 close_args->seqid = seqid; 2411 close_args->open_stateid = osp->open_stateid; 2412 2413 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2414 "nfs4close_otw: %s call, rp %s", needrecov ? 
"recov" : "first", 2415 rnode4info(rp))); 2416 2417 t = gethrtime(); 2418 2419 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2420 2421 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2422 nfs4_set_open_seqid(seqid, oop, args.ctag); 2423 } 2424 2425 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2426 if (ep->error && !needrecov) { 2427 /* 2428 * if there was an error and no recovery is to be done 2429 * then then set up the file to flush its cache if 2430 * needed for the next caller. 2431 */ 2432 mutex_enter(&rp->r_statelock); 2433 PURGE_ATTRCACHE4_LOCKED(rp); 2434 rp->r_flags &= ~R4WRITEMODIFIED; 2435 mutex_exit(&rp->r_statelock); 2436 return; 2437 } 2438 2439 if (needrecov) { 2440 bool_t abort; 2441 nfs4_bseqid_entry_t *bsep = NULL; 2442 2443 if (close_type != CLOSE_RESEND) 2444 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2445 osp, cred_otw, vp); 2446 2447 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2448 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2449 0, args.ctag, close_args->seqid); 2450 2451 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2452 "nfs4close_otw: initiating recovery. error %d " 2453 "res.status %d", ep->error, res.status)); 2454 2455 /* 2456 * Drop the 'os_sync_lock' here so we don't hit 2457 * a potential recursive mutex_enter via an 2458 * 'open_stream_hold()'. 2459 */ 2460 mutex_exit(&osp->os_sync_lock); 2461 *have_sync_lockp = 0; 2462 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2463 (close_type != CLOSE_RESEND && 2464 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL, 2465 OP_CLOSE, bsep); 2466 2467 /* drop open seq sync, and let the calling function regrab it */ 2468 nfs4_end_open_seqid_sync(oop); 2469 *did_start_seqid_syncp = 0; 2470 2471 if (bsep) 2472 kmem_free(bsep, sizeof (*bsep)); 2473 /* 2474 * For signals, the caller wants to quit, so don't say to 2475 * retry. For forced unmount, if it's a user thread, it 2476 * wants to quit. 
If it's a recovery thread, the retry 2477 * will happen higher-up on the call stack. Either way, 2478 * don't say to retry. 2479 */ 2480 if (abort == FALSE && ep->error != EINTR && 2481 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2482 close_type != CLOSE_RESEND && 2483 close_type != CLOSE_AFTER_RESEND) 2484 *recov = 1; 2485 else 2486 *recov = 0; 2487 2488 if (!ep->error) 2489 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2490 return; 2491 } 2492 2493 if (res.status) { 2494 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2495 return; 2496 } 2497 2498 mutex_enter(&rp->r_statev4_lock); 2499 rp->created_v4 = 0; 2500 mutex_exit(&rp->r_statev4_lock); 2501 2502 resop = &res.array[2]; 2503 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2504 osp->os_valid = 0; 2505 2506 /* 2507 * This removes the reference obtained at OPEN; ie, when the 2508 * open stream structure was created. 2509 * 2510 * We don't have to worry about calling 'open_stream_rele' 2511 * since we our currently holding a reference to the open 2512 * stream which means the count cannot go to 0 with this 2513 * decrement. 
2514 */ 2515 ASSERT(osp->os_ref_count >= 2); 2516 osp->os_ref_count--; 2517 2518 if (!ep->error) 2519 nfs4_attr_cache(vp, 2520 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2521 t, cred_otw, TRUE, NULL); 2522 2523 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2524 " returning %d", ep->error)); 2525 2526 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2527 } 2528 2529 /* ARGSUSED */ 2530 static int 2531 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2532 caller_context_t *ct) 2533 { 2534 rnode4_t *rp; 2535 u_offset_t off; 2536 offset_t diff; 2537 uint_t on; 2538 uint_t n; 2539 caddr_t base; 2540 uint_t flags; 2541 int error; 2542 mntinfo4_t *mi; 2543 2544 rp = VTOR4(vp); 2545 2546 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2547 2548 if (IS_SHADOW(vp, rp)) 2549 vp = RTOV4(rp); 2550 2551 if (vp->v_type != VREG) 2552 return (EISDIR); 2553 2554 mi = VTOMI4(vp); 2555 2556 if (nfs_zone() != mi->mi_zone) 2557 return (EIO); 2558 2559 if (uiop->uio_resid == 0) 2560 return (0); 2561 2562 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2563 return (EINVAL); 2564 2565 mutex_enter(&rp->r_statelock); 2566 if (rp->r_flags & R4RECOVERRP) 2567 error = (rp->r_error ? rp->r_error : EIO); 2568 else 2569 error = 0; 2570 mutex_exit(&rp->r_statelock); 2571 if (error) 2572 return (error); 2573 2574 /* 2575 * Bypass VM if caching has been disabled (e.g., locking) or if 2576 * using client-side direct I/O and the file is not mmap'd and 2577 * there are no cached pages. 
2578 */ 2579 if ((vp->v_flag & VNOCACHE) || 2580 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2581 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2582 size_t resid = 0; 2583 2584 return (nfs4read(vp, NULL, uiop->uio_loffset, 2585 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2586 } 2587 2588 error = 0; 2589 2590 do { 2591 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2592 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2593 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2594 2595 if (error = nfs4_validate_caches(vp, cr)) 2596 break; 2597 2598 mutex_enter(&rp->r_statelock); 2599 diff = rp->r_size - uiop->uio_loffset; 2600 mutex_exit(&rp->r_statelock); 2601 if (diff <= 0) 2602 break; 2603 if (diff < n) 2604 n = (uint_t)diff; 2605 2606 if (vpm_enable) { 2607 /* 2608 * Copy data. 2609 */ 2610 error = vpm_data_copy(vp, off + on, n, uiop, 2611 1, NULL, 0, S_READ); 2612 2613 } else { 2614 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2615 S_READ); 2616 2617 error = uiomove(base + on, n, UIO_READ, uiop); 2618 } 2619 2620 if (!error) { 2621 /* 2622 * If read a whole block or read to eof, 2623 * won't need this buffer again soon. 
2624 */ 2625 mutex_enter(&rp->r_statelock); 2626 if (n + on == MAXBSIZE || 2627 uiop->uio_loffset == rp->r_size) 2628 flags = SM_DONTNEED; 2629 else 2630 flags = 0; 2631 mutex_exit(&rp->r_statelock); 2632 if (vpm_enable) { 2633 error = vpm_sync_pages(vp, off, n, flags); 2634 } else { 2635 error = segmap_release(segkmap, base, flags); 2636 } 2637 } else { 2638 if (vpm_enable) { 2639 (void) vpm_sync_pages(vp, off, n, 0); 2640 } else { 2641 (void) segmap_release(segkmap, base, 0); 2642 } 2643 } 2644 } while (!error && uiop->uio_resid > 0); 2645 2646 return (error); 2647 } 2648 2649 /* ARGSUSED */ 2650 static int 2651 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2652 caller_context_t *ct) 2653 { 2654 rlim64_t limit = uiop->uio_llimit; 2655 rnode4_t *rp; 2656 u_offset_t off; 2657 caddr_t base; 2658 uint_t flags; 2659 int remainder; 2660 size_t n; 2661 int on; 2662 int error; 2663 int resid; 2664 u_offset_t offset; 2665 mntinfo4_t *mi; 2666 uint_t bsize; 2667 2668 rp = VTOR4(vp); 2669 2670 if (IS_SHADOW(vp, rp)) 2671 vp = RTOV4(rp); 2672 2673 if (vp->v_type != VREG) 2674 return (EISDIR); 2675 2676 mi = VTOMI4(vp); 2677 2678 if (nfs_zone() != mi->mi_zone) 2679 return (EIO); 2680 2681 if (uiop->uio_resid == 0) 2682 return (0); 2683 2684 mutex_enter(&rp->r_statelock); 2685 if (rp->r_flags & R4RECOVERRP) 2686 error = (rp->r_error ? rp->r_error : EIO); 2687 else 2688 error = 0; 2689 mutex_exit(&rp->r_statelock); 2690 if (error) 2691 return (error); 2692 2693 if (ioflag & FAPPEND) { 2694 struct vattr va; 2695 2696 /* 2697 * Must serialize if appending. 
2698 */ 2699 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2700 nfs_rw_exit(&rp->r_rwlock); 2701 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2702 INTR(vp))) 2703 return (EINTR); 2704 } 2705 2706 va.va_mask = AT_SIZE; 2707 error = nfs4getattr(vp, &va, cr); 2708 if (error) 2709 return (error); 2710 uiop->uio_loffset = va.va_size; 2711 } 2712 2713 offset = uiop->uio_loffset + uiop->uio_resid; 2714 2715 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2716 return (EINVAL); 2717 2718 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2719 limit = MAXOFFSET_T; 2720 2721 /* 2722 * Check to make sure that the process will not exceed 2723 * its limit on file size. It is okay to write up to 2724 * the limit, but not beyond. Thus, the write which 2725 * reaches the limit will be short and the next write 2726 * will return an error. 2727 */ 2728 remainder = 0; 2729 if (offset > uiop->uio_llimit) { 2730 remainder = offset - uiop->uio_llimit; 2731 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2732 if (uiop->uio_resid <= 0) { 2733 proc_t *p = ttoproc(curthread); 2734 2735 uiop->uio_resid += remainder; 2736 mutex_enter(&p->p_lock); 2737 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2738 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2739 mutex_exit(&p->p_lock); 2740 return (EFBIG); 2741 } 2742 } 2743 2744 /* update the change attribute, if we have a write delegation */ 2745 2746 mutex_enter(&rp->r_statev4_lock); 2747 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2748 rp->r_deleg_change++; 2749 2750 mutex_exit(&rp->r_statev4_lock); 2751 2752 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2753 return (EINTR); 2754 2755 /* 2756 * Bypass VM if caching has been disabled (e.g., locking) or if 2757 * using client-side direct I/O and the file is not mmap'd and 2758 * there are no cached pages. 
2759 */ 2760 if ((vp->v_flag & VNOCACHE) || 2761 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2762 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2763 size_t bufsize; 2764 int count; 2765 u_offset_t org_offset; 2766 stable_how4 stab_comm; 2767 nfs4_fwrite: 2768 if (rp->r_flags & R4STALE) { 2769 resid = uiop->uio_resid; 2770 offset = uiop->uio_loffset; 2771 error = rp->r_error; 2772 goto bottom; 2773 } 2774 2775 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2776 base = kmem_alloc(bufsize, KM_SLEEP); 2777 do { 2778 if (ioflag & FDSYNC) 2779 stab_comm = DATA_SYNC4; 2780 else 2781 stab_comm = FILE_SYNC4; 2782 resid = uiop->uio_resid; 2783 offset = uiop->uio_loffset; 2784 count = MIN(uiop->uio_resid, bufsize); 2785 org_offset = uiop->uio_loffset; 2786 error = uiomove(base, count, UIO_WRITE, uiop); 2787 if (!error) { 2788 error = nfs4write(vp, base, org_offset, 2789 count, cr, &stab_comm); 2790 if (!error) { 2791 mutex_enter(&rp->r_statelock); 2792 if (rp->r_size < uiop->uio_loffset) 2793 rp->r_size = uiop->uio_loffset; 2794 mutex_exit(&rp->r_statelock); 2795 } 2796 } 2797 } while (!error && uiop->uio_resid > 0); 2798 kmem_free(base, bufsize); 2799 goto bottom; 2800 } 2801 2802 bsize = vp->v_vfsp->vfs_bsize; 2803 2804 do { 2805 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2806 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2807 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2808 2809 resid = uiop->uio_resid; 2810 offset = uiop->uio_loffset; 2811 2812 if (rp->r_flags & R4STALE) { 2813 error = rp->r_error; 2814 break; 2815 } 2816 2817 /* 2818 * Don't create dirty pages faster than they 2819 * can be cleaned so that the system doesn't 2820 * get imbalanced. If the async queue is 2821 * maxed out, then wait for it to drain before 2822 * creating more dirty pages. Also, wait for 2823 * any threads doing pagewalks in the vop_getattr 2824 * entry points so that they don't block for 2825 * long periods. 
2826 */ 2827 mutex_enter(&rp->r_statelock); 2828 while ((mi->mi_max_threads != 0 && 2829 rp->r_awcount > 2 * mi->mi_max_threads) || 2830 rp->r_gcount > 0) 2831 cv_wait(&rp->r_cv, &rp->r_statelock); 2832 mutex_exit(&rp->r_statelock); 2833 2834 if (vpm_enable) { 2835 /* 2836 * It will use kpm mappings, so no need to 2837 * pass an address. 2838 */ 2839 error = writerp4(rp, NULL, n, uiop, 0); 2840 } else { 2841 if (segmap_kpm) { 2842 int pon = uiop->uio_loffset & PAGEOFFSET; 2843 size_t pn = MIN(PAGESIZE - pon, 2844 uiop->uio_resid); 2845 int pagecreate; 2846 2847 mutex_enter(&rp->r_statelock); 2848 pagecreate = (pon == 0) && (pn == PAGESIZE || 2849 uiop->uio_loffset + pn >= rp->r_size); 2850 mutex_exit(&rp->r_statelock); 2851 2852 base = segmap_getmapflt(segkmap, vp, off + on, 2853 pn, !pagecreate, S_WRITE); 2854 2855 error = writerp4(rp, base + pon, n, uiop, 2856 pagecreate); 2857 2858 } else { 2859 base = segmap_getmapflt(segkmap, vp, off + on, 2860 n, 0, S_READ); 2861 error = writerp4(rp, base + on, n, uiop, 0); 2862 } 2863 } 2864 2865 if (!error) { 2866 if (mi->mi_flags & MI4_NOAC) 2867 flags = SM_WRITE; 2868 else if ((uiop->uio_loffset % bsize) == 0 || 2869 IS_SWAPVP(vp)) { 2870 /* 2871 * Have written a whole block. 2872 * Start an asynchronous write 2873 * and mark the buffer to 2874 * indicate that it won't be 2875 * needed again soon. 
2876 */ 2877 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2878 } else 2879 flags = 0; 2880 if ((ioflag & (FSYNC|FDSYNC)) || 2881 (rp->r_flags & R4OUTOFSPACE)) { 2882 flags &= ~SM_ASYNC; 2883 flags |= SM_WRITE; 2884 } 2885 if (vpm_enable) { 2886 error = vpm_sync_pages(vp, off, n, flags); 2887 } else { 2888 error = segmap_release(segkmap, base, flags); 2889 } 2890 } else { 2891 if (vpm_enable) { 2892 (void) vpm_sync_pages(vp, off, n, 0); 2893 } else { 2894 (void) segmap_release(segkmap, base, 0); 2895 } 2896 /* 2897 * In the event that we got an access error while 2898 * faulting in a page for a write-only file just 2899 * force a write. 2900 */ 2901 if (error == EACCES) 2902 goto nfs4_fwrite; 2903 } 2904 } while (!error && uiop->uio_resid > 0); 2905 2906 bottom: 2907 if (error) { 2908 uiop->uio_resid = resid + remainder; 2909 uiop->uio_loffset = offset; 2910 } else { 2911 uiop->uio_resid += remainder; 2912 2913 mutex_enter(&rp->r_statev4_lock); 2914 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2915 gethrestime(&rp->r_attr.va_mtime); 2916 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2917 } 2918 mutex_exit(&rp->r_statev4_lock); 2919 } 2920 2921 nfs_rw_exit(&rp->r_lkserlock); 2922 2923 return (error); 2924 } 2925 2926 /* 2927 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2928 */ 2929 static int 2930 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2931 int flags, cred_t *cr) 2932 { 2933 struct buf *bp; 2934 int error; 2935 page_t *savepp; 2936 uchar_t fsdata; 2937 stable_how4 stab_comm; 2938 2939 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 2940 bp = pageio_setup(pp, len, vp, flags); 2941 ASSERT(bp != NULL); 2942 2943 /* 2944 * pageio_setup should have set b_addr to 0. This 2945 * is correct since we want to do I/O on a page 2946 * boundary. bp_mapin will use this addr to calculate 2947 * an offset, and then set b_addr to the kernel virtual 2948 * address it allocated for us. 
2949 */ 2950 ASSERT(bp->b_un.b_addr == 0); 2951 2952 bp->b_edev = 0; 2953 bp->b_dev = 0; 2954 bp->b_lblkno = lbtodb(off); 2955 bp->b_file = vp; 2956 bp->b_offset = (offset_t)off; 2957 bp_mapin(bp); 2958 2959 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 2960 freemem > desfree) 2961 stab_comm = UNSTABLE4; 2962 else 2963 stab_comm = FILE_SYNC4; 2964 2965 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 2966 2967 bp_mapout(bp); 2968 pageio_done(bp); 2969 2970 if (stab_comm == UNSTABLE4) 2971 fsdata = C_DELAYCOMMIT; 2972 else 2973 fsdata = C_NOCOMMIT; 2974 2975 savepp = pp; 2976 do { 2977 pp->p_fsdata = fsdata; 2978 } while ((pp = pp->p_next) != savepp); 2979 2980 return (error); 2981 } 2982 2983 /* 2984 */ 2985 static int 2986 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 2987 { 2988 nfs4_open_owner_t *oop; 2989 nfs4_open_stream_t *osp; 2990 rnode4_t *rp = VTOR4(vp); 2991 mntinfo4_t *mi = VTOMI4(vp); 2992 int reopen_needed; 2993 2994 ASSERT(nfs_zone() == mi->mi_zone); 2995 2996 2997 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 2998 if (!oop) 2999 return (EIO); 3000 3001 /* returns with 'os_sync_lock' held */ 3002 osp = find_open_stream(oop, rp); 3003 if (!osp) { 3004 open_owner_rele(oop); 3005 return (EIO); 3006 } 3007 3008 if (osp->os_failed_reopen) { 3009 mutex_exit(&osp->os_sync_lock); 3010 open_stream_rele(osp, rp); 3011 open_owner_rele(oop); 3012 return (EIO); 3013 } 3014 3015 /* 3016 * Determine whether a reopen is needed. If this 3017 * is a delegation open stream, then the os_delegation bit 3018 * should be set. 
3019 */ 3020 3021 reopen_needed = osp->os_delegation; 3022 3023 mutex_exit(&osp->os_sync_lock); 3024 open_owner_rele(oop); 3025 3026 if (reopen_needed) { 3027 nfs4_error_zinit(ep); 3028 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3029 mutex_enter(&osp->os_sync_lock); 3030 if (ep->error || ep->stat || osp->os_failed_reopen) { 3031 mutex_exit(&osp->os_sync_lock); 3032 open_stream_rele(osp, rp); 3033 return (EIO); 3034 } 3035 mutex_exit(&osp->os_sync_lock); 3036 } 3037 open_stream_rele(osp, rp); 3038 3039 return (0); 3040 } 3041 3042 /* 3043 * Write to file. Writes to remote server in largest size 3044 * chunks that the server can handle. Write is synchronous. 3045 */ 3046 static int 3047 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3048 stable_how4 *stab_comm) 3049 { 3050 mntinfo4_t *mi; 3051 COMPOUND4args_clnt args; 3052 COMPOUND4res_clnt res; 3053 WRITE4args *wargs; 3054 WRITE4res *wres; 3055 nfs_argop4 argop[2]; 3056 nfs_resop4 *resop; 3057 int tsize; 3058 stable_how4 stable; 3059 rnode4_t *rp; 3060 int doqueue = 1; 3061 bool_t needrecov; 3062 nfs4_recov_state_t recov_state; 3063 nfs4_stateid_types_t sid_types; 3064 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3065 3066 rp = VTOR4(vp); 3067 mi = VTOMI4(vp); 3068 3069 ASSERT(nfs_zone() == mi->mi_zone); 3070 3071 stable = *stab_comm; 3072 *stab_comm = FILE_SYNC4; 3073 3074 needrecov = FALSE; 3075 recov_state.rs_flags = 0; 3076 recov_state.rs_num_retry_despite_err = 0; 3077 nfs4_init_stateid_types(&sid_types); 3078 3079 recov_retry: 3080 args.ctag = TAG_WRITE; 3081 args.array_len = 2; 3082 args.array = argop; 3083 3084 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3085 &recov_state, NULL); 3086 if (e.error) 3087 return (e.error); 3088 3089 /* 0. putfh target fh */ 3090 argop[0].argop = OP_CPUTFH; 3091 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3092 3093 /* 1. 
write */ 3094 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3095 3096 do { 3097 3098 wargs->offset = (offset4)offset; 3099 wargs->data_val = base; 3100 3101 if (mi->mi_io_kstats) { 3102 mutex_enter(&mi->mi_lock); 3103 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3104 mutex_exit(&mi->mi_lock); 3105 } 3106 3107 if ((vp->v_flag & VNOCACHE) || 3108 (rp->r_flags & R4DIRECTIO) || 3109 (mi->mi_flags & MI4_DIRECTIO)) 3110 tsize = MIN(mi->mi_stsize, count); 3111 else 3112 tsize = MIN(mi->mi_curwrite, count); 3113 wargs->data_len = (uint_t)tsize; 3114 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3115 3116 if (mi->mi_io_kstats) { 3117 mutex_enter(&mi->mi_lock); 3118 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3119 mutex_exit(&mi->mi_lock); 3120 } 3121 3122 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3123 if (e.error && !needrecov) { 3124 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3125 &recov_state, needrecov); 3126 return (e.error); 3127 } 3128 3129 3130 /* 3131 * Do handling of OLD_STATEID outside 3132 * of the normal recovery framework. 3133 * 3134 * If write receives a BAD stateid error while using a 3135 * delegation stateid, retry using the open stateid (if it 3136 * exists). If it doesn't have an open stateid, reopen the 3137 * file first, then retry. 
3138 */ 3139 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3140 sid_types.cur_sid_type != SPEC_SID) { 3141 nfs4_save_stateid(&wargs->stateid, &sid_types); 3142 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3143 &recov_state, needrecov); 3144 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3145 goto recov_retry; 3146 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3147 sid_types.cur_sid_type == DEL_SID) { 3148 nfs4_save_stateid(&wargs->stateid, &sid_types); 3149 mutex_enter(&rp->r_statev4_lock); 3150 rp->r_deleg_return_pending = TRUE; 3151 mutex_exit(&rp->r_statev4_lock); 3152 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3153 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3154 &recov_state, needrecov); 3155 (void) xdr_free(xdr_COMPOUND4res_clnt, 3156 (caddr_t)&res); 3157 return (EIO); 3158 } 3159 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3160 &recov_state, needrecov); 3161 /* hold needed for nfs4delegreturn_thread */ 3162 VN_HOLD(vp); 3163 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3164 NFS4_DR_DISCARD), FALSE); 3165 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3166 goto recov_retry; 3167 } 3168 3169 if (needrecov) { 3170 bool_t abort; 3171 3172 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3173 "nfs4write: client got error %d, res.status %d" 3174 ", so start recovery", e.error, res.status)); 3175 3176 abort = nfs4_start_recovery(&e, 3177 VTOMI4(vp), vp, NULL, &wargs->stateid, 3178 NULL, OP_WRITE, NULL); 3179 if (!e.error) { 3180 e.error = geterrno4(res.status); 3181 (void) xdr_free(xdr_COMPOUND4res_clnt, 3182 (caddr_t)&res); 3183 } 3184 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3185 &recov_state, needrecov); 3186 if (abort == FALSE) 3187 goto recov_retry; 3188 return (e.error); 3189 } 3190 3191 if (res.status) { 3192 e.error = geterrno4(res.status); 3193 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3194 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3195 &recov_state, needrecov); 3196 return (e.error); 3197 } 3198 3199 
resop = &res.array[1]; /* write res */ 3200 wres = &resop->nfs_resop4_u.opwrite; 3201 3202 if ((int)wres->count > tsize) { 3203 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3204 3205 zcmn_err(getzoneid(), CE_WARN, 3206 "nfs4write: server wrote %u, requested was %u", 3207 (int)wres->count, tsize); 3208 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3209 &recov_state, needrecov); 3210 return (EIO); 3211 } 3212 if (wres->committed == UNSTABLE4) { 3213 *stab_comm = UNSTABLE4; 3214 if (wargs->stable == DATA_SYNC4 || 3215 wargs->stable == FILE_SYNC4) { 3216 (void) xdr_free(xdr_COMPOUND4res_clnt, 3217 (caddr_t)&res); 3218 zcmn_err(getzoneid(), CE_WARN, 3219 "nfs4write: server %s did not commit " 3220 "to stable storage", 3221 rp->r_server->sv_hostname); 3222 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3223 &recov_state, needrecov); 3224 return (EIO); 3225 } 3226 } 3227 3228 tsize = (int)wres->count; 3229 count -= tsize; 3230 base += tsize; 3231 offset += tsize; 3232 if (mi->mi_io_kstats) { 3233 mutex_enter(&mi->mi_lock); 3234 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3235 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3236 tsize; 3237 mutex_exit(&mi->mi_lock); 3238 } 3239 lwp_stat_update(LWP_STAT_OUBLK, 1); 3240 mutex_enter(&rp->r_statelock); 3241 if (rp->r_flags & R4HAVEVERF) { 3242 if (rp->r_writeverf != wres->writeverf) { 3243 nfs4_set_mod(vp); 3244 rp->r_writeverf = wres->writeverf; 3245 } 3246 } else { 3247 rp->r_writeverf = wres->writeverf; 3248 rp->r_flags |= R4HAVEVERF; 3249 } 3250 PURGE_ATTRCACHE4_LOCKED(rp); 3251 rp->r_flags |= R4WRITEMODIFIED; 3252 gethrestime(&rp->r_attr.va_mtime); 3253 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3254 mutex_exit(&rp->r_statelock); 3255 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3256 } while (count); 3257 3258 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, needrecov); 3259 3260 return (e.error); 3261 } 3262 3263 /* 3264 * Read from a file. Reads data in largest chunks our interface can handle. 
3265 */ 3266 static int 3267 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3268 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3269 { 3270 mntinfo4_t *mi; 3271 COMPOUND4args_clnt args; 3272 COMPOUND4res_clnt res; 3273 READ4args *rargs; 3274 nfs_argop4 argop[2]; 3275 int tsize; 3276 int doqueue; 3277 rnode4_t *rp; 3278 int data_len; 3279 bool_t is_eof; 3280 bool_t needrecov = FALSE; 3281 nfs4_recov_state_t recov_state; 3282 nfs4_stateid_types_t sid_types; 3283 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3284 3285 rp = VTOR4(vp); 3286 mi = VTOMI4(vp); 3287 doqueue = 1; 3288 3289 ASSERT(nfs_zone() == mi->mi_zone); 3290 3291 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3292 3293 args.array_len = 2; 3294 args.array = argop; 3295 3296 nfs4_init_stateid_types(&sid_types); 3297 3298 recov_state.rs_flags = 0; 3299 recov_state.rs_num_retry_despite_err = 0; 3300 3301 recov_retry: 3302 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3303 &recov_state, NULL); 3304 if (e.error) 3305 return (e.error); 3306 3307 /* putfh target fh */ 3308 argop[0].argop = OP_CPUTFH; 3309 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3310 3311 /* read */ 3312 argop[1].argop = OP_READ; 3313 rargs = &argop[1].nfs_argop4_u.opread; 3314 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3315 OP_READ, &sid_types, async); 3316 3317 do { 3318 if (mi->mi_io_kstats) { 3319 mutex_enter(&mi->mi_lock); 3320 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3321 mutex_exit(&mi->mi_lock); 3322 } 3323 3324 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3325 "nfs4read: %s call, rp %s", 3326 needrecov ? 
"recov" : "first", 3327 rnode4info(rp))); 3328 3329 if ((vp->v_flag & VNOCACHE) || 3330 (rp->r_flags & R4DIRECTIO) || 3331 (mi->mi_flags & MI4_DIRECTIO)) 3332 tsize = MIN(mi->mi_tsize, count); 3333 else 3334 tsize = MIN(mi->mi_curread, count); 3335 rargs->offset = (offset4)offset; 3336 rargs->count = (count4)tsize; 3337 rargs->res_data_val_alt = NULL; 3338 rargs->res_mblk = NULL; 3339 rargs->res_uiop = NULL; 3340 rargs->res_maxsize = 0; 3341 if (uiop) 3342 rargs->res_uiop = uiop; 3343 else 3344 rargs->res_data_val_alt = base; 3345 rargs->res_maxsize = tsize; 3346 3347 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3348 #ifdef DEBUG 3349 if (nfs4read_error_inject) { 3350 res.status = nfs4read_error_inject; 3351 nfs4read_error_inject = 0; 3352 } 3353 #endif 3354 3355 if (mi->mi_io_kstats) { 3356 mutex_enter(&mi->mi_lock); 3357 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3358 mutex_exit(&mi->mi_lock); 3359 } 3360 3361 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3362 if (e.error != 0 && !needrecov) { 3363 nfs4_end_fop(mi, vp, NULL, OH_READ, 3364 &recov_state, needrecov); 3365 return (e.error); 3366 } 3367 3368 /* 3369 * Do proper retry for OLD and BAD stateid errors outside 3370 * of the normal recovery framework. There are two differences 3371 * between async and sync reads. The first is that we allow 3372 * retry on BAD_STATEID for async reads, but not sync reads. 3373 * The second is that we mark the file dead for a failed 3374 * attempt with a special stateid for sync reads, but just 3375 * return EIO for async reads. 3376 * 3377 * If a sync read receives a BAD stateid error while using a 3378 * delegation stateid, retry using the open stateid (if it 3379 * exists). If it doesn't have an open stateid, reopen the 3380 * file first, then retry. 
3381 */ 3382 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3383 res.status == NFS4ERR_BAD_STATEID) && async) { 3384 nfs4_end_fop(mi, vp, NULL, OH_READ, 3385 &recov_state, needrecov); 3386 if (sid_types.cur_sid_type == SPEC_SID) { 3387 (void) xdr_free(xdr_COMPOUND4res_clnt, 3388 (caddr_t)&res); 3389 return (EIO); 3390 } 3391 nfs4_save_stateid(&rargs->stateid, &sid_types); 3392 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3393 goto recov_retry; 3394 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3395 !async && sid_types.cur_sid_type != SPEC_SID) { 3396 nfs4_save_stateid(&rargs->stateid, &sid_types); 3397 nfs4_end_fop(mi, vp, NULL, OH_READ, 3398 &recov_state, needrecov); 3399 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3400 goto recov_retry; 3401 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3402 sid_types.cur_sid_type == DEL_SID) { 3403 nfs4_save_stateid(&rargs->stateid, &sid_types); 3404 mutex_enter(&rp->r_statev4_lock); 3405 rp->r_deleg_return_pending = TRUE; 3406 mutex_exit(&rp->r_statev4_lock); 3407 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3408 nfs4_end_fop(mi, vp, NULL, OH_READ, 3409 &recov_state, needrecov); 3410 (void) xdr_free(xdr_COMPOUND4res_clnt, 3411 (caddr_t)&res); 3412 return (EIO); 3413 } 3414 nfs4_end_fop(mi, vp, NULL, OH_READ, 3415 &recov_state, needrecov); 3416 /* hold needed for nfs4delegreturn_thread */ 3417 VN_HOLD(vp); 3418 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3419 NFS4_DR_DISCARD), FALSE); 3420 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3421 goto recov_retry; 3422 } 3423 if (needrecov) { 3424 bool_t abort; 3425 3426 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3427 "nfs4read: initiating recovery\n")); 3428 3429 abort = nfs4_start_recovery(&e, 3430 mi, vp, NULL, &rargs->stateid, 3431 NULL, OP_READ, NULL); 3432 nfs4_end_fop(mi, vp, NULL, OH_READ, 3433 &recov_state, needrecov); 3434 /* 3435 * Do not retry if we got OLD_STATEID using a special 3436 * 
stateid. This avoids looping with a broken server. 3437 */ 3438 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3439 sid_types.cur_sid_type == SPEC_SID) 3440 abort = TRUE; 3441 3442 if (abort == FALSE) { 3443 /* 3444 * Need to retry all possible stateids in 3445 * case the recovery error wasn't stateid 3446 * related or the stateids have become 3447 * stale (server reboot). 3448 */ 3449 nfs4_init_stateid_types(&sid_types); 3450 (void) xdr_free(xdr_COMPOUND4res_clnt, 3451 (caddr_t)&res); 3452 goto recov_retry; 3453 } 3454 3455 if (!e.error) { 3456 e.error = geterrno4(res.status); 3457 (void) xdr_free(xdr_COMPOUND4res_clnt, 3458 (caddr_t)&res); 3459 } 3460 return (e.error); 3461 } 3462 3463 if (res.status) { 3464 e.error = geterrno4(res.status); 3465 nfs4_end_fop(mi, vp, NULL, OH_READ, 3466 &recov_state, needrecov); 3467 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3468 return (e.error); 3469 } 3470 3471 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3472 count -= data_len; 3473 if (base) 3474 base += data_len; 3475 offset += data_len; 3476 if (mi->mi_io_kstats) { 3477 mutex_enter(&mi->mi_lock); 3478 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3479 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3480 mutex_exit(&mi->mi_lock); 3481 } 3482 lwp_stat_update(LWP_STAT_INBLK, 1); 3483 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3484 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3485 3486 } while (count && !is_eof); 3487 3488 *residp = count; 3489 3490 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3491 3492 return (e.error); 3493 } 3494 3495 /* ARGSUSED */ 3496 static int 3497 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) 3498 { 3499 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3500 return (EIO); 3501 switch (cmd) { 3502 case _FIODIRECTIO: 3503 return (nfs4_directio(vp, (int)arg, cr)); 3504 default: 3505 return (ENOTTY); 3506 } 3507 } 3508 3509 static int 3510 nfs4_getattr(vnode_t *vp, struct 
vattr *vap, int flags, cred_t *cr) 3511 { 3512 int error; 3513 rnode4_t *rp = VTOR4(vp); 3514 3515 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3516 return (EIO); 3517 /* 3518 * If it has been specified that the return value will 3519 * just be used as a hint, and we are only being asked 3520 * for size, fsid or rdevid, then return the client's 3521 * notion of these values without checking to make sure 3522 * that the attribute cache is up to date. 3523 * The whole point is to avoid an over the wire GETATTR 3524 * call. 3525 */ 3526 if (flags & ATTR_HINT) { 3527 if (vap->va_mask == 3528 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3529 mutex_enter(&rp->r_statelock); 3530 if (vap->va_mask | AT_SIZE) 3531 vap->va_size = rp->r_size; 3532 if (vap->va_mask | AT_FSID) 3533 vap->va_fsid = rp->r_attr.va_fsid; 3534 if (vap->va_mask | AT_RDEV) 3535 vap->va_rdev = rp->r_attr.va_rdev; 3536 mutex_exit(&rp->r_statelock); 3537 return (0); 3538 } 3539 } 3540 3541 /* 3542 * Only need to flush pages if asking for the mtime 3543 * and if there any dirty pages or any outstanding 3544 * asynchronous (write) requests for this file. 
3545 */ 3546 if (vap->va_mask & AT_MTIME) { 3547 rp = VTOR4(vp); 3548 if (nfs4_has_pages(vp)) { 3549 mutex_enter(&rp->r_statev4_lock); 3550 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3551 mutex_exit(&rp->r_statev4_lock); 3552 if (rp->r_flags & R4DIRTY || 3553 rp->r_awcount > 0) { 3554 mutex_enter(&rp->r_statelock); 3555 rp->r_gcount++; 3556 mutex_exit(&rp->r_statelock); 3557 error = 3558 nfs4_putpage(vp, (u_offset_t)0, 3559 0, 0, cr); 3560 mutex_enter(&rp->r_statelock); 3561 if (error && (error == ENOSPC || 3562 error == EDQUOT)) { 3563 if (!rp->r_error) 3564 rp->r_error = error; 3565 } 3566 if (--rp->r_gcount == 0) 3567 cv_broadcast(&rp->r_cv); 3568 mutex_exit(&rp->r_statelock); 3569 } 3570 } else { 3571 mutex_exit(&rp->r_statev4_lock); 3572 } 3573 } 3574 } 3575 return (nfs4getattr(vp, vap, cr)); 3576 } 3577 3578 int 3579 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3580 { 3581 /* 3582 * If these are the only two bits cleared 3583 * on the server then return 0 (OK) else 3584 * return 1 (BAD). 3585 */ 3586 on_client &= ~(S_ISUID|S_ISGID); 3587 if (on_client == from_server) 3588 return (0); 3589 else 3590 return (1); 3591 } 3592 3593 /*ARGSUSED4*/ 3594 static int 3595 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3596 caller_context_t *ct) 3597 { 3598 if (vap->va_mask & AT_NOSET) 3599 return (EINVAL); 3600 3601 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3602 return (EIO); 3603 3604 /* 3605 * Don't call secpolicy_vnode_setattr, the client cannot 3606 * use its cached attributes to make security decisions 3607 * as the server may be faking mode bits or mapping uid/gid. 3608 * Always just let the server to the checking. 3609 * If we provide the ability to remove basic priviledges 3610 * to setattr (e.g. basic without chmod) then we will 3611 * need to add a check here before calling the server. 
3612 */ 3613 3614 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3615 } 3616 3617 /* 3618 * To replace the "guarded" version 3 setattr, we use two types of compound 3619 * setattr requests: 3620 * 1. The "normal" setattr, used when the size of the file isn't being 3621 * changed - { Putfh <fh>; Setattr; Getattr }/ 3622 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3623 * with only ctime as the argument. If the server ctime differs from 3624 * what is cached on the client, the verify will fail, but we would 3625 * already have the ctime from the preceding getattr, so just set it 3626 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3627 * Setattr; Getattr }. 3628 * 3629 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3630 * this setattr and NULL if they are not. 3631 */ 3632 static int 3633 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3634 vsecattr_t *vsap) 3635 { 3636 COMPOUND4args_clnt args; 3637 COMPOUND4res_clnt res, *resp = NULL; 3638 nfs4_ga_res_t *garp = NULL; 3639 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3640 nfs_argop4 argop[5]; 3641 int verify_argop = -1; 3642 int setattr_argop = 1; 3643 nfs_resop4 *resop; 3644 vattr_t va; 3645 rnode4_t *rp; 3646 int doqueue = 1; 3647 uint_t mask = vap->va_mask; 3648 mode_t omode; 3649 vsecattr_t *vsp; 3650 timestruc_t ctime; 3651 bool_t needrecov = FALSE; 3652 nfs4_recov_state_t recov_state; 3653 nfs4_stateid_types_t sid_types; 3654 stateid4 stateid; 3655 hrtime_t t; 3656 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3657 servinfo4_t *svp; 3658 bitmap4 supp_attrs; 3659 3660 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3661 rp = VTOR4(vp); 3662 nfs4_init_stateid_types(&sid_types); 3663 3664 /* 3665 * Only need to flush pages if there are any pages and 3666 * if the file is marked as dirty in some fashion. 
The 3667 * file must be flushed so that we can accurately 3668 * determine the size of the file and the cached data 3669 * after the SETATTR returns. A file is considered to 3670 * be dirty if it is either marked with R4DIRTY, has 3671 * outstanding i/o's active, or is mmap'd. In this 3672 * last case, we can't tell whether there are dirty 3673 * pages, so we flush just to be sure. 3674 */ 3675 if (nfs4_has_pages(vp) && 3676 ((rp->r_flags & R4DIRTY) || 3677 rp->r_count > 0 || 3678 rp->r_mapcnt > 0)) { 3679 ASSERT(vp->v_type != VCHR); 3680 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr); 3681 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3682 mutex_enter(&rp->r_statelock); 3683 if (!rp->r_error) 3684 rp->r_error = e.error; 3685 mutex_exit(&rp->r_statelock); 3686 } 3687 } 3688 3689 if (mask & AT_SIZE) { 3690 /* 3691 * Verification setattr compound for non-deleg AT_SIZE: 3692 * { Putfh; Getattr; Verify; Setattr; Getattr } 3693 * Set ctime local here (outside the do_again label) 3694 * so that subsequent retries (after failed VERIFY) 3695 * will use ctime from GETATTR results (from failed 3696 * verify compound) as VERIFY arg. 3697 * If file has delegation, then VERIFY(time_metadata) 3698 * is of little added value, so don't bother. 
3699 */ 3700 mutex_enter(&rp->r_statev4_lock); 3701 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3702 rp->r_deleg_return_pending) { 3703 numops = 5; 3704 ctime = rp->r_attr.va_ctime; 3705 } 3706 mutex_exit(&rp->r_statev4_lock); 3707 } 3708 3709 recov_state.rs_flags = 0; 3710 recov_state.rs_num_retry_despite_err = 0; 3711 3712 args.ctag = TAG_SETATTR; 3713 do_again: 3714 recov_retry: 3715 setattr_argop = numops - 2; 3716 3717 args.array = argop; 3718 args.array_len = numops; 3719 3720 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3721 if (e.error) 3722 return (e.error); 3723 3724 3725 /* putfh target fh */ 3726 argop[0].argop = OP_CPUTFH; 3727 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3728 3729 if (numops == 5) { 3730 /* 3731 * We only care about the ctime, but need to get mtime 3732 * and size for proper cache update. 3733 */ 3734 /* getattr */ 3735 argop[1].argop = OP_GETATTR; 3736 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3737 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3738 3739 /* verify - set later in loop */ 3740 verify_argop = 2; 3741 } 3742 3743 /* setattr */ 3744 svp = rp->r_server; 3745 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3746 supp_attrs = svp->sv_supp_attrs; 3747 nfs_rw_exit(&svp->sv_lock); 3748 3749 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3750 supp_attrs, &e.error, &sid_types); 3751 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3752 if (e.error) { 3753 /* req time field(s) overflow - return immediately */ 3754 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3755 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3756 opsetattr.obj_attributes); 3757 return (e.error); 3758 } 3759 omode = rp->r_attr.va_mode; 3760 3761 /* getattr */ 3762 argop[numops-1].argop = OP_GETATTR; 3763 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3764 /* 3765 * If we are setting the ACL (indicated only by vsap != NULL), request 3766 * the ACL in this getattr. The ACL returned from this getattr will be 3767 * used in updating the ACL cache. 3768 */ 3769 if (vsap != NULL) 3770 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3771 FATTR4_ACL_MASK; 3772 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3773 3774 /* 3775 * setattr iterates if the object size is set and the cached ctime 3776 * does not match the file ctime. In that case, verify the ctime first. 3777 */ 3778 3779 do { 3780 if (verify_argop != -1) { 3781 /* 3782 * Verify that the ctime match before doing setattr. 3783 */ 3784 va.va_mask = AT_CTIME; 3785 va.va_ctime = ctime; 3786 svp = rp->r_server; 3787 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3788 supp_attrs = svp->sv_supp_attrs; 3789 nfs_rw_exit(&svp->sv_lock); 3790 e.error = nfs4args_verify(&argop[verify_argop], &va, 3791 OP_VERIFY, supp_attrs); 3792 if (e.error) { 3793 /* req time field(s) overflow - return */ 3794 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3795 needrecov); 3796 break; 3797 } 3798 } 3799 3800 doqueue = 1; 3801 3802 t = gethrtime(); 3803 3804 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3805 3806 /* 3807 * Purge the access cache and ACL cache if changing either the 3808 * owner of the file, the group owner, or the mode. These may 3809 * change the access permissions of the file, so purge old 3810 * information and start over again. 
3811 */ 3812 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3813 (void) nfs4_access_purge_rp(rp); 3814 if (rp->r_secattr != NULL) { 3815 mutex_enter(&rp->r_statelock); 3816 vsp = rp->r_secattr; 3817 rp->r_secattr = NULL; 3818 mutex_exit(&rp->r_statelock); 3819 if (vsp != NULL) 3820 nfs4_acl_free_cache(vsp); 3821 } 3822 } 3823 3824 /* 3825 * If res.array_len == numops, then everything succeeded, 3826 * except for possibly the final getattr. If only the 3827 * last getattr failed, give up, and don't try recovery. 3828 */ 3829 if (res.array_len == numops) { 3830 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3831 needrecov); 3832 if (! e.error) 3833 resp = &res; 3834 break; 3835 } 3836 3837 /* 3838 * if either rpc call failed or completely succeeded - done 3839 */ 3840 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3841 if (e.error) { 3842 PURGE_ATTRCACHE4(vp); 3843 if (!needrecov) { 3844 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3845 needrecov); 3846 break; 3847 } 3848 } 3849 3850 /* 3851 * Do proper retry for OLD_STATEID outside of the normal 3852 * recovery framework. 3853 */ 3854 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3855 sid_types.cur_sid_type != SPEC_SID && 3856 sid_types.cur_sid_type != NO_SID) { 3857 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3858 needrecov); 3859 nfs4_save_stateid(&stateid, &sid_types); 3860 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3861 opsetattr.obj_attributes); 3862 if (verify_argop != -1) { 3863 nfs4args_verify_free(&argop[verify_argop]); 3864 verify_argop = -1; 3865 } 3866 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3867 goto recov_retry; 3868 } 3869 3870 if (needrecov) { 3871 bool_t abort; 3872 3873 abort = nfs4_start_recovery(&e, 3874 VTOMI4(vp), vp, NULL, NULL, NULL, 3875 OP_SETATTR, NULL); 3876 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3877 needrecov); 3878 /* 3879 * Do not retry if we failed with OLD_STATEID using 3880 * a special stateid. 
This is done to avoid looping 3881 * with a broken server. 3882 */ 3883 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3884 (sid_types.cur_sid_type == SPEC_SID || 3885 sid_types.cur_sid_type == NO_SID)) 3886 abort = TRUE; 3887 if (!e.error) { 3888 if (res.status == NFS4ERR_BADOWNER) 3889 nfs4_log_badowner(VTOMI4(vp), 3890 OP_SETATTR); 3891 3892 e.error = geterrno4(res.status); 3893 (void) xdr_free(xdr_COMPOUND4res_clnt, 3894 (caddr_t)&res); 3895 } 3896 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3897 opsetattr.obj_attributes); 3898 if (verify_argop != -1) { 3899 nfs4args_verify_free(&argop[verify_argop]); 3900 verify_argop = -1; 3901 } 3902 if (abort == FALSE) { 3903 /* 3904 * Need to retry all possible stateids in 3905 * case the recovery error wasn't stateid 3906 * related or the stateids have become 3907 * stale (server reboot). 3908 */ 3909 nfs4_init_stateid_types(&sid_types); 3910 goto recov_retry; 3911 } 3912 return (e.error); 3913 } 3914 3915 /* 3916 * Need to call nfs4_end_op before nfs4getattr to 3917 * avoid potential nfs4_start_op deadlock. See RFE 3918 * 4777612. Calls to nfs4_invalidate_pages() and 3919 * nfs4_purge_stale_fh() might also generate over the 3920 * wire calls which my cause nfs4_start_op() deadlock. 3921 */ 3922 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3923 3924 /* 3925 * Check to update lease. 3926 */ 3927 resp = &res; 3928 if (res.status == NFS4_OK) { 3929 break; 3930 } 3931 3932 /* 3933 * Check if verify failed to see if try again 3934 */ 3935 if ((verify_argop == -1) || (res.array_len != 3)) { 3936 /* 3937 * can't continue... 3938 */ 3939 if (res.status == NFS4ERR_BADOWNER) 3940 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 3941 3942 e.error = geterrno4(res.status); 3943 } else { 3944 /* 3945 * When the verify request fails, the client ctime is 3946 * not in sync with the server. 
This is the same as 3947 * the version 3 "not synchronized" error, and we 3948 * handle it in a similar manner (XXX do we need to???). 3949 * Use the ctime returned in the first getattr for 3950 * the input to the next verify. 3951 * If we couldn't get the attributes, then we give up 3952 * because we can't complete the operation as required. 3953 */ 3954 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 3955 } 3956 if (e.error) { 3957 PURGE_ATTRCACHE4(vp); 3958 nfs4_purge_stale_fh(e.error, vp, cr); 3959 } else { 3960 /* 3961 * retry with a new verify value 3962 */ 3963 ctime = garp->n4g_va.va_ctime; 3964 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3965 resp = NULL; 3966 } 3967 if (!e.error) { 3968 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3969 opsetattr.obj_attributes); 3970 if (verify_argop != -1) { 3971 nfs4args_verify_free(&argop[verify_argop]); 3972 verify_argop = -1; 3973 } 3974 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3975 goto do_again; 3976 } 3977 } while (!e.error); 3978 3979 if (e.error) { 3980 /* 3981 * If we are here, rfs4call has an irrecoverable error - return 3982 */ 3983 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3984 opsetattr.obj_attributes); 3985 if (verify_argop != -1) { 3986 nfs4args_verify_free(&argop[verify_argop]); 3987 verify_argop = -1; 3988 } 3989 if (resp) 3990 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 3991 return (e.error); 3992 } 3993 3994 3995 3996 /* 3997 * If changing the size of the file, invalidate 3998 * any local cached data which is no longer part 3999 * of the file. We also possibly invalidate the 4000 * last page in the file. We could use 4001 * pvn_vpzero(), but this would mark the page as 4002 * modified and require it to be written back to 4003 * the server for no particularly good reason. 4004 * This way, if we access it, then we bring it 4005 * back in. A read should be cheaper than a 4006 * write. 
4007 */ 4008 if (mask & AT_SIZE) { 4009 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4010 } 4011 4012 /* either no error or one of the postop getattr failed */ 4013 4014 /* 4015 * XXX Perform a simplified version of wcc checking. Instead of 4016 * have another getattr to get pre-op, just purge cache if 4017 * any of the ops prior to and including the getattr failed. 4018 * If the getattr succeeded then update the attrcache accordingly. 4019 */ 4020 4021 garp = NULL; 4022 if (res.status == NFS4_OK) { 4023 /* 4024 * Last getattr 4025 */ 4026 resop = &res.array[numops - 1]; 4027 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4028 } 4029 /* 4030 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4031 * rather than filling it. See the function itself for details. 4032 */ 4033 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4034 if (garp != NULL) { 4035 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4036 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4037 vs_ace4_destroy(&garp->n4g_vsa); 4038 } else { 4039 if (vsap != NULL) { 4040 /* 4041 * The ACL was supposed to be set and to be 4042 * returned in the last getattr of this 4043 * compound, but for some reason the getattr 4044 * result doesn't contain the ACL. In this 4045 * case, purge the ACL cache. 4046 */ 4047 if (rp->r_secattr != NULL) { 4048 mutex_enter(&rp->r_statelock); 4049 vsp = rp->r_secattr; 4050 rp->r_secattr = NULL; 4051 mutex_exit(&rp->r_statelock); 4052 if (vsp != NULL) 4053 nfs4_acl_free_cache(vsp); 4054 } 4055 } 4056 } 4057 } 4058 4059 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4060 /* 4061 * Set the size, rather than relying on getting it updated 4062 * via a GETATTR. With delegations the client tries to 4063 * suppress GETATTR calls. 
4064 */ 4065 mutex_enter(&rp->r_statelock); 4066 rp->r_size = vap->va_size; 4067 mutex_exit(&rp->r_statelock); 4068 } 4069 4070 /* 4071 * Can free up request args and res 4072 */ 4073 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4074 opsetattr.obj_attributes); 4075 if (verify_argop != -1) { 4076 nfs4args_verify_free(&argop[verify_argop]); 4077 verify_argop = -1; 4078 } 4079 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4080 4081 /* 4082 * Some servers will change the mode to clear the setuid 4083 * and setgid bits when changing the uid or gid. The 4084 * client needs to compensate appropriately. 4085 */ 4086 if (mask & (AT_UID | AT_GID)) { 4087 int terror, do_setattr; 4088 4089 do_setattr = 0; 4090 va.va_mask = AT_MODE; 4091 terror = nfs4getattr(vp, &va, cr); 4092 if (!terror && 4093 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4094 (!(mask & AT_MODE) && va.va_mode != omode))) { 4095 va.va_mask = AT_MODE; 4096 if (mask & AT_MODE) { 4097 /* 4098 * We asked the mode to be changed and what 4099 * we just got from the server in getattr is 4100 * not what we wanted it to be, so set it now. 4101 */ 4102 va.va_mode = vap->va_mode; 4103 do_setattr = 1; 4104 } else { 4105 /* 4106 * We did not ask the mode to be changed, 4107 * Check to see that the server just cleared 4108 * I_SUID and I_GUID from it. If not then 4109 * set mode to omode with UID/GID cleared. 
4110 */ 4111 if (nfs4_compare_modes(va.va_mode, omode)) { 4112 omode &= ~(S_ISUID|S_ISGID); 4113 va.va_mode = omode; 4114 do_setattr = 1; 4115 } 4116 } 4117 4118 if (do_setattr) 4119 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4120 } 4121 } 4122 4123 return (e.error); 4124 } 4125 4126 /* ARGSUSED */ 4127 static int 4128 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr) 4129 { 4130 COMPOUND4args_clnt args; 4131 COMPOUND4res_clnt res; 4132 int doqueue; 4133 uint32_t acc, resacc, argacc; 4134 rnode4_t *rp; 4135 cred_t *cred, *ncr, *ncrfree = NULL; 4136 nfs4_access_type_t cacc; 4137 int num_ops; 4138 nfs_argop4 argop[3]; 4139 nfs_resop4 *resop; 4140 bool_t needrecov = FALSE, do_getattr; 4141 nfs4_recov_state_t recov_state; 4142 int rpc_error; 4143 hrtime_t t; 4144 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4145 mntinfo4_t *mi = VTOMI4(vp); 4146 4147 if (nfs_zone() != mi->mi_zone) 4148 return (EIO); 4149 4150 acc = 0; 4151 if (mode & VREAD) 4152 acc |= ACCESS4_READ; 4153 if (mode & VWRITE) { 4154 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4155 return (EROFS); 4156 if (vp->v_type == VDIR) 4157 acc |= ACCESS4_DELETE; 4158 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4159 } 4160 if (mode & VEXEC) { 4161 if (vp->v_type == VDIR) 4162 acc |= ACCESS4_LOOKUP; 4163 else 4164 acc |= ACCESS4_EXECUTE; 4165 } 4166 4167 if (VTOR4(vp)->r_acache != NULL) { 4168 e.error = nfs4_validate_caches(vp, cr); 4169 if (e.error) 4170 return (e.error); 4171 } 4172 4173 rp = VTOR4(vp); 4174 if (vp->v_type == VDIR) { 4175 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4176 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4177 } else { 4178 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4179 ACCESS4_EXECUTE; 4180 } 4181 recov_state.rs_flags = 0; 4182 recov_state.rs_num_retry_despite_err = 0; 4183 4184 cred = cr; 4185 /* 4186 * ncr and ncrfree both initially 4187 * point to the memory area returned 4188 * by crnetadjust(); 4189 * ncrfree not NULL when exiting means 4190 
* that we need to release it 4191 */ 4192 ncr = crnetadjust(cred); 4193 ncrfree = ncr; 4194 4195 tryagain: 4196 cacc = nfs4_access_check(rp, acc, cred); 4197 if (cacc == NFS4_ACCESS_ALLOWED) { 4198 if (ncrfree != NULL) 4199 crfree(ncrfree); 4200 return (0); 4201 } 4202 if (cacc == NFS4_ACCESS_DENIED) { 4203 /* 4204 * If the cred can be adjusted, try again 4205 * with the new cred. 4206 */ 4207 if (ncr != NULL) { 4208 cred = ncr; 4209 ncr = NULL; 4210 goto tryagain; 4211 } 4212 if (ncrfree != NULL) 4213 crfree(ncrfree); 4214 return (EACCES); 4215 } 4216 4217 recov_retry: 4218 /* 4219 * Don't take with r_statev4_lock here. r_deleg_type could 4220 * change as soon as lock is released. Since it is an int, 4221 * there is no atomicity issue. 4222 */ 4223 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4224 num_ops = do_getattr ? 3 : 2; 4225 4226 args.ctag = TAG_ACCESS; 4227 4228 args.array_len = num_ops; 4229 args.array = argop; 4230 4231 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4232 &recov_state, NULL)) { 4233 if (ncrfree != NULL) 4234 crfree(ncrfree); 4235 return (e.error); 4236 } 4237 4238 /* putfh target fh */ 4239 argop[0].argop = OP_CPUTFH; 4240 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4241 4242 /* access */ 4243 argop[1].argop = OP_ACCESS; 4244 argop[1].nfs_argop4_u.opaccess.access = argacc; 4245 4246 /* getattr */ 4247 if (do_getattr) { 4248 argop[2].argop = OP_GETATTR; 4249 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4250 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4251 } 4252 4253 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4254 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4255 rnode4info(VTOR4(vp)))); 4256 4257 doqueue = 1; 4258 t = gethrtime(); 4259 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4260 rpc_error = e.error; 4261 4262 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4263 if (needrecov) { 4264 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4265 "nfs4_access: initiating recovery\n")); 4266 4267 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4268 NULL, OP_ACCESS, NULL) == FALSE) { 4269 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4270 &recov_state, needrecov); 4271 if (!e.error) 4272 (void) xdr_free(xdr_COMPOUND4res_clnt, 4273 (caddr_t)&res); 4274 goto recov_retry; 4275 } 4276 } 4277 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4278 4279 if (e.error) 4280 goto out; 4281 4282 if (res.status) { 4283 e.error = geterrno4(res.status); 4284 /* 4285 * This might generate over the wire calls throught 4286 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4287 * here to avoid a deadlock. 4288 */ 4289 nfs4_purge_stale_fh(e.error, vp, cr); 4290 goto out; 4291 } 4292 resop = &res.array[1]; /* access res */ 4293 4294 resacc = resop->nfs_resop4_u.opaccess.access; 4295 4296 if (do_getattr) { 4297 resop++; /* getattr res */ 4298 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4299 t, cr, FALSE, NULL); 4300 } 4301 4302 if (!e.error) { 4303 nfs4_access_cache(rp, argacc, resacc, cred); 4304 /* 4305 * we just cached results with cred; if cred is the 4306 * adjusted credentials from crnetadjust, we do not want 4307 * to release them before exiting: hence setting ncrfree 4308 * to NULL 4309 */ 4310 if (cred != cr) 4311 ncrfree = NULL; 4312 /* XXX check the supported bits too? */ 4313 if ((acc & resacc) != acc) { 4314 /* 4315 * The following code implements the semantic 4316 * that a setuid root program has *at least* the 4317 * permissions of the user that is running the 4318 * program. 
See rfs3call() for more portions 4319 * of the implementation of this functionality. 4320 */ 4321 /* XXX-LP */ 4322 if (ncr != NULL) { 4323 (void) xdr_free(xdr_COMPOUND4res_clnt, 4324 (caddr_t)&res); 4325 cred = ncr; 4326 ncr = NULL; 4327 goto tryagain; 4328 } 4329 e.error = EACCES; 4330 } 4331 } 4332 4333 out: 4334 if (!rpc_error) 4335 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4336 4337 if (ncrfree != NULL) 4338 crfree(ncrfree); 4339 4340 return (e.error); 4341 } 4342 4343 static int 4344 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr) 4345 { 4346 COMPOUND4args_clnt args; 4347 COMPOUND4res_clnt res; 4348 int doqueue; 4349 rnode4_t *rp; 4350 nfs_argop4 argop[3]; 4351 nfs_resop4 *resop; 4352 READLINK4res *lr_res; 4353 nfs4_ga_res_t *garp; 4354 uint_t len; 4355 char *linkdata; 4356 bool_t needrecov = FALSE; 4357 nfs4_recov_state_t recov_state; 4358 hrtime_t t; 4359 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4360 4361 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4362 return (EIO); 4363 /* 4364 * Can't readlink anything other than a symbolic link. 4365 */ 4366 if (vp->v_type != VLNK) 4367 return (EINVAL); 4368 4369 rp = VTOR4(vp); 4370 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4371 e.error = nfs4_validate_caches(vp, cr); 4372 if (e.error) 4373 return (e.error); 4374 mutex_enter(&rp->r_statelock); 4375 if (rp->r_symlink.contents != NULL) { 4376 e.error = uiomove(rp->r_symlink.contents, 4377 rp->r_symlink.len, UIO_READ, uiop); 4378 mutex_exit(&rp->r_statelock); 4379 return (e.error); 4380 } 4381 mutex_exit(&rp->r_statelock); 4382 } 4383 recov_state.rs_flags = 0; 4384 recov_state.rs_num_retry_despite_err = 0; 4385 4386 recov_retry: 4387 args.array_len = 3; 4388 args.array = argop; 4389 args.ctag = TAG_READLINK; 4390 4391 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4392 if (e.error) { 4393 return (e.error); 4394 } 4395 4396 /* 0. 
putfh symlink fh */ 4397 argop[0].argop = OP_CPUTFH; 4398 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4399 4400 /* 1. readlink */ 4401 argop[1].argop = OP_READLINK; 4402 4403 /* 2. getattr */ 4404 argop[2].argop = OP_GETATTR; 4405 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4406 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4407 4408 doqueue = 1; 4409 4410 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4411 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4412 rnode4info(VTOR4(vp)))); 4413 4414 t = gethrtime(); 4415 4416 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4417 4418 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4419 if (needrecov) { 4420 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4421 "nfs4_readlink: initiating recovery\n")); 4422 4423 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4424 NULL, OP_READLINK, NULL) == FALSE) { 4425 if (!e.error) 4426 (void) xdr_free(xdr_COMPOUND4res_clnt, 4427 (caddr_t)&res); 4428 4429 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4430 needrecov); 4431 goto recov_retry; 4432 } 4433 } 4434 4435 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4436 4437 if (e.error) 4438 return (e.error); 4439 4440 /* 4441 * There is an path in the code below which calls 4442 * nfs4_purge_stale_fh(), which may generate otw calls through 4443 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4444 * here to avoid nfs4_start_op() deadlock. 
4445 */ 4446 4447 if (res.status && (res.array_len < args.array_len)) { 4448 /* 4449 * either Putfh or Link failed 4450 */ 4451 e.error = geterrno4(res.status); 4452 nfs4_purge_stale_fh(e.error, vp, cr); 4453 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4454 return (e.error); 4455 } 4456 4457 resop = &res.array[1]; /* readlink res */ 4458 lr_res = &resop->nfs_resop4_u.opreadlink; 4459 4460 /* 4461 * treat symlink names as data 4462 */ 4463 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4464 if (linkdata != NULL) { 4465 int uio_len = len - 1; 4466 /* len includes null byte, which we won't uiomove */ 4467 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4468 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4469 mutex_enter(&rp->r_statelock); 4470 if (rp->r_symlink.contents == NULL) { 4471 rp->r_symlink.contents = linkdata; 4472 rp->r_symlink.len = uio_len; 4473 rp->r_symlink.size = len; 4474 mutex_exit(&rp->r_statelock); 4475 } else { 4476 mutex_exit(&rp->r_statelock); 4477 kmem_free(linkdata, len); 4478 } 4479 } else { 4480 kmem_free(linkdata, len); 4481 } 4482 } 4483 if (res.status == NFS4_OK) { 4484 resop++; /* getattr res */ 4485 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4486 } 4487 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4488 4489 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4490 4491 /* 4492 * The over the wire error for attempting to readlink something 4493 * other than a symbolic link is ENXIO. However, we need to 4494 * return EINVAL instead of ENXIO, so we map it here. 4495 */ 4496 return (e.error == ENXIO ? EINVAL : e.error); 4497 } 4498 4499 /* 4500 * Flush local dirty pages to stable storage on the server. 4501 * 4502 * If FNODSYNC is specified, then there is nothing to do because 4503 * metadata changes are not cached on the client before being 4504 * sent to the server. 
4505 */ 4506 static int 4507 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr) 4508 { 4509 int error; 4510 4511 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4512 return (0); 4513 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4514 return (EIO); 4515 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4516 if (!error) 4517 error = VTOR4(vp)->r_error; 4518 return (error); 4519 } 4520 4521 /* 4522 * Weirdness: if the file was removed or the target of a rename 4523 * operation while it was open, it got renamed instead. Here we 4524 * remove the renamed file. 4525 */ 4526 static void 4527 nfs4_inactive(vnode_t *vp, cred_t *cr) 4528 { 4529 rnode4_t *rp; 4530 4531 ASSERT(vp != DNLC_NO_VNODE); 4532 4533 rp = VTOR4(vp); 4534 4535 if (IS_SHADOW(vp, rp)) { 4536 sv_inactive(vp); 4537 return; 4538 } 4539 4540 /* 4541 * If this is coming from the wrong zone, we let someone in the right 4542 * zone take care of it asynchronously. We can get here due to 4543 * VN_RELE() being called from pageout() or fsflush(). This call may 4544 * potentially turn into an expensive no-op if, for instance, v_count 4545 * gets incremented in the meantime, but it's still correct. 4546 */ 4547 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4548 nfs4_async_inactive(vp, cr); 4549 return; 4550 } 4551 4552 /* 4553 * Some of the cleanup steps might require over-the-wire 4554 * operations. Since VOP_INACTIVE can get called as a result of 4555 * other over-the-wire operations (e.g., an attribute cache update 4556 * can lead to a DNLC purge), doing those steps now would lead to a 4557 * nested call to the recovery framework, which can deadlock. So 4558 * do any over-the-wire cleanups asynchronously, in a separate 4559 * thread. 
4560 */ 4561 4562 mutex_enter(&rp->r_os_lock); 4563 mutex_enter(&rp->r_statelock); 4564 mutex_enter(&rp->r_statev4_lock); 4565 4566 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4567 mutex_exit(&rp->r_statev4_lock); 4568 mutex_exit(&rp->r_statelock); 4569 mutex_exit(&rp->r_os_lock); 4570 nfs4_async_inactive(vp, cr); 4571 return; 4572 } 4573 4574 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4575 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4576 mutex_exit(&rp->r_statev4_lock); 4577 mutex_exit(&rp->r_statelock); 4578 mutex_exit(&rp->r_os_lock); 4579 nfs4_async_inactive(vp, cr); 4580 return; 4581 } 4582 4583 if (rp->r_unldvp != NULL) { 4584 mutex_exit(&rp->r_statev4_lock); 4585 mutex_exit(&rp->r_statelock); 4586 mutex_exit(&rp->r_os_lock); 4587 nfs4_async_inactive(vp, cr); 4588 return; 4589 } 4590 mutex_exit(&rp->r_statev4_lock); 4591 mutex_exit(&rp->r_statelock); 4592 mutex_exit(&rp->r_os_lock); 4593 4594 rp4_addfree(rp, cr); 4595 } 4596 4597 /* 4598 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4599 * various bits of state. The caller must not refer to vp after this call. 
4600 */ 4601 4602 void 4603 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4604 { 4605 rnode4_t *rp = VTOR4(vp); 4606 nfs4_recov_state_t recov_state; 4607 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4608 vnode_t *unldvp; 4609 char *unlname; 4610 cred_t *unlcred; 4611 COMPOUND4args_clnt args; 4612 COMPOUND4res_clnt res, *resp; 4613 nfs_argop4 argop[2]; 4614 int doqueue; 4615 #ifdef DEBUG 4616 char *name; 4617 #endif 4618 4619 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4620 ASSERT(!IS_SHADOW(vp, rp)); 4621 4622 #ifdef DEBUG 4623 name = fn_name(VTOSV(vp)->sv_name); 4624 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4625 "release vnode %s", name)); 4626 kmem_free(name, MAXNAMELEN); 4627 #endif 4628 4629 if (vp->v_type == VREG) { 4630 bool_t recov_failed = FALSE; 4631 4632 e.error = nfs4close_all(vp, cr); 4633 if (e.error) { 4634 /* Check to see if recovery failed */ 4635 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4636 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4637 recov_failed = TRUE; 4638 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4639 if (!recov_failed) { 4640 mutex_enter(&rp->r_statelock); 4641 if (rp->r_flags & R4RECOVERR) 4642 recov_failed = TRUE; 4643 mutex_exit(&rp->r_statelock); 4644 } 4645 if (recov_failed) { 4646 NFS4_DEBUG(nfs4_client_recov_debug, 4647 (CE_NOTE, "nfs4_inactive_otw: " 4648 "close failed (recovery failure)")); 4649 } 4650 } 4651 } 4652 4653 redo: 4654 if (rp->r_unldvp == NULL) { 4655 rp4_addfree(rp, cr); 4656 return; 4657 } 4658 4659 /* 4660 * Save the vnode pointer for the directory where the 4661 * unlinked-open file got renamed, then set it to NULL 4662 * to prevent another thread from getting here before 4663 * we're done with the remove. While we have the 4664 * statelock, make local copies of the pertinent rnode 4665 * fields. If we weren't to do this in an atomic way, the 4666 * the unl* fields could become inconsistent with respect 4667 * to each other due to a race condition between this 4668 * code and nfs_remove(). 
See bug report 1034328. 4669 */ 4670 mutex_enter(&rp->r_statelock); 4671 if (rp->r_unldvp == NULL) { 4672 mutex_exit(&rp->r_statelock); 4673 rp4_addfree(rp, cr); 4674 return; 4675 } 4676 4677 unldvp = rp->r_unldvp; 4678 rp->r_unldvp = NULL; 4679 unlname = rp->r_unlname; 4680 rp->r_unlname = NULL; 4681 unlcred = rp->r_unlcred; 4682 rp->r_unlcred = NULL; 4683 mutex_exit(&rp->r_statelock); 4684 4685 /* 4686 * If there are any dirty pages left, then flush 4687 * them. This is unfortunate because they just 4688 * may get thrown away during the remove operation, 4689 * but we have to do this for correctness. 4690 */ 4691 if (nfs4_has_pages(vp) && 4692 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4693 ASSERT(vp->v_type != VCHR); 4694 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr); 4695 if (e.error) { 4696 mutex_enter(&rp->r_statelock); 4697 if (!rp->r_error) 4698 rp->r_error = e.error; 4699 mutex_exit(&rp->r_statelock); 4700 } 4701 } 4702 4703 recov_state.rs_flags = 0; 4704 recov_state.rs_num_retry_despite_err = 0; 4705 recov_retry_remove: 4706 /* 4707 * Do the remove operation on the renamed file 4708 */ 4709 args.ctag = TAG_INACTIVE; 4710 4711 /* 4712 * Remove ops: putfh dir; remove 4713 */ 4714 args.array_len = 2; 4715 args.array = argop; 4716 4717 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4718 if (e.error) { 4719 kmem_free(unlname, MAXNAMELEN); 4720 crfree(unlcred); 4721 VN_RELE(unldvp); 4722 /* 4723 * Try again; this time around r_unldvp will be NULL, so we'll 4724 * just call rp4_addfree() and return. 4725 */ 4726 goto redo; 4727 } 4728 4729 /* putfh directory */ 4730 argop[0].argop = OP_CPUTFH; 4731 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4732 4733 /* remove */ 4734 argop[1].argop = OP_CREMOVE; 4735 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4736 4737 doqueue = 1; 4738 resp = &res; 4739 4740 #if 0 /* notyet */ 4741 /* 4742 * Can't do this yet. 
We may be being called from 4743 * dnlc_purge_XXX while that routine is holding a 4744 * mutex lock to the nc_rele list. The calls to 4745 * nfs3_cache_wcc_data may result in calls to 4746 * dnlc_purge_XXX. This will result in a deadlock. 4747 */ 4748 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4749 if (e.error) { 4750 PURGE_ATTRCACHE4(unldvp); 4751 resp = NULL; 4752 } else if (res.status) { 4753 e.error = geterrno4(res.status); 4754 PURGE_ATTRCACHE4(unldvp); 4755 /* 4756 * This code is inactive right now 4757 * but if made active there should 4758 * be a nfs4_end_op() call before 4759 * nfs4_purge_stale_fh to avoid start_op() 4760 * deadlock. See BugId: 4948726 4761 */ 4762 nfs4_purge_stale_fh(error, unldvp, cr); 4763 } else { 4764 nfs_resop4 *resop; 4765 REMOVE4res *rm_res; 4766 4767 resop = &res.array[1]; 4768 rm_res = &resop->nfs_resop4_u.opremove; 4769 /* 4770 * Update directory cache attribute, 4771 * readdir and dnlc caches. 4772 */ 4773 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4774 } 4775 #else 4776 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4777 4778 PURGE_ATTRCACHE4(unldvp); 4779 #endif 4780 4781 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4782 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4783 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4784 if (!e.error) 4785 (void) xdr_free(xdr_COMPOUND4res_clnt, 4786 (caddr_t)&res); 4787 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4788 &recov_state, TRUE); 4789 goto recov_retry_remove; 4790 } 4791 } 4792 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4793 4794 /* 4795 * Release stuff held for the remove 4796 */ 4797 VN_RELE(unldvp); 4798 if (!e.error && resp) 4799 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4800 4801 kmem_free(unlname, MAXNAMELEN); 4802 crfree(unlcred); 4803 goto redo; 4804 } 4805 4806 /* 4807 * Remote file system operations having to do with directory manipulation. 
4808 */ 4809 /* ARGSUSED3 */ 4810 static int 4811 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4812 int flags, vnode_t *rdir, cred_t *cr) 4813 { 4814 int error; 4815 vnode_t *vp, *avp = NULL; 4816 rnode4_t *drp; 4817 4818 *vpp = NULL; 4819 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4820 return (EPERM); 4821 /* 4822 * if LOOKUP_XATTR, must replace dvp (object) with 4823 * object's attrdir before continuing with lookup 4824 */ 4825 if (flags & LOOKUP_XATTR) { 4826 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4827 if (error) 4828 return (error); 4829 4830 dvp = avp; 4831 4832 /* 4833 * If lookup is for "", just return dvp now. The attrdir 4834 * has already been activated (from nfs4lookup_xattr), and 4835 * the caller will RELE the original dvp -- not 4836 * the attrdir. So, set vpp and return. 4837 * Currently, when the LOOKUP_XATTR flag is 4838 * passed to VOP_LOOKUP, the name is always empty, and 4839 * shortcircuiting here avoids 3 unneeded lock/unlock 4840 * pairs. 4841 * 4842 * If a non-empty name was provided, then it is the 4843 * attribute name, and it will be looked up below. 4844 */ 4845 if (*nm == '\0') { 4846 *vpp = dvp; 4847 return (0); 4848 } 4849 4850 /* 4851 * The vfs layer never sends a name when asking for the 4852 * attrdir, so we should never get here (unless of course 4853 * name is passed at some time in future -- at which time 4854 * we'll blow up here). 4855 */ 4856 ASSERT(0); 4857 } 4858 4859 drp = VTOR4(dvp); 4860 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4861 return (EINTR); 4862 4863 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4864 nfs_rw_exit(&drp->r_rwlock); 4865 4866 /* 4867 * If vnode is a device, create special vnode. 
4868 */ 4869 if (!error && ISVDEV((*vpp)->v_type)) { 4870 vp = *vpp; 4871 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4872 VN_RELE(vp); 4873 } 4874 4875 return (error); 4876 } 4877 4878 /* ARGSUSED */ 4879 static int 4880 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 4881 { 4882 int error; 4883 rnode4_t *drp; 4884 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 4885 mntinfo4_t *mi; 4886 4887 mi = VTOMI4(dvp); 4888 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR)) 4889 return (EINVAL); 4890 4891 drp = VTOR4(dvp); 4892 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4893 return (EINTR); 4894 4895 mutex_enter(&drp->r_statelock); 4896 /* 4897 * If the server doesn't support xattrs just return EINVAL 4898 */ 4899 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 4900 mutex_exit(&drp->r_statelock); 4901 nfs_rw_exit(&drp->r_rwlock); 4902 return (EINVAL); 4903 } 4904 4905 /* 4906 * If there is a cached xattr directory entry, 4907 * use it as long as the attributes are valid. If the 4908 * attributes are not valid, take the simple approach and 4909 * free the cached value and re-fetch a new value. 4910 * 4911 * We don't negative entry cache for now, if we did we 4912 * would need to check if the file has changed on every 4913 * lookup. But xattrs don't exist very often and failing 4914 * an openattr is not much more expensive than and NVERIFY or GETATTR 4915 * so do an openattr over the wire for now. 
4916 */ 4917 if (drp->r_xattr_dir != NULL) { 4918 if (ATTRCACHE4_VALID(dvp)) { 4919 VN_HOLD(drp->r_xattr_dir); 4920 *vpp = drp->r_xattr_dir; 4921 mutex_exit(&drp->r_statelock); 4922 nfs_rw_exit(&drp->r_rwlock); 4923 return (0); 4924 } 4925 VN_RELE(drp->r_xattr_dir); 4926 drp->r_xattr_dir = NULL; 4927 } 4928 mutex_exit(&drp->r_statelock); 4929 4930 error = nfs4openattr(dvp, vpp, cflag, cr); 4931 4932 nfs_rw_exit(&drp->r_rwlock); 4933 4934 return (error); 4935 } 4936 4937 static int 4938 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 4939 { 4940 int error; 4941 rnode4_t *drp; 4942 4943 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 4944 4945 /* 4946 * If lookup is for "", just return dvp. Don't need 4947 * to send it over the wire, look it up in the dnlc, 4948 * or perform any access checks. 4949 */ 4950 if (*nm == '\0') { 4951 VN_HOLD(dvp); 4952 *vpp = dvp; 4953 return (0); 4954 } 4955 4956 /* 4957 * Can't do lookups in non-directories. 4958 */ 4959 if (dvp->v_type != VDIR) 4960 return (ENOTDIR); 4961 4962 /* 4963 * If lookup is for ".", just return dvp. Don't need 4964 * to send it over the wire or look it up in the dnlc, 4965 * just need to check access. 4966 */ 4967 if (nm[0] == '.' && nm[1] == '\0') { 4968 error = nfs4_access(dvp, VEXEC, 0, cr); 4969 if (error) 4970 return (error); 4971 VN_HOLD(dvp); 4972 *vpp = dvp; 4973 return (0); 4974 } 4975 4976 drp = VTOR4(dvp); 4977 if (!(drp->r_flags & R4LOOKUP)) { 4978 mutex_enter(&drp->r_statelock); 4979 drp->r_flags |= R4LOOKUP; 4980 mutex_exit(&drp->r_statelock); 4981 } 4982 4983 *vpp = NULL; 4984 /* 4985 * Lookup this name in the DNLC. If there is no entry 4986 * lookup over the wire. 4987 */ 4988 if (!skipdnlc) 4989 *vpp = dnlc_lookup(dvp, nm); 4990 if (*vpp == NULL) { 4991 /* 4992 * We need to go over the wire to lookup the name. 
4993 */ 4994 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 4995 } 4996 4997 /* 4998 * We hit on the dnlc 4999 */ 5000 if (*vpp != DNLC_NO_VNODE || 5001 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5002 /* 5003 * But our attrs may not be valid. 5004 */ 5005 if (ATTRCACHE4_VALID(dvp)) { 5006 error = nfs4_waitfor_purge_complete(dvp); 5007 if (error) { 5008 VN_RELE(*vpp); 5009 *vpp = NULL; 5010 return (error); 5011 } 5012 5013 /* 5014 * If after the purge completes, check to make sure 5015 * our attrs are still valid. 5016 */ 5017 if (ATTRCACHE4_VALID(dvp)) { 5018 /* 5019 * If we waited for a purge we may have 5020 * lost our vnode so look it up again. 5021 */ 5022 VN_RELE(*vpp); 5023 *vpp = dnlc_lookup(dvp, nm); 5024 if (*vpp == NULL) 5025 return (nfs4lookupnew_otw(dvp, 5026 nm, vpp, cr)); 5027 5028 /* 5029 * The access cache should almost always hit 5030 */ 5031 error = nfs4_access(dvp, VEXEC, 0, cr); 5032 5033 if (error) { 5034 VN_RELE(*vpp); 5035 *vpp = NULL; 5036 return (error); 5037 } 5038 if (*vpp == DNLC_NO_VNODE) { 5039 VN_RELE(*vpp); 5040 *vpp = NULL; 5041 return (ENOENT); 5042 } 5043 return (0); 5044 } 5045 } 5046 } 5047 5048 ASSERT(*vpp != NULL); 5049 5050 /* 5051 * We may have gotten here we have one of the following cases: 5052 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5053 * need to validate them. 5054 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5055 * must validate. 5056 * 5057 * Go to the server and check if the directory has changed, if 5058 * it hasn't we are done and can use the dnlc entry. 5059 */ 5060 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5061 } 5062 5063 /* 5064 * Go to the server and check if the directory has changed, if 5065 * it hasn't we are done and can use the dnlc entry. If it 5066 * has changed we get a new copy of its attributes and check 5067 * the access for VEXEC, then relookup the filename and 5068 * get its filehandle and attributes. 
5069 * 5070 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5071 * if the NVERIFY failed we must 5072 * purge the caches 5073 * cache new attributes (will set r_time_attr_inval) 5074 * cache new access 5075 * recheck VEXEC access 5076 * add name to dnlc, possibly negative 5077 * if LOOKUP succeeded 5078 * cache new attributes 5079 * else 5080 * set a new r_time_attr_inval for dvp 5081 * check to make sure we have access 5082 * 5083 * The vpp returned is the vnode passed in if the directory is valid, 5084 * a new vnode if successful lookup, or NULL on error. 5085 */ 5086 static int 5087 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5088 { 5089 COMPOUND4args_clnt args; 5090 COMPOUND4res_clnt res; 5091 fattr4 *ver_fattr; 5092 fattr4_change dchange; 5093 int32_t *ptr; 5094 int argoplist_size = 7 * sizeof (nfs_argop4); 5095 nfs_argop4 *argop; 5096 int doqueue; 5097 mntinfo4_t *mi; 5098 nfs4_recov_state_t recov_state; 5099 hrtime_t t; 5100 int isdotdot; 5101 vnode_t *nvp; 5102 nfs_fh4 *fhp; 5103 nfs4_sharedfh_t *sfhp; 5104 nfs4_access_type_t cacc; 5105 rnode4_t *nrp; 5106 rnode4_t *drp = VTOR4(dvp); 5107 nfs4_ga_res_t *garp = NULL; 5108 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5109 5110 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5111 ASSERT(nm != NULL); 5112 ASSERT(nm[0] != '\0'); 5113 ASSERT(dvp->v_type == VDIR); 5114 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5115 ASSERT(*vpp != NULL); 5116 5117 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5118 isdotdot = 1; 5119 args.ctag = TAG_LOOKUP_VPARENT; 5120 } else { 5121 /* 5122 * Do not allow crossing of server mount points. The 5123 * only visible entries in a SRVSTUB dir are . and .. 5124 * This code handles the non-.. case. We can't even get 5125 * this far if looking up ".". 
5126 */ 5127 if (VTOR4(dvp)->r_flags & R4SRVSTUB) { 5128 VN_RELE(*vpp); 5129 *vpp = NULL; 5130 return (ENOENT); 5131 } 5132 isdotdot = 0; 5133 args.ctag = TAG_LOOKUP_VALID; 5134 } 5135 5136 mi = VTOMI4(dvp); 5137 recov_state.rs_flags = 0; 5138 recov_state.rs_num_retry_despite_err = 0; 5139 5140 nvp = NULL; 5141 5142 /* Save the original mount point security information */ 5143 (void) save_mnt_secinfo(mi->mi_curr_serv); 5144 5145 recov_retry: 5146 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5147 &recov_state, NULL); 5148 if (e.error) { 5149 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5150 VN_RELE(*vpp); 5151 *vpp = NULL; 5152 return (e.error); 5153 } 5154 5155 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5156 5157 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5158 args.array_len = 7; 5159 args.array = argop; 5160 5161 /* 0. putfh file */ 5162 argop[0].argop = OP_CPUTFH; 5163 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5164 5165 /* 1. nverify the change info */ 5166 argop[1].argop = OP_NVERIFY; 5167 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5168 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5169 ver_fattr->attrlist4 = (char *)&dchange; 5170 ptr = (int32_t *)&dchange; 5171 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5172 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5173 5174 /* 2. getattr directory */ 5175 argop[2].argop = OP_GETATTR; 5176 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5177 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5178 5179 /* 3. access directory */ 5180 argop[3].argop = OP_ACCESS; 5181 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5182 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5183 5184 /* 4. lookup name */ 5185 if (isdotdot) { 5186 argop[4].argop = OP_LOOKUPP; 5187 } else { 5188 argop[4].argop = OP_CLOOKUP; 5189 argop[4].nfs_argop4_u.opclookup.cname = nm; 5190 } 5191 5192 /* 5. 
resulting file handle */ 5193 argop[5].argop = OP_GETFH; 5194 5195 /* 6. resulting file attributes */ 5196 argop[6].argop = OP_GETATTR; 5197 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5198 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5199 5200 doqueue = 1; 5201 t = gethrtime(); 5202 5203 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5204 5205 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5206 /* 5207 * For WRONGSEC of a non-dotdot case, send secinfo directly 5208 * from this thread, do not go thru the recovery thread since 5209 * we need the nm information. 5210 * 5211 * Not doing dotdot case because there is no specification 5212 * for (PUTFH, SECINFO "..") yet. 5213 */ 5214 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5215 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5216 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5217 &recov_state, FALSE); 5218 } else { 5219 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5220 &recov_state, TRUE); 5221 } 5222 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5223 kmem_free(argop, argoplist_size); 5224 if (!e.error) 5225 goto recov_retry; 5226 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5227 VN_RELE(*vpp); 5228 *vpp = NULL; 5229 return (e.error); 5230 } 5231 5232 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5233 OP_LOOKUP, NULL) == FALSE) { 5234 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5235 &recov_state, TRUE); 5236 5237 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5238 kmem_free(argop, argoplist_size); 5239 goto recov_retry; 5240 } 5241 } 5242 5243 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5244 5245 if (e.error || res.array_len == 0) { 5246 /* 5247 * If e.error isn't set, then reply has no ops (or we couldn't 5248 * be here). The only legal way to reply without an op array 5249 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5250 * be in the reply for all other status values. 
5251 * 5252 * For valid replies without an ops array, return ENOTSUP 5253 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5254 * return EIO -- don't trust status. 5255 */ 5256 if (e.error == 0) 5257 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5258 ENOTSUP : EIO; 5259 VN_RELE(*vpp); 5260 *vpp = NULL; 5261 kmem_free(argop, argoplist_size); 5262 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5263 return (e.error); 5264 } 5265 5266 if (res.status != NFS4ERR_SAME) { 5267 e.error = geterrno4(res.status); 5268 5269 /* 5270 * The NVERIFY "failed" so the directory has changed 5271 * First make sure PUTFH succeeded and NVERIFY "failed" 5272 * cleanly. 5273 */ 5274 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5275 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5276 nfs4_purge_stale_fh(e.error, dvp, cr); 5277 VN_RELE(*vpp); 5278 *vpp = NULL; 5279 goto exit; 5280 } 5281 5282 /* 5283 * We know the NVERIFY "failed" so we must: 5284 * purge the caches (access and indirectly dnlc if needed) 5285 */ 5286 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5287 5288 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5289 nfs4_purge_stale_fh(e.error, dvp, cr); 5290 VN_RELE(*vpp); 5291 *vpp = NULL; 5292 goto exit; 5293 } 5294 5295 /* 5296 * Install new cached attributes for the directory 5297 */ 5298 nfs4_attr_cache(dvp, 5299 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5300 t, cr, FALSE, NULL); 5301 5302 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5303 nfs4_purge_stale_fh(e.error, dvp, cr); 5304 VN_RELE(*vpp); 5305 *vpp = NULL; 5306 e.error = geterrno4(res.status); 5307 goto exit; 5308 } 5309 5310 /* 5311 * Now we know the directory is valid, 5312 * cache new directory access 5313 */ 5314 nfs4_access_cache(drp, 5315 args.array[3].nfs_argop4_u.opaccess.access, 5316 res.array[3].nfs_resop4_u.opaccess.access, cr); 5317 5318 /* 5319 * recheck VEXEC access 5320 */ 5321 cacc = nfs4_access_check(drp, 
ACCESS4_LOOKUP, cr); 5322 if (cacc != NFS4_ACCESS_ALLOWED) { 5323 /* 5324 * Directory permissions might have been revoked 5325 */ 5326 if (cacc == NFS4_ACCESS_DENIED) { 5327 e.error = EACCES; 5328 VN_RELE(*vpp); 5329 *vpp = NULL; 5330 goto exit; 5331 } 5332 5333 /* 5334 * Somehow we must not have asked for enough 5335 * so try a singleton ACCESS, should never happen. 5336 */ 5337 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5338 if (e.error) { 5339 VN_RELE(*vpp); 5340 *vpp = NULL; 5341 goto exit; 5342 } 5343 } 5344 5345 e.error = geterrno4(res.status); 5346 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5347 /* 5348 * The lookup failed, probably no entry 5349 */ 5350 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5351 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5352 } else { 5353 /* 5354 * Might be some other error, so remove 5355 * the dnlc entry to make sure we start all 5356 * over again, next time. 5357 */ 5358 dnlc_remove(dvp, nm); 5359 } 5360 VN_RELE(*vpp); 5361 *vpp = NULL; 5362 goto exit; 5363 } 5364 5365 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5366 /* 5367 * The file exists but we can't get its fh for 5368 * some unknown reason. Remove it from the dnlc 5369 * and error out to be safe. 5370 */ 5371 dnlc_remove(dvp, nm); 5372 VN_RELE(*vpp); 5373 *vpp = NULL; 5374 goto exit; 5375 } 5376 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5377 if (fhp->nfs_fh4_len == 0) { 5378 /* 5379 * The file exists but a bogus fh 5380 * some unknown reason. Remove it from the dnlc 5381 * and error out to be safe. 
5382 */ 5383 e.error = ENOENT; 5384 dnlc_remove(dvp, nm); 5385 VN_RELE(*vpp); 5386 *vpp = NULL; 5387 goto exit; 5388 } 5389 sfhp = sfh4_get(fhp, mi); 5390 5391 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5392 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5393 5394 /* 5395 * Make the new rnode 5396 */ 5397 if (isdotdot) { 5398 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5399 if (e.error) { 5400 sfh4_rele(&sfhp); 5401 VN_RELE(*vpp); 5402 *vpp = NULL; 5403 goto exit; 5404 } 5405 /* 5406 * XXX if nfs4_make_dotdot uses an existing rnode 5407 * XXX it doesn't update the attributes. 5408 * XXX for now just save them again to save an OTW 5409 */ 5410 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5411 } else { 5412 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5413 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5414 /* 5415 * If v_type == VNON, then garp was NULL because 5416 * the last op in the compound failed and makenfs4node 5417 * could not find the vnode for sfhp. It created 5418 * a new vnode, so we have nothing to purge here. 5419 */ 5420 if (nvp->v_type == VNON) { 5421 vattr_t vattr; 5422 5423 vattr.va_mask = AT_TYPE; 5424 /* 5425 * N.B. We've already called nfs4_end_fop above. 
5426 */ 5427 e.error = nfs4getattr(nvp, &vattr, cr); 5428 if (e.error) { 5429 sfh4_rele(&sfhp); 5430 VN_RELE(*vpp); 5431 *vpp = NULL; 5432 VN_RELE(nvp); 5433 goto exit; 5434 } 5435 nvp->v_type = vattr.va_type; 5436 } 5437 } 5438 sfh4_rele(&sfhp); 5439 5440 nrp = VTOR4(nvp); 5441 mutex_enter(&nrp->r_statev4_lock); 5442 if (!nrp->created_v4) { 5443 mutex_exit(&nrp->r_statev4_lock); 5444 dnlc_update(dvp, nm, nvp); 5445 } else 5446 mutex_exit(&nrp->r_statev4_lock); 5447 5448 VN_RELE(*vpp); 5449 *vpp = nvp; 5450 } else { 5451 hrtime_t now; 5452 hrtime_t delta = 0; 5453 5454 e.error = 0; 5455 5456 /* 5457 * Because the NVERIFY "succeeded" we know that the 5458 * directory attributes are still valid 5459 * so update r_time_attr_inval 5460 */ 5461 now = gethrtime(); 5462 mutex_enter(&drp->r_statelock); 5463 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5464 delta = now - drp->r_time_attr_saved; 5465 if (delta < mi->mi_acdirmin) 5466 delta = mi->mi_acdirmin; 5467 else if (delta > mi->mi_acdirmax) 5468 delta = mi->mi_acdirmax; 5469 } 5470 drp->r_time_attr_inval = now + delta; 5471 mutex_exit(&drp->r_statelock); 5472 dnlc_update(dvp, nm, *vpp); 5473 5474 /* 5475 * Even though we have a valid directory attr cache 5476 * and dnlc entry, we may not have access. 5477 * This should almost always hit the cache. 
5478 */ 5479 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5480 if (e.error) { 5481 VN_RELE(*vpp); 5482 *vpp = NULL; 5483 } 5484 5485 if (*vpp == DNLC_NO_VNODE) { 5486 VN_RELE(*vpp); 5487 *vpp = NULL; 5488 e.error = ENOENT; 5489 } 5490 } 5491 5492 exit: 5493 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5494 kmem_free(argop, argoplist_size); 5495 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5496 return (e.error); 5497 } 5498 5499 /* 5500 * We need to go over the wire to lookup the name, but 5501 * while we are there verify the directory has not 5502 * changed but if it has, get new attributes and check access 5503 * 5504 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5505 * NVERIFY GETATTR ACCESS 5506 * 5507 * With the results: 5508 * if the NVERIFY failed we must purge the caches, add new attributes, 5509 * and cache new access. 5510 * set a new r_time_attr_inval 5511 * add name to dnlc, possibly negative 5512 * if LOOKUP succeeded 5513 * cache new attributes 5514 */ 5515 static int 5516 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5517 { 5518 COMPOUND4args_clnt args; 5519 COMPOUND4res_clnt res; 5520 fattr4 *ver_fattr; 5521 fattr4_change dchange; 5522 int32_t *ptr; 5523 nfs4_ga_res_t *garp = NULL; 5524 int argoplist_size = 9 * sizeof (nfs_argop4); 5525 nfs_argop4 *argop; 5526 int doqueue; 5527 mntinfo4_t *mi; 5528 nfs4_recov_state_t recov_state; 5529 hrtime_t t; 5530 int isdotdot; 5531 vnode_t *nvp; 5532 nfs_fh4 *fhp; 5533 nfs4_sharedfh_t *sfhp; 5534 nfs4_access_type_t cacc; 5535 rnode4_t *nrp; 5536 rnode4_t *drp = VTOR4(dvp); 5537 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5538 5539 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5540 ASSERT(nm != NULL); 5541 ASSERT(nm[0] != '\0'); 5542 ASSERT(dvp->v_type == VDIR); 5543 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5544 ASSERT(*vpp == NULL); 5545 5546 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5547 isdotdot = 1; 5548 args.ctag = TAG_LOOKUP_PARENT; 5549 } else { 5550 /* 5551 * Do not allow crossing of server mount points. The 5552 * only visible entries in a SRVSTUB dir are . and .. 5553 * This code handles the non-.. case. We can't even get 5554 * this far if looking up ".". 5555 */ 5556 if (VTOR4(dvp)->r_flags & R4SRVSTUB) 5557 return (ENOENT); 5558 5559 isdotdot = 0; 5560 args.ctag = TAG_LOOKUP; 5561 } 5562 5563 mi = VTOMI4(dvp); 5564 recov_state.rs_flags = 0; 5565 recov_state.rs_num_retry_despite_err = 0; 5566 5567 nvp = NULL; 5568 5569 /* Save the original mount point security information */ 5570 (void) save_mnt_secinfo(mi->mi_curr_serv); 5571 5572 recov_retry: 5573 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5574 &recov_state, NULL); 5575 if (e.error) { 5576 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5577 return (e.error); 5578 } 5579 5580 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5581 5582 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5583 args.array_len = 9; 5584 args.array = argop; 5585 5586 /* 0. putfh file */ 5587 argop[0].argop = OP_CPUTFH; 5588 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5589 5590 /* 1. savefh for the nverify */ 5591 argop[1].argop = OP_SAVEFH; 5592 5593 /* 2. lookup name */ 5594 if (isdotdot) { 5595 argop[2].argop = OP_LOOKUPP; 5596 } else { 5597 argop[2].argop = OP_CLOOKUP; 5598 argop[2].nfs_argop4_u.opclookup.cname = nm; 5599 } 5600 5601 /* 3. resulting file handle */ 5602 argop[3].argop = OP_GETFH; 5603 5604 /* 4. resulting file attributes */ 5605 argop[4].argop = OP_GETATTR; 5606 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5607 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5608 5609 /* 5. restorefh back the directory for the nverify */ 5610 argop[5].argop = OP_RESTOREFH; 5611 5612 /* 6. 
nverify the change info */ 5613 argop[6].argop = OP_NVERIFY; 5614 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5615 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5616 ver_fattr->attrlist4 = (char *)&dchange; 5617 ptr = (int32_t *)&dchange; 5618 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5619 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5620 5621 /* 7. getattr directory */ 5622 argop[7].argop = OP_GETATTR; 5623 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5624 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5625 5626 /* 8. access directory */ 5627 argop[8].argop = OP_ACCESS; 5628 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5629 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5630 5631 doqueue = 1; 5632 t = gethrtime(); 5633 5634 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5635 5636 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5637 /* 5638 * For WRONGSEC of a non-dotdot case, send secinfo directly 5639 * from this thread, do not go thru the recovery thread since 5640 * we need the nm information. 5641 * 5642 * Not doing dotdot case because there is no specification 5643 * for (PUTFH, SECINFO "..") yet. 
5644 */ 5645 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5646 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5647 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5648 &recov_state, FALSE); 5649 } else { 5650 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5651 &recov_state, TRUE); 5652 } 5653 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5654 kmem_free(argop, argoplist_size); 5655 if (!e.error) 5656 goto recov_retry; 5657 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5658 return (e.error); 5659 } 5660 5661 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5662 OP_LOOKUP, NULL) == FALSE) { 5663 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5664 &recov_state, TRUE); 5665 5666 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5667 kmem_free(argop, argoplist_size); 5668 goto recov_retry; 5669 } 5670 } 5671 5672 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5673 5674 if (e.error || res.array_len == 0) { 5675 /* 5676 * If e.error isn't set, then reply has no ops (or we couldn't 5677 * be here). The only legal way to reply without an op array 5678 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5679 * be in the reply for all other status values. 5680 * 5681 * For valid replies without an ops array, return ENOTSUP 5682 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5683 * return EIO -- don't trust status. 5684 */ 5685 if (e.error == 0) 5686 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5687 ENOTSUP : EIO; 5688 5689 kmem_free(argop, argoplist_size); 5690 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5691 return (e.error); 5692 } 5693 5694 e.error = geterrno4(res.status); 5695 5696 /* 5697 * The PUTFH and SAVEFH may have failed. 
5698 */ 5699 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5700 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5701 nfs4_purge_stale_fh(e.error, dvp, cr); 5702 goto exit; 5703 } 5704 5705 /* 5706 * Check if the file exists, if it does delay entering 5707 * into the dnlc until after we update the directory 5708 * attributes so we don't cause it to get purged immediately. 5709 */ 5710 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5711 /* 5712 * The lookup failed, probably no entry 5713 */ 5714 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5715 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5716 } 5717 goto exit; 5718 } 5719 5720 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5721 /* 5722 * The file exists but we can't get its fh for 5723 * some unknown reason. Error out to be safe. 5724 */ 5725 goto exit; 5726 } 5727 5728 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5729 if (fhp->nfs_fh4_len == 0) { 5730 /* 5731 * The file exists but a bogus fh 5732 * some unknown reason. Error out to be safe. 5733 */ 5734 e.error = EIO; 5735 goto exit; 5736 } 5737 sfhp = sfh4_get(fhp, mi); 5738 5739 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5740 sfh4_rele(&sfhp); 5741 e.error = EIO; 5742 goto exit; 5743 } 5744 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5745 5746 /* 5747 * The RESTOREFH may have failed 5748 */ 5749 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5750 sfh4_rele(&sfhp); 5751 e.error = EIO; 5752 goto exit; 5753 } 5754 5755 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5756 /* 5757 * First make sure the NVERIFY failed as we expected, 5758 * if it didn't then be conservative and error out 5759 * as we can't trust the directory. 
5760 */ 5761 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5762 sfh4_rele(&sfhp); 5763 e.error = EIO; 5764 goto exit; 5765 } 5766 5767 /* 5768 * We know the NVERIFY "failed" so the directory has changed, 5769 * so we must: 5770 * purge the caches (access and indirectly dnlc if needed) 5771 */ 5772 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5773 5774 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5775 sfh4_rele(&sfhp); 5776 goto exit; 5777 } 5778 nfs4_attr_cache(dvp, 5779 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5780 t, cr, FALSE, NULL); 5781 5782 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5783 nfs4_purge_stale_fh(e.error, dvp, cr); 5784 sfh4_rele(&sfhp); 5785 e.error = geterrno4(res.status); 5786 goto exit; 5787 } 5788 5789 /* 5790 * Now we know the directory is valid, 5791 * cache new directory access 5792 */ 5793 nfs4_access_cache(drp, 5794 args.array[8].nfs_argop4_u.opaccess.access, 5795 res.array[8].nfs_resop4_u.opaccess.access, cr); 5796 5797 /* 5798 * recheck VEXEC access 5799 */ 5800 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5801 if (cacc != NFS4_ACCESS_ALLOWED) { 5802 /* 5803 * Directory permissions might have been revoked 5804 */ 5805 if (cacc == NFS4_ACCESS_DENIED) { 5806 sfh4_rele(&sfhp); 5807 e.error = EACCES; 5808 goto exit; 5809 } 5810 5811 /* 5812 * Somehow we must not have asked for enough 5813 * so try a singleton ACCESS should never happen 5814 */ 5815 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5816 if (e.error) { 5817 sfh4_rele(&sfhp); 5818 goto exit; 5819 } 5820 } 5821 5822 e.error = geterrno4(res.status); 5823 } else { 5824 hrtime_t now; 5825 hrtime_t delta = 0; 5826 5827 e.error = 0; 5828 5829 /* 5830 * Because the NVERIFY "succeeded" we know that the 5831 * directory attributes are still valid 5832 * so update r_time_attr_inval 5833 */ 5834 now = gethrtime(); 5835 mutex_enter(&drp->r_statelock); 5836 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5837 delta = 
now - drp->r_time_attr_saved; 5838 if (delta < mi->mi_acdirmin) 5839 delta = mi->mi_acdirmin; 5840 else if (delta > mi->mi_acdirmax) 5841 delta = mi->mi_acdirmax; 5842 } 5843 drp->r_time_attr_inval = now + delta; 5844 mutex_exit(&drp->r_statelock); 5845 5846 /* 5847 * Even though we have a valid directory attr cache, 5848 * we may not have access. 5849 * This should almost always hit the cache. 5850 */ 5851 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5852 if (e.error) { 5853 sfh4_rele(&sfhp); 5854 goto exit; 5855 } 5856 } 5857 5858 /* 5859 * Now we have successfully completed the lookup, if the 5860 * directory has changed we now have the valid attributes. 5861 * We also know we have directory access. 5862 * Create the new rnode and insert it in the dnlc. 5863 */ 5864 if (isdotdot) { 5865 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5866 if (e.error) { 5867 sfh4_rele(&sfhp); 5868 goto exit; 5869 } 5870 /* 5871 * XXX if nfs4_make_dotdot uses an existing rnode 5872 * XXX it doesn't update the attributes. 
5873 * XXX for now just save them again to save an OTW 5874 */ 5875 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5876 } else { 5877 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5878 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5879 } 5880 sfh4_rele(&sfhp); 5881 5882 nrp = VTOR4(nvp); 5883 mutex_enter(&nrp->r_statev4_lock); 5884 if (!nrp->created_v4) { 5885 mutex_exit(&nrp->r_statev4_lock); 5886 dnlc_update(dvp, nm, nvp); 5887 } else 5888 mutex_exit(&nrp->r_statev4_lock); 5889 5890 *vpp = nvp; 5891 5892 exit: 5893 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5894 kmem_free(argop, argoplist_size); 5895 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5896 return (e.error); 5897 } 5898 5899 #ifdef DEBUG 5900 void 5901 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 5902 { 5903 uint_t i, len; 5904 zoneid_t zoneid = getzoneid(); 5905 char *s; 5906 5907 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 5908 for (i = 0; i < argcnt; i++) { 5909 nfs_argop4 *op = &argbase[i]; 5910 switch (op->argop) { 5911 case OP_CPUTFH: 5912 case OP_PUTFH: 5913 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 5914 break; 5915 case OP_PUTROOTFH: 5916 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 5917 break; 5918 case OP_CLOOKUP: 5919 s = op->nfs_argop4_u.opclookup.cname; 5920 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5921 break; 5922 case OP_LOOKUP: 5923 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 5924 &len, NULL); 5925 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5926 kmem_free(s, len); 5927 break; 5928 case OP_LOOKUPP: 5929 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 5930 break; 5931 case OP_GETFH: 5932 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 5933 break; 5934 case OP_GETATTR: 5935 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 5936 break; 5937 case OP_OPENATTR: 5938 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 5939 break; 5940 default: 5941 zcmn_err(zoneid, CE_NOTE, "\t op %d, 
opcode %d", i, 5942 op->argop); 5943 break; 5944 } 5945 } 5946 } 5947 #endif 5948 5949 /* 5950 * nfs4lookup_setup - constructs a multi-lookup compound request. 5951 * 5952 * Given the path "nm1/nm2/.../nmn", the following compound requests 5953 * may be created: 5954 * 5955 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 5956 * is faster, for now. 5957 * 5958 * l4_getattrs indicates the type of compound requested. 5959 * 5960 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 5961 * 5962 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 5963 * 5964 * total number of ops is n + 1. 5965 * 5966 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 5967 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 5968 * before the last component, and only get attributes 5969 * for the last component. Note that the second-to-last 5970 * pathname component is XATTR_RPATH, which does NOT go 5971 * over-the-wire as a lookup. 5972 * 5973 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 5974 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 5975 * 5976 * and total number of ops is n + 5. 5977 * 5978 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 5979 * attribute directory: create lookups plus an OPENATTR 5980 * replacing the last lookup. Note that the last pathname 5981 * component is XATTR_RPATH, which does NOT go over-the-wire 5982 * as a lookup. 5983 * 5984 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 5985 * Openattr; Getfh; Getattr } 5986 * 5987 * and total number of ops is n + 5. 5988 * 5989 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 5990 * nodes too. 5991 * 5992 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 5993 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 5994 * 5995 * and total number of ops is 3*n + 1. 
5996 * 5997 * All cases: returns the index in the arg array of the final LOOKUP op, or 5998 * -1 if no LOOKUPs were used. 5999 */ 6000 int 6001 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6002 { 6003 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6004 nfs_argop4 *argbase, *argop; 6005 int arglen, argcnt; 6006 int n = 1; /* number of components */ 6007 int nga = 1; /* number of Getattr's in request */ 6008 char c = '\0', *s, *p; 6009 int lookup_idx = -1; 6010 int argoplist_size; 6011 6012 /* set lookuparg response result to 0 */ 6013 lookupargp->resp->status = NFS4_OK; 6014 6015 /* skip leading "/" or "." e.g. ".//./" if there is */ 6016 for (; ; nm++) { 6017 if (*nm != '/' && *nm != '.') 6018 break; 6019 6020 /* ".." is counted as 1 component */ 6021 if (*nm == '.' && *(nm + 1) == '.') 6022 break; 6023 } 6024 6025 /* 6026 * Find n = number of components - nm must be null terminated 6027 * Skip "." components. 6028 */ 6029 if (*nm != '\0') { 6030 for (n = 1, s = nm; *s != '\0'; s++) { 6031 if ((*s == '/') && (*(s + 1) != '/') && 6032 (*(s + 1) != '\0') && 6033 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6034 *(s + 2) == '\0'))) 6035 n++; 6036 } 6037 } else 6038 n = 0; 6039 6040 /* 6041 * nga is number of components that need Getfh+Getattr 6042 */ 6043 switch (l4_getattrs) { 6044 case LKP4_NO_ATTRIBUTES: 6045 nga = 0; 6046 break; 6047 case LKP4_ALL_ATTRIBUTES: 6048 nga = n; 6049 /* 6050 * Always have at least 1 getfh, getattr pair 6051 */ 6052 if (nga == 0) 6053 nga++; 6054 break; 6055 case LKP4_LAST_ATTRDIR: 6056 case LKP4_LAST_NAMED_ATTR: 6057 nga = n+1; 6058 break; 6059 } 6060 6061 /* 6062 * If change to use the filehandle attr instead of getfh 6063 * the following line can be deleted. 
6064 */ 6065 nga *= 2; 6066 6067 /* 6068 * calculate number of ops in request as 6069 * header + trailer + lookups + getattrs 6070 */ 6071 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6072 6073 argoplist_size = arglen * sizeof (nfs_argop4); 6074 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6075 lookupargp->argsp->array = argop; 6076 6077 argcnt = lookupargp->header_len; 6078 argop += argcnt; 6079 6080 /* 6081 * loop and create a lookup op and possibly getattr/getfh for 6082 * each component. Skip "." components. 6083 */ 6084 for (s = nm; *s != '\0'; s = p) { 6085 /* 6086 * Set up a pathname struct for each component if needed 6087 */ 6088 while (*s == '/') 6089 s++; 6090 if (*s == '\0') 6091 break; 6092 for (p = s; (*p != '/') && (*p != '\0'); p++); 6093 c = *p; 6094 *p = '\0'; 6095 6096 if (s[0] == '.' && s[1] == '\0') { 6097 *p = c; 6098 continue; 6099 } 6100 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6101 strcmp(s, XATTR_RPATH) == 0) { 6102 /* getfh XXX may not be needed in future */ 6103 argop->argop = OP_GETFH; 6104 argop++; 6105 argcnt++; 6106 6107 /* getattr */ 6108 argop->argop = OP_GETATTR; 6109 argop->nfs_argop4_u.opgetattr.attr_request = 6110 lookupargp->ga_bits; 6111 argop->nfs_argop4_u.opgetattr.mi = 6112 lookupargp->mi; 6113 argop++; 6114 argcnt++; 6115 6116 /* openattr */ 6117 argop->argop = OP_OPENATTR; 6118 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6119 strcmp(s, XATTR_RPATH) == 0) { 6120 /* openattr */ 6121 argop->argop = OP_OPENATTR; 6122 argop++; 6123 argcnt++; 6124 6125 /* getfh XXX may not be needed in future */ 6126 argop->argop = OP_GETFH; 6127 argop++; 6128 argcnt++; 6129 6130 /* getattr */ 6131 argop->argop = OP_GETATTR; 6132 argop->nfs_argop4_u.opgetattr.attr_request = 6133 lookupargp->ga_bits; 6134 argop->nfs_argop4_u.opgetattr.mi = 6135 lookupargp->mi; 6136 argop++; 6137 argcnt++; 6138 *p = c; 6139 continue; 6140 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6141 /* lookupp */ 6142 argop->argop = OP_LOOKUPP; 6143 } else { 6144 /* lookup */ 6145 argop->argop = OP_LOOKUP; 6146 (void) str_to_utf8(s, 6147 &argop->nfs_argop4_u.oplookup.objname); 6148 } 6149 lookup_idx = argcnt; 6150 argop++; 6151 argcnt++; 6152 6153 *p = c; 6154 6155 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6156 /* getfh XXX may not be needed in future */ 6157 argop->argop = OP_GETFH; 6158 argop++; 6159 argcnt++; 6160 6161 /* getattr */ 6162 argop->argop = OP_GETATTR; 6163 argop->nfs_argop4_u.opgetattr.attr_request = 6164 lookupargp->ga_bits; 6165 argop->nfs_argop4_u.opgetattr.mi = 6166 lookupargp->mi; 6167 argop++; 6168 argcnt++; 6169 } 6170 } 6171 6172 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6173 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6174 if (needgetfh) { 6175 /* stick in a post-lookup getfh */ 6176 argop->argop = OP_GETFH; 6177 argcnt++; 6178 argop++; 6179 } 6180 /* post-lookup getattr */ 6181 argop->argop = OP_GETATTR; 6182 argop->nfs_argop4_u.opgetattr.attr_request = 6183 lookupargp->ga_bits; 6184 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6185 argcnt++; 6186 } 6187 argcnt += lookupargp->trailer_len; /* actual op count */ 6188 lookupargp->argsp->array_len = argcnt; 6189 lookupargp->arglen = arglen; 6190 6191 #ifdef DEBUG 6192 if (nfs4_client_lookup_debug) 6193 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6194 #endif 6195 6196 return (lookup_idx); 6197 } 6198 6199 static int 6200 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6201 { 6202 COMPOUND4args_clnt args; 6203 COMPOUND4res_clnt res; 6204 GETFH4res *gf_res = NULL; 6205 nfs_argop4 argop[4]; 6206 nfs_resop4 *resop = NULL; 6207 nfs4_sharedfh_t *sfhp; 6208 hrtime_t t; 6209 nfs4_error_t e; 6210 6211 rnode4_t *drp; 6212 int doqueue = 1; 6213 vnode_t *vp; 6214 int needrecov = 0; 6215 nfs4_recov_state_t recov_state; 6216 6217 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6218 6219 *avp = NULL; 6220 
recov_state.rs_flags = 0; 6221 recov_state.rs_num_retry_despite_err = 0; 6222 6223 recov_retry: 6224 /* COMPOUND: putfh, openattr, getfh, getattr */ 6225 args.array_len = 4; 6226 args.array = argop; 6227 args.ctag = TAG_OPENATTR; 6228 6229 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6230 if (e.error) 6231 return (e.error); 6232 6233 drp = VTOR4(dvp); 6234 6235 /* putfh */ 6236 argop[0].argop = OP_CPUTFH; 6237 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6238 6239 /* openattr */ 6240 argop[1].argop = OP_OPENATTR; 6241 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6242 6243 /* getfh */ 6244 argop[2].argop = OP_GETFH; 6245 6246 /* getattr */ 6247 argop[3].argop = OP_GETATTR; 6248 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6249 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6250 6251 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6252 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6253 rnode4info(drp))); 6254 6255 t = gethrtime(); 6256 6257 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6258 6259 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6260 if (needrecov) { 6261 bool_t abort; 6262 6263 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6264 "nfs4openattr: initiating recovery\n")); 6265 6266 abort = nfs4_start_recovery(&e, 6267 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6268 OP_OPENATTR, NULL); 6269 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6270 if (!e.error) { 6271 e.error = geterrno4(res.status); 6272 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6273 } 6274 if (abort == FALSE) 6275 goto recov_retry; 6276 return (e.error); 6277 } 6278 6279 if (e.error) { 6280 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6281 return (e.error); 6282 } 6283 6284 if (res.status) { 6285 /* 6286 * If OTW errro is NOTSUPP, then it should be 6287 * translated to EINVAL. 
All Solaris file system 6288 * implementations return EINVAL to the syscall layer 6289 * when the attrdir cannot be created due to an 6290 * implementation restriction or noxattr mount option. 6291 */ 6292 if (res.status == NFS4ERR_NOTSUPP) { 6293 mutex_enter(&drp->r_statelock); 6294 if (drp->r_xattr_dir) 6295 VN_RELE(drp->r_xattr_dir); 6296 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6297 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6298 mutex_exit(&drp->r_statelock); 6299 6300 e.error = EINVAL; 6301 } else { 6302 e.error = geterrno4(res.status); 6303 } 6304 6305 if (e.error) { 6306 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6307 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6308 needrecov); 6309 return (e.error); 6310 } 6311 } 6312 6313 resop = &res.array[0]; /* putfh res */ 6314 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6315 6316 resop = &res.array[1]; /* openattr res */ 6317 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6318 6319 resop = &res.array[2]; /* getfh res */ 6320 gf_res = &resop->nfs_resop4_u.opgetfh; 6321 if (gf_res->object.nfs_fh4_len == 0) { 6322 *avp = NULL; 6323 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6324 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6325 return (ENOENT); 6326 } 6327 6328 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6329 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6330 dvp->v_vfsp, t, cr, dvp, 6331 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH)); 6332 sfh4_rele(&sfhp); 6333 6334 if (e.error) 6335 PURGE_ATTRCACHE4(vp); 6336 6337 mutex_enter(&vp->v_lock); 6338 vp->v_flag |= V_XATTRDIR; 6339 mutex_exit(&vp->v_lock); 6340 6341 *avp = vp; 6342 6343 mutex_enter(&drp->r_statelock); 6344 if (drp->r_xattr_dir) 6345 VN_RELE(drp->r_xattr_dir); 6346 VN_HOLD(vp); 6347 drp->r_xattr_dir = vp; 6348 6349 /* 6350 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6351 * NULL. 
xattrs could be created at any time, and we have no 6352 * way to update pc4_xattr_exists in the base object if/when 6353 * it happens. 6354 */ 6355 drp->r_pathconf.pc4_xattr_valid = 0; 6356 6357 mutex_exit(&drp->r_statelock); 6358 6359 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6360 6361 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6362 6363 return (0); 6364 } 6365 6366 /* ARGSUSED */ 6367 static int 6368 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6369 int mode, vnode_t **vpp, cred_t *cr, int flags) 6370 { 6371 int error; 6372 vnode_t *vp = NULL; 6373 rnode4_t *rp; 6374 struct vattr vattr; 6375 rnode4_t *drp; 6376 vnode_t *tempvp; 6377 enum createmode4 createmode; 6378 bool_t must_trunc = FALSE; 6379 6380 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6381 return (EPERM); 6382 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6383 return (EINVAL); 6384 } 6385 6386 /* . and .. have special meaning in the protocol, reject them. */ 6387 6388 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6389 return (EISDIR); 6390 6391 drp = VTOR4(dvp); 6392 6393 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6394 return (EINTR); 6395 6396 top: 6397 /* 6398 * We make a copy of the attributes because the caller does not 6399 * expect us to change what va points to. 6400 */ 6401 vattr = *va; 6402 6403 /* 6404 * If the pathname is "", then dvp is the root vnode of 6405 * a remote file mounted over a local directory. 6406 * All that needs to be done is access 6407 * checking and truncation. Note that we avoid doing 6408 * open w/ create because the parent directory might 6409 * be in pseudo-fs and the open would fail. 6410 */ 6411 if (*nm == '\0') { 6412 error = 0; 6413 VN_HOLD(dvp); 6414 vp = dvp; 6415 must_trunc = TRUE; 6416 } else { 6417 /* 6418 * We need to go over the wire, just to be sure whether the 6419 * file exists or not. 
Using the DNLC can be dangerous in 6420 * this case when making a decision regarding existence. 6421 */ 6422 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6423 } 6424 6425 if (exclusive) 6426 createmode = EXCLUSIVE4; 6427 else 6428 createmode = GUARDED4; 6429 6430 /* 6431 * error would be set if the file does not exist on the 6432 * server, so lets go create it. 6433 */ 6434 if (error) { 6435 goto create_otw; 6436 } 6437 6438 /* 6439 * File does exist on the server 6440 */ 6441 if (exclusive == EXCL) 6442 error = EEXIST; 6443 else if (vp->v_type == VDIR && (mode & VWRITE)) 6444 error = EISDIR; 6445 else { 6446 /* 6447 * If vnode is a device, create special vnode. 6448 */ 6449 if (ISVDEV(vp->v_type)) { 6450 tempvp = vp; 6451 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6452 VN_RELE(tempvp); 6453 } 6454 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) { 6455 if ((vattr.va_mask & AT_SIZE) && 6456 vp->v_type == VREG) { 6457 rp = VTOR4(vp); 6458 /* 6459 * Check here for large file handled 6460 * by LF-unaware process (as 6461 * ufs_create() does) 6462 */ 6463 if (!(flags & FOFFMAX)) { 6464 mutex_enter(&rp->r_statelock); 6465 if (rp->r_size > MAXOFF32_T) 6466 error = EOVERFLOW; 6467 mutex_exit(&rp->r_statelock); 6468 } 6469 6470 /* if error is set then we need to return */ 6471 if (error) { 6472 nfs_rw_exit(&drp->r_rwlock); 6473 VN_RELE(vp); 6474 return (error); 6475 } 6476 6477 if (must_trunc) { 6478 vattr.va_mask = AT_SIZE; 6479 error = nfs4setattr(vp, &vattr, 0, cr, 6480 NULL); 6481 } else { 6482 /* 6483 * we know we have a regular file that already 6484 * exists and we may end up truncating the file 6485 * as a result of the open_otw, so flush out 6486 * any dirty pages for this file first. 
6487 */ 6488 if (nfs4_has_pages(vp) && 6489 ((rp->r_flags & R4DIRTY) || 6490 rp->r_count > 0 || 6491 rp->r_mapcnt > 0)) { 6492 error = nfs4_putpage(vp, 6493 (offset_t)0, 0, 0, cr); 6494 if (error && (error == ENOSPC || 6495 error == EDQUOT)) { 6496 mutex_enter( 6497 &rp->r_statelock); 6498 if (!rp->r_error) 6499 rp->r_error = 6500 error; 6501 mutex_exit( 6502 &rp->r_statelock); 6503 } 6504 } 6505 vattr.va_mask = (AT_SIZE | 6506 AT_TYPE | AT_MODE); 6507 vattr.va_type = VREG; 6508 createmode = UNCHECKED4; 6509 goto create_otw; 6510 } 6511 } 6512 } 6513 } 6514 nfs_rw_exit(&drp->r_rwlock); 6515 if (error) { 6516 VN_RELE(vp); 6517 } else { 6518 *vpp = vp; 6519 } 6520 return (error); 6521 6522 create_otw: 6523 dnlc_remove(dvp, nm); 6524 6525 ASSERT(vattr.va_mask & AT_TYPE); 6526 6527 /* 6528 * If not a regular file let nfs4mknod() handle it. 6529 */ 6530 if (vattr.va_type != VREG) { 6531 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6532 nfs_rw_exit(&drp->r_rwlock); 6533 return (error); 6534 } 6535 6536 /* 6537 * It _is_ a regular file. 6538 */ 6539 ASSERT(vattr.va_mask & AT_MODE); 6540 if (MANDMODE(vattr.va_mode)) { 6541 nfs_rw_exit(&drp->r_rwlock); 6542 return (EACCES); 6543 } 6544 6545 /* 6546 * If this happens to be a mknod of a regular file, then flags will 6547 * have neither FREAD or FWRITE. However, we must set at least one 6548 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6549 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6550 * set (based on openmode specified by app). 
6551 */ 6552 if ((flags & (FREAD|FWRITE)) == 0) 6553 flags |= (FREAD|FWRITE); 6554 6555 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6556 6557 if (vp != NULL) { 6558 /* if create was successful, throw away the file's pages */ 6559 if (!error && (vattr.va_mask & AT_SIZE)) 6560 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6561 cr); 6562 /* release the lookup hold */ 6563 VN_RELE(vp); 6564 vp = NULL; 6565 } 6566 6567 /* 6568 * validate that we opened a regular file. This handles a misbehaving 6569 * server that returns an incorrect FH. 6570 */ 6571 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6572 error = EISDIR; 6573 VN_RELE(*vpp); 6574 } 6575 6576 /* 6577 * If this is not an exclusive create, then the CREATE 6578 * request will be made with the GUARDED mode set. This 6579 * means that the server will return EEXIST if the file 6580 * exists. The file could exist because of a retransmitted 6581 * request. In this case, we recover by starting over and 6582 * checking to see whether the file exists. This second 6583 * time through it should and a CREATE request will not be 6584 * sent. 6585 * 6586 * This handles the problem of a dangling CREATE request 6587 * which contains attributes which indicate that the file 6588 * should be truncated. This retransmitted request could 6589 * possibly truncate valid data in the file if not caught 6590 * by the duplicate request mechanism on the server or if 6591 * not caught by other means. The scenario is: 6592 * 6593 * Client transmits CREATE request with size = 0 6594 * Client times out, retransmits request. 6595 * Response to the first request arrives from the server 6596 * and the client proceeds on. 6597 * Client writes data to the file. 6598 * The server now processes retransmitted CREATE request 6599 * and truncates file. 
6600 * 6601 * The use of the GUARDED CREATE request prevents this from 6602 * happening because the retransmitted CREATE would fail 6603 * with EEXIST and would not truncate the file. 6604 */ 6605 if (error == EEXIST && exclusive == NONEXCL) { 6606 #ifdef DEBUG 6607 nfs4_create_misses++; 6608 #endif 6609 goto top; 6610 } 6611 nfs_rw_exit(&drp->r_rwlock); 6612 return (error); 6613 } 6614 6615 /* 6616 * Create compound (for mkdir, mknod, symlink): 6617 * { Putfh <dfh>; Create; Getfh; Getattr } 6618 * It's okay if setattr failed to set gid - this is not considered 6619 * an error, but purge attrs in that case. 6620 */ 6621 static int 6622 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6623 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6624 { 6625 int need_end_op = FALSE; 6626 COMPOUND4args_clnt args; 6627 COMPOUND4res_clnt res, *resp = NULL; 6628 nfs_argop4 *argop; 6629 nfs_resop4 *resop; 6630 int doqueue; 6631 mntinfo4_t *mi; 6632 rnode4_t *drp = VTOR4(dvp); 6633 change_info4 *cinfo; 6634 GETFH4res *gf_res; 6635 struct vattr vattr; 6636 vnode_t *vp; 6637 fattr4 *crattr; 6638 bool_t needrecov = FALSE; 6639 nfs4_recov_state_t recov_state; 6640 nfs4_sharedfh_t *sfhp = NULL; 6641 hrtime_t t; 6642 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6643 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6644 dirattr_info_t dinfo, *dinfop; 6645 servinfo4_t *svp; 6646 bitmap4 supp_attrs; 6647 6648 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6649 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6650 6651 mi = VTOMI4(dvp); 6652 6653 /* 6654 * Make sure we properly deal with setting the right gid 6655 * on a new directory to reflect the parent's setgid bit 6656 */ 6657 setgid_flag = 0; 6658 if (type == NF4DIR) { 6659 struct vattr dva; 6660 6661 va->va_mode &= ~VSGID; 6662 dva.va_mask = AT_MODE | AT_GID; 6663 if (VOP_GETATTR(dvp, &dva, 0, cr) == 0) { 6664 6665 /* 6666 * If the parent's directory has the setgid bit set 6667 
* _and_ the client was able to get a valid mapping 6668 * for the parent dir's owner_group, we want to 6669 * append NVERIFY(owner_group == dva.va_gid) and 6670 * SETTATTR to the CREATE compound. 6671 */ 6672 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6673 setgid_flag = 1; 6674 va->va_mode |= VSGID; 6675 if (dva.va_gid != GID_NOBODY) { 6676 va->va_mask |= AT_GID; 6677 va->va_gid = dva.va_gid; 6678 } 6679 } 6680 } 6681 } 6682 6683 /* 6684 * Create ops: 6685 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6686 * 5:restorefh(dir) 6:getattr(dir) 6687 * 6688 * if (setgid) 6689 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6690 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6691 * 8:nverify 9:setattr 6692 */ 6693 if (setgid_flag) { 6694 numops = 10; 6695 idx_create = 1; 6696 idx_fattr = 3; 6697 } else { 6698 numops = 7; 6699 idx_create = 2; 6700 idx_fattr = 4; 6701 } 6702 6703 ASSERT(nfs_zone() == mi->mi_zone); 6704 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6705 return (EINTR); 6706 } 6707 recov_state.rs_flags = 0; 6708 recov_state.rs_num_retry_despite_err = 0; 6709 6710 argoplist_size = numops * sizeof (nfs_argop4); 6711 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6712 6713 recov_retry: 6714 if (type == NF4LNK) 6715 args.ctag = TAG_SYMLINK; 6716 else if (type == NF4DIR) 6717 args.ctag = TAG_MKDIR; 6718 else 6719 args.ctag = TAG_MKNOD; 6720 6721 args.array_len = numops; 6722 args.array = argop; 6723 6724 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6725 nfs_rw_exit(&drp->r_rwlock); 6726 kmem_free(argop, argoplist_size); 6727 return (e.error); 6728 } 6729 need_end_op = TRUE; 6730 6731 6732 /* 0: putfh directory */ 6733 argop[0].argop = OP_CPUTFH; 6734 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6735 6736 /* 1/2: Create object */ 6737 argop[idx_create].argop = OP_CCREATE; 6738 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6739 argop[idx_create].nfs_argop4_u.opccreate.type = 
type; 6740 if (type == NF4LNK) { 6741 /* 6742 * symlink, treat name as data 6743 */ 6744 ASSERT(data != NULL); 6745 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6746 (char *)data; 6747 } 6748 if (type == NF4BLK || type == NF4CHR) { 6749 ASSERT(data != NULL); 6750 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6751 *((specdata4 *)data); 6752 } 6753 6754 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6755 6756 svp = drp->r_server; 6757 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6758 supp_attrs = svp->sv_supp_attrs; 6759 nfs_rw_exit(&svp->sv_lock); 6760 6761 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6762 nfs_rw_exit(&drp->r_rwlock); 6763 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6764 e.error = EINVAL; 6765 kmem_free(argop, argoplist_size); 6766 return (e.error); 6767 } 6768 6769 /* 2/3: getfh fh of created object */ 6770 ASSERT(idx_create + 1 == idx_fattr - 1); 6771 argop[idx_create + 1].argop = OP_GETFH; 6772 6773 /* 3/4: getattr of new object */ 6774 argop[idx_fattr].argop = OP_GETATTR; 6775 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6776 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6777 6778 if (setgid_flag) { 6779 vattr_t _v; 6780 6781 argop[4].argop = OP_SAVEFH; 6782 6783 argop[5].argop = OP_CPUTFH; 6784 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6785 6786 argop[6].argop = OP_GETATTR; 6787 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6788 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6789 6790 argop[7].argop = OP_RESTOREFH; 6791 6792 /* 6793 * nverify 6794 * 6795 * XXX - Revisit the last argument to nfs4_end_op() 6796 * once 5020486 is fixed. 
6797 */ 6798 _v.va_mask = AT_GID; 6799 _v.va_gid = va->va_gid; 6800 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6801 supp_attrs)) { 6802 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6803 nfs_rw_exit(&drp->r_rwlock); 6804 nfs4_fattr4_free(crattr); 6805 kmem_free(argop, argoplist_size); 6806 return (e.error); 6807 } 6808 6809 /* 6810 * setattr 6811 * 6812 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6813 * so no need for stateid or flags. Also we specify NULL 6814 * rp since we're only interested in setting owner_group 6815 * attributes. 6816 */ 6817 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6818 &e.error, 0); 6819 6820 if (e.error) { 6821 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6822 nfs_rw_exit(&drp->r_rwlock); 6823 nfs4_fattr4_free(crattr); 6824 nfs4args_verify_free(&argop[8]); 6825 kmem_free(argop, argoplist_size); 6826 return (e.error); 6827 } 6828 } else { 6829 argop[1].argop = OP_SAVEFH; 6830 6831 argop[5].argop = OP_RESTOREFH; 6832 6833 argop[6].argop = OP_GETATTR; 6834 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6835 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6836 } 6837 6838 dnlc_remove(dvp, nm); 6839 6840 doqueue = 1; 6841 t = gethrtime(); 6842 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6843 6844 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6845 if (e.error) { 6846 PURGE_ATTRCACHE4(dvp); 6847 if (!needrecov) 6848 goto out; 6849 } 6850 6851 if (needrecov) { 6852 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6853 OP_CREATE, NULL) == FALSE) { 6854 nfs4_end_op(mi, dvp, NULL, &recov_state, 6855 needrecov); 6856 need_end_op = FALSE; 6857 nfs4_fattr4_free(crattr); 6858 if (setgid_flag) { 6859 nfs4args_verify_free(&argop[8]); 6860 nfs4args_setattr_free(&argop[9]); 6861 } 6862 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6863 goto recov_retry; 6864 } 6865 } 6866 6867 resp = &res; 6868 6869 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6870 
6871 if (res.status == NFS4ERR_BADOWNER) 6872 nfs4_log_badowner(mi, OP_CREATE); 6873 6874 e.error = geterrno4(res.status); 6875 6876 /* 6877 * This check is left over from when create was implemented 6878 * using a setattr op (instead of createattrs). If the 6879 * putfh/create/getfh failed, the error was returned. If 6880 * setattr/getattr failed, we keep going. 6881 * 6882 * It might be better to get rid of the GETFH also, and just 6883 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6884 * Then if any of the operations failed, we could return the 6885 * error now, and remove much of the error code below. 6886 */ 6887 if (res.array_len <= idx_fattr) { 6888 /* 6889 * Either Putfh, Create or Getfh failed. 6890 */ 6891 PURGE_ATTRCACHE4(dvp); 6892 /* 6893 * nfs4_purge_stale_fh() may generate otw calls through 6894 * nfs4_invalidate_pages. Hence the need to call 6895 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 6896 */ 6897 nfs4_end_op(mi, dvp, NULL, &recov_state, 6898 needrecov); 6899 need_end_op = FALSE; 6900 nfs4_purge_stale_fh(e.error, dvp, cr); 6901 goto out; 6902 } 6903 } 6904 6905 resop = &res.array[idx_create]; /* create res */ 6906 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 6907 6908 resop = &res.array[idx_create + 1]; /* getfh res */ 6909 gf_res = &resop->nfs_resop4_u.opgetfh; 6910 6911 sfhp = sfh4_get(&gf_res->object, mi); 6912 if (e.error) { 6913 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 6914 fn_get(VTOSV(dvp)->sv_name, nm)); 6915 if (vp->v_type == VNON) { 6916 vattr.va_mask = AT_TYPE; 6917 /* 6918 * Need to call nfs4_end_op before nfs4getattr to avoid 6919 * potential nfs4_start_op deadlock. See RFE 4777612. 
6920 */ 6921 nfs4_end_op(mi, dvp, NULL, &recov_state, 6922 needrecov); 6923 need_end_op = FALSE; 6924 e.error = nfs4getattr(vp, &vattr, cr); 6925 if (e.error) { 6926 VN_RELE(vp); 6927 *vpp = NULL; 6928 goto out; 6929 } 6930 vp->v_type = vattr.va_type; 6931 } 6932 e.error = 0; 6933 } else { 6934 *vpp = vp = makenfs4node(sfhp, 6935 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 6936 dvp->v_vfsp, t, cr, 6937 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 6938 } 6939 6940 /* 6941 * If compound succeeded, then update dir attrs 6942 */ 6943 if (res.status == NFS4_OK) { 6944 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 6945 dinfo.di_cred = cr; 6946 dinfo.di_time_call = t; 6947 dinfop = &dinfo; 6948 } else 6949 dinfop = NULL; 6950 6951 /* Update directory cache attribute, readdir and dnlc caches */ 6952 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 6953 6954 out: 6955 if (sfhp != NULL) 6956 sfh4_rele(&sfhp); 6957 nfs_rw_exit(&drp->r_rwlock); 6958 nfs4_fattr4_free(crattr); 6959 if (setgid_flag) { 6960 nfs4args_verify_free(&argop[8]); 6961 nfs4args_setattr_free(&argop[9]); 6962 } 6963 if (resp) 6964 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 6965 if (need_end_op) 6966 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6967 6968 kmem_free(argop, argoplist_size); 6969 return (e.error); 6970 } 6971 6972 /* ARGSUSED */ 6973 static int 6974 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6975 int mode, vnode_t **vpp, cred_t *cr) 6976 { 6977 int error; 6978 vnode_t *vp; 6979 nfs_ftype4 type; 6980 specdata4 spec, *specp = NULL; 6981 6982 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6983 6984 switch (va->va_type) { 6985 case VCHR: 6986 case VBLK: 6987 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 6988 spec.specdata1 = getmajor(va->va_rdev); 6989 spec.specdata2 = getminor(va->va_rdev); 6990 specp = &spec; 6991 break; 6992 6993 case VFIFO: 6994 type = NF4FIFO; 6995 break; 6996 case VSOCK: 6997 type = NF4SOCK; 6998 break; 6999 7000 default: 7001 return (EINVAL); 7002 } 7003 7004 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7005 if (error) { 7006 return (error); 7007 } 7008 7009 /* 7010 * This might not be needed any more; special case to deal 7011 * with problematic v2/v3 servers. Since create was unable 7012 * to set group correctly, not sure what hope setattr has. 7013 */ 7014 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7015 va->va_mask = AT_GID; 7016 (void) nfs4setattr(vp, va, 0, cr, NULL); 7017 } 7018 7019 /* 7020 * If vnode is a device create special vnode 7021 */ 7022 if (ISVDEV(vp->v_type)) { 7023 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7024 VN_RELE(vp); 7025 } else { 7026 *vpp = vp; 7027 } 7028 return (error); 7029 } 7030 7031 /* 7032 * Remove requires that the current fh be the target directory. 7033 * After the operation, the current fh is unchanged. 7034 * The compound op structure is: 7035 * PUTFH(targetdir), REMOVE 7036 * 7037 * Weirdness: if the vnode to be removed is open 7038 * we rename it instead of removing it and nfs_inactive 7039 * will remove the new name. 
7040 */ 7041 static int 7042 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr) 7043 { 7044 COMPOUND4args_clnt args; 7045 COMPOUND4res_clnt res, *resp = NULL; 7046 REMOVE4res *rm_res; 7047 nfs_argop4 argop[3]; 7048 nfs_resop4 *resop; 7049 vnode_t *vp; 7050 char *tmpname; 7051 int doqueue; 7052 mntinfo4_t *mi; 7053 rnode4_t *rp; 7054 rnode4_t *drp; 7055 int needrecov = 0; 7056 nfs4_recov_state_t recov_state; 7057 int isopen; 7058 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7059 dirattr_info_t dinfo; 7060 7061 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7062 return (EPERM); 7063 drp = VTOR4(dvp); 7064 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7065 return (EINTR); 7066 7067 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7068 if (e.error) { 7069 nfs_rw_exit(&drp->r_rwlock); 7070 return (e.error); 7071 } 7072 7073 if (vp->v_type == VDIR) { 7074 VN_RELE(vp); 7075 nfs_rw_exit(&drp->r_rwlock); 7076 return (EISDIR); 7077 } 7078 7079 /* 7080 * First just remove the entry from the name cache, as it 7081 * is most likely the only entry for this vp. 7082 */ 7083 dnlc_remove(dvp, nm); 7084 7085 rp = VTOR4(vp); 7086 7087 /* 7088 * For regular file types, check to see if the file is open by looking 7089 * at the open streams. 7090 * For all other types, check the reference count on the vnode. Since 7091 * they are not opened OTW they never have an open stream. 7092 * 7093 * If the file is open, rename it to .nfsXXXX. 7094 */ 7095 if (vp->v_type != VREG) { 7096 /* 7097 * If the file has a v_count > 1 then there may be more than one 7098 * entry in the name cache due multiple links or an open file, 7099 * but we don't have the real reference count so flush all 7100 * possible entries. 7101 */ 7102 if (vp->v_count > 1) 7103 dnlc_purge_vp(vp); 7104 7105 /* 7106 * Now we have the real reference count. 
7107 */ 7108 isopen = vp->v_count > 1; 7109 } else { 7110 mutex_enter(&rp->r_os_lock); 7111 isopen = list_head(&rp->r_open_streams) != NULL; 7112 mutex_exit(&rp->r_os_lock); 7113 } 7114 7115 mutex_enter(&rp->r_statelock); 7116 if (isopen && 7117 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7118 mutex_exit(&rp->r_statelock); 7119 tmpname = newname(); 7120 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr); 7121 if (e.error) 7122 kmem_free(tmpname, MAXNAMELEN); 7123 else { 7124 mutex_enter(&rp->r_statelock); 7125 if (rp->r_unldvp == NULL) { 7126 VN_HOLD(dvp); 7127 rp->r_unldvp = dvp; 7128 if (rp->r_unlcred != NULL) 7129 crfree(rp->r_unlcred); 7130 crhold(cr); 7131 rp->r_unlcred = cr; 7132 rp->r_unlname = tmpname; 7133 } else { 7134 kmem_free(rp->r_unlname, MAXNAMELEN); 7135 rp->r_unlname = tmpname; 7136 } 7137 mutex_exit(&rp->r_statelock); 7138 } 7139 VN_RELE(vp); 7140 nfs_rw_exit(&drp->r_rwlock); 7141 return (e.error); 7142 } 7143 /* 7144 * Actually remove the file/dir 7145 */ 7146 mutex_exit(&rp->r_statelock); 7147 7148 /* 7149 * We need to flush any dirty pages which happen to 7150 * be hanging around before removing the file. 7151 * This shouldn't happen very often since in NFSv4 7152 * we should be close to open consistent. 
7153 */ 7154 if (nfs4_has_pages(vp) && 7155 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7156 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr); 7157 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7158 mutex_enter(&rp->r_statelock); 7159 if (!rp->r_error) 7160 rp->r_error = e.error; 7161 mutex_exit(&rp->r_statelock); 7162 } 7163 } 7164 7165 mi = VTOMI4(dvp); 7166 7167 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7168 recov_state.rs_flags = 0; 7169 recov_state.rs_num_retry_despite_err = 0; 7170 7171 recov_retry: 7172 /* 7173 * Remove ops: putfh dir; remove 7174 */ 7175 args.ctag = TAG_REMOVE; 7176 args.array_len = 3; 7177 args.array = argop; 7178 7179 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7180 if (e.error) { 7181 nfs_rw_exit(&drp->r_rwlock); 7182 VN_RELE(vp); 7183 return (e.error); 7184 } 7185 7186 /* putfh directory */ 7187 argop[0].argop = OP_CPUTFH; 7188 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7189 7190 /* remove */ 7191 argop[1].argop = OP_CREMOVE; 7192 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7193 7194 /* getattr dir */ 7195 argop[2].argop = OP_GETATTR; 7196 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7197 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7198 7199 doqueue = 1; 7200 dinfo.di_time_call = gethrtime(); 7201 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7202 7203 PURGE_ATTRCACHE4(vp); 7204 7205 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7206 if (e.error) 7207 PURGE_ATTRCACHE4(dvp); 7208 7209 if (needrecov) { 7210 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7211 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) { 7212 if (!e.error) 7213 (void) xdr_free(xdr_COMPOUND4res_clnt, 7214 (caddr_t)&res); 7215 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7216 needrecov); 7217 goto recov_retry; 7218 } 7219 } 7220 7221 /* 7222 * Matching nfs4_end_op() for start_op() above. 
7223 * There is a path in the code below which calls 7224 * nfs4_purge_stale_fh(), which may generate otw calls through 7225 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7226 * here to avoid nfs4_start_op() deadlock. 7227 */ 7228 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7229 7230 if (!e.error) { 7231 resp = &res; 7232 7233 if (res.status) { 7234 e.error = geterrno4(res.status); 7235 PURGE_ATTRCACHE4(dvp); 7236 nfs4_purge_stale_fh(e.error, dvp, cr); 7237 } else { 7238 resop = &res.array[1]; /* remove res */ 7239 rm_res = &resop->nfs_resop4_u.opremove; 7240 7241 dinfo.di_garp = 7242 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7243 dinfo.di_cred = cr; 7244 7245 /* Update directory attr, readdir and dnlc caches */ 7246 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7247 &dinfo); 7248 } 7249 } 7250 nfs_rw_exit(&drp->r_rwlock); 7251 if (resp) 7252 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7253 7254 VN_RELE(vp); 7255 return (e.error); 7256 } 7257 7258 /* 7259 * Link requires that the current fh be the target directory and the 7260 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7261 * Thus the compound op structure is: 7262 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7263 * GETATTR(file) 7264 */ 7265 static int 7266 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 7267 { 7268 COMPOUND4args_clnt args; 7269 COMPOUND4res_clnt res, *resp = NULL; 7270 LINK4res *ln_res; 7271 int argoplist_size = 7 * sizeof (nfs_argop4); 7272 nfs_argop4 *argop; 7273 nfs_resop4 *resop; 7274 vnode_t *realvp, *nvp; 7275 int doqueue; 7276 mntinfo4_t *mi; 7277 rnode4_t *tdrp; 7278 bool_t needrecov = FALSE; 7279 nfs4_recov_state_t recov_state; 7280 hrtime_t t; 7281 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7282 dirattr_info_t dinfo; 7283 7284 ASSERT(*tnm != '\0'); 7285 ASSERT(tdvp->v_type == VDIR); 7286 ASSERT(nfs4_consistent_type(tdvp)); 7287 ASSERT(nfs4_consistent_type(svp)); 7288 7289 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7290 return (EPERM); 7291 if (VOP_REALVP(svp, &realvp) == 0) { 7292 svp = realvp; 7293 ASSERT(nfs4_consistent_type(svp)); 7294 } 7295 7296 tdrp = VTOR4(tdvp); 7297 mi = VTOMI4(svp); 7298 7299 if (!(mi->mi_flags & MI4_LINK)) { 7300 return (EOPNOTSUPP); 7301 } 7302 recov_state.rs_flags = 0; 7303 recov_state.rs_num_retry_despite_err = 0; 7304 7305 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7306 return (EINTR); 7307 7308 recov_retry: 7309 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7310 7311 args.ctag = TAG_LINK; 7312 7313 /* 7314 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7315 * restorefh; getattr(fl) 7316 */ 7317 args.array_len = 7; 7318 args.array = argop; 7319 7320 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7321 if (e.error) { 7322 kmem_free(argop, argoplist_size); 7323 nfs_rw_exit(&tdrp->r_rwlock); 7324 return (e.error); 7325 } 7326 7327 /* 0. putfh file */ 7328 argop[0].argop = OP_CPUTFH; 7329 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7330 7331 /* 1. 
save current fh to free up the space for the dir */ 7332 argop[1].argop = OP_SAVEFH; 7333 7334 /* 2. putfh targetdir */ 7335 argop[2].argop = OP_CPUTFH; 7336 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7337 7338 /* 3. link: current_fh is targetdir, saved_fh is source */ 7339 argop[3].argop = OP_CLINK; 7340 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7341 7342 /* 4. Get attributes of dir */ 7343 argop[4].argop = OP_GETATTR; 7344 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7345 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7346 7347 /* 5. If link was successful, restore current vp to file */ 7348 argop[5].argop = OP_RESTOREFH; 7349 7350 /* 6. Get attributes of linked object */ 7351 argop[6].argop = OP_GETATTR; 7352 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7353 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7354 7355 dnlc_remove(tdvp, tnm); 7356 7357 doqueue = 1; 7358 t = gethrtime(); 7359 7360 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7361 7362 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7363 if (e.error != 0 && !needrecov) { 7364 PURGE_ATTRCACHE4(tdvp); 7365 PURGE_ATTRCACHE4(svp); 7366 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7367 goto out; 7368 } 7369 7370 if (needrecov) { 7371 bool_t abort; 7372 7373 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7374 NULL, NULL, OP_LINK, NULL); 7375 if (abort == FALSE) { 7376 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7377 needrecov); 7378 kmem_free(argop, argoplist_size); 7379 if (!e.error) 7380 (void) xdr_free(xdr_COMPOUND4res_clnt, 7381 (caddr_t)&res); 7382 goto recov_retry; 7383 } else { 7384 if (e.error != 0) { 7385 PURGE_ATTRCACHE4(tdvp); 7386 PURGE_ATTRCACHE4(svp); 7387 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7388 &recov_state, needrecov); 7389 goto out; 7390 } 7391 /* fall through for res.status case */ 7392 } 7393 } 7394 7395 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7396 7397 resp = &res; 7398 if 
(res.status) { 7399 /* If link succeeded, then don't return error */ 7400 e.error = geterrno4(res.status); 7401 if (res.array_len <= 4) { 7402 /* 7403 * Either Putfh, Savefh, Putfh dir, or Link failed 7404 */ 7405 PURGE_ATTRCACHE4(svp); 7406 PURGE_ATTRCACHE4(tdvp); 7407 if (e.error == EOPNOTSUPP) { 7408 mutex_enter(&mi->mi_lock); 7409 mi->mi_flags &= ~MI4_LINK; 7410 mutex_exit(&mi->mi_lock); 7411 } 7412 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7413 /* XXX-LP */ 7414 if (e.error == EISDIR && crgetuid(cr) != 0) 7415 e.error = EPERM; 7416 goto out; 7417 } 7418 } 7419 7420 /* either no error or one of the postop getattr failed */ 7421 7422 /* 7423 * XXX - if LINK succeeded, but no attrs were returned for link 7424 * file, purge its cache. 7425 * 7426 * XXX Perform a simplified version of wcc checking. Instead of 7427 * have another getattr to get pre-op, just purge cache if 7428 * any of the ops prior to and including the getattr failed. 7429 * If the getattr succeeded then update the attrcache accordingly. 7430 */ 7431 7432 /* 7433 * update cache with link file postattrs. 7434 * Note: at this point resop points to link res. 7435 */ 7436 resop = &res.array[3]; /* link res */ 7437 ln_res = &resop->nfs_resop4_u.oplink; 7438 if (res.status == NFS4_OK) { 7439 e.error = nfs4_update_attrcache(res.status, 7440 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7441 t, svp, cr); 7442 } 7443 7444 /* 7445 * Call makenfs4node to create the new shadow vp for tnm. 7446 * We pass NULL attrs because we just cached attrs for 7447 * the src object. All we're trying to accomplish is to 7448 * to create the new shadow vnode. 
7449 */ 7450 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7451 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm)); 7452 7453 /* Update target cache attribute, readdir and dnlc caches */ 7454 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7455 dinfo.di_time_call = t; 7456 dinfo.di_cred = cr; 7457 7458 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7459 ASSERT(nfs4_consistent_type(tdvp)); 7460 ASSERT(nfs4_consistent_type(svp)); 7461 ASSERT(nfs4_consistent_type(nvp)); 7462 VN_RELE(nvp); 7463 7464 out: 7465 kmem_free(argop, argoplist_size); 7466 if (resp) 7467 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7468 7469 nfs_rw_exit(&tdrp->r_rwlock); 7470 7471 return (e.error); 7472 } 7473 7474 static int 7475 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7476 { 7477 vnode_t *realvp; 7478 7479 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7480 return (EPERM); 7481 if (VOP_REALVP(ndvp, &realvp) == 0) 7482 ndvp = realvp; 7483 7484 return (nfs4rename(odvp, onm, ndvp, nnm, cr)); 7485 } 7486 7487 /* 7488 * nfs4rename does the real work of renaming in NFS Version 4. 7489 * 7490 * A file handle is considered volatile for renaming purposes if either 7491 * of the volatile bits are turned on. However, the compound may differ 7492 * based on the likelihood of the filehandle to change during rename. 7493 */ 7494 static int 7495 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7496 { 7497 int error; 7498 mntinfo4_t *mi; 7499 vnode_t *nvp; 7500 vnode_t *ovp = NULL; 7501 char *tmpname = NULL; 7502 rnode4_t *rp; 7503 rnode4_t *odrp; 7504 rnode4_t *ndrp; 7505 int did_link = 0; 7506 int do_link = 1; 7507 nfsstat4 stat = NFS4_OK; 7508 7509 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7510 ASSERT(nfs4_consistent_type(odvp)); 7511 ASSERT(nfs4_consistent_type(ndvp)); 7512 7513 if (onm[0] == '.' && (onm[1] == '\0' || 7514 (onm[1] == '.' 
&& onm[2] == '\0'))) 7515 return (EINVAL); 7516 7517 if (nnm[0] == '.' && (nnm[1] == '\0' || 7518 (nnm[1] == '.' && nnm[2] == '\0'))) 7519 return (EINVAL); 7520 7521 odrp = VTOR4(odvp); 7522 ndrp = VTOR4(ndvp); 7523 if ((intptr_t)odrp < (intptr_t)ndrp) { 7524 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7525 return (EINTR); 7526 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7527 nfs_rw_exit(&odrp->r_rwlock); 7528 return (EINTR); 7529 } 7530 } else { 7531 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7532 return (EINTR); 7533 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7534 nfs_rw_exit(&ndrp->r_rwlock); 7535 return (EINTR); 7536 } 7537 } 7538 7539 /* 7540 * Lookup the target file. If it exists, it needs to be 7541 * checked to see whether it is a mount point and whether 7542 * it is active (open). 7543 */ 7544 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7545 if (!error) { 7546 int isactive; 7547 7548 ASSERT(nfs4_consistent_type(nvp)); 7549 /* 7550 * If this file has been mounted on, then just 7551 * return busy because renaming to it would remove 7552 * the mounted file system from the name space. 7553 */ 7554 if (vn_ismntpt(nvp)) { 7555 VN_RELE(nvp); 7556 nfs_rw_exit(&odrp->r_rwlock); 7557 nfs_rw_exit(&ndrp->r_rwlock); 7558 return (EBUSY); 7559 } 7560 7561 /* 7562 * First just remove the entry from the name cache, as it 7563 * is most likely the only entry for this vp. 7564 */ 7565 dnlc_remove(ndvp, nnm); 7566 7567 rp = VTOR4(nvp); 7568 7569 if (nvp->v_type != VREG) { 7570 /* 7571 * Purge the name cache of all references to this vnode 7572 * so that we can check the reference count to infer 7573 * whether it is active or not. 
7574 */ 7575 if (nvp->v_count > 1) 7576 dnlc_purge_vp(nvp); 7577 7578 isactive = nvp->v_count > 1; 7579 } else { 7580 mutex_enter(&rp->r_os_lock); 7581 isactive = list_head(&rp->r_open_streams) != NULL; 7582 mutex_exit(&rp->r_os_lock); 7583 } 7584 7585 /* 7586 * If the vnode is active and is not a directory, 7587 * arrange to rename it to a 7588 * temporary file so that it will continue to be 7589 * accessible. This implements the "unlink-open-file" 7590 * semantics for the target of a rename operation. 7591 * Before doing this though, make sure that the 7592 * source and target files are not already the same. 7593 */ 7594 if (isactive && nvp->v_type != VDIR) { 7595 /* 7596 * Lookup the source name. 7597 */ 7598 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7599 7600 /* 7601 * The source name *should* already exist. 7602 */ 7603 if (error) { 7604 VN_RELE(nvp); 7605 nfs_rw_exit(&odrp->r_rwlock); 7606 nfs_rw_exit(&ndrp->r_rwlock); 7607 return (error); 7608 } 7609 7610 ASSERT(nfs4_consistent_type(ovp)); 7611 7612 /* 7613 * Compare the two vnodes. If they are the same, 7614 * just release all held vnodes and return success. 7615 */ 7616 if (VN_CMP(ovp, nvp)) { 7617 VN_RELE(ovp); 7618 VN_RELE(nvp); 7619 nfs_rw_exit(&odrp->r_rwlock); 7620 nfs_rw_exit(&ndrp->r_rwlock); 7621 return (0); 7622 } 7623 7624 /* 7625 * Can't mix and match directories and non- 7626 * directories in rename operations. We already 7627 * know that the target is not a directory. If 7628 * the source is a directory, return an error. 7629 */ 7630 if (ovp->v_type == VDIR) { 7631 VN_RELE(ovp); 7632 VN_RELE(nvp); 7633 nfs_rw_exit(&odrp->r_rwlock); 7634 nfs_rw_exit(&ndrp->r_rwlock); 7635 return (ENOTDIR); 7636 } 7637 link_call: 7638 /* 7639 * The target file exists, is not the same as 7640 * the source file, and is active. 
We first 7641 * try to Link it to a temporary filename to 7642 * avoid having the server removing the file 7643 * completely (which could cause data loss to 7644 * the user's POV in the event the Rename fails 7645 * -- see bug 1165874). 7646 */ 7647 /* 7648 * The do_link and did_link booleans are 7649 * introduced in the event we get NFS4ERR_FILE_OPEN 7650 * returned for the Rename. Some servers can 7651 * not Rename over an Open file, so they return 7652 * this error. The client needs to Remove the 7653 * newly created Link and do two Renames, just 7654 * as if the server didn't support LINK. 7655 */ 7656 tmpname = newname(); 7657 error = 0; 7658 7659 if (do_link) { 7660 error = nfs4_link(ndvp, nvp, tmpname, cr); 7661 } 7662 if (error == EOPNOTSUPP || !do_link) { 7663 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7664 cr); 7665 did_link = 0; 7666 } else { 7667 did_link = 1; 7668 } 7669 if (error) { 7670 kmem_free(tmpname, MAXNAMELEN); 7671 VN_RELE(ovp); 7672 VN_RELE(nvp); 7673 nfs_rw_exit(&odrp->r_rwlock); 7674 nfs_rw_exit(&ndrp->r_rwlock); 7675 return (error); 7676 } 7677 7678 mutex_enter(&rp->r_statelock); 7679 if (rp->r_unldvp == NULL) { 7680 VN_HOLD(ndvp); 7681 rp->r_unldvp = ndvp; 7682 if (rp->r_unlcred != NULL) 7683 crfree(rp->r_unlcred); 7684 crhold(cr); 7685 rp->r_unlcred = cr; 7686 rp->r_unlname = tmpname; 7687 } else { 7688 if (rp->r_unlname) 7689 kmem_free(rp->r_unlname, MAXNAMELEN); 7690 rp->r_unlname = tmpname; 7691 } 7692 mutex_exit(&rp->r_statelock); 7693 } 7694 7695 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7696 7697 ASSERT(nfs4_consistent_type(nvp)); 7698 VN_RELE(nvp); 7699 } 7700 7701 if (ovp == NULL) { 7702 /* 7703 * When renaming directories to be a subdirectory of a 7704 * different parent, the dnlc entry for ".." will no 7705 * longer be valid, so it must be removed. 7706 * 7707 * We do a lookup here to determine whether we are renaming 7708 * a directory and we need to check if we are renaming 7709 * an unlinked file. 
This might have already been done 7710 * in previous code, so we check ovp == NULL to avoid 7711 * doing it twice. 7712 */ 7713 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7714 /* 7715 * The source name *should* already exist. 7716 */ 7717 if (error) { 7718 nfs_rw_exit(&odrp->r_rwlock); 7719 nfs_rw_exit(&ndrp->r_rwlock); 7720 return (error); 7721 } 7722 ASSERT(ovp != NULL); 7723 ASSERT(nfs4_consistent_type(ovp)); 7724 } 7725 7726 /* 7727 * Is the object being renamed a dir, and if so, is 7728 * it being renamed to a child of itself? The underlying 7729 * fs should ultimately return EINVAL for this case; 7730 * however, buggy beta non-Solaris NFSv4 servers at 7731 * interop testing events have allowed this behavior, 7732 * and it caused our client to panic due to a recursive 7733 * mutex_enter in fn_move. 7734 * 7735 * The tedious locking in fn_move could be changed to 7736 * deal with this case, and the client could avoid the 7737 * panic; however, the client would just confuse itself 7738 * later and misbehave. A better way to handle the broken 7739 * server is to detect this condition and return EINVAL 7740 * without ever sending the the bogus rename to the server. 7741 * We know the rename is invalid -- just fail it now. 7742 */ 7743 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7744 VN_RELE(ovp); 7745 nfs_rw_exit(&odrp->r_rwlock); 7746 nfs_rw_exit(&ndrp->r_rwlock); 7747 return (EINVAL); 7748 } 7749 7750 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7751 7752 /* 7753 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7754 * possible for the filehandle to change due to the rename. 7755 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7756 * the fh will not change because of the rename, but we still need 7757 * to update its rnode entry with the new name for 7758 * an eventual fh change due to migration. 
The FH4_NOEXPIRE_ON_OPEN 7759 * has no effect on these for now, but for future improvements, 7760 * we might want to use it too to simplify handling of files 7761 * that are open with that flag on. (XXX) 7762 */ 7763 mi = VTOMI4(odvp); 7764 if (NFS4_VOLATILE_FH(mi)) { 7765 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7766 &stat); 7767 } else { 7768 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7769 &stat); 7770 } 7771 ASSERT(nfs4_consistent_type(odvp)); 7772 ASSERT(nfs4_consistent_type(ndvp)); 7773 ASSERT(nfs4_consistent_type(ovp)); 7774 7775 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7776 do_link = 0; 7777 /* 7778 * Before the 'link_call' code, we did a nfs4_lookup 7779 * that puts a VN_HOLD on nvp. After the nfs4_link 7780 * call we call VN_RELE to match that hold. We need 7781 * to place an additional VN_HOLD here since we will 7782 * be hitting that VN_RELE again. 7783 */ 7784 VN_HOLD(nvp); 7785 7786 (void) nfs4_remove(ndvp, tmpname, cr); 7787 7788 /* Undo the unlinked file naming stuff we just did */ 7789 mutex_enter(&rp->r_statelock); 7790 if (rp->r_unldvp) { 7791 VN_RELE(ndvp); 7792 rp->r_unldvp = NULL; 7793 if (rp->r_unlcred != NULL) 7794 crfree(rp->r_unlcred); 7795 rp->r_unlcred = NULL; 7796 /* rp->r_unlanme points to tmpname */ 7797 if (rp->r_unlname) 7798 kmem_free(rp->r_unlname, MAXNAMELEN); 7799 rp->r_unlname = NULL; 7800 } 7801 mutex_exit(&rp->r_statelock); 7802 7803 goto link_call; 7804 } 7805 7806 if (error) { 7807 VN_RELE(ovp); 7808 nfs_rw_exit(&odrp->r_rwlock); 7809 nfs_rw_exit(&ndrp->r_rwlock); 7810 return (error); 7811 } 7812 7813 /* 7814 * when renaming directories to be a subdirectory of a 7815 * different parent, the dnlc entry for ".." 
will no 7816 * longer be valid, so it must be removed 7817 */ 7818 rp = VTOR4(ovp); 7819 if (ndvp != odvp) { 7820 if (ovp->v_type == VDIR) { 7821 dnlc_remove(ovp, ".."); 7822 if (rp->r_dir != NULL) 7823 nfs4_purge_rddir_cache(ovp); 7824 } 7825 } 7826 7827 /* 7828 * If we are renaming the unlinked file, update the 7829 * r_unldvp and r_unlname as needed. 7830 */ 7831 mutex_enter(&rp->r_statelock); 7832 if (rp->r_unldvp != NULL) { 7833 if (strcmp(rp->r_unlname, onm) == 0) { 7834 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 7835 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 7836 if (ndvp != rp->r_unldvp) { 7837 VN_RELE(rp->r_unldvp); 7838 rp->r_unldvp = ndvp; 7839 VN_HOLD(ndvp); 7840 } 7841 } 7842 } 7843 mutex_exit(&rp->r_statelock); 7844 7845 VN_RELE(ovp); 7846 7847 nfs_rw_exit(&odrp->r_rwlock); 7848 nfs_rw_exit(&ndrp->r_rwlock); 7849 7850 return (error); 7851 } 7852 7853 /* 7854 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 7855 * when it is known that the filehandle is persistent through rename. 7856 * 7857 * Rename requires that the current fh be the target directory and the 7858 * saved fh be the source directory. After the operation, the current fh 7859 * is unchanged. 7860 * The compound op structure for persistent fh rename is: 7861 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 7862 * Rather than bother with the directory postop args, we'll simply 7863 * update that a change occured in the cache, so no post-op getattrs. 
7864 */ 7865 static int 7866 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 7867 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 7868 { 7869 COMPOUND4args_clnt args; 7870 COMPOUND4res_clnt res, *resp = NULL; 7871 nfs_argop4 *argop; 7872 nfs_resop4 *resop; 7873 int doqueue, argoplist_size; 7874 mntinfo4_t *mi; 7875 rnode4_t *odrp = VTOR4(odvp); 7876 rnode4_t *ndrp = VTOR4(ndvp); 7877 RENAME4res *rn_res; 7878 bool_t needrecov; 7879 nfs4_recov_state_t recov_state; 7880 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7881 dirattr_info_t dinfo, *dinfop; 7882 7883 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7884 7885 recov_state.rs_flags = 0; 7886 recov_state.rs_num_retry_despite_err = 0; 7887 7888 /* 7889 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 7890 * 7891 * If source/target are different dirs, then append putfh(src); getattr 7892 */ 7893 args.array_len = (odvp == ndvp) ? 5 : 7; 7894 argoplist_size = args.array_len * sizeof (nfs_argop4); 7895 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 7896 7897 recov_retry: 7898 *statp = NFS4_OK; 7899 7900 /* No need to Lookup the file, persistent fh */ 7901 args.ctag = TAG_RENAME; 7902 7903 mi = VTOMI4(odvp); 7904 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 7905 if (e.error) { 7906 kmem_free(argop, argoplist_size); 7907 return (e.error); 7908 } 7909 7910 /* 0: putfh source directory */ 7911 argop[0].argop = OP_CPUTFH; 7912 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 7913 7914 /* 1: Save source fh to free up current for target */ 7915 argop[1].argop = OP_SAVEFH; 7916 7917 /* 2: putfh targetdir */ 7918 argop[2].argop = OP_CPUTFH; 7919 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7920 7921 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 7922 argop[3].argop = OP_CRENAME; 7923 argop[3].nfs_argop4_u.opcrename.coldname = onm; 7924 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 7925 7926 /* 4: getattr (targetdir) */ 7927 argop[4].argop = 
OP_GETATTR; 7928 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7929 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7930 7931 if (ndvp != odvp) { 7932 7933 /* 5: putfh (sourcedir) */ 7934 argop[5].argop = OP_CPUTFH; 7935 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7936 7937 /* 6: getattr (sourcedir) */ 7938 argop[6].argop = OP_GETATTR; 7939 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7940 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7941 } 7942 7943 dnlc_remove(odvp, onm); 7944 dnlc_remove(ndvp, nnm); 7945 7946 doqueue = 1; 7947 dinfo.di_time_call = gethrtime(); 7948 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7949 7950 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7951 if (e.error) { 7952 PURGE_ATTRCACHE4(odvp); 7953 PURGE_ATTRCACHE4(ndvp); 7954 } else { 7955 *statp = res.status; 7956 } 7957 7958 if (needrecov) { 7959 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 7960 OP_RENAME, NULL) == FALSE) { 7961 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 7962 if (!e.error) 7963 (void) xdr_free(xdr_COMPOUND4res_clnt, 7964 (caddr_t)&res); 7965 goto recov_retry; 7966 } 7967 } 7968 7969 if (!e.error) { 7970 resp = &res; 7971 /* 7972 * as long as OP_RENAME 7973 */ 7974 if (res.status != NFS4_OK && res.array_len <= 4) { 7975 e.error = geterrno4(res.status); 7976 PURGE_ATTRCACHE4(odvp); 7977 PURGE_ATTRCACHE4(ndvp); 7978 /* 7979 * System V defines rename to return EEXIST, not 7980 * ENOTEMPTY if the target directory is not empty. 7981 * Over the wire, the error is NFSERR_ENOTEMPTY 7982 * which geterrno4 maps to ENOTEMPTY. 7983 */ 7984 if (e.error == ENOTEMPTY) 7985 e.error = EEXIST; 7986 } else { 7987 7988 resop = &res.array[3]; /* rename res */ 7989 rn_res = &resop->nfs_resop4_u.oprename; 7990 7991 if (res.status == NFS4_OK) { 7992 /* 7993 * Update target attribute, readdir and dnlc 7994 * caches. 
7995 */ 7996 dinfo.di_garp = 7997 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7998 dinfo.di_cred = cr; 7999 dinfop = &dinfo; 8000 } else 8001 dinfop = NULL; 8002 8003 nfs4_update_dircaches(&rn_res->target_cinfo, 8004 ndvp, NULL, NULL, dinfop); 8005 8006 /* 8007 * Update source attribute, readdir and dnlc caches 8008 * 8009 */ 8010 if (ndvp != odvp) { 8011 if (dinfop) 8012 dinfo.di_garp = 8013 &(res.array[6].nfs_resop4_u. 8014 opgetattr.ga_res); 8015 8016 nfs4_update_dircaches(&rn_res->source_cinfo, 8017 odvp, NULL, NULL, dinfop); 8018 } 8019 8020 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8021 nnm); 8022 } 8023 } 8024 8025 if (resp) 8026 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8027 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8028 kmem_free(argop, argoplist_size); 8029 8030 return (e.error); 8031 } 8032 8033 /* 8034 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8035 * it is possible for the filehandle to change due to the rename. 8036 * 8037 * The compound req in this case includes a post-rename lookup and getattr 8038 * to ensure that we have the correct fh and attributes for the object. 8039 * 8040 * Rename requires that the current fh be the target directory and the 8041 * saved fh be the source directory. After the operation, the current fh 8042 * is unchanged. 8043 * 8044 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8045 * update the filehandle for the renamed object. We also get the old 8046 * filehandle for historical reasons; this should be taken out sometime. 8047 * This results in a rather cumbersome compound... 
8048 * 8049 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8050 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8051 * 8052 */ 8053 static int 8054 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8055 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8056 { 8057 COMPOUND4args_clnt args; 8058 COMPOUND4res_clnt res, *resp = NULL; 8059 int argoplist_size; 8060 nfs_argop4 *argop; 8061 nfs_resop4 *resop; 8062 int doqueue; 8063 mntinfo4_t *mi; 8064 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8065 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8066 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8067 RENAME4res *rn_res; 8068 GETFH4res *ngf_res; 8069 bool_t needrecov; 8070 nfs4_recov_state_t recov_state; 8071 hrtime_t t; 8072 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8073 dirattr_info_t dinfo, *dinfop = &dinfo; 8074 8075 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8076 8077 recov_state.rs_flags = 0; 8078 recov_state.rs_num_retry_despite_err = 0; 8079 8080 recov_retry: 8081 *statp = NFS4_OK; 8082 8083 /* 8084 * There is a window between the RPC and updating the path and 8085 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8086 * code, so that it doesn't try to use the old path during that 8087 * window. 8088 */ 8089 mutex_enter(&orp->r_statelock); 8090 while (orp->r_flags & R4RECEXPFH) { 8091 klwp_t *lwp = ttolwp(curthread); 8092 8093 if (lwp != NULL) 8094 lwp->lwp_nostop++; 8095 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8096 mutex_exit(&orp->r_statelock); 8097 if (lwp != NULL) 8098 lwp->lwp_nostop--; 8099 return (EINTR); 8100 } 8101 if (lwp != NULL) 8102 lwp->lwp_nostop--; 8103 } 8104 orp->r_flags |= R4RECEXPFH; 8105 mutex_exit(&orp->r_statelock); 8106 8107 mi = VTOMI4(odvp); 8108 8109 args.ctag = TAG_RENAME_VFH; 8110 args.array_len = (odvp == ndvp) ? 
10 : 12; 8111 argoplist_size = args.array_len * sizeof (nfs_argop4); 8112 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8113 8114 /* 8115 * Rename ops: 8116 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8117 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8118 * LOOKUP(trgt), GETFH(new), GETATTR, 8119 * 8120 * if (odvp != ndvp) 8121 * add putfh(sourcedir), getattr(sourcedir) } 8122 */ 8123 args.array = argop; 8124 8125 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8126 &recov_state, NULL); 8127 if (e.error) { 8128 kmem_free(argop, argoplist_size); 8129 mutex_enter(&orp->r_statelock); 8130 orp->r_flags &= ~R4RECEXPFH; 8131 cv_broadcast(&orp->r_cv); 8132 mutex_exit(&orp->r_statelock); 8133 return (e.error); 8134 } 8135 8136 /* 0: putfh source directory */ 8137 argop[0].argop = OP_CPUTFH; 8138 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8139 8140 /* 1: Save source fh to free up current for target */ 8141 argop[1].argop = OP_SAVEFH; 8142 8143 /* 2: Lookup pre-rename fh of renamed object */ 8144 argop[2].argop = OP_CLOOKUP; 8145 argop[2].nfs_argop4_u.opclookup.cname = onm; 8146 8147 /* 3: getfh fh of renamed object (before rename) */ 8148 argop[3].argop = OP_GETFH; 8149 8150 /* 4: putfh targetdir */ 8151 argop[4].argop = OP_CPUTFH; 8152 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8153 8154 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8155 argop[5].argop = OP_CRENAME; 8156 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8157 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8158 8159 /* 6: getattr of target dir (post op attrs) */ 8160 argop[6].argop = OP_GETATTR; 8161 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8162 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8163 8164 /* 7: Lookup post-rename fh of renamed object */ 8165 argop[7].argop = OP_CLOOKUP; 8166 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8167 8168 /* 8: getfh fh of renamed object (after rename) */ 8169 argop[8].argop = OP_GETFH; 8170 8171 /* 9: getattr of 
renamed object */ 8172 argop[9].argop = OP_GETATTR; 8173 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8174 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8175 8176 /* 8177 * If source/target dirs are different, then get new post-op 8178 * attrs for source dir also. 8179 */ 8180 if (ndvp != odvp) { 8181 /* 10: putfh (sourcedir) */ 8182 argop[10].argop = OP_CPUTFH; 8183 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8184 8185 /* 11: getattr (sourcedir) */ 8186 argop[11].argop = OP_GETATTR; 8187 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8188 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8189 } 8190 8191 dnlc_remove(odvp, onm); 8192 dnlc_remove(ndvp, nnm); 8193 8194 doqueue = 1; 8195 t = gethrtime(); 8196 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8197 8198 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8199 if (e.error) { 8200 PURGE_ATTRCACHE4(odvp); 8201 PURGE_ATTRCACHE4(ndvp); 8202 if (!needrecov) { 8203 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8204 &recov_state, needrecov); 8205 goto out; 8206 } 8207 } else { 8208 *statp = res.status; 8209 } 8210 8211 if (needrecov) { 8212 bool_t abort; 8213 8214 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8215 OP_RENAME, NULL); 8216 if (abort == FALSE) { 8217 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8218 &recov_state, needrecov); 8219 kmem_free(argop, argoplist_size); 8220 if (!e.error) 8221 (void) xdr_free(xdr_COMPOUND4res_clnt, 8222 (caddr_t)&res); 8223 mutex_enter(&orp->r_statelock); 8224 orp->r_flags &= ~R4RECEXPFH; 8225 cv_broadcast(&orp->r_cv); 8226 mutex_exit(&orp->r_statelock); 8227 goto recov_retry; 8228 } else { 8229 if (e.error != 0) { 8230 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8231 &recov_state, needrecov); 8232 goto out; 8233 } 8234 /* fall through for res.status case */ 8235 } 8236 } 8237 8238 resp = &res; 8239 /* 8240 * If OP_RENAME (or any prev op) failed, then return an error. 
8241 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8242 */ 8243 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8244 /* 8245 * Error in an op other than last Getattr 8246 */ 8247 e.error = geterrno4(res.status); 8248 PURGE_ATTRCACHE4(odvp); 8249 PURGE_ATTRCACHE4(ndvp); 8250 /* 8251 * System V defines rename to return EEXIST, not 8252 * ENOTEMPTY if the target directory is not empty. 8253 * Over the wire, the error is NFSERR_ENOTEMPTY 8254 * which geterrno4 maps to ENOTEMPTY. 8255 */ 8256 if (e.error == ENOTEMPTY) 8257 e.error = EEXIST; 8258 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8259 needrecov); 8260 goto out; 8261 } 8262 8263 /* rename results */ 8264 rn_res = &res.array[5].nfs_resop4_u.oprename; 8265 8266 if (res.status == NFS4_OK) { 8267 /* Update target attribute, readdir and dnlc caches */ 8268 dinfo.di_garp = 8269 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8270 dinfo.di_cred = cr; 8271 dinfo.di_time_call = t; 8272 } else 8273 dinfop = NULL; 8274 8275 /* Update source cache attribute, readdir and dnlc caches */ 8276 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8277 8278 /* Update source cache attribute, readdir and dnlc caches */ 8279 if (ndvp != odvp) { 8280 8281 /* 8282 * If dinfop is non-NULL, then compound succeded, so 8283 * set di_garp to attrs for source dir. dinfop is only 8284 * set to NULL when compound fails. 8285 */ 8286 if (dinfop) 8287 dinfo.di_garp = 8288 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8289 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8290 dinfop); 8291 } 8292 8293 /* 8294 * Update the rnode with the new component name and args, 8295 * and if the file handle changed, also update it with the new fh. 8296 * This is only necessary if the target object has an rnode 8297 * entry and there is no need to create one for it. 
8298 */ 8299 resop = &res.array[8]; /* getfh new res */ 8300 ngf_res = &resop->nfs_resop4_u.opgetfh; 8301 8302 /* 8303 * Update the path and filehandle for the renamed object. 8304 */ 8305 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8306 8307 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8308 8309 if (res.status == NFS4_OK) { 8310 resop++; /* getattr res */ 8311 e.error = nfs4_update_attrcache(res.status, 8312 &resop->nfs_resop4_u.opgetattr.ga_res, 8313 t, ovp, cr); 8314 } 8315 8316 out: 8317 kmem_free(argop, argoplist_size); 8318 if (resp) 8319 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8320 mutex_enter(&orp->r_statelock); 8321 orp->r_flags &= ~R4RECEXPFH; 8322 cv_broadcast(&orp->r_cv); 8323 mutex_exit(&orp->r_statelock); 8324 8325 return (e.error); 8326 } 8327 8328 static int 8329 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 8330 { 8331 int error; 8332 vnode_t *vp; 8333 8334 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8335 return (EPERM); 8336 /* 8337 * As ".." has special meaning and rather than send a mkdir 8338 * over the wire to just let the server freak out, we just 8339 * short circuit it here and return EEXIST 8340 */ 8341 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8342 return (EEXIST); 8343 8344 /* 8345 * Decision to get the right gid and setgid bit of the 8346 * new directory is now made in call_nfs4_create_req. 8347 */ 8348 va->va_mask |= AT_MODE; 8349 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8350 if (error) 8351 return (error); 8352 8353 *vpp = vp; 8354 return (0); 8355 } 8356 8357 8358 /* 8359 * rmdir is using the same remove v4 op as does remove. 8360 * Remove requires that the current fh be the target directory. 8361 * After the operation, the current fh is unchanged. 
8362 * The compound op structure is: 8363 * PUTFH(targetdir), REMOVE 8364 */ 8365 static int 8366 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 8367 { 8368 int need_end_op = FALSE; 8369 COMPOUND4args_clnt args; 8370 COMPOUND4res_clnt res, *resp = NULL; 8371 REMOVE4res *rm_res; 8372 nfs_argop4 argop[3]; 8373 nfs_resop4 *resop; 8374 vnode_t *vp; 8375 int doqueue; 8376 mntinfo4_t *mi; 8377 rnode4_t *drp; 8378 bool_t needrecov = FALSE; 8379 nfs4_recov_state_t recov_state; 8380 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8381 dirattr_info_t dinfo, *dinfop; 8382 8383 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8384 return (EPERM); 8385 /* 8386 * As ".." has special meaning and rather than send a rmdir 8387 * over the wire to just let the server freak out, we just 8388 * short circuit it here and return EEXIST 8389 */ 8390 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8391 return (EEXIST); 8392 8393 drp = VTOR4(dvp); 8394 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8395 return (EINTR); 8396 8397 /* 8398 * Attempt to prevent a rmdir(".") from succeeding. 8399 */ 8400 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8401 if (e.error) { 8402 nfs_rw_exit(&drp->r_rwlock); 8403 return (e.error); 8404 } 8405 if (vp == cdir) { 8406 VN_RELE(vp); 8407 nfs_rw_exit(&drp->r_rwlock); 8408 return (EINVAL); 8409 } 8410 8411 /* 8412 * Since nfsv4 remove op works on both files and directories, 8413 * check that the removed object is indeed a directory. 8414 */ 8415 if (vp->v_type != VDIR) { 8416 VN_RELE(vp); 8417 nfs_rw_exit(&drp->r_rwlock); 8418 return (ENOTDIR); 8419 } 8420 8421 /* 8422 * First just remove the entry from the name cache, as it 8423 * is most likely an entry for this vp. 8424 */ 8425 dnlc_remove(dvp, nm); 8426 8427 /* 8428 * If there vnode reference count is greater than one, then 8429 * there may be additional references in the DNLC which will 8430 * need to be purged. 
First, trying removing the entry for 8431 * the parent directory and see if that removes the additional 8432 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8433 * to completely remove any references to the directory which 8434 * might still exist in the DNLC. 8435 */ 8436 if (vp->v_count > 1) { 8437 dnlc_remove(vp, ".."); 8438 if (vp->v_count > 1) 8439 dnlc_purge_vp(vp); 8440 } 8441 8442 mi = VTOMI4(dvp); 8443 recov_state.rs_flags = 0; 8444 recov_state.rs_num_retry_despite_err = 0; 8445 8446 recov_retry: 8447 args.ctag = TAG_RMDIR; 8448 8449 /* 8450 * Rmdir ops: putfh dir; remove 8451 */ 8452 args.array_len = 3; 8453 args.array = argop; 8454 8455 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8456 if (e.error) { 8457 nfs_rw_exit(&drp->r_rwlock); 8458 return (e.error); 8459 } 8460 need_end_op = TRUE; 8461 8462 /* putfh directory */ 8463 argop[0].argop = OP_CPUTFH; 8464 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8465 8466 /* remove */ 8467 argop[1].argop = OP_CREMOVE; 8468 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8469 8470 /* getattr (postop attrs for dir that contained removed dir) */ 8471 argop[2].argop = OP_GETATTR; 8472 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8473 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8474 8475 dinfo.di_time_call = gethrtime(); 8476 doqueue = 1; 8477 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8478 8479 PURGE_ATTRCACHE4(vp); 8480 8481 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8482 if (e.error) { 8483 PURGE_ATTRCACHE4(dvp); 8484 } 8485 8486 if (needrecov) { 8487 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8488 NULL, OP_REMOVE, NULL) == FALSE) { 8489 if (!e.error) 8490 (void) xdr_free(xdr_COMPOUND4res_clnt, 8491 (caddr_t)&res); 8492 8493 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8494 needrecov); 8495 need_end_op = FALSE; 8496 goto recov_retry; 8497 } 8498 } 8499 8500 if (!e.error) { 8501 resp = &res; 8502 8503 /* 8504 * Only return error if 
first 2 ops (OP_REMOVE or earlier) 8505 * failed. 8506 */ 8507 if (res.status != NFS4_OK && res.array_len <= 2) { 8508 e.error = geterrno4(res.status); 8509 PURGE_ATTRCACHE4(dvp); 8510 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8511 &recov_state, needrecov); 8512 need_end_op = FALSE; 8513 nfs4_purge_stale_fh(e.error, dvp, cr); 8514 /* 8515 * System V defines rmdir to return EEXIST, not 8516 * ENOTEMPTY if the directory is not empty. Over 8517 * the wire, the error is NFSERR_ENOTEMPTY which 8518 * geterrno4 maps to ENOTEMPTY. 8519 */ 8520 if (e.error == ENOTEMPTY) 8521 e.error = EEXIST; 8522 } else { 8523 resop = &res.array[1]; /* remove res */ 8524 rm_res = &resop->nfs_resop4_u.opremove; 8525 8526 if (res.status == NFS4_OK) { 8527 resop = &res.array[2]; /* dir attrs */ 8528 dinfo.di_garp = 8529 &resop->nfs_resop4_u.opgetattr.ga_res; 8530 dinfo.di_cred = cr; 8531 dinfop = &dinfo; 8532 } else 8533 dinfop = NULL; 8534 8535 /* Update dir attribute, readdir and dnlc caches */ 8536 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8537 dinfop); 8538 8539 /* destroy rddir cache for dir that was removed */ 8540 if (VTOR4(vp)->r_dir != NULL) 8541 nfs4_purge_rddir_cache(vp); 8542 } 8543 } 8544 8545 if (need_end_op) 8546 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8547 8548 nfs_rw_exit(&drp->r_rwlock); 8549 8550 if (resp) 8551 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8552 8553 VN_RELE(vp); 8554 8555 return (e.error); 8556 } 8557 8558 static int 8559 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 8560 { 8561 int error; 8562 vnode_t *vp; 8563 rnode4_t *rp; 8564 char *contents; 8565 mntinfo4_t *mi = VTOMI4(dvp); 8566 8567 if (nfs_zone() != mi->mi_zone) 8568 return (EPERM); 8569 if (!(mi->mi_flags & MI4_SYMLINK)) 8570 return (EOPNOTSUPP); 8571 8572 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8573 if (error) { 8574 return (error); 8575 } 8576 8577 ASSERT(nfs4_consistent_type(vp)); 8578 rp = 
VTOR4(vp); 8579 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8580 8581 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8582 8583 if (contents != NULL) { 8584 mutex_enter(&rp->r_statelock); 8585 if (rp->r_symlink.contents == NULL) { 8586 rp->r_symlink.len = strlen(tnm); 8587 bcopy(tnm, contents, rp->r_symlink.len); 8588 rp->r_symlink.contents = contents; 8589 rp->r_symlink.size = MAXPATHLEN; 8590 mutex_exit(&rp->r_statelock); 8591 } else { 8592 mutex_exit(&rp->r_statelock); 8593 kmem_free((void *)contents, MAXPATHLEN); 8594 } 8595 } 8596 } 8597 VN_RELE(vp); 8598 8599 return (error); 8600 } 8601 8602 8603 /* 8604 * Read directory entries. 8605 * There are some weird things to look out for here. The uio_loffset 8606 * field is either 0 or it is the offset returned from a previous 8607 * readdir. It is an opaque value used by the server to find the 8608 * correct directory block to read. The count field is the number 8609 * of blocks to read on the server. This is advisory only, the server 8610 * may return only one block's worth of entries. Entries may be compressed 8611 * on the server. 8612 */ 8613 static int 8614 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 8615 { 8616 int error; 8617 uint_t count; 8618 rnode4_t *rp; 8619 rddir4_cache *rdc; 8620 rddir4_cache *rrdc; 8621 8622 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8623 return (EIO); 8624 rp = VTOR4(vp); 8625 8626 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8627 8628 /* 8629 * Make sure that the directory cache is valid. 8630 */ 8631 if (rp->r_dir != NULL) { 8632 if (nfs_disable_rddir_cache != 0) { 8633 /* 8634 * Setting nfs_disable_rddir_cache in /etc/system 8635 * allows interoperability with servers that do not 8636 * properly update the attributes of directories. 8637 * Any cached information gets purged before an 8638 * access is made to it. 
8639 */ 8640 nfs4_purge_rddir_cache(vp); 8641 } 8642 8643 error = nfs4_validate_caches(vp, cr); 8644 if (error) 8645 return (error); 8646 } 8647 8648 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8649 8650 /* 8651 * Short circuit last readdir which always returns 0 bytes. 8652 * This can be done after the directory has been read through 8653 * completely at least once. This will set r_direof which 8654 * can be used to find the value of the last cookie. 8655 */ 8656 mutex_enter(&rp->r_statelock); 8657 if (rp->r_direof != NULL && 8658 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8659 mutex_exit(&rp->r_statelock); 8660 #ifdef DEBUG 8661 nfs4_readdir_cache_shorts++; 8662 #endif 8663 if (eofp) 8664 *eofp = 1; 8665 return (0); 8666 } 8667 8668 /* 8669 * Look for a cache entry. Cache entries are identified 8670 * by the NFS cookie value and the byte count requested. 8671 */ 8672 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8673 8674 /* 8675 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8676 */ 8677 if (rdc == NULL) { 8678 mutex_exit(&rp->r_statelock); 8679 return (EINTR); 8680 } 8681 8682 /* 8683 * Check to see if we need to fill this entry in. 8684 */ 8685 if (rdc->flags & RDDIRREQ) { 8686 rdc->flags &= ~RDDIRREQ; 8687 rdc->flags |= RDDIR; 8688 mutex_exit(&rp->r_statelock); 8689 8690 /* 8691 * Do the readdir. 8692 */ 8693 nfs4readdir(vp, rdc, cr); 8694 8695 /* 8696 * Reaquire the lock, so that we can continue 8697 */ 8698 mutex_enter(&rp->r_statelock); 8699 /* 8700 * The entry is now complete 8701 */ 8702 rdc->flags &= ~RDDIR; 8703 } 8704 8705 ASSERT(!(rdc->flags & RDDIR)); 8706 8707 /* 8708 * If an error occurred while attempting 8709 * to fill the cache entry, mark the entry invalid and 8710 * just return the error. 
8711 */ 8712 if (rdc->error) { 8713 error = rdc->error; 8714 rdc->flags |= RDDIRREQ; 8715 rddir4_cache_rele(rp, rdc); 8716 mutex_exit(&rp->r_statelock); 8717 return (error); 8718 } 8719 8720 /* 8721 * The cache entry is complete and good, 8722 * copyout the dirent structs to the calling 8723 * thread. 8724 */ 8725 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 8726 8727 /* 8728 * If no error occurred during the copyout, 8729 * update the offset in the uio struct to 8730 * contain the value of the next NFS 4 cookie 8731 * and set the eof value appropriately. 8732 */ 8733 if (!error) { 8734 uiop->uio_loffset = rdc->nfs4_ncookie; 8735 if (eofp) 8736 *eofp = rdc->eof; 8737 } 8738 8739 /* 8740 * Decide whether to do readahead. Don't if we 8741 * have already read to the end of directory. 8742 */ 8743 if (rdc->eof) { 8744 /* 8745 * Make the entry the direof only if it is cached 8746 */ 8747 if (rdc->flags & RDDIRCACHED) 8748 rp->r_direof = rdc; 8749 rddir4_cache_rele(rp, rdc); 8750 mutex_exit(&rp->r_statelock); 8751 return (error); 8752 } 8753 8754 /* Determine if a readdir readahead should be done */ 8755 if (!(rp->r_flags & R4LOOKUP)) { 8756 rddir4_cache_rele(rp, rdc); 8757 mutex_exit(&rp->r_statelock); 8758 return (error); 8759 } 8760 8761 /* 8762 * Now look for a readahead entry. 8763 * 8764 * Check to see whether we found an entry for the readahead. 8765 * If so, we don't need to do anything further, so free the new 8766 * entry if one was allocated. Otherwise, allocate a new entry, add 8767 * it to the cache, and then initiate an asynchronous readdir 8768 * operation to fill it. 8769 */ 8770 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 8771 8772 /* 8773 * A readdir cache entry could not be obtained for the readahead. In 8774 * this case we skip the readahead and return. 
8775 */ 8776 if (rrdc == NULL) { 8777 rddir4_cache_rele(rp, rdc); 8778 mutex_exit(&rp->r_statelock); 8779 return (error); 8780 } 8781 8782 /* 8783 * Check to see if we need to fill this entry in. 8784 */ 8785 if (rrdc->flags & RDDIRREQ) { 8786 rrdc->flags &= ~RDDIRREQ; 8787 rrdc->flags |= RDDIR; 8788 rddir4_cache_rele(rp, rdc); 8789 mutex_exit(&rp->r_statelock); 8790 #ifdef DEBUG 8791 nfs4_readdir_readahead++; 8792 #endif 8793 /* 8794 * Do the readdir. 8795 */ 8796 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 8797 return (error); 8798 } 8799 8800 rddir4_cache_rele(rp, rrdc); 8801 rddir4_cache_rele(rp, rdc); 8802 mutex_exit(&rp->r_statelock); 8803 return (error); 8804 } 8805 8806 static int 8807 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8808 { 8809 int error; 8810 rnode4_t *rp; 8811 8812 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 8813 8814 rp = VTOR4(vp); 8815 8816 /* 8817 * Obtain the readdir results for the caller. 8818 */ 8819 nfs4readdir(vp, rdc, cr); 8820 8821 mutex_enter(&rp->r_statelock); 8822 /* 8823 * The entry is now complete 8824 */ 8825 rdc->flags &= ~RDDIR; 8826 8827 error = rdc->error; 8828 if (error) 8829 rdc->flags |= RDDIRREQ; 8830 rddir4_cache_rele(rp, rdc); 8831 mutex_exit(&rp->r_statelock); 8832 8833 return (error); 8834 } 8835 8836 static void 8837 nfs4readdir_stub(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8838 { 8839 int stublength; 8840 dirent64_t *dp; 8841 u_longlong_t nodeid, pnodeid; 8842 vnode_t *dotdotvp = NULL; 8843 rnode4_t *rp = VTOR4(vp); 8844 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8845 8846 rdc->error = 0; 8847 rdc->entries = 0; 8848 rdc->actlen = rdc->entlen = 0; 8849 rdc->eof = TRUE; 8850 8851 /* Check for EOF case for readdir of stub */ 8852 if (cookie != 0 && cookie != 1) 8853 return; 8854 8855 nodeid = rp->r_attr.va_nodeid; 8856 if (vp->v_flag & VROOT) { 8857 pnodeid = nodeid; /* root of mount point */ 8858 } else { 8859 if (rdc->error = nfs4_lookup(vp, "..", &dotdotvp, 0, 0, 0, cr)) 8860 
return; 8861 pnodeid = VTOR4(dotdotvp)->r_attr.va_nodeid; 8862 VN_RELE(dotdotvp); 8863 } 8864 8865 stublength = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8866 rdc->entries = kmem_alloc(stublength, KM_SLEEP); 8867 rdc->entlen = rdc->buflen = stublength; 8868 rdc->eof = TRUE; 8869 8870 dp = (dirent64_t *)rdc->entries; 8871 8872 if (rdc->nfs4_cookie == (nfs_cookie4)0) { 8873 bcopy(nfs4_dot_entries, rdc->entries, 8874 DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2)); 8875 dp->d_ino = nodeid; 8876 dp = (struct dirent64 *)(((char *)dp) + DIRENT64_RECLEN(1)); 8877 dp->d_ino = pnodeid; 8878 rdc->actlen = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8879 } else { /* for ".." entry */ 8880 bcopy(nfs4_dot_dot_entry, rdc->entries, DIRENT64_RECLEN(2)); 8881 dp->d_ino = pnodeid; 8882 rdc->actlen = DIRENT64_RECLEN(2); 8883 } 8884 rdc->nfs4_ncookie = rdc->actlen; 8885 } 8886 8887 /* 8888 * Read directory entries. 8889 * There are some weird things to look out for here. The uio_loffset 8890 * field is either 0 or it is the offset returned from a previous 8891 * readdir. It is an opaque value used by the server to find the 8892 * correct directory block to read. The count field is the number 8893 * of blocks to read on the server. This is advisory only, the server 8894 * may return only one block's worth of entries. Entries may be compressed 8895 * on the server. 8896 * 8897 * Generates the following compound request: 8898 * 1. If readdir offset is zero and no dnlc entry for parent exists, 8899 * must include a Lookupp as well. In this case, send: 8900 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 8901 * 2. Otherwise just do: { Putfh <fh>; Readdir } 8902 * 8903 * Get complete attributes and filehandles for entries if this is the 8904 * first read of the directory. Otherwise, just get fileid's. 
8905 */ 8906 static void 8907 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8908 { 8909 COMPOUND4args_clnt args; 8910 COMPOUND4res_clnt res; 8911 READDIR4args *rargs; 8912 READDIR4res_clnt *rd_res; 8913 bitmap4 rd_bitsval; 8914 nfs_argop4 argop[5]; 8915 nfs_resop4 *resop; 8916 rnode4_t *rp = VTOR4(vp); 8917 mntinfo4_t *mi = VTOMI4(vp); 8918 int doqueue; 8919 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 8920 vnode_t *dvp; 8921 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8922 int num_ops, res_opcnt; 8923 bool_t needrecov = FALSE; 8924 nfs4_recov_state_t recov_state; 8925 hrtime_t t; 8926 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8927 8928 ASSERT(nfs_zone() == mi->mi_zone); 8929 ASSERT(rdc->flags & RDDIR); 8930 ASSERT(rdc->entries == NULL); 8931 8932 if (rp->r_flags & R4SRVSTUB) { 8933 nfs4readdir_stub(vp, rdc, cr); 8934 return; 8935 } 8936 8937 num_ops = 2; 8938 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 8939 /* 8940 * Since nfsv4 readdir may not return entries for "." and "..", 8941 * the client must recreate them: 8942 * To find the correct nodeid, do the following: 8943 * For current node, get nodeid from dnlc. 8944 * - if current node is rootvp, set pnodeid to nodeid. 8945 * - else if parent is in the dnlc, get its nodeid from there. 8946 * - else add LOOKUPP+GETATTR to compound. 
8947 */ 8948 nodeid = rp->r_attr.va_nodeid; 8949 if (vp->v_flag & VROOT) { 8950 pnodeid = nodeid; /* root of mount point */ 8951 } else { 8952 dvp = dnlc_lookup(vp, ".."); 8953 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 8954 /* parent in dnlc cache - no need for otw */ 8955 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 8956 } else { 8957 /* 8958 * parent not in dnlc cache, 8959 * do lookupp to get its id 8960 */ 8961 num_ops = 5; 8962 pnodeid = 0; /* set later by getattr parent */ 8963 } 8964 if (dvp) 8965 VN_RELE(dvp); 8966 } 8967 } 8968 recov_state.rs_flags = 0; 8969 recov_state.rs_num_retry_despite_err = 0; 8970 8971 /* Save the original mount point security flavor */ 8972 (void) save_mnt_secinfo(mi->mi_curr_serv); 8973 8974 recov_retry: 8975 args.ctag = TAG_READDIR; 8976 8977 args.array = argop; 8978 args.array_len = num_ops; 8979 8980 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 8981 &recov_state, NULL)) { 8982 /* 8983 * If readdir a node that is a stub for a crossed mount point, 8984 * keep the original secinfo flavor for the current file 8985 * system, not the crossed one. 8986 */ 8987 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 8988 rdc->error = e.error; 8989 return; 8990 } 8991 8992 /* 8993 * Determine which attrs to request for dirents. This code 8994 * must be protected by nfs4_start/end_fop because of r_server 8995 * (which will change during failover recovery). 8996 * 8997 */ 8998 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 8999 /* 9000 * Get all vattr attrs plus filehandle and rdattr_error 9001 */ 9002 rd_bitsval = NFS4_VATTR_MASK | 9003 FATTR4_RDATTR_ERROR_MASK | 9004 FATTR4_FILEHANDLE_MASK; 9005 9006 if (rp->r_flags & R4READDIRWATTR) { 9007 mutex_enter(&rp->r_statelock); 9008 rp->r_flags &= ~R4READDIRWATTR; 9009 mutex_exit(&rp->r_statelock); 9010 } 9011 } else { 9012 servinfo4_t *svp = rp->r_server; 9013 9014 /* 9015 * Already read directory. Use readdir with 9016 * no attrs (except for mounted_on_fileid) for updates. 
9017 */ 9018 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9019 9020 /* 9021 * request mounted on fileid if supported, else request 9022 * fileid. maybe we should verify that fileid is supported 9023 * and request something else if not. 9024 */ 9025 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9026 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9027 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9028 nfs_rw_exit(&svp->sv_lock); 9029 } 9030 9031 /* putfh directory fh */ 9032 argop[0].argop = OP_CPUTFH; 9033 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9034 9035 argop[1].argop = OP_READDIR; 9036 rargs = &argop[1].nfs_argop4_u.opreaddir; 9037 /* 9038 * 1 and 2 are reserved for client "." and ".." entry offset. 9039 * cookie 0 should be used over-the-wire to start reading at 9040 * the beginning of the directory excluding "." and "..". 9041 */ 9042 if (rdc->nfs4_cookie == 0 || 9043 rdc->nfs4_cookie == 1 || 9044 rdc->nfs4_cookie == 2) { 9045 rargs->cookie = (nfs_cookie4)0; 9046 rargs->cookieverf = 0; 9047 } else { 9048 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9049 mutex_enter(&rp->r_statelock); 9050 rargs->cookieverf = rp->r_cookieverf4; 9051 mutex_exit(&rp->r_statelock); 9052 } 9053 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9054 rargs->maxcount = mi->mi_tsize; 9055 rargs->attr_request = rd_bitsval; 9056 rargs->rdc = rdc; 9057 rargs->dvp = vp; 9058 rargs->mi = mi; 9059 rargs->cr = cr; 9060 9061 9062 /* 9063 * If count < than the minimum required, we return no entries 9064 * and fail with EINVAL 9065 */ 9066 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9067 rdc->error = EINVAL; 9068 goto out; 9069 } 9070 9071 if (args.array_len == 5) { 9072 /* 9073 * Add lookupp and getattr for parent nodeid. 
9074 */ 9075 argop[2].argop = OP_LOOKUPP; 9076 9077 argop[3].argop = OP_GETFH; 9078 9079 /* getattr parent */ 9080 argop[4].argop = OP_GETATTR; 9081 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9082 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9083 } 9084 9085 doqueue = 1; 9086 9087 if (mi->mi_io_kstats) { 9088 mutex_enter(&mi->mi_lock); 9089 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9090 mutex_exit(&mi->mi_lock); 9091 } 9092 9093 /* capture the time of this call */ 9094 rargs->t = t = gethrtime(); 9095 9096 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9097 9098 if (mi->mi_io_kstats) { 9099 mutex_enter(&mi->mi_lock); 9100 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9101 mutex_exit(&mi->mi_lock); 9102 } 9103 9104 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9105 9106 /* 9107 * If RPC error occurred and it isn't an error that 9108 * triggers recovery, then go ahead and fail now. 9109 */ 9110 if (e.error != 0 && !needrecov) { 9111 rdc->error = e.error; 9112 goto out; 9113 } 9114 9115 if (needrecov) { 9116 bool_t abort; 9117 9118 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9119 "nfs4readdir: initiating recovery.\n")); 9120 9121 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9122 NULL, OP_READDIR, NULL); 9123 if (abort == FALSE) { 9124 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9125 &recov_state, needrecov); 9126 if (!e.error) 9127 (void) xdr_free(xdr_COMPOUND4res_clnt, 9128 (caddr_t)&res); 9129 if (rdc->entries != NULL) { 9130 kmem_free(rdc->entries, rdc->entlen); 9131 rdc->entries = NULL; 9132 } 9133 goto recov_retry; 9134 } 9135 9136 if (e.error != 0) { 9137 rdc->error = e.error; 9138 goto out; 9139 } 9140 9141 /* fall through for res.status case */ 9142 } 9143 9144 res_opcnt = res.array_len; 9145 9146 /* 9147 * If compound failed first 2 ops (PUTFH+READDIR), then return 9148 * failure here. 
Subsequent ops are for filling out dot-dot 9149 * dirent, and if they fail, we still want to give the caller 9150 * the dirents returned by (the successful) READDIR op, so we need 9151 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9152 * 9153 * One example where PUTFH+READDIR ops would succeed but 9154 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9155 * but lacks x. In this case, a POSIX server's VOP_READDIR 9156 * would succeed; however, VOP_LOOKUP(..) would fail since no 9157 * x perm. We need to come up with a non-vendor-specific way 9158 * for a POSIX server to return d_ino from dotdot's dirent if 9159 * client only requests mounted_on_fileid, and just say the 9160 * LOOKUPP succeeded and fill out the GETATTR. However, if 9161 * client requested any mandatory attrs, server would be required 9162 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9163 * for dotdot. 9164 */ 9165 9166 if (res.status) { 9167 if (res_opcnt <= 2) { 9168 e.error = geterrno4(res.status); 9169 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9170 &recov_state, needrecov); 9171 nfs4_purge_stale_fh(e.error, vp, cr); 9172 rdc->error = e.error; 9173 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9174 if (rdc->entries != NULL) { 9175 kmem_free(rdc->entries, rdc->entlen); 9176 rdc->entries = NULL; 9177 } 9178 /* 9179 * If readdir a node that is a stub for a 9180 * crossed mount point, keep the original 9181 * secinfo flavor for the current file system, 9182 * not the crossed one. 9183 */ 9184 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9185 return; 9186 } 9187 } 9188 9189 resop = &res.array[1]; /* readdir res */ 9190 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9191 9192 mutex_enter(&rp->r_statelock); 9193 rp->r_cookieverf4 = rd_res->cookieverf; 9194 mutex_exit(&rp->r_statelock); 9195 9196 /* 9197 * For "." and ".." entries 9198 * e.g. 9199 * seek(cookie=0) -> "." entry with d_off = 1 9200 * seek(cookie=1) -> ".." 
entry with d_off = 2 9201 */ 9202 if (cookie == (nfs_cookie4) 0) { 9203 if (rd_res->dotp) 9204 rd_res->dotp->d_ino = nodeid; 9205 if (rd_res->dotdotp) 9206 rd_res->dotdotp->d_ino = pnodeid; 9207 } 9208 if (cookie == (nfs_cookie4) 1) { 9209 if (rd_res->dotdotp) 9210 rd_res->dotdotp->d_ino = pnodeid; 9211 } 9212 9213 9214 /* LOOKUPP+GETATTR attemped */ 9215 if (args.array_len == 5 && rd_res->dotdotp) { 9216 if (res.status == NFS4_OK && res_opcnt == 5) { 9217 nfs_fh4 *fhp; 9218 nfs4_sharedfh_t *sfhp; 9219 vnode_t *pvp; 9220 nfs4_ga_res_t *garp; 9221 9222 resop++; /* lookupp */ 9223 resop++; /* getfh */ 9224 fhp = &resop->nfs_resop4_u.opgetfh.object; 9225 9226 resop++; /* getattr of parent */ 9227 9228 /* 9229 * First, take care of finishing the 9230 * readdir results. 9231 */ 9232 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9233 /* 9234 * The d_ino of .. must be the inode number 9235 * of the mounted filesystem. 9236 */ 9237 if (garp->n4g_va.va_mask & AT_NODEID) 9238 rd_res->dotdotp->d_ino = 9239 garp->n4g_va.va_nodeid; 9240 9241 9242 /* 9243 * Next, create the ".." dnlc entry 9244 */ 9245 sfhp = sfh4_get(fhp, mi); 9246 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9247 dnlc_update(vp, "..", pvp); 9248 VN_RELE(pvp); 9249 } 9250 sfh4_rele(&sfhp); 9251 } 9252 } 9253 9254 if (mi->mi_io_kstats) { 9255 mutex_enter(&mi->mi_lock); 9256 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9257 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9258 mutex_exit(&mi->mi_lock); 9259 } 9260 9261 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9262 9263 out: 9264 /* 9265 * If readdir a node that is a stub for a crossed mount point, 9266 * keep the original secinfo flavor for the current file system, 9267 * not the crossed one. 
9268 */ 9269 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9270 9271 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9272 } 9273 9274 9275 static int 9276 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9277 { 9278 rnode4_t *rp = VTOR4(bp->b_vp); 9279 int count; 9280 int error; 9281 cred_t *cred_otw = NULL; 9282 offset_t offset; 9283 nfs4_open_stream_t *osp = NULL; 9284 bool_t first_time = TRUE; /* first time getting otw cred */ 9285 bool_t last_time = FALSE; /* last time getting otw cred */ 9286 9287 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9288 9289 DTRACE_IO1(start, struct buf *, bp); 9290 offset = ldbtob(bp->b_lblkno); 9291 9292 if (bp->b_flags & B_READ) { 9293 read_again: 9294 /* 9295 * Releases the osp, if it is provided. 9296 * Puts a hold on the cred_otw and the new osp (if found). 9297 */ 9298 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9299 &first_time, &last_time); 9300 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9301 offset, bp->b_bcount, 9302 &bp->b_resid, cred_otw, 9303 readahead, NULL); 9304 crfree(cred_otw); 9305 if (!error) { 9306 if (bp->b_resid) { 9307 /* 9308 * Didn't get it all because we hit EOF, 9309 * zero all the memory beyond the EOF. 9310 */ 9311 /* bzero(rdaddr + */ 9312 bzero(bp->b_un.b_addr + 9313 bp->b_bcount - bp->b_resid, bp->b_resid); 9314 } 9315 mutex_enter(&rp->r_statelock); 9316 if (bp->b_resid == bp->b_bcount && 9317 offset >= rp->r_size) { 9318 /* 9319 * We didn't read anything at all as we are 9320 * past EOF. Return an error indicator back 9321 * but don't destroy the pages (yet). 9322 */ 9323 error = NFS_EOF; 9324 } 9325 mutex_exit(&rp->r_statelock); 9326 } else if (error == EACCES && last_time == FALSE) { 9327 goto read_again; 9328 } 9329 } else { 9330 if (!(rp->r_flags & R4STALE)) { 9331 write_again: 9332 /* 9333 * Releases the osp, if it is provided. 9334 * Puts a hold on the cred_otw and the new 9335 * osp (if found). 
9336 */ 9337 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9338 &first_time, &last_time); 9339 mutex_enter(&rp->r_statelock); 9340 count = MIN(bp->b_bcount, rp->r_size - offset); 9341 mutex_exit(&rp->r_statelock); 9342 if (count < 0) 9343 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9344 #ifdef DEBUG 9345 if (count == 0) { 9346 zoneid_t zoneid = getzoneid(); 9347 9348 zcmn_err(zoneid, CE_WARN, 9349 "nfs4_bio: zero length write at %lld", 9350 offset); 9351 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9352 "b_bcount=%ld, file size=%lld", 9353 rp->r_flags, (long)bp->b_bcount, 9354 rp->r_size); 9355 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9356 if (nfs4_bio_do_stop) 9357 debug_enter("nfs4_bio"); 9358 } 9359 #endif 9360 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9361 count, cred_otw, stab_comm); 9362 if (error == EACCES && last_time == FALSE) { 9363 crfree(cred_otw); 9364 goto write_again; 9365 } 9366 bp->b_error = error; 9367 if (error && error != EINTR && 9368 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9369 /* 9370 * Don't print EDQUOT errors on the console. 9371 * Don't print asynchronous EACCES errors. 9372 * Don't print EFBIG errors. 9373 * Print all other write errors. 9374 */ 9375 if (error != EDQUOT && error != EFBIG && 9376 (error != EACCES || 9377 !(bp->b_flags & B_ASYNC))) 9378 nfs4_write_error(bp->b_vp, 9379 error, cred_otw); 9380 /* 9381 * Update r_error and r_flags as appropriate. 9382 * If the error was ESTALE, then mark the 9383 * rnode as not being writeable and save 9384 * the error status. Otherwise, save any 9385 * errors which occur from asynchronous 9386 * page invalidations. Any errors occurring 9387 * from other operations should be saved 9388 * by the caller. 
9389 */ 9390 mutex_enter(&rp->r_statelock); 9391 if (error == ESTALE) { 9392 rp->r_flags |= R4STALE; 9393 if (!rp->r_error) 9394 rp->r_error = error; 9395 } else if (!rp->r_error && 9396 (bp->b_flags & 9397 (B_INVAL|B_FORCE|B_ASYNC)) == 9398 (B_INVAL|B_FORCE|B_ASYNC)) { 9399 rp->r_error = error; 9400 } 9401 mutex_exit(&rp->r_statelock); 9402 } 9403 crfree(cred_otw); 9404 } else 9405 error = rp->r_error; 9406 } 9407 9408 if (error != 0 && error != NFS_EOF) 9409 bp->b_flags |= B_ERROR; 9410 9411 if (osp) 9412 open_stream_rele(osp, rp); 9413 9414 DTRACE_IO1(done, struct buf *, bp); 9415 9416 return (error); 9417 } 9418 9419 /* ARGSUSED */ 9420 static int 9421 nfs4_fid(vnode_t *vp, fid_t *fidp) 9422 { 9423 return (EREMOTE); 9424 } 9425 9426 /* ARGSUSED2 */ 9427 static int 9428 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9429 { 9430 rnode4_t *rp = VTOR4(vp); 9431 9432 if (!write_lock) { 9433 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9434 return (V_WRITELOCK_FALSE); 9435 } 9436 9437 if ((rp->r_flags & R4DIRECTIO) || 9438 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9439 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9440 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9441 return (V_WRITELOCK_FALSE); 9442 nfs_rw_exit(&rp->r_rwlock); 9443 } 9444 9445 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9446 return (V_WRITELOCK_TRUE); 9447 } 9448 9449 /* ARGSUSED */ 9450 static void 9451 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9452 { 9453 rnode4_t *rp = VTOR4(vp); 9454 9455 nfs_rw_exit(&rp->r_rwlock); 9456 } 9457 9458 /* ARGSUSED */ 9459 static int 9460 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 9461 { 9462 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9463 return (EIO); 9464 9465 /* 9466 * Because we stuff the readdir cookie into the offset field 9467 * someone may attempt to do an lseek with the cookie which 9468 * we want to succeed. 
9469 */ 9470 if (vp->v_type == VDIR) 9471 return (0); 9472 if (*noffp < 0) 9473 return (EINVAL); 9474 return (0); 9475 } 9476 9477 9478 /* 9479 * Return all the pages from [off..off+len) in file 9480 */ 9481 static int 9482 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9483 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9484 enum seg_rw rw, cred_t *cr) 9485 { 9486 rnode4_t *rp; 9487 int error; 9488 mntinfo4_t *mi; 9489 9490 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9491 return (EIO); 9492 rp = VTOR4(vp); 9493 if (IS_SHADOW(vp, rp)) 9494 vp = RTOV4(rp); 9495 9496 if (vp->v_flag & VNOMAP) 9497 return (ENOSYS); 9498 9499 if (protp != NULL) 9500 *protp = PROT_ALL; 9501 9502 /* 9503 * Now validate that the caches are up to date. 9504 */ 9505 if (error = nfs4_validate_caches(vp, cr)) 9506 return (error); 9507 9508 mi = VTOMI4(vp); 9509 retry: 9510 mutex_enter(&rp->r_statelock); 9511 9512 /* 9513 * Don't create dirty pages faster than they 9514 * can be cleaned so that the system doesn't 9515 * get imbalanced. If the async queue is 9516 * maxed out, then wait for it to drain before 9517 * creating more dirty pages. Also, wait for 9518 * any threads doing pagewalks in the vop_getattr 9519 * entry points so that they don't block for 9520 * long periods. 9521 */ 9522 if (rw == S_CREATE) { 9523 while ((mi->mi_max_threads != 0 && 9524 rp->r_awcount > 2 * mi->mi_max_threads) || 9525 rp->r_gcount > 0) 9526 cv_wait(&rp->r_cv, &rp->r_statelock); 9527 } 9528 9529 /* 9530 * If we are getting called as a side effect of an nfs_write() 9531 * operation the local file size might not be extended yet. 9532 * In this case we want to be able to return pages of zeroes. 
9533 */ 9534 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9535 NFS4_DEBUG(nfs4_pageio_debug, 9536 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9537 "len=%llu, size=%llu, attrsize =%llu", off, 9538 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9539 mutex_exit(&rp->r_statelock); 9540 return (EFAULT); /* beyond EOF */ 9541 } 9542 9543 mutex_exit(&rp->r_statelock); 9544 9545 if (len <= PAGESIZE) { 9546 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9547 seg, addr, rw, cr); 9548 NFS4_DEBUG(nfs4_pageio_debug && error, 9549 (CE_NOTE, "getpage error %d; off=%lld, " 9550 "len=%lld", error, off, (u_longlong_t)len)); 9551 } else { 9552 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9553 pl, plsz, seg, addr, rw, cr); 9554 NFS4_DEBUG(nfs4_pageio_debug && error, 9555 (CE_NOTE, "getpages error %d; off=%lld, " 9556 "len=%lld", error, off, (u_longlong_t)len)); 9557 } 9558 9559 switch (error) { 9560 case NFS_EOF: 9561 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9562 goto retry; 9563 case ESTALE: 9564 nfs4_purge_stale_fh(error, vp, cr); 9565 } 9566 9567 return (error); 9568 } 9569 9570 /* 9571 * Called from pvn_getpages or nfs4_getpage to get a particular page. 
9572 */ 9573 /* ARGSUSED */ 9574 static int 9575 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9576 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9577 enum seg_rw rw, cred_t *cr) 9578 { 9579 rnode4_t *rp; 9580 uint_t bsize; 9581 struct buf *bp; 9582 page_t *pp; 9583 u_offset_t lbn; 9584 u_offset_t io_off; 9585 u_offset_t blkoff; 9586 u_offset_t rablkoff; 9587 size_t io_len; 9588 uint_t blksize; 9589 int error; 9590 int readahead; 9591 int readahead_issued = 0; 9592 int ra_window; /* readahead window */ 9593 page_t *pagefound; 9594 page_t *savepp; 9595 9596 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9597 return (EIO); 9598 9599 rp = VTOR4(vp); 9600 ASSERT(!IS_SHADOW(vp, rp)); 9601 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9602 9603 reread: 9604 bp = NULL; 9605 pp = NULL; 9606 pagefound = NULL; 9607 9608 if (pl != NULL) 9609 pl[0] = NULL; 9610 9611 error = 0; 9612 lbn = off / bsize; 9613 blkoff = lbn * bsize; 9614 9615 /* 9616 * Queueing up the readahead before doing the synchronous read 9617 * results in a significant increase in read throughput because 9618 * of the increased parallelism between the async threads and 9619 * the process context. 9620 */ 9621 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9622 rw != S_CREATE && 9623 !(vp->v_flag & VNOCACHE)) { 9624 mutex_enter(&rp->r_statelock); 9625 9626 /* 9627 * Calculate the number of readaheads to do. 9628 * a) No readaheads at offset = 0. 9629 * b) Do maximum(nfs4_nra) readaheads when the readahead 9630 * window is closed. 9631 * c) Do readaheads between 1 to (nfs4_nra - 1) depending 9632 * upon how far the readahead window is open or close. 9633 * d) No readaheads if rp->r_nextr is not within the scope 9634 * of the readahead window (random i/o). 
9635 */ 9636 9637 if (off == 0) 9638 readahead = 0; 9639 else if (blkoff == rp->r_nextr) 9640 readahead = nfs4_nra; 9641 else if (rp->r_nextr > blkoff && 9642 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9643 <= (nfs4_nra - 1))) 9644 readahead = nfs4_nra - ra_window; 9645 else 9646 readahead = 0; 9647 9648 rablkoff = rp->r_nextr; 9649 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9650 mutex_exit(&rp->r_statelock); 9651 if (nfs4_async_readahead(vp, rablkoff + bsize, 9652 addr + (rablkoff + bsize - off), 9653 seg, cr, nfs4_readahead) < 0) { 9654 mutex_enter(&rp->r_statelock); 9655 break; 9656 } 9657 readahead--; 9658 rablkoff += bsize; 9659 /* 9660 * Indicate that we did a readahead so 9661 * readahead offset is not updated 9662 * by the synchronous read below. 9663 */ 9664 readahead_issued = 1; 9665 mutex_enter(&rp->r_statelock); 9666 /* 9667 * set readahead offset to 9668 * offset of last async readahead 9669 * request. 9670 */ 9671 rp->r_nextr = rablkoff; 9672 } 9673 mutex_exit(&rp->r_statelock); 9674 } 9675 9676 again: 9677 if ((pagefound = page_exists(vp, off)) == NULL) { 9678 if (pl == NULL) { 9679 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9680 nfs4_readahead); 9681 } else if (rw == S_CREATE) { 9682 /* 9683 * Block for this page is not allocated, or the offset 9684 * is beyond the current allocation size, or we're 9685 * allocating a swap slot and the page was not found, 9686 * so allocate it and return a zero page. 
9687 */ 9688 if ((pp = page_create_va(vp, off, 9689 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9690 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9691 io_len = PAGESIZE; 9692 mutex_enter(&rp->r_statelock); 9693 rp->r_nextr = off + PAGESIZE; 9694 mutex_exit(&rp->r_statelock); 9695 } else { 9696 /* 9697 * Need to go to server to get a block 9698 */ 9699 mutex_enter(&rp->r_statelock); 9700 if (blkoff < rp->r_size && 9701 blkoff + bsize > rp->r_size) { 9702 /* 9703 * If less than a block left in 9704 * file read less than a block. 9705 */ 9706 if (rp->r_size <= off) { 9707 /* 9708 * Trying to access beyond EOF, 9709 * set up to get at least one page. 9710 */ 9711 blksize = off + PAGESIZE - blkoff; 9712 } else 9713 blksize = rp->r_size - blkoff; 9714 } else if ((off == 0) || 9715 (off != rp->r_nextr && !readahead_issued)) { 9716 blksize = PAGESIZE; 9717 blkoff = off; /* block = page here */ 9718 } else 9719 blksize = bsize; 9720 mutex_exit(&rp->r_statelock); 9721 9722 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 9723 &io_len, blkoff, blksize, 0); 9724 9725 /* 9726 * Some other thread has entered the page, 9727 * so just use it. 9728 */ 9729 if (pp == NULL) 9730 goto again; 9731 9732 /* 9733 * Now round the request size up to page boundaries. 9734 * This ensures that the entire page will be 9735 * initialized to zeroes if EOF is encountered. 9736 */ 9737 io_len = ptob(btopr(io_len)); 9738 9739 bp = pageio_setup(pp, io_len, vp, B_READ); 9740 ASSERT(bp != NULL); 9741 9742 /* 9743 * pageio_setup should have set b_addr to 0. This 9744 * is correct since we want to do I/O on a page 9745 * boundary. bp_mapin will use this addr to calculate 9746 * an offset, and then set b_addr to the kernel virtual 9747 * address it allocated for us. 
9748 */ 9749 ASSERT(bp->b_un.b_addr == 0); 9750 9751 bp->b_edev = 0; 9752 bp->b_dev = 0; 9753 bp->b_lblkno = lbtodb(io_off); 9754 bp->b_file = vp; 9755 bp->b_offset = (offset_t)off; 9756 bp_mapin(bp); 9757 9758 /* 9759 * If doing a write beyond what we believe is EOF, 9760 * don't bother trying to read the pages from the 9761 * server, we'll just zero the pages here. We 9762 * don't check that the rw flag is S_WRITE here 9763 * because some implementations may attempt a 9764 * read access to the buffer before copying data. 9765 */ 9766 mutex_enter(&rp->r_statelock); 9767 if (io_off >= rp->r_size && seg == segkmap) { 9768 mutex_exit(&rp->r_statelock); 9769 bzero(bp->b_un.b_addr, io_len); 9770 } else { 9771 mutex_exit(&rp->r_statelock); 9772 error = nfs4_bio(bp, NULL, cr, FALSE); 9773 } 9774 9775 /* 9776 * Unmap the buffer before freeing it. 9777 */ 9778 bp_mapout(bp); 9779 pageio_done(bp); 9780 9781 savepp = pp; 9782 do { 9783 pp->p_fsdata = C_NOCOMMIT; 9784 } while ((pp = pp->p_next) != savepp); 9785 9786 if (error == NFS_EOF) { 9787 /* 9788 * If doing a write system call just return 9789 * zeroed pages, else user tried to get pages 9790 * beyond EOF, return error. We don't check 9791 * that the rw flag is S_WRITE here because 9792 * some implementations may attempt a read 9793 * access to the buffer before copying data. 9794 */ 9795 if (seg == segkmap) 9796 error = 0; 9797 else 9798 error = EFAULT; 9799 } 9800 9801 if (!readahead_issued && !error) { 9802 mutex_enter(&rp->r_statelock); 9803 rp->r_nextr = io_off + io_len; 9804 mutex_exit(&rp->r_statelock); 9805 } 9806 } 9807 } 9808 9809 out: 9810 if (pl == NULL) 9811 return (error); 9812 9813 if (error) { 9814 if (pp != NULL) 9815 pvn_read_done(pp, B_ERROR); 9816 return (error); 9817 } 9818 9819 if (pagefound) { 9820 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 9821 9822 /* 9823 * Page exists in the cache, acquire the appropriate lock. 9824 * If this fails, start all over again. 
9825 */ 9826 if ((pp = page_lookup(vp, off, se)) == NULL) { 9827 #ifdef DEBUG 9828 nfs4_lostpage++; 9829 #endif 9830 goto reread; 9831 } 9832 pl[0] = pp; 9833 pl[1] = NULL; 9834 return (0); 9835 } 9836 9837 if (pp != NULL) 9838 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 9839 9840 return (error); 9841 } 9842 9843 static void 9844 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 9845 cred_t *cr) 9846 { 9847 int error; 9848 page_t *pp; 9849 u_offset_t io_off; 9850 size_t io_len; 9851 struct buf *bp; 9852 uint_t bsize, blksize; 9853 rnode4_t *rp = VTOR4(vp); 9854 page_t *savepp; 9855 9856 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9857 9858 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9859 9860 mutex_enter(&rp->r_statelock); 9861 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 9862 /* 9863 * If less than a block left in file read less 9864 * than a block. 9865 */ 9866 blksize = rp->r_size - blkoff; 9867 } else 9868 blksize = bsize; 9869 mutex_exit(&rp->r_statelock); 9870 9871 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 9872 &io_off, &io_len, blkoff, blksize, 1); 9873 /* 9874 * The isra flag passed to the kluster function is 1, we may have 9875 * gotten a return value of NULL for a variety of reasons (# of free 9876 * pages < minfree, someone entered the page on the vnode etc). In all 9877 * cases, we want to punt on the readahead. 9878 */ 9879 if (pp == NULL) 9880 return; 9881 9882 /* 9883 * Now round the request size up to page boundaries. 9884 * This ensures that the entire page will be 9885 * initialized to zeroes if EOF is encountered. 9886 */ 9887 io_len = ptob(btopr(io_len)); 9888 9889 bp = pageio_setup(pp, io_len, vp, B_READ); 9890 ASSERT(bp != NULL); 9891 9892 /* 9893 * pageio_setup should have set b_addr to 0. This is correct since 9894 * we want to do I/O on a page boundary. bp_mapin() will use this addr 9895 * to calculate an offset, and then set b_addr to the kernel virtual 9896 * address it allocated for us. 
9897 */ 9898 ASSERT(bp->b_un.b_addr == 0); 9899 9900 bp->b_edev = 0; 9901 bp->b_dev = 0; 9902 bp->b_lblkno = lbtodb(io_off); 9903 bp->b_file = vp; 9904 bp->b_offset = (offset_t)blkoff; 9905 bp_mapin(bp); 9906 9907 /* 9908 * If doing a write beyond what we believe is EOF, don't bother trying 9909 * to read the pages from the server, we'll just zero the pages here. 9910 * We don't check that the rw flag is S_WRITE here because some 9911 * implementations may attempt a read access to the buffer before 9912 * copying data. 9913 */ 9914 mutex_enter(&rp->r_statelock); 9915 if (io_off >= rp->r_size && seg == segkmap) { 9916 mutex_exit(&rp->r_statelock); 9917 bzero(bp->b_un.b_addr, io_len); 9918 error = 0; 9919 } else { 9920 mutex_exit(&rp->r_statelock); 9921 error = nfs4_bio(bp, NULL, cr, TRUE); 9922 if (error == NFS_EOF) 9923 error = 0; 9924 } 9925 9926 /* 9927 * Unmap the buffer before freeing it. 9928 */ 9929 bp_mapout(bp); 9930 pageio_done(bp); 9931 9932 savepp = pp; 9933 do { 9934 pp->p_fsdata = C_NOCOMMIT; 9935 } while ((pp = pp->p_next) != savepp); 9936 9937 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 9938 9939 /* 9940 * In case of error set readahead offset 9941 * to the lowest offset. 9942 * pvn_read_done() calls VN_DISPOSE to destroy the pages 9943 */ 9944 if (error && rp->r_nextr > io_off) { 9945 mutex_enter(&rp->r_statelock); 9946 if (rp->r_nextr > io_off) 9947 rp->r_nextr = io_off; 9948 mutex_exit(&rp->r_statelock); 9949 } 9950 } 9951 9952 /* 9953 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 9954 * If len == 0, do from off to EOF. 9955 * 9956 * The normal cases should be len == 0 && off == 0 (entire vp list) or 9957 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 9958 * (from pageout). 
9959 */ 9960 static int 9961 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 9962 { 9963 int error; 9964 rnode4_t *rp; 9965 9966 ASSERT(cr != NULL); 9967 9968 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 9969 return (EIO); 9970 9971 rp = VTOR4(vp); 9972 if (IS_SHADOW(vp, rp)) 9973 vp = RTOV4(rp); 9974 9975 /* 9976 * XXX - Why should this check be made here? 9977 */ 9978 if (vp->v_flag & VNOMAP) 9979 return (ENOSYS); 9980 9981 if (len == 0 && !(flags & B_INVAL) && 9982 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 9983 return (0); 9984 9985 mutex_enter(&rp->r_statelock); 9986 rp->r_count++; 9987 mutex_exit(&rp->r_statelock); 9988 error = nfs4_putpages(vp, off, len, flags, cr); 9989 mutex_enter(&rp->r_statelock); 9990 rp->r_count--; 9991 cv_broadcast(&rp->r_cv); 9992 mutex_exit(&rp->r_statelock); 9993 9994 return (error); 9995 } 9996 9997 /* 9998 * Write out a single page, possibly klustering adjacent dirty pages. 9999 */ 10000 int 10001 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10002 int flags, cred_t *cr) 10003 { 10004 u_offset_t io_off; 10005 u_offset_t lbn_off; 10006 u_offset_t lbn; 10007 size_t io_len; 10008 uint_t bsize; 10009 int error; 10010 rnode4_t *rp; 10011 10012 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10013 ASSERT(pp != NULL); 10014 ASSERT(cr != NULL); 10015 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10016 10017 rp = VTOR4(vp); 10018 ASSERT(rp->r_count > 0); 10019 ASSERT(!IS_SHADOW(vp, rp)); 10020 10021 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10022 lbn = pp->p_offset / bsize; 10023 lbn_off = lbn * bsize; 10024 10025 /* 10026 * Find a kluster that fits in one block, or in 10027 * one page if pages are bigger than blocks. If 10028 * there is less file space allocated than a whole 10029 * page, we'll shorten the i/o request below. 
10030 */ 10031 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10032 roundup(bsize, PAGESIZE), flags); 10033 10034 /* 10035 * pvn_write_kluster shouldn't have returned a page with offset 10036 * behind the original page we were given. Verify that. 10037 */ 10038 ASSERT((pp->p_offset / bsize) >= lbn); 10039 10040 /* 10041 * Now pp will have the list of kept dirty pages marked for 10042 * write back. It will also handle invalidation and freeing 10043 * of pages that are not dirty. Check for page length rounding 10044 * problems. 10045 */ 10046 if (io_off + io_len > lbn_off + bsize) { 10047 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10048 io_len = lbn_off + bsize - io_off; 10049 } 10050 /* 10051 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10052 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10053 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10054 * progress and the r_size has not been made consistent with the 10055 * new size of the file. When the uiomove() completes the r_size is 10056 * updated and the R4MODINPROGRESS flag is cleared. 10057 * 10058 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10059 * consistent value of r_size. Without this handshaking, it is 10060 * possible that nfs4_bio() picks up the old value of r_size 10061 * before the uiomove() in writerp4() completes. This will result 10062 * in the write through nfs4_bio() being dropped. 10063 * 10064 * More precisely, there is a window between the time the uiomove() 10065 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10066 * operation intervenes in this window, the page will be picked up, 10067 * because it is dirty (it will be unlocked, unless it was 10068 * pagecreate'd). When the page is picked up as dirty, the dirty 10069 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10070 * checked. This will still be the old size. Therefore the page will 10071 * not be written out. 
When segmap_release() calls VOP_PUTPAGE(), 10072 * the page will be found to be clean and the write will be dropped. 10073 */ 10074 if (rp->r_flags & R4MODINPROGRESS) { 10075 mutex_enter(&rp->r_statelock); 10076 if ((rp->r_flags & R4MODINPROGRESS) && 10077 rp->r_modaddr + MAXBSIZE > io_off && 10078 rp->r_modaddr < io_off + io_len) { 10079 page_t *plist; 10080 /* 10081 * A write is in progress for this region of the file. 10082 * If we did not detect R4MODINPROGRESS here then this 10083 * path through nfs_putapage() would eventually go to 10084 * nfs4_bio() and may not write out all of the data 10085 * in the pages. We end up losing data. So we decide 10086 * to set the modified bit on each page in the page 10087 * list and mark the rnode with R4DIRTY. This write 10088 * will be restarted at some later time. 10089 */ 10090 plist = pp; 10091 while (plist != NULL) { 10092 pp = plist; 10093 page_sub(&plist, pp); 10094 hat_setmod(pp); 10095 page_io_unlock(pp); 10096 page_unlock(pp); 10097 } 10098 rp->r_flags |= R4DIRTY; 10099 mutex_exit(&rp->r_statelock); 10100 if (offp) 10101 *offp = io_off; 10102 if (lenp) 10103 *lenp = io_len; 10104 return (0); 10105 } 10106 mutex_exit(&rp->r_statelock); 10107 } 10108 10109 if (flags & B_ASYNC) { 10110 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10111 nfs4_sync_putapage); 10112 } else 10113 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10114 10115 if (offp) 10116 *offp = io_off; 10117 if (lenp) 10118 *lenp = io_len; 10119 return (error); 10120 } 10121 10122 static int 10123 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10124 int flags, cred_t *cr) 10125 { 10126 int error; 10127 rnode4_t *rp; 10128 10129 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10130 10131 flags |= B_WRITE; 10132 10133 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10134 10135 rp = VTOR4(vp); 10136 10137 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10138 error == EACCES) && 
10139 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10140 if (!(rp->r_flags & R4OUTOFSPACE)) { 10141 mutex_enter(&rp->r_statelock); 10142 rp->r_flags |= R4OUTOFSPACE; 10143 mutex_exit(&rp->r_statelock); 10144 } 10145 flags |= B_ERROR; 10146 pvn_write_done(pp, flags); 10147 /* 10148 * If this was not an async thread, then try again to 10149 * write out the pages, but this time, also destroy 10150 * them whether or not the write is successful. This 10151 * will prevent memory from filling up with these 10152 * pages and destroying them is the only alternative 10153 * if they can't be written out. 10154 * 10155 * Don't do this if this is an async thread because 10156 * when the pages are unlocked in pvn_write_done, 10157 * some other thread could have come along, locked 10158 * them, and queued for an async thread. It would be 10159 * possible for all of the async threads to be tied 10160 * up waiting to lock the pages again and they would 10161 * all already be locked and waiting for an async 10162 * thread to handle them. Deadlock. 
10163 */ 10164 if (!(flags & B_ASYNC)) { 10165 error = nfs4_putpage(vp, io_off, io_len, 10166 B_INVAL | B_FORCE, cr); 10167 } 10168 } else { 10169 if (error) 10170 flags |= B_ERROR; 10171 else if (rp->r_flags & R4OUTOFSPACE) { 10172 mutex_enter(&rp->r_statelock); 10173 rp->r_flags &= ~R4OUTOFSPACE; 10174 mutex_exit(&rp->r_statelock); 10175 } 10176 pvn_write_done(pp, flags); 10177 if (freemem < desfree) 10178 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10179 NFS4_WRITE_NOWAIT); 10180 } 10181 10182 return (error); 10183 } 10184 10185 #ifdef DEBUG 10186 int nfs4_force_open_before_mmap = 0; 10187 #endif 10188 10189 static int 10190 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10191 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10192 { 10193 struct segvn_crargs vn_a; 10194 int error = 0; 10195 rnode4_t *rp = VTOR4(vp); 10196 mntinfo4_t *mi = VTOMI4(vp); 10197 10198 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10199 return (EIO); 10200 10201 if (vp->v_flag & VNOMAP) 10202 return (ENOSYS); 10203 10204 if (off < 0 || (off + len) < 0) 10205 return (ENXIO); 10206 10207 if (vp->v_type != VREG) 10208 return (ENODEV); 10209 10210 /* 10211 * If the file is delegated to the client don't do anything. 10212 * If the file is not delegated, then validate the data cache. 10213 */ 10214 mutex_enter(&rp->r_statev4_lock); 10215 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10216 mutex_exit(&rp->r_statev4_lock); 10217 error = nfs4_validate_caches(vp, cr); 10218 if (error) 10219 return (error); 10220 } else { 10221 mutex_exit(&rp->r_statev4_lock); 10222 } 10223 10224 /* 10225 * Check to see if the vnode is currently marked as not cachable. 10226 * This means portions of the file are locked (through VOP_FRLOCK). 10227 * In this case the map request must be refused. We use 10228 * rp->r_lkserlock to avoid a race with concurrent lock requests. 
10229 */ 10230 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 10231 return (EINTR); 10232 10233 if (vp->v_flag & VNOCACHE) { 10234 error = EAGAIN; 10235 goto done; 10236 } 10237 10238 /* 10239 * Don't allow concurrent locks and mapping if mandatory locking is 10240 * enabled. 10241 */ 10242 if (flk_has_remote_locks(vp)) { 10243 struct vattr va; 10244 va.va_mask = AT_MODE; 10245 error = nfs4getattr(vp, &va, cr); 10246 if (error != 0) 10247 goto done; 10248 if (MANDLOCK(vp, va.va_mode)) { 10249 error = EAGAIN; 10250 goto done; 10251 } 10252 } 10253 10254 /* 10255 * It is possible that the rnode has a lost lock request that we 10256 * are still trying to recover, and that the request conflicts with 10257 * this map request. 10258 * 10259 * An alternative approach would be for nfs4_safemap() to consider 10260 * queued lock requests when deciding whether to set or clear 10261 * VNOCACHE. This would require the frlock code path to call 10262 * nfs4_safemap() after enqueing a lost request. 
10263 */ 10264 if (nfs4_map_lost_lock_conflict(vp)) { 10265 error = EAGAIN; 10266 goto done; 10267 } 10268 10269 as_rangelock(as); 10270 if (!(flags & MAP_FIXED)) { 10271 map_addr(addrp, len, off, 1, flags); 10272 if (*addrp == NULL) { 10273 as_rangeunlock(as); 10274 error = ENOMEM; 10275 goto done; 10276 } 10277 } else { 10278 /* 10279 * User specified address - blow away any previous mappings 10280 */ 10281 (void) as_unmap(as, *addrp, len); 10282 } 10283 10284 if (vp->v_type == VREG) { 10285 /* 10286 * We need to retrieve the open stream 10287 */ 10288 nfs4_open_stream_t *osp = NULL; 10289 nfs4_open_owner_t *oop = NULL; 10290 10291 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10292 if (oop != NULL) { 10293 /* returns with 'os_sync_lock' held */ 10294 osp = find_open_stream(oop, rp); 10295 open_owner_rele(oop); 10296 } 10297 if (osp == NULL) { 10298 #ifdef DEBUG 10299 if (nfs4_force_open_before_mmap) { 10300 error = EIO; 10301 goto done; 10302 } 10303 #endif 10304 /* returns with 'os_sync_lock' held */ 10305 error = open_and_get_osp(vp, cr, &osp); 10306 if (osp == NULL) { 10307 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10308 "nfs4_map: we tried to OPEN the file " 10309 "but again no osp, so fail with EIO")); 10310 goto done; 10311 } 10312 } 10313 10314 if (osp->os_failed_reopen) { 10315 mutex_exit(&osp->os_sync_lock); 10316 open_stream_rele(osp, rp); 10317 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10318 "nfs4_map: os_failed_reopen set on " 10319 "osp %p, cr %p, rp %s", (void *)osp, 10320 (void *)cr, rnode4info(rp))); 10321 error = EIO; 10322 goto done; 10323 } 10324 mutex_exit(&osp->os_sync_lock); 10325 open_stream_rele(osp, rp); 10326 } 10327 10328 vn_a.vp = vp; 10329 vn_a.offset = off; 10330 vn_a.type = (flags & MAP_TYPE); 10331 vn_a.prot = (uchar_t)prot; 10332 vn_a.maxprot = (uchar_t)maxprot; 10333 vn_a.flags = (flags & ~MAP_TYPE); 10334 vn_a.cred = cr; 10335 vn_a.amp = NULL; 10336 vn_a.szc = 0; 10337 vn_a.lgrp_mem_policy_flags = 0; 10338 10339 error = 
as_map(as, *addrp, len, segvn_create, &vn_a); 10340 as_rangeunlock(as); 10341 10342 done: 10343 nfs_rw_exit(&rp->r_lkserlock); 10344 return (error); 10345 } 10346 10347 /* 10348 * We're most likely dealing with a kernel module that likes to READ 10349 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10350 * officially OPEN the file to create the necessary client state 10351 * for bookkeeping of os_mmap_read/write counts. 10352 * 10353 * Since VOP_MAP only passes in a pointer to the vnode rather than 10354 * a double pointer, we can't handle the case where nfs4open_otw() 10355 * returns a different vnode than the one passed into VOP_MAP (since 10356 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10357 * we return NULL and let nfs4_map() fail. Note: the only case where 10358 * this should happen is if the file got removed and replaced with the 10359 * same name on the server (in addition to the fact that we're trying 10360 * to VOP_MAP withouth VOP_OPENing the file in the first place). 
10361 */ 10362 static int 10363 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10364 { 10365 rnode4_t *rp, *drp; 10366 vnode_t *dvp, *open_vp; 10367 char file_name[MAXNAMELEN]; 10368 int just_created; 10369 nfs4_open_stream_t *osp; 10370 nfs4_open_owner_t *oop; 10371 int error; 10372 10373 *ospp = NULL; 10374 open_vp = map_vp; 10375 10376 rp = VTOR4(open_vp); 10377 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10378 return (error); 10379 drp = VTOR4(dvp); 10380 10381 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10382 VN_RELE(dvp); 10383 return (EINTR); 10384 } 10385 10386 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10387 nfs_rw_exit(&drp->r_rwlock); 10388 VN_RELE(dvp); 10389 return (error); 10390 } 10391 10392 mutex_enter(&rp->r_statev4_lock); 10393 if (rp->created_v4) { 10394 rp->created_v4 = 0; 10395 mutex_exit(&rp->r_statev4_lock); 10396 10397 dnlc_update(dvp, file_name, open_vp); 10398 /* This is needed so we don't bump the open ref count */ 10399 just_created = 1; 10400 } else { 10401 mutex_exit(&rp->r_statev4_lock); 10402 just_created = 0; 10403 } 10404 10405 VN_HOLD(map_vp); 10406 10407 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10408 just_created); 10409 if (error) { 10410 nfs_rw_exit(&drp->r_rwlock); 10411 VN_RELE(dvp); 10412 VN_RELE(map_vp); 10413 return (error); 10414 } 10415 10416 nfs_rw_exit(&drp->r_rwlock); 10417 VN_RELE(dvp); 10418 10419 /* 10420 * If nfs4open_otw() returned a different vnode then "undo" 10421 * the open and return failure to the caller. 10422 */ 10423 if (!VN_CMP(open_vp, map_vp)) { 10424 nfs4_error_t e; 10425 10426 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10427 "open returned a different vnode")); 10428 /* 10429 * If there's an error, ignore it, 10430 * and let VOP_INACTIVE handle it. 
10431 */ 10432 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10433 CLOSE_NORM, 0, 0, 0); 10434 VN_RELE(map_vp); 10435 return (EIO); 10436 } 10437 10438 VN_RELE(map_vp); 10439 10440 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10441 if (!oop) { 10442 nfs4_error_t e; 10443 10444 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10445 "no open owner")); 10446 /* 10447 * If there's an error, ignore it, 10448 * and let VOP_INACTIVE handle it. 10449 */ 10450 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10451 CLOSE_NORM, 0, 0, 0); 10452 return (EIO); 10453 } 10454 osp = find_open_stream(oop, rp); 10455 open_owner_rele(oop); 10456 *ospp = osp; 10457 return (0); 10458 } 10459 10460 /* 10461 * Please be aware that when this function is called, the address space write 10462 * a_lock is held. Do not put over the wire calls in this function. 10463 */ 10464 /* ARGSUSED */ 10465 static int 10466 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10467 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10468 { 10469 rnode4_t *rp; 10470 int error = 0; 10471 mntinfo4_t *mi; 10472 10473 mi = VTOMI4(vp); 10474 rp = VTOR4(vp); 10475 10476 if (nfs_zone() != mi->mi_zone) 10477 return (EIO); 10478 if (vp->v_flag & VNOMAP) 10479 return (ENOSYS); 10480 10481 /* 10482 * Need to hold rwlock while incrementing the mapcnt so that 10483 * mmap'ing can be serialized with writes so that the caching 10484 * can be handled correctly. 10485 * 10486 * Don't need to update the open stream first, since this 10487 * mmap can't add any additional share access that isn't 10488 * already contained in the open stream (for the case where we 10489 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10490 * take into account os_mmap_read[write] counts). 
10491 */ 10492 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10493 return (EINTR); 10494 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10495 nfs_rw_exit(&rp->r_rwlock); 10496 10497 if (vp->v_type == VREG) { 10498 /* 10499 * We need to retrieve the open stream and update the counts. 10500 * If there is no open stream here, something is wrong. 10501 */ 10502 nfs4_open_stream_t *osp = NULL; 10503 nfs4_open_owner_t *oop = NULL; 10504 10505 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10506 if (oop != NULL) { 10507 /* returns with 'os_sync_lock' held */ 10508 osp = find_open_stream(oop, rp); 10509 open_owner_rele(oop); 10510 } 10511 if (osp == NULL) { 10512 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10513 "nfs4_addmap: we should have an osp" 10514 "but we don't, so fail with EIO")); 10515 error = EIO; 10516 goto out; 10517 } 10518 10519 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10520 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10521 10522 /* 10523 * Update the map count in the open stream. 10524 * This is necessary in the case where we 10525 * open/mmap/close/, then the server reboots, and we 10526 * attempt to reopen. If the mmap doesn't add share 10527 * access then we send an invalid reopen with 10528 * access = NONE. 10529 * 10530 * We need to specifically check each PROT_* so a mmap 10531 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10532 * read and write access. A simple comparison of prot 10533 * to ~PROT_WRITE to determine read access is insufficient 10534 * since prot can be |= with PROT_USER, etc. 
10535 */ 10536 10537 /* 10538 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10539 */ 10540 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10541 osp->os_mmap_write += btopr(len); 10542 if (maxprot & PROT_READ) 10543 osp->os_mmap_read += btopr(len); 10544 if (maxprot & PROT_EXEC) 10545 osp->os_mmap_read += btopr(len); 10546 /* 10547 * Ensure that os_mmap_read gets incremented, even if 10548 * maxprot were to look like PROT_NONE. 10549 */ 10550 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10551 !(maxprot & PROT_EXEC)) 10552 osp->os_mmap_read += btopr(len); 10553 osp->os_mapcnt += btopr(len); 10554 mutex_exit(&osp->os_sync_lock); 10555 open_stream_rele(osp, rp); 10556 } 10557 10558 out: 10559 /* 10560 * If we got an error, then undo our 10561 * incrementing of 'r_mapcnt'. 10562 */ 10563 10564 if (error) { 10565 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10566 ASSERT(rp->r_mapcnt >= 0); 10567 } 10568 return (error); 10569 } 10570 10571 static int 10572 nfs4_cmp(vnode_t *vp1, vnode_t *vp2) 10573 { 10574 10575 return (VTOR4(vp1) == VTOR4(vp2)); 10576 } 10577 10578 static int 10579 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10580 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 10581 { 10582 int rc; 10583 u_offset_t start, end; 10584 rnode4_t *rp; 10585 int error = 0, intr = INTR4(vp); 10586 nfs4_error_t e; 10587 10588 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10589 return (EIO); 10590 10591 /* check for valid cmd parameter */ 10592 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10593 return (EINVAL); 10594 10595 /* Verify l_type. 
*/ 10596 switch (bfp->l_type) { 10597 case F_RDLCK: 10598 if (cmd != F_GETLK && !(flag & FREAD)) 10599 return (EBADF); 10600 break; 10601 case F_WRLCK: 10602 if (cmd != F_GETLK && !(flag & FWRITE)) 10603 return (EBADF); 10604 break; 10605 case F_UNLCK: 10606 intr = 0; 10607 break; 10608 10609 default: 10610 return (EINVAL); 10611 } 10612 10613 /* check the validity of the lock range */ 10614 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10615 return (rc); 10616 if (rc = flk_check_lock_data(start, end, MAXEND)) 10617 return (rc); 10618 10619 /* 10620 * If the filesystem is mounted using local locking, pass the 10621 * request off to the local locking code. 10622 */ 10623 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10624 if (cmd == F_SETLK || cmd == F_SETLKW) { 10625 /* 10626 * For complete safety, we should be holding 10627 * r_lkserlock. However, we can't call 10628 * nfs4_safelock and then fs_frlock while 10629 * holding r_lkserlock, so just invoke 10630 * nfs4_safelock and expect that this will 10631 * catch enough of the cases. 10632 */ 10633 if (!nfs4_safelock(vp, bfp, cr)) 10634 return (EAGAIN); 10635 } 10636 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 10637 } 10638 10639 rp = VTOR4(vp); 10640 10641 /* 10642 * Check whether the given lock request can proceed, given the 10643 * current file mappings. 10644 */ 10645 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10646 return (EINTR); 10647 if (cmd == F_SETLK || cmd == F_SETLKW) { 10648 if (!nfs4_safelock(vp, bfp, cr)) { 10649 rc = EAGAIN; 10650 goto done; 10651 } 10652 } 10653 10654 /* 10655 * Flush the cache after waiting for async I/O to finish. For new 10656 * locks, this is so that the process gets the latest bits from the 10657 * server. For unlocks, this is so that other clients see the 10658 * latest bits once the file has been unlocked. If currently dirty 10659 * pages can't be flushed, then don't allow a lock to be set. 
But 10660 * allow unlocks to succeed, to avoid having orphan locks on the 10661 * server. 10662 */ 10663 if (cmd != F_GETLK) { 10664 mutex_enter(&rp->r_statelock); 10665 while (rp->r_count > 0) { 10666 if (intr) { 10667 klwp_t *lwp = ttolwp(curthread); 10668 10669 if (lwp != NULL) 10670 lwp->lwp_nostop++; 10671 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 10672 if (lwp != NULL) 10673 lwp->lwp_nostop--; 10674 rc = EINTR; 10675 break; 10676 } 10677 if (lwp != NULL) 10678 lwp->lwp_nostop--; 10679 } else 10680 cv_wait(&rp->r_cv, &rp->r_statelock); 10681 } 10682 mutex_exit(&rp->r_statelock); 10683 if (rc != 0) 10684 goto done; 10685 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 10686 if (error) { 10687 if (error == ENOSPC || error == EDQUOT) { 10688 mutex_enter(&rp->r_statelock); 10689 if (!rp->r_error) 10690 rp->r_error = error; 10691 mutex_exit(&rp->r_statelock); 10692 } 10693 if (bfp->l_type != F_UNLCK) { 10694 rc = ENOLCK; 10695 goto done; 10696 } 10697 } 10698 } 10699 10700 /* 10701 * Call the lock manager to do the real work of contacting 10702 * the server and obtaining the lock. 10703 */ 10704 10705 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10706 cr, &e, NULL, NULL); 10707 rc = e.error; 10708 10709 if (rc == 0) 10710 nfs4_lockcompletion(vp, cmd); 10711 10712 done: 10713 nfs_rw_exit(&rp->r_lkserlock); 10714 10715 return (rc); 10716 } 10717 10718 /* 10719 * Free storage space associated with the specified vnode. The portion 10720 * to be freed is specified by bfp->l_start and bfp->l_len (already 10721 * normalized to a "whence" of 0). 10722 * 10723 * This is an experimental facility whose continued existence is not 10724 * guaranteed. Currently, we only support the special case 10725 * of l_len == 0, meaning free to end of file. 
10726 */ 10727 /* ARGSUSED */ 10728 static int 10729 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10730 offset_t offset, cred_t *cr, caller_context_t *ct) 10731 { 10732 int error; 10733 10734 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10735 return (EIO); 10736 ASSERT(vp->v_type == VREG); 10737 if (cmd != F_FREESP) 10738 return (EINVAL); 10739 10740 error = convoff(vp, bfp, 0, offset); 10741 if (!error) { 10742 ASSERT(bfp->l_start >= 0); 10743 if (bfp->l_len == 0) { 10744 struct vattr va; 10745 10746 va.va_mask = AT_SIZE; 10747 va.va_size = bfp->l_start; 10748 error = nfs4setattr(vp, &va, 0, cr, NULL); 10749 } else 10750 error = EINVAL; 10751 } 10752 10753 return (error); 10754 } 10755 10756 /* ARGSUSED */ 10757 static int 10758 nfs4_realvp(vnode_t *vp, vnode_t **vpp) 10759 { 10760 return (EINVAL); 10761 } 10762 10763 /* 10764 * Setup and add an address space callback to do the work of the delmap call. 10765 * The callback will (and must be) deleted in the actual callback function. 10766 * 10767 * This is done in order to take care of the problem that we have with holding 10768 * the address space's a_lock for a long period of time (e.g. if the NFS server 10769 * is down). Callbacks will be executed in the address space code while the 10770 * a_lock is not held. Holding the address space's a_lock causes things such 10771 * as ps and fork to hang because they are trying to acquire this lock as well. 10772 */ 10773 /* ARGSUSED */ 10774 static int 10775 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10776 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 10777 { 10778 int caller_found; 10779 int error; 10780 rnode4_t *rp; 10781 nfs4_delmap_args_t *dmapp; 10782 nfs4_delmapcall_t *delmap_call; 10783 10784 if (vp->v_flag & VNOMAP) 10785 return (ENOSYS); 10786 10787 /* 10788 * A process may not change zones if it has NFS pages mmap'ed 10789 * in, so we can't legitimately get here from the wrong zone. 
10790 */ 10791 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10792 10793 rp = VTOR4(vp); 10794 10795 /* 10796 * The way that the address space of this process deletes its mapping 10797 * of this file is via the following call chains: 10798 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10799 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10800 * 10801 * With the use of address space callbacks we are allowed to drop the 10802 * address space lock, a_lock, while executing the NFS operations that 10803 * need to go over the wire. Returning EAGAIN to the caller of this 10804 * function is what drives the execution of the callback that we add 10805 * below. The callback will be executed by the address space code 10806 * after dropping the a_lock. When the callback is finished, since 10807 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 10808 * is called again on the same segment to finish the rest of the work 10809 * that needs to happen during unmapping. 10810 * 10811 * This action of calling back into the segment driver causes 10812 * nfs4_delmap() to get called again, but since the callback was 10813 * already executed at this point, it already did the work and there 10814 * is nothing left for us to do. 10815 * 10816 * To Summarize: 10817 * - The first time nfs4_delmap is called by the current thread is when 10818 * we add the caller associated with this delmap to the delmap caller 10819 * list, add the callback, and return EAGAIN. 10820 * - The second time in this call chain when nfs4_delmap is called we 10821 * will find this caller in the delmap caller list and realize there 10822 * is no more work to do thus removing this caller from the list and 10823 * returning the error that was set in the callback execution. 10824 */ 10825 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 10826 if (caller_found) { 10827 /* 10828 * 'error' is from the actual delmap operations. 
To avoid 10829 * hangs, we need to handle the return of EAGAIN differently 10830 * since this is what drives the callback execution. 10831 * In this case, we don't want to return EAGAIN and do the 10832 * callback execution because there are none to execute. 10833 */ 10834 if (error == EAGAIN) 10835 return (0); 10836 else 10837 return (error); 10838 } 10839 10840 /* current caller was not in the list */ 10841 delmap_call = nfs4_init_delmapcall(); 10842 10843 mutex_enter(&rp->r_statelock); 10844 list_insert_tail(&rp->r_indelmap, delmap_call); 10845 mutex_exit(&rp->r_statelock); 10846 10847 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 10848 10849 dmapp->vp = vp; 10850 dmapp->off = off; 10851 dmapp->addr = addr; 10852 dmapp->len = len; 10853 dmapp->prot = prot; 10854 dmapp->maxprot = maxprot; 10855 dmapp->flags = flags; 10856 dmapp->cr = cr; 10857 dmapp->caller = delmap_call; 10858 10859 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 10860 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 10861 10862 return (error ? error : EAGAIN); 10863 } 10864 10865 static nfs4_delmapcall_t * 10866 nfs4_init_delmapcall() 10867 { 10868 nfs4_delmapcall_t *delmap_call; 10869 10870 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 10871 delmap_call->call_id = curthread; 10872 delmap_call->error = 0; 10873 10874 return (delmap_call); 10875 } 10876 10877 static void 10878 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 10879 { 10880 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 10881 } 10882 10883 /* 10884 * Searches for the current delmap caller (based on curthread) in the list of 10885 * callers. If it is found, we remove it and free the delmap caller. 10886 * Returns: 10887 * 0 if the caller wasn't found 10888 * 1 if the caller was found, removed and freed. *errp will be set 10889 * to what the result of the delmap was. 
10890 */ 10891 static int 10892 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 10893 { 10894 nfs4_delmapcall_t *delmap_call; 10895 10896 /* 10897 * If the list doesn't exist yet, we create it and return 10898 * that the caller wasn't found. No list = no callers. 10899 */ 10900 mutex_enter(&rp->r_statelock); 10901 if (!(rp->r_flags & R4DELMAPLIST)) { 10902 /* The list does not exist */ 10903 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 10904 offsetof(nfs4_delmapcall_t, call_node)); 10905 rp->r_flags |= R4DELMAPLIST; 10906 mutex_exit(&rp->r_statelock); 10907 return (0); 10908 } else { 10909 /* The list exists so search it */ 10910 for (delmap_call = list_head(&rp->r_indelmap); 10911 delmap_call != NULL; 10912 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 10913 if (delmap_call->call_id == curthread) { 10914 /* current caller is in the list */ 10915 *errp = delmap_call->error; 10916 list_remove(&rp->r_indelmap, delmap_call); 10917 mutex_exit(&rp->r_statelock); 10918 nfs4_free_delmapcall(delmap_call); 10919 return (1); 10920 } 10921 } 10922 } 10923 mutex_exit(&rp->r_statelock); 10924 return (0); 10925 } 10926 10927 /* 10928 * Remove some pages from an mmap'd vnode. Just update the 10929 * count of pages. If doing close-to-open, then flush and 10930 * commit all of the pages associated with this file. 10931 * Otherwise, start an asynchronous page flush to write out 10932 * any dirty pages. This will also associate a credential 10933 * with the rnode which can be used to write the pages. 
10934 */ 10935 /* ARGSUSED */ 10936 static void 10937 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 10938 { 10939 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 10940 rnode4_t *rp; 10941 mntinfo4_t *mi; 10942 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 10943 10944 rp = VTOR4(dmapp->vp); 10945 mi = VTOMI4(dmapp->vp); 10946 10947 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 10948 ASSERT(rp->r_mapcnt >= 0); 10949 10950 /* 10951 * Initiate a page flush and potential commit if there are 10952 * pages, the file system was not mounted readonly, the segment 10953 * was mapped shared, and the pages themselves were writeable. 10954 */ 10955 if (nfs4_has_pages(dmapp->vp) && 10956 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 10957 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 10958 mutex_enter(&rp->r_statelock); 10959 rp->r_flags |= R4DIRTY; 10960 mutex_exit(&rp->r_statelock); 10961 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 10962 dmapp->len, dmapp->cr); 10963 if (!e.error) { 10964 mutex_enter(&rp->r_statelock); 10965 e.error = rp->r_error; 10966 rp->r_error = 0; 10967 mutex_exit(&rp->r_statelock); 10968 } 10969 } else 10970 e.error = 0; 10971 10972 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 10973 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 10974 B_INVAL, dmapp->cr); 10975 10976 if (e.error) { 10977 e.stat = puterrno4(e.error); 10978 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 10979 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 10980 dmapp->caller->error = e.error; 10981 } 10982 10983 /* Check to see if we need to close the file */ 10984 10985 if (dmapp->vp->v_type == VREG) { 10986 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 10987 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 10988 10989 if (e.error != 0 || e.stat != NFS4_OK) { 10990 /* 10991 * Since it is possible that e.error == 0 and 10992 * e.stat != NFS4_OK (and vice versa), 10993 * we do the proper 
checking in order to get both 10994 * e.error and e.stat reporting the correct info. 10995 */ 10996 if (e.stat == NFS4_OK) 10997 e.stat = puterrno4(e.error); 10998 if (e.error == 0) 10999 e.error = geterrno4(e.stat); 11000 11001 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11002 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11003 dmapp->caller->error = e.error; 11004 } 11005 } 11006 11007 (void) as_delete_callback(as, arg); 11008 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11009 } 11010 11011 11012 static uint_t 11013 fattr4_maxfilesize_to_bits(uint64_t ll) 11014 { 11015 uint_t l = 1; 11016 11017 if (ll == 0) { 11018 return (0); 11019 } 11020 11021 if (ll & 0xffffffff00000000) { 11022 l += 32; ll >>= 32; 11023 } 11024 if (ll & 0xffff0000) { 11025 l += 16; ll >>= 16; 11026 } 11027 if (ll & 0xff00) { 11028 l += 8; ll >>= 8; 11029 } 11030 if (ll & 0xf0) { 11031 l += 4; ll >>= 4; 11032 } 11033 if (ll & 0xc) { 11034 l += 2; ll >>= 2; 11035 } 11036 if (ll & 0x2) { 11037 l += 1; 11038 } 11039 return (l); 11040 } 11041 11042 static int 11043 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 11044 { 11045 int error; 11046 hrtime_t t; 11047 rnode4_t *rp; 11048 nfs4_ga_res_t gar; 11049 nfs4_ga_ext_res_t ger; 11050 11051 gar.n4g_ext_res = &ger; 11052 11053 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11054 return (EIO); 11055 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11056 *valp = MAXPATHLEN; 11057 return (0); 11058 } 11059 if (cmd == _PC_ACL_ENABLED) { 11060 *valp = _ACL_ACE_ENABLED; 11061 return (0); 11062 } 11063 11064 rp = VTOR4(vp); 11065 if (cmd == _PC_XATTR_EXISTS) { 11066 /* 11067 * Eventually should attempt small client readdir before 11068 * going otw with GETATTR(FATTR4_NAMED_ATTR). For now 11069 * just drive the OTW getattr. This is required because 11070 * _PC_XATTR_EXISTS can only return true if attributes 11071 * exist -- simply checking for existance of the attrdir 11072 * is not sufficient. 
11073 * 11074 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11075 * is NULL. Once the xadir vp exists, we can create xattrs, 11076 * and we don't have any way to update the "base" object's 11077 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11078 * could help out. 11079 */ 11080 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11081 rp->r_xattr_dir == NULL) { 11082 *valp = rp->r_pathconf.pc4_xattr_exists; 11083 return (0); 11084 } 11085 } else { /* OLD CODE */ 11086 if (ATTRCACHE4_VALID(vp)) { 11087 mutex_enter(&rp->r_statelock); 11088 if (rp->r_pathconf.pc4_cache_valid) { 11089 error = 0; 11090 switch (cmd) { 11091 case _PC_FILESIZEBITS: 11092 *valp = 11093 rp->r_pathconf.pc4_filesizebits; 11094 break; 11095 case _PC_LINK_MAX: 11096 *valp = 11097 rp->r_pathconf.pc4_link_max; 11098 break; 11099 case _PC_NAME_MAX: 11100 *valp = 11101 rp->r_pathconf.pc4_name_max; 11102 break; 11103 case _PC_CHOWN_RESTRICTED: 11104 *valp = 11105 rp->r_pathconf.pc4_chown_restricted; 11106 break; 11107 case _PC_NO_TRUNC: 11108 *valp = 11109 rp->r_pathconf.pc4_no_trunc; 11110 break; 11111 default: 11112 error = EINVAL; 11113 break; 11114 } 11115 mutex_exit(&rp->r_statelock); 11116 #ifdef DEBUG 11117 nfs4_pathconf_cache_hits++; 11118 #endif 11119 return (error); 11120 } 11121 mutex_exit(&rp->r_statelock); 11122 } 11123 } 11124 #ifdef DEBUG 11125 nfs4_pathconf_cache_misses++; 11126 #endif 11127 11128 t = gethrtime(); 11129 11130 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11131 11132 if (error) { 11133 mutex_enter(&rp->r_statelock); 11134 rp->r_pathconf.pc4_cache_valid = FALSE; 11135 rp->r_pathconf.pc4_xattr_valid = FALSE; 11136 mutex_exit(&rp->r_statelock); 11137 return (error); 11138 } 11139 11140 /* interpret the max filesize */ 11141 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11142 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11143 11144 /* Store the attributes we just received */ 11145 nfs4_attr_cache(vp, 
&gar, t, cr, TRUE, NULL); 11146 11147 switch (cmd) { 11148 case _PC_FILESIZEBITS: 11149 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11150 break; 11151 case _PC_LINK_MAX: 11152 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11153 break; 11154 case _PC_NAME_MAX: 11155 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11156 break; 11157 case _PC_CHOWN_RESTRICTED: 11158 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11159 break; 11160 case _PC_NO_TRUNC: 11161 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11162 break; 11163 case _PC_XATTR_EXISTS: 11164 *valp = gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists; 11165 break; 11166 default: 11167 return (EINVAL); 11168 } 11169 11170 return (0); 11171 } 11172 11173 /* 11174 * Called by async thread to do synchronous pageio. Do the i/o, wait 11175 * for it to complete, and cleanup the page list when done. 11176 */ 11177 static int 11178 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11179 int flags, cred_t *cr) 11180 { 11181 int error; 11182 11183 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11184 11185 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11186 if (flags & B_READ) 11187 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11188 else 11189 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11190 return (error); 11191 } 11192 11193 static int 11194 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11195 int flags, cred_t *cr) 11196 { 11197 int error; 11198 rnode4_t *rp; 11199 11200 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11201 return (EIO); 11202 11203 if (pp == NULL) 11204 return (EINVAL); 11205 11206 rp = VTOR4(vp); 11207 mutex_enter(&rp->r_statelock); 11208 rp->r_count++; 11209 mutex_exit(&rp->r_statelock); 11210 11211 if (flags & B_ASYNC) { 11212 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11213 nfs4_sync_pageio); 11214 } else 11215 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11216 mutex_enter(&rp->r_statelock); 11217 rp->r_count--; 11218 cv_broadcast(&rp->r_cv); 11219 mutex_exit(&rp->r_statelock); 11220 return (error); 11221 } 11222 11223 static void 11224 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) 11225 { 11226 int error; 11227 rnode4_t *rp; 11228 page_t *plist; 11229 page_t *pptr; 11230 offset3 offset; 11231 count3 len; 11232 k_sigset_t smask; 11233 11234 /* 11235 * We should get called with fl equal to either B_FREE or 11236 * B_INVAL. Any other value is illegal. 11237 * 11238 * The page that we are either supposed to free or destroy 11239 * should be exclusive locked and its io lock should not 11240 * be held. 11241 */ 11242 ASSERT(fl == B_FREE || fl == B_INVAL); 11243 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11244 11245 rp = VTOR4(vp); 11246 11247 /* 11248 * If the page doesn't need to be committed or we shouldn't 11249 * even bother attempting to commit it, then just make sure 11250 * that the p_fsdata byte is clear and then either free or 11251 * destroy the page as appropriate. 
11252 */ 11253 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11254 pp->p_fsdata = C_NOCOMMIT; 11255 if (fl == B_FREE) 11256 page_free(pp, dn); 11257 else 11258 page_destroy(pp, dn); 11259 return; 11260 } 11261 11262 /* 11263 * If there is a page invalidation operation going on, then 11264 * if this is one of the pages being destroyed, then just 11265 * clear the p_fsdata byte and then either free or destroy 11266 * the page as appropriate. 11267 */ 11268 mutex_enter(&rp->r_statelock); 11269 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11270 mutex_exit(&rp->r_statelock); 11271 pp->p_fsdata = C_NOCOMMIT; 11272 if (fl == B_FREE) 11273 page_free(pp, dn); 11274 else 11275 page_destroy(pp, dn); 11276 return; 11277 } 11278 11279 /* 11280 * If we are freeing this page and someone else is already 11281 * waiting to do a commit, then just unlock the page and 11282 * return. That other thread will take care of commiting 11283 * this page. The page can be freed sometime after the 11284 * commit has finished. Otherwise, if the page is marked 11285 * as delay commit, then we may be getting called from 11286 * pvn_write_done, one page at a time. This could result 11287 * in one commit per page, so we end up doing lots of small 11288 * commits instead of fewer larger commits. This is bad, 11289 * we want do as few commits as possible. 11290 */ 11291 if (fl == B_FREE) { 11292 if (rp->r_flags & R4COMMITWAIT) { 11293 page_unlock(pp); 11294 mutex_exit(&rp->r_statelock); 11295 return; 11296 } 11297 if (pp->p_fsdata == C_DELAYCOMMIT) { 11298 pp->p_fsdata = C_COMMIT; 11299 page_unlock(pp); 11300 mutex_exit(&rp->r_statelock); 11301 return; 11302 } 11303 } 11304 11305 /* 11306 * Check to see if there is a signal which would prevent an 11307 * attempt to commit the pages from being successful. If so, 11308 * then don't bother with all of the work to gather pages and 11309 * generate the unsuccessful RPC. 
Just return from here and 11310 * let the page be committed at some later time. 11311 */ 11312 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11313 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11314 sigunintr(&smask); 11315 page_unlock(pp); 11316 mutex_exit(&rp->r_statelock); 11317 return; 11318 } 11319 sigunintr(&smask); 11320 11321 /* 11322 * We are starting to need to commit pages, so let's try 11323 * to commit as many as possible at once to reduce the 11324 * overhead. 11325 * 11326 * Set the `commit inprogress' state bit. We must 11327 * first wait until any current one finishes. Then 11328 * we initialize the c_pages list with this page. 11329 */ 11330 while (rp->r_flags & R4COMMIT) { 11331 rp->r_flags |= R4COMMITWAIT; 11332 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11333 rp->r_flags &= ~R4COMMITWAIT; 11334 } 11335 rp->r_flags |= R4COMMIT; 11336 mutex_exit(&rp->r_statelock); 11337 ASSERT(rp->r_commit.c_pages == NULL); 11338 rp->r_commit.c_pages = pp; 11339 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11340 rp->r_commit.c_commlen = PAGESIZE; 11341 11342 /* 11343 * Gather together all other pages which can be committed. 11344 * They will all be chained off r_commit.c_pages. 11345 */ 11346 nfs4_get_commit(vp); 11347 11348 /* 11349 * Clear the `commit inprogress' status and disconnect 11350 * the list of pages to be committed from the rnode. 11351 * At this same time, we also save the starting offset 11352 * and length of data to be committed on the server. 
11353 */ 11354 plist = rp->r_commit.c_pages; 11355 rp->r_commit.c_pages = NULL; 11356 offset = rp->r_commit.c_commbase; 11357 len = rp->r_commit.c_commlen; 11358 mutex_enter(&rp->r_statelock); 11359 rp->r_flags &= ~R4COMMIT; 11360 cv_broadcast(&rp->r_commit.c_cv); 11361 mutex_exit(&rp->r_statelock); 11362 11363 if (curproc == proc_pageout || curproc == proc_fsflush || 11364 nfs_zone() != VTOMI4(vp)->mi_zone) { 11365 nfs4_async_commit(vp, plist, offset, len, 11366 cr, do_nfs4_async_commit); 11367 return; 11368 } 11369 11370 /* 11371 * Actually generate the COMMIT op over the wire operation. 11372 */ 11373 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11374 11375 /* 11376 * If we got an error during the commit, just unlock all 11377 * of the pages. The pages will get retransmitted to the 11378 * server during a putpage operation. 11379 */ 11380 if (error) { 11381 while (plist != NULL) { 11382 pptr = plist; 11383 page_sub(&plist, pptr); 11384 page_unlock(pptr); 11385 } 11386 return; 11387 } 11388 11389 /* 11390 * We've tried as hard as we can to commit the data to stable 11391 * storage on the server. We just unlock the rest of the pages 11392 * and clear the commit required state. They will be put 11393 * onto the tail of the cachelist if they are nolonger 11394 * mapped. 11395 */ 11396 while (plist != pp) { 11397 pptr = plist; 11398 page_sub(&plist, pptr); 11399 pptr->p_fsdata = C_NOCOMMIT; 11400 page_unlock(pptr); 11401 } 11402 11403 /* 11404 * It is possible that nfs4_commit didn't return error but 11405 * some other thread has modified the page we are going 11406 * to free/destroy. 11407 * In this case we need to rewrite the page. Do an explicit check 11408 * before attempting to free/destroy the page. If modified, needs to 11409 * be rewritten so unlock the page and return. 
/*
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *	PUTFH(file), COMMIT
 *
 * Sends a COMMIT for (offset, count) on vp over the wire.
 *
 * Returns:
 *	0			the verifier returned by the server matches
 *				rp->r_writeverf
 *	NFS_VERF_MISMATCH	the verifier differs; nfs4_set_mod() has been
 *				called on vp and the new verifier has been
 *				stored in rp->r_writeverf
 *	other errno		nfs4_start_fop() failed, the call failed, or
 *				the server returned an error status (which is
 *				also recorded in rp->r_error if that was
 *				still clear)
 *
 * The call is retried with another over-the-wire credential when it fails
 * with EACCES and nfs4_get_otw_cred_by_osp() has not yet flagged the last
 * attempt, and it is retried after recovery has been started.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		/* never got started: drop the holds taken above and bail */
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh of the file being committed (its filehandle is rp->r_fh) */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		/*
		 * The call itself failed and recovery will not help.
		 * On EACCES try again with the next credential, unless
		 * this was already the last one.
		 */
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL) == FALSE) {
			/*
			 * Retry the operation.  The results are only
			 * freed when the call itself did not fail
			 * (e.error == 0).
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			/* the server refused this cred; try the next one */
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			/* mark the rnode stale and keep the first error seen */
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			/* keep the first error seen on this rnode */
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* verifier unchanged: the commit is good */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * The verifier changed.  nfs4_set_mod() marks the pages
		 * that still had commit state as modified; then remember
		 * the new verifier and report the mismatch to the caller.
		 * Note that nfs4_set_mod() runs with r_statelock held.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	/* common exit: free the results and drop every hold we took */
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
shadow */ 11597 11598 rp = VTOR4(vp); 11599 if (IS_SHADOW(vp, rp)) 11600 vp = RTOV4(rp); 11601 11602 vphm = page_vnode_mutex(vp); 11603 mutex_enter(vphm); 11604 /* 11605 * If there are no pages associated with this vnode, then 11606 * just return. 11607 */ 11608 if ((pp = vp->v_pages) == NULL) { 11609 mutex_exit(vphm); 11610 return; 11611 } 11612 11613 do { 11614 if (pp->p_fsdata != C_NOCOMMIT) { 11615 hat_setmod(pp); 11616 pp->p_fsdata = C_NOCOMMIT; 11617 } 11618 } while ((pp = pp->p_vpnext) != vp->v_pages); 11619 mutex_exit(vphm); 11620 } 11621 11622 /* 11623 * This function is used to gather a page list of the pages which 11624 * can be committed on the server. 11625 * 11626 * The calling thread must have set R4COMMIT. This bit is used to 11627 * serialize access to the commit structure in the rnode. As long 11628 * as the thread has set R4COMMIT, then it can manipulate the commit 11629 * structure without requiring any other locks. 11630 * 11631 * When this function is called from nfs4_dispose() the page passed 11632 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11633 * will skip it. This is not a problem since we initially add the 11634 * page to the r_commit page list. 11635 * 11636 */ 11637 static void 11638 nfs4_get_commit(vnode_t *vp) 11639 { 11640 rnode4_t *rp; 11641 page_t *pp; 11642 kmutex_t *vphm; 11643 11644 rp = VTOR4(vp); 11645 11646 ASSERT(rp->r_flags & R4COMMIT); 11647 11648 /* make sure we're looking at the master vnode, not a shadow */ 11649 11650 if (IS_SHADOW(vp, rp)) 11651 vp = RTOV4(rp); 11652 11653 vphm = page_vnode_mutex(vp); 11654 mutex_enter(vphm); 11655 11656 /* 11657 * If there are no pages associated with this vnode, then 11658 * just return. 11659 */ 11660 if ((pp = vp->v_pages) == NULL) { 11661 mutex_exit(vphm); 11662 return; 11663 } 11664 11665 /* 11666 * Step through all of the pages associated with this vnode 11667 * looking for pages which need to be committed. 
11668 */ 11669 do { 11670 /* 11671 * First short-cut everything (without the page_lock) 11672 * and see if this page does not need to be committed 11673 * or is modified if so then we'll just skip it. 11674 */ 11675 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11676 continue; 11677 11678 /* 11679 * Attempt to lock the page. If we can't, then 11680 * someone else is messing with it or we have been 11681 * called from nfs4_dispose and this is the page that 11682 * nfs4_dispose was called with.. anyway just skip it. 11683 */ 11684 if (!page_trylock(pp, SE_EXCL)) 11685 continue; 11686 11687 /* 11688 * Lets check again now that we have the page lock. 11689 */ 11690 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11691 page_unlock(pp); 11692 continue; 11693 } 11694 11695 /* this had better not be a free page */ 11696 ASSERT(PP_ISFREE(pp) == 0); 11697 11698 /* 11699 * The page needs to be committed and we locked it. 11700 * Update the base and length parameters and add it 11701 * to r_pages. 11702 */ 11703 if (rp->r_commit.c_pages == NULL) { 11704 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11705 rp->r_commit.c_commlen = PAGESIZE; 11706 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11707 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11708 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11709 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11710 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11711 <= pp->p_offset) { 11712 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11713 rp->r_commit.c_commbase + PAGESIZE; 11714 } 11715 page_add(&rp->r_commit.c_pages, pp); 11716 } while ((pp = pp->p_vpnext) != vp->v_pages); 11717 11718 mutex_exit(vphm); 11719 } 11720 11721 /* 11722 * This routine is used to gather together a page list of the pages 11723 * which are to be committed on the server. This routine must not 11724 * be called if the calling thread holds any locked pages. 11725 * 11726 * The calling thread must have set R4COMMIT. 
This bit is used to 11727 * serialize access to the commit structure in the rnode. As long 11728 * as the thread has set R4COMMIT, then it can manipulate the commit 11729 * structure without requiring any other locks. 11730 */ 11731 static void 11732 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11733 { 11734 11735 rnode4_t *rp; 11736 page_t *pp; 11737 u_offset_t end; 11738 u_offset_t off; 11739 ASSERT(len != 0); 11740 rp = VTOR4(vp); 11741 ASSERT(rp->r_flags & R4COMMIT); 11742 11743 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11744 11745 /* make sure we're looking at the master vnode, not a shadow */ 11746 11747 if (IS_SHADOW(vp, rp)) 11748 vp = RTOV4(rp); 11749 11750 /* 11751 * If there are no pages associated with this vnode, then 11752 * just return. 11753 */ 11754 if ((pp = vp->v_pages) == NULL) 11755 return; 11756 /* 11757 * Calculate the ending offset. 11758 */ 11759 end = soff + len; 11760 for (off = soff; off < end; off += PAGESIZE) { 11761 /* 11762 * Lookup each page by vp, offset. 11763 */ 11764 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11765 continue; 11766 /* 11767 * If this page does not need to be committed or is 11768 * modified, then just skip it. 11769 */ 11770 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11771 page_unlock(pp); 11772 continue; 11773 } 11774 11775 ASSERT(PP_ISFREE(pp) == 0); 11776 /* 11777 * The page needs to be committed and we locked it. 11778 * Update the base and length parameters and add it 11779 * to r_pages. 11780 */ 11781 if (rp->r_commit.c_pages == NULL) { 11782 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11783 rp->r_commit.c_commlen = PAGESIZE; 11784 } else { 11785 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11786 rp->r_commit.c_commbase + PAGESIZE; 11787 } 11788 page_add(&rp->r_commit.c_pages, pp); 11789 } 11790 } 11791 11792 /* 11793 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 11794 * Flushes and commits data to the server. 
11795 */ 11796 static int 11797 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 11798 { 11799 int error; 11800 verifier4 write_verf; 11801 rnode4_t *rp = VTOR4(vp); 11802 11803 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11804 11805 /* 11806 * Flush the data portion of the file and then commit any 11807 * portions which need to be committed. This may need to 11808 * be done twice if the server has changed state since 11809 * data was last written. The data will need to be 11810 * rewritten to the server and then a new commit done. 11811 * 11812 * In fact, this may need to be done several times if the 11813 * server is having problems and crashing while we are 11814 * attempting to do this. 11815 */ 11816 11817 top: 11818 /* 11819 * Do a flush based on the poff and plen arguments. This 11820 * will synchronously write out any modified pages in the 11821 * range specified by (poff, plen). This starts all of the 11822 * i/o operations which will be waited for in the next 11823 * call to nfs4_putpage 11824 */ 11825 11826 mutex_enter(&rp->r_statelock); 11827 write_verf = rp->r_writeverf; 11828 mutex_exit(&rp->r_statelock); 11829 11830 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr); 11831 if (error == EAGAIN) 11832 error = 0; 11833 11834 /* 11835 * Do a flush based on the poff and plen arguments. This 11836 * will synchronously write out any modified pages in the 11837 * range specified by (poff, plen) and wait until all of 11838 * the asynchronous i/o's in that range are done as well. 11839 */ 11840 if (!error) 11841 error = nfs4_putpage(vp, poff, plen, 0, cr); 11842 11843 if (error) 11844 return (error); 11845 11846 mutex_enter(&rp->r_statelock); 11847 if (rp->r_writeverf != write_verf) { 11848 mutex_exit(&rp->r_statelock); 11849 goto top; 11850 } 11851 mutex_exit(&rp->r_statelock); 11852 11853 /* 11854 * Now commit any pages which might need to be committed. 
11855 * If the error, NFS_VERF_MISMATCH, is returned, then 11856 * start over with the flush operation. 11857 */ 11858 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 11859 11860 if (error == NFS_VERF_MISMATCH) 11861 goto top; 11862 11863 return (error); 11864 } 11865 11866 /* 11867 * nfs4_commit_vp() will wait for other pending commits and 11868 * will either commit the whole file or a range, plen dictates 11869 * if we commit whole file. a value of zero indicates the whole 11870 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 11871 */ 11872 static int 11873 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 11874 cred_t *cr, int wait_on_writes) 11875 { 11876 rnode4_t *rp; 11877 page_t *plist; 11878 offset3 offset; 11879 count3 len; 11880 11881 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11882 11883 rp = VTOR4(vp); 11884 11885 /* 11886 * before we gather commitable pages make 11887 * sure there are no outstanding async writes 11888 */ 11889 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 11890 mutex_enter(&rp->r_statelock); 11891 while (rp->r_count > 0) { 11892 cv_wait(&rp->r_cv, &rp->r_statelock); 11893 } 11894 mutex_exit(&rp->r_statelock); 11895 } 11896 11897 /* 11898 * Set the `commit inprogress' state bit. We must 11899 * first wait until any current one finishes. 11900 */ 11901 mutex_enter(&rp->r_statelock); 11902 while (rp->r_flags & R4COMMIT) { 11903 rp->r_flags |= R4COMMITWAIT; 11904 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11905 rp->r_flags &= ~R4COMMITWAIT; 11906 } 11907 rp->r_flags |= R4COMMIT; 11908 mutex_exit(&rp->r_statelock); 11909 11910 /* 11911 * Gather all of the pages which need to be 11912 * committed. 11913 */ 11914 if (plen == 0) 11915 nfs4_get_commit(vp); 11916 else 11917 nfs4_get_commit_range(vp, poff, plen); 11918 11919 /* 11920 * Clear the `commit inprogress' bit and disconnect the 11921 * page list which was gathered by nfs4_get_commit. 
11922 */ 11923 plist = rp->r_commit.c_pages; 11924 rp->r_commit.c_pages = NULL; 11925 offset = rp->r_commit.c_commbase; 11926 len = rp->r_commit.c_commlen; 11927 mutex_enter(&rp->r_statelock); 11928 rp->r_flags &= ~R4COMMIT; 11929 cv_broadcast(&rp->r_commit.c_cv); 11930 mutex_exit(&rp->r_statelock); 11931 11932 /* 11933 * If any pages need to be committed, commit them and 11934 * then unlock them so that they can be freed some 11935 * time later. 11936 */ 11937 if (plist == NULL) 11938 return (0); 11939 11940 /* 11941 * No error occurred during the flush portion 11942 * of this operation, so now attempt to commit 11943 * the data to stable storage on the server. 11944 * 11945 * This will unlock all of the pages on the list. 11946 */ 11947 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 11948 } 11949 11950 static int 11951 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11952 cred_t *cr) 11953 { 11954 int error; 11955 page_t *pp; 11956 11957 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11958 11959 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 11960 11961 /* 11962 * If we got an error, then just unlock all of the pages 11963 * on the list. 11964 */ 11965 if (error) { 11966 while (plist != NULL) { 11967 pp = plist; 11968 page_sub(&plist, pp); 11969 page_unlock(pp); 11970 } 11971 return (error); 11972 } 11973 /* 11974 * We've tried as hard as we can to commit the data to stable 11975 * storage on the server. We just unlock the pages and clear 11976 * the commit required state. They will get freed later. 
11977 */ 11978 while (plist != NULL) { 11979 pp = plist; 11980 page_sub(&plist, pp); 11981 pp->p_fsdata = C_NOCOMMIT; 11982 page_unlock(pp); 11983 } 11984 11985 return (error); 11986 } 11987 11988 static void 11989 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11990 cred_t *cr) 11991 { 11992 11993 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 11994 } 11995 11996 /*ARGSUSED*/ 11997 static int 11998 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 11999 { 12000 int error = 0; 12001 mntinfo4_t *mi; 12002 vattr_t va; 12003 vsecattr_t nfsace4_vsap; 12004 12005 mi = VTOMI4(vp); 12006 if (nfs_zone() != mi->mi_zone) 12007 return (EIO); 12008 if (mi->mi_flags & MI4_ACL) { 12009 /* if we have a delegation, return it */ 12010 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12011 (void) nfs4delegreturn(VTOR4(vp), 12012 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12013 12014 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12015 NFS4_ACL_SET); 12016 if (error) /* EINVAL */ 12017 return (error); 12018 12019 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12020 /* 12021 * These are aclent_t type entries. 12022 */ 12023 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12024 vp->v_type == VDIR, FALSE); 12025 if (error) 12026 return (error); 12027 } else { 12028 /* 12029 * These are ace_t type entries. 
12030 */ 12031 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12032 FALSE); 12033 if (error) 12034 return (error); 12035 } 12036 bzero(&va, sizeof (va)); 12037 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12038 vs_ace4_destroy(&nfsace4_vsap); 12039 return (error); 12040 } 12041 return (ENOSYS); 12042 } 12043 12044 static int 12045 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 12046 { 12047 int error; 12048 mntinfo4_t *mi; 12049 nfs4_ga_res_t gar; 12050 rnode4_t *rp = VTOR4(vp); 12051 12052 mi = VTOMI4(vp); 12053 if (nfs_zone() != mi->mi_zone) 12054 return (EIO); 12055 12056 bzero(&gar, sizeof (gar)); 12057 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12058 12059 /* 12060 * vsecattr->vsa_mask holds the original acl request mask. 12061 * This is needed when determining what to return. 12062 * (See: nfs4_create_getsecattr_return()) 12063 */ 12064 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12065 if (error) /* EINVAL */ 12066 return (error); 12067 12068 if (mi->mi_flags & MI4_ACL) { 12069 /* 12070 * Check if the data is cached and the cache is valid. If it 12071 * is we don't go over the wire. 12072 */ 12073 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12074 mutex_enter(&rp->r_statelock); 12075 if (rp->r_secattr != NULL) { 12076 error = nfs4_create_getsecattr_return( 12077 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12078 rp->r_attr.va_gid, 12079 vp->v_type == VDIR); 12080 if (!error) { /* error == 0 - Success! */ 12081 mutex_exit(&rp->r_statelock); 12082 return (error); 12083 } 12084 } 12085 mutex_exit(&rp->r_statelock); 12086 } 12087 12088 /* 12089 * The getattr otw call will always get both the acl, in 12090 * the form of a list of nfsace4's, and the number of acl 12091 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
12092 */ 12093 gar.n4g_va.va_mask = AT_ALL; 12094 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12095 if (error) { 12096 vs_ace4_destroy(&gar.n4g_vsa); 12097 if (error == ENOTSUP || error == EOPNOTSUPP) 12098 error = fs_fab_acl(vp, vsecattr, flag, cr); 12099 return (error); 12100 } 12101 12102 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12103 /* 12104 * No error was returned, but according to the response 12105 * bitmap, neither was an acl. 12106 */ 12107 vs_ace4_destroy(&gar.n4g_vsa); 12108 error = fs_fab_acl(vp, vsecattr, flag, cr); 12109 return (error); 12110 } 12111 12112 /* 12113 * Update the cache with the ACL. 12114 */ 12115 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12116 12117 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12118 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12119 vp->v_type == VDIR); 12120 vs_ace4_destroy(&gar.n4g_vsa); 12121 if ((error) && (vsecattr->vsa_mask & 12122 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12123 (error != EACCES)) { 12124 error = fs_fab_acl(vp, vsecattr, flag, cr); 12125 } 12126 return (error); 12127 } 12128 error = fs_fab_acl(vp, vsecattr, flag, cr); 12129 return (error); 12130 } 12131 12132 /* 12133 * The function returns: 12134 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12135 * - EINVAL if the passed in "acl_mask" is an invalid request. 12136 * 12137 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12138 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12139 * 12140 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12141 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12142 * - We have a count field set without the corresponding acl field set. (e.g. - 12143 * VSA_ACECNT is set, but VSA_ACE is not) 12144 */ 12145 static int 12146 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12147 { 12148 /* Shortcut the masks that are always valid. 
*/ 12149 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12150 return (0); 12151 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12152 return (0); 12153 12154 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12155 /* 12156 * We can't have any VSA_ACL type stuff in the mask now. 12157 */ 12158 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12159 VSA_DFACLCNT)) 12160 return (EINVAL); 12161 12162 if (op == NFS4_ACL_SET) { 12163 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12164 return (EINVAL); 12165 } 12166 } 12167 12168 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12169 /* 12170 * We can't have any VSA_ACE type stuff in the mask now. 12171 */ 12172 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12173 return (EINVAL); 12174 12175 if (op == NFS4_ACL_SET) { 12176 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12177 return (EINVAL); 12178 12179 if ((acl_mask & VSA_DFACLCNT) && 12180 !(acl_mask & VSA_DFACL)) 12181 return (EINVAL); 12182 } 12183 } 12184 return (0); 12185 } 12186 12187 /* 12188 * The theory behind creating the correct getsecattr return is simply this: 12189 * "Don't return anything that the caller is not expecting to have to free." 12190 */ 12191 static int 12192 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12193 uid_t uid, gid_t gid, int isdir) 12194 { 12195 int error = 0; 12196 /* Save the mask since the translators modify it. */ 12197 uint_t orig_mask = vsap->vsa_mask; 12198 12199 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12200 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12201 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12202 12203 if (error) 12204 return (error); 12205 12206 /* 12207 * If the caller only asked for the ace count (VSA_ACECNT) 12208 * don't give them the full acl (VSA_ACE), free it. 
12209 */ 12210 if (!orig_mask & VSA_ACE) { 12211 if (vsap->vsa_aclentp != NULL) { 12212 kmem_free(vsap->vsa_aclentp, 12213 vsap->vsa_aclcnt * sizeof (ace_t)); 12214 vsap->vsa_aclentp = NULL; 12215 } 12216 } 12217 vsap->vsa_mask = orig_mask; 12218 12219 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12220 VSA_DFACLCNT)) { 12221 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12222 isdir, FALSE, 12223 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12224 12225 if (error) 12226 return (error); 12227 12228 /* 12229 * If the caller only asked for the acl count (VSA_ACLCNT) 12230 * and/or the default acl count (VSA_DFACLCNT) don't give them 12231 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 12232 */ 12233 if (!orig_mask & VSA_ACL) { 12234 if (vsap->vsa_aclentp != NULL) { 12235 kmem_free(vsap->vsa_aclentp, 12236 vsap->vsa_aclcnt * sizeof (aclent_t)); 12237 vsap->vsa_aclentp = NULL; 12238 } 12239 } 12240 12241 if (!orig_mask & VSA_DFACL) { 12242 if (vsap->vsa_dfaclentp != NULL) { 12243 kmem_free(vsap->vsa_dfaclentp, 12244 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12245 vsap->vsa_dfaclentp = NULL; 12246 } 12247 } 12248 vsap->vsa_mask = orig_mask; 12249 } 12250 return (0); 12251 } 12252 12253 static int 12254 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr) 12255 { 12256 int error; 12257 12258 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12259 return (EIO); 12260 /* 12261 * check for valid cmd parameter 12262 */ 12263 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12264 return (EINVAL); 12265 12266 /* 12267 * Check access permissions 12268 */ 12269 if ((cmd & F_SHARE) && 12270 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12271 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12272 return (EBADF); 12273 12274 /* 12275 * If the filesystem is mounted using local locking, pass the 12276 * request off to the local share code. 
12277 */ 12278 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12279 return (fs_shrlock(vp, cmd, shr, flag, cr)); 12280 12281 switch (cmd) { 12282 case F_SHARE: 12283 case F_UNSHARE: 12284 /* 12285 * This will be properly implemented later, 12286 * see RFE: 4823948 . 12287 */ 12288 error = EAGAIN; 12289 break; 12290 12291 case F_HASREMOTELOCKS: 12292 /* 12293 * NFS client can't store remote locks itself 12294 */ 12295 shr->s_access = 0; 12296 error = 0; 12297 break; 12298 12299 default: 12300 error = EINVAL; 12301 break; 12302 } 12303 12304 return (error); 12305 } 12306 12307 /* 12308 * Common code called by directory ops to update the attrcache 12309 */ 12310 static int 12311 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12312 hrtime_t t, vnode_t *vp, cred_t *cr) 12313 { 12314 int error = 0; 12315 12316 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12317 12318 if (status != NFS4_OK) { 12319 /* getattr not done or failed */ 12320 PURGE_ATTRCACHE4(vp); 12321 return (error); 12322 } 12323 12324 if (garp) { 12325 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12326 } else { 12327 PURGE_ATTRCACHE4(vp); 12328 } 12329 return (error); 12330 } 12331 12332 /* 12333 * Update directory caches for directory modification ops (link, rename, etc.) 12334 * When dinfo is NULL, manage dircaches in the old way. 12335 */ 12336 static void 12337 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12338 dirattr_info_t *dinfo) 12339 { 12340 rnode4_t *drp = VTOR4(dvp); 12341 12342 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12343 12344 /* Purge rddir cache for dir since it changed */ 12345 if (drp->r_dir != NULL) 12346 nfs4_purge_rddir_cache(dvp); 12347 12348 /* 12349 * If caller provided dinfo, then use it to manage dir caches. 
12350 */ 12351 if (dinfo != NULL) { 12352 if (vp != NULL) { 12353 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12354 if (!VTOR4(vp)->created_v4) { 12355 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12356 dnlc_update(dvp, nm, vp); 12357 } else { 12358 /* 12359 * XXX don't update if the created_v4 flag is 12360 * set 12361 */ 12362 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12363 NFS4_DEBUG(nfs4_client_state_debug, 12364 (CE_NOTE, "nfs4_update_dircaches: " 12365 "don't update dnlc: created_v4 flag")); 12366 } 12367 } 12368 12369 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12370 dinfo->di_cred, FALSE, cinfo); 12371 12372 return; 12373 } 12374 12375 /* 12376 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12377 * Since caller modified dir but didn't receive post-dirmod-op dir 12378 * attrs, the dir's attrs must be purged. 12379 * 12380 * XXX this check and dnlc update/purge should really be atomic, 12381 * XXX but can't use rnode statelock because it'll deadlock in 12382 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12383 * XXX does occur. 12384 * 12385 * XXX We also may want to check that atomic is true in the 12386 * XXX change_info struct. If it is not, the change_info may 12387 * XXX reflect changes by more than one clients which means that 12388 * XXX our cache may not be valid. 
12389 */ 12390 PURGE_ATTRCACHE4(dvp); 12391 if (drp->r_change == cinfo->before) { 12392 /* no changes took place in the directory prior to our link */ 12393 if (vp != NULL) { 12394 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12395 if (!VTOR4(vp)->created_v4) { 12396 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12397 dnlc_update(dvp, nm, vp); 12398 } else { 12399 /* 12400 * XXX dont' update if the created_v4 flag 12401 * is set 12402 */ 12403 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12404 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12405 "nfs4_update_dircaches: don't" 12406 " update dnlc: created_v4 flag")); 12407 } 12408 } 12409 } else { 12410 /* Another client modified directory - purge its dnlc cache */ 12411 dnlc_purge_vp(dvp); 12412 } 12413 } 12414 12415 /* 12416 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12417 * file. 12418 * 12419 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12420 * file (ie: client recovery) and otherwise set to FALSE. 12421 * 12422 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12423 * initiated) calling functions. 12424 * 12425 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12426 * of resending a 'lost' open request. 12427 * 12428 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12429 * server that hands out BAD_SEQID on open confirm. 12430 * 12431 * Errors are returned via the nfs4_error_t parameter. 
12432 */ 12433 void 12434 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12435 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12436 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12437 { 12438 COMPOUND4args_clnt args; 12439 COMPOUND4res_clnt res; 12440 nfs_argop4 argop[2]; 12441 nfs_resop4 *resop; 12442 int doqueue = 1; 12443 mntinfo4_t *mi; 12444 OPEN_CONFIRM4args *open_confirm_args; 12445 int needrecov; 12446 12447 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12448 #if DEBUG 12449 mutex_enter(&oop->oo_lock); 12450 ASSERT(oop->oo_seqid_inuse); 12451 mutex_exit(&oop->oo_lock); 12452 #endif 12453 12454 recov_retry_confirm: 12455 nfs4_error_zinit(ep); 12456 *retry_open = FALSE; 12457 12458 if (resend) 12459 args.ctag = TAG_OPEN_CONFIRM_LOST; 12460 else 12461 args.ctag = TAG_OPEN_CONFIRM; 12462 12463 args.array_len = 2; 12464 args.array = argop; 12465 12466 /* putfh target fh */ 12467 argop[0].argop = OP_CPUTFH; 12468 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12469 12470 argop[1].argop = OP_OPEN_CONFIRM; 12471 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12472 12473 (*seqid) += 1; 12474 open_confirm_args->seqid = *seqid; 12475 open_confirm_args->open_stateid = *stateid; 12476 12477 mi = VTOMI4(vp); 12478 12479 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12480 12481 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12482 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12483 } 12484 12485 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12486 if (!needrecov && ep->error) 12487 return; 12488 12489 if (needrecov) { 12490 bool_t abort = FALSE; 12491 12492 if (reopening_file == FALSE) { 12493 nfs4_bseqid_entry_t *bsep = NULL; 12494 12495 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12496 bsep = nfs4_create_bseqid_entry(oop, NULL, 12497 vp, 0, args.ctag, 12498 open_confirm_args->seqid); 12499 12500 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12501 NULL, NULL, NULL, 
OP_OPEN_CONFIRM, bsep); 12502 if (bsep) { 12503 kmem_free(bsep, sizeof (*bsep)); 12504 if (num_bseqid_retryp && 12505 --(*num_bseqid_retryp) == 0) 12506 abort = TRUE; 12507 } 12508 } 12509 if ((ep->error == ETIMEDOUT || 12510 res.status == NFS4ERR_RESOURCE) && 12511 abort == FALSE && resend == FALSE) { 12512 if (!ep->error) 12513 (void) xdr_free(xdr_COMPOUND4res_clnt, 12514 (caddr_t)&res); 12515 12516 delay(SEC_TO_TICK(confirm_retry_sec)); 12517 goto recov_retry_confirm; 12518 } 12519 /* State may have changed so retry the entire OPEN op */ 12520 if (abort == FALSE) 12521 *retry_open = TRUE; 12522 else 12523 *retry_open = FALSE; 12524 if (!ep->error) 12525 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12526 return; 12527 } 12528 12529 if (res.status) { 12530 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12531 return; 12532 } 12533 12534 resop = &res.array[1]; /* open confirm res */ 12535 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12536 stateid, sizeof (*stateid)); 12537 12538 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12539 } 12540 12541 /* 12542 * Return the credentials associated with a client state object. The 12543 * caller is responsible for freeing the credentials. 12544 */ 12545 12546 static cred_t * 12547 state_to_cred(nfs4_open_stream_t *osp) 12548 { 12549 cred_t *cr; 12550 12551 /* 12552 * It's ok to not lock the open stream and open owner to get 12553 * the oo_cred since this is only written once (upon creation) 12554 * and will not change. 12555 */ 12556 cr = osp->os_open_owner->oo_cred; 12557 crhold(cr); 12558 12559 return (cr); 12560 } 12561 12562 /* 12563 * nfs4_find_sysid 12564 * 12565 * Find the sysid for the knetconfig associated with the given mi. 
12566 */ 12567 static struct lm_sysid * 12568 nfs4_find_sysid(mntinfo4_t *mi) 12569 { 12570 ASSERT(nfs_zone() == mi->mi_zone); 12571 12572 /* 12573 * Switch from RDMA knconf to original mount knconf 12574 */ 12575 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12576 mi->mi_curr_serv->sv_hostname, NULL)); 12577 } 12578 12579 #ifdef DEBUG 12580 /* 12581 * Return a string version of the call type for easy reading. 12582 */ 12583 static char * 12584 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12585 { 12586 switch (ctype) { 12587 case NFS4_LCK_CTYPE_NORM: 12588 return ("NORMAL"); 12589 case NFS4_LCK_CTYPE_RECLAIM: 12590 return ("RECLAIM"); 12591 case NFS4_LCK_CTYPE_RESEND: 12592 return ("RESEND"); 12593 case NFS4_LCK_CTYPE_REINSTATE: 12594 return ("REINSTATE"); 12595 default: 12596 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12597 "type %d", ctype); 12598 return (""); 12599 } 12600 } 12601 #endif 12602 12603 /* 12604 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12605 * Unlock requests don't have an over-the-wire locktype, so we just return 12606 * something non-threatening. 12607 */ 12608 12609 static nfs_lock_type4 12610 flk_to_locktype(int cmd, int l_type) 12611 { 12612 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12613 12614 switch (l_type) { 12615 case F_UNLCK: 12616 return (READ_LT); 12617 case F_RDLCK: 12618 if (cmd == F_SETLK) 12619 return (READ_LT); 12620 else 12621 return (READW_LT); 12622 case F_WRLCK: 12623 if (cmd == F_SETLK) 12624 return (WRITE_LT); 12625 else 12626 return (WRITEW_LT); 12627 } 12628 panic("flk_to_locktype"); 12629 /*NOTREACHED*/ 12630 } 12631 12632 /* 12633 * Do some preliminary checks for nfs4frlock. 
12634 */ 12635 static int 12636 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12637 u_offset_t offset) 12638 { 12639 int error = 0; 12640 12641 /* 12642 * If we are setting a lock, check that the file is opened 12643 * with the correct mode. 12644 */ 12645 if (cmd == F_SETLK || cmd == F_SETLKW) { 12646 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12647 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12648 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12649 "nfs4frlock_validate_args: file was opened with " 12650 "incorrect mode")); 12651 return (EBADF); 12652 } 12653 } 12654 12655 /* Convert the offset. It may need to be restored before returning. */ 12656 if (error = convoff(vp, flk, 0, offset)) { 12657 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12658 "nfs4frlock_validate_args: convoff => error= %d\n", 12659 error)); 12660 return (error); 12661 } 12662 12663 return (error); 12664 } 12665 12666 /* 12667 * Set the flock64's lm_sysid for nfs4frlock. 12668 */ 12669 static int 12670 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12671 { 12672 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12673 12674 /* Find the lm_sysid */ 12675 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12676 12677 if (*lspp == NULL) { 12678 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12679 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12680 return (ENOLCK); 12681 } 12682 12683 flk->l_sysid = lm_sysidt(*lspp); 12684 12685 return (0); 12686 } 12687 12688 /* 12689 * Do the remaining preliminary setup for nfs4frlock. 12690 */ 12691 static void 12692 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12693 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12694 cred_t **cred_otw) 12695 { 12696 /* 12697 * set tick_delay to the base delay time. 
12698 * (NFS4_BASE_WAIT_TIME is in secs) 12699 */ 12700 12701 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12702 12703 /* 12704 * If lock is relative to EOF, we need the newest length of the 12705 * file. Therefore invalidate the ATTR_CACHE. 12706 */ 12707 12708 *whencep = flk->l_whence; 12709 12710 if (*whencep == 2) /* SEEK_END */ 12711 PURGE_ATTRCACHE4(vp); 12712 12713 recov_statep->rs_flags = 0; 12714 recov_statep->rs_num_retry_despite_err = 0; 12715 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12716 } 12717 12718 /* 12719 * Initialize and allocate the data structures necessary for 12720 * the nfs4frlock call. 12721 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12722 */ 12723 static void 12724 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12725 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12726 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12727 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12728 { 12729 int argoplist_size; 12730 int num_ops = 2; 12731 12732 *retry = FALSE; 12733 *did_start_fop = FALSE; 12734 *skip_get_err = FALSE; 12735 lost_rqstp->lr_op = 0; 12736 argoplist_size = num_ops * sizeof (nfs_argop4); 12737 /* fill array with zero */ 12738 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12739 12740 *argspp = argsp; 12741 *respp = NULL; 12742 12743 argsp->array_len = num_ops; 12744 argsp->array = *argopp; 12745 12746 /* initialize in case of error; will get real value down below */ 12747 argsp->ctag = TAG_NONE; 12748 12749 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12750 *op_hintp = OH_LOCKU; 12751 else 12752 *op_hintp = OH_OTHER; 12753 } 12754 12755 /* 12756 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12757 * the proper nfs4_server_t for this instance of nfs4frlock. 12758 * Returns 0 (success) or an errno value. 
12759 */ 12760 static int 12761 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12762 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12763 bool_t *did_start_fop, bool_t *startrecovp) 12764 { 12765 int error = 0; 12766 rnode4_t *rp; 12767 12768 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12769 12770 if (ctype == NFS4_LCK_CTYPE_NORM) { 12771 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12772 recov_statep, startrecovp); 12773 if (error) 12774 return (error); 12775 *did_start_fop = TRUE; 12776 } else { 12777 *did_start_fop = FALSE; 12778 *startrecovp = FALSE; 12779 } 12780 12781 if (!error) { 12782 rp = VTOR4(vp); 12783 12784 /* If the file failed recovery, just quit. */ 12785 mutex_enter(&rp->r_statelock); 12786 if (rp->r_flags & R4RECOVERR) { 12787 error = EIO; 12788 } 12789 mutex_exit(&rp->r_statelock); 12790 } 12791 12792 return (error); 12793 } 12794 12795 /* 12796 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 12797 * resend nfs4frlock call is initiated by the recovery framework. 12798 * Acquires the lop and oop seqid synchronization. 
12799 */ 12800 static void 12801 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 12802 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 12803 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 12804 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 12805 { 12806 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 12807 int error; 12808 12809 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 12810 (CE_NOTE, 12811 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 12812 ASSERT(resend_rqstp != NULL); 12813 ASSERT(resend_rqstp->lr_op == OP_LOCK || 12814 resend_rqstp->lr_op == OP_LOCKU); 12815 12816 *oopp = resend_rqstp->lr_oop; 12817 if (resend_rqstp->lr_oop) { 12818 open_owner_hold(resend_rqstp->lr_oop); 12819 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 12820 ASSERT(error == 0); /* recov thread always succeeds */ 12821 } 12822 12823 /* Must resend this lost lock/locku request. */ 12824 ASSERT(resend_rqstp->lr_lop != NULL); 12825 *lopp = resend_rqstp->lr_lop; 12826 lock_owner_hold(resend_rqstp->lr_lop); 12827 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 12828 ASSERT(error == 0); /* recov thread always succeeds */ 12829 12830 *ospp = resend_rqstp->lr_osp; 12831 if (*ospp) 12832 open_stream_hold(resend_rqstp->lr_osp); 12833 12834 if (resend_rqstp->lr_op == OP_LOCK) { 12835 LOCK4args *lock_args; 12836 12837 argop->argop = OP_LOCK; 12838 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 12839 lock_args->locktype = resend_rqstp->lr_locktype; 12840 lock_args->reclaim = 12841 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 12842 lock_args->offset = resend_rqstp->lr_flk->l_start; 12843 lock_args->length = resend_rqstp->lr_flk->l_len; 12844 if (lock_args->length == 0) 12845 lock_args->length = ~lock_args->length; 12846 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 12847 mi2clientid(mi), &lock_args->locker); 12848 12849 switch (resend_rqstp->lr_ctype) { 12850 case 
NFS4_LCK_CTYPE_RESEND: 12851 argsp->ctag = TAG_LOCK_RESEND; 12852 break; 12853 case NFS4_LCK_CTYPE_REINSTATE: 12854 argsp->ctag = TAG_LOCK_REINSTATE; 12855 break; 12856 case NFS4_LCK_CTYPE_RECLAIM: 12857 argsp->ctag = TAG_LOCK_RECLAIM; 12858 break; 12859 default: 12860 argsp->ctag = TAG_LOCK_UNKNOWN; 12861 break; 12862 } 12863 } else { 12864 LOCKU4args *locku_args; 12865 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 12866 12867 argop->argop = OP_LOCKU; 12868 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 12869 locku_args->locktype = READ_LT; 12870 locku_args->seqid = lop->lock_seqid + 1; 12871 mutex_enter(&lop->lo_lock); 12872 locku_args->lock_stateid = lop->lock_stateid; 12873 mutex_exit(&lop->lo_lock); 12874 locku_args->offset = resend_rqstp->lr_flk->l_start; 12875 locku_args->length = resend_rqstp->lr_flk->l_len; 12876 if (locku_args->length == 0) 12877 locku_args->length = ~locku_args->length; 12878 12879 switch (resend_rqstp->lr_ctype) { 12880 case NFS4_LCK_CTYPE_RESEND: 12881 argsp->ctag = TAG_LOCKU_RESEND; 12882 break; 12883 case NFS4_LCK_CTYPE_REINSTATE: 12884 argsp->ctag = TAG_LOCKU_REINSTATE; 12885 break; 12886 default: 12887 argsp->ctag = TAG_LOCK_UNKNOWN; 12888 break; 12889 } 12890 } 12891 } 12892 12893 /* 12894 * Setup the LOCKT4 arguments. 12895 */ 12896 static void 12897 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 12898 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 12899 rnode4_t *rp) 12900 { 12901 LOCKT4args *lockt_args; 12902 12903 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 12904 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 12905 argop->argop = OP_LOCKT; 12906 argsp->ctag = TAG_LOCKT; 12907 lockt_args = &argop->nfs_argop4_u.oplockt; 12908 12909 /* 12910 * The locktype will be READ_LT unless it's 12911 * a write lock. We do this because the Solaris 12912 * system call allows the combination of 12913 * F_UNLCK and F_GETLK* and so in that case the 12914 * unlock is mapped to a read. 
12915 */ 12916 if (flk->l_type == F_WRLCK) 12917 lockt_args->locktype = WRITE_LT; 12918 else 12919 lockt_args->locktype = READ_LT; 12920 12921 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 12922 /* set the lock owner4 args */ 12923 nfs4_setlockowner_args(&lockt_args->owner, rp, 12924 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 12925 flk->l_pid); 12926 lockt_args->offset = flk->l_start; 12927 lockt_args->length = flk->l_len; 12928 if (flk->l_len == 0) 12929 lockt_args->length = ~lockt_args->length; 12930 12931 *lockt_argsp = lockt_args; 12932 } 12933 12934 /* 12935 * If the client is holding a delegation, and the open stream to be used 12936 * with this lock request is a delegation open stream, then re-open the stream. 12937 * Sets the nfs4_error_t to all zeros unless the open stream has already 12938 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 12939 * means the caller should retry (like a recovery retry). 12940 */ 12941 static void 12942 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 12943 { 12944 open_delegation_type4 dt; 12945 bool_t reopen_needed, force; 12946 nfs4_open_stream_t *osp; 12947 open_claim_type4 oclaim; 12948 rnode4_t *rp = VTOR4(vp); 12949 mntinfo4_t *mi = VTOMI4(vp); 12950 12951 ASSERT(nfs_zone() == mi->mi_zone); 12952 12953 nfs4_error_zinit(ep); 12954 12955 mutex_enter(&rp->r_statev4_lock); 12956 dt = rp->r_deleg_type; 12957 mutex_exit(&rp->r_statev4_lock); 12958 12959 if (dt != OPEN_DELEGATE_NONE) { 12960 nfs4_open_owner_t *oop; 12961 12962 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 12963 if (!oop) { 12964 ep->stat = NFS4ERR_IO; 12965 return; 12966 } 12967 /* returns with 'os_sync_lock' held */ 12968 osp = find_open_stream(oop, rp); 12969 if (!osp) { 12970 open_owner_rele(oop); 12971 ep->stat = NFS4ERR_IO; 12972 return; 12973 } 12974 12975 if (osp->os_failed_reopen) { 12976 NFS4_DEBUG((nfs4_open_stream_debug || 12977 nfs4_client_lock_debug), (CE_NOTE, 12978 
"nfs4frlock_check_deleg: os_failed_reopen set " 12979 "for osp %p, cr %p, rp %s", (void *)osp, 12980 (void *)cr, rnode4info(rp))); 12981 mutex_exit(&osp->os_sync_lock); 12982 open_stream_rele(osp, rp); 12983 open_owner_rele(oop); 12984 ep->stat = NFS4ERR_IO; 12985 return; 12986 } 12987 12988 /* 12989 * Determine whether a reopen is needed. If this 12990 * is a delegation open stream, then send the open 12991 * to the server to give visibility to the open owner. 12992 * Even if it isn't a delegation open stream, we need 12993 * to check if the previous open CLAIM_DELEGATE_CUR 12994 * was sufficient. 12995 */ 12996 12997 reopen_needed = osp->os_delegation || 12998 ((lt == F_RDLCK && 12999 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13000 (lt == F_WRLCK && 13001 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13002 13003 mutex_exit(&osp->os_sync_lock); 13004 open_owner_rele(oop); 13005 13006 if (reopen_needed) { 13007 /* 13008 * Always use CLAIM_PREVIOUS after server reboot. 13009 * The server will reject CLAIM_DELEGATE_CUR if 13010 * it is used during the grace period. 13011 */ 13012 mutex_enter(&mi->mi_lock); 13013 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13014 oclaim = CLAIM_PREVIOUS; 13015 force = TRUE; 13016 } else { 13017 oclaim = CLAIM_DELEGATE_CUR; 13018 force = FALSE; 13019 } 13020 mutex_exit(&mi->mi_lock); 13021 13022 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13023 if (ep->error == EAGAIN) { 13024 nfs4_error_zinit(ep); 13025 ep->stat = NFS4ERR_DELAY; 13026 } 13027 } 13028 open_stream_rele(osp, rp); 13029 osp = NULL; 13030 } 13031 } 13032 13033 /* 13034 * Setup the LOCKU4 arguments. 13035 * Returns errors via the nfs4_error_t. 13036 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13037 * over-the-wire. The caller must release the 13038 * reference on *lopp. 13039 * NFS4ERR_DELAY caller should retry (like recovery retry) 13040 * (other) unrecoverable error. 
13041 */ 13042 static void 13043 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13044 LOCKU4args **locku_argsp, flock64_t *flk, 13045 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13046 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13047 bool_t *skip_get_err, bool_t *go_otwp) 13048 { 13049 nfs4_lock_owner_t *lop = NULL; 13050 LOCKU4args *locku_args; 13051 pid_t pid; 13052 bool_t is_spec = FALSE; 13053 rnode4_t *rp = VTOR4(vp); 13054 13055 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13056 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13057 13058 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13059 if (ep->error || ep->stat) 13060 return; 13061 13062 argop->argop = OP_LOCKU; 13063 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13064 argsp->ctag = TAG_LOCKU_REINSTATE; 13065 else 13066 argsp->ctag = TAG_LOCKU; 13067 locku_args = &argop->nfs_argop4_u.oplocku; 13068 *locku_argsp = locku_args; 13069 13070 /* 13071 * XXX what should locku_args->locktype be? 13072 * setting to ALWAYS be READ_LT so at least 13073 * it is a valid locktype. 13074 */ 13075 13076 locku_args->locktype = READ_LT; 13077 13078 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13079 flk->l_pid; 13080 13081 /* 13082 * Get the lock owner stateid. If no lock owner 13083 * exists, return success. 13084 */ 13085 lop = find_lock_owner(rp, pid, LOWN_ANY); 13086 *lopp = lop; 13087 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13088 is_spec = TRUE; 13089 if (!lop || is_spec) { 13090 /* 13091 * No lock owner so no locks to unlock. 13092 * Return success. If there was a failed 13093 * reclaim earlier, the lock might still be 13094 * registered with the local locking code, 13095 * so notify it of the unlock. 13096 * 13097 * If the lockowner is using a special stateid, 13098 * then the original lock request (that created 13099 * this lockowner) was never successful, so we 13100 * have no lock to undo OTW. 
13101 */ 13102 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13103 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13104 "(%ld) so return success", (long)pid)); 13105 13106 if (ctype == NFS4_LCK_CTYPE_NORM) 13107 flk->l_pid = curproc->p_pid; 13108 nfs4_register_lock_locally(vp, flk, flag, offset); 13109 /* 13110 * Release our hold and NULL out so final_cleanup 13111 * doesn't try to end a lock seqid sync we 13112 * never started. 13113 */ 13114 if (is_spec) { 13115 lock_owner_rele(lop); 13116 *lopp = NULL; 13117 } 13118 *skip_get_err = TRUE; 13119 *go_otwp = FALSE; 13120 return; 13121 } 13122 13123 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13124 if (ep->error == EAGAIN) { 13125 lock_owner_rele(lop); 13126 *lopp = NULL; 13127 return; 13128 } 13129 13130 mutex_enter(&lop->lo_lock); 13131 locku_args->lock_stateid = lop->lock_stateid; 13132 mutex_exit(&lop->lo_lock); 13133 locku_args->seqid = lop->lock_seqid + 1; 13134 13135 /* leave the ref count on lop, rele after RPC call */ 13136 13137 locku_args->offset = flk->l_start; 13138 locku_args->length = flk->l_len; 13139 if (flk->l_len == 0) 13140 locku_args->length = ~locku_args->length; 13141 13142 *go_otwp = TRUE; 13143 } 13144 13145 /* 13146 * Setup the LOCK4 arguments. 13147 * 13148 * Returns errors via the nfs4_error_t. 
13149 * NFS4_OK no problems 13150 * NFS4ERR_DELAY caller should retry (like recovery retry) 13151 * (other) unrecoverable error 13152 */ 13153 static void 13154 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13155 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13156 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13157 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13158 { 13159 LOCK4args *lock_args; 13160 nfs4_open_owner_t *oop = NULL; 13161 nfs4_open_stream_t *osp = NULL; 13162 nfs4_lock_owner_t *lop = NULL; 13163 pid_t pid; 13164 rnode4_t *rp = VTOR4(vp); 13165 13166 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13167 13168 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13169 if (ep->error || ep->stat != NFS4_OK) 13170 return; 13171 13172 argop->argop = OP_LOCK; 13173 if (ctype == NFS4_LCK_CTYPE_NORM) 13174 argsp->ctag = TAG_LOCK; 13175 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13176 argsp->ctag = TAG_RELOCK; 13177 else 13178 argsp->ctag = TAG_LOCK_REINSTATE; 13179 lock_args = &argop->nfs_argop4_u.oplock; 13180 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13181 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13182 /* 13183 * Get the lock owner. If no lock owner exists, 13184 * create a 'temporary' one and grab the open seqid 13185 * synchronization (which puts a hold on the open 13186 * owner and open stream). 13187 * This also grabs the lock seqid synchronization. 13188 */ 13189 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13190 ep->stat = 13191 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13192 13193 if (ep->stat != NFS4_OK) 13194 goto out; 13195 13196 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13197 &lock_args->locker); 13198 13199 lock_args->offset = flk->l_start; 13200 lock_args->length = flk->l_len; 13201 if (flk->l_len == 0) 13202 lock_args->length = ~lock_args->length; 13203 *lock_argsp = lock_args; 13204 out: 13205 *oopp = oop; 13206 *ospp = osp; 13207 *lopp = lop; 13208 } 13209 13210 /* 13211 * After we get the reply from the server, record the proper information 13212 * for possible resend lock requests. 13213 * 13214 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13215 */ 13216 static void 13217 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13218 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13219 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13220 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13221 { 13222 bool_t unlock = (flk->l_type == F_UNLCK); 13223 13224 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13225 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13226 ctype == NFS4_LCK_CTYPE_REINSTATE); 13227 13228 if (error != 0 && !unlock) { 13229 NFS4_DEBUG((nfs4_lost_rqst_debug || 13230 nfs4_client_lock_debug), (CE_NOTE, 13231 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13232 " for lop %p", (void *)lop)); 13233 ASSERT(lop != NULL); 13234 mutex_enter(&lop->lo_lock); 13235 lop->lo_pending_rqsts = 1; 13236 mutex_exit(&lop->lo_lock); 13237 } 13238 13239 lost_rqstp->lr_putfirst = FALSE; 13240 lost_rqstp->lr_op = 0; 13241 13242 /* 13243 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13244 * recovery purposes so that the lock request that was sent 13245 * can be saved and re-issued later. Ditto for EIO from a forced 13246 * unmount. 
This is done to have the client's local locking state 13247 * match the v4 server's state; that is, the request was 13248 * potentially received and accepted by the server but the client 13249 * thinks it was not. 13250 */ 13251 if (error == ETIMEDOUT || error == EINTR || 13252 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13253 NFS4_DEBUG((nfs4_lost_rqst_debug || 13254 nfs4_client_lock_debug), (CE_NOTE, 13255 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13256 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13257 (void *)lop, (void *)oop, (void *)osp)); 13258 if (unlock) 13259 lost_rqstp->lr_op = OP_LOCKU; 13260 else { 13261 lost_rqstp->lr_op = OP_LOCK; 13262 lost_rqstp->lr_locktype = locktype; 13263 } 13264 /* 13265 * Objects are held and rele'd via the recovery code. 13266 * See nfs4_save_lost_rqst. 13267 */ 13268 lost_rqstp->lr_vp = vp; 13269 lost_rqstp->lr_dvp = NULL; 13270 lost_rqstp->lr_oop = oop; 13271 lost_rqstp->lr_osp = osp; 13272 lost_rqstp->lr_lop = lop; 13273 lost_rqstp->lr_cr = cr; 13274 switch (ctype) { 13275 case NFS4_LCK_CTYPE_NORM: 13276 flk->l_pid = ttoproc(curthread)->p_pid; 13277 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13278 break; 13279 case NFS4_LCK_CTYPE_REINSTATE: 13280 lost_rqstp->lr_putfirst = TRUE; 13281 lost_rqstp->lr_ctype = ctype; 13282 break; 13283 default: 13284 break; 13285 } 13286 lost_rqstp->lr_flk = flk; 13287 } 13288 } 13289 13290 /* 13291 * Update lop's seqid. Also update the seqid stored in a resend request, 13292 * if any. (Some recovery errors increment the seqid, and we may have to 13293 * send the resend request again.) 
13294 */ 13295 13296 static void 13297 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13298 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13299 { 13300 if (lock_args) { 13301 if (lock_args->locker.new_lock_owner == TRUE) 13302 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13303 else { 13304 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13305 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13306 } 13307 } else if (locku_args) { 13308 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13309 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13310 } 13311 } 13312 13313 /* 13314 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13315 * COMPOUND4 args/res for calls that need to retry. 13316 * Switches the *cred_otwp to base_cr. 13317 */ 13318 static void 13319 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13320 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13321 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13322 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13323 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13324 { 13325 nfs4_open_owner_t *oop = *oopp; 13326 nfs4_open_stream_t *osp = *ospp; 13327 nfs4_lock_owner_t *lop = *lopp; 13328 nfs_argop4 *argop = (*argspp)->array; 13329 13330 if (*did_start_fop) { 13331 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13332 needrecov); 13333 *did_start_fop = FALSE; 13334 } 13335 ASSERT((*argspp)->array_len == 2); 13336 if (argop[1].argop == OP_LOCK) 13337 nfs4args_lock_free(&argop[1]); 13338 else if (argop[1].argop == OP_LOCKT) 13339 nfs4args_lockt_free(&argop[1]); 13340 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13341 if (!error) 13342 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13343 *argspp = NULL; 13344 *respp = NULL; 13345 13346 if (lop) { 13347 nfs4_end_lock_seqid_sync(lop); 13348 lock_owner_rele(lop); 13349 *lopp = NULL; 13350 } 13351 13352 /* need to free up 
the reference on osp for lock args */ 13353 if (osp != NULL) { 13354 open_stream_rele(osp, VTOR4(vp)); 13355 *ospp = NULL; 13356 } 13357 13358 /* need to free up the reference on oop for lock args */ 13359 if (oop != NULL) { 13360 nfs4_end_open_seqid_sync(oop); 13361 open_owner_rele(oop); 13362 *oopp = NULL; 13363 } 13364 13365 crfree(*cred_otwp); 13366 *cred_otwp = base_cr; 13367 crhold(*cred_otwp); 13368 } 13369 13370 /* 13371 * Function to process the client's recovery for nfs4frlock. 13372 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13373 * 13374 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13375 * COMPOUND4 args/res for calls that need to retry. 13376 * 13377 * Note: the rp's r_lkserlock is *not* dropped during this path. 13378 */ 13379 static bool_t 13380 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13381 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13382 LOCK4args *lock_args, LOCKU4args *locku_args, 13383 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13384 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13385 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13386 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13387 { 13388 nfs4_open_owner_t *oop = *oopp; 13389 nfs4_open_stream_t *osp = *ospp; 13390 nfs4_lock_owner_t *lop = *lopp; 13391 13392 bool_t abort, retry; 13393 13394 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13395 ASSERT((*argspp) != NULL); 13396 ASSERT((*respp) != NULL); 13397 if (lock_args || locku_args) 13398 ASSERT(lop != NULL); 13399 13400 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13401 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13402 13403 retry = TRUE; 13404 abort = FALSE; 13405 if (needrecov) { 13406 nfs4_bseqid_entry_t *bsep = NULL; 13407 nfs_opnum4 op; 13408 13409 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13410 13411 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13412 seqid4 seqid; 13413 13414 if (lock_args) { 13415 if (lock_args->locker.new_lock_owner == TRUE) 13416 seqid = lock_args->locker.locker4_u. 13417 open_owner.open_seqid; 13418 else 13419 seqid = lock_args->locker.locker4_u. 13420 lock_owner.lock_seqid; 13421 } else if (locku_args) { 13422 seqid = locku_args->seqid; 13423 } else { 13424 seqid = 0; 13425 } 13426 13427 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13428 flk->l_pid, (*argspp)->ctag, seqid); 13429 } 13430 13431 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13432 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13433 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13434 NULL, op, bsep); 13435 13436 if (bsep) 13437 kmem_free(bsep, sizeof (*bsep)); 13438 } 13439 13440 /* 13441 * Return that we do not want to retry the request for 3 cases: 13442 * 1. If we received EINTR or are bailing out because of a forced 13443 * unmount, we came into this code path just for the sake of 13444 * initiating recovery, we now need to return the error. 13445 * 2. If we have aborted recovery. 13446 * 3. We received NFS4ERR_BAD_SEQID. 
13447 */ 13448 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13449 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13450 retry = FALSE; 13451 13452 if (*did_start_fop == TRUE) { 13453 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13454 needrecov); 13455 *did_start_fop = FALSE; 13456 } 13457 13458 if (retry == TRUE) { 13459 nfs_argop4 *argop; 13460 13461 argop = (*argspp)->array; 13462 ASSERT((*argspp)->array_len == 2); 13463 13464 if (argop[1].argop == OP_LOCK) 13465 nfs4args_lock_free(&argop[1]); 13466 else if (argop[1].argop == OP_LOCKT) 13467 nfs4args_lockt_free(&argop[1]); 13468 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13469 if (!ep->error) 13470 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13471 *respp = NULL; 13472 *argspp = NULL; 13473 } 13474 13475 if (lop != NULL) { 13476 nfs4_end_lock_seqid_sync(lop); 13477 lock_owner_rele(lop); 13478 } 13479 13480 *lopp = NULL; 13481 13482 /* need to free up the reference on osp for lock args */ 13483 if (osp != NULL) { 13484 open_stream_rele(osp, rp); 13485 *ospp = NULL; 13486 } 13487 13488 /* need to free up the reference on oop for lock args */ 13489 if (oop != NULL) { 13490 nfs4_end_open_seqid_sync(oop); 13491 open_owner_rele(oop); 13492 *oopp = NULL; 13493 } 13494 13495 return (retry); 13496 } 13497 13498 /* 13499 * Handles the succesful reply from the server for nfs4frlock. 
13500 */ 13501 static void 13502 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13503 vnode_t *vp, int flag, u_offset_t offset, 13504 nfs4_lost_rqst_t *resend_rqstp) 13505 { 13506 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13507 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13508 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13509 if (ctype == NFS4_LCK_CTYPE_NORM) { 13510 flk->l_pid = ttoproc(curthread)->p_pid; 13511 /* 13512 * We do not register lost locks locally in 13513 * the 'resend' case since the user/application 13514 * doesn't think we have the lock. 13515 */ 13516 ASSERT(!resend_rqstp); 13517 nfs4_register_lock_locally(vp, flk, flag, offset); 13518 } 13519 } 13520 } 13521 13522 /* 13523 * Handle the DENIED reply from the server for nfs4frlock. 13524 * Returns TRUE if we should retry the request; FALSE otherwise. 13525 * 13526 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13527 * COMPOUND4 args/res for calls that need to retry. Can also 13528 * drop and regrab the r_lkserlock. 13529 */ 13530 static bool_t 13531 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13532 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13533 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13534 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13535 nfs4_recov_state_t *recov_statep, int needrecov, 13536 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13537 clock_t *tick_delayp, short *whencep, int *errorp, 13538 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13539 bool_t *skip_get_err) 13540 { 13541 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13542 13543 if (lock_args) { 13544 nfs4_open_owner_t *oop = *oopp; 13545 nfs4_open_stream_t *osp = *ospp; 13546 nfs4_lock_owner_t *lop = *lopp; 13547 int intr; 13548 13549 /* 13550 * Blocking lock needs to sleep and retry from the request. 
13551 * 13552 * Do not block and wait for 'resend' or 'reinstate' 13553 * lock requests, just return the error. 13554 * 13555 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13556 */ 13557 if (cmd == F_SETLKW) { 13558 rnode4_t *rp = VTOR4(vp); 13559 nfs_argop4 *argop = (*argspp)->array; 13560 13561 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13562 13563 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13564 recov_statep, needrecov); 13565 *did_start_fop = FALSE; 13566 ASSERT((*argspp)->array_len == 2); 13567 if (argop[1].argop == OP_LOCK) 13568 nfs4args_lock_free(&argop[1]); 13569 else if (argop[1].argop == OP_LOCKT) 13570 nfs4args_lockt_free(&argop[1]); 13571 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13572 if (*respp) 13573 (void) xdr_free(xdr_COMPOUND4res_clnt, 13574 (caddr_t)*respp); 13575 *argspp = NULL; 13576 *respp = NULL; 13577 nfs4_end_lock_seqid_sync(lop); 13578 lock_owner_rele(lop); 13579 *lopp = NULL; 13580 if (osp != NULL) { 13581 open_stream_rele(osp, rp); 13582 *ospp = NULL; 13583 } 13584 if (oop != NULL) { 13585 nfs4_end_open_seqid_sync(oop); 13586 open_owner_rele(oop); 13587 *oopp = NULL; 13588 } 13589 13590 nfs_rw_exit(&rp->r_lkserlock); 13591 13592 intr = nfs4_block_and_wait(tick_delayp, rp); 13593 13594 if (intr) { 13595 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13596 RW_WRITER, FALSE); 13597 *errorp = EINTR; 13598 return (FALSE); 13599 } 13600 13601 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13602 RW_WRITER, FALSE); 13603 13604 /* 13605 * Make sure we are still safe to lock with 13606 * regards to mmapping. 
13607 */ 13608 if (!nfs4_safelock(vp, flk, cr)) { 13609 *errorp = EAGAIN; 13610 return (FALSE); 13611 } 13612 13613 return (TRUE); 13614 } 13615 if (ctype == NFS4_LCK_CTYPE_NORM) 13616 *errorp = EAGAIN; 13617 *skip_get_err = TRUE; 13618 flk->l_whence = 0; 13619 *whencep = 0; 13620 return (FALSE); 13621 } else if (lockt_args) { 13622 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13623 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13624 13625 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13626 flk, lockt_args); 13627 13628 /* according to NLM code */ 13629 *errorp = 0; 13630 *whencep = 0; 13631 *skip_get_err = TRUE; 13632 return (FALSE); 13633 } 13634 return (FALSE); 13635 } 13636 13637 /* 13638 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13639 */ 13640 static void 13641 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13642 { 13643 switch (resp->status) { 13644 case NFS4ERR_ACCESS: 13645 case NFS4ERR_ADMIN_REVOKED: 13646 case NFS4ERR_BADHANDLE: 13647 case NFS4ERR_BAD_RANGE: 13648 case NFS4ERR_BAD_SEQID: 13649 case NFS4ERR_BAD_STATEID: 13650 case NFS4ERR_BADXDR: 13651 case NFS4ERR_DEADLOCK: 13652 case NFS4ERR_DELAY: 13653 case NFS4ERR_EXPIRED: 13654 case NFS4ERR_FHEXPIRED: 13655 case NFS4ERR_GRACE: 13656 case NFS4ERR_INVAL: 13657 case NFS4ERR_ISDIR: 13658 case NFS4ERR_LEASE_MOVED: 13659 case NFS4ERR_LOCK_NOTSUPP: 13660 case NFS4ERR_LOCK_RANGE: 13661 case NFS4ERR_MOVED: 13662 case NFS4ERR_NOFILEHANDLE: 13663 case NFS4ERR_NO_GRACE: 13664 case NFS4ERR_OLD_STATEID: 13665 case NFS4ERR_OPENMODE: 13666 case NFS4ERR_RECLAIM_BAD: 13667 case NFS4ERR_RECLAIM_CONFLICT: 13668 case NFS4ERR_RESOURCE: 13669 case NFS4ERR_SERVERFAULT: 13670 case NFS4ERR_STALE: 13671 case NFS4ERR_STALE_CLIENTID: 13672 case NFS4ERR_STALE_STATEID: 13673 return; 13674 default: 13675 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13676 "nfs4frlock_results_default: got unrecognizable " 13677 "res.status %d", resp->status)); 13678 *errorp = NFS4ERR_INVAL; 
13679 } 13680 } 13681 13682 /* 13683 * The lock request was successful, so update the client's state. 13684 */ 13685 static void 13686 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13687 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13688 vnode_t *vp, flock64_t *flk, cred_t *cr, 13689 nfs4_lost_rqst_t *resend_rqstp) 13690 { 13691 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13692 13693 if (lock_args) { 13694 LOCK4res *lock_res; 13695 13696 lock_res = &resop->nfs_resop4_u.oplock; 13697 /* update the stateid with server's response */ 13698 13699 if (lock_args->locker.new_lock_owner == TRUE) { 13700 mutex_enter(&lop->lo_lock); 13701 lop->lo_just_created = NFS4_PERM_CREATED; 13702 mutex_exit(&lop->lo_lock); 13703 } 13704 13705 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13706 13707 /* 13708 * If the lock was the result of a resending a lost 13709 * request, we've synched up the stateid and seqid 13710 * with the server, but now the server might be out of sync 13711 * with what the application thinks it has for locks. 13712 * Clean that up here. It's unclear whether we should do 13713 * this even if the filesystem has been forcibly unmounted. 13714 * For most servers, it's probably wasted effort, but 13715 * RFC3530 lets servers require that unlocks exactly match 13716 * the locks that are held. 
13717 */ 13718 if (resend_rqstp != NULL && 13719 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13720 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13721 } else { 13722 flk->l_whence = 0; 13723 } 13724 } else if (locku_args) { 13725 LOCKU4res *locku_res; 13726 13727 locku_res = &resop->nfs_resop4_u.oplocku; 13728 13729 /* Update the stateid with the server's response */ 13730 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13731 } else if (lockt_args) { 13732 /* Switch the lock type to express success, see fcntl */ 13733 flk->l_type = F_UNLCK; 13734 flk->l_whence = 0; 13735 } 13736 } 13737 13738 /* 13739 * Do final cleanup before exiting nfs4frlock. 13740 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13741 * COMPOUND4 args/res for calls that haven't already. 13742 */ 13743 static void 13744 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13745 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13746 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13747 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13748 short whence, u_offset_t offset, struct lm_sysid *ls, 13749 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13750 bool_t did_start_fop, bool_t skip_get_err, 13751 cred_t *cred_otw, cred_t *cred) 13752 { 13753 mntinfo4_t *mi = VTOMI4(vp); 13754 rnode4_t *rp = VTOR4(vp); 13755 int error = *errorp; 13756 nfs_argop4 *argop; 13757 13758 ASSERT(nfs_zone() == mi->mi_zone); 13759 /* 13760 * The client recovery code wants the raw status information, 13761 * so don't map the NFS status code to an errno value for 13762 * non-normal call types. 
13763 */ 13764 if (ctype == NFS4_LCK_CTYPE_NORM) { 13765 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13766 *errorp = geterrno4(resp->status); 13767 if (did_start_fop == TRUE) 13768 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13769 needrecov); 13770 13771 if (!error && resp && resp->status == NFS4_OK) { 13772 /* 13773 * We've established a new lock on the server, so invalidate 13774 * the pages associated with the vnode to get the most up to 13775 * date pages from the server after acquiring the lock. We 13776 * want to be sure that the read operation gets the newest data. 13777 * N.B. 13778 * We used to do this in nfs4frlock_results_ok but that doesn't 13779 * work since VOP_PUTPAGE can call nfs4_commit which calls 13780 * nfs4_start_fop. We flush the pages below after calling 13781 * nfs4_end_fop above 13782 */ 13783 int error; 13784 13785 error = VOP_PUTPAGE(vp, (u_offset_t)0, 13786 0, B_INVAL, cred); 13787 13788 if (error && (error == ENOSPC || error == EDQUOT)) { 13789 rnode4_t *rp = VTOR4(vp); 13790 13791 mutex_enter(&rp->r_statelock); 13792 if (!rp->r_error) 13793 rp->r_error = error; 13794 mutex_exit(&rp->r_statelock); 13795 } 13796 } 13797 } 13798 if (argsp) { 13799 ASSERT(argsp->array_len == 2); 13800 argop = argsp->array; 13801 if (argop[1].argop == OP_LOCK) 13802 nfs4args_lock_free(&argop[1]); 13803 else if (argop[1].argop == OP_LOCKT) 13804 nfs4args_lockt_free(&argop[1]); 13805 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13806 if (resp) 13807 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 13808 } 13809 13810 /* free the reference on the lock owner */ 13811 if (lop != NULL) { 13812 nfs4_end_lock_seqid_sync(lop); 13813 lock_owner_rele(lop); 13814 } 13815 13816 /* need to free up the reference on osp for lock args */ 13817 if (osp != NULL) 13818 open_stream_rele(osp, rp); 13819 13820 /* need to free up the reference on oop for lock args */ 13821 if (oop != NULL) { 13822 nfs4_end_open_seqid_sync(oop); 13823 open_owner_rele(oop); 
13824 } 13825 13826 (void) convoff(vp, flk, whence, offset); 13827 13828 lm_rel_sysid(ls); 13829 13830 /* 13831 * Record debug information in the event we get EINVAL. 13832 */ 13833 mutex_enter(&mi->mi_lock); 13834 if (*errorp == EINVAL && (lock_args || locku_args) && 13835 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 13836 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 13837 zcmn_err(getzoneid(), CE_NOTE, 13838 "%s operation failed with " 13839 "EINVAL probably since the server, %s," 13840 " doesn't support POSIX style locking", 13841 lock_args ? "LOCK" : "LOCKU", 13842 mi->mi_curr_serv->sv_hostname); 13843 mi->mi_flags |= MI4_LOCK_DEBUG; 13844 } 13845 } 13846 mutex_exit(&mi->mi_lock); 13847 13848 if (cred_otw) 13849 crfree(cred_otw); 13850 } 13851 13852 /* 13853 * This calls the server and the local locking code. 13854 * 13855 * Client locks are registerred locally by oring the sysid with 13856 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 13857 * We need to distinguish between the two to avoid collision in case one 13858 * machine is used as both client and server. 13859 * 13860 * Blocking lock requests will continually retry to acquire the lock 13861 * forever. 13862 * 13863 * The ctype is defined as follows: 13864 * NFS4_LCK_CTYPE_NORM: normal lock request. 13865 * 13866 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 13867 * recovery, get the pid from flk instead of curproc, and don't reregister 13868 * the lock locally. 13869 * 13870 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 13871 * that we will use the information passed in via resend_rqstp to setup the 13872 * lock/locku request. This resend is the exact same request as the 'lost 13873 * lock', and is initiated by the recovery framework. A successful resend 13874 * request can initiate one or more reinstate requests. 
13875 * 13876 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 13877 * does not trigger additional reinstate requests. This lock call type is 13878 * set for setting the v4 server's locking state back to match what the 13879 * client's local locking state is in the event of a received 'lost lock'. 13880 * 13881 * Errors are returned via the nfs4_error_t parameter. 13882 */ 13883 void 13884 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 13885 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 13886 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 13887 { 13888 COMPOUND4args_clnt args, *argsp = NULL; 13889 COMPOUND4res_clnt res, *resp = NULL; 13890 nfs_argop4 *argop; 13891 nfs_resop4 *resop; 13892 rnode4_t *rp; 13893 int doqueue = 1; 13894 clock_t tick_delay; /* delay in clock ticks */ 13895 struct lm_sysid *ls; 13896 LOCK4args *lock_args = NULL; 13897 LOCKU4args *locku_args = NULL; 13898 LOCKT4args *lockt_args = NULL; 13899 nfs4_open_owner_t *oop = NULL; 13900 nfs4_open_stream_t *osp = NULL; 13901 nfs4_lock_owner_t *lop = NULL; 13902 bool_t needrecov = FALSE; 13903 nfs4_recov_state_t recov_state; 13904 short whence; 13905 nfs4_op_hint_t op_hint; 13906 nfs4_lost_rqst_t lost_rqst; 13907 bool_t retry = FALSE; 13908 bool_t did_start_fop = FALSE; 13909 bool_t skip_get_err = FALSE; 13910 cred_t *cred_otw = NULL; 13911 bool_t recovonly; /* just queue request */ 13912 int frc_no_reclaim = 0; 13913 #ifdef DEBUG 13914 char *name; 13915 #endif 13916 13917 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13918 13919 #ifdef DEBUG 13920 name = fn_name(VTOSV(vp)->sv_name); 13921 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 13922 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 13923 "length %"PRIu64", pid %d, sysid %d, call type %s, " 13924 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 13925 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : 13926 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 13927 resend_rqstp ? "TRUE" : "FALSE")); 13928 kmem_free(name, MAXNAMELEN); 13929 #endif 13930 13931 nfs4_error_zinit(ep); 13932 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 13933 if (ep->error) 13934 return; 13935 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 13936 if (ep->error) 13937 return; 13938 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 13939 vp, cr, &cred_otw); 13940 13941 recov_retry: 13942 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 13943 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 13944 rp = VTOR4(vp); 13945 13946 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 13947 &did_start_fop, &recovonly); 13948 13949 if (ep->error) 13950 goto out; 13951 13952 if (recovonly) { 13953 /* 13954 * Leave the request for the recovery system to deal with. 13955 */ 13956 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13957 ASSERT(cmd != F_GETLK); 13958 ASSERT(flk->l_type == F_UNLCK); 13959 13960 nfs4_error_init(ep, EINTR); 13961 needrecov = TRUE; 13962 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 13963 if (lop != NULL) { 13964 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 13965 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 13966 (void) nfs4_start_recovery(ep, 13967 VTOMI4(vp), vp, NULL, NULL, 13968 (lost_rqst.lr_op == OP_LOCK || 13969 lost_rqst.lr_op == OP_LOCKU) ? 13970 &lost_rqst : NULL, OP_LOCKU, NULL); 13971 lock_owner_rele(lop); 13972 lop = NULL; 13973 } 13974 flk->l_pid = curproc->p_pid; 13975 nfs4_register_lock_locally(vp, flk, flag, offset); 13976 goto out; 13977 } 13978 13979 /* putfh directory fh */ 13980 argop[0].argop = OP_CPUTFH; 13981 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 13982 13983 /* 13984 * Set up the over-the-wire arguments and get references to the 13985 * open owner, etc. 
13986 */ 13987 13988 if (ctype == NFS4_LCK_CTYPE_RESEND || 13989 ctype == NFS4_LCK_CTYPE_REINSTATE) { 13990 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 13991 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 13992 } else { 13993 bool_t go_otw = TRUE; 13994 13995 ASSERT(resend_rqstp == NULL); 13996 13997 switch (cmd) { 13998 case F_GETLK: 13999 case F_O_GETLK: 14000 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14001 &lockt_args, argsp, flk, rp); 14002 break; 14003 case F_SETLKW: 14004 case F_SETLK: 14005 if (flk->l_type == F_UNLCK) 14006 nfs4frlock_setup_locku_args(ctype, 14007 &argop[1], &locku_args, flk, 14008 &lop, ep, argsp, 14009 vp, flag, offset, cr, 14010 &skip_get_err, &go_otw); 14011 else 14012 nfs4frlock_setup_lock_args(ctype, 14013 &lock_args, &oop, &osp, &lop, &argop[1], 14014 argsp, flk, cmd, vp, cr, ep); 14015 14016 if (ep->error) 14017 goto out; 14018 14019 switch (ep->stat) { 14020 case NFS4_OK: 14021 break; 14022 case NFS4ERR_DELAY: 14023 /* recov thread never gets this error */ 14024 ASSERT(resend_rqstp == NULL); 14025 ASSERT(did_start_fop); 14026 14027 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14028 &recov_state, TRUE); 14029 did_start_fop = FALSE; 14030 if (argop[1].argop == OP_LOCK) 14031 nfs4args_lock_free(&argop[1]); 14032 else if (argop[1].argop == OP_LOCKT) 14033 nfs4args_lockt_free(&argop[1]); 14034 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14035 argsp = NULL; 14036 goto recov_retry; 14037 default: 14038 ep->error = EIO; 14039 goto out; 14040 } 14041 break; 14042 default: 14043 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14044 "nfs4_frlock: invalid cmd %d", cmd)); 14045 ep->error = EINVAL; 14046 goto out; 14047 } 14048 14049 if (!go_otw) 14050 goto out; 14051 } 14052 14053 /* XXX should we use the local reclock as a cache ? */ 14054 /* 14055 * Unregister the lock with the local locking code before 14056 * contacting the server. 
This avoids a potential race where 14057 * another process gets notified that it has been granted a lock 14058 * before we can unregister ourselves locally. 14059 */ 14060 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14061 if (ctype == NFS4_LCK_CTYPE_NORM) 14062 flk->l_pid = ttoproc(curthread)->p_pid; 14063 nfs4_register_lock_locally(vp, flk, flag, offset); 14064 } 14065 14066 /* 14067 * Send the server the lock request. Continually loop with a delay 14068 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14069 */ 14070 resp = &res; 14071 14072 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14073 (CE_NOTE, 14074 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14075 rnode4info(rp))); 14076 14077 if (lock_args && frc_no_reclaim) { 14078 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14079 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14080 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14081 lock_args->reclaim = FALSE; 14082 if (did_reclaimp) 14083 *did_reclaimp = 0; 14084 } 14085 14086 /* 14087 * Do the OTW call. 
14088 */ 14089 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14090 14091 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14092 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14093 14094 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14095 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14096 "nfs4frlock: needrecov %d", needrecov)); 14097 14098 if (ep->error != 0 && !needrecov && ep->error != EACCES) 14099 goto out; 14100 14101 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14102 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14103 args.ctag); 14104 14105 if ((ep->error == EACCES || 14106 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14107 cred_otw != cr) { 14108 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14109 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14110 cr, &cred_otw); 14111 goto recov_retry; 14112 } 14113 14114 if (needrecov) { 14115 /* 14116 * LOCKT requests don't need to recover from lost 14117 * requests since they don't create/modify state. 14118 */ 14119 if ((ep->error == EINTR || 14120 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14121 lockt_args) 14122 goto out; 14123 /* 14124 * Do not attempt recovery for requests initiated by 14125 * the recovery framework. Let the framework redrive them. 14126 */ 14127 if (ctype != NFS4_LCK_CTYPE_NORM) 14128 goto out; 14129 else { 14130 ASSERT(resend_rqstp == NULL); 14131 } 14132 14133 nfs4frlock_save_lost_rqst(ctype, ep->error, 14134 flk_to_locktype(cmd, flk->l_type), 14135 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14136 14137 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14138 &resp, lock_args, locku_args, &oop, &osp, &lop, 14139 rp, vp, &recov_state, op_hint, &did_start_fop, 14140 cmd != F_GETLK ? 
&lost_rqst : NULL, flk); 14141 14142 if (retry) { 14143 ASSERT(oop == NULL); 14144 ASSERT(osp == NULL); 14145 ASSERT(lop == NULL); 14146 goto recov_retry; 14147 } 14148 goto out; 14149 } 14150 14151 /* 14152 * Process the reply. 14153 */ 14154 switch (resp->status) { 14155 case NFS4_OK: 14156 resop = &resp->array[1]; 14157 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14158 resend_rqstp); 14159 /* 14160 * Have a successful lock operation, now update state. 14161 */ 14162 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14163 resop, lop, vp, flk, cr, resend_rqstp); 14164 break; 14165 14166 case NFS4ERR_DENIED: 14167 resop = &resp->array[1]; 14168 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14169 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14170 &recov_state, needrecov, &argsp, &resp, 14171 &tick_delay, &whence, &ep->error, resop, cr, 14172 &did_start_fop, &skip_get_err); 14173 14174 if (retry) { 14175 ASSERT(oop == NULL); 14176 ASSERT(osp == NULL); 14177 ASSERT(lop == NULL); 14178 goto recov_retry; 14179 } 14180 break; 14181 /* 14182 * If the server won't let us reclaim, fall-back to trying to lock 14183 * the file from scratch. Code elsewhere will check the changeinfo 14184 * to ensure the file hasn't been changed. 14185 */ 14186 case NFS4ERR_NO_GRACE: 14187 if (lock_args && lock_args->reclaim == TRUE) { 14188 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14189 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14190 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14191 frc_no_reclaim = 1; 14192 /* clean up before retrying */ 14193 needrecov = 0; 14194 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14195 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14196 &recov_state, op_hint, &did_start_fop, NULL, flk); 14197 goto recov_retry; 14198 } 14199 /* FALLTHROUGH */ 14200 14201 default: 14202 nfs4frlock_results_default(resp, &ep->error); 14203 break; 14204 } 14205 out: 14206 /* 14207 * Process and cleanup from error. 
Make interrupted unlock 14208 * requests look successful, since they will be handled by the 14209 * client recovery code. 14210 */ 14211 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14212 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14213 lock_args, locku_args, did_start_fop, 14214 skip_get_err, cred_otw, cr); 14215 14216 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14217 (cmd == F_SETLK || cmd == F_SETLKW)) 14218 ep->error = 0; 14219 } 14220 14221 /* 14222 * nfs4_safelock: 14223 * 14224 * Return non-zero if the given lock request can be handled without 14225 * violating the constraints on concurrent mapping and locking. 14226 */ 14227 14228 static int 14229 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14230 { 14231 rnode4_t *rp = VTOR4(vp); 14232 struct vattr va; 14233 int error; 14234 14235 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14236 ASSERT(rp->r_mapcnt >= 0); 14237 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14238 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14239 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14240 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14241 14242 if (rp->r_mapcnt == 0) 14243 return (1); /* always safe if not mapped */ 14244 14245 /* 14246 * If the file is already mapped and there are locks, then they 14247 * should be all safe locks. So adding or removing a lock is safe 14248 * as long as the new request is safe (i.e., whole-file, meaning 14249 * length and starting offset are both zero). 
14250 */ 14251 14252 if (bfp->l_start != 0 || bfp->l_len != 0) { 14253 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14254 "cannot lock a memory mapped file unless locking the " 14255 "entire file: start %"PRIx64", len %"PRIx64, 14256 bfp->l_start, bfp->l_len)); 14257 return (0); 14258 } 14259 14260 /* mandatory locking and mapping don't mix */ 14261 va.va_mask = AT_MODE; 14262 error = VOP_GETATTR(vp, &va, 0, cr); 14263 if (error != 0) { 14264 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14265 "getattr error %d", error)); 14266 return (0); /* treat errors conservatively */ 14267 } 14268 if (MANDLOCK(vp, va.va_mode)) { 14269 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14270 "cannot mandatory lock and mmap a file")); 14271 return (0); 14272 } 14273 14274 return (1); 14275 } 14276 14277 14278 /* 14279 * Register the lock locally within Solaris. 14280 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14281 * recording locks locally. 14282 * 14283 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14284 * are registered locally. 
14285 */ 14286 void 14287 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14288 u_offset_t offset) 14289 { 14290 int oldsysid; 14291 int error; 14292 #ifdef DEBUG 14293 char *name; 14294 #endif 14295 14296 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14297 14298 #ifdef DEBUG 14299 name = fn_name(VTOSV(vp)->sv_name); 14300 NFS4_DEBUG(nfs4_client_lock_debug, 14301 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14302 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14303 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14304 flk->l_sysid)); 14305 kmem_free(name, MAXNAMELEN); 14306 #endif 14307 14308 /* register the lock with local locking */ 14309 oldsysid = flk->l_sysid; 14310 flk->l_sysid |= LM_SYSID_CLIENT; 14311 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14312 #ifdef DEBUG 14313 if (error != 0) { 14314 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14315 "nfs4_register_lock_locally: could not register with" 14316 " local locking")); 14317 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14318 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14319 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14320 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14321 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14322 flk->l_type, flk->l_start, flk->l_len)); 14323 (void) reclock(vp, flk, 0, flag, offset, NULL); 14324 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14325 "blocked by pid %d sysid 0x%x type %d " 14326 "off 0x%" PRIx64 " len 0x%" PRIx64, 14327 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14328 flk->l_len)); 14329 } 14330 #endif 14331 flk->l_sysid = oldsysid; 14332 } 14333 14334 /* 14335 * nfs4_lockrelease: 14336 * 14337 * Release any locks on the given vnode that are held by the current 14338 * process. Also removes the lock owner (if one exists) from the rnode's 14339 * list. 
14340 */ 14341 static int 14342 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14343 { 14344 flock64_t ld; 14345 int ret, error; 14346 rnode4_t *rp; 14347 nfs4_lock_owner_t *lop; 14348 nfs4_recov_state_t recov_state; 14349 mntinfo4_t *mi; 14350 bool_t possible_orphan = FALSE; 14351 bool_t recovonly; 14352 14353 ASSERT((uintptr_t)vp > KERNELBASE); 14354 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14355 14356 rp = VTOR4(vp); 14357 mi = VTOMI4(vp); 14358 14359 /* 14360 * If we have not locked anything then we can 14361 * just return since we have no work to do. 14362 */ 14363 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14364 return (0); 14365 } 14366 14367 /* 14368 * We need to comprehend that another thread may 14369 * kick off recovery and the lock_owner we have stashed 14370 * in lop might be invalid so we should NOT cache it 14371 * locally! 14372 */ 14373 recov_state.rs_flags = 0; 14374 recov_state.rs_num_retry_despite_err = 0; 14375 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14376 &recovonly); 14377 if (error) { 14378 mutex_enter(&rp->r_statelock); 14379 rp->r_flags |= R4LODANGLERS; 14380 mutex_exit(&rp->r_statelock); 14381 return (error); 14382 } 14383 14384 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14385 14386 /* 14387 * Check if the lock owner might have a lock (request was sent but 14388 * no response was received). Also check if there are any remote 14389 * locks on the file. (In theory we shouldn't have to make this 14390 * second check if there's no lock owner, but for now we'll be 14391 * conservative and do it anyway.) If either condition is true, 14392 * send an unlock for the entire file to the server. 14393 * 14394 * Note that no explicit synchronization is needed here. At worst, 14395 * flk_has_remote_locks() will return a false positive, in which case 14396 * the unlock call wastes time but doesn't harm correctness. 
14397 */ 14398 14399 if (lop) { 14400 mutex_enter(&lop->lo_lock); 14401 possible_orphan = lop->lo_pending_rqsts; 14402 mutex_exit(&lop->lo_lock); 14403 lock_owner_rele(lop); 14404 } 14405 14406 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14407 14408 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14409 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14410 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14411 (void *)lop)); 14412 14413 if (possible_orphan || flk_has_remote_locks(vp)) { 14414 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14415 ld.l_whence = 0; /* unlock from start of file */ 14416 ld.l_start = 0; 14417 ld.l_len = 0; /* do entire file */ 14418 14419 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 14420 14421 if (ret != 0) { 14422 /* 14423 * If VOP_FRLOCK fails, make sure we unregister 14424 * local locks before we continue. 14425 */ 14426 ld.l_pid = ttoproc(curthread)->p_pid; 14427 nfs4_register_lock_locally(vp, &ld, flag, offset); 14428 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14429 "nfs4_lockrelease: lock release error on vp" 14430 " %p: error %d.\n", (void *)vp, ret)); 14431 } 14432 } 14433 14434 recov_state.rs_flags = 0; 14435 recov_state.rs_num_retry_despite_err = 0; 14436 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14437 &recovonly); 14438 if (error) { 14439 mutex_enter(&rp->r_statelock); 14440 rp->r_flags |= R4LODANGLERS; 14441 mutex_exit(&rp->r_statelock); 14442 return (error); 14443 } 14444 14445 /* 14446 * So, here we're going to need to retrieve the lock-owner 14447 * again (in case recovery has done a switch-a-roo) and 14448 * remove it because we can. 14449 */ 14450 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14451 14452 if (lop) { 14453 nfs4_rnode_remove_lock_owner(rp, lop); 14454 lock_owner_rele(lop); 14455 } 14456 14457 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14458 return (0); 14459 } 14460 14461 /* 14462 * Wait for 'tick_delay' clock ticks. 
14463 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14464 * NOTE: lock_lease_time is in seconds. 14465 * 14466 * XXX For future improvements, should implement a waiting queue scheme. 14467 */ 14468 static int 14469 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14470 { 14471 long milliseconds_delay; 14472 time_t lock_lease_time; 14473 14474 /* wait tick_delay clock ticks or siginteruptus */ 14475 if (delay_sig(*tick_delay)) { 14476 return (EINTR); 14477 } 14478 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14479 "reissue the lock request: blocked for %ld clock ticks: %ld " 14480 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14481 14482 /* get the lease time */ 14483 lock_lease_time = r2lease_time(rp); 14484 14485 /* drv_hztousec converts ticks to microseconds */ 14486 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14487 if (milliseconds_delay < lock_lease_time * 1000) { 14488 *tick_delay = 2 * *tick_delay; 14489 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14490 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14491 } 14492 return (0); 14493 } 14494 14495 14496 void 14497 nfs4_vnops_init(void) 14498 { 14499 } 14500 14501 void 14502 nfs4_vnops_fini(void) 14503 { 14504 } 14505 14506 /* 14507 * Return a reference to the directory (parent) vnode for a given vnode, 14508 * using the saved pathname information and the directory file handle. The 14509 * caller is responsible for disposing of the reference. 14510 * Returns zero or an errno value. 14511 * 14512 * Caller should set need_start_op to FALSE if it is the recovery 14513 * thread, or if a start_fop has already been done. Otherwise, TRUE. 
14514 */ 14515 int 14516 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14517 { 14518 svnode_t *svnp; 14519 vnode_t *dvp = NULL; 14520 servinfo4_t *svp; 14521 nfs4_fname_t *mfname; 14522 int error; 14523 14524 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14525 14526 if (vp->v_flag & VROOT) { 14527 nfs4_sharedfh_t *sfh; 14528 nfs_fh4 fh; 14529 mntinfo4_t *mi; 14530 14531 ASSERT(vp->v_type == VREG); 14532 14533 mi = VTOMI4(vp); 14534 svp = mi->mi_curr_serv; 14535 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14536 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14537 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14538 sfh = sfh4_get(&fh, VTOMI4(vp)); 14539 nfs_rw_exit(&svp->sv_lock); 14540 mfname = mi->mi_fname; 14541 fn_hold(mfname); 14542 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14543 sfh4_rele(&sfh); 14544 14545 if (dvp->v_type == VNON) 14546 dvp->v_type = VDIR; 14547 *dvpp = dvp; 14548 return (0); 14549 } 14550 14551 svnp = VTOSV(vp); 14552 14553 if (svnp == NULL) { 14554 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14555 "shadow node is NULL")); 14556 return (EINVAL); 14557 } 14558 14559 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14560 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14561 "shadow node name or dfh val == NULL")); 14562 return (EINVAL); 14563 } 14564 14565 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14566 (int)need_start_op); 14567 if (error != 0) { 14568 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14569 "nfs4_make_dotdot returned %d", error)); 14570 return (error); 14571 } 14572 if (!dvp) { 14573 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14574 "nfs4_make_dotdot returned a NULL dvp")); 14575 return (EIO); 14576 } 14577 if (dvp->v_type == VNON) 14578 dvp->v_type = VDIR; 14579 ASSERT(dvp->v_type == VDIR); 14580 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14581 mutex_enter(&dvp->v_lock); 14582 dvp->v_flag |= V_XATTRDIR; 14583 
mutex_exit(&dvp->v_lock); 14584 } 14585 *dvpp = dvp; 14586 return (0); 14587 } 14588 14589 /* 14590 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14591 * length that fnamep can accept, including the trailing null. 14592 * Returns 0 if okay, returns an errno value if there was a problem. 14593 */ 14594 14595 int 14596 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14597 { 14598 char *fn; 14599 int err = 0; 14600 servinfo4_t *svp; 14601 svnode_t *shvp; 14602 14603 /* 14604 * If the file being opened has VROOT set, then this is 14605 * a "file" mount. sv_name will not be interesting, so 14606 * go back to the servinfo4 to get the original mount 14607 * path and strip off all but the final edge. Otherwise 14608 * just return the name from the shadow vnode. 14609 */ 14610 14611 if (vp->v_flag & VROOT) { 14612 14613 svp = VTOMI4(vp)->mi_curr_serv; 14614 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14615 14616 fn = strrchr(svp->sv_path, '/'); 14617 if (fn == NULL) 14618 err = EINVAL; 14619 else 14620 fn++; 14621 } else { 14622 shvp = VTOSV(vp); 14623 fn = fn_name(shvp->sv_name); 14624 } 14625 14626 if (err == 0) 14627 if (strlen(fn) < maxlen) 14628 (void) strcpy(fnamep, fn); 14629 else 14630 err = ENAMETOOLONG; 14631 14632 if (vp->v_flag & VROOT) 14633 nfs_rw_exit(&svp->sv_lock); 14634 else 14635 kmem_free(fn, MAXNAMELEN); 14636 14637 return (err); 14638 } 14639 14640 /* 14641 * Bookkeeping for a close that doesn't need to go over the wire. 14642 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14643 * it is left at 1. 
*/
/*
 * The caller must hold osp->os_sync_lock on entry.  Nothing is changed
 * (and the lock stays held) if the open stream is already invalid or
 * still has open or mmap references.  Otherwise the stream is marked
 * invalid, the reference taken at OPEN is dropped, 'os_sync_lock' is
 * released and nfs4_dec_state_ref_count() is called for the mount.
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* still in use, or already torn down: leave everything as is */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}

/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 *
 * Returns 0, or the first error reported by nfs4close_one() (its errno,
 * or its NFSv4 status converted with geterrno4()).  Later streams are
 * still processed after an error.
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.  We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams.  The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		/* no access bits, no lost request, no mmap arguments */
		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}

/*
 * nfs4close_one - close one open stream for a file if needed.
 *
 * "close_type" indicates which close path this is:
 * CLOSE_NORM: close initiated via VOP_CLOSE.
 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
 * CLOSE_FORCE: close initiated via VOP_INACTIVE.  This path forces
 *	the close and release of client state for this open stream
 *	(unless someone else has the open stream open).
 * CLOSE_RESEND: indicates the request is a replay of an earlier request
 *	(e.g., due to abort because of a signal).
 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
*
 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
 * recovery.  Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is
 * the number of bytes we are unmapping, 'maxprot' is the mmap protection,
 * and 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * 'lrp' is the lost request being replayed, if any; it is passed on to
 * nfs4_open_downgrade() and must be NULL when start_fop reports
 * recovery-only mode.
 *
 * Errors are returned via the nfs4_error_t.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	int did_dec_count = 0;	/* survives retries: counts drop only once */
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
recov_retry:
	/* reset all per-attempt state; did_dec_count is deliberately kept */
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes.  Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		/* all mmap counters are kept in pages, hence btopr(len) */
		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		/* hand the lost request over only if one was recorded */
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL);
		close_failed = 1;
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too.  So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If the force close path failed to obtain start_fop
	 * then skip the OTW close and just remove the state.
	 * (close_failed may also have been set just above for a
	 * changed open owner name or a file that failed recovery.)
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream.  If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t new_lost_rqst;
		bool_t needrecov = FALSE;
		cred_t *odg_cred_otw = NULL;
		seqid4 open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			/* drop everything taken this attempt, then retry */
			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close.  Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery.  In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		/* release everything taken this attempt before looping */
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		/* os_pending_close is only written under os_sync_lock */
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	/* common exit: undo whatever the flags say is still held */
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}

/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 *
 * lockt_denied - the conflicting lock as reported by the server
 * flk          - filled in with the conflicting lock's type, range and pid
 * lockt_args   - the LOCKT arguments we sent; only owner.clientid is
 *                consulted, to tell our own locks from foreign ones
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	/* anything that is not a READ_LT is reported as a write lock */
	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it).  If not, then
	 * fabricate a sysid and pid.  Note that the l_sysid field
	 * in *flk already has the local sysid.
	 */

	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
			lo = (nfs4_lo_name_t *)
			    lockt_denied->owner.owner_val;

			flk->l_pid = lo->ln_pid;
		} else {
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "denied_to_flk: bad lock owner length\n"));

			flk->l_pid = lo_to_pid(&lockt_denied->owner);
		}
	} else {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "denied_to_flk: foreign clientid\n"));

		/*
		 * Construct a new sysid which should be different from
		 * sysids of other systems.
		 */

		flk->l_sysid++;
		flk->l_pid = lo_to_pid(&lockt_denied->owner);
	}
}

/*
 * Fabricate a pid for a lock owner we cannot interpret: the byte-wise
 * sum of the owner's clientid followed by the bytes of its owner_val.
 */
static pid_t
lo_to_pid(lock_owner4 *lop)
{
	pid_t pid = 0;
	uchar_t *cp;
	int i;

	cp = (uchar_t *)&lop->clientid;

	for (i = 0; i < sizeof (lop->clientid); i++)
		pid += (pid_t)*cp++;

	cp = (uchar_t *)lop->owner_val;

	for (i = 0; i < lop->owner_len; i++)
		pid += (pid_t)*cp++;

	return (pid);
}

/*
 * Given a lock pointer, returns the end of that lock.
 * "end" is the last locked offset the "l_len" covers from
 * the start of the lock.  An l_len of zero yields MAXEND.
 */
static off64_t
lock_to_end(flock64_t *lock)
{
	off64_t lock_end;

	if (lock->l_len == 0)
		lock_end = (off64_t)MAXEND;
	else
		lock_end = lock->l_start + lock->l_len - 1;

	return (lock_end);
}

/*
 * Given the end of a lock, it will return you the length "l_len" for that lock.
15420 */ 15421 static off64_t 15422 end_to_len(off64_t start, off64_t end) 15423 { 15424 off64_t lock_len; 15425 15426 ASSERT(end >= start); 15427 if (end == MAXEND) 15428 lock_len = 0; 15429 else 15430 lock_len = end - start + 1; 15431 15432 return (lock_len); 15433 } 15434 15435 /* 15436 * On given end for a lock it determines if it is the last locked offset 15437 * or not, if so keeps it as is, else adds one to return the length for 15438 * valid start. 15439 */ 15440 static off64_t 15441 start_check(off64_t x) 15442 { 15443 if (x == MAXEND) 15444 return (x); 15445 else 15446 return (x + 1); 15447 } 15448 15449 /* 15450 * See if these two locks overlap, and if so return 1; 15451 * otherwise, return 0. 15452 */ 15453 static int 15454 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15455 { 15456 off64_t llfp_end, curfp_end; 15457 15458 llfp_end = lock_to_end(llfp); 15459 curfp_end = lock_to_end(curfp); 15460 15461 if (((llfp_end >= curfp->l_start) && 15462 (llfp->l_start <= curfp->l_start)) || 15463 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15464 return (1); 15465 return (0); 15466 } 15467 15468 /* 15469 * Determine what the interseting lock region is, and add that to the 15470 * 'nl_llpp' locklist in increasing order (by l_start). 
15471 */ 15472 static void 15473 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15474 locklist_t **nl_llpp, vnode_t *vp) 15475 { 15476 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15477 off64_t lost_flp_end, local_flp_end, len, start; 15478 15479 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15480 15481 if (!locks_intersect(lost_flp, local_flp)) 15482 return; 15483 15484 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15485 "locks intersect")); 15486 15487 lost_flp_end = lock_to_end(lost_flp); 15488 local_flp_end = lock_to_end(local_flp); 15489 15490 /* Find the starting point of the intersecting region */ 15491 if (local_flp->l_start > lost_flp->l_start) 15492 start = local_flp->l_start; 15493 else 15494 start = lost_flp->l_start; 15495 15496 /* Find the lenght of the intersecting region */ 15497 if (lost_flp_end < local_flp_end) 15498 len = end_to_len(start, lost_flp_end); 15499 else 15500 len = end_to_len(start, local_flp_end); 15501 15502 /* 15503 * Prepare the flock structure for the intersection found and insert 15504 * it into the new list in increasing l_start order. This list contains 15505 * intersections of locks registered by the client with the local host 15506 * and the lost lock. 15507 * The lock type of this lock is the same as that of the local_flp. 
15508 */ 15509 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15510 intersect_llp->ll_flock.l_start = start; 15511 intersect_llp->ll_flock.l_len = len; 15512 intersect_llp->ll_flock.l_type = local_flp->l_type; 15513 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15514 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15515 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15516 intersect_llp->ll_vp = vp; 15517 15518 tmp_fllp = *nl_llpp; 15519 cur_fllp = NULL; 15520 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15521 intersect_llp->ll_flock.l_start) { 15522 cur_fllp = tmp_fllp; 15523 tmp_fllp = tmp_fllp->ll_next; 15524 } 15525 if (cur_fllp == NULL) { 15526 /* first on the list */ 15527 intersect_llp->ll_next = *nl_llpp; 15528 *nl_llpp = intersect_llp; 15529 } else { 15530 intersect_llp->ll_next = cur_fllp->ll_next; 15531 cur_fllp->ll_next = intersect_llp; 15532 } 15533 15534 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15535 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15536 intersect_llp->ll_flock.l_start, 15537 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15538 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15539 } 15540 15541 /* 15542 * Our local locking current state is potentially different than 15543 * what the NFSv4 server thinks we have due to a lost lock that was 15544 * resent and then received. We need to reset our "NFSv4" locking 15545 * state to match the current local locking state for this pid since 15546 * that is what the user/application sees as what the world is. 15547 * 15548 * We cannot afford to drop the open/lock seqid sync since then we can 15549 * get confused about what the current local locking state "is" versus 15550 * "was". 15551 * 15552 * If we are unable to fix up the locks, we send SIGLOST to the affected 15553 * process. 
This is not done if the filesystem has been forcibly 15554 * unmounted, in case the process has already exited and a new process 15555 * exists with the same pid. 15556 */ 15557 static void 15558 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15559 nfs4_lock_owner_t *lop) 15560 { 15561 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15562 mntinfo4_t *mi = VTOMI4(vp); 15563 const int cmd = F_SETLK; 15564 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15565 flock64_t ul_fl; 15566 15567 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15568 "nfs4_reinstitute_local_lock_state")); 15569 15570 /* 15571 * Find active locks for this vp from the local locking code. 15572 * Scan through this list and find out the locks that intersect with 15573 * the lost lock. Once we find the lock that intersects, add the 15574 * intersection area as a new lock to a new list "ri_llp". The lock 15575 * type of the intersection region lock added to ri_llp is the same 15576 * as that found in the active lock list, "list". The intersecting 15577 * region locks are added to ri_llp in increasing l_start order. 15578 */ 15579 ASSERT(nfs_zone() == mi->mi_zone); 15580 15581 locks = flk_active_locks_for_vp(vp); 15582 ri_llp = NULL; 15583 15584 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15585 ASSERT(llp->ll_vp == vp); 15586 /* 15587 * Pick locks that belong to this pid/lockowner 15588 */ 15589 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15590 continue; 15591 15592 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15593 } 15594 15595 /* 15596 * Now we have the list of intersections with the lost lock. These are 15597 * the locks that were/are active before the server replied to the 15598 * last/lost lock. Issue these locks to the server here. Playing these 15599 * locks to the server will re-establish aur current local locking state 15600 * with the v4 server. 15601 * If we get an error, send SIGLOST to the application for that lock. 
15602 */ 15603 15604 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15605 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15606 "nfs4_reinstitute_local_lock_state: need to issue " 15607 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15608 llp->ll_flock.l_start, 15609 llp->ll_flock.l_start + llp->ll_flock.l_len, 15610 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15611 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15612 /* 15613 * No need to relock what we already have 15614 */ 15615 if (llp->ll_flock.l_type == lost_flp->l_type) 15616 continue; 15617 15618 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15619 } 15620 15621 /* 15622 * Now keeping the start of the lost lock as our reference parse the 15623 * newly created ri_llp locklist to find the ranges that we have locked 15624 * with the v4 server but not in the current local locking. We need 15625 * to unlock these ranges. 15626 * These ranges can also be reffered to as those ranges, where the lost 15627 * lock does not overlap with the locks in the ri_llp but are locked 15628 * since the server replied to the lost lock. 
15629 */ 15630 cur_start = lost_flp->l_start; 15631 lost_flp_end = lock_to_end(lost_flp); 15632 15633 ul_fl.l_type = F_UNLCK; 15634 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15635 ul_fl.l_sysid = lost_flp->l_sysid; 15636 ul_fl.l_pid = lost_flp->l_pid; 15637 15638 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15639 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15640 15641 if (llp->ll_flock.l_start <= cur_start) { 15642 cur_start = start_check(llp_ll_flock_end); 15643 continue; 15644 } 15645 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15646 "nfs4_reinstitute_local_lock_state: " 15647 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15648 cur_start, llp->ll_flock.l_start)); 15649 15650 ul_fl.l_start = cur_start; 15651 ul_fl.l_len = end_to_len(cur_start, 15652 (llp->ll_flock.l_start - 1)); 15653 15654 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15655 cur_start = start_check(llp_ll_flock_end); 15656 } 15657 15658 /* 15659 * In the case where the lost lock ends after all intersecting locks, 15660 * unlock the last part of the lost lock range. 15661 */ 15662 if (cur_start != start_check(lost_flp_end)) { 15663 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15664 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15665 "lost lock region [%"PRIx64" - %"PRIx64"]", 15666 cur_start, lost_flp->l_start + lost_flp->l_len)); 15667 15668 ul_fl.l_start = cur_start; 15669 /* 15670 * Is it an to-EOF lock? 
if so unlock till the end 15671 */ 15672 if (lost_flp->l_len == 0) 15673 ul_fl.l_len = 0; 15674 else 15675 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15676 15677 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15678 } 15679 15680 if (locks != NULL) 15681 flk_free_locklist(locks); 15682 15683 /* Free up our newly created locklist */ 15684 for (llp = ri_llp; llp != NULL; ) { 15685 tmp_llp = llp->ll_next; 15686 kmem_free(llp, sizeof (locklist_t)); 15687 llp = tmp_llp; 15688 } 15689 15690 /* 15691 * Now return back to the original calling nfs4frlock() 15692 * and let us naturally drop our seqid syncs. 15693 */ 15694 } 15695 15696 /* 15697 * Create a lost state record for the given lock reinstantiation request 15698 * and push it onto the lost state queue. 15699 */ 15700 static void 15701 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15702 nfs4_lock_owner_t *lop) 15703 { 15704 nfs4_lost_rqst_t req; 15705 nfs_lock_type4 locktype; 15706 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15707 15708 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15709 15710 locktype = flk_to_locktype(cmd, flk->l_type); 15711 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15712 NULL, NULL, lop, flk, &req, cr, vp); 15713 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15714 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15715 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15716 NULL); 15717 } 15718