1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/cred.h> 37 #include <sys/time.h> 38 #include <sys/vnode.h> 39 #include <sys/vfs.h> 40 #include <sys/file.h> 41 #include <sys/filio.h> 42 #include <sys/uio.h> 43 #include <sys/buf.h> 44 #include <sys/mman.h> 45 #include <sys/pathname.h> 46 #include <sys/dirent.h> 47 #include <sys/debug.h> 48 #include <sys/vmsystm.h> 49 #include <sys/fcntl.h> 50 #include <sys/flock.h> 51 #include <sys/swap.h> 52 #include <sys/errno.h> 53 #include <sys/strsubr.h> 54 #include <sys/sysmacros.h> 55 #include <sys/kmem.h> 56 #include <sys/cmn_err.h> 57 #include <sys/pathconf.h> 58 #include <sys/utsname.h> 59 #include <sys/dnlc.h> 60 #include <sys/acl.h> 61 #include <sys/systeminfo.h> 62 #include <sys/policy.h> 63 #include <sys/sdt.h> 64 #include <sys/list.h> 65 #include <sys/stat.h> 66 67 #include <rpc/types.h> 68 #include <rpc/auth.h> 69 #include <rpc/clnt.h> 70 71 #include <nfs/nfs.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/nfs_acl.h> 74 #include <nfs/lm.h> 75 #include <nfs/nfs4.h> 76 #include <nfs/nfs4_kprot.h> 77 #include <nfs/rnode4.h> 78 #include <nfs/nfs4_clnt.h> 79 80 #include <vm/hat.h> 81 #include <vm/as.h> 82 #include <vm/page.h> 83 #include <vm/pvn.h> 84 #include <vm/seg.h> 85 #include <vm/seg_map.h> 86 #include <vm/seg_kpm.h> 87 #include <vm/seg_vn.h> 88 89 #include <fs/fs_subr.h> 90 91 #include <sys/ddi.h> 92 #include <sys/int_fmtio.h> 93 94 typedef struct { 95 nfs4_ga_res_t *di_garp; 96 cred_t *di_cred; 97 hrtime_t di_time_call; 98 } dirattr_info_t; 99 100 typedef enum nfs4_acl_op { 101 NFS4_ACL_GET, 102 NFS4_ACL_SET 103 } nfs4_acl_op_t; 104 105 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 106 char *, dirattr_info_t *); 107 108 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 109 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 110 nfs4_error_t *, 
int *); 111 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 112 cred_t *); 113 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 114 stable_how4 *); 115 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 116 cred_t *, bool_t, struct uio *); 117 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 118 vsecattr_t *); 119 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 120 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 121 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 122 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 123 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 124 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 125 int, vnode_t **, cred_t *); 126 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 127 cred_t *, int, int, enum createmode4, int); 128 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 129 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 130 vnode_t *, char *, cred_t *, nfsstat4 *); 131 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 132 vnode_t *, char *, cred_t *, nfsstat4 *); 133 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 134 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 135 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 136 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 137 page_t *[], size_t, struct seg *, caddr_t, 138 enum seg_rw, cred_t *); 139 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 140 cred_t *); 141 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 142 int, cred_t *); 143 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 144 int, cred_t *); 145 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 146 static 
void nfs4_set_mod(vnode_t *); 147 static void nfs4_get_commit(vnode_t *); 148 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 149 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 150 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 151 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 152 cred_t *); 153 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 154 cred_t *); 155 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 156 hrtime_t, vnode_t *, cred_t *); 157 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 158 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 159 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 160 u_offset_t); 161 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 162 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 163 static cred_t *state_to_cred(nfs4_open_stream_t *); 164 static int vtoname(vnode_t *, char *, ssize_t); 165 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 166 static pid_t lo_to_pid(lock_owner4 *); 167 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 168 cred_t *, nfs4_lock_owner_t *); 169 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 170 nfs4_lock_owner_t *); 171 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 172 static void nfs4_delmap_callback(struct as *, void *, uint_t); 173 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 174 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 175 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 176 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 177 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 178 uid_t, gid_t, int); 179 180 /* 181 * Routines that implement the setting of v4 args for the misc. 
ops 182 */ 183 static void nfs4args_lock_free(nfs_argop4 *); 184 static void nfs4args_lockt_free(nfs_argop4 *); 185 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 186 int, rnode4_t *, cred_t *, bitmap4, int *, 187 nfs4_stateid_types_t *); 188 static void nfs4args_setattr_free(nfs_argop4 *); 189 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 190 bitmap4); 191 static void nfs4args_verify_free(nfs_argop4 *); 192 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 193 WRITE4args **, nfs4_stateid_types_t *); 194 195 /* 196 * These are the vnode ops functions that implement the vnode interface to 197 * the networked file system. See more comments below at nfs4_vnodeops. 198 */ 199 static int nfs4_open(vnode_t **, int, cred_t *); 200 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *); 201 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 202 caller_context_t *); 203 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 204 caller_context_t *); 205 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *); 206 static int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *); 207 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 208 caller_context_t *); 209 static int nfs4_access(vnode_t *, int, int, cred_t *); 210 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *); 211 static int nfs4_fsync(vnode_t *, int, cred_t *); 212 static void nfs4_inactive(vnode_t *, cred_t *); 213 static int nfs4_lookup(vnode_t *, char *, vnode_t **, 214 struct pathname *, int, vnode_t *, cred_t *); 215 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 216 int, vnode_t **, cred_t *, int); 217 static int nfs4_remove(vnode_t *, char *, cred_t *); 218 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *); 219 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 220 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, 
221 vnode_t **, cred_t *); 222 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *); 223 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 224 cred_t *); 225 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *); 226 static int nfs4_fid(vnode_t *, fid_t *); 227 static int nfs4_rwlock(vnode_t *, int, caller_context_t *); 228 static void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 229 static int nfs4_seek(vnode_t *, offset_t, offset_t *); 230 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 231 page_t *[], size_t, struct seg *, caddr_t, 232 enum seg_rw, cred_t *); 233 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *); 234 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, 235 size_t, uchar_t, uchar_t, uint_t, cred_t *); 236 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, 237 size_t, uchar_t, uchar_t, uint_t, cred_t *); 238 static int nfs4_cmp(vnode_t *, vnode_t *); 239 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 240 struct flk_callback *, cred_t *); 241 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 242 cred_t *, caller_context_t *); 243 static int nfs4_realvp(vnode_t *, vnode_t **); 244 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, 245 size_t, uint_t, uint_t, uint_t, cred_t *); 246 static int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *); 247 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 248 cred_t *); 249 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *); 250 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 251 static int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 252 static int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *); 253 254 /* 255 * Used for nfs4_commit_vp() to indicate if we should 256 * wait on pending writes. 
257 */ 258 #define NFS4_WRITE_NOWAIT 0 259 #define NFS4_WRITE_WAIT 1 260 261 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 262 263 /* 264 * Error flags used to pass information about certain special errors 265 * which need to be handled specially. 266 */ 267 #define NFS_EOF -98 268 #define NFS_VERF_MISMATCH -97 269 270 /* 271 * Flags used to differentiate between which operation drove the 272 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary) 273 */ 274 #define NFS4_CLOSE_OP 0x1 275 #define NFS4_DELMAP_OP 0x2 276 #define NFS4_INACTIVE_OP 0x3 277 278 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 279 280 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 281 #define ALIGN64(x, ptr, sz) \ 282 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 283 if (x) { \ 284 x = sizeof (uint64_t) - (x); \ 285 sz -= (x); \ 286 ptr += (x); \ 287 } 288 289 #ifdef DEBUG 290 int nfs4_client_attr_debug = 0; 291 int nfs4_client_state_debug = 0; 292 int nfs4_client_shadow_debug = 0; 293 int nfs4_client_lock_debug = 0; 294 int nfs4_seqid_sync = 0; 295 int nfs4_client_map_debug = 0; 296 static int nfs4_pageio_debug = 0; 297 int nfs4_client_inactive_debug = 0; 298 int nfs4_client_recov_debug = 0; 299 int nfs4_client_recov_stub_debug = 0; 300 int nfs4_client_failover_debug = 0; 301 int nfs4_client_call_debug = 0; 302 int nfs4_client_lookup_debug = 0; 303 int nfs4_client_zone_debug = 0; 304 int nfs4_lost_rqst_debug = 0; 305 int nfs4_rdattrerr_debug = 0; 306 int nfs4_open_stream_debug = 0; 307 308 int nfs4read_error_inject; 309 310 static int nfs4_create_misses = 0; 311 312 static int nfs4_readdir_cache_shorts = 0; 313 static int nfs4_readdir_readahead = 0; 314 315 static int nfs4_bio_do_stop = 0; 316 317 static int nfs4_lostpage = 0; /* number of times we lost original page */ 318 319 int nfs4_mmap_debug = 0; 320 321 static int nfs4_pathconf_cache_hits = 0; 322 static int nfs4_pathconf_cache_misses = 0; 323 324 int nfs4close_all_cnt; 325 int 
nfs4close_one_debug = 0; 326 int nfs4close_notw_debug = 0; 327 328 int denied_to_flk_debug = 0; 329 void *lockt_denied_debug; 330 331 #endif 332 333 /* 334 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 335 * or NFS4ERR_RESOURCE. 336 */ 337 static int confirm_retry_sec = 30; 338 339 static int nfs4_lookup_neg_cache = 1; 340 341 /* 342 * number of pages to read ahead 343 * optimized for 100 base-T. 344 */ 345 static int nfs4_nra = 4; 346 347 static int nfs4_do_symlink_cache = 1; 348 349 static int nfs4_pathconf_disable_cache = 0; 350 351 /* 352 * These are the vnode ops routines which implement the vnode interface to 353 * the networked file system. These routines just take their parameters, 354 * make them look networkish by putting the right info into interface structs, 355 * and then calling the appropriate remote routine(s) to do the work. 356 * 357 * Note on directory name lookup cacheing: If we detect a stale fhandle, 358 * we purge the directory cache relative to that vnode. This way, the 359 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 360 * more details on rnode locking. 
361 */ 362 363 struct vnodeops *nfs4_vnodeops; 364 365 const fs_operation_def_t nfs4_vnodeops_template[] = { 366 VOPNAME_OPEN, nfs4_open, 367 VOPNAME_CLOSE, nfs4_close, 368 VOPNAME_READ, nfs4_read, 369 VOPNAME_WRITE, nfs4_write, 370 VOPNAME_IOCTL, nfs4_ioctl, 371 VOPNAME_GETATTR, nfs4_getattr, 372 VOPNAME_SETATTR, nfs4_setattr, 373 VOPNAME_ACCESS, nfs4_access, 374 VOPNAME_LOOKUP, nfs4_lookup, 375 VOPNAME_CREATE, nfs4_create, 376 VOPNAME_REMOVE, nfs4_remove, 377 VOPNAME_LINK, nfs4_link, 378 VOPNAME_RENAME, nfs4_rename, 379 VOPNAME_MKDIR, nfs4_mkdir, 380 VOPNAME_RMDIR, nfs4_rmdir, 381 VOPNAME_READDIR, nfs4_readdir, 382 VOPNAME_SYMLINK, nfs4_symlink, 383 VOPNAME_READLINK, nfs4_readlink, 384 VOPNAME_FSYNC, nfs4_fsync, 385 VOPNAME_INACTIVE, (fs_generic_func_p) nfs4_inactive, 386 VOPNAME_FID, nfs4_fid, 387 VOPNAME_RWLOCK, nfs4_rwlock, 388 VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs4_rwunlock, 389 VOPNAME_SEEK, nfs4_seek, 390 VOPNAME_FRLOCK, nfs4_frlock, 391 VOPNAME_SPACE, nfs4_space, 392 VOPNAME_REALVP, nfs4_realvp, 393 VOPNAME_GETPAGE, nfs4_getpage, 394 VOPNAME_PUTPAGE, nfs4_putpage, 395 VOPNAME_MAP, (fs_generic_func_p) nfs4_map, 396 VOPNAME_ADDMAP, (fs_generic_func_p) nfs4_addmap, 397 VOPNAME_DELMAP, nfs4_delmap, 398 VOPNAME_DUMP, nfs_dump, /* there is no separate nfs4_dump */ 399 VOPNAME_PATHCONF, nfs4_pathconf, 400 VOPNAME_PAGEIO, nfs4_pageio, 401 VOPNAME_DISPOSE, (fs_generic_func_p) nfs4_dispose, 402 VOPNAME_SETSECATTR, nfs4_setsecattr, 403 VOPNAME_GETSECATTR, nfs4_getsecattr, 404 VOPNAME_SHRLOCK, nfs4_shrlock, 405 NULL, NULL 406 }; 407 408 /* 409 * The following are subroutines and definitions to set args or get res 410 * for the different nfsv4 ops 411 */ 412 413 void 414 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 415 { 416 int i; 417 418 for (i = 0; i < arglen; i++) { 419 if (argop[i].argop == OP_LOOKUP) 420 kmem_free( 421 argop[i].nfs_argop4_u.oplookup.objname.utf8string_val, 422 argop[i].nfs_argop4_u.oplookup.objname.utf8string_len); 423 } 424 } 425 426 
static void 427 nfs4args_lock_free(nfs_argop4 *argop) 428 { 429 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 430 431 if (locker->new_lock_owner == TRUE) { 432 open_to_lock_owner4 *open_owner; 433 434 open_owner = &locker->locker4_u.open_owner; 435 if (open_owner->lock_owner.owner_val != NULL) { 436 kmem_free(open_owner->lock_owner.owner_val, 437 open_owner->lock_owner.owner_len); 438 } 439 } 440 } 441 442 static void 443 nfs4args_lockt_free(nfs_argop4 *argop) 444 { 445 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 446 447 if (lowner->owner_val != NULL) { 448 kmem_free(lowner->owner_val, lowner->owner_len); 449 } 450 } 451 452 static void 453 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 454 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 455 nfs4_stateid_types_t *sid_types) 456 { 457 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 458 mntinfo4_t *mi; 459 460 argop->argop = OP_SETATTR; 461 /* 462 * The stateid is set to 0 if client is not modifying the size 463 * and otherwise to whatever nfs4_get_stateid() returns. 464 * 465 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 466 * state struct could be found for the process/file pair. We may 467 * want to change this in the future (by OPENing the file). See 468 * bug # 4474852. 
469 */ 470 if (vap->va_mask & AT_SIZE) { 471 472 ASSERT(rp != NULL); 473 mi = VTOMI4(RTOV4(rp)); 474 475 argop->nfs_argop4_u.opsetattr.stateid = 476 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 477 OP_SETATTR, sid_types, FALSE); 478 } else { 479 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 480 sizeof (stateid4)); 481 } 482 483 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 484 if (*error) 485 bzero(attr, sizeof (*attr)); 486 } 487 488 static void 489 nfs4args_setattr_free(nfs_argop4 *argop) 490 { 491 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 492 } 493 494 static int 495 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 496 bitmap4 supp) 497 { 498 fattr4 *attr; 499 int error = 0; 500 501 argop->argop = op; 502 switch (op) { 503 case OP_VERIFY: 504 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 505 break; 506 case OP_NVERIFY: 507 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 508 break; 509 default: 510 return (EINVAL); 511 } 512 if (!error) 513 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 514 if (error) 515 bzero(attr, sizeof (*attr)); 516 return (error); 517 } 518 519 static void 520 nfs4args_verify_free(nfs_argop4 *argop) 521 { 522 switch (argop->argop) { 523 case OP_VERIFY: 524 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 525 break; 526 case OP_NVERIFY: 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 528 break; 529 default: 530 break; 531 } 532 } 533 534 static void 535 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 536 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 537 { 538 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 539 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 540 541 argop->argop = OP_WRITE; 542 wargs->stable = stable; 543 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 544 mi, OP_WRITE, sid_tp); 545 wargs->mblk = NULL; 546 *wargs_pp = wargs; 547 } 548 549 void 550 
nfs4args_copen_free(OPEN4cargs *open_args) 551 { 552 if (open_args->owner.owner_val) { 553 kmem_free(open_args->owner.owner_val, 554 open_args->owner.owner_len); 555 } 556 if ((open_args->opentype == OPEN4_CREATE) && 557 (open_args->mode != EXCLUSIVE4)) { 558 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 559 } 560 } 561 562 /* 563 * XXX: This is referenced in modstubs.s 564 */ 565 struct vnodeops * 566 nfs4_getvnodeops(void) 567 { 568 return (nfs4_vnodeops); 569 } 570 571 /* 572 * The OPEN operation opens a regular file. 573 * 574 * ARGSUSED 575 */ 576 static int 577 nfs4_open(vnode_t **vpp, int flag, cred_t *cr) 578 { 579 vnode_t *dvp = NULL; 580 rnode4_t *rp, *drp; 581 int error; 582 int just_been_created; 583 char fn[MAXNAMELEN]; 584 585 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 586 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 587 return (EIO); 588 rp = VTOR4(*vpp); 589 590 /* 591 * Check to see if opening something besides a regular file; 592 * if so skip the OTW call 593 */ 594 if ((*vpp)->v_type != VREG) { 595 error = nfs4_open_non_reg_file(vpp, flag, cr); 596 return (error); 597 } 598 599 /* 600 * XXX - would like a check right here to know if the file is 601 * executable or not, so as to skip OTW 602 */ 603 604 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 605 return (error); 606 607 drp = VTOR4(dvp); 608 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 609 return (EINTR); 610 611 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 612 nfs_rw_exit(&drp->r_rwlock); 613 return (error); 614 } 615 616 /* 617 * See if this file has just been CREATEd. 618 * If so, clear the flag and update the dnlc, which was previously 619 * skipped in nfs4_create. 620 * XXX need better serilization on this. 621 * XXX move this into the nf4open_otw call, after we have 622 * XXX acquired the open owner seqid sync. 
623 */ 624 mutex_enter(&rp->r_statev4_lock); 625 if (rp->created_v4) { 626 rp->created_v4 = 0; 627 mutex_exit(&rp->r_statev4_lock); 628 629 dnlc_update(dvp, fn, *vpp); 630 /* This is needed so we don't bump the open ref count */ 631 just_been_created = 1; 632 } else { 633 mutex_exit(&rp->r_statev4_lock); 634 just_been_created = 0; 635 } 636 637 /* 638 * If caller specified O_TRUNC/FTRUNC, then be sure to set 639 * FWRITE (to drive successful setattr(size=0) after open) 640 */ 641 if (flag & FTRUNC) 642 flag |= FWRITE; 643 644 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 645 just_been_created); 646 647 if (!error && !((*vpp)->v_flag & VROOT)) 648 dnlc_update(dvp, fn, *vpp); 649 650 nfs_rw_exit(&drp->r_rwlock); 651 652 /* release the hold from vtodv */ 653 VN_RELE(dvp); 654 655 /* exchange the shadow for the master vnode, if needed */ 656 657 if (error == 0 && IS_SHADOW(*vpp, rp)) 658 sv_exchange(vpp); 659 660 return (error); 661 } 662 663 /* 664 * See if there's a "lost open" request to be saved and recovered. 665 */ 666 static void 667 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 668 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 669 vnode_t *dvp, OPEN4cargs *open_args) 670 { 671 vfs_t *vfsp; 672 char *srccfp; 673 674 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 675 676 if (error != ETIMEDOUT && error != EINTR && 677 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 678 lost_rqstp->lr_op = 0; 679 return; 680 } 681 682 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 683 "nfs4open_save_lost_rqst: error %d", error)); 684 685 lost_rqstp->lr_op = OP_OPEN; 686 /* 687 * The vp (if it is not NULL) and dvp are held and rele'd via 688 * the recovery code. See nfs4_save_lost_rqst. 
689 */ 690 lost_rqstp->lr_vp = vp; 691 lost_rqstp->lr_dvp = dvp; 692 lost_rqstp->lr_oop = oop; 693 lost_rqstp->lr_osp = NULL; 694 lost_rqstp->lr_lop = NULL; 695 lost_rqstp->lr_cr = cr; 696 lost_rqstp->lr_flk = NULL; 697 lost_rqstp->lr_oacc = open_args->share_access; 698 lost_rqstp->lr_odeny = open_args->share_deny; 699 lost_rqstp->lr_oclaim = open_args->claim; 700 if (open_args->claim == CLAIM_DELEGATE_CUR) { 701 lost_rqstp->lr_ostateid = 702 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 703 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 704 } else { 705 srccfp = open_args->open_claim4_u.cfile; 706 } 707 lost_rqstp->lr_ofile.utf8string_len = 0; 708 lost_rqstp->lr_ofile.utf8string_val = NULL; 709 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 710 lost_rqstp->lr_putfirst = FALSE; 711 } 712 713 struct nfs4_excl_time { 714 uint32 seconds; 715 uint32 nseconds; 716 }; 717 718 /* 719 * The OPEN operation creates and/or opens a regular file 720 * 721 * ARGSUSED 722 */ 723 static int 724 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 725 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 726 enum createmode4 createmode, int file_just_been_created) 727 { 728 rnode4_t *rp; 729 rnode4_t *drp = VTOR4(dvp); 730 vnode_t *vp = NULL; 731 vnode_t *vpi = *vpp; 732 bool_t needrecov = FALSE; 733 734 int doqueue = 1; 735 736 COMPOUND4args_clnt args; 737 COMPOUND4res_clnt res; 738 nfs_argop4 *argop; 739 nfs_resop4 *resop; 740 int argoplist_size; 741 int idx_open, idx_fattr; 742 743 GETFH4res *gf_res = NULL; 744 OPEN4res *op_res = NULL; 745 nfs4_ga_res_t *garp; 746 fattr4 *attr = NULL; 747 struct nfs4_excl_time verf; 748 bool_t did_excl_setup = FALSE; 749 int created_osp; 750 751 OPEN4cargs *open_args; 752 nfs4_open_owner_t *oop = NULL; 753 nfs4_open_stream_t *osp = NULL; 754 seqid4 seqid = 0; 755 bool_t retry_open = FALSE; 756 nfs4_recov_state_t recov_state; 757 nfs4_lost_rqst_t lost_rqst; 758 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 759 hrtime_t t; 760 int acc = 0; 761 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 762 cred_t *ncr = NULL; 763 764 nfs4_sharedfh_t *otw_sfh; 765 nfs4_sharedfh_t *orig_sfh; 766 int fh_differs = 0; 767 int numops, setgid_flag; 768 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 769 770 /* 771 * Make sure we properly deal with setting the right gid on 772 * a newly created file to reflect the parent's setgid bit 773 */ 774 setgid_flag = 0; 775 if (create_flag && in_va) { 776 777 /* 778 * If the parent's directory has the setgid bit set 779 * _and_ the client was able to get a valid mapping 780 * for the parent dir's owner_group, we want to 781 * append NVERIFY(owner_group == dva.va_gid) and 782 * SETATTR to the CREATE compound. 783 */ 784 mutex_enter(&drp->r_statelock); 785 if (drp->r_attr.va_mode & VSGID && 786 drp->r_attr.va_gid != GID_NOBODY) { 787 in_va->va_gid = drp->r_attr.va_gid; 788 setgid_flag = 1; 789 } 790 mutex_exit(&drp->r_statelock); 791 } 792 793 /* 794 * Normal/non-create compound: 795 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 796 * 797 * Open(create) compound no setgid: 798 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 799 * RESTOREFH + GETATTR 800 * 801 * Open(create) setgid: 802 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 803 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 804 * NVERIFY(grp) + SETATTR 805 */ 806 if (setgid_flag) { 807 numops = 10; 808 idx_open = 1; 809 idx_fattr = 3; 810 } else if (create_flag) { 811 numops = 7; 812 idx_open = 2; 813 idx_fattr = 4; 814 } else { 815 numops = 4; 816 idx_open = 1; 817 idx_fattr = 3; 818 } 819 820 args.array_len = numops; 821 argoplist_size = numops * sizeof (nfs_argop4); 822 argop = kmem_alloc(argoplist_size, KM_SLEEP); 823 824 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 825 "open %s open flag 0x%x cred %p", file_name, open_flag, 826 (void *)cr)); 827 828 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 829 if 
(create_flag) { 830 /* 831 * We are to create a file. Initialize the passed in vnode 832 * pointer. 833 */ 834 vpi = NULL; 835 } else { 836 /* 837 * Check to see if the client owns a read delegation and is 838 * trying to open for write. If so, then return the delegation 839 * to avoid the server doing a cb_recall and returning DELAY. 840 * NB - we don't use the statev4_lock here because we'd have 841 * to drop the lock anyway and the result would be stale. 842 */ 843 if ((open_flag & FWRITE) && 844 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 845 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 846 847 /* 848 * If the file has a delegation, then do an access check up 849 * front. This avoids having to an access check later after 850 * we've already done start_op, which could deadlock. 851 */ 852 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 853 if (open_flag & FREAD && 854 nfs4_access(vpi, VREAD, 0, cr) == 0) 855 acc |= VREAD; 856 if (open_flag & FWRITE && 857 nfs4_access(vpi, VWRITE, 0, cr) == 0) 858 acc |= VWRITE; 859 } 860 } 861 862 drp = VTOR4(dvp); 863 864 recov_state.rs_flags = 0; 865 recov_state.rs_num_retry_despite_err = 0; 866 cred_otw = cr; 867 868 recov_retry: 869 fh_differs = 0; 870 nfs4_error_zinit(&e); 871 872 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 873 if (e.error) { 874 if (ncr != NULL) 875 crfree(ncr); 876 kmem_free(argop, argoplist_size); 877 return (e.error); 878 } 879 880 args.ctag = TAG_OPEN; 881 args.array_len = numops; 882 args.array = argop; 883 884 /* putfh directory fh */ 885 argop[0].argop = OP_CPUTFH; 886 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 887 888 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 889 argop[idx_open].argop = OP_COPEN; 890 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 891 open_args->claim = CLAIM_NULL; 892 893 /* name of file */ 894 open_args->open_claim4_u.cfile = file_name; 895 open_args->owner.owner_len = 0; 896 open_args->owner.owner_val = NULL; 
897 898 if (create_flag) { 899 /* CREATE a file */ 900 open_args->opentype = OPEN4_CREATE; 901 open_args->mode = createmode; 902 if (createmode == EXCLUSIVE4) { 903 if (did_excl_setup == FALSE) { 904 verf.seconds = nfs_atoi(hw_serial); 905 if (verf.seconds != 0) 906 verf.nseconds = newnum(); 907 else { 908 timestruc_t now; 909 910 gethrestime(&now); 911 verf.seconds = now.tv_sec; 912 verf.nseconds = now.tv_nsec; 913 } 914 /* 915 * Since the server will use this value for the 916 * mtime, make sure that it can't overflow. Zero 917 * out the MSB. The actual value does not matter 918 * here, only its uniqeness. 919 */ 920 verf.seconds &= INT32_MAX; 921 did_excl_setup = TRUE; 922 } 923 924 /* Now copy over verifier to OPEN4args. */ 925 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 926 } else { 927 int v_error; 928 bitmap4 supp_attrs; 929 servinfo4_t *svp; 930 931 attr = &open_args->createhow4_u.createattrs; 932 933 svp = drp->r_server; 934 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 935 supp_attrs = svp->sv_supp_attrs; 936 nfs_rw_exit(&svp->sv_lock); 937 938 /* GUARDED4 or UNCHECKED4 */ 939 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 940 supp_attrs); 941 if (v_error) { 942 bzero(attr, sizeof (*attr)); 943 nfs4args_copen_free(open_args); 944 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 945 &recov_state, FALSE); 946 if (ncr != NULL) 947 crfree(ncr); 948 kmem_free(argop, argoplist_size); 949 return (v_error); 950 } 951 } 952 } else { 953 /* NO CREATE */ 954 open_args->opentype = OPEN4_NOCREATE; 955 } 956 957 if (recov_state.rs_sp != NULL) { 958 mutex_enter(&recov_state.rs_sp->s_lock); 959 open_args->owner.clientid = recov_state.rs_sp->clientid; 960 mutex_exit(&recov_state.rs_sp->s_lock); 961 } else { 962 /* XXX should we just fail here? 
*/ 963 open_args->owner.clientid = 0; 964 } 965 966 /* 967 * This increments oop's ref count or creates a temporary 'just_created' 968 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 969 * completes. 970 */ 971 mutex_enter(&VTOMI4(dvp)->mi_lock); 972 973 /* See if a permanent or just created open owner exists */ 974 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 975 if (!oop) { 976 /* 977 * This open owner does not exist so create a temporary 978 * just created one. 979 */ 980 oop = create_open_owner(cr, VTOMI4(dvp)); 981 ASSERT(oop != NULL); 982 } 983 mutex_exit(&VTOMI4(dvp)->mi_lock); 984 985 /* this length never changes, do alloc before seqid sync */ 986 open_args->owner.owner_len = sizeof (oop->oo_name); 987 open_args->owner.owner_val = 988 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 989 990 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 991 if (e.error == EAGAIN) { 992 open_owner_rele(oop); 993 nfs4args_copen_free(open_args); 994 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 995 if (ncr != NULL) { 996 crfree(ncr); 997 ncr = NULL; 998 } 999 goto recov_retry; 1000 } 1001 1002 /* Check to see if we need to do the OTW call */ 1003 if (!create_flag) { 1004 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1005 file_just_been_created, &e.error, acc, &recov_state)) { 1006 1007 /* 1008 * The OTW open is not necessary. Either 1009 * the open can succeed without it (eg. 1010 * delegation, error == 0) or the open 1011 * must fail due to an access failure 1012 * (error != 0). In either case, tidy 1013 * up and return. 
1014 */ 1015 1016 nfs4_end_open_seqid_sync(oop); 1017 open_owner_rele(oop); 1018 nfs4args_copen_free(open_args); 1019 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1020 if (ncr != NULL) 1021 crfree(ncr); 1022 kmem_free(argop, argoplist_size); 1023 return (e.error); 1024 } 1025 } 1026 1027 bcopy(&oop->oo_name, open_args->owner.owner_val, 1028 open_args->owner.owner_len); 1029 1030 seqid = nfs4_get_open_seqid(oop) + 1; 1031 open_args->seqid = seqid; 1032 open_args->share_access = 0; 1033 if (open_flag & FREAD) 1034 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1035 if (open_flag & FWRITE) 1036 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1037 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1038 1039 1040 1041 /* 1042 * getfh w/sanity check for idx_open/idx_fattr 1043 */ 1044 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1045 argop[idx_open + 1].argop = OP_GETFH; 1046 1047 /* getattr */ 1048 argop[idx_fattr].argop = OP_GETATTR; 1049 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1050 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1051 1052 if (setgid_flag) { 1053 vattr_t _v; 1054 servinfo4_t *svp; 1055 bitmap4 supp_attrs; 1056 1057 svp = drp->r_server; 1058 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1059 supp_attrs = svp->sv_supp_attrs; 1060 nfs_rw_exit(&svp->sv_lock); 1061 1062 /* 1063 * For setgid case, we need to: 1064 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1065 */ 1066 argop[4].argop = OP_SAVEFH; 1067 1068 argop[5].argop = OP_CPUTFH; 1069 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1070 1071 argop[6].argop = OP_GETATTR; 1072 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1073 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1074 1075 argop[7].argop = OP_RESTOREFH; 1076 1077 /* 1078 * nverify 1079 */ 1080 _v.va_mask = AT_GID; 1081 _v.va_gid = in_va->va_gid; 1082 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1083 supp_attrs))) { 1084 1085 /* 
1086 * setattr 1087 * 1088 * We _know_ we're not messing with AT_SIZE or 1089 * AT_XTIME, so no need for stateid or flags. 1090 * Also we specify NULL rp since we're only 1091 * interested in setting owner_group attributes. 1092 */ 1093 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1094 supp_attrs, &e.error, 0); 1095 if (e.error) 1096 nfs4args_verify_free(&argop[8]); 1097 } 1098 1099 if (e.error) { 1100 /* 1101 * XXX - Revisit the last argument to nfs4_end_op() 1102 * once 5020486 is fixed. 1103 */ 1104 nfs4_end_open_seqid_sync(oop); 1105 open_owner_rele(oop); 1106 nfs4args_copen_free(open_args); 1107 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1108 if (ncr != NULL) 1109 crfree(ncr); 1110 kmem_free(argop, argoplist_size); 1111 return (e.error); 1112 } 1113 } else if (create_flag) { 1114 /* 1115 * For setgid case, we need to: 1116 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1117 */ 1118 argop[1].argop = OP_SAVEFH; 1119 1120 argop[5].argop = OP_RESTOREFH; 1121 1122 argop[6].argop = OP_GETATTR; 1123 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1124 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1125 } 1126 1127 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1128 "nfs4open_otw: %s call, nm %s, rp %s", 1129 needrecov ? 
"recov" : "first", file_name, 1130 rnode4info(VTOR4(dvp)))); 1131 1132 t = gethrtime(); 1133 1134 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1135 1136 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1137 nfs4_set_open_seqid(seqid, oop, args.ctag); 1138 1139 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1140 1141 if (e.error || needrecov) { 1142 bool_t abort = FALSE; 1143 1144 if (needrecov) { 1145 nfs4_bseqid_entry_t *bsep = NULL; 1146 1147 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1148 cred_otw, vpi, dvp, open_args); 1149 1150 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1151 bsep = nfs4_create_bseqid_entry(oop, NULL, 1152 vpi, 0, args.ctag, open_args->seqid); 1153 num_bseqid_retry--; 1154 } 1155 1156 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1157 NULL, lost_rqst.lr_op == OP_OPEN ? 1158 &lost_rqst : NULL, OP_OPEN, bsep); 1159 1160 if (bsep) 1161 kmem_free(bsep, sizeof (*bsep)); 1162 /* give up if we keep getting BAD_SEQID */ 1163 if (num_bseqid_retry == 0) 1164 abort = TRUE; 1165 if (abort == TRUE && e.error == 0) 1166 e.error = geterrno4(res.status); 1167 } 1168 nfs4_end_open_seqid_sync(oop); 1169 open_owner_rele(oop); 1170 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1171 nfs4args_copen_free(open_args); 1172 if (setgid_flag) { 1173 nfs4args_verify_free(&argop[8]); 1174 nfs4args_setattr_free(&argop[9]); 1175 } 1176 if (!e.error) 1177 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1178 if (ncr != NULL) { 1179 crfree(ncr); 1180 ncr = NULL; 1181 } 1182 if (!needrecov || abort == TRUE || e.error == EINTR || 1183 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1184 kmem_free(argop, argoplist_size); 1185 return (e.error); 1186 } 1187 goto recov_retry; 1188 } 1189 1190 /* 1191 * Will check and update lease after checking the rflag for 1192 * OPEN_CONFIRM in the successful OPEN call. 
1193 */ 1194 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1195 1196 /* 1197 * XXX what if we're crossing mount points from server1:/drp 1198 * to server2:/drp/rp. 1199 */ 1200 1201 /* Signal our end of use of the open seqid */ 1202 nfs4_end_open_seqid_sync(oop); 1203 1204 /* 1205 * This will destroy the open owner if it was just created, 1206 * and no one else has put a reference on it. 1207 */ 1208 open_owner_rele(oop); 1209 if (create_flag && (createmode != EXCLUSIVE4) && 1210 res.status == NFS4ERR_BADOWNER) 1211 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1212 1213 e.error = geterrno4(res.status); 1214 nfs4args_copen_free(open_args); 1215 if (setgid_flag) { 1216 nfs4args_verify_free(&argop[8]); 1217 nfs4args_setattr_free(&argop[9]); 1218 } 1219 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1220 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1221 /* 1222 * If the reply is NFS4ERR_ACCESS, it may be because 1223 * we are root (no root net access). If the real uid 1224 * is not root, then retry with the real uid instead. 
1225 */ 1226 if (ncr != NULL) { 1227 crfree(ncr); 1228 ncr = NULL; 1229 } 1230 if (res.status == NFS4ERR_ACCESS && 1231 (ncr = crnetadjust(cred_otw)) != NULL) { 1232 cred_otw = ncr; 1233 goto recov_retry; 1234 } 1235 kmem_free(argop, argoplist_size); 1236 return (e.error); 1237 } 1238 1239 resop = &res.array[idx_open]; /* open res */ 1240 op_res = &resop->nfs_resop4_u.opopen; 1241 1242 #ifdef DEBUG 1243 /* 1244 * verify attrset bitmap 1245 */ 1246 if (create_flag && 1247 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1248 /* make sure attrset returned is what we asked for */ 1249 /* XXX Ignore this 'error' for now */ 1250 if (attr->attrmask != op_res->attrset) 1251 /* EMPTY */; 1252 } 1253 #endif 1254 1255 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1256 mutex_enter(&VTOMI4(dvp)->mi_lock); 1257 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1258 mutex_exit(&VTOMI4(dvp)->mi_lock); 1259 } 1260 1261 resop = &res.array[idx_open + 1]; /* getfh res */ 1262 gf_res = &resop->nfs_resop4_u.opgetfh; 1263 1264 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1265 1266 /* 1267 * The open stateid has been updated on the server but not 1268 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1269 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1270 * WRITE call. That, however, will use the old stateid, so go ahead 1271 * and upate the open stateid now, before any call to makenfs4node. 1272 */ 1273 if (vpi) { 1274 nfs4_open_stream_t *tmp_osp; 1275 rnode4_t *tmp_rp = VTOR4(vpi); 1276 1277 tmp_osp = find_open_stream(oop, tmp_rp); 1278 if (tmp_osp) { 1279 tmp_osp->open_stateid = op_res->stateid; 1280 mutex_exit(&tmp_osp->os_sync_lock); 1281 open_stream_rele(tmp_osp, tmp_rp); 1282 } 1283 1284 /* 1285 * We must determine if the file handle given by the otw open 1286 * is the same as the file handle which was passed in with 1287 * *vpp. 
This case can be reached if the file we are trying 1288 * to open has been removed and another file has been created 1289 * having the same file name. The passed in vnode is released 1290 * later. 1291 */ 1292 orig_sfh = VTOR4(vpi)->r_fh; 1293 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1294 } 1295 1296 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1297 1298 if (create_flag || fh_differs) { 1299 int rnode_err = 0; 1300 1301 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1302 dvp, fn_get(VTOSV(dvp)->sv_name, file_name)); 1303 1304 if (e.error) 1305 PURGE_ATTRCACHE4(vp); 1306 /* 1307 * For the newly created vp case, make sure the rnode 1308 * isn't bad before using it. 1309 */ 1310 mutex_enter(&(VTOR4(vp))->r_statelock); 1311 if (VTOR4(vp)->r_flags & R4RECOVERR) 1312 rnode_err = EIO; 1313 mutex_exit(&(VTOR4(vp))->r_statelock); 1314 1315 if (rnode_err) { 1316 nfs4_end_open_seqid_sync(oop); 1317 nfs4args_copen_free(open_args); 1318 if (setgid_flag) { 1319 nfs4args_verify_free(&argop[8]); 1320 nfs4args_setattr_free(&argop[9]); 1321 } 1322 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1323 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1324 needrecov); 1325 open_owner_rele(oop); 1326 VN_RELE(vp); 1327 if (ncr != NULL) 1328 crfree(ncr); 1329 sfh4_rele(&otw_sfh); 1330 kmem_free(argop, argoplist_size); 1331 return (EIO); 1332 } 1333 } else { 1334 vp = vpi; 1335 } 1336 sfh4_rele(&otw_sfh); 1337 1338 /* 1339 * It seems odd to get a full set of attrs and then not update 1340 * the object's attrcache in the non-create case. Create case uses 1341 * the attrs since makenfs4node checks to see if the attrs need to 1342 * be updated (and then updates them). The non-create case should 1343 * update attrs also. 1344 */ 1345 if (! create_flag && ! 
fh_differs && !e.error) { 1346 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1347 } 1348 1349 nfs4_error_zinit(&e); 1350 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1351 /* This does not do recovery for vp explicitly. */ 1352 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1353 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1354 1355 if (e.error || e.stat) { 1356 nfs4_end_open_seqid_sync(oop); 1357 nfs4args_copen_free(open_args); 1358 if (setgid_flag) { 1359 nfs4args_verify_free(&argop[8]); 1360 nfs4args_setattr_free(&argop[9]); 1361 } 1362 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1363 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1364 needrecov); 1365 open_owner_rele(oop); 1366 if (create_flag || fh_differs) { 1367 /* rele the makenfs4node */ 1368 VN_RELE(vp); 1369 } 1370 if (ncr != NULL) { 1371 crfree(ncr); 1372 ncr = NULL; 1373 } 1374 if (retry_open == TRUE) { 1375 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1376 "nfs4open_otw: retry the open since OPEN " 1377 "CONFIRM failed with error %d stat %d", 1378 e.error, e.stat)); 1379 if (create_flag && createmode == GUARDED4) { 1380 NFS4_DEBUG(nfs4_client_recov_debug, 1381 (CE_NOTE, "nfs4open_otw: switch " 1382 "createmode from GUARDED4 to " 1383 "UNCHECKED4")); 1384 createmode = UNCHECKED4; 1385 } 1386 goto recov_retry; 1387 } 1388 if (!e.error) { 1389 if (create_flag && (createmode != EXCLUSIVE4) && 1390 e.stat == NFS4ERR_BADOWNER) 1391 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1392 1393 e.error = geterrno4(e.stat); 1394 } 1395 kmem_free(argop, argoplist_size); 1396 return (e.error); 1397 } 1398 } 1399 1400 rp = VTOR4(vp); 1401 1402 mutex_enter(&rp->r_statev4_lock); 1403 if (create_flag) 1404 rp->created_v4 = 1; 1405 mutex_exit(&rp->r_statev4_lock); 1406 1407 mutex_enter(&oop->oo_lock); 1408 /* Doesn't matter if 'oo_just_created' already was set as this */ 1409 oop->oo_just_created = NFS4_PERM_CREATED; 1410 if (oop->oo_cred_otw) 1411 crfree(oop->oo_cred_otw); 1412 
oop->oo_cred_otw = cred_otw; 1413 crhold(oop->oo_cred_otw); 1414 mutex_exit(&oop->oo_lock); 1415 1416 /* returns with 'os_sync_lock' held */ 1417 osp = find_or_create_open_stream(oop, rp, &created_osp); 1418 if (!osp) { 1419 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1420 "nfs4open_otw: failed to create an open stream")); 1421 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1422 "signal our end of use of the open seqid")); 1423 1424 nfs4_end_open_seqid_sync(oop); 1425 open_owner_rele(oop); 1426 nfs4args_copen_free(open_args); 1427 if (setgid_flag) { 1428 nfs4args_verify_free(&argop[8]); 1429 nfs4args_setattr_free(&argop[9]); 1430 } 1431 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1432 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1433 if (create_flag || fh_differs) 1434 VN_RELE(vp); 1435 if (ncr != NULL) 1436 crfree(ncr); 1437 1438 kmem_free(argop, argoplist_size); 1439 return (EINVAL); 1440 1441 } 1442 1443 osp->open_stateid = op_res->stateid; 1444 1445 if (open_flag & FREAD) 1446 osp->os_share_acc_read++; 1447 if (open_flag & FWRITE) 1448 osp->os_share_acc_write++; 1449 osp->os_share_deny_none++; 1450 1451 /* 1452 * Need to reset this bitfield for the possible case where we were 1453 * going to OTW CLOSE the file, got a non-recoverable error, and before 1454 * we could retry the CLOSE, OPENed the file again. 
1455 */ 1456 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1457 osp->os_final_close = 0; 1458 osp->os_force_close = 0; 1459 #ifdef DEBUG 1460 if (osp->os_failed_reopen) 1461 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1462 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1463 (void *)osp, (void *)cr, rnode4info(rp))); 1464 #endif 1465 osp->os_failed_reopen = 0; 1466 1467 mutex_exit(&osp->os_sync_lock); 1468 1469 nfs4_end_open_seqid_sync(oop); 1470 1471 if (created_osp && recov_state.rs_sp != NULL) { 1472 mutex_enter(&recov_state.rs_sp->s_lock); 1473 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1474 mutex_exit(&recov_state.rs_sp->s_lock); 1475 } 1476 1477 /* get rid of our reference to find oop */ 1478 open_owner_rele(oop); 1479 1480 open_stream_rele(osp, rp); 1481 1482 /* accept delegation, if any */ 1483 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1484 1485 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1486 1487 if (createmode == EXCLUSIVE4 && 1488 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1489 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1490 " EXCLUSIVE4: sending a SETATTR")); 1491 /* 1492 * If doing an exclusive create, then generate 1493 * a SETATTR to set the initial attributes. 1494 * Try to set the mtime and the atime to the 1495 * server's current time. It is somewhat 1496 * expected that these fields will be used to 1497 * store the exclusive create cookie. If not, 1498 * server implementors will need to know that 1499 * a SETATTR will follow an exclusive create 1500 * and the cookie should be destroyed if 1501 * appropriate. 1502 * 1503 * The AT_GID and AT_SIZE bits are turned off 1504 * so that the SETATTR request will not attempt 1505 * to process these. The gid will be set 1506 * separately if appropriate. 
The size is turned 1507 * off because it is assumed that a new file will 1508 * be created empty and if the file wasn't empty, 1509 * then the exclusive create will have failed 1510 * because the file must have existed already. 1511 * Therefore, no truncate operation is needed. 1512 */ 1513 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1514 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1515 1516 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1517 if (e.error) { 1518 /* 1519 * Couldn't correct the attributes of 1520 * the newly created file and the 1521 * attributes are wrong. Remove the 1522 * file and return an error to the 1523 * application. 1524 */ 1525 /* XXX will this take care of client state ? */ 1526 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1527 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1528 " remove file", e.error)); 1529 VN_RELE(vp); 1530 (void) nfs4_remove(dvp, file_name, cr); 1531 /* 1532 * Since we've reled the vnode and removed 1533 * the file we now need to return the error. 1534 * At this point we don't want to update the 1535 * dircaches, call nfs4_waitfor_purge_complete 1536 * or set vpp to vp so we need to skip these 1537 * as well. 1538 */ 1539 goto skip_update_dircaches; 1540 } 1541 } 1542 1543 /* 1544 * If we created or found the correct vnode, due to create_flag or 1545 * fh_differs being set, then update directory cache attribute, readdir 1546 * and dnlc caches. 1547 */ 1548 if (create_flag || fh_differs) { 1549 dirattr_info_t dinfo, *dinfop; 1550 1551 /* 1552 * Make sure getattr succeeded before using results. 1553 * note: op 7 is getattr(dir) for both flavors of 1554 * open(create). 
1555 */ 1556 if (create_flag && res.status == NFS4_OK) { 1557 dinfo.di_time_call = t; 1558 dinfo.di_cred = cr; 1559 dinfo.di_garp = 1560 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1561 dinfop = &dinfo; 1562 } else { 1563 dinfop = NULL; 1564 } 1565 1566 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1567 dinfop); 1568 } 1569 1570 /* 1571 * If the page cache for this file was flushed from actions 1572 * above, it was done asynchronously and if that is true, 1573 * there is a need to wait here for it to complete. This must 1574 * be done outside of start_fop/end_fop. 1575 */ 1576 (void) nfs4_waitfor_purge_complete(vp); 1577 1578 /* 1579 * It is implicit that we are in the open case (create_flag == 0) since 1580 * fh_differs can only be set to a non-zero value in the open case. 1581 */ 1582 if (fh_differs != 0 && vpi != NULL) 1583 VN_RELE(vpi); 1584 1585 /* 1586 * Be sure to set *vpp to the correct value before returning. 1587 */ 1588 *vpp = vp; 1589 1590 skip_update_dircaches: 1591 1592 nfs4args_copen_free(open_args); 1593 if (setgid_flag) { 1594 nfs4args_verify_free(&argop[8]); 1595 nfs4args_setattr_free(&argop[9]); 1596 } 1597 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1598 1599 if (ncr) 1600 crfree(ncr); 1601 kmem_free(argop, argoplist_size); 1602 return (e.error); 1603 } 1604 1605 /* 1606 * Reopen an open instance. cf. nfs4open_otw(). 1607 * 1608 * Errors are returned by the nfs4_error_t parameter. 1609 * - ep->error contains an errno value or zero. 1610 * - if it is zero, ep->stat is set to an NFS status code, if any. 1611 * If the file could not be reopened, but the caller should continue, the 1612 * file is marked dead and no error values are returned. If the caller 1613 * should stop recovering open files and start over, either the ep->error 1614 * value or ep->stat will indicate an error (either something that requires 1615 * recovery or EAGAIN). 
Note that some recovery (e.g., expired volatile 1616 * filehandles) may be handled silently by this routine. 1617 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1618 * will be started, so the caller should not do it. 1619 * 1620 * Gotos: 1621 * - kill_file : reopen failed in such a fashion to constitute marking the 1622 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1623 * is for cases where recovery is not possible. 1624 * - failed_reopen : same as above, except that the file has already been 1625 * marked dead, so no need to do it again. 1626 * - bailout : reopen failed but we are able to recover and retry the reopen - 1627 * either within this function immediately or via the calling function. 1628 */ 1629 1630 void 1631 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1632 open_claim_type4 claim, bool_t frc_use_claim_previous, 1633 bool_t is_recov) 1634 { 1635 COMPOUND4args_clnt args; 1636 COMPOUND4res_clnt res; 1637 nfs_argop4 argop[4]; 1638 nfs_resop4 *resop; 1639 OPEN4res *op_res = NULL; 1640 OPEN4cargs *open_args; 1641 GETFH4res *gf_res; 1642 rnode4_t *rp = VTOR4(vp); 1643 int doqueue = 1; 1644 cred_t *cr = NULL, *cred_otw = NULL; 1645 nfs4_open_owner_t *oop = NULL; 1646 seqid4 seqid; 1647 nfs4_ga_res_t *garp; 1648 char fn[MAXNAMELEN]; 1649 nfs4_recov_state_t recov = {NULL, 0}; 1650 nfs4_lost_rqst_t lost_rqst; 1651 mntinfo4_t *mi = VTOMI4(vp); 1652 bool_t abort; 1653 char *failed_msg = ""; 1654 int fh_different; 1655 hrtime_t t; 1656 nfs4_bseqid_entry_t *bsep = NULL; 1657 1658 ASSERT(nfs4_consistent_type(vp)); 1659 ASSERT(nfs_zone() == mi->mi_zone); 1660 1661 nfs4_error_zinit(ep); 1662 1663 /* this is the cred used to find the open owner */ 1664 cr = state_to_cred(osp); 1665 if (cr == NULL) { 1666 failed_msg = "Couldn't reopen: no cred"; 1667 goto kill_file; 1668 } 1669 /* use this cred for OTW operations */ 1670 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1671 1672 top: 
1673 nfs4_error_zinit(ep); 1674 1675 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1676 /* File system has been unmounted, quit */ 1677 ep->error = EIO; 1678 failed_msg = "Couldn't reopen: file system has been unmounted"; 1679 goto kill_file; 1680 } 1681 1682 oop = osp->os_open_owner; 1683 1684 ASSERT(oop != NULL); 1685 if (oop == NULL) { /* be defensive in non-DEBUG */ 1686 failed_msg = "can't reopen: no open owner"; 1687 goto kill_file; 1688 } 1689 open_owner_hold(oop); 1690 1691 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1692 if (ep->error) { 1693 open_owner_rele(oop); 1694 oop = NULL; 1695 goto bailout; 1696 } 1697 1698 /* 1699 * If the rnode has a delegation and the delegation has been 1700 * recovered and the server didn't request a recall and the caller 1701 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1702 * recovery) and the rnode hasn't been marked dead, then install 1703 * the delegation stateid in the open stream. Otherwise, proceed 1704 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1705 */ 1706 mutex_enter(&rp->r_statev4_lock); 1707 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1708 !rp->r_deleg_return_pending && 1709 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1710 !rp->r_deleg_needs_recall && 1711 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1712 !(rp->r_flags & R4RECOVERR)) { 1713 mutex_enter(&osp->os_sync_lock); 1714 osp->os_delegation = 1; 1715 osp->open_stateid = rp->r_deleg_stateid; 1716 mutex_exit(&osp->os_sync_lock); 1717 mutex_exit(&rp->r_statev4_lock); 1718 goto bailout; 1719 } 1720 mutex_exit(&rp->r_statev4_lock); 1721 1722 /* 1723 * If the file failed recovery, just quit. This failure need not 1724 * affect other reopens, so don't return an error. 
1725 */ 1726 mutex_enter(&rp->r_statelock); 1727 if (rp->r_flags & R4RECOVERR) { 1728 mutex_exit(&rp->r_statelock); 1729 ep->error = 0; 1730 goto failed_reopen; 1731 } 1732 mutex_exit(&rp->r_statelock); 1733 1734 /* 1735 * argop is empty here 1736 * 1737 * PUTFH, OPEN, GETATTR 1738 */ 1739 args.ctag = TAG_REOPEN; 1740 args.array_len = 4; 1741 args.array = argop; 1742 1743 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1744 "nfs4_reopen: file is type %d, id %s", 1745 vp->v_type, rnode4info(VTOR4(vp)))); 1746 1747 argop[0].argop = OP_CPUTFH; 1748 1749 if (claim != CLAIM_PREVIOUS) { 1750 /* 1751 * if this is a file mount then 1752 * use the mntinfo parentfh 1753 */ 1754 argop[0].nfs_argop4_u.opcputfh.sfh = 1755 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1756 VTOSV(vp)->sv_dfh; 1757 } else { 1758 /* putfh fh to reopen */ 1759 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1760 } 1761 1762 argop[1].argop = OP_COPEN; 1763 open_args = &argop[1].nfs_argop4_u.opcopen; 1764 open_args->claim = claim; 1765 1766 if (claim == CLAIM_NULL) { 1767 1768 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1769 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1770 "failed for vp 0x%p for CLAIM_NULL with %m", 1771 (void *)vp); 1772 failed_msg = "Couldn't reopen: vtoname failed for " 1773 "CLAIM_NULL"; 1774 /* nothing allocated yet */ 1775 goto kill_file; 1776 } 1777 1778 open_args->open_claim4_u.cfile = fn; 1779 } else if (claim == CLAIM_PREVIOUS) { 1780 1781 /* 1782 * We have two cases to deal with here: 1783 * 1) We're being called to reopen files in order to satisfy 1784 * a lock operation request which requires us to explicitly 1785 * reopen files which were opened under a delegation. If 1786 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1787 * that case, frc_use_claim_previous is TRUE and we must 1788 * use the rnode's current delegation type (r_deleg_type). 1789 * 2) We're reopening files during some form of recovery. 
1790 * In this case, frc_use_claim_previous is FALSE and we 1791 * use the delegation type appropriate for recovery 1792 * (r_deleg_needs_recovery). 1793 */ 1794 mutex_enter(&rp->r_statev4_lock); 1795 open_args->open_claim4_u.delegate_type = 1796 frc_use_claim_previous ? 1797 rp->r_deleg_type : 1798 rp->r_deleg_needs_recovery; 1799 mutex_exit(&rp->r_statev4_lock); 1800 1801 } else if (claim == CLAIM_DELEGATE_CUR) { 1802 1803 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1804 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1805 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1806 "with %m", (void *)vp); 1807 failed_msg = "Couldn't reopen: vtoname failed for " 1808 "CLAIM_DELEGATE_CUR"; 1809 /* nothing allocated yet */ 1810 goto kill_file; 1811 } 1812 1813 mutex_enter(&rp->r_statev4_lock); 1814 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1815 rp->r_deleg_stateid; 1816 mutex_exit(&rp->r_statev4_lock); 1817 1818 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1819 } 1820 open_args->opentype = OPEN4_NOCREATE; 1821 open_args->owner.clientid = mi2clientid(mi); 1822 open_args->owner.owner_len = sizeof (oop->oo_name); 1823 open_args->owner.owner_val = 1824 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1825 bcopy(&oop->oo_name, open_args->owner.owner_val, 1826 open_args->owner.owner_len); 1827 open_args->share_access = 0; 1828 open_args->share_deny = 0; 1829 1830 mutex_enter(&osp->os_sync_lock); 1831 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1832 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1833 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1834 (void *)osp, (void *)rp, osp->os_share_acc_read, 1835 osp->os_share_acc_write, osp->os_open_ref_count, 1836 osp->os_mmap_read, osp->os_mmap_write, claim)); 1837 1838 if (osp->os_share_acc_read || osp->os_mmap_read) 1839 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1840 if (osp->os_share_acc_write || osp->os_mmap_write) 1841 
open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1842 if (osp->os_share_deny_read) 1843 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1844 if (osp->os_share_deny_write) 1845 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1846 mutex_exit(&osp->os_sync_lock); 1847 1848 seqid = nfs4_get_open_seqid(oop) + 1; 1849 open_args->seqid = seqid; 1850 1851 /* Construct the getfh part of the compound */ 1852 argop[2].argop = OP_GETFH; 1853 1854 /* Construct the getattr part of the compound */ 1855 argop[3].argop = OP_GETATTR; 1856 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1857 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1858 1859 t = gethrtime(); 1860 1861 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1862 1863 if (ep->error) { 1864 if (!is_recov && !frc_use_claim_previous && 1865 (ep->error == EINTR || ep->error == ETIMEDOUT || 1866 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1867 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1868 cred_otw, vp, NULL, open_args); 1869 abort = nfs4_start_recovery(ep, 1870 VTOMI4(vp), vp, NULL, NULL, 1871 lost_rqst.lr_op == OP_OPEN ? 
1872 &lost_rqst : NULL, OP_OPEN, NULL); 1873 nfs4args_copen_free(open_args); 1874 goto bailout; 1875 } 1876 1877 nfs4args_copen_free(open_args); 1878 1879 if (ep->error == EACCES && cred_otw != cr) { 1880 crfree(cred_otw); 1881 cred_otw = cr; 1882 crhold(cred_otw); 1883 nfs4_end_open_seqid_sync(oop); 1884 open_owner_rele(oop); 1885 oop = NULL; 1886 goto top; 1887 } 1888 if (ep->error == ETIMEDOUT) 1889 goto bailout; 1890 failed_msg = "Couldn't reopen: rpc error"; 1891 goto kill_file; 1892 } 1893 1894 if (nfs4_need_to_bump_seqid(&res)) 1895 nfs4_set_open_seqid(seqid, oop, args.ctag); 1896 1897 switch (res.status) { 1898 case NFS4_OK: 1899 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1900 mutex_enter(&rp->r_statelock); 1901 rp->r_delay_interval = 0; 1902 mutex_exit(&rp->r_statelock); 1903 } 1904 break; 1905 case NFS4ERR_BAD_SEQID: 1906 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1907 args.ctag, open_args->seqid); 1908 1909 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1910 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1911 NULL, OP_OPEN, bsep); 1912 1913 nfs4args_copen_free(open_args); 1914 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1915 nfs4_end_open_seqid_sync(oop); 1916 open_owner_rele(oop); 1917 oop = NULL; 1918 kmem_free(bsep, sizeof (*bsep)); 1919 1920 goto kill_file; 1921 case NFS4ERR_NO_GRACE: 1922 nfs4args_copen_free(open_args); 1923 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1924 nfs4_end_open_seqid_sync(oop); 1925 open_owner_rele(oop); 1926 oop = NULL; 1927 if (claim == CLAIM_PREVIOUS) { 1928 /* 1929 * Retry as a plain open. We don't need to worry about 1930 * checking the changeinfo: it is acceptable for a 1931 * client to re-open a file and continue processing 1932 * (in the absence of locks). 
1933 */ 1934 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1935 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1936 "will retry as CLAIM_NULL")); 1937 claim = CLAIM_NULL; 1938 nfs4_mi_kstat_inc_no_grace(mi); 1939 goto top; 1940 } 1941 failed_msg = 1942 "Couldn't reopen: tried reclaim outside grace period. "; 1943 goto kill_file; 1944 case NFS4ERR_GRACE: 1945 nfs4_set_grace_wait(mi); 1946 nfs4args_copen_free(open_args); 1947 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1948 nfs4_end_open_seqid_sync(oop); 1949 open_owner_rele(oop); 1950 oop = NULL; 1951 ep->error = nfs4_wait_for_grace(mi, &recov); 1952 if (ep->error != 0) 1953 goto bailout; 1954 goto top; 1955 case NFS4ERR_DELAY: 1956 nfs4_set_delay_wait(vp); 1957 nfs4args_copen_free(open_args); 1958 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1959 nfs4_end_open_seqid_sync(oop); 1960 open_owner_rele(oop); 1961 oop = NULL; 1962 ep->error = nfs4_wait_for_delay(vp, &recov); 1963 nfs4_mi_kstat_inc_delay(mi); 1964 if (ep->error != 0) 1965 goto bailout; 1966 goto top; 1967 case NFS4ERR_FHEXPIRED: 1968 /* recover filehandle and retry */ 1969 abort = nfs4_start_recovery(ep, 1970 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 1971 nfs4args_copen_free(open_args); 1972 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1973 nfs4_end_open_seqid_sync(oop); 1974 open_owner_rele(oop); 1975 oop = NULL; 1976 if (abort == FALSE) 1977 goto top; 1978 failed_msg = "Couldn't reopen: recovery aborted"; 1979 goto kill_file; 1980 case NFS4ERR_RESOURCE: 1981 case NFS4ERR_STALE_CLIENTID: 1982 case NFS4ERR_WRONGSEC: 1983 case NFS4ERR_EXPIRED: 1984 /* 1985 * Do not mark the file dead and let the calling 1986 * function initiate recovery. 
1987 */ 1988 nfs4args_copen_free(open_args); 1989 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1990 nfs4_end_open_seqid_sync(oop); 1991 open_owner_rele(oop); 1992 oop = NULL; 1993 goto bailout; 1994 case NFS4ERR_ACCESS: 1995 if (cred_otw != cr) { 1996 crfree(cred_otw); 1997 cred_otw = cr; 1998 crhold(cred_otw); 1999 nfs4args_copen_free(open_args); 2000 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2001 nfs4_end_open_seqid_sync(oop); 2002 open_owner_rele(oop); 2003 oop = NULL; 2004 goto top; 2005 } 2006 /* fall through */ 2007 default: 2008 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2009 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2010 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2011 rnode4info(VTOR4(vp)))); 2012 failed_msg = "Couldn't reopen: NFSv4 error"; 2013 nfs4args_copen_free(open_args); 2014 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2015 goto kill_file; 2016 } 2017 2018 resop = &res.array[1]; /* open res */ 2019 op_res = &resop->nfs_resop4_u.opopen; 2020 2021 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2022 2023 /* 2024 * Check if the path we reopened really is the same 2025 * file. We could end up in a situation where the file 2026 * was removed and a new file created with the same name. 
2027 */ 2028 resop = &res.array[2]; 2029 gf_res = &resop->nfs_resop4_u.opgetfh; 2030 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2031 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2032 if (fh_different) { 2033 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2034 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2035 /* Oops, we don't have the same file */ 2036 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2037 failed_msg = "Couldn't reopen: Persistent " 2038 "file handle changed"; 2039 else 2040 failed_msg = "Couldn't reopen: Volatile " 2041 "(no expire on open) file handle changed"; 2042 2043 nfs4args_copen_free(open_args); 2044 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2045 nfs_rw_exit(&mi->mi_fh_lock); 2046 goto kill_file; 2047 2048 } else { 2049 /* 2050 * We have volatile file handles that don't compare. 2051 * If the fids are the same then we assume that the 2052 * file handle expired but the rnode still refers to 2053 * the same file object. 2054 * 2055 * First check that we have fids or not. 2056 * If we don't we have a dumb server so we will 2057 * just assume every thing is ok for now. 2058 */ 2059 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2060 rp->r_attr.va_mask & AT_NODEID && 2061 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2062 /* 2063 * We have fids, but they don't 2064 * compare. So kill the file. 2065 */ 2066 failed_msg = 2067 "Couldn't reopen: file handle changed" 2068 " due to mismatched fids"; 2069 nfs4args_copen_free(open_args); 2070 (void) xdr_free(xdr_COMPOUND4res_clnt, 2071 (caddr_t)&res); 2072 nfs_rw_exit(&mi->mi_fh_lock); 2073 goto kill_file; 2074 } else { 2075 /* 2076 * We have volatile file handles that refers 2077 * to the same file (at least they have the 2078 * same fid) or we don't have fids so we 2079 * can't tell. :(. We'll be a kind and accepting 2080 * client so we'll update the rnode's file 2081 * handle with the otw handle. 
2082 * 2083 * We need to drop mi->mi_fh_lock since 2084 * sh4_update acquires it. Since there is 2085 * only one recovery thread there is no 2086 * race. 2087 */ 2088 nfs_rw_exit(&mi->mi_fh_lock); 2089 sfh4_update(rp->r_fh, &gf_res->object); 2090 } 2091 } 2092 } else { 2093 nfs_rw_exit(&mi->mi_fh_lock); 2094 } 2095 2096 ASSERT(nfs4_consistent_type(vp)); 2097 2098 /* 2099 * If the server wanted an OPEN_CONFIRM but that fails, just start 2100 * over. Presumably if there is a persistent error it will show up 2101 * when we resend the OPEN. 2102 */ 2103 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2104 bool_t retry_open = FALSE; 2105 2106 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2107 cred_otw, is_recov, &retry_open, 2108 oop, FALSE, ep, NULL); 2109 if (ep->error || ep->stat) { 2110 nfs4args_copen_free(open_args); 2111 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2112 nfs4_end_open_seqid_sync(oop); 2113 open_owner_rele(oop); 2114 oop = NULL; 2115 goto top; 2116 } 2117 } 2118 2119 mutex_enter(&osp->os_sync_lock); 2120 osp->open_stateid = op_res->stateid; 2121 osp->os_delegation = 0; 2122 /* 2123 * Need to reset this bitfield for the possible case where we were 2124 * going to OTW CLOSE the file, got a non-recoverable error, and before 2125 * we could retry the CLOSE, OPENed the file again. 
2126 */ 2127 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2128 osp->os_final_close = 0; 2129 osp->os_force_close = 0; 2130 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2131 osp->os_dc_openacc = open_args->share_access; 2132 mutex_exit(&osp->os_sync_lock); 2133 2134 nfs4_end_open_seqid_sync(oop); 2135 2136 /* accept delegation, if any */ 2137 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2138 2139 nfs4args_copen_free(open_args); 2140 2141 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2142 2143 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2144 2145 ASSERT(nfs4_consistent_type(vp)); 2146 2147 open_owner_rele(oop); 2148 crfree(cr); 2149 crfree(cred_otw); 2150 return; 2151 2152 kill_file: 2153 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2154 failed_reopen: 2155 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2156 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2157 (void *)osp, (void *)cr, rnode4info(rp))); 2158 mutex_enter(&osp->os_sync_lock); 2159 osp->os_failed_reopen = 1; 2160 mutex_exit(&osp->os_sync_lock); 2161 bailout: 2162 if (oop != NULL) { 2163 nfs4_end_open_seqid_sync(oop); 2164 open_owner_rele(oop); 2165 } 2166 if (cr != NULL) 2167 crfree(cr); 2168 if (cred_otw != NULL) 2169 crfree(cred_otw); 2170 } 2171 2172 /* for . and .. OPENs */ 2173 /* ARGSUSED */ 2174 static int 2175 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2176 { 2177 rnode4_t *rp; 2178 nfs4_ga_res_t gar; 2179 2180 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2181 2182 /* 2183 * If close-to-open consistency checking is turned off or 2184 * if there is no cached data, we can avoid 2185 * the over the wire getattr. Otherwise, force a 2186 * call to the server to get fresh attributes and to 2187 * check caches. This is required for close-to-open 2188 * consistency. 
2189 */ 2190 rp = VTOR4(*vpp); 2191 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2192 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2193 return (0); 2194 2195 gar.n4g_va.va_mask = AT_ALL; 2196 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2197 } 2198 2199 /* 2200 * CLOSE a file 2201 */ 2202 static int 2203 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) 2204 { 2205 rnode4_t *rp; 2206 int error = 0; 2207 int r_error = 0; 2208 int n4error = 0; 2209 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2210 2211 /* 2212 * Remove client state for this (lockowner, file) pair. 2213 * Issue otw v4 call to have the server do the same. 2214 */ 2215 2216 rp = VTOR4(vp); 2217 2218 /* 2219 * zone_enter(2) prevents processes from changing zones with NFS files 2220 * open; if we happen to get here from the wrong zone we can't do 2221 * anything over the wire. 2222 */ 2223 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2224 /* 2225 * We could attempt to clean up locks, except we're sure 2226 * that the current process didn't acquire any locks on 2227 * the file: any attempt to lock a file belong to another zone 2228 * will fail, and one can't lock an NFS file and then change 2229 * zones, as that fails too. 2230 * 2231 * Returning an error here is the sane thing to do. A 2232 * subsequent call to VN_RELE() which translates to a 2233 * nfs4_inactive() will clean up state: if the zone of the 2234 * vnode's origin is still alive and kicking, the inactive 2235 * thread will handle the request (from the correct zone), and 2236 * everything (minus the OTW close call) should be OK. If the 2237 * zone is going away nfs4_async_inactive() will throw away 2238 * delegations, open streams and cached pages inline. 2239 */ 2240 return (EIO); 2241 } 2242 2243 /* 2244 * If we are using local locking for this filesystem, then 2245 * release all of the SYSV style record locks. Otherwise, 2246 * we are doing network locking and we need to release all 2247 * of the network locks. 
All of the locks held by this 2248 * process on this file are released no matter what the 2249 * incoming reference count is. 2250 */ 2251 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2252 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2253 cleanshares(vp, ttoproc(curthread)->p_pid); 2254 } else 2255 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2256 2257 if (e.error) 2258 return (e.error); 2259 2260 if (count > 1) 2261 return (0); 2262 2263 /* 2264 * If the file has been `unlinked', then purge the 2265 * DNLC so that this vnode will get reycled quicker 2266 * and the .nfs* file on the server will get removed. 2267 */ 2268 if (rp->r_unldvp != NULL) 2269 dnlc_purge_vp(vp); 2270 2271 /* 2272 * If the file was open for write and there are pages, 2273 * do a synchronous flush and commit of all of the 2274 * dirty and uncommitted pages. 2275 */ 2276 ASSERT(!e.error); 2277 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2278 error = nfs4_putpage_commit(vp, 0, 0, cr); 2279 2280 mutex_enter(&rp->r_statelock); 2281 r_error = rp->r_error; 2282 rp->r_error = 0; 2283 mutex_exit(&rp->r_statelock); 2284 2285 /* 2286 * If this file type is one for which no explicit 'open' was 2287 * done, then bail now (ie. no need for protocol 'close'). If 2288 * there was an error w/the vm subsystem, return _that_ error, 2289 * otherwise, return any errors that may've been reported via 2290 * the rnode. 2291 */ 2292 if (vp->v_type != VREG) 2293 return (error ? error : r_error); 2294 2295 /* 2296 * The sync putpage commit may have failed above, but since 2297 * we're working w/a regular file, we need to do the protocol 2298 * 'close' (nfs4close_one will figure out if an otw close is 2299 * needed or not). Report any errors _after_ doing the protocol 2300 * 'close'. 2301 */ 2302 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2303 n4error = e.error ? 
e.error : geterrno4(e.stat); 2304 2305 /* 2306 * Error reporting prio (Hi -> Lo) 2307 * 2308 * i) nfs4_putpage_commit (error) 2309 * ii) rnode's (r_error) 2310 * iii) nfs4close_one (n4error) 2311 */ 2312 return (error ? error : (r_error ? r_error : n4error)); 2313 } 2314 2315 /* 2316 * Initialize *lost_rqstp. 2317 */ 2318 2319 static void 2320 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2321 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2322 vnode_t *vp) 2323 { 2324 if (error != ETIMEDOUT && error != EINTR && 2325 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2326 lost_rqstp->lr_op = 0; 2327 return; 2328 } 2329 2330 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2331 "nfs4close_save_lost_rqst: error %d", error)); 2332 2333 lost_rqstp->lr_op = OP_CLOSE; 2334 /* 2335 * The vp is held and rele'd via the recovery code. 2336 * See nfs4_save_lost_rqst. 2337 */ 2338 lost_rqstp->lr_vp = vp; 2339 lost_rqstp->lr_dvp = NULL; 2340 lost_rqstp->lr_oop = oop; 2341 lost_rqstp->lr_osp = osp; 2342 ASSERT(osp != NULL); 2343 ASSERT(mutex_owned(&osp->os_sync_lock)); 2344 osp->os_pending_close = 1; 2345 lost_rqstp->lr_lop = NULL; 2346 lost_rqstp->lr_cr = cr; 2347 lost_rqstp->lr_flk = NULL; 2348 lost_rqstp->lr_putfirst = FALSE; 2349 } 2350 2351 /* 2352 * Assumes you already have the open seqid sync grabbed as well as the 2353 * 'os_sync_lock'. Note: this will release the open seqid sync and 2354 * 'os_sync_lock' if client recovery starts. Calling functions have to 2355 * be prepared to handle this. 2356 * 2357 * 'recov' is returned as 1 if the CLOSE operation detected client recovery 2358 * was needed and was started, and that the calling function should retry 2359 * this function; otherwise it is returned as 0. 2360 * 2361 * Errors are returned via the nfs4_error_t parameter. 
2362 */ 2363 static void 2364 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2365 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2366 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2367 { 2368 COMPOUND4args_clnt args; 2369 COMPOUND4res_clnt res; 2370 CLOSE4args *close_args; 2371 nfs_resop4 *resop; 2372 nfs_argop4 argop[3]; 2373 int doqueue = 1; 2374 mntinfo4_t *mi; 2375 seqid4 seqid; 2376 vnode_t *vp; 2377 bool_t needrecov = FALSE; 2378 nfs4_lost_rqst_t lost_rqst; 2379 hrtime_t t; 2380 2381 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2382 2383 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2384 2385 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2386 2387 /* Only set this to 1 if recovery is started */ 2388 *recov = 0; 2389 2390 /* do the OTW call to close the file */ 2391 2392 if (close_type == CLOSE_RESEND) 2393 args.ctag = TAG_CLOSE_LOST; 2394 else if (close_type == CLOSE_AFTER_RESEND) 2395 args.ctag = TAG_CLOSE_UNDO; 2396 else 2397 args.ctag = TAG_CLOSE; 2398 2399 args.array_len = 3; 2400 args.array = argop; 2401 2402 vp = RTOV4(rp); 2403 2404 mi = VTOMI4(vp); 2405 2406 /* putfh target fh */ 2407 argop[0].argop = OP_CPUTFH; 2408 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2409 2410 argop[1].argop = OP_GETATTR; 2411 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2412 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2413 2414 argop[2].argop = OP_CLOSE; 2415 close_args = &argop[2].nfs_argop4_u.opclose; 2416 2417 seqid = nfs4_get_open_seqid(oop) + 1; 2418 2419 close_args->seqid = seqid; 2420 close_args->open_stateid = osp->open_stateid; 2421 2422 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2423 "nfs4close_otw: %s call, rp %s", needrecov ? 
"recov" : "first", 2424 rnode4info(rp))); 2425 2426 t = gethrtime(); 2427 2428 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2429 2430 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2431 nfs4_set_open_seqid(seqid, oop, args.ctag); 2432 } 2433 2434 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2435 if (ep->error && !needrecov) { 2436 /* 2437 * if there was an error and no recovery is to be done 2438 * then then set up the file to flush its cache if 2439 * needed for the next caller. 2440 */ 2441 mutex_enter(&rp->r_statelock); 2442 PURGE_ATTRCACHE4_LOCKED(rp); 2443 rp->r_flags &= ~R4WRITEMODIFIED; 2444 mutex_exit(&rp->r_statelock); 2445 return; 2446 } 2447 2448 if (needrecov) { 2449 bool_t abort; 2450 nfs4_bseqid_entry_t *bsep = NULL; 2451 2452 if (close_type != CLOSE_RESEND) 2453 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2454 osp, cred_otw, vp); 2455 2456 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2457 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2458 0, args.ctag, close_args->seqid); 2459 2460 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2461 "nfs4close_otw: initiating recovery. error %d " 2462 "res.status %d", ep->error, res.status)); 2463 2464 /* 2465 * Drop the 'os_sync_lock' here so we don't hit 2466 * a potential recursive mutex_enter via an 2467 * 'open_stream_hold()'. 2468 */ 2469 mutex_exit(&osp->os_sync_lock); 2470 *have_sync_lockp = 0; 2471 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2472 (close_type != CLOSE_RESEND && 2473 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL, 2474 OP_CLOSE, bsep); 2475 2476 /* drop open seq sync, and let the calling function regrab it */ 2477 nfs4_end_open_seqid_sync(oop); 2478 *did_start_seqid_syncp = 0; 2479 2480 if (bsep) 2481 kmem_free(bsep, sizeof (*bsep)); 2482 /* 2483 * For signals, the caller wants to quit, so don't say to 2484 * retry. For forced unmount, if it's a user thread, it 2485 * wants to quit. 
If it's a recovery thread, the retry 2486 * will happen higher-up on the call stack. Either way, 2487 * don't say to retry. 2488 */ 2489 if (abort == FALSE && ep->error != EINTR && 2490 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2491 close_type != CLOSE_RESEND && 2492 close_type != CLOSE_AFTER_RESEND) 2493 *recov = 1; 2494 else 2495 *recov = 0; 2496 2497 if (!ep->error) 2498 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2499 return; 2500 } 2501 2502 if (res.status) { 2503 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2504 return; 2505 } 2506 2507 mutex_enter(&rp->r_statev4_lock); 2508 rp->created_v4 = 0; 2509 mutex_exit(&rp->r_statev4_lock); 2510 2511 resop = &res.array[2]; 2512 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2513 osp->os_valid = 0; 2514 2515 /* 2516 * This removes the reference obtained at OPEN; ie, when the 2517 * open stream structure was created. 2518 * 2519 * We don't have to worry about calling 'open_stream_rele' 2520 * since we our currently holding a reference to the open 2521 * stream which means the count cannot go to 0 with this 2522 * decrement. 
2523 */ 2524 ASSERT(osp->os_ref_count >= 2); 2525 osp->os_ref_count--; 2526 2527 if (!ep->error) 2528 nfs4_attr_cache(vp, 2529 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2530 t, cred_otw, TRUE, NULL); 2531 2532 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2533 " returning %d", ep->error)); 2534 2535 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2536 } 2537 2538 /* ARGSUSED */ 2539 static int 2540 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2541 caller_context_t *ct) 2542 { 2543 rnode4_t *rp; 2544 u_offset_t off; 2545 offset_t diff; 2546 uint_t on; 2547 uint_t n; 2548 caddr_t base; 2549 uint_t flags; 2550 int error; 2551 mntinfo4_t *mi; 2552 2553 rp = VTOR4(vp); 2554 2555 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2556 2557 if (IS_SHADOW(vp, rp)) 2558 vp = RTOV4(rp); 2559 2560 if (vp->v_type != VREG) 2561 return (EISDIR); 2562 2563 mi = VTOMI4(vp); 2564 2565 if (nfs_zone() != mi->mi_zone) 2566 return (EIO); 2567 2568 if (uiop->uio_resid == 0) 2569 return (0); 2570 2571 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2572 return (EINVAL); 2573 2574 mutex_enter(&rp->r_statelock); 2575 if (rp->r_flags & R4RECOVERRP) 2576 error = (rp->r_error ? rp->r_error : EIO); 2577 else 2578 error = 0; 2579 mutex_exit(&rp->r_statelock); 2580 if (error) 2581 return (error); 2582 2583 /* 2584 * Bypass VM if caching has been disabled (e.g., locking) or if 2585 * using client-side direct I/O and the file is not mmap'd and 2586 * there are no cached pages. 
2587 */ 2588 if ((vp->v_flag & VNOCACHE) || 2589 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2590 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2591 size_t resid = 0; 2592 2593 return (nfs4read(vp, NULL, uiop->uio_loffset, 2594 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2595 } 2596 2597 error = 0; 2598 2599 do { 2600 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2601 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2602 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2603 2604 if (error = nfs4_validate_caches(vp, cr)) 2605 break; 2606 2607 mutex_enter(&rp->r_statelock); 2608 diff = rp->r_size - uiop->uio_loffset; 2609 mutex_exit(&rp->r_statelock); 2610 if (diff <= 0) 2611 break; 2612 if (diff < n) 2613 n = (uint_t)diff; 2614 2615 if (vpm_enable) { 2616 /* 2617 * Copy data. 2618 */ 2619 error = vpm_data_copy(vp, off + on, n, uiop, 2620 1, NULL, 0, S_READ); 2621 2622 } else { 2623 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2624 S_READ); 2625 2626 error = uiomove(base + on, n, UIO_READ, uiop); 2627 } 2628 2629 if (!error) { 2630 /* 2631 * If read a whole block or read to eof, 2632 * won't need this buffer again soon. 
2633 */ 2634 mutex_enter(&rp->r_statelock); 2635 if (n + on == MAXBSIZE || 2636 uiop->uio_loffset == rp->r_size) 2637 flags = SM_DONTNEED; 2638 else 2639 flags = 0; 2640 mutex_exit(&rp->r_statelock); 2641 if (vpm_enable) { 2642 error = vpm_sync_pages(vp, off, n, flags); 2643 } else { 2644 error = segmap_release(segkmap, base, flags); 2645 } 2646 } else { 2647 if (vpm_enable) { 2648 (void) vpm_sync_pages(vp, off, n, 0); 2649 } else { 2650 (void) segmap_release(segkmap, base, 0); 2651 } 2652 } 2653 } while (!error && uiop->uio_resid > 0); 2654 2655 return (error); 2656 } 2657 2658 /* ARGSUSED */ 2659 static int 2660 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2661 caller_context_t *ct) 2662 { 2663 rlim64_t limit = uiop->uio_llimit; 2664 rnode4_t *rp; 2665 u_offset_t off; 2666 caddr_t base; 2667 uint_t flags; 2668 int remainder; 2669 size_t n; 2670 int on; 2671 int error; 2672 int resid; 2673 u_offset_t offset; 2674 mntinfo4_t *mi; 2675 uint_t bsize; 2676 2677 rp = VTOR4(vp); 2678 2679 if (IS_SHADOW(vp, rp)) 2680 vp = RTOV4(rp); 2681 2682 if (vp->v_type != VREG) 2683 return (EISDIR); 2684 2685 mi = VTOMI4(vp); 2686 2687 if (nfs_zone() != mi->mi_zone) 2688 return (EIO); 2689 2690 if (uiop->uio_resid == 0) 2691 return (0); 2692 2693 mutex_enter(&rp->r_statelock); 2694 if (rp->r_flags & R4RECOVERRP) 2695 error = (rp->r_error ? rp->r_error : EIO); 2696 else 2697 error = 0; 2698 mutex_exit(&rp->r_statelock); 2699 if (error) 2700 return (error); 2701 2702 if (ioflag & FAPPEND) { 2703 struct vattr va; 2704 2705 /* 2706 * Must serialize if appending. 
2707 */ 2708 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2709 nfs_rw_exit(&rp->r_rwlock); 2710 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2711 INTR(vp))) 2712 return (EINTR); 2713 } 2714 2715 va.va_mask = AT_SIZE; 2716 error = nfs4getattr(vp, &va, cr); 2717 if (error) 2718 return (error); 2719 uiop->uio_loffset = va.va_size; 2720 } 2721 2722 offset = uiop->uio_loffset + uiop->uio_resid; 2723 2724 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2725 return (EINVAL); 2726 2727 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2728 limit = MAXOFFSET_T; 2729 2730 /* 2731 * Check to make sure that the process will not exceed 2732 * its limit on file size. It is okay to write up to 2733 * the limit, but not beyond. Thus, the write which 2734 * reaches the limit will be short and the next write 2735 * will return an error. 2736 */ 2737 remainder = 0; 2738 if (offset > uiop->uio_llimit) { 2739 remainder = offset - uiop->uio_llimit; 2740 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2741 if (uiop->uio_resid <= 0) { 2742 proc_t *p = ttoproc(curthread); 2743 2744 uiop->uio_resid += remainder; 2745 mutex_enter(&p->p_lock); 2746 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2747 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2748 mutex_exit(&p->p_lock); 2749 return (EFBIG); 2750 } 2751 } 2752 2753 /* update the change attribute, if we have a write delegation */ 2754 2755 mutex_enter(&rp->r_statev4_lock); 2756 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2757 rp->r_deleg_change++; 2758 2759 mutex_exit(&rp->r_statev4_lock); 2760 2761 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2762 return (EINTR); 2763 2764 /* 2765 * Bypass VM if caching has been disabled (e.g., locking) or if 2766 * using client-side direct I/O and the file is not mmap'd and 2767 * there are no cached pages. 
2768 */ 2769 if ((vp->v_flag & VNOCACHE) || 2770 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2771 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2772 size_t bufsize; 2773 int count; 2774 u_offset_t org_offset; 2775 stable_how4 stab_comm; 2776 nfs4_fwrite: 2777 if (rp->r_flags & R4STALE) { 2778 resid = uiop->uio_resid; 2779 offset = uiop->uio_loffset; 2780 error = rp->r_error; 2781 goto bottom; 2782 } 2783 2784 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2785 base = kmem_alloc(bufsize, KM_SLEEP); 2786 do { 2787 if (ioflag & FDSYNC) 2788 stab_comm = DATA_SYNC4; 2789 else 2790 stab_comm = FILE_SYNC4; 2791 resid = uiop->uio_resid; 2792 offset = uiop->uio_loffset; 2793 count = MIN(uiop->uio_resid, bufsize); 2794 org_offset = uiop->uio_loffset; 2795 error = uiomove(base, count, UIO_WRITE, uiop); 2796 if (!error) { 2797 error = nfs4write(vp, base, org_offset, 2798 count, cr, &stab_comm); 2799 if (!error) { 2800 mutex_enter(&rp->r_statelock); 2801 if (rp->r_size < uiop->uio_loffset) 2802 rp->r_size = uiop->uio_loffset; 2803 mutex_exit(&rp->r_statelock); 2804 } 2805 } 2806 } while (!error && uiop->uio_resid > 0); 2807 kmem_free(base, bufsize); 2808 goto bottom; 2809 } 2810 2811 bsize = vp->v_vfsp->vfs_bsize; 2812 2813 do { 2814 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2815 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2816 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2817 2818 resid = uiop->uio_resid; 2819 offset = uiop->uio_loffset; 2820 2821 if (rp->r_flags & R4STALE) { 2822 error = rp->r_error; 2823 break; 2824 } 2825 2826 /* 2827 * Don't create dirty pages faster than they 2828 * can be cleaned so that the system doesn't 2829 * get imbalanced. If the async queue is 2830 * maxed out, then wait for it to drain before 2831 * creating more dirty pages. Also, wait for 2832 * any threads doing pagewalks in the vop_getattr 2833 * entry points so that they don't block for 2834 * long periods. 
2835 */ 2836 mutex_enter(&rp->r_statelock); 2837 while ((mi->mi_max_threads != 0 && 2838 rp->r_awcount > 2 * mi->mi_max_threads) || 2839 rp->r_gcount > 0) 2840 cv_wait(&rp->r_cv, &rp->r_statelock); 2841 mutex_exit(&rp->r_statelock); 2842 2843 if (vpm_enable) { 2844 /* 2845 * It will use kpm mappings, so no need to 2846 * pass an address. 2847 */ 2848 error = writerp4(rp, NULL, n, uiop, 0); 2849 } else { 2850 if (segmap_kpm) { 2851 int pon = uiop->uio_loffset & PAGEOFFSET; 2852 size_t pn = MIN(PAGESIZE - pon, 2853 uiop->uio_resid); 2854 int pagecreate; 2855 2856 mutex_enter(&rp->r_statelock); 2857 pagecreate = (pon == 0) && (pn == PAGESIZE || 2858 uiop->uio_loffset + pn >= rp->r_size); 2859 mutex_exit(&rp->r_statelock); 2860 2861 base = segmap_getmapflt(segkmap, vp, off + on, 2862 pn, !pagecreate, S_WRITE); 2863 2864 error = writerp4(rp, base + pon, n, uiop, 2865 pagecreate); 2866 2867 } else { 2868 base = segmap_getmapflt(segkmap, vp, off + on, 2869 n, 0, S_READ); 2870 error = writerp4(rp, base + on, n, uiop, 0); 2871 } 2872 } 2873 2874 if (!error) { 2875 if (mi->mi_flags & MI4_NOAC) 2876 flags = SM_WRITE; 2877 else if ((uiop->uio_loffset % bsize) == 0 || 2878 IS_SWAPVP(vp)) { 2879 /* 2880 * Have written a whole block. 2881 * Start an asynchronous write 2882 * and mark the buffer to 2883 * indicate that it won't be 2884 * needed again soon. 
2885 */ 2886 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2887 } else 2888 flags = 0; 2889 if ((ioflag & (FSYNC|FDSYNC)) || 2890 (rp->r_flags & R4OUTOFSPACE)) { 2891 flags &= ~SM_ASYNC; 2892 flags |= SM_WRITE; 2893 } 2894 if (vpm_enable) { 2895 error = vpm_sync_pages(vp, off, n, flags); 2896 } else { 2897 error = segmap_release(segkmap, base, flags); 2898 } 2899 } else { 2900 if (vpm_enable) { 2901 (void) vpm_sync_pages(vp, off, n, 0); 2902 } else { 2903 (void) segmap_release(segkmap, base, 0); 2904 } 2905 /* 2906 * In the event that we got an access error while 2907 * faulting in a page for a write-only file just 2908 * force a write. 2909 */ 2910 if (error == EACCES) 2911 goto nfs4_fwrite; 2912 } 2913 } while (!error && uiop->uio_resid > 0); 2914 2915 bottom: 2916 if (error) { 2917 uiop->uio_resid = resid + remainder; 2918 uiop->uio_loffset = offset; 2919 } else { 2920 uiop->uio_resid += remainder; 2921 2922 mutex_enter(&rp->r_statev4_lock); 2923 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2924 gethrestime(&rp->r_attr.va_mtime); 2925 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2926 } 2927 mutex_exit(&rp->r_statev4_lock); 2928 } 2929 2930 nfs_rw_exit(&rp->r_lkserlock); 2931 2932 return (error); 2933 } 2934 2935 /* 2936 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2937 */ 2938 static int 2939 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2940 int flags, cred_t *cr) 2941 { 2942 struct buf *bp; 2943 int error; 2944 page_t *savepp; 2945 uchar_t fsdata; 2946 stable_how4 stab_comm; 2947 2948 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 2949 bp = pageio_setup(pp, len, vp, flags); 2950 ASSERT(bp != NULL); 2951 2952 /* 2953 * pageio_setup should have set b_addr to 0. This 2954 * is correct since we want to do I/O on a page 2955 * boundary. bp_mapin will use this addr to calculate 2956 * an offset, and then set b_addr to the kernel virtual 2957 * address it allocated for us. 
2958 */ 2959 ASSERT(bp->b_un.b_addr == 0); 2960 2961 bp->b_edev = 0; 2962 bp->b_dev = 0; 2963 bp->b_lblkno = lbtodb(off); 2964 bp->b_file = vp; 2965 bp->b_offset = (offset_t)off; 2966 bp_mapin(bp); 2967 2968 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 2969 freemem > desfree) 2970 stab_comm = UNSTABLE4; 2971 else 2972 stab_comm = FILE_SYNC4; 2973 2974 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 2975 2976 bp_mapout(bp); 2977 pageio_done(bp); 2978 2979 if (stab_comm == UNSTABLE4) 2980 fsdata = C_DELAYCOMMIT; 2981 else 2982 fsdata = C_NOCOMMIT; 2983 2984 savepp = pp; 2985 do { 2986 pp->p_fsdata = fsdata; 2987 } while ((pp = pp->p_next) != savepp); 2988 2989 return (error); 2990 } 2991 2992 /* 2993 */ 2994 static int 2995 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 2996 { 2997 nfs4_open_owner_t *oop; 2998 nfs4_open_stream_t *osp; 2999 rnode4_t *rp = VTOR4(vp); 3000 mntinfo4_t *mi = VTOMI4(vp); 3001 int reopen_needed; 3002 3003 ASSERT(nfs_zone() == mi->mi_zone); 3004 3005 3006 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3007 if (!oop) 3008 return (EIO); 3009 3010 /* returns with 'os_sync_lock' held */ 3011 osp = find_open_stream(oop, rp); 3012 if (!osp) { 3013 open_owner_rele(oop); 3014 return (EIO); 3015 } 3016 3017 if (osp->os_failed_reopen) { 3018 mutex_exit(&osp->os_sync_lock); 3019 open_stream_rele(osp, rp); 3020 open_owner_rele(oop); 3021 return (EIO); 3022 } 3023 3024 /* 3025 * Determine whether a reopen is needed. If this 3026 * is a delegation open stream, then the os_delegation bit 3027 * should be set. 
3028 */ 3029 3030 reopen_needed = osp->os_delegation; 3031 3032 mutex_exit(&osp->os_sync_lock); 3033 open_owner_rele(oop); 3034 3035 if (reopen_needed) { 3036 nfs4_error_zinit(ep); 3037 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3038 mutex_enter(&osp->os_sync_lock); 3039 if (ep->error || ep->stat || osp->os_failed_reopen) { 3040 mutex_exit(&osp->os_sync_lock); 3041 open_stream_rele(osp, rp); 3042 return (EIO); 3043 } 3044 mutex_exit(&osp->os_sync_lock); 3045 } 3046 open_stream_rele(osp, rp); 3047 3048 return (0); 3049 } 3050 3051 /* 3052 * Write to file. Writes to remote server in largest size 3053 * chunks that the server can handle. Write is synchronous. 3054 */ 3055 static int 3056 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3057 stable_how4 *stab_comm) 3058 { 3059 mntinfo4_t *mi; 3060 COMPOUND4args_clnt args; 3061 COMPOUND4res_clnt res; 3062 WRITE4args *wargs; 3063 WRITE4res *wres; 3064 nfs_argop4 argop[2]; 3065 nfs_resop4 *resop; 3066 int tsize; 3067 stable_how4 stable; 3068 rnode4_t *rp; 3069 int doqueue = 1; 3070 bool_t needrecov; 3071 nfs4_recov_state_t recov_state; 3072 nfs4_stateid_types_t sid_types; 3073 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3074 3075 rp = VTOR4(vp); 3076 mi = VTOMI4(vp); 3077 3078 ASSERT(nfs_zone() == mi->mi_zone); 3079 3080 stable = *stab_comm; 3081 *stab_comm = FILE_SYNC4; 3082 3083 needrecov = FALSE; 3084 recov_state.rs_flags = 0; 3085 recov_state.rs_num_retry_despite_err = 0; 3086 nfs4_init_stateid_types(&sid_types); 3087 3088 recov_retry: 3089 args.ctag = TAG_WRITE; 3090 args.array_len = 2; 3091 args.array = argop; 3092 3093 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3094 &recov_state, NULL); 3095 if (e.error) 3096 return (e.error); 3097 3098 /* 0. putfh target fh */ 3099 argop[0].argop = OP_CPUTFH; 3100 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3101 3102 /* 1. 
write */ 3103 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3104 3105 do { 3106 3107 wargs->offset = (offset4)offset; 3108 wargs->data_val = base; 3109 3110 if (mi->mi_io_kstats) { 3111 mutex_enter(&mi->mi_lock); 3112 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3113 mutex_exit(&mi->mi_lock); 3114 } 3115 3116 if ((vp->v_flag & VNOCACHE) || 3117 (rp->r_flags & R4DIRECTIO) || 3118 (mi->mi_flags & MI4_DIRECTIO)) 3119 tsize = MIN(mi->mi_stsize, count); 3120 else 3121 tsize = MIN(mi->mi_curwrite, count); 3122 wargs->data_len = (uint_t)tsize; 3123 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3124 3125 if (mi->mi_io_kstats) { 3126 mutex_enter(&mi->mi_lock); 3127 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3128 mutex_exit(&mi->mi_lock); 3129 } 3130 3131 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3132 if (e.error && !needrecov) { 3133 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3134 &recov_state, needrecov); 3135 return (e.error); 3136 } 3137 3138 3139 /* 3140 * Do handling of OLD_STATEID outside 3141 * of the normal recovery framework. 3142 * 3143 * If write receives a BAD stateid error while using a 3144 * delegation stateid, retry using the open stateid (if it 3145 * exists). If it doesn't have an open stateid, reopen the 3146 * file first, then retry. 
3147 */ 3148 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3149 sid_types.cur_sid_type != SPEC_SID) { 3150 nfs4_save_stateid(&wargs->stateid, &sid_types); 3151 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3152 &recov_state, needrecov); 3153 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3154 goto recov_retry; 3155 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3156 sid_types.cur_sid_type == DEL_SID) { 3157 nfs4_save_stateid(&wargs->stateid, &sid_types); 3158 mutex_enter(&rp->r_statev4_lock); 3159 rp->r_deleg_return_pending = TRUE; 3160 mutex_exit(&rp->r_statev4_lock); 3161 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3162 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3163 &recov_state, needrecov); 3164 (void) xdr_free(xdr_COMPOUND4res_clnt, 3165 (caddr_t)&res); 3166 return (EIO); 3167 } 3168 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3169 &recov_state, needrecov); 3170 /* hold needed for nfs4delegreturn_thread */ 3171 VN_HOLD(vp); 3172 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3173 NFS4_DR_DISCARD), FALSE); 3174 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3175 goto recov_retry; 3176 } 3177 3178 if (needrecov) { 3179 bool_t abort; 3180 3181 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3182 "nfs4write: client got error %d, res.status %d" 3183 ", so start recovery", e.error, res.status)); 3184 3185 abort = nfs4_start_recovery(&e, 3186 VTOMI4(vp), vp, NULL, &wargs->stateid, 3187 NULL, OP_WRITE, NULL); 3188 if (!e.error) { 3189 e.error = geterrno4(res.status); 3190 (void) xdr_free(xdr_COMPOUND4res_clnt, 3191 (caddr_t)&res); 3192 } 3193 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3194 &recov_state, needrecov); 3195 if (abort == FALSE) 3196 goto recov_retry; 3197 return (e.error); 3198 } 3199 3200 if (res.status) { 3201 e.error = geterrno4(res.status); 3202 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3203 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3204 &recov_state, needrecov); 3205 return (e.error); 3206 } 3207 3208 
resop = &res.array[1]; /* write res */ 3209 wres = &resop->nfs_resop4_u.opwrite; 3210 3211 if ((int)wres->count > tsize) { 3212 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3213 3214 zcmn_err(getzoneid(), CE_WARN, 3215 "nfs4write: server wrote %u, requested was %u", 3216 (int)wres->count, tsize); 3217 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3218 &recov_state, needrecov); 3219 return (EIO); 3220 } 3221 if (wres->committed == UNSTABLE4) { 3222 *stab_comm = UNSTABLE4; 3223 if (wargs->stable == DATA_SYNC4 || 3224 wargs->stable == FILE_SYNC4) { 3225 (void) xdr_free(xdr_COMPOUND4res_clnt, 3226 (caddr_t)&res); 3227 zcmn_err(getzoneid(), CE_WARN, 3228 "nfs4write: server %s did not commit " 3229 "to stable storage", 3230 rp->r_server->sv_hostname); 3231 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3232 &recov_state, needrecov); 3233 return (EIO); 3234 } 3235 } 3236 3237 tsize = (int)wres->count; 3238 count -= tsize; 3239 base += tsize; 3240 offset += tsize; 3241 if (mi->mi_io_kstats) { 3242 mutex_enter(&mi->mi_lock); 3243 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3244 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3245 tsize; 3246 mutex_exit(&mi->mi_lock); 3247 } 3248 lwp_stat_update(LWP_STAT_OUBLK, 1); 3249 mutex_enter(&rp->r_statelock); 3250 if (rp->r_flags & R4HAVEVERF) { 3251 if (rp->r_writeverf != wres->writeverf) { 3252 nfs4_set_mod(vp); 3253 rp->r_writeverf = wres->writeverf; 3254 } 3255 } else { 3256 rp->r_writeverf = wres->writeverf; 3257 rp->r_flags |= R4HAVEVERF; 3258 } 3259 PURGE_ATTRCACHE4_LOCKED(rp); 3260 rp->r_flags |= R4WRITEMODIFIED; 3261 gethrestime(&rp->r_attr.va_mtime); 3262 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3263 mutex_exit(&rp->r_statelock); 3264 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3265 } while (count); 3266 3267 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, needrecov); 3268 3269 return (e.error); 3270 } 3271 3272 /* 3273 * Read from a file. Reads data in largest chunks our interface can handle. 
3274 */ 3275 static int 3276 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3277 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3278 { 3279 mntinfo4_t *mi; 3280 COMPOUND4args_clnt args; 3281 COMPOUND4res_clnt res; 3282 READ4args *rargs; 3283 nfs_argop4 argop[2]; 3284 int tsize; 3285 int doqueue; 3286 rnode4_t *rp; 3287 int data_len; 3288 bool_t is_eof; 3289 bool_t needrecov = FALSE; 3290 nfs4_recov_state_t recov_state; 3291 nfs4_stateid_types_t sid_types; 3292 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3293 3294 rp = VTOR4(vp); 3295 mi = VTOMI4(vp); 3296 doqueue = 1; 3297 3298 ASSERT(nfs_zone() == mi->mi_zone); 3299 3300 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3301 3302 args.array_len = 2; 3303 args.array = argop; 3304 3305 nfs4_init_stateid_types(&sid_types); 3306 3307 recov_state.rs_flags = 0; 3308 recov_state.rs_num_retry_despite_err = 0; 3309 3310 recov_retry: 3311 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3312 &recov_state, NULL); 3313 if (e.error) 3314 return (e.error); 3315 3316 /* putfh target fh */ 3317 argop[0].argop = OP_CPUTFH; 3318 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3319 3320 /* read */ 3321 argop[1].argop = OP_READ; 3322 rargs = &argop[1].nfs_argop4_u.opread; 3323 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3324 OP_READ, &sid_types, async); 3325 3326 do { 3327 if (mi->mi_io_kstats) { 3328 mutex_enter(&mi->mi_lock); 3329 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3330 mutex_exit(&mi->mi_lock); 3331 } 3332 3333 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3334 "nfs4read: %s call, rp %s", 3335 needrecov ? 
"recov" : "first", 3336 rnode4info(rp))); 3337 3338 if ((vp->v_flag & VNOCACHE) || 3339 (rp->r_flags & R4DIRECTIO) || 3340 (mi->mi_flags & MI4_DIRECTIO)) 3341 tsize = MIN(mi->mi_tsize, count); 3342 else 3343 tsize = MIN(mi->mi_curread, count); 3344 rargs->offset = (offset4)offset; 3345 rargs->count = (count4)tsize; 3346 rargs->res_data_val_alt = NULL; 3347 rargs->res_mblk = NULL; 3348 rargs->res_uiop = NULL; 3349 rargs->res_maxsize = 0; 3350 if (uiop) 3351 rargs->res_uiop = uiop; 3352 else 3353 rargs->res_data_val_alt = base; 3354 rargs->res_maxsize = tsize; 3355 3356 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3357 #ifdef DEBUG 3358 if (nfs4read_error_inject) { 3359 res.status = nfs4read_error_inject; 3360 nfs4read_error_inject = 0; 3361 } 3362 #endif 3363 3364 if (mi->mi_io_kstats) { 3365 mutex_enter(&mi->mi_lock); 3366 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3367 mutex_exit(&mi->mi_lock); 3368 } 3369 3370 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3371 if (e.error != 0 && !needrecov) { 3372 nfs4_end_fop(mi, vp, NULL, OH_READ, 3373 &recov_state, needrecov); 3374 return (e.error); 3375 } 3376 3377 /* 3378 * Do proper retry for OLD and BAD stateid errors outside 3379 * of the normal recovery framework. There are two differences 3380 * between async and sync reads. The first is that we allow 3381 * retry on BAD_STATEID for async reads, but not sync reads. 3382 * The second is that we mark the file dead for a failed 3383 * attempt with a special stateid for sync reads, but just 3384 * return EIO for async reads. 3385 * 3386 * If a sync read receives a BAD stateid error while using a 3387 * delegation stateid, retry using the open stateid (if it 3388 * exists). If it doesn't have an open stateid, reopen the 3389 * file first, then retry. 
3390 */ 3391 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3392 res.status == NFS4ERR_BAD_STATEID) && async) { 3393 nfs4_end_fop(mi, vp, NULL, OH_READ, 3394 &recov_state, needrecov); 3395 if (sid_types.cur_sid_type == SPEC_SID) { 3396 (void) xdr_free(xdr_COMPOUND4res_clnt, 3397 (caddr_t)&res); 3398 return (EIO); 3399 } 3400 nfs4_save_stateid(&rargs->stateid, &sid_types); 3401 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3402 goto recov_retry; 3403 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3404 !async && sid_types.cur_sid_type != SPEC_SID) { 3405 nfs4_save_stateid(&rargs->stateid, &sid_types); 3406 nfs4_end_fop(mi, vp, NULL, OH_READ, 3407 &recov_state, needrecov); 3408 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3409 goto recov_retry; 3410 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3411 sid_types.cur_sid_type == DEL_SID) { 3412 nfs4_save_stateid(&rargs->stateid, &sid_types); 3413 mutex_enter(&rp->r_statev4_lock); 3414 rp->r_deleg_return_pending = TRUE; 3415 mutex_exit(&rp->r_statev4_lock); 3416 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3417 nfs4_end_fop(mi, vp, NULL, OH_READ, 3418 &recov_state, needrecov); 3419 (void) xdr_free(xdr_COMPOUND4res_clnt, 3420 (caddr_t)&res); 3421 return (EIO); 3422 } 3423 nfs4_end_fop(mi, vp, NULL, OH_READ, 3424 &recov_state, needrecov); 3425 /* hold needed for nfs4delegreturn_thread */ 3426 VN_HOLD(vp); 3427 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3428 NFS4_DR_DISCARD), FALSE); 3429 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3430 goto recov_retry; 3431 } 3432 if (needrecov) { 3433 bool_t abort; 3434 3435 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3436 "nfs4read: initiating recovery\n")); 3437 3438 abort = nfs4_start_recovery(&e, 3439 mi, vp, NULL, &rargs->stateid, 3440 NULL, OP_READ, NULL); 3441 nfs4_end_fop(mi, vp, NULL, OH_READ, 3442 &recov_state, needrecov); 3443 /* 3444 * Do not retry if we got OLD_STATEID using a special 3445 * 
stateid. This avoids looping with a broken server. 3446 */ 3447 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3448 sid_types.cur_sid_type == SPEC_SID) 3449 abort = TRUE; 3450 3451 if (abort == FALSE) { 3452 /* 3453 * Need to retry all possible stateids in 3454 * case the recovery error wasn't stateid 3455 * related or the stateids have become 3456 * stale (server reboot). 3457 */ 3458 nfs4_init_stateid_types(&sid_types); 3459 (void) xdr_free(xdr_COMPOUND4res_clnt, 3460 (caddr_t)&res); 3461 goto recov_retry; 3462 } 3463 3464 if (!e.error) { 3465 e.error = geterrno4(res.status); 3466 (void) xdr_free(xdr_COMPOUND4res_clnt, 3467 (caddr_t)&res); 3468 } 3469 return (e.error); 3470 } 3471 3472 if (res.status) { 3473 e.error = geterrno4(res.status); 3474 nfs4_end_fop(mi, vp, NULL, OH_READ, 3475 &recov_state, needrecov); 3476 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3477 return (e.error); 3478 } 3479 3480 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3481 count -= data_len; 3482 if (base) 3483 base += data_len; 3484 offset += data_len; 3485 if (mi->mi_io_kstats) { 3486 mutex_enter(&mi->mi_lock); 3487 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3488 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3489 mutex_exit(&mi->mi_lock); 3490 } 3491 lwp_stat_update(LWP_STAT_INBLK, 1); 3492 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3493 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3494 3495 } while (count && !is_eof); 3496 3497 *residp = count; 3498 3499 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3500 3501 return (e.error); 3502 } 3503 3504 /* ARGSUSED */ 3505 static int 3506 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) 3507 { 3508 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3509 return (EIO); 3510 switch (cmd) { 3511 case _FIODIRECTIO: 3512 return (nfs4_directio(vp, (int)arg, cr)); 3513 default: 3514 return (ENOTTY); 3515 } 3516 } 3517 3518 static int 3519 nfs4_getattr(vnode_t *vp, struct 
vattr *vap, int flags, cred_t *cr) 3520 { 3521 int error; 3522 rnode4_t *rp = VTOR4(vp); 3523 3524 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3525 return (EIO); 3526 /* 3527 * If it has been specified that the return value will 3528 * just be used as a hint, and we are only being asked 3529 * for size, fsid or rdevid, then return the client's 3530 * notion of these values without checking to make sure 3531 * that the attribute cache is up to date. 3532 * The whole point is to avoid an over the wire GETATTR 3533 * call. 3534 */ 3535 if (flags & ATTR_HINT) { 3536 if (vap->va_mask == 3537 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3538 mutex_enter(&rp->r_statelock); 3539 if (vap->va_mask | AT_SIZE) 3540 vap->va_size = rp->r_size; 3541 if (vap->va_mask | AT_FSID) 3542 vap->va_fsid = rp->r_attr.va_fsid; 3543 if (vap->va_mask | AT_RDEV) 3544 vap->va_rdev = rp->r_attr.va_rdev; 3545 mutex_exit(&rp->r_statelock); 3546 return (0); 3547 } 3548 } 3549 3550 /* 3551 * Only need to flush pages if asking for the mtime 3552 * and if there any dirty pages or any outstanding 3553 * asynchronous (write) requests for this file. 
3554 */ 3555 if (vap->va_mask & AT_MTIME) { 3556 rp = VTOR4(vp); 3557 if (nfs4_has_pages(vp)) { 3558 mutex_enter(&rp->r_statev4_lock); 3559 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3560 mutex_exit(&rp->r_statev4_lock); 3561 if (rp->r_flags & R4DIRTY || 3562 rp->r_awcount > 0) { 3563 mutex_enter(&rp->r_statelock); 3564 rp->r_gcount++; 3565 mutex_exit(&rp->r_statelock); 3566 error = 3567 nfs4_putpage(vp, (u_offset_t)0, 3568 0, 0, cr); 3569 mutex_enter(&rp->r_statelock); 3570 if (error && (error == ENOSPC || 3571 error == EDQUOT)) { 3572 if (!rp->r_error) 3573 rp->r_error = error; 3574 } 3575 if (--rp->r_gcount == 0) 3576 cv_broadcast(&rp->r_cv); 3577 mutex_exit(&rp->r_statelock); 3578 } 3579 } else { 3580 mutex_exit(&rp->r_statev4_lock); 3581 } 3582 } 3583 } 3584 return (nfs4getattr(vp, vap, cr)); 3585 } 3586 3587 int 3588 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3589 { 3590 /* 3591 * If these are the only two bits cleared 3592 * on the server then return 0 (OK) else 3593 * return 1 (BAD). 3594 */ 3595 on_client &= ~(S_ISUID|S_ISGID); 3596 if (on_client == from_server) 3597 return (0); 3598 else 3599 return (1); 3600 } 3601 3602 /*ARGSUSED4*/ 3603 static int 3604 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3605 caller_context_t *ct) 3606 { 3607 if (vap->va_mask & AT_NOSET) 3608 return (EINVAL); 3609 3610 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3611 return (EIO); 3612 3613 /* 3614 * Don't call secpolicy_vnode_setattr, the client cannot 3615 * use its cached attributes to make security decisions 3616 * as the server may be faking mode bits or mapping uid/gid. 3617 * Always just let the server to the checking. 3618 * If we provide the ability to remove basic priviledges 3619 * to setattr (e.g. basic without chmod) then we will 3620 * need to add a check here before calling the server. 
3621 */ 3622 3623 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3624 } 3625 3626 /* 3627 * To replace the "guarded" version 3 setattr, we use two types of compound 3628 * setattr requests: 3629 * 1. The "normal" setattr, used when the size of the file isn't being 3630 * changed - { Putfh <fh>; Setattr; Getattr }/ 3631 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3632 * with only ctime as the argument. If the server ctime differs from 3633 * what is cached on the client, the verify will fail, but we would 3634 * already have the ctime from the preceding getattr, so just set it 3635 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3636 * Setattr; Getattr }. 3637 * 3638 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3639 * this setattr and NULL if they are not. 3640 */ 3641 static int 3642 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3643 vsecattr_t *vsap) 3644 { 3645 COMPOUND4args_clnt args; 3646 COMPOUND4res_clnt res, *resp = NULL; 3647 nfs4_ga_res_t *garp = NULL; 3648 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3649 nfs_argop4 argop[5]; 3650 int verify_argop = -1; 3651 int setattr_argop = 1; 3652 nfs_resop4 *resop; 3653 vattr_t va; 3654 rnode4_t *rp; 3655 int doqueue = 1; 3656 uint_t mask = vap->va_mask; 3657 mode_t omode; 3658 vsecattr_t *vsp; 3659 timestruc_t ctime; 3660 bool_t needrecov = FALSE; 3661 nfs4_recov_state_t recov_state; 3662 nfs4_stateid_types_t sid_types; 3663 stateid4 stateid; 3664 hrtime_t t; 3665 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3666 servinfo4_t *svp; 3667 bitmap4 supp_attrs; 3668 3669 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3670 rp = VTOR4(vp); 3671 nfs4_init_stateid_types(&sid_types); 3672 3673 /* 3674 * Only need to flush pages if there are any pages and 3675 * if the file is marked as dirty in some fashion. 
The 3676 * file must be flushed so that we can accurately 3677 * determine the size of the file and the cached data 3678 * after the SETATTR returns. A file is considered to 3679 * be dirty if it is either marked with R4DIRTY, has 3680 * outstanding i/o's active, or is mmap'd. In this 3681 * last case, we can't tell whether there are dirty 3682 * pages, so we flush just to be sure. 3683 */ 3684 if (nfs4_has_pages(vp) && 3685 ((rp->r_flags & R4DIRTY) || 3686 rp->r_count > 0 || 3687 rp->r_mapcnt > 0)) { 3688 ASSERT(vp->v_type != VCHR); 3689 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr); 3690 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3691 mutex_enter(&rp->r_statelock); 3692 if (!rp->r_error) 3693 rp->r_error = e.error; 3694 mutex_exit(&rp->r_statelock); 3695 } 3696 } 3697 3698 if (mask & AT_SIZE) { 3699 /* 3700 * Verification setattr compound for non-deleg AT_SIZE: 3701 * { Putfh; Getattr; Verify; Setattr; Getattr } 3702 * Set ctime local here (outside the do_again label) 3703 * so that subsequent retries (after failed VERIFY) 3704 * will use ctime from GETATTR results (from failed 3705 * verify compound) as VERIFY arg. 3706 * If file has delegation, then VERIFY(time_metadata) 3707 * is of little added value, so don't bother. 
3708 */ 3709 mutex_enter(&rp->r_statev4_lock); 3710 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3711 rp->r_deleg_return_pending) { 3712 numops = 5; 3713 ctime = rp->r_attr.va_ctime; 3714 } 3715 mutex_exit(&rp->r_statev4_lock); 3716 } 3717 3718 recov_state.rs_flags = 0; 3719 recov_state.rs_num_retry_despite_err = 0; 3720 3721 args.ctag = TAG_SETATTR; 3722 do_again: 3723 recov_retry: 3724 setattr_argop = numops - 2; 3725 3726 args.array = argop; 3727 args.array_len = numops; 3728 3729 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3730 if (e.error) 3731 return (e.error); 3732 3733 3734 /* putfh target fh */ 3735 argop[0].argop = OP_CPUTFH; 3736 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3737 3738 if (numops == 5) { 3739 /* 3740 * We only care about the ctime, but need to get mtime 3741 * and size for proper cache update. 3742 */ 3743 /* getattr */ 3744 argop[1].argop = OP_GETATTR; 3745 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3746 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3747 3748 /* verify - set later in loop */ 3749 verify_argop = 2; 3750 } 3751 3752 /* setattr */ 3753 svp = rp->r_server; 3754 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3755 supp_attrs = svp->sv_supp_attrs; 3756 nfs_rw_exit(&svp->sv_lock); 3757 3758 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3759 supp_attrs, &e.error, &sid_types); 3760 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3761 if (e.error) { 3762 /* req time field(s) overflow - return immediately */ 3763 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3764 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3765 opsetattr.obj_attributes); 3766 return (e.error); 3767 } 3768 omode = rp->r_attr.va_mode; 3769 3770 /* getattr */ 3771 argop[numops-1].argop = OP_GETATTR; 3772 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3773 /* 3774 * If we are setting the ACL (indicated only by vsap != NULL), request 3775 * the ACL in this getattr. The ACL returned from this getattr will be 3776 * used in updating the ACL cache. 3777 */ 3778 if (vsap != NULL) 3779 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3780 FATTR4_ACL_MASK; 3781 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3782 3783 /* 3784 * setattr iterates if the object size is set and the cached ctime 3785 * does not match the file ctime. In that case, verify the ctime first. 3786 */ 3787 3788 do { 3789 if (verify_argop != -1) { 3790 /* 3791 * Verify that the ctime match before doing setattr. 3792 */ 3793 va.va_mask = AT_CTIME; 3794 va.va_ctime = ctime; 3795 svp = rp->r_server; 3796 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3797 supp_attrs = svp->sv_supp_attrs; 3798 nfs_rw_exit(&svp->sv_lock); 3799 e.error = nfs4args_verify(&argop[verify_argop], &va, 3800 OP_VERIFY, supp_attrs); 3801 if (e.error) { 3802 /* req time field(s) overflow - return */ 3803 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3804 needrecov); 3805 break; 3806 } 3807 } 3808 3809 doqueue = 1; 3810 3811 t = gethrtime(); 3812 3813 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3814 3815 /* 3816 * Purge the access cache and ACL cache if changing either the 3817 * owner of the file, the group owner, or the mode. These may 3818 * change the access permissions of the file, so purge old 3819 * information and start over again. 
3820 */ 3821 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3822 (void) nfs4_access_purge_rp(rp); 3823 if (rp->r_secattr != NULL) { 3824 mutex_enter(&rp->r_statelock); 3825 vsp = rp->r_secattr; 3826 rp->r_secattr = NULL; 3827 mutex_exit(&rp->r_statelock); 3828 if (vsp != NULL) 3829 nfs4_acl_free_cache(vsp); 3830 } 3831 } 3832 3833 /* 3834 * If res.array_len == numops, then everything succeeded, 3835 * except for possibly the final getattr. If only the 3836 * last getattr failed, give up, and don't try recovery. 3837 */ 3838 if (res.array_len == numops) { 3839 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3840 needrecov); 3841 if (! e.error) 3842 resp = &res; 3843 break; 3844 } 3845 3846 /* 3847 * if either rpc call failed or completely succeeded - done 3848 */ 3849 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3850 if (e.error) { 3851 PURGE_ATTRCACHE4(vp); 3852 if (!needrecov) { 3853 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3854 needrecov); 3855 break; 3856 } 3857 } 3858 3859 /* 3860 * Do proper retry for OLD_STATEID outside of the normal 3861 * recovery framework. 3862 */ 3863 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3864 sid_types.cur_sid_type != SPEC_SID && 3865 sid_types.cur_sid_type != NO_SID) { 3866 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3867 needrecov); 3868 nfs4_save_stateid(&stateid, &sid_types); 3869 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3870 opsetattr.obj_attributes); 3871 if (verify_argop != -1) { 3872 nfs4args_verify_free(&argop[verify_argop]); 3873 verify_argop = -1; 3874 } 3875 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3876 goto recov_retry; 3877 } 3878 3879 if (needrecov) { 3880 bool_t abort; 3881 3882 abort = nfs4_start_recovery(&e, 3883 VTOMI4(vp), vp, NULL, NULL, NULL, 3884 OP_SETATTR, NULL); 3885 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3886 needrecov); 3887 /* 3888 * Do not retry if we failed with OLD_STATEID using 3889 * a special stateid. 
This is done to avoid looping 3890 * with a broken server. 3891 */ 3892 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3893 (sid_types.cur_sid_type == SPEC_SID || 3894 sid_types.cur_sid_type == NO_SID)) 3895 abort = TRUE; 3896 if (!e.error) { 3897 if (res.status == NFS4ERR_BADOWNER) 3898 nfs4_log_badowner(VTOMI4(vp), 3899 OP_SETATTR); 3900 3901 e.error = geterrno4(res.status); 3902 (void) xdr_free(xdr_COMPOUND4res_clnt, 3903 (caddr_t)&res); 3904 } 3905 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3906 opsetattr.obj_attributes); 3907 if (verify_argop != -1) { 3908 nfs4args_verify_free(&argop[verify_argop]); 3909 verify_argop = -1; 3910 } 3911 if (abort == FALSE) { 3912 /* 3913 * Need to retry all possible stateids in 3914 * case the recovery error wasn't stateid 3915 * related or the stateids have become 3916 * stale (server reboot). 3917 */ 3918 nfs4_init_stateid_types(&sid_types); 3919 goto recov_retry; 3920 } 3921 return (e.error); 3922 } 3923 3924 /* 3925 * Need to call nfs4_end_op before nfs4getattr to 3926 * avoid potential nfs4_start_op deadlock. See RFE 3927 * 4777612. Calls to nfs4_invalidate_pages() and 3928 * nfs4_purge_stale_fh() might also generate over the 3929 * wire calls which my cause nfs4_start_op() deadlock. 3930 */ 3931 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3932 3933 /* 3934 * Check to update lease. 3935 */ 3936 resp = &res; 3937 if (res.status == NFS4_OK) { 3938 break; 3939 } 3940 3941 /* 3942 * Check if verify failed to see if try again 3943 */ 3944 if ((verify_argop == -1) || (res.array_len != 3)) { 3945 /* 3946 * can't continue... 3947 */ 3948 if (res.status == NFS4ERR_BADOWNER) 3949 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 3950 3951 e.error = geterrno4(res.status); 3952 } else { 3953 /* 3954 * When the verify request fails, the client ctime is 3955 * not in sync with the server. 
This is the same as 3956 * the version 3 "not synchronized" error, and we 3957 * handle it in a similar manner (XXX do we need to???). 3958 * Use the ctime returned in the first getattr for 3959 * the input to the next verify. 3960 * If we couldn't get the attributes, then we give up 3961 * because we can't complete the operation as required. 3962 */ 3963 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 3964 } 3965 if (e.error) { 3966 PURGE_ATTRCACHE4(vp); 3967 nfs4_purge_stale_fh(e.error, vp, cr); 3968 } else { 3969 /* 3970 * retry with a new verify value 3971 */ 3972 ctime = garp->n4g_va.va_ctime; 3973 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3974 resp = NULL; 3975 } 3976 if (!e.error) { 3977 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3978 opsetattr.obj_attributes); 3979 if (verify_argop != -1) { 3980 nfs4args_verify_free(&argop[verify_argop]); 3981 verify_argop = -1; 3982 } 3983 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3984 goto do_again; 3985 } 3986 } while (!e.error); 3987 3988 if (e.error) { 3989 /* 3990 * If we are here, rfs4call has an irrecoverable error - return 3991 */ 3992 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3993 opsetattr.obj_attributes); 3994 if (verify_argop != -1) { 3995 nfs4args_verify_free(&argop[verify_argop]); 3996 verify_argop = -1; 3997 } 3998 if (resp) 3999 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4000 return (e.error); 4001 } 4002 4003 4004 4005 /* 4006 * If changing the size of the file, invalidate 4007 * any local cached data which is no longer part 4008 * of the file. We also possibly invalidate the 4009 * last page in the file. We could use 4010 * pvn_vpzero(), but this would mark the page as 4011 * modified and require it to be written back to 4012 * the server for no particularly good reason. 4013 * This way, if we access it, then we bring it 4014 * back in. A read should be cheaper than a 4015 * write. 
4016 */ 4017 if (mask & AT_SIZE) { 4018 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4019 } 4020 4021 /* either no error or one of the postop getattr failed */ 4022 4023 /* 4024 * XXX Perform a simplified version of wcc checking. Instead of 4025 * have another getattr to get pre-op, just purge cache if 4026 * any of the ops prior to and including the getattr failed. 4027 * If the getattr succeeded then update the attrcache accordingly. 4028 */ 4029 4030 garp = NULL; 4031 if (res.status == NFS4_OK) { 4032 /* 4033 * Last getattr 4034 */ 4035 resop = &res.array[numops - 1]; 4036 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4037 } 4038 /* 4039 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4040 * rather than filling it. See the function itself for details. 4041 */ 4042 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4043 if (garp != NULL) { 4044 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4045 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4046 vs_ace4_destroy(&garp->n4g_vsa); 4047 } else { 4048 if (vsap != NULL) { 4049 /* 4050 * The ACL was supposed to be set and to be 4051 * returned in the last getattr of this 4052 * compound, but for some reason the getattr 4053 * result doesn't contain the ACL. In this 4054 * case, purge the ACL cache. 4055 */ 4056 if (rp->r_secattr != NULL) { 4057 mutex_enter(&rp->r_statelock); 4058 vsp = rp->r_secattr; 4059 rp->r_secattr = NULL; 4060 mutex_exit(&rp->r_statelock); 4061 if (vsp != NULL) 4062 nfs4_acl_free_cache(vsp); 4063 } 4064 } 4065 } 4066 } 4067 4068 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4069 /* 4070 * Set the size, rather than relying on getting it updated 4071 * via a GETATTR. With delegations the client tries to 4072 * suppress GETATTR calls. 
4073 */ 4074 mutex_enter(&rp->r_statelock); 4075 rp->r_size = vap->va_size; 4076 mutex_exit(&rp->r_statelock); 4077 } 4078 4079 /* 4080 * Can free up request args and res 4081 */ 4082 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4083 opsetattr.obj_attributes); 4084 if (verify_argop != -1) { 4085 nfs4args_verify_free(&argop[verify_argop]); 4086 verify_argop = -1; 4087 } 4088 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4089 4090 /* 4091 * Some servers will change the mode to clear the setuid 4092 * and setgid bits when changing the uid or gid. The 4093 * client needs to compensate appropriately. 4094 */ 4095 if (mask & (AT_UID | AT_GID)) { 4096 int terror, do_setattr; 4097 4098 do_setattr = 0; 4099 va.va_mask = AT_MODE; 4100 terror = nfs4getattr(vp, &va, cr); 4101 if (!terror && 4102 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4103 (!(mask & AT_MODE) && va.va_mode != omode))) { 4104 va.va_mask = AT_MODE; 4105 if (mask & AT_MODE) { 4106 /* 4107 * We asked the mode to be changed and what 4108 * we just got from the server in getattr is 4109 * not what we wanted it to be, so set it now. 4110 */ 4111 va.va_mode = vap->va_mode; 4112 do_setattr = 1; 4113 } else { 4114 /* 4115 * We did not ask the mode to be changed, 4116 * Check to see that the server just cleared 4117 * I_SUID and I_GUID from it. If not then 4118 * set mode to omode with UID/GID cleared. 
4119 */ 4120 if (nfs4_compare_modes(va.va_mode, omode)) { 4121 omode &= ~(S_ISUID|S_ISGID); 4122 va.va_mode = omode; 4123 do_setattr = 1; 4124 } 4125 } 4126 4127 if (do_setattr) 4128 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4129 } 4130 } 4131 4132 return (e.error); 4133 } 4134 4135 /* ARGSUSED */ 4136 static int 4137 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr) 4138 { 4139 COMPOUND4args_clnt args; 4140 COMPOUND4res_clnt res; 4141 int doqueue; 4142 uint32_t acc, resacc, argacc; 4143 rnode4_t *rp; 4144 cred_t *cred, *ncr, *ncrfree = NULL; 4145 nfs4_access_type_t cacc; 4146 int num_ops; 4147 nfs_argop4 argop[3]; 4148 nfs_resop4 *resop; 4149 bool_t needrecov = FALSE, do_getattr; 4150 nfs4_recov_state_t recov_state; 4151 int rpc_error; 4152 hrtime_t t; 4153 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4154 mntinfo4_t *mi = VTOMI4(vp); 4155 4156 if (nfs_zone() != mi->mi_zone) 4157 return (EIO); 4158 4159 acc = 0; 4160 if (mode & VREAD) 4161 acc |= ACCESS4_READ; 4162 if (mode & VWRITE) { 4163 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4164 return (EROFS); 4165 if (vp->v_type == VDIR) 4166 acc |= ACCESS4_DELETE; 4167 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4168 } 4169 if (mode & VEXEC) { 4170 if (vp->v_type == VDIR) 4171 acc |= ACCESS4_LOOKUP; 4172 else 4173 acc |= ACCESS4_EXECUTE; 4174 } 4175 4176 if (VTOR4(vp)->r_acache != NULL) { 4177 e.error = nfs4_validate_caches(vp, cr); 4178 if (e.error) 4179 return (e.error); 4180 } 4181 4182 rp = VTOR4(vp); 4183 if (vp->v_type == VDIR) { 4184 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4185 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4186 } else { 4187 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4188 ACCESS4_EXECUTE; 4189 } 4190 recov_state.rs_flags = 0; 4191 recov_state.rs_num_retry_despite_err = 0; 4192 4193 cred = cr; 4194 /* 4195 * ncr and ncrfree both initially 4196 * point to the memory area returned 4197 * by crnetadjust(); 4198 * ncrfree not NULL when exiting means 4199 
* that we need to release it 4200 */ 4201 ncr = crnetadjust(cred); 4202 ncrfree = ncr; 4203 4204 tryagain: 4205 cacc = nfs4_access_check(rp, acc, cred); 4206 if (cacc == NFS4_ACCESS_ALLOWED) { 4207 if (ncrfree != NULL) 4208 crfree(ncrfree); 4209 return (0); 4210 } 4211 if (cacc == NFS4_ACCESS_DENIED) { 4212 /* 4213 * If the cred can be adjusted, try again 4214 * with the new cred. 4215 */ 4216 if (ncr != NULL) { 4217 cred = ncr; 4218 ncr = NULL; 4219 goto tryagain; 4220 } 4221 if (ncrfree != NULL) 4222 crfree(ncrfree); 4223 return (EACCES); 4224 } 4225 4226 recov_retry: 4227 /* 4228 * Don't take with r_statev4_lock here. r_deleg_type could 4229 * change as soon as lock is released. Since it is an int, 4230 * there is no atomicity issue. 4231 */ 4232 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4233 num_ops = do_getattr ? 3 : 2; 4234 4235 args.ctag = TAG_ACCESS; 4236 4237 args.array_len = num_ops; 4238 args.array = argop; 4239 4240 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4241 &recov_state, NULL)) { 4242 if (ncrfree != NULL) 4243 crfree(ncrfree); 4244 return (e.error); 4245 } 4246 4247 /* putfh target fh */ 4248 argop[0].argop = OP_CPUTFH; 4249 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4250 4251 /* access */ 4252 argop[1].argop = OP_ACCESS; 4253 argop[1].nfs_argop4_u.opaccess.access = argacc; 4254 4255 /* getattr */ 4256 if (do_getattr) { 4257 argop[2].argop = OP_GETATTR; 4258 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4259 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4260 } 4261 4262 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4263 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4264 rnode4info(VTOR4(vp)))); 4265 4266 doqueue = 1; 4267 t = gethrtime(); 4268 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4269 rpc_error = e.error; 4270 4271 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4272 if (needrecov) { 4273 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4274 "nfs4_access: initiating recovery\n")); 4275 4276 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4277 NULL, OP_ACCESS, NULL) == FALSE) { 4278 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4279 &recov_state, needrecov); 4280 if (!e.error) 4281 (void) xdr_free(xdr_COMPOUND4res_clnt, 4282 (caddr_t)&res); 4283 goto recov_retry; 4284 } 4285 } 4286 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4287 4288 if (e.error) 4289 goto out; 4290 4291 if (res.status) { 4292 e.error = geterrno4(res.status); 4293 /* 4294 * This might generate over the wire calls throught 4295 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4296 * here to avoid a deadlock. 4297 */ 4298 nfs4_purge_stale_fh(e.error, vp, cr); 4299 goto out; 4300 } 4301 resop = &res.array[1]; /* access res */ 4302 4303 resacc = resop->nfs_resop4_u.opaccess.access; 4304 4305 if (do_getattr) { 4306 resop++; /* getattr res */ 4307 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4308 t, cr, FALSE, NULL); 4309 } 4310 4311 if (!e.error) { 4312 nfs4_access_cache(rp, argacc, resacc, cred); 4313 /* 4314 * we just cached results with cred; if cred is the 4315 * adjusted credentials from crnetadjust, we do not want 4316 * to release them before exiting: hence setting ncrfree 4317 * to NULL 4318 */ 4319 if (cred != cr) 4320 ncrfree = NULL; 4321 /* XXX check the supported bits too? */ 4322 if ((acc & resacc) != acc) { 4323 /* 4324 * The following code implements the semantic 4325 * that a setuid root program has *at least* the 4326 * permissions of the user that is running the 4327 * program. 
See rfs3call() for more portions 4328 * of the implementation of this functionality. 4329 */ 4330 /* XXX-LP */ 4331 if (ncr != NULL) { 4332 (void) xdr_free(xdr_COMPOUND4res_clnt, 4333 (caddr_t)&res); 4334 cred = ncr; 4335 ncr = NULL; 4336 goto tryagain; 4337 } 4338 e.error = EACCES; 4339 } 4340 } 4341 4342 out: 4343 if (!rpc_error) 4344 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4345 4346 if (ncrfree != NULL) 4347 crfree(ncrfree); 4348 4349 return (e.error); 4350 } 4351 4352 static int 4353 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr) 4354 { 4355 COMPOUND4args_clnt args; 4356 COMPOUND4res_clnt res; 4357 int doqueue; 4358 rnode4_t *rp; 4359 nfs_argop4 argop[3]; 4360 nfs_resop4 *resop; 4361 READLINK4res *lr_res; 4362 nfs4_ga_res_t *garp; 4363 uint_t len; 4364 char *linkdata; 4365 bool_t needrecov = FALSE; 4366 nfs4_recov_state_t recov_state; 4367 hrtime_t t; 4368 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4369 4370 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4371 return (EIO); 4372 /* 4373 * Can't readlink anything other than a symbolic link. 4374 */ 4375 if (vp->v_type != VLNK) 4376 return (EINVAL); 4377 4378 rp = VTOR4(vp); 4379 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4380 e.error = nfs4_validate_caches(vp, cr); 4381 if (e.error) 4382 return (e.error); 4383 mutex_enter(&rp->r_statelock); 4384 if (rp->r_symlink.contents != NULL) { 4385 e.error = uiomove(rp->r_symlink.contents, 4386 rp->r_symlink.len, UIO_READ, uiop); 4387 mutex_exit(&rp->r_statelock); 4388 return (e.error); 4389 } 4390 mutex_exit(&rp->r_statelock); 4391 } 4392 recov_state.rs_flags = 0; 4393 recov_state.rs_num_retry_despite_err = 0; 4394 4395 recov_retry: 4396 args.array_len = 3; 4397 args.array = argop; 4398 args.ctag = TAG_READLINK; 4399 4400 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4401 if (e.error) { 4402 return (e.error); 4403 } 4404 4405 /* 0. 
putfh symlink fh */ 4406 argop[0].argop = OP_CPUTFH; 4407 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4408 4409 /* 1. readlink */ 4410 argop[1].argop = OP_READLINK; 4411 4412 /* 2. getattr */ 4413 argop[2].argop = OP_GETATTR; 4414 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4415 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4416 4417 doqueue = 1; 4418 4419 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4420 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4421 rnode4info(VTOR4(vp)))); 4422 4423 t = gethrtime(); 4424 4425 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4426 4427 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4428 if (needrecov) { 4429 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4430 "nfs4_readlink: initiating recovery\n")); 4431 4432 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4433 NULL, OP_READLINK, NULL) == FALSE) { 4434 if (!e.error) 4435 (void) xdr_free(xdr_COMPOUND4res_clnt, 4436 (caddr_t)&res); 4437 4438 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4439 needrecov); 4440 goto recov_retry; 4441 } 4442 } 4443 4444 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4445 4446 if (e.error) 4447 return (e.error); 4448 4449 /* 4450 * There is an path in the code below which calls 4451 * nfs4_purge_stale_fh(), which may generate otw calls through 4452 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4453 * here to avoid nfs4_start_op() deadlock. 
4454 */ 4455 4456 if (res.status && (res.array_len < args.array_len)) { 4457 /* 4458 * either Putfh or Link failed 4459 */ 4460 e.error = geterrno4(res.status); 4461 nfs4_purge_stale_fh(e.error, vp, cr); 4462 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4463 return (e.error); 4464 } 4465 4466 resop = &res.array[1]; /* readlink res */ 4467 lr_res = &resop->nfs_resop4_u.opreadlink; 4468 4469 /* 4470 * treat symlink names as data 4471 */ 4472 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4473 if (linkdata != NULL) { 4474 int uio_len = len - 1; 4475 /* len includes null byte, which we won't uiomove */ 4476 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4477 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4478 mutex_enter(&rp->r_statelock); 4479 if (rp->r_symlink.contents == NULL) { 4480 rp->r_symlink.contents = linkdata; 4481 rp->r_symlink.len = uio_len; 4482 rp->r_symlink.size = len; 4483 mutex_exit(&rp->r_statelock); 4484 } else { 4485 mutex_exit(&rp->r_statelock); 4486 kmem_free(linkdata, len); 4487 } 4488 } else { 4489 kmem_free(linkdata, len); 4490 } 4491 } 4492 if (res.status == NFS4_OK) { 4493 resop++; /* getattr res */ 4494 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4495 } 4496 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4497 4498 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4499 4500 /* 4501 * The over the wire error for attempting to readlink something 4502 * other than a symbolic link is ENXIO. However, we need to 4503 * return EINVAL instead of ENXIO, so we map it here. 4504 */ 4505 return (e.error == ENXIO ? EINVAL : e.error); 4506 } 4507 4508 /* 4509 * Flush local dirty pages to stable storage on the server. 4510 * 4511 * If FNODSYNC is specified, then there is nothing to do because 4512 * metadata changes are not cached on the client before being 4513 * sent to the server. 
4514 */ 4515 static int 4516 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr) 4517 { 4518 int error; 4519 4520 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4521 return (0); 4522 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4523 return (EIO); 4524 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4525 if (!error) 4526 error = VTOR4(vp)->r_error; 4527 return (error); 4528 } 4529 4530 /* 4531 * Weirdness: if the file was removed or the target of a rename 4532 * operation while it was open, it got renamed instead. Here we 4533 * remove the renamed file. 4534 */ 4535 static void 4536 nfs4_inactive(vnode_t *vp, cred_t *cr) 4537 { 4538 rnode4_t *rp; 4539 4540 ASSERT(vp != DNLC_NO_VNODE); 4541 4542 rp = VTOR4(vp); 4543 4544 if (IS_SHADOW(vp, rp)) { 4545 sv_inactive(vp); 4546 return; 4547 } 4548 4549 /* 4550 * If this is coming from the wrong zone, we let someone in the right 4551 * zone take care of it asynchronously. We can get here due to 4552 * VN_RELE() being called from pageout() or fsflush(). This call may 4553 * potentially turn into an expensive no-op if, for instance, v_count 4554 * gets incremented in the meantime, but it's still correct. 4555 */ 4556 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4557 nfs4_async_inactive(vp, cr); 4558 return; 4559 } 4560 4561 /* 4562 * Some of the cleanup steps might require over-the-wire 4563 * operations. Since VOP_INACTIVE can get called as a result of 4564 * other over-the-wire operations (e.g., an attribute cache update 4565 * can lead to a DNLC purge), doing those steps now would lead to a 4566 * nested call to the recovery framework, which can deadlock. So 4567 * do any over-the-wire cleanups asynchronously, in a separate 4568 * thread. 
4569 */ 4570 4571 mutex_enter(&rp->r_os_lock); 4572 mutex_enter(&rp->r_statelock); 4573 mutex_enter(&rp->r_statev4_lock); 4574 4575 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4576 mutex_exit(&rp->r_statev4_lock); 4577 mutex_exit(&rp->r_statelock); 4578 mutex_exit(&rp->r_os_lock); 4579 nfs4_async_inactive(vp, cr); 4580 return; 4581 } 4582 4583 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4584 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4585 mutex_exit(&rp->r_statev4_lock); 4586 mutex_exit(&rp->r_statelock); 4587 mutex_exit(&rp->r_os_lock); 4588 nfs4_async_inactive(vp, cr); 4589 return; 4590 } 4591 4592 if (rp->r_unldvp != NULL) { 4593 mutex_exit(&rp->r_statev4_lock); 4594 mutex_exit(&rp->r_statelock); 4595 mutex_exit(&rp->r_os_lock); 4596 nfs4_async_inactive(vp, cr); 4597 return; 4598 } 4599 mutex_exit(&rp->r_statev4_lock); 4600 mutex_exit(&rp->r_statelock); 4601 mutex_exit(&rp->r_os_lock); 4602 4603 rp4_addfree(rp, cr); 4604 } 4605 4606 /* 4607 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4608 * various bits of state. The caller must not refer to vp after this call. 
4609 */ 4610 4611 void 4612 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4613 { 4614 rnode4_t *rp = VTOR4(vp); 4615 nfs4_recov_state_t recov_state; 4616 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4617 vnode_t *unldvp; 4618 char *unlname; 4619 cred_t *unlcred; 4620 COMPOUND4args_clnt args; 4621 COMPOUND4res_clnt res, *resp; 4622 nfs_argop4 argop[2]; 4623 int doqueue; 4624 #ifdef DEBUG 4625 char *name; 4626 #endif 4627 4628 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4629 ASSERT(!IS_SHADOW(vp, rp)); 4630 4631 #ifdef DEBUG 4632 name = fn_name(VTOSV(vp)->sv_name); 4633 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4634 "release vnode %s", name)); 4635 kmem_free(name, MAXNAMELEN); 4636 #endif 4637 4638 if (vp->v_type == VREG) { 4639 bool_t recov_failed = FALSE; 4640 4641 e.error = nfs4close_all(vp, cr); 4642 if (e.error) { 4643 /* Check to see if recovery failed */ 4644 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4645 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4646 recov_failed = TRUE; 4647 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4648 if (!recov_failed) { 4649 mutex_enter(&rp->r_statelock); 4650 if (rp->r_flags & R4RECOVERR) 4651 recov_failed = TRUE; 4652 mutex_exit(&rp->r_statelock); 4653 } 4654 if (recov_failed) { 4655 NFS4_DEBUG(nfs4_client_recov_debug, 4656 (CE_NOTE, "nfs4_inactive_otw: " 4657 "close failed (recovery failure)")); 4658 } 4659 } 4660 } 4661 4662 redo: 4663 if (rp->r_unldvp == NULL) { 4664 rp4_addfree(rp, cr); 4665 return; 4666 } 4667 4668 /* 4669 * Save the vnode pointer for the directory where the 4670 * unlinked-open file got renamed, then set it to NULL 4671 * to prevent another thread from getting here before 4672 * we're done with the remove. While we have the 4673 * statelock, make local copies of the pertinent rnode 4674 * fields. If we weren't to do this in an atomic way, the 4675 * the unl* fields could become inconsistent with respect 4676 * to each other due to a race condition between this 4677 * code and nfs_remove(). 
See bug report 1034328. 4678 */ 4679 mutex_enter(&rp->r_statelock); 4680 if (rp->r_unldvp == NULL) { 4681 mutex_exit(&rp->r_statelock); 4682 rp4_addfree(rp, cr); 4683 return; 4684 } 4685 4686 unldvp = rp->r_unldvp; 4687 rp->r_unldvp = NULL; 4688 unlname = rp->r_unlname; 4689 rp->r_unlname = NULL; 4690 unlcred = rp->r_unlcred; 4691 rp->r_unlcred = NULL; 4692 mutex_exit(&rp->r_statelock); 4693 4694 /* 4695 * If there are any dirty pages left, then flush 4696 * them. This is unfortunate because they just 4697 * may get thrown away during the remove operation, 4698 * but we have to do this for correctness. 4699 */ 4700 if (nfs4_has_pages(vp) && 4701 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4702 ASSERT(vp->v_type != VCHR); 4703 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr); 4704 if (e.error) { 4705 mutex_enter(&rp->r_statelock); 4706 if (!rp->r_error) 4707 rp->r_error = e.error; 4708 mutex_exit(&rp->r_statelock); 4709 } 4710 } 4711 4712 recov_state.rs_flags = 0; 4713 recov_state.rs_num_retry_despite_err = 0; 4714 recov_retry_remove: 4715 /* 4716 * Do the remove operation on the renamed file 4717 */ 4718 args.ctag = TAG_INACTIVE; 4719 4720 /* 4721 * Remove ops: putfh dir; remove 4722 */ 4723 args.array_len = 2; 4724 args.array = argop; 4725 4726 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4727 if (e.error) { 4728 kmem_free(unlname, MAXNAMELEN); 4729 crfree(unlcred); 4730 VN_RELE(unldvp); 4731 /* 4732 * Try again; this time around r_unldvp will be NULL, so we'll 4733 * just call rp4_addfree() and return. 4734 */ 4735 goto redo; 4736 } 4737 4738 /* putfh directory */ 4739 argop[0].argop = OP_CPUTFH; 4740 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4741 4742 /* remove */ 4743 argop[1].argop = OP_CREMOVE; 4744 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4745 4746 doqueue = 1; 4747 resp = &res; 4748 4749 #if 0 /* notyet */ 4750 /* 4751 * Can't do this yet. 
We may be being called from 4752 * dnlc_purge_XXX while that routine is holding a 4753 * mutex lock to the nc_rele list. The calls to 4754 * nfs3_cache_wcc_data may result in calls to 4755 * dnlc_purge_XXX. This will result in a deadlock. 4756 */ 4757 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4758 if (e.error) { 4759 PURGE_ATTRCACHE4(unldvp); 4760 resp = NULL; 4761 } else if (res.status) { 4762 e.error = geterrno4(res.status); 4763 PURGE_ATTRCACHE4(unldvp); 4764 /* 4765 * This code is inactive right now 4766 * but if made active there should 4767 * be a nfs4_end_op() call before 4768 * nfs4_purge_stale_fh to avoid start_op() 4769 * deadlock. See BugId: 4948726 4770 */ 4771 nfs4_purge_stale_fh(error, unldvp, cr); 4772 } else { 4773 nfs_resop4 *resop; 4774 REMOVE4res *rm_res; 4775 4776 resop = &res.array[1]; 4777 rm_res = &resop->nfs_resop4_u.opremove; 4778 /* 4779 * Update directory cache attribute, 4780 * readdir and dnlc caches. 4781 */ 4782 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4783 } 4784 #else 4785 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4786 4787 PURGE_ATTRCACHE4(unldvp); 4788 #endif 4789 4790 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4791 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4792 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4793 if (!e.error) 4794 (void) xdr_free(xdr_COMPOUND4res_clnt, 4795 (caddr_t)&res); 4796 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4797 &recov_state, TRUE); 4798 goto recov_retry_remove; 4799 } 4800 } 4801 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4802 4803 /* 4804 * Release stuff held for the remove 4805 */ 4806 VN_RELE(unldvp); 4807 if (!e.error && resp) 4808 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4809 4810 kmem_free(unlname, MAXNAMELEN); 4811 crfree(unlcred); 4812 goto redo; 4813 } 4814 4815 /* 4816 * Remote file system operations having to do with directory manipulation. 
4817 */ 4818 /* ARGSUSED3 */ 4819 static int 4820 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4821 int flags, vnode_t *rdir, cred_t *cr) 4822 { 4823 int error; 4824 vnode_t *vp, *avp = NULL; 4825 rnode4_t *drp; 4826 4827 *vpp = NULL; 4828 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4829 return (EPERM); 4830 /* 4831 * if LOOKUP_XATTR, must replace dvp (object) with 4832 * object's attrdir before continuing with lookup 4833 */ 4834 if (flags & LOOKUP_XATTR) { 4835 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4836 if (error) 4837 return (error); 4838 4839 dvp = avp; 4840 4841 /* 4842 * If lookup is for "", just return dvp now. The attrdir 4843 * has already been activated (from nfs4lookup_xattr), and 4844 * the caller will RELE the original dvp -- not 4845 * the attrdir. So, set vpp and return. 4846 * Currently, when the LOOKUP_XATTR flag is 4847 * passed to VOP_LOOKUP, the name is always empty, and 4848 * shortcircuiting here avoids 3 unneeded lock/unlock 4849 * pairs. 4850 * 4851 * If a non-empty name was provided, then it is the 4852 * attribute name, and it will be looked up below. 4853 */ 4854 if (*nm == '\0') { 4855 *vpp = dvp; 4856 return (0); 4857 } 4858 4859 /* 4860 * The vfs layer never sends a name when asking for the 4861 * attrdir, so we should never get here (unless of course 4862 * name is passed at some time in future -- at which time 4863 * we'll blow up here). 4864 */ 4865 ASSERT(0); 4866 } 4867 4868 drp = VTOR4(dvp); 4869 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4870 return (EINTR); 4871 4872 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4873 nfs_rw_exit(&drp->r_rwlock); 4874 4875 /* 4876 * If vnode is a device, create special vnode. 
4877 */ 4878 if (!error && ISVDEV((*vpp)->v_type)) { 4879 vp = *vpp; 4880 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4881 VN_RELE(vp); 4882 } 4883 4884 return (error); 4885 } 4886 4887 /* ARGSUSED */ 4888 static int 4889 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 4890 { 4891 int error; 4892 rnode4_t *drp; 4893 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 4894 mntinfo4_t *mi; 4895 4896 mi = VTOMI4(dvp); 4897 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR)) 4898 return (EINVAL); 4899 4900 drp = VTOR4(dvp); 4901 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4902 return (EINTR); 4903 4904 mutex_enter(&drp->r_statelock); 4905 /* 4906 * If the server doesn't support xattrs just return EINVAL 4907 */ 4908 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 4909 mutex_exit(&drp->r_statelock); 4910 nfs_rw_exit(&drp->r_rwlock); 4911 return (EINVAL); 4912 } 4913 4914 /* 4915 * If there is a cached xattr directory entry, 4916 * use it as long as the attributes are valid. If the 4917 * attributes are not valid, take the simple approach and 4918 * free the cached value and re-fetch a new value. 4919 * 4920 * We don't negative entry cache for now, if we did we 4921 * would need to check if the file has changed on every 4922 * lookup. But xattrs don't exist very often and failing 4923 * an openattr is not much more expensive than and NVERIFY or GETATTR 4924 * so do an openattr over the wire for now. 
4925 */ 4926 if (drp->r_xattr_dir != NULL) { 4927 if (ATTRCACHE4_VALID(dvp)) { 4928 VN_HOLD(drp->r_xattr_dir); 4929 *vpp = drp->r_xattr_dir; 4930 mutex_exit(&drp->r_statelock); 4931 nfs_rw_exit(&drp->r_rwlock); 4932 return (0); 4933 } 4934 VN_RELE(drp->r_xattr_dir); 4935 drp->r_xattr_dir = NULL; 4936 } 4937 mutex_exit(&drp->r_statelock); 4938 4939 error = nfs4openattr(dvp, vpp, cflag, cr); 4940 4941 nfs_rw_exit(&drp->r_rwlock); 4942 4943 return (error); 4944 } 4945 4946 static int 4947 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 4948 { 4949 int error; 4950 rnode4_t *drp; 4951 4952 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 4953 4954 /* 4955 * If lookup is for "", just return dvp. Don't need 4956 * to send it over the wire, look it up in the dnlc, 4957 * or perform any access checks. 4958 */ 4959 if (*nm == '\0') { 4960 VN_HOLD(dvp); 4961 *vpp = dvp; 4962 return (0); 4963 } 4964 4965 /* 4966 * Can't do lookups in non-directories. 4967 */ 4968 if (dvp->v_type != VDIR) 4969 return (ENOTDIR); 4970 4971 /* 4972 * If lookup is for ".", just return dvp. Don't need 4973 * to send it over the wire or look it up in the dnlc, 4974 * just need to check access. 4975 */ 4976 if (nm[0] == '.' && nm[1] == '\0') { 4977 error = nfs4_access(dvp, VEXEC, 0, cr); 4978 if (error) 4979 return (error); 4980 VN_HOLD(dvp); 4981 *vpp = dvp; 4982 return (0); 4983 } 4984 4985 drp = VTOR4(dvp); 4986 if (!(drp->r_flags & R4LOOKUP)) { 4987 mutex_enter(&drp->r_statelock); 4988 drp->r_flags |= R4LOOKUP; 4989 mutex_exit(&drp->r_statelock); 4990 } 4991 4992 *vpp = NULL; 4993 /* 4994 * Lookup this name in the DNLC. If there is no entry 4995 * lookup over the wire. 4996 */ 4997 if (!skipdnlc) 4998 *vpp = dnlc_lookup(dvp, nm); 4999 if (*vpp == NULL) { 5000 /* 5001 * We need to go over the wire to lookup the name. 
5002 */ 5003 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5004 } 5005 5006 /* 5007 * We hit on the dnlc 5008 */ 5009 if (*vpp != DNLC_NO_VNODE || 5010 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5011 /* 5012 * But our attrs may not be valid. 5013 */ 5014 if (ATTRCACHE4_VALID(dvp)) { 5015 error = nfs4_waitfor_purge_complete(dvp); 5016 if (error) { 5017 VN_RELE(*vpp); 5018 *vpp = NULL; 5019 return (error); 5020 } 5021 5022 /* 5023 * If after the purge completes, check to make sure 5024 * our attrs are still valid. 5025 */ 5026 if (ATTRCACHE4_VALID(dvp)) { 5027 /* 5028 * If we waited for a purge we may have 5029 * lost our vnode so look it up again. 5030 */ 5031 VN_RELE(*vpp); 5032 *vpp = dnlc_lookup(dvp, nm); 5033 if (*vpp == NULL) 5034 return (nfs4lookupnew_otw(dvp, 5035 nm, vpp, cr)); 5036 5037 /* 5038 * The access cache should almost always hit 5039 */ 5040 error = nfs4_access(dvp, VEXEC, 0, cr); 5041 5042 if (error) { 5043 VN_RELE(*vpp); 5044 *vpp = NULL; 5045 return (error); 5046 } 5047 if (*vpp == DNLC_NO_VNODE) { 5048 VN_RELE(*vpp); 5049 *vpp = NULL; 5050 return (ENOENT); 5051 } 5052 return (0); 5053 } 5054 } 5055 } 5056 5057 ASSERT(*vpp != NULL); 5058 5059 /* 5060 * We may have gotten here we have one of the following cases: 5061 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5062 * need to validate them. 5063 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5064 * must validate. 5065 * 5066 * Go to the server and check if the directory has changed, if 5067 * it hasn't we are done and can use the dnlc entry. 5068 */ 5069 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5070 } 5071 5072 /* 5073 * Go to the server and check if the directory has changed, if 5074 * it hasn't we are done and can use the dnlc entry. If it 5075 * has changed we get a new copy of its attributes and check 5076 * the access for VEXEC, then relookup the filename and 5077 * get its filehandle and attributes. 
5078 * 5079 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5080 * if the NVERIFY failed we must 5081 * purge the caches 5082 * cache new attributes (will set r_time_attr_inval) 5083 * cache new access 5084 * recheck VEXEC access 5085 * add name to dnlc, possibly negative 5086 * if LOOKUP succeeded 5087 * cache new attributes 5088 * else 5089 * set a new r_time_attr_inval for dvp 5090 * check to make sure we have access 5091 * 5092 * The vpp returned is the vnode passed in if the directory is valid, 5093 * a new vnode if successful lookup, or NULL on error. 5094 */ 5095 static int 5096 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5097 { 5098 COMPOUND4args_clnt args; 5099 COMPOUND4res_clnt res; 5100 fattr4 *ver_fattr; 5101 fattr4_change dchange; 5102 int32_t *ptr; 5103 int argoplist_size = 7 * sizeof (nfs_argop4); 5104 nfs_argop4 *argop; 5105 int doqueue; 5106 mntinfo4_t *mi; 5107 nfs4_recov_state_t recov_state; 5108 hrtime_t t; 5109 int isdotdot; 5110 vnode_t *nvp; 5111 nfs_fh4 *fhp; 5112 nfs4_sharedfh_t *sfhp; 5113 nfs4_access_type_t cacc; 5114 rnode4_t *nrp; 5115 rnode4_t *drp = VTOR4(dvp); 5116 nfs4_ga_res_t *garp = NULL; 5117 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5118 5119 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5120 ASSERT(nm != NULL); 5121 ASSERT(nm[0] != '\0'); 5122 ASSERT(dvp->v_type == VDIR); 5123 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5124 ASSERT(*vpp != NULL); 5125 5126 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5127 isdotdot = 1; 5128 args.ctag = TAG_LOOKUP_VPARENT; 5129 } else { 5130 /* 5131 * Do not allow crossing of server mount points. The 5132 * only visible entries in a SRVSTUB dir are . and .. 5133 * This code handles the non-.. case. We can't even get 5134 * this far if looking up ".". 
5135 */ 5136 if (VTOR4(dvp)->r_flags & R4SRVSTUB) { 5137 VN_RELE(*vpp); 5138 *vpp = NULL; 5139 return (ENOENT); 5140 } 5141 isdotdot = 0; 5142 args.ctag = TAG_LOOKUP_VALID; 5143 } 5144 5145 mi = VTOMI4(dvp); 5146 recov_state.rs_flags = 0; 5147 recov_state.rs_num_retry_despite_err = 0; 5148 5149 nvp = NULL; 5150 5151 /* Save the original mount point security information */ 5152 (void) save_mnt_secinfo(mi->mi_curr_serv); 5153 5154 recov_retry: 5155 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5156 &recov_state, NULL); 5157 if (e.error) { 5158 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5159 VN_RELE(*vpp); 5160 *vpp = NULL; 5161 return (e.error); 5162 } 5163 5164 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5165 5166 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5167 args.array_len = 7; 5168 args.array = argop; 5169 5170 /* 0. putfh file */ 5171 argop[0].argop = OP_CPUTFH; 5172 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5173 5174 /* 1. nverify the change info */ 5175 argop[1].argop = OP_NVERIFY; 5176 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5177 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5178 ver_fattr->attrlist4 = (char *)&dchange; 5179 ptr = (int32_t *)&dchange; 5180 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5181 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5182 5183 /* 2. getattr directory */ 5184 argop[2].argop = OP_GETATTR; 5185 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5186 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5187 5188 /* 3. access directory */ 5189 argop[3].argop = OP_ACCESS; 5190 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5191 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5192 5193 /* 4. lookup name */ 5194 if (isdotdot) { 5195 argop[4].argop = OP_LOOKUPP; 5196 } else { 5197 argop[4].argop = OP_CLOOKUP; 5198 argop[4].nfs_argop4_u.opclookup.cname = nm; 5199 } 5200 5201 /* 5. 
resulting file handle */ 5202 argop[5].argop = OP_GETFH; 5203 5204 /* 6. resulting file attributes */ 5205 argop[6].argop = OP_GETATTR; 5206 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5207 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5208 5209 doqueue = 1; 5210 t = gethrtime(); 5211 5212 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5213 5214 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5215 /* 5216 * For WRONGSEC of a non-dotdot case, send secinfo directly 5217 * from this thread, do not go thru the recovery thread since 5218 * we need the nm information. 5219 * 5220 * Not doing dotdot case because there is no specification 5221 * for (PUTFH, SECINFO "..") yet. 5222 */ 5223 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5224 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5225 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5226 &recov_state, FALSE); 5227 } else { 5228 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5229 &recov_state, TRUE); 5230 } 5231 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5232 kmem_free(argop, argoplist_size); 5233 if (!e.error) 5234 goto recov_retry; 5235 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5236 VN_RELE(*vpp); 5237 *vpp = NULL; 5238 return (e.error); 5239 } 5240 5241 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5242 OP_LOOKUP, NULL) == FALSE) { 5243 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5244 &recov_state, TRUE); 5245 5246 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5247 kmem_free(argop, argoplist_size); 5248 goto recov_retry; 5249 } 5250 } 5251 5252 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5253 5254 if (e.error || res.array_len == 0) { 5255 /* 5256 * If e.error isn't set, then reply has no ops (or we couldn't 5257 * be here). The only legal way to reply without an op array 5258 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5259 * be in the reply for all other status values. 
5260 * 5261 * For valid replies without an ops array, return ENOTSUP 5262 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5263 * return EIO -- don't trust status. 5264 */ 5265 if (e.error == 0) 5266 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5267 ENOTSUP : EIO; 5268 VN_RELE(*vpp); 5269 *vpp = NULL; 5270 kmem_free(argop, argoplist_size); 5271 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5272 return (e.error); 5273 } 5274 5275 if (res.status != NFS4ERR_SAME) { 5276 e.error = geterrno4(res.status); 5277 5278 /* 5279 * The NVERIFY "failed" so the directory has changed 5280 * First make sure PUTFH succeeded and NVERIFY "failed" 5281 * cleanly. 5282 */ 5283 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5284 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5285 nfs4_purge_stale_fh(e.error, dvp, cr); 5286 VN_RELE(*vpp); 5287 *vpp = NULL; 5288 goto exit; 5289 } 5290 5291 /* 5292 * We know the NVERIFY "failed" so we must: 5293 * purge the caches (access and indirectly dnlc if needed) 5294 */ 5295 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5296 5297 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5298 nfs4_purge_stale_fh(e.error, dvp, cr); 5299 VN_RELE(*vpp); 5300 *vpp = NULL; 5301 goto exit; 5302 } 5303 5304 /* 5305 * Install new cached attributes for the directory 5306 */ 5307 nfs4_attr_cache(dvp, 5308 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5309 t, cr, FALSE, NULL); 5310 5311 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5312 nfs4_purge_stale_fh(e.error, dvp, cr); 5313 VN_RELE(*vpp); 5314 *vpp = NULL; 5315 e.error = geterrno4(res.status); 5316 goto exit; 5317 } 5318 5319 /* 5320 * Now we know the directory is valid, 5321 * cache new directory access 5322 */ 5323 nfs4_access_cache(drp, 5324 args.array[3].nfs_argop4_u.opaccess.access, 5325 res.array[3].nfs_resop4_u.opaccess.access, cr); 5326 5327 /* 5328 * recheck VEXEC access 5329 */ 5330 cacc = nfs4_access_check(drp, 
ACCESS4_LOOKUP, cr); 5331 if (cacc != NFS4_ACCESS_ALLOWED) { 5332 /* 5333 * Directory permissions might have been revoked 5334 */ 5335 if (cacc == NFS4_ACCESS_DENIED) { 5336 e.error = EACCES; 5337 VN_RELE(*vpp); 5338 *vpp = NULL; 5339 goto exit; 5340 } 5341 5342 /* 5343 * Somehow we must not have asked for enough 5344 * so try a singleton ACCESS, should never happen. 5345 */ 5346 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5347 if (e.error) { 5348 VN_RELE(*vpp); 5349 *vpp = NULL; 5350 goto exit; 5351 } 5352 } 5353 5354 e.error = geterrno4(res.status); 5355 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5356 /* 5357 * The lookup failed, probably no entry 5358 */ 5359 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5360 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5361 } else { 5362 /* 5363 * Might be some other error, so remove 5364 * the dnlc entry to make sure we start all 5365 * over again, next time. 5366 */ 5367 dnlc_remove(dvp, nm); 5368 } 5369 VN_RELE(*vpp); 5370 *vpp = NULL; 5371 goto exit; 5372 } 5373 5374 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5375 /* 5376 * The file exists but we can't get its fh for 5377 * some unknown reason. Remove it from the dnlc 5378 * and error out to be safe. 5379 */ 5380 dnlc_remove(dvp, nm); 5381 VN_RELE(*vpp); 5382 *vpp = NULL; 5383 goto exit; 5384 } 5385 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5386 if (fhp->nfs_fh4_len == 0) { 5387 /* 5388 * The file exists but a bogus fh 5389 * some unknown reason. Remove it from the dnlc 5390 * and error out to be safe. 
5391 */ 5392 e.error = ENOENT; 5393 dnlc_remove(dvp, nm); 5394 VN_RELE(*vpp); 5395 *vpp = NULL; 5396 goto exit; 5397 } 5398 sfhp = sfh4_get(fhp, mi); 5399 5400 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5401 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5402 5403 /* 5404 * Make the new rnode 5405 */ 5406 if (isdotdot) { 5407 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5408 if (e.error) { 5409 sfh4_rele(&sfhp); 5410 VN_RELE(*vpp); 5411 *vpp = NULL; 5412 goto exit; 5413 } 5414 /* 5415 * XXX if nfs4_make_dotdot uses an existing rnode 5416 * XXX it doesn't update the attributes. 5417 * XXX for now just save them again to save an OTW 5418 */ 5419 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5420 } else { 5421 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5422 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5423 /* 5424 * If v_type == VNON, then garp was NULL because 5425 * the last op in the compound failed and makenfs4node 5426 * could not find the vnode for sfhp. It created 5427 * a new vnode, so we have nothing to purge here. 5428 */ 5429 if (nvp->v_type == VNON) { 5430 vattr_t vattr; 5431 5432 vattr.va_mask = AT_TYPE; 5433 /* 5434 * N.B. We've already called nfs4_end_fop above. 
5435 */ 5436 e.error = nfs4getattr(nvp, &vattr, cr); 5437 if (e.error) { 5438 sfh4_rele(&sfhp); 5439 VN_RELE(*vpp); 5440 *vpp = NULL; 5441 VN_RELE(nvp); 5442 goto exit; 5443 } 5444 nvp->v_type = vattr.va_type; 5445 } 5446 } 5447 sfh4_rele(&sfhp); 5448 5449 nrp = VTOR4(nvp); 5450 mutex_enter(&nrp->r_statev4_lock); 5451 if (!nrp->created_v4) { 5452 mutex_exit(&nrp->r_statev4_lock); 5453 dnlc_update(dvp, nm, nvp); 5454 } else 5455 mutex_exit(&nrp->r_statev4_lock); 5456 5457 VN_RELE(*vpp); 5458 *vpp = nvp; 5459 } else { 5460 hrtime_t now; 5461 hrtime_t delta = 0; 5462 5463 e.error = 0; 5464 5465 /* 5466 * Because the NVERIFY "succeeded" we know that the 5467 * directory attributes are still valid 5468 * so update r_time_attr_inval 5469 */ 5470 now = gethrtime(); 5471 mutex_enter(&drp->r_statelock); 5472 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5473 delta = now - drp->r_time_attr_saved; 5474 if (delta < mi->mi_acdirmin) 5475 delta = mi->mi_acdirmin; 5476 else if (delta > mi->mi_acdirmax) 5477 delta = mi->mi_acdirmax; 5478 } 5479 drp->r_time_attr_inval = now + delta; 5480 mutex_exit(&drp->r_statelock); 5481 dnlc_update(dvp, nm, *vpp); 5482 5483 /* 5484 * Even though we have a valid directory attr cache 5485 * and dnlc entry, we may not have access. 5486 * This should almost always hit the cache. 
5487 */ 5488 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5489 if (e.error) { 5490 VN_RELE(*vpp); 5491 *vpp = NULL; 5492 } 5493 5494 if (*vpp == DNLC_NO_VNODE) { 5495 VN_RELE(*vpp); 5496 *vpp = NULL; 5497 e.error = ENOENT; 5498 } 5499 } 5500 5501 exit: 5502 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5503 kmem_free(argop, argoplist_size); 5504 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5505 return (e.error); 5506 } 5507 5508 /* 5509 * We need to go over the wire to lookup the name, but 5510 * while we are there verify the directory has not 5511 * changed but if it has, get new attributes and check access 5512 * 5513 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5514 * NVERIFY GETATTR ACCESS 5515 * 5516 * With the results: 5517 * if the NVERIFY failed we must purge the caches, add new attributes, 5518 * and cache new access. 5519 * set a new r_time_attr_inval 5520 * add name to dnlc, possibly negative 5521 * if LOOKUP succeeded 5522 * cache new attributes 5523 */ 5524 static int 5525 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5526 { 5527 COMPOUND4args_clnt args; 5528 COMPOUND4res_clnt res; 5529 fattr4 *ver_fattr; 5530 fattr4_change dchange; 5531 int32_t *ptr; 5532 nfs4_ga_res_t *garp = NULL; 5533 int argoplist_size = 9 * sizeof (nfs_argop4); 5534 nfs_argop4 *argop; 5535 int doqueue; 5536 mntinfo4_t *mi; 5537 nfs4_recov_state_t recov_state; 5538 hrtime_t t; 5539 int isdotdot; 5540 vnode_t *nvp; 5541 nfs_fh4 *fhp; 5542 nfs4_sharedfh_t *sfhp; 5543 nfs4_access_type_t cacc; 5544 rnode4_t *nrp; 5545 rnode4_t *drp = VTOR4(dvp); 5546 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5547 5548 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5549 ASSERT(nm != NULL); 5550 ASSERT(nm[0] != '\0'); 5551 ASSERT(dvp->v_type == VDIR); 5552 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5553 ASSERT(*vpp == NULL); 5554 5555 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5556 isdotdot = 1; 5557 args.ctag = TAG_LOOKUP_PARENT; 5558 } else { 5559 /* 5560 * Do not allow crossing of server mount points. The 5561 * only visible entries in a SRVSTUB dir are . and .. 5562 * This code handles the non-.. case. We can't even get 5563 * this far if looking up ".". 5564 */ 5565 if (VTOR4(dvp)->r_flags & R4SRVSTUB) 5566 return (ENOENT); 5567 5568 isdotdot = 0; 5569 args.ctag = TAG_LOOKUP; 5570 } 5571 5572 mi = VTOMI4(dvp); 5573 recov_state.rs_flags = 0; 5574 recov_state.rs_num_retry_despite_err = 0; 5575 5576 nvp = NULL; 5577 5578 /* Save the original mount point security information */ 5579 (void) save_mnt_secinfo(mi->mi_curr_serv); 5580 5581 recov_retry: 5582 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5583 &recov_state, NULL); 5584 if (e.error) { 5585 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5586 return (e.error); 5587 } 5588 5589 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5590 5591 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5592 args.array_len = 9; 5593 args.array = argop; 5594 5595 /* 0. putfh file */ 5596 argop[0].argop = OP_CPUTFH; 5597 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5598 5599 /* 1. savefh for the nverify */ 5600 argop[1].argop = OP_SAVEFH; 5601 5602 /* 2. lookup name */ 5603 if (isdotdot) { 5604 argop[2].argop = OP_LOOKUPP; 5605 } else { 5606 argop[2].argop = OP_CLOOKUP; 5607 argop[2].nfs_argop4_u.opclookup.cname = nm; 5608 } 5609 5610 /* 3. resulting file handle */ 5611 argop[3].argop = OP_GETFH; 5612 5613 /* 4. resulting file attributes */ 5614 argop[4].argop = OP_GETATTR; 5615 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5616 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5617 5618 /* 5. restorefh back the directory for the nverify */ 5619 argop[5].argop = OP_RESTOREFH; 5620 5621 /* 6. 
nverify the change info */ 5622 argop[6].argop = OP_NVERIFY; 5623 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5624 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5625 ver_fattr->attrlist4 = (char *)&dchange; 5626 ptr = (int32_t *)&dchange; 5627 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5628 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5629 5630 /* 7. getattr directory */ 5631 argop[7].argop = OP_GETATTR; 5632 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5633 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5634 5635 /* 8. access directory */ 5636 argop[8].argop = OP_ACCESS; 5637 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5638 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5639 5640 doqueue = 1; 5641 t = gethrtime(); 5642 5643 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5644 5645 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5646 /* 5647 * For WRONGSEC of a non-dotdot case, send secinfo directly 5648 * from this thread, do not go thru the recovery thread since 5649 * we need the nm information. 5650 * 5651 * Not doing dotdot case because there is no specification 5652 * for (PUTFH, SECINFO "..") yet. 
5653 */ 5654 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5655 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5656 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5657 &recov_state, FALSE); 5658 } else { 5659 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5660 &recov_state, TRUE); 5661 } 5662 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5663 kmem_free(argop, argoplist_size); 5664 if (!e.error) 5665 goto recov_retry; 5666 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5667 return (e.error); 5668 } 5669 5670 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5671 OP_LOOKUP, NULL) == FALSE) { 5672 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5673 &recov_state, TRUE); 5674 5675 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5676 kmem_free(argop, argoplist_size); 5677 goto recov_retry; 5678 } 5679 } 5680 5681 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5682 5683 if (e.error || res.array_len == 0) { 5684 /* 5685 * If e.error isn't set, then reply has no ops (or we couldn't 5686 * be here). The only legal way to reply without an op array 5687 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5688 * be in the reply for all other status values. 5689 * 5690 * For valid replies without an ops array, return ENOTSUP 5691 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5692 * return EIO -- don't trust status. 5693 */ 5694 if (e.error == 0) 5695 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5696 ENOTSUP : EIO; 5697 5698 kmem_free(argop, argoplist_size); 5699 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5700 return (e.error); 5701 } 5702 5703 e.error = geterrno4(res.status); 5704 5705 /* 5706 * The PUTFH and SAVEFH may have failed. 
5707 */ 5708 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5709 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5710 nfs4_purge_stale_fh(e.error, dvp, cr); 5711 goto exit; 5712 } 5713 5714 /* 5715 * Check if the file exists, if it does delay entering 5716 * into the dnlc until after we update the directory 5717 * attributes so we don't cause it to get purged immediately. 5718 */ 5719 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5720 /* 5721 * The lookup failed, probably no entry 5722 */ 5723 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5724 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5725 } 5726 goto exit; 5727 } 5728 5729 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5730 /* 5731 * The file exists but we can't get its fh for 5732 * some unknown reason. Error out to be safe. 5733 */ 5734 goto exit; 5735 } 5736 5737 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5738 if (fhp->nfs_fh4_len == 0) { 5739 /* 5740 * The file exists but a bogus fh 5741 * some unknown reason. Error out to be safe. 5742 */ 5743 e.error = EIO; 5744 goto exit; 5745 } 5746 sfhp = sfh4_get(fhp, mi); 5747 5748 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5749 sfh4_rele(&sfhp); 5750 e.error = EIO; 5751 goto exit; 5752 } 5753 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5754 5755 /* 5756 * The RESTOREFH may have failed 5757 */ 5758 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5759 sfh4_rele(&sfhp); 5760 e.error = EIO; 5761 goto exit; 5762 } 5763 5764 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5765 /* 5766 * First make sure the NVERIFY failed as we expected, 5767 * if it didn't then be conservative and error out 5768 * as we can't trust the directory. 
5769 */ 5770 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5771 sfh4_rele(&sfhp); 5772 e.error = EIO; 5773 goto exit; 5774 } 5775 5776 /* 5777 * We know the NVERIFY "failed" so the directory has changed, 5778 * so we must: 5779 * purge the caches (access and indirectly dnlc if needed) 5780 */ 5781 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5782 5783 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5784 sfh4_rele(&sfhp); 5785 goto exit; 5786 } 5787 nfs4_attr_cache(dvp, 5788 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5789 t, cr, FALSE, NULL); 5790 5791 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5792 nfs4_purge_stale_fh(e.error, dvp, cr); 5793 sfh4_rele(&sfhp); 5794 e.error = geterrno4(res.status); 5795 goto exit; 5796 } 5797 5798 /* 5799 * Now we know the directory is valid, 5800 * cache new directory access 5801 */ 5802 nfs4_access_cache(drp, 5803 args.array[8].nfs_argop4_u.opaccess.access, 5804 res.array[8].nfs_resop4_u.opaccess.access, cr); 5805 5806 /* 5807 * recheck VEXEC access 5808 */ 5809 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5810 if (cacc != NFS4_ACCESS_ALLOWED) { 5811 /* 5812 * Directory permissions might have been revoked 5813 */ 5814 if (cacc == NFS4_ACCESS_DENIED) { 5815 sfh4_rele(&sfhp); 5816 e.error = EACCES; 5817 goto exit; 5818 } 5819 5820 /* 5821 * Somehow we must not have asked for enough 5822 * so try a singleton ACCESS should never happen 5823 */ 5824 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5825 if (e.error) { 5826 sfh4_rele(&sfhp); 5827 goto exit; 5828 } 5829 } 5830 5831 e.error = geterrno4(res.status); 5832 } else { 5833 hrtime_t now; 5834 hrtime_t delta = 0; 5835 5836 e.error = 0; 5837 5838 /* 5839 * Because the NVERIFY "succeeded" we know that the 5840 * directory attributes are still valid 5841 * so update r_time_attr_inval 5842 */ 5843 now = gethrtime(); 5844 mutex_enter(&drp->r_statelock); 5845 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5846 delta = 
now - drp->r_time_attr_saved; 5847 if (delta < mi->mi_acdirmin) 5848 delta = mi->mi_acdirmin; 5849 else if (delta > mi->mi_acdirmax) 5850 delta = mi->mi_acdirmax; 5851 } 5852 drp->r_time_attr_inval = now + delta; 5853 mutex_exit(&drp->r_statelock); 5854 5855 /* 5856 * Even though we have a valid directory attr cache, 5857 * we may not have access. 5858 * This should almost always hit the cache. 5859 */ 5860 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5861 if (e.error) { 5862 sfh4_rele(&sfhp); 5863 goto exit; 5864 } 5865 } 5866 5867 /* 5868 * Now we have successfully completed the lookup, if the 5869 * directory has changed we now have the valid attributes. 5870 * We also know we have directory access. 5871 * Create the new rnode and insert it in the dnlc. 5872 */ 5873 if (isdotdot) { 5874 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5875 if (e.error) { 5876 sfh4_rele(&sfhp); 5877 goto exit; 5878 } 5879 /* 5880 * XXX if nfs4_make_dotdot uses an existing rnode 5881 * XXX it doesn't update the attributes. 
5882 * XXX for now just save them again to save an OTW 5883 */ 5884 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5885 } else { 5886 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5887 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5888 } 5889 sfh4_rele(&sfhp); 5890 5891 nrp = VTOR4(nvp); 5892 mutex_enter(&nrp->r_statev4_lock); 5893 if (!nrp->created_v4) { 5894 mutex_exit(&nrp->r_statev4_lock); 5895 dnlc_update(dvp, nm, nvp); 5896 } else 5897 mutex_exit(&nrp->r_statev4_lock); 5898 5899 *vpp = nvp; 5900 5901 exit: 5902 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5903 kmem_free(argop, argoplist_size); 5904 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5905 return (e.error); 5906 } 5907 5908 #ifdef DEBUG 5909 void 5910 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 5911 { 5912 uint_t i, len; 5913 zoneid_t zoneid = getzoneid(); 5914 char *s; 5915 5916 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 5917 for (i = 0; i < argcnt; i++) { 5918 nfs_argop4 *op = &argbase[i]; 5919 switch (op->argop) { 5920 case OP_CPUTFH: 5921 case OP_PUTFH: 5922 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 5923 break; 5924 case OP_PUTROOTFH: 5925 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 5926 break; 5927 case OP_CLOOKUP: 5928 s = op->nfs_argop4_u.opclookup.cname; 5929 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5930 break; 5931 case OP_LOOKUP: 5932 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 5933 &len, NULL); 5934 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5935 kmem_free(s, len); 5936 break; 5937 case OP_LOOKUPP: 5938 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 5939 break; 5940 case OP_GETFH: 5941 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 5942 break; 5943 case OP_GETATTR: 5944 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 5945 break; 5946 case OP_OPENATTR: 5947 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 5948 break; 5949 default: 5950 zcmn_err(zoneid, CE_NOTE, "\t op %d, 
opcode %d", i, 5951 op->argop); 5952 break; 5953 } 5954 } 5955 } 5956 #endif 5957 5958 /* 5959 * nfs4lookup_setup - constructs a multi-lookup compound request. 5960 * 5961 * Given the path "nm1/nm2/.../nmn", the following compound requests 5962 * may be created: 5963 * 5964 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 5965 * is faster, for now. 5966 * 5967 * l4_getattrs indicates the type of compound requested. 5968 * 5969 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 5970 * 5971 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 5972 * 5973 * total number of ops is n + 1. 5974 * 5975 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 5976 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 5977 * before the last component, and only get attributes 5978 * for the last component. Note that the second-to-last 5979 * pathname component is XATTR_RPATH, which does NOT go 5980 * over-the-wire as a lookup. 5981 * 5982 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 5983 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 5984 * 5985 * and total number of ops is n + 5. 5986 * 5987 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 5988 * attribute directory: create lookups plus an OPENATTR 5989 * replacing the last lookup. Note that the last pathname 5990 * component is XATTR_RPATH, which does NOT go over-the-wire 5991 * as a lookup. 5992 * 5993 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 5994 * Openattr; Getfh; Getattr } 5995 * 5996 * and total number of ops is n + 5. 5997 * 5998 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 5999 * nodes too. 6000 * 6001 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6002 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6003 * 6004 * and total number of ops is 3*n + 1. 
6005 * 6006 * All cases: returns the index in the arg array of the final LOOKUP op, or 6007 * -1 if no LOOKUPs were used. 6008 */ 6009 int 6010 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6011 { 6012 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6013 nfs_argop4 *argbase, *argop; 6014 int arglen, argcnt; 6015 int n = 1; /* number of components */ 6016 int nga = 1; /* number of Getattr's in request */ 6017 char c = '\0', *s, *p; 6018 int lookup_idx = -1; 6019 int argoplist_size; 6020 6021 /* set lookuparg response result to 0 */ 6022 lookupargp->resp->status = NFS4_OK; 6023 6024 /* skip leading "/" or "." e.g. ".//./" if there is */ 6025 for (; ; nm++) { 6026 if (*nm != '/' && *nm != '.') 6027 break; 6028 6029 /* ".." is counted as 1 component */ 6030 if (*nm == '.' && *(nm + 1) == '.') 6031 break; 6032 } 6033 6034 /* 6035 * Find n = number of components - nm must be null terminated 6036 * Skip "." components. 6037 */ 6038 if (*nm != '\0') { 6039 for (n = 1, s = nm; *s != '\0'; s++) { 6040 if ((*s == '/') && (*(s + 1) != '/') && 6041 (*(s + 1) != '\0') && 6042 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6043 *(s + 2) == '\0'))) 6044 n++; 6045 } 6046 } else 6047 n = 0; 6048 6049 /* 6050 * nga is number of components that need Getfh+Getattr 6051 */ 6052 switch (l4_getattrs) { 6053 case LKP4_NO_ATTRIBUTES: 6054 nga = 0; 6055 break; 6056 case LKP4_ALL_ATTRIBUTES: 6057 nga = n; 6058 /* 6059 * Always have at least 1 getfh, getattr pair 6060 */ 6061 if (nga == 0) 6062 nga++; 6063 break; 6064 case LKP4_LAST_ATTRDIR: 6065 case LKP4_LAST_NAMED_ATTR: 6066 nga = n+1; 6067 break; 6068 } 6069 6070 /* 6071 * If change to use the filehandle attr instead of getfh 6072 * the following line can be deleted. 
6073 */ 6074 nga *= 2; 6075 6076 /* 6077 * calculate number of ops in request as 6078 * header + trailer + lookups + getattrs 6079 */ 6080 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6081 6082 argoplist_size = arglen * sizeof (nfs_argop4); 6083 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6084 lookupargp->argsp->array = argop; 6085 6086 argcnt = lookupargp->header_len; 6087 argop += argcnt; 6088 6089 /* 6090 * loop and create a lookup op and possibly getattr/getfh for 6091 * each component. Skip "." components. 6092 */ 6093 for (s = nm; *s != '\0'; s = p) { 6094 /* 6095 * Set up a pathname struct for each component if needed 6096 */ 6097 while (*s == '/') 6098 s++; 6099 if (*s == '\0') 6100 break; 6101 for (p = s; (*p != '/') && (*p != '\0'); p++); 6102 c = *p; 6103 *p = '\0'; 6104 6105 if (s[0] == '.' && s[1] == '\0') { 6106 *p = c; 6107 continue; 6108 } 6109 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6110 strcmp(s, XATTR_RPATH) == 0) { 6111 /* getfh XXX may not be needed in future */ 6112 argop->argop = OP_GETFH; 6113 argop++; 6114 argcnt++; 6115 6116 /* getattr */ 6117 argop->argop = OP_GETATTR; 6118 argop->nfs_argop4_u.opgetattr.attr_request = 6119 lookupargp->ga_bits; 6120 argop->nfs_argop4_u.opgetattr.mi = 6121 lookupargp->mi; 6122 argop++; 6123 argcnt++; 6124 6125 /* openattr */ 6126 argop->argop = OP_OPENATTR; 6127 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6128 strcmp(s, XATTR_RPATH) == 0) { 6129 /* openattr */ 6130 argop->argop = OP_OPENATTR; 6131 argop++; 6132 argcnt++; 6133 6134 /* getfh XXX may not be needed in future */ 6135 argop->argop = OP_GETFH; 6136 argop++; 6137 argcnt++; 6138 6139 /* getattr */ 6140 argop->argop = OP_GETATTR; 6141 argop->nfs_argop4_u.opgetattr.attr_request = 6142 lookupargp->ga_bits; 6143 argop->nfs_argop4_u.opgetattr.mi = 6144 lookupargp->mi; 6145 argop++; 6146 argcnt++; 6147 *p = c; 6148 continue; 6149 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6150 /* lookupp */ 6151 argop->argop = OP_LOOKUPP; 6152 } else { 6153 /* lookup */ 6154 argop->argop = OP_LOOKUP; 6155 (void) str_to_utf8(s, 6156 &argop->nfs_argop4_u.oplookup.objname); 6157 } 6158 lookup_idx = argcnt; 6159 argop++; 6160 argcnt++; 6161 6162 *p = c; 6163 6164 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6165 /* getfh XXX may not be needed in future */ 6166 argop->argop = OP_GETFH; 6167 argop++; 6168 argcnt++; 6169 6170 /* getattr */ 6171 argop->argop = OP_GETATTR; 6172 argop->nfs_argop4_u.opgetattr.attr_request = 6173 lookupargp->ga_bits; 6174 argop->nfs_argop4_u.opgetattr.mi = 6175 lookupargp->mi; 6176 argop++; 6177 argcnt++; 6178 } 6179 } 6180 6181 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6182 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6183 if (needgetfh) { 6184 /* stick in a post-lookup getfh */ 6185 argop->argop = OP_GETFH; 6186 argcnt++; 6187 argop++; 6188 } 6189 /* post-lookup getattr */ 6190 argop->argop = OP_GETATTR; 6191 argop->nfs_argop4_u.opgetattr.attr_request = 6192 lookupargp->ga_bits; 6193 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6194 argcnt++; 6195 } 6196 argcnt += lookupargp->trailer_len; /* actual op count */ 6197 lookupargp->argsp->array_len = argcnt; 6198 lookupargp->arglen = arglen; 6199 6200 #ifdef DEBUG 6201 if (nfs4_client_lookup_debug) 6202 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6203 #endif 6204 6205 return (lookup_idx); 6206 } 6207 6208 static int 6209 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6210 { 6211 COMPOUND4args_clnt args; 6212 COMPOUND4res_clnt res; 6213 GETFH4res *gf_res = NULL; 6214 nfs_argop4 argop[4]; 6215 nfs_resop4 *resop = NULL; 6216 nfs4_sharedfh_t *sfhp; 6217 hrtime_t t; 6218 nfs4_error_t e; 6219 6220 rnode4_t *drp; 6221 int doqueue = 1; 6222 vnode_t *vp; 6223 int needrecov = 0; 6224 nfs4_recov_state_t recov_state; 6225 6226 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6227 6228 *avp = NULL; 6229 
recov_state.rs_flags = 0; 6230 recov_state.rs_num_retry_despite_err = 0; 6231 6232 recov_retry: 6233 /* COMPOUND: putfh, openattr, getfh, getattr */ 6234 args.array_len = 4; 6235 args.array = argop; 6236 args.ctag = TAG_OPENATTR; 6237 6238 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6239 if (e.error) 6240 return (e.error); 6241 6242 drp = VTOR4(dvp); 6243 6244 /* putfh */ 6245 argop[0].argop = OP_CPUTFH; 6246 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6247 6248 /* openattr */ 6249 argop[1].argop = OP_OPENATTR; 6250 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6251 6252 /* getfh */ 6253 argop[2].argop = OP_GETFH; 6254 6255 /* getattr */ 6256 argop[3].argop = OP_GETATTR; 6257 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6258 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6259 6260 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6261 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6262 rnode4info(drp))); 6263 6264 t = gethrtime(); 6265 6266 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6267 6268 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6269 if (needrecov) { 6270 bool_t abort; 6271 6272 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6273 "nfs4openattr: initiating recovery\n")); 6274 6275 abort = nfs4_start_recovery(&e, 6276 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6277 OP_OPENATTR, NULL); 6278 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6279 if (!e.error) { 6280 e.error = geterrno4(res.status); 6281 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6282 } 6283 if (abort == FALSE) 6284 goto recov_retry; 6285 return (e.error); 6286 } 6287 6288 if (e.error) { 6289 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6290 return (e.error); 6291 } 6292 6293 if (res.status) { 6294 /* 6295 * If OTW errro is NOTSUPP, then it should be 6296 * translated to EINVAL. 
All Solaris file system 6297 * implementations return EINVAL to the syscall layer 6298 * when the attrdir cannot be created due to an 6299 * implementation restriction or noxattr mount option. 6300 */ 6301 if (res.status == NFS4ERR_NOTSUPP) { 6302 mutex_enter(&drp->r_statelock); 6303 if (drp->r_xattr_dir) 6304 VN_RELE(drp->r_xattr_dir); 6305 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6306 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6307 mutex_exit(&drp->r_statelock); 6308 6309 e.error = EINVAL; 6310 } else { 6311 e.error = geterrno4(res.status); 6312 } 6313 6314 if (e.error) { 6315 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6316 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6317 needrecov); 6318 return (e.error); 6319 } 6320 } 6321 6322 resop = &res.array[0]; /* putfh res */ 6323 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6324 6325 resop = &res.array[1]; /* openattr res */ 6326 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6327 6328 resop = &res.array[2]; /* getfh res */ 6329 gf_res = &resop->nfs_resop4_u.opgetfh; 6330 if (gf_res->object.nfs_fh4_len == 0) { 6331 *avp = NULL; 6332 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6333 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6334 return (ENOENT); 6335 } 6336 6337 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6338 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6339 dvp->v_vfsp, t, cr, dvp, 6340 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH)); 6341 sfh4_rele(&sfhp); 6342 6343 if (e.error) 6344 PURGE_ATTRCACHE4(vp); 6345 6346 mutex_enter(&vp->v_lock); 6347 vp->v_flag |= V_XATTRDIR; 6348 mutex_exit(&vp->v_lock); 6349 6350 *avp = vp; 6351 6352 mutex_enter(&drp->r_statelock); 6353 if (drp->r_xattr_dir) 6354 VN_RELE(drp->r_xattr_dir); 6355 VN_HOLD(vp); 6356 drp->r_xattr_dir = vp; 6357 6358 /* 6359 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6360 * NULL. 
xattrs could be created at any time, and we have no 6361 * way to update pc4_xattr_exists in the base object if/when 6362 * it happens. 6363 */ 6364 drp->r_pathconf.pc4_xattr_valid = 0; 6365 6366 mutex_exit(&drp->r_statelock); 6367 6368 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6369 6370 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6371 6372 return (0); 6373 } 6374 6375 /* ARGSUSED */ 6376 static int 6377 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6378 int mode, vnode_t **vpp, cred_t *cr, int flags) 6379 { 6380 int error; 6381 vnode_t *vp = NULL; 6382 rnode4_t *rp; 6383 struct vattr vattr; 6384 rnode4_t *drp; 6385 vnode_t *tempvp; 6386 enum createmode4 createmode; 6387 bool_t must_trunc = FALSE; 6388 6389 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6390 return (EPERM); 6391 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6392 return (EINVAL); 6393 } 6394 6395 /* . and .. have special meaning in the protocol, reject them. */ 6396 6397 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6398 return (EISDIR); 6399 6400 drp = VTOR4(dvp); 6401 6402 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6403 return (EINTR); 6404 6405 top: 6406 /* 6407 * We make a copy of the attributes because the caller does not 6408 * expect us to change what va points to. 6409 */ 6410 vattr = *va; 6411 6412 /* 6413 * If the pathname is "", then dvp is the root vnode of 6414 * a remote file mounted over a local directory. 6415 * All that needs to be done is access 6416 * checking and truncation. Note that we avoid doing 6417 * open w/ create because the parent directory might 6418 * be in pseudo-fs and the open would fail. 6419 */ 6420 if (*nm == '\0') { 6421 error = 0; 6422 VN_HOLD(dvp); 6423 vp = dvp; 6424 must_trunc = TRUE; 6425 } else { 6426 /* 6427 * We need to go over the wire, just to be sure whether the 6428 * file exists or not. 
Using the DNLC can be dangerous in 6429 * this case when making a decision regarding existence. 6430 */ 6431 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6432 } 6433 6434 if (exclusive) 6435 createmode = EXCLUSIVE4; 6436 else 6437 createmode = GUARDED4; 6438 6439 /* 6440 * error would be set if the file does not exist on the 6441 * server, so lets go create it. 6442 */ 6443 if (error) { 6444 goto create_otw; 6445 } 6446 6447 /* 6448 * File does exist on the server 6449 */ 6450 if (exclusive == EXCL) 6451 error = EEXIST; 6452 else if (vp->v_type == VDIR && (mode & VWRITE)) 6453 error = EISDIR; 6454 else { 6455 /* 6456 * If vnode is a device, create special vnode. 6457 */ 6458 if (ISVDEV(vp->v_type)) { 6459 tempvp = vp; 6460 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6461 VN_RELE(tempvp); 6462 } 6463 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) { 6464 if ((vattr.va_mask & AT_SIZE) && 6465 vp->v_type == VREG) { 6466 rp = VTOR4(vp); 6467 /* 6468 * Check here for large file handled 6469 * by LF-unaware process (as 6470 * ufs_create() does) 6471 */ 6472 if (!(flags & FOFFMAX)) { 6473 mutex_enter(&rp->r_statelock); 6474 if (rp->r_size > MAXOFF32_T) 6475 error = EOVERFLOW; 6476 mutex_exit(&rp->r_statelock); 6477 } 6478 6479 /* if error is set then we need to return */ 6480 if (error) { 6481 nfs_rw_exit(&drp->r_rwlock); 6482 VN_RELE(vp); 6483 return (error); 6484 } 6485 6486 if (must_trunc) { 6487 vattr.va_mask = AT_SIZE; 6488 error = nfs4setattr(vp, &vattr, 0, cr, 6489 NULL); 6490 } else { 6491 /* 6492 * we know we have a regular file that already 6493 * exists and we may end up truncating the file 6494 * as a result of the open_otw, so flush out 6495 * any dirty pages for this file first. 
6496 */ 6497 if (nfs4_has_pages(vp) && 6498 ((rp->r_flags & R4DIRTY) || 6499 rp->r_count > 0 || 6500 rp->r_mapcnt > 0)) { 6501 error = nfs4_putpage(vp, 6502 (offset_t)0, 0, 0, cr); 6503 if (error && (error == ENOSPC || 6504 error == EDQUOT)) { 6505 mutex_enter( 6506 &rp->r_statelock); 6507 if (!rp->r_error) 6508 rp->r_error = 6509 error; 6510 mutex_exit( 6511 &rp->r_statelock); 6512 } 6513 } 6514 vattr.va_mask = (AT_SIZE | 6515 AT_TYPE | AT_MODE); 6516 vattr.va_type = VREG; 6517 createmode = UNCHECKED4; 6518 goto create_otw; 6519 } 6520 } 6521 } 6522 } 6523 nfs_rw_exit(&drp->r_rwlock); 6524 if (error) { 6525 VN_RELE(vp); 6526 } else { 6527 *vpp = vp; 6528 } 6529 return (error); 6530 6531 create_otw: 6532 dnlc_remove(dvp, nm); 6533 6534 ASSERT(vattr.va_mask & AT_TYPE); 6535 6536 /* 6537 * If not a regular file let nfs4mknod() handle it. 6538 */ 6539 if (vattr.va_type != VREG) { 6540 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6541 nfs_rw_exit(&drp->r_rwlock); 6542 return (error); 6543 } 6544 6545 /* 6546 * It _is_ a regular file. 6547 */ 6548 ASSERT(vattr.va_mask & AT_MODE); 6549 if (MANDMODE(vattr.va_mode)) { 6550 nfs_rw_exit(&drp->r_rwlock); 6551 return (EACCES); 6552 } 6553 6554 /* 6555 * If this happens to be a mknod of a regular file, then flags will 6556 * have neither FREAD or FWRITE. However, we must set at least one 6557 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6558 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6559 * set (based on openmode specified by app). 
6560 */ 6561 if ((flags & (FREAD|FWRITE)) == 0) 6562 flags |= (FREAD|FWRITE); 6563 6564 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6565 6566 if (vp != NULL) { 6567 /* if create was successful, throw away the file's pages */ 6568 if (!error && (vattr.va_mask & AT_SIZE)) 6569 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6570 cr); 6571 /* release the lookup hold */ 6572 VN_RELE(vp); 6573 vp = NULL; 6574 } 6575 6576 /* 6577 * validate that we opened a regular file. This handles a misbehaving 6578 * server that returns an incorrect FH. 6579 */ 6580 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6581 error = EISDIR; 6582 VN_RELE(*vpp); 6583 } 6584 6585 /* 6586 * If this is not an exclusive create, then the CREATE 6587 * request will be made with the GUARDED mode set. This 6588 * means that the server will return EEXIST if the file 6589 * exists. The file could exist because of a retransmitted 6590 * request. In this case, we recover by starting over and 6591 * checking to see whether the file exists. This second 6592 * time through it should and a CREATE request will not be 6593 * sent. 6594 * 6595 * This handles the problem of a dangling CREATE request 6596 * which contains attributes which indicate that the file 6597 * should be truncated. This retransmitted request could 6598 * possibly truncate valid data in the file if not caught 6599 * by the duplicate request mechanism on the server or if 6600 * not caught by other means. The scenario is: 6601 * 6602 * Client transmits CREATE request with size = 0 6603 * Client times out, retransmits request. 6604 * Response to the first request arrives from the server 6605 * and the client proceeds on. 6606 * Client writes data to the file. 6607 * The server now processes retransmitted CREATE request 6608 * and truncates file. 
6609 * 6610 * The use of the GUARDED CREATE request prevents this from 6611 * happening because the retransmitted CREATE would fail 6612 * with EEXIST and would not truncate the file. 6613 */ 6614 if (error == EEXIST && exclusive == NONEXCL) { 6615 #ifdef DEBUG 6616 nfs4_create_misses++; 6617 #endif 6618 goto top; 6619 } 6620 nfs_rw_exit(&drp->r_rwlock); 6621 return (error); 6622 } 6623 6624 /* 6625 * Create compound (for mkdir, mknod, symlink): 6626 * { Putfh <dfh>; Create; Getfh; Getattr } 6627 * It's okay if setattr failed to set gid - this is not considered 6628 * an error, but purge attrs in that case. 6629 */ 6630 static int 6631 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6632 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6633 { 6634 int need_end_op = FALSE; 6635 COMPOUND4args_clnt args; 6636 COMPOUND4res_clnt res, *resp = NULL; 6637 nfs_argop4 *argop; 6638 nfs_resop4 *resop; 6639 int doqueue; 6640 mntinfo4_t *mi; 6641 rnode4_t *drp = VTOR4(dvp); 6642 change_info4 *cinfo; 6643 GETFH4res *gf_res; 6644 struct vattr vattr; 6645 vnode_t *vp; 6646 fattr4 *crattr; 6647 bool_t needrecov = FALSE; 6648 nfs4_recov_state_t recov_state; 6649 nfs4_sharedfh_t *sfhp = NULL; 6650 hrtime_t t; 6651 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6652 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6653 dirattr_info_t dinfo, *dinfop; 6654 servinfo4_t *svp; 6655 bitmap4 supp_attrs; 6656 6657 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6658 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6659 6660 mi = VTOMI4(dvp); 6661 6662 /* 6663 * Make sure we properly deal with setting the right gid 6664 * on a new directory to reflect the parent's setgid bit 6665 */ 6666 setgid_flag = 0; 6667 if (type == NF4DIR) { 6668 struct vattr dva; 6669 6670 va->va_mode &= ~VSGID; 6671 dva.va_mask = AT_MODE | AT_GID; 6672 if (VOP_GETATTR(dvp, &dva, 0, cr) == 0) { 6673 6674 /* 6675 * If the parent's directory has the setgid bit set 6676 
* _and_ the client was able to get a valid mapping 6677 * for the parent dir's owner_group, we want to 6678 * append NVERIFY(owner_group == dva.va_gid) and 6679 * SETTATTR to the CREATE compound. 6680 */ 6681 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6682 setgid_flag = 1; 6683 va->va_mode |= VSGID; 6684 if (dva.va_gid != GID_NOBODY) { 6685 va->va_mask |= AT_GID; 6686 va->va_gid = dva.va_gid; 6687 } 6688 } 6689 } 6690 } 6691 6692 /* 6693 * Create ops: 6694 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6695 * 5:restorefh(dir) 6:getattr(dir) 6696 * 6697 * if (setgid) 6698 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6699 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6700 * 8:nverify 9:setattr 6701 */ 6702 if (setgid_flag) { 6703 numops = 10; 6704 idx_create = 1; 6705 idx_fattr = 3; 6706 } else { 6707 numops = 7; 6708 idx_create = 2; 6709 idx_fattr = 4; 6710 } 6711 6712 ASSERT(nfs_zone() == mi->mi_zone); 6713 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6714 return (EINTR); 6715 } 6716 recov_state.rs_flags = 0; 6717 recov_state.rs_num_retry_despite_err = 0; 6718 6719 argoplist_size = numops * sizeof (nfs_argop4); 6720 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6721 6722 recov_retry: 6723 if (type == NF4LNK) 6724 args.ctag = TAG_SYMLINK; 6725 else if (type == NF4DIR) 6726 args.ctag = TAG_MKDIR; 6727 else 6728 args.ctag = TAG_MKNOD; 6729 6730 args.array_len = numops; 6731 args.array = argop; 6732 6733 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6734 nfs_rw_exit(&drp->r_rwlock); 6735 kmem_free(argop, argoplist_size); 6736 return (e.error); 6737 } 6738 need_end_op = TRUE; 6739 6740 6741 /* 0: putfh directory */ 6742 argop[0].argop = OP_CPUTFH; 6743 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6744 6745 /* 1/2: Create object */ 6746 argop[idx_create].argop = OP_CCREATE; 6747 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6748 argop[idx_create].nfs_argop4_u.opccreate.type = 
type; 6749 if (type == NF4LNK) { 6750 /* 6751 * symlink, treat name as data 6752 */ 6753 ASSERT(data != NULL); 6754 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6755 (char *)data; 6756 } 6757 if (type == NF4BLK || type == NF4CHR) { 6758 ASSERT(data != NULL); 6759 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6760 *((specdata4 *)data); 6761 } 6762 6763 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6764 6765 svp = drp->r_server; 6766 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6767 supp_attrs = svp->sv_supp_attrs; 6768 nfs_rw_exit(&svp->sv_lock); 6769 6770 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6771 nfs_rw_exit(&drp->r_rwlock); 6772 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6773 e.error = EINVAL; 6774 kmem_free(argop, argoplist_size); 6775 return (e.error); 6776 } 6777 6778 /* 2/3: getfh fh of created object */ 6779 ASSERT(idx_create + 1 == idx_fattr - 1); 6780 argop[idx_create + 1].argop = OP_GETFH; 6781 6782 /* 3/4: getattr of new object */ 6783 argop[idx_fattr].argop = OP_GETATTR; 6784 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6785 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6786 6787 if (setgid_flag) { 6788 vattr_t _v; 6789 6790 argop[4].argop = OP_SAVEFH; 6791 6792 argop[5].argop = OP_CPUTFH; 6793 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6794 6795 argop[6].argop = OP_GETATTR; 6796 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6797 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6798 6799 argop[7].argop = OP_RESTOREFH; 6800 6801 /* 6802 * nverify 6803 * 6804 * XXX - Revisit the last argument to nfs4_end_op() 6805 * once 5020486 is fixed. 
6806 */ 6807 _v.va_mask = AT_GID; 6808 _v.va_gid = va->va_gid; 6809 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6810 supp_attrs)) { 6811 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6812 nfs_rw_exit(&drp->r_rwlock); 6813 nfs4_fattr4_free(crattr); 6814 kmem_free(argop, argoplist_size); 6815 return (e.error); 6816 } 6817 6818 /* 6819 * setattr 6820 * 6821 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6822 * so no need for stateid or flags. Also we specify NULL 6823 * rp since we're only interested in setting owner_group 6824 * attributes. 6825 */ 6826 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6827 &e.error, 0); 6828 6829 if (e.error) { 6830 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6831 nfs_rw_exit(&drp->r_rwlock); 6832 nfs4_fattr4_free(crattr); 6833 nfs4args_verify_free(&argop[8]); 6834 kmem_free(argop, argoplist_size); 6835 return (e.error); 6836 } 6837 } else { 6838 argop[1].argop = OP_SAVEFH; 6839 6840 argop[5].argop = OP_RESTOREFH; 6841 6842 argop[6].argop = OP_GETATTR; 6843 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6844 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6845 } 6846 6847 dnlc_remove(dvp, nm); 6848 6849 doqueue = 1; 6850 t = gethrtime(); 6851 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6852 6853 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6854 if (e.error) { 6855 PURGE_ATTRCACHE4(dvp); 6856 if (!needrecov) 6857 goto out; 6858 } 6859 6860 if (needrecov) { 6861 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6862 OP_CREATE, NULL) == FALSE) { 6863 nfs4_end_op(mi, dvp, NULL, &recov_state, 6864 needrecov); 6865 need_end_op = FALSE; 6866 nfs4_fattr4_free(crattr); 6867 if (setgid_flag) { 6868 nfs4args_verify_free(&argop[8]); 6869 nfs4args_setattr_free(&argop[9]); 6870 } 6871 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6872 goto recov_retry; 6873 } 6874 } 6875 6876 resp = &res; 6877 6878 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6879 
6880 if (res.status == NFS4ERR_BADOWNER) 6881 nfs4_log_badowner(mi, OP_CREATE); 6882 6883 e.error = geterrno4(res.status); 6884 6885 /* 6886 * This check is left over from when create was implemented 6887 * using a setattr op (instead of createattrs). If the 6888 * putfh/create/getfh failed, the error was returned. If 6889 * setattr/getattr failed, we keep going. 6890 * 6891 * It might be better to get rid of the GETFH also, and just 6892 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6893 * Then if any of the operations failed, we could return the 6894 * error now, and remove much of the error code below. 6895 */ 6896 if (res.array_len <= idx_fattr) { 6897 /* 6898 * Either Putfh, Create or Getfh failed. 6899 */ 6900 PURGE_ATTRCACHE4(dvp); 6901 /* 6902 * nfs4_purge_stale_fh() may generate otw calls through 6903 * nfs4_invalidate_pages. Hence the need to call 6904 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 6905 */ 6906 nfs4_end_op(mi, dvp, NULL, &recov_state, 6907 needrecov); 6908 need_end_op = FALSE; 6909 nfs4_purge_stale_fh(e.error, dvp, cr); 6910 goto out; 6911 } 6912 } 6913 6914 resop = &res.array[idx_create]; /* create res */ 6915 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 6916 6917 resop = &res.array[idx_create + 1]; /* getfh res */ 6918 gf_res = &resop->nfs_resop4_u.opgetfh; 6919 6920 sfhp = sfh4_get(&gf_res->object, mi); 6921 if (e.error) { 6922 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 6923 fn_get(VTOSV(dvp)->sv_name, nm)); 6924 if (vp->v_type == VNON) { 6925 vattr.va_mask = AT_TYPE; 6926 /* 6927 * Need to call nfs4_end_op before nfs4getattr to avoid 6928 * potential nfs4_start_op deadlock. See RFE 4777612. 
6929 */ 6930 nfs4_end_op(mi, dvp, NULL, &recov_state, 6931 needrecov); 6932 need_end_op = FALSE; 6933 e.error = nfs4getattr(vp, &vattr, cr); 6934 if (e.error) { 6935 VN_RELE(vp); 6936 *vpp = NULL; 6937 goto out; 6938 } 6939 vp->v_type = vattr.va_type; 6940 } 6941 e.error = 0; 6942 } else { 6943 *vpp = vp = makenfs4node(sfhp, 6944 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 6945 dvp->v_vfsp, t, cr, 6946 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 6947 } 6948 6949 /* 6950 * If compound succeeded, then update dir attrs 6951 */ 6952 if (res.status == NFS4_OK) { 6953 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 6954 dinfo.di_cred = cr; 6955 dinfo.di_time_call = t; 6956 dinfop = &dinfo; 6957 } else 6958 dinfop = NULL; 6959 6960 /* Update directory cache attribute, readdir and dnlc caches */ 6961 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 6962 6963 out: 6964 if (sfhp != NULL) 6965 sfh4_rele(&sfhp); 6966 nfs_rw_exit(&drp->r_rwlock); 6967 nfs4_fattr4_free(crattr); 6968 if (setgid_flag) { 6969 nfs4args_verify_free(&argop[8]); 6970 nfs4args_setattr_free(&argop[9]); 6971 } 6972 if (resp) 6973 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 6974 if (need_end_op) 6975 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6976 6977 kmem_free(argop, argoplist_size); 6978 return (e.error); 6979 } 6980 6981 /* ARGSUSED */ 6982 static int 6983 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6984 int mode, vnode_t **vpp, cred_t *cr) 6985 { 6986 int error; 6987 vnode_t *vp; 6988 nfs_ftype4 type; 6989 specdata4 spec, *specp = NULL; 6990 6991 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6992 6993 switch (va->va_type) { 6994 case VCHR: 6995 case VBLK: 6996 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 6997 spec.specdata1 = getmajor(va->va_rdev); 6998 spec.specdata2 = getminor(va->va_rdev); 6999 specp = &spec; 7000 break; 7001 7002 case VFIFO: 7003 type = NF4FIFO; 7004 break; 7005 case VSOCK: 7006 type = NF4SOCK; 7007 break; 7008 7009 default: 7010 return (EINVAL); 7011 } 7012 7013 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7014 if (error) { 7015 return (error); 7016 } 7017 7018 /* 7019 * This might not be needed any more; special case to deal 7020 * with problematic v2/v3 servers. Since create was unable 7021 * to set group correctly, not sure what hope setattr has. 7022 */ 7023 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7024 va->va_mask = AT_GID; 7025 (void) nfs4setattr(vp, va, 0, cr, NULL); 7026 } 7027 7028 /* 7029 * If vnode is a device create special vnode 7030 */ 7031 if (ISVDEV(vp->v_type)) { 7032 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7033 VN_RELE(vp); 7034 } else { 7035 *vpp = vp; 7036 } 7037 return (error); 7038 } 7039 7040 /* 7041 * Remove requires that the current fh be the target directory. 7042 * After the operation, the current fh is unchanged. 7043 * The compound op structure is: 7044 * PUTFH(targetdir), REMOVE 7045 * 7046 * Weirdness: if the vnode to be removed is open 7047 * we rename it instead of removing it and nfs_inactive 7048 * will remove the new name. 
/*
 * Remove requires that the current fh be the target directory.
 * After the operation, the current fh is unchanged.
 * The compound op structure is:
 *	PUTFH(targetdir), REMOVE, GETATTR(targetdir)
 *
 * Weirdness: if the vnode to be removed is open
 * we rename it instead of removing it and nfs_inactive
 * will remove the new name.
 *
 * Arguments:
 *	dvp	directory containing the entry
 *	nm	name of the entry to remove
 *	cr	credentials used for the lookup, rename and remove
 *
 * Returns 0 on success or an errno value: EPERM when called from a zone
 * other than the mount's, EINTR if the wait for the directory lock was
 * interrupted, EISDIR if nm is a directory, otherwise the error from
 * the lookup, the rename, nfs4_start_op(), the RPC, or the server
 * status mapped by geterrno4().
 */
static int
nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	char *tmpname;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	rnode4_t *drp;
	int needrecov = 0;
	nfs4_recov_state_t recov_state;
	int isopen;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/* On success the lookup leaves a hold on vp; all exits release it. */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}

	if (vp->v_type == VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EISDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely the only entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	rp = VTOR4(vp);

	/*
	 * For regular file types, check to see if the file is open by looking
	 * at the open streams.
	 * For all other types, check the reference count on the vnode.  Since
	 * they are not opened OTW they never have an open stream.
	 *
	 * If the file is open, rename it to .nfsXXXX.
	 */
	if (vp->v_type != VREG) {
		/*
		 * If the file has a v_count > 1 then there may be more than one
		 * entry in the name cache due multiple links or an open file,
		 * but we don't have the real reference count so flush all
		 * possible entries.
		 */
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);

		/*
		 * Now we have the real reference count.
		 */
		isopen = vp->v_count > 1;
	} else {
		mutex_enter(&rp->r_os_lock);
		isopen = list_head(&rp->r_open_streams) != NULL;
		mutex_exit(&rp->r_os_lock);
	}

	/*
	 * Rename instead of remove when the file is open and either has no
	 * pending "unlinked" name yet or is being removed by that name.
	 */
	mutex_enter(&rp->r_statelock);
	if (isopen &&
	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
		mutex_exit(&rp->r_statelock);
		tmpname = newname();
		/*
		 * NOTE(review): nfs4rename() enters dvp's r_rwlock as
		 * writer again while it is already held here; presumably
		 * nfs_rw_enter_sig() tolerates re-entry by the owner --
		 * confirm.
		 */
		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr);
		if (e.error)
			kmem_free(tmpname, MAXNAMELEN);
		else {
			/*
			 * Record where the renamed file now lives.  tmpname
			 * is owned by the rnode from here on.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(dvp);
				rp->r_unldvp = dvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	/*
	 * Actually remove the file/dir
	 */
	mutex_exit(&rp->r_statelock);

	/*
	 * We need to flush any dirty pages which happen to
	 * be hanging around before removing the file.
	 * This shouldn't happen very often since in NFSv4
	 * we should be close to open consistent.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr);
		/* Only out-of-space errors are latched in the rnode. */
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	mi = VTOMI4(dvp);

	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	/*
	 * Remove ops: putfh dir; remove; getattr dir
	 */
	args.ctag = TAG_REMOVE;
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(vp);
		return (e.error);
	}

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr dir */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;
	dinfo.di_time_call = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	/* The removed file's cached attributes are purged unconditionally. */
	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error)
		PURGE_ATTRCACHE4(dvp);

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
		    NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) {
			/* Retry: free the result (if any) and end the op. */
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			goto recov_retry;
		}
	}

	/*
	 * Matching nfs4_end_op() for start_op() above.
	 * There is a path in the code below which calls
	 * nfs4_purge_stale_fh(), which may generate otw calls through
	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
	 * here to avoid nfs4_start_op() deadlock.
	 */
	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	if (!e.error) {
		/* A result was decoded; it is freed before returning. */
		resp = &res;

		if (res.status) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_purge_stale_fh(e.error, dvp, cr);
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			dinfo.di_garp =
			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
			dinfo.di_cred = cr;

			/* Update directory attr, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    &dinfo);
		}
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	VN_RELE(vp);
	return (e.error);
}
/*
 * Link requires that the current fh be the target directory and the
 * saved fh be the source fh. After the operation, the current fh is unchanged.
 * Thus the compound op structure is:
 *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(targetdir),
 *	RESTOREFH, GETATTR(file)
 *
 * Arguments:
 *	tdvp	directory in which the new name is created; its r_rwlock
 *		is held as writer across the call
 *	svp	existing object to link to (replaced by the vnode that
 *		VOP_REALVP() returns, when it returns one)
 *	tnm	new name; must not be empty
 *	cr	credentials used for the call
 *
 * Returns 0 on success or an errno value: EPERM when called from a zone
 * other than the mount's, EOPNOTSUPP when MI4_LINK is not set for the
 * mount, EINTR if the wait for the directory lock was interrupted,
 * otherwise the error from nfs4_start_op(), the RPC, or the server
 * status mapped by geterrno4() (EISDIR is remapped to EPERM for
 * non-root callers).
 */
static int
nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	LINK4res *ln_res;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	vnode_t *realvp, *nvp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *tdrp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	ASSERT(*tnm != '\0');
	ASSERT(tdvp->v_type == VDIR);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));

	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
		return (EPERM);
	if (VOP_REALVP(svp, &realvp) == 0) {
		svp = realvp;
		ASSERT(nfs4_consistent_type(svp));
	}

	tdrp = VTOR4(tdvp);
	mi = VTOMI4(svp);

	/* MI4_LINK is cleared below once the server reports EOPNOTSUPP. */
	if (!(mi->mi_flags & MI4_LINK)) {
		return (EOPNOTSUPP);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
		return (EINTR);

recov_retry:
	/* The op array is reallocated on every pass; retries free it first. */
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_LINK;

	/*
	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
	 * restorefh; getattr(fl)
	 */
	args.array_len = 7;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		nfs_rw_exit(&tdrp->r_rwlock);
		return (e.error);
	}

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;

	/* 1. save current fh to free up the space for the dir */
	argop[1].argop = OP_SAVEFH;

	/* 2. putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;

	/* 3. link: current_fh is targetdir, saved_fh is source */
	argop[3].argop = OP_CLINK;
	argop[3].nfs_argop4_u.opclink.cnewname = tnm;

	/* 4. Get attributes of dir */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	/* 5. If link was successful, restore current vp to file */
	argop[5].argop = OP_RESTOREFH;

	/* 6. Get attributes of linked object */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	dnlc_remove(tdvp, tnm);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
	if (e.error != 0 && !needrecov) {
		PURGE_ATTRCACHE4(tdvp);
		PURGE_ATTRCACHE4(svp);
		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
		    NULL, NULL, OP_LINK, NULL);
		if (abort == FALSE) {
			/* Retry: release this pass's op, args and result. */
			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
			    needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				PURGE_ATTRCACHE4(tdvp);
				PURGE_ATTRCACHE4(svp);
				nfs4_end_op(VTOMI4(svp), svp, tdvp,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);

	/* A result was decoded; "out" frees it. */
	resp = &res;
	if (res.status) {
		/* If link succeeded, then don't return error */
		e.error = geterrno4(res.status);
		if (res.array_len <= 4) {
			/*
			 * Either Putfh, Savefh, Putfh dir, or Link failed
			 */
			PURGE_ATTRCACHE4(svp);
			PURGE_ATTRCACHE4(tdvp);
			if (e.error == EOPNOTSUPP) {
				/* Remember that this server has no LINK. */
				mutex_enter(&mi->mi_lock);
				mi->mi_flags &= ~MI4_LINK;
				mutex_exit(&mi->mi_lock);
			}
			/* Remap EISDIR to EPERM for non-root user for SVVS */
			/* XXX-LP */
			if (e.error == EISDIR && crgetuid(cr) != 0)
				e.error = EPERM;
			goto out;
		}
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX - if LINK succeeded, but no attrs were returned for link
	 * file, purge its cache.
	 *
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	/*
	 * update cache with link file postattrs.
	 * Note: at this point resop points to link res.
	 */
	resop = &res.array[3];	/* link res */
	ln_res = &resop->nfs_resop4_u.oplink;
	if (res.status == NFS4_OK) {
		e.error = nfs4_update_attrcache(res.status,
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
		    t, svp, cr);
	}

	/*
	 * Call makenfs4node to create the new shadow vp for tnm.
	 * We pass NULL attrs because we just cached attrs for
	 * the src object.  All we're trying to accomplish is to
	 * to create the new shadow vnode.
	 */
	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
	    tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm));

	/* Update target cache attribute, readdir and dnlc caches */
	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
	dinfo.di_time_call = t;
	dinfo.di_cred = cr;

	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));
	ASSERT(nfs4_consistent_type(nvp));
	VN_RELE(nvp);

out:
	/* Common exit: free this pass's args and result, drop the lock. */
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	nfs_rw_exit(&tdrp->r_rwlock);

	return (e.error);
}
7458 */ 7459 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7460 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm)); 7461 7462 /* Update target cache attribute, readdir and dnlc caches */ 7463 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7464 dinfo.di_time_call = t; 7465 dinfo.di_cred = cr; 7466 7467 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7468 ASSERT(nfs4_consistent_type(tdvp)); 7469 ASSERT(nfs4_consistent_type(svp)); 7470 ASSERT(nfs4_consistent_type(nvp)); 7471 VN_RELE(nvp); 7472 7473 out: 7474 kmem_free(argop, argoplist_size); 7475 if (resp) 7476 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7477 7478 nfs_rw_exit(&tdrp->r_rwlock); 7479 7480 return (e.error); 7481 } 7482 7483 static int 7484 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7485 { 7486 vnode_t *realvp; 7487 7488 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7489 return (EPERM); 7490 if (VOP_REALVP(ndvp, &realvp) == 0) 7491 ndvp = realvp; 7492 7493 return (nfs4rename(odvp, onm, ndvp, nnm, cr)); 7494 } 7495 7496 /* 7497 * nfs4rename does the real work of renaming in NFS Version 4. 7498 * 7499 * A file handle is considered volatile for renaming purposes if either 7500 * of the volatile bits are turned on. However, the compound may differ 7501 * based on the likelihood of the filehandle to change during rename. 7502 */ 7503 static int 7504 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7505 { 7506 int error; 7507 mntinfo4_t *mi; 7508 vnode_t *nvp; 7509 vnode_t *ovp = NULL; 7510 char *tmpname = NULL; 7511 rnode4_t *rp; 7512 rnode4_t *odrp; 7513 rnode4_t *ndrp; 7514 int did_link = 0; 7515 int do_link = 1; 7516 nfsstat4 stat = NFS4_OK; 7517 7518 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7519 ASSERT(nfs4_consistent_type(odvp)); 7520 ASSERT(nfs4_consistent_type(ndvp)); 7521 7522 if (onm[0] == '.' && (onm[1] == '\0' || 7523 (onm[1] == '.' 
/*
 * nfs4rename does the real work of renaming in NFS Version 4.
 *
 * A file handle is considered volatile for renaming purposes if either
 * of the volatile bits are turned on. However, the compound may differ
 * based on the likelihood of the filehandle to change during rename.
 *
 * Arguments:
 *	odvp, onm	source directory and name
 *	ndvp, nnm	target directory and name
 *	cr		credentials used for all calls
 *
 * Both directories' r_rwlocks are taken as writer, ordered by rnode
 * address, and held until return.
 *
 * Returns 0 on success (including the case where an active target turns
 * out to be the same vnode as the source) or an errno value: EINVAL if
 * either name is "." or "..", or if a directory would be renamed to its
 * own target directory; EINTR if a lock wait was interrupted; EBUSY if
 * the target is a mount point; ENOTDIR if a directory would replace an
 * active non-directory; otherwise the error from the lookups, the
 * link/rename of an active target, or the rename itself.
 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp;
	vnode_t *ovp = NULL;
	char *tmpname = NULL;
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;
	int do_link = 1;
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Neither "." nor ".." may be the source or the target name. */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directory locks, lower rnode address first, so that
	 * every caller acquires them in the same order.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int	isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/* Regular files: active means an open stream exists. */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename.  Some servers can
			 * not Rename over an Open file, so they return
			 * this error.  The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr);
			}
			/* Fall back to renaming the target out of the way. */
			if (error == EOPNOTSUPP || !do_link) {
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the temporary name in the target's rnode;
			 * tmpname is owned by the rnode from here on.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
		VN_RELE(nvp);
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself?  The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave.  A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi)) {
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	} else {
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	}
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	/*
	 * The server refused to rename over an open file after we had
	 * linked the target to tmpname: undo the link and go back to
	 * link_call with do_link cleared so the target is renamed instead.
	 * did_link is only set on the link_call path, where nvp and rp
	 * refer to the target.
	 */
	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp.  After the nfs4_link
		 * call we call VN_RELE to match that hold.  We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			/* Overwrite in place and force NUL termination. */
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
7873 */ 7874 static int 7875 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 7876 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 7877 { 7878 COMPOUND4args_clnt args; 7879 COMPOUND4res_clnt res, *resp = NULL; 7880 nfs_argop4 *argop; 7881 nfs_resop4 *resop; 7882 int doqueue, argoplist_size; 7883 mntinfo4_t *mi; 7884 rnode4_t *odrp = VTOR4(odvp); 7885 rnode4_t *ndrp = VTOR4(ndvp); 7886 RENAME4res *rn_res; 7887 bool_t needrecov; 7888 nfs4_recov_state_t recov_state; 7889 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7890 dirattr_info_t dinfo, *dinfop; 7891 7892 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7893 7894 recov_state.rs_flags = 0; 7895 recov_state.rs_num_retry_despite_err = 0; 7896 7897 /* 7898 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 7899 * 7900 * If source/target are different dirs, then append putfh(src); getattr 7901 */ 7902 args.array_len = (odvp == ndvp) ? 5 : 7; 7903 argoplist_size = args.array_len * sizeof (nfs_argop4); 7904 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 7905 7906 recov_retry: 7907 *statp = NFS4_OK; 7908 7909 /* No need to Lookup the file, persistent fh */ 7910 args.ctag = TAG_RENAME; 7911 7912 mi = VTOMI4(odvp); 7913 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 7914 if (e.error) { 7915 kmem_free(argop, argoplist_size); 7916 return (e.error); 7917 } 7918 7919 /* 0: putfh source directory */ 7920 argop[0].argop = OP_CPUTFH; 7921 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 7922 7923 /* 1: Save source fh to free up current for target */ 7924 argop[1].argop = OP_SAVEFH; 7925 7926 /* 2: putfh targetdir */ 7927 argop[2].argop = OP_CPUTFH; 7928 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7929 7930 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 7931 argop[3].argop = OP_CRENAME; 7932 argop[3].nfs_argop4_u.opcrename.coldname = onm; 7933 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 7934 7935 /* 4: getattr (targetdir) */ 7936 argop[4].argop = 
OP_GETATTR; 7937 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7938 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7939 7940 if (ndvp != odvp) { 7941 7942 /* 5: putfh (sourcedir) */ 7943 argop[5].argop = OP_CPUTFH; 7944 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7945 7946 /* 6: getattr (sourcedir) */ 7947 argop[6].argop = OP_GETATTR; 7948 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7949 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7950 } 7951 7952 dnlc_remove(odvp, onm); 7953 dnlc_remove(ndvp, nnm); 7954 7955 doqueue = 1; 7956 dinfo.di_time_call = gethrtime(); 7957 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7958 7959 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7960 if (e.error) { 7961 PURGE_ATTRCACHE4(odvp); 7962 PURGE_ATTRCACHE4(ndvp); 7963 } else { 7964 *statp = res.status; 7965 } 7966 7967 if (needrecov) { 7968 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 7969 OP_RENAME, NULL) == FALSE) { 7970 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 7971 if (!e.error) 7972 (void) xdr_free(xdr_COMPOUND4res_clnt, 7973 (caddr_t)&res); 7974 goto recov_retry; 7975 } 7976 } 7977 7978 if (!e.error) { 7979 resp = &res; 7980 /* 7981 * as long as OP_RENAME 7982 */ 7983 if (res.status != NFS4_OK && res.array_len <= 4) { 7984 e.error = geterrno4(res.status); 7985 PURGE_ATTRCACHE4(odvp); 7986 PURGE_ATTRCACHE4(ndvp); 7987 /* 7988 * System V defines rename to return EEXIST, not 7989 * ENOTEMPTY if the target directory is not empty. 7990 * Over the wire, the error is NFSERR_ENOTEMPTY 7991 * which geterrno4 maps to ENOTEMPTY. 7992 */ 7993 if (e.error == ENOTEMPTY) 7994 e.error = EEXIST; 7995 } else { 7996 7997 resop = &res.array[3]; /* rename res */ 7998 rn_res = &resop->nfs_resop4_u.oprename; 7999 8000 if (res.status == NFS4_OK) { 8001 /* 8002 * Update target attribute, readdir and dnlc 8003 * caches. 
8004 */ 8005 dinfo.di_garp = 8006 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8007 dinfo.di_cred = cr; 8008 dinfop = &dinfo; 8009 } else 8010 dinfop = NULL; 8011 8012 nfs4_update_dircaches(&rn_res->target_cinfo, 8013 ndvp, NULL, NULL, dinfop); 8014 8015 /* 8016 * Update source attribute, readdir and dnlc caches 8017 * 8018 */ 8019 if (ndvp != odvp) { 8020 if (dinfop) 8021 dinfo.di_garp = 8022 &(res.array[6].nfs_resop4_u. 8023 opgetattr.ga_res); 8024 8025 nfs4_update_dircaches(&rn_res->source_cinfo, 8026 odvp, NULL, NULL, dinfop); 8027 } 8028 8029 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8030 nnm); 8031 } 8032 } 8033 8034 if (resp) 8035 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8036 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8037 kmem_free(argop, argoplist_size); 8038 8039 return (e.error); 8040 } 8041 8042 /* 8043 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8044 * it is possible for the filehandle to change due to the rename. 8045 * 8046 * The compound req in this case includes a post-rename lookup and getattr 8047 * to ensure that we have the correct fh and attributes for the object. 8048 * 8049 * Rename requires that the current fh be the target directory and the 8050 * saved fh be the source directory. After the operation, the current fh 8051 * is unchanged. 8052 * 8053 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8054 * update the filehandle for the renamed object. We also get the old 8055 * filehandle for historical reasons; this should be taken out sometime. 8056 * This results in a rather cumbersome compound... 
8057 * 8058 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8059 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8060 * 8061 */ 8062 static int 8063 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8064 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8065 { 8066 COMPOUND4args_clnt args; 8067 COMPOUND4res_clnt res, *resp = NULL; 8068 int argoplist_size; 8069 nfs_argop4 *argop; 8070 nfs_resop4 *resop; 8071 int doqueue; 8072 mntinfo4_t *mi; 8073 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8074 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8075 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8076 RENAME4res *rn_res; 8077 GETFH4res *ngf_res; 8078 bool_t needrecov; 8079 nfs4_recov_state_t recov_state; 8080 hrtime_t t; 8081 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8082 dirattr_info_t dinfo, *dinfop = &dinfo; 8083 8084 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8085 8086 recov_state.rs_flags = 0; 8087 recov_state.rs_num_retry_despite_err = 0; 8088 8089 recov_retry: 8090 *statp = NFS4_OK; 8091 8092 /* 8093 * There is a window between the RPC and updating the path and 8094 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8095 * code, so that it doesn't try to use the old path during that 8096 * window. 8097 */ 8098 mutex_enter(&orp->r_statelock); 8099 while (orp->r_flags & R4RECEXPFH) { 8100 klwp_t *lwp = ttolwp(curthread); 8101 8102 if (lwp != NULL) 8103 lwp->lwp_nostop++; 8104 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8105 mutex_exit(&orp->r_statelock); 8106 if (lwp != NULL) 8107 lwp->lwp_nostop--; 8108 return (EINTR); 8109 } 8110 if (lwp != NULL) 8111 lwp->lwp_nostop--; 8112 } 8113 orp->r_flags |= R4RECEXPFH; 8114 mutex_exit(&orp->r_statelock); 8115 8116 mi = VTOMI4(odvp); 8117 8118 args.ctag = TAG_RENAME_VFH; 8119 args.array_len = (odvp == ndvp) ? 
10 : 12; 8120 argoplist_size = args.array_len * sizeof (nfs_argop4); 8121 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8122 8123 /* 8124 * Rename ops: 8125 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8126 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8127 * LOOKUP(trgt), GETFH(new), GETATTR, 8128 * 8129 * if (odvp != ndvp) 8130 * add putfh(sourcedir), getattr(sourcedir) } 8131 */ 8132 args.array = argop; 8133 8134 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8135 &recov_state, NULL); 8136 if (e.error) { 8137 kmem_free(argop, argoplist_size); 8138 mutex_enter(&orp->r_statelock); 8139 orp->r_flags &= ~R4RECEXPFH; 8140 cv_broadcast(&orp->r_cv); 8141 mutex_exit(&orp->r_statelock); 8142 return (e.error); 8143 } 8144 8145 /* 0: putfh source directory */ 8146 argop[0].argop = OP_CPUTFH; 8147 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8148 8149 /* 1: Save source fh to free up current for target */ 8150 argop[1].argop = OP_SAVEFH; 8151 8152 /* 2: Lookup pre-rename fh of renamed object */ 8153 argop[2].argop = OP_CLOOKUP; 8154 argop[2].nfs_argop4_u.opclookup.cname = onm; 8155 8156 /* 3: getfh fh of renamed object (before rename) */ 8157 argop[3].argop = OP_GETFH; 8158 8159 /* 4: putfh targetdir */ 8160 argop[4].argop = OP_CPUTFH; 8161 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8162 8163 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8164 argop[5].argop = OP_CRENAME; 8165 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8166 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8167 8168 /* 6: getattr of target dir (post op attrs) */ 8169 argop[6].argop = OP_GETATTR; 8170 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8171 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8172 8173 /* 7: Lookup post-rename fh of renamed object */ 8174 argop[7].argop = OP_CLOOKUP; 8175 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8176 8177 /* 8: getfh fh of renamed object (after rename) */ 8178 argop[8].argop = OP_GETFH; 8179 8180 /* 9: getattr of 
renamed object */ 8181 argop[9].argop = OP_GETATTR; 8182 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8183 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8184 8185 /* 8186 * If source/target dirs are different, then get new post-op 8187 * attrs for source dir also. 8188 */ 8189 if (ndvp != odvp) { 8190 /* 10: putfh (sourcedir) */ 8191 argop[10].argop = OP_CPUTFH; 8192 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8193 8194 /* 11: getattr (sourcedir) */ 8195 argop[11].argop = OP_GETATTR; 8196 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8197 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8198 } 8199 8200 dnlc_remove(odvp, onm); 8201 dnlc_remove(ndvp, nnm); 8202 8203 doqueue = 1; 8204 t = gethrtime(); 8205 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8206 8207 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8208 if (e.error) { 8209 PURGE_ATTRCACHE4(odvp); 8210 PURGE_ATTRCACHE4(ndvp); 8211 if (!needrecov) { 8212 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8213 &recov_state, needrecov); 8214 goto out; 8215 } 8216 } else { 8217 *statp = res.status; 8218 } 8219 8220 if (needrecov) { 8221 bool_t abort; 8222 8223 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8224 OP_RENAME, NULL); 8225 if (abort == FALSE) { 8226 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8227 &recov_state, needrecov); 8228 kmem_free(argop, argoplist_size); 8229 if (!e.error) 8230 (void) xdr_free(xdr_COMPOUND4res_clnt, 8231 (caddr_t)&res); 8232 mutex_enter(&orp->r_statelock); 8233 orp->r_flags &= ~R4RECEXPFH; 8234 cv_broadcast(&orp->r_cv); 8235 mutex_exit(&orp->r_statelock); 8236 goto recov_retry; 8237 } else { 8238 if (e.error != 0) { 8239 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8240 &recov_state, needrecov); 8241 goto out; 8242 } 8243 /* fall through for res.status case */ 8244 } 8245 } 8246 8247 resp = &res; 8248 /* 8249 * If OP_RENAME (or any prev op) failed, then return an error. 
8250 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8251 */ 8252 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8253 /* 8254 * Error in an op other than last Getattr 8255 */ 8256 e.error = geterrno4(res.status); 8257 PURGE_ATTRCACHE4(odvp); 8258 PURGE_ATTRCACHE4(ndvp); 8259 /* 8260 * System V defines rename to return EEXIST, not 8261 * ENOTEMPTY if the target directory is not empty. 8262 * Over the wire, the error is NFSERR_ENOTEMPTY 8263 * which geterrno4 maps to ENOTEMPTY. 8264 */ 8265 if (e.error == ENOTEMPTY) 8266 e.error = EEXIST; 8267 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8268 needrecov); 8269 goto out; 8270 } 8271 8272 /* rename results */ 8273 rn_res = &res.array[5].nfs_resop4_u.oprename; 8274 8275 if (res.status == NFS4_OK) { 8276 /* Update target attribute, readdir and dnlc caches */ 8277 dinfo.di_garp = 8278 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8279 dinfo.di_cred = cr; 8280 dinfo.di_time_call = t; 8281 } else 8282 dinfop = NULL; 8283 8284 /* Update source cache attribute, readdir and dnlc caches */ 8285 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8286 8287 /* Update source cache attribute, readdir and dnlc caches */ 8288 if (ndvp != odvp) { 8289 8290 /* 8291 * If dinfop is non-NULL, then compound succeded, so 8292 * set di_garp to attrs for source dir. dinfop is only 8293 * set to NULL when compound fails. 8294 */ 8295 if (dinfop) 8296 dinfo.di_garp = 8297 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8298 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8299 dinfop); 8300 } 8301 8302 /* 8303 * Update the rnode with the new component name and args, 8304 * and if the file handle changed, also update it with the new fh. 8305 * This is only necessary if the target object has an rnode 8306 * entry and there is no need to create one for it. 
8307 */ 8308 resop = &res.array[8]; /* getfh new res */ 8309 ngf_res = &resop->nfs_resop4_u.opgetfh; 8310 8311 /* 8312 * Update the path and filehandle for the renamed object. 8313 */ 8314 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8315 8316 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8317 8318 if (res.status == NFS4_OK) { 8319 resop++; /* getattr res */ 8320 e.error = nfs4_update_attrcache(res.status, 8321 &resop->nfs_resop4_u.opgetattr.ga_res, 8322 t, ovp, cr); 8323 } 8324 8325 out: 8326 kmem_free(argop, argoplist_size); 8327 if (resp) 8328 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8329 mutex_enter(&orp->r_statelock); 8330 orp->r_flags &= ~R4RECEXPFH; 8331 cv_broadcast(&orp->r_cv); 8332 mutex_exit(&orp->r_statelock); 8333 8334 return (e.error); 8335 } 8336 8337 static int 8338 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 8339 { 8340 int error; 8341 vnode_t *vp; 8342 8343 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8344 return (EPERM); 8345 /* 8346 * As ".." has special meaning and rather than send a mkdir 8347 * over the wire to just let the server freak out, we just 8348 * short circuit it here and return EEXIST 8349 */ 8350 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8351 return (EEXIST); 8352 8353 /* 8354 * Decision to get the right gid and setgid bit of the 8355 * new directory is now made in call_nfs4_create_req. 8356 */ 8357 va->va_mask |= AT_MODE; 8358 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8359 if (error) 8360 return (error); 8361 8362 *vpp = vp; 8363 return (0); 8364 } 8365 8366 8367 /* 8368 * rmdir is using the same remove v4 op as does remove. 8369 * Remove requires that the current fh be the target directory. 8370 * After the operation, the current fh is unchanged. 
8371 * The compound op structure is: 8372 * PUTFH(targetdir), REMOVE 8373 */ 8374 static int 8375 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 8376 { 8377 int need_end_op = FALSE; 8378 COMPOUND4args_clnt args; 8379 COMPOUND4res_clnt res, *resp = NULL; 8380 REMOVE4res *rm_res; 8381 nfs_argop4 argop[3]; 8382 nfs_resop4 *resop; 8383 vnode_t *vp; 8384 int doqueue; 8385 mntinfo4_t *mi; 8386 rnode4_t *drp; 8387 bool_t needrecov = FALSE; 8388 nfs4_recov_state_t recov_state; 8389 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8390 dirattr_info_t dinfo, *dinfop; 8391 8392 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8393 return (EPERM); 8394 /* 8395 * As ".." has special meaning and rather than send a rmdir 8396 * over the wire to just let the server freak out, we just 8397 * short circuit it here and return EEXIST 8398 */ 8399 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8400 return (EEXIST); 8401 8402 drp = VTOR4(dvp); 8403 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8404 return (EINTR); 8405 8406 /* 8407 * Attempt to prevent a rmdir(".") from succeeding. 8408 */ 8409 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8410 if (e.error) { 8411 nfs_rw_exit(&drp->r_rwlock); 8412 return (e.error); 8413 } 8414 if (vp == cdir) { 8415 VN_RELE(vp); 8416 nfs_rw_exit(&drp->r_rwlock); 8417 return (EINVAL); 8418 } 8419 8420 /* 8421 * Since nfsv4 remove op works on both files and directories, 8422 * check that the removed object is indeed a directory. 8423 */ 8424 if (vp->v_type != VDIR) { 8425 VN_RELE(vp); 8426 nfs_rw_exit(&drp->r_rwlock); 8427 return (ENOTDIR); 8428 } 8429 8430 /* 8431 * First just remove the entry from the name cache, as it 8432 * is most likely an entry for this vp. 8433 */ 8434 dnlc_remove(dvp, nm); 8435 8436 /* 8437 * If there vnode reference count is greater than one, then 8438 * there may be additional references in the DNLC which will 8439 * need to be purged. 
First, trying removing the entry for 8440 * the parent directory and see if that removes the additional 8441 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8442 * to completely remove any references to the directory which 8443 * might still exist in the DNLC. 8444 */ 8445 if (vp->v_count > 1) { 8446 dnlc_remove(vp, ".."); 8447 if (vp->v_count > 1) 8448 dnlc_purge_vp(vp); 8449 } 8450 8451 mi = VTOMI4(dvp); 8452 recov_state.rs_flags = 0; 8453 recov_state.rs_num_retry_despite_err = 0; 8454 8455 recov_retry: 8456 args.ctag = TAG_RMDIR; 8457 8458 /* 8459 * Rmdir ops: putfh dir; remove 8460 */ 8461 args.array_len = 3; 8462 args.array = argop; 8463 8464 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8465 if (e.error) { 8466 nfs_rw_exit(&drp->r_rwlock); 8467 return (e.error); 8468 } 8469 need_end_op = TRUE; 8470 8471 /* putfh directory */ 8472 argop[0].argop = OP_CPUTFH; 8473 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8474 8475 /* remove */ 8476 argop[1].argop = OP_CREMOVE; 8477 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8478 8479 /* getattr (postop attrs for dir that contained removed dir) */ 8480 argop[2].argop = OP_GETATTR; 8481 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8482 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8483 8484 dinfo.di_time_call = gethrtime(); 8485 doqueue = 1; 8486 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8487 8488 PURGE_ATTRCACHE4(vp); 8489 8490 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8491 if (e.error) { 8492 PURGE_ATTRCACHE4(dvp); 8493 } 8494 8495 if (needrecov) { 8496 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8497 NULL, OP_REMOVE, NULL) == FALSE) { 8498 if (!e.error) 8499 (void) xdr_free(xdr_COMPOUND4res_clnt, 8500 (caddr_t)&res); 8501 8502 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8503 needrecov); 8504 need_end_op = FALSE; 8505 goto recov_retry; 8506 } 8507 } 8508 8509 if (!e.error) { 8510 resp = &res; 8511 8512 /* 8513 * Only return error if 
first 2 ops (OP_REMOVE or earlier) 8514 * failed. 8515 */ 8516 if (res.status != NFS4_OK && res.array_len <= 2) { 8517 e.error = geterrno4(res.status); 8518 PURGE_ATTRCACHE4(dvp); 8519 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8520 &recov_state, needrecov); 8521 need_end_op = FALSE; 8522 nfs4_purge_stale_fh(e.error, dvp, cr); 8523 /* 8524 * System V defines rmdir to return EEXIST, not 8525 * ENOTEMPTY if the directory is not empty. Over 8526 * the wire, the error is NFSERR_ENOTEMPTY which 8527 * geterrno4 maps to ENOTEMPTY. 8528 */ 8529 if (e.error == ENOTEMPTY) 8530 e.error = EEXIST; 8531 } else { 8532 resop = &res.array[1]; /* remove res */ 8533 rm_res = &resop->nfs_resop4_u.opremove; 8534 8535 if (res.status == NFS4_OK) { 8536 resop = &res.array[2]; /* dir attrs */ 8537 dinfo.di_garp = 8538 &resop->nfs_resop4_u.opgetattr.ga_res; 8539 dinfo.di_cred = cr; 8540 dinfop = &dinfo; 8541 } else 8542 dinfop = NULL; 8543 8544 /* Update dir attribute, readdir and dnlc caches */ 8545 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8546 dinfop); 8547 8548 /* destroy rddir cache for dir that was removed */ 8549 if (VTOR4(vp)->r_dir != NULL) 8550 nfs4_purge_rddir_cache(vp); 8551 } 8552 } 8553 8554 if (need_end_op) 8555 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8556 8557 nfs_rw_exit(&drp->r_rwlock); 8558 8559 if (resp) 8560 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8561 8562 VN_RELE(vp); 8563 8564 return (e.error); 8565 } 8566 8567 static int 8568 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 8569 { 8570 int error; 8571 vnode_t *vp; 8572 rnode4_t *rp; 8573 char *contents; 8574 mntinfo4_t *mi = VTOMI4(dvp); 8575 8576 if (nfs_zone() != mi->mi_zone) 8577 return (EPERM); 8578 if (!(mi->mi_flags & MI4_SYMLINK)) 8579 return (EOPNOTSUPP); 8580 8581 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8582 if (error) { 8583 return (error); 8584 } 8585 8586 ASSERT(nfs4_consistent_type(vp)); 8587 rp = 
VTOR4(vp); 8588 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8589 8590 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8591 8592 if (contents != NULL) { 8593 mutex_enter(&rp->r_statelock); 8594 if (rp->r_symlink.contents == NULL) { 8595 rp->r_symlink.len = strlen(tnm); 8596 bcopy(tnm, contents, rp->r_symlink.len); 8597 rp->r_symlink.contents = contents; 8598 rp->r_symlink.size = MAXPATHLEN; 8599 mutex_exit(&rp->r_statelock); 8600 } else { 8601 mutex_exit(&rp->r_statelock); 8602 kmem_free((void *)contents, MAXPATHLEN); 8603 } 8604 } 8605 } 8606 VN_RELE(vp); 8607 8608 return (error); 8609 } 8610 8611 8612 /* 8613 * Read directory entries. 8614 * There are some weird things to look out for here. The uio_loffset 8615 * field is either 0 or it is the offset returned from a previous 8616 * readdir. It is an opaque value used by the server to find the 8617 * correct directory block to read. The count field is the number 8618 * of blocks to read on the server. This is advisory only, the server 8619 * may return only one block's worth of entries. Entries may be compressed 8620 * on the server. 8621 */ 8622 static int 8623 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 8624 { 8625 int error; 8626 uint_t count; 8627 rnode4_t *rp; 8628 rddir4_cache *rdc; 8629 rddir4_cache *rrdc; 8630 8631 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8632 return (EIO); 8633 rp = VTOR4(vp); 8634 8635 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8636 8637 /* 8638 * Make sure that the directory cache is valid. 8639 */ 8640 if (rp->r_dir != NULL) { 8641 if (nfs_disable_rddir_cache != 0) { 8642 /* 8643 * Setting nfs_disable_rddir_cache in /etc/system 8644 * allows interoperability with servers that do not 8645 * properly update the attributes of directories. 8646 * Any cached information gets purged before an 8647 * access is made to it. 
8648 */ 8649 nfs4_purge_rddir_cache(vp); 8650 } 8651 8652 error = nfs4_validate_caches(vp, cr); 8653 if (error) 8654 return (error); 8655 } 8656 8657 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8658 8659 /* 8660 * Short circuit last readdir which always returns 0 bytes. 8661 * This can be done after the directory has been read through 8662 * completely at least once. This will set r_direof which 8663 * can be used to find the value of the last cookie. 8664 */ 8665 mutex_enter(&rp->r_statelock); 8666 if (rp->r_direof != NULL && 8667 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8668 mutex_exit(&rp->r_statelock); 8669 #ifdef DEBUG 8670 nfs4_readdir_cache_shorts++; 8671 #endif 8672 if (eofp) 8673 *eofp = 1; 8674 return (0); 8675 } 8676 8677 /* 8678 * Look for a cache entry. Cache entries are identified 8679 * by the NFS cookie value and the byte count requested. 8680 */ 8681 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8682 8683 /* 8684 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8685 */ 8686 if (rdc == NULL) { 8687 mutex_exit(&rp->r_statelock); 8688 return (EINTR); 8689 } 8690 8691 /* 8692 * Check to see if we need to fill this entry in. 8693 */ 8694 if (rdc->flags & RDDIRREQ) { 8695 rdc->flags &= ~RDDIRREQ; 8696 rdc->flags |= RDDIR; 8697 mutex_exit(&rp->r_statelock); 8698 8699 /* 8700 * Do the readdir. 8701 */ 8702 nfs4readdir(vp, rdc, cr); 8703 8704 /* 8705 * Reaquire the lock, so that we can continue 8706 */ 8707 mutex_enter(&rp->r_statelock); 8708 /* 8709 * The entry is now complete 8710 */ 8711 rdc->flags &= ~RDDIR; 8712 } 8713 8714 ASSERT(!(rdc->flags & RDDIR)); 8715 8716 /* 8717 * If an error occurred while attempting 8718 * to fill the cache entry, mark the entry invalid and 8719 * just return the error. 
8720 */ 8721 if (rdc->error) { 8722 error = rdc->error; 8723 rdc->flags |= RDDIRREQ; 8724 rddir4_cache_rele(rp, rdc); 8725 mutex_exit(&rp->r_statelock); 8726 return (error); 8727 } 8728 8729 /* 8730 * The cache entry is complete and good, 8731 * copyout the dirent structs to the calling 8732 * thread. 8733 */ 8734 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 8735 8736 /* 8737 * If no error occurred during the copyout, 8738 * update the offset in the uio struct to 8739 * contain the value of the next NFS 4 cookie 8740 * and set the eof value appropriately. 8741 */ 8742 if (!error) { 8743 uiop->uio_loffset = rdc->nfs4_ncookie; 8744 if (eofp) 8745 *eofp = rdc->eof; 8746 } 8747 8748 /* 8749 * Decide whether to do readahead. Don't if we 8750 * have already read to the end of directory. 8751 */ 8752 if (rdc->eof) { 8753 /* 8754 * Make the entry the direof only if it is cached 8755 */ 8756 if (rdc->flags & RDDIRCACHED) 8757 rp->r_direof = rdc; 8758 rddir4_cache_rele(rp, rdc); 8759 mutex_exit(&rp->r_statelock); 8760 return (error); 8761 } 8762 8763 /* Determine if a readdir readahead should be done */ 8764 if (!(rp->r_flags & R4LOOKUP)) { 8765 rddir4_cache_rele(rp, rdc); 8766 mutex_exit(&rp->r_statelock); 8767 return (error); 8768 } 8769 8770 /* 8771 * Now look for a readahead entry. 8772 * 8773 * Check to see whether we found an entry for the readahead. 8774 * If so, we don't need to do anything further, so free the new 8775 * entry if one was allocated. Otherwise, allocate a new entry, add 8776 * it to the cache, and then initiate an asynchronous readdir 8777 * operation to fill it. 8778 */ 8779 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 8780 8781 /* 8782 * A readdir cache entry could not be obtained for the readahead. In 8783 * this case we skip the readahead and return. 
8784 */ 8785 if (rrdc == NULL) { 8786 rddir4_cache_rele(rp, rdc); 8787 mutex_exit(&rp->r_statelock); 8788 return (error); 8789 } 8790 8791 /* 8792 * Check to see if we need to fill this entry in. 8793 */ 8794 if (rrdc->flags & RDDIRREQ) { 8795 rrdc->flags &= ~RDDIRREQ; 8796 rrdc->flags |= RDDIR; 8797 rddir4_cache_rele(rp, rdc); 8798 mutex_exit(&rp->r_statelock); 8799 #ifdef DEBUG 8800 nfs4_readdir_readahead++; 8801 #endif 8802 /* 8803 * Do the readdir. 8804 */ 8805 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 8806 return (error); 8807 } 8808 8809 rddir4_cache_rele(rp, rrdc); 8810 rddir4_cache_rele(rp, rdc); 8811 mutex_exit(&rp->r_statelock); 8812 return (error); 8813 } 8814 8815 static int 8816 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8817 { 8818 int error; 8819 rnode4_t *rp; 8820 8821 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 8822 8823 rp = VTOR4(vp); 8824 8825 /* 8826 * Obtain the readdir results for the caller. 8827 */ 8828 nfs4readdir(vp, rdc, cr); 8829 8830 mutex_enter(&rp->r_statelock); 8831 /* 8832 * The entry is now complete 8833 */ 8834 rdc->flags &= ~RDDIR; 8835 8836 error = rdc->error; 8837 if (error) 8838 rdc->flags |= RDDIRREQ; 8839 rddir4_cache_rele(rp, rdc); 8840 mutex_exit(&rp->r_statelock); 8841 8842 return (error); 8843 } 8844 8845 static void 8846 nfs4readdir_stub(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8847 { 8848 int stublength; 8849 dirent64_t *dp; 8850 u_longlong_t nodeid, pnodeid; 8851 vnode_t *dotdotvp = NULL; 8852 rnode4_t *rp = VTOR4(vp); 8853 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8854 8855 rdc->error = 0; 8856 rdc->entries = 0; 8857 rdc->actlen = rdc->entlen = 0; 8858 rdc->eof = TRUE; 8859 8860 /* Check for EOF case for readdir of stub */ 8861 if (cookie != 0 && cookie != 1) 8862 return; 8863 8864 nodeid = rp->r_attr.va_nodeid; 8865 if (vp->v_flag & VROOT) { 8866 pnodeid = nodeid; /* root of mount point */ 8867 } else { 8868 if (rdc->error = nfs4_lookup(vp, "..", &dotdotvp, 0, 0, 0, cr)) 8869 
return; 8870 pnodeid = VTOR4(dotdotvp)->r_attr.va_nodeid; 8871 VN_RELE(dotdotvp); 8872 } 8873 8874 stublength = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8875 rdc->entries = kmem_alloc(stublength, KM_SLEEP); 8876 rdc->entlen = rdc->buflen = stublength; 8877 rdc->eof = TRUE; 8878 8879 dp = (dirent64_t *)rdc->entries; 8880 8881 if (rdc->nfs4_cookie == (nfs_cookie4)0) { 8882 bcopy(nfs4_dot_entries, rdc->entries, 8883 DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2)); 8884 dp->d_ino = nodeid; 8885 dp = (struct dirent64 *)(((char *)dp) + DIRENT64_RECLEN(1)); 8886 dp->d_ino = pnodeid; 8887 rdc->actlen = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8888 } else { /* for ".." entry */ 8889 bcopy(nfs4_dot_dot_entry, rdc->entries, DIRENT64_RECLEN(2)); 8890 dp->d_ino = pnodeid; 8891 rdc->actlen = DIRENT64_RECLEN(2); 8892 } 8893 rdc->nfs4_ncookie = rdc->actlen; 8894 } 8895 8896 /* 8897 * Read directory entries. 8898 * There are some weird things to look out for here. The uio_loffset 8899 * field is either 0 or it is the offset returned from a previous 8900 * readdir. It is an opaque value used by the server to find the 8901 * correct directory block to read. The count field is the number 8902 * of blocks to read on the server. This is advisory only, the server 8903 * may return only one block's worth of entries. Entries may be compressed 8904 * on the server. 8905 * 8906 * Generates the following compound request: 8907 * 1. If readdir offset is zero and no dnlc entry for parent exists, 8908 * must include a Lookupp as well. In this case, send: 8909 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 8910 * 2. Otherwise just do: { Putfh <fh>; Readdir } 8911 * 8912 * Get complete attributes and filehandles for entries if this is the 8913 * first read of the directory. Otherwise, just get fileid's. 
8914 */ 8915 static void 8916 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8917 { 8918 COMPOUND4args_clnt args; 8919 COMPOUND4res_clnt res; 8920 READDIR4args *rargs; 8921 READDIR4res_clnt *rd_res; 8922 bitmap4 rd_bitsval; 8923 nfs_argop4 argop[5]; 8924 nfs_resop4 *resop; 8925 rnode4_t *rp = VTOR4(vp); 8926 mntinfo4_t *mi = VTOMI4(vp); 8927 int doqueue; 8928 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 8929 vnode_t *dvp; 8930 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8931 int num_ops, res_opcnt; 8932 bool_t needrecov = FALSE; 8933 nfs4_recov_state_t recov_state; 8934 hrtime_t t; 8935 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8936 8937 ASSERT(nfs_zone() == mi->mi_zone); 8938 ASSERT(rdc->flags & RDDIR); 8939 ASSERT(rdc->entries == NULL); 8940 8941 if (rp->r_flags & R4SRVSTUB) { 8942 nfs4readdir_stub(vp, rdc, cr); 8943 return; 8944 } 8945 8946 num_ops = 2; 8947 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 8948 /* 8949 * Since nfsv4 readdir may not return entries for "." and "..", 8950 * the client must recreate them: 8951 * To find the correct nodeid, do the following: 8952 * For current node, get nodeid from dnlc. 8953 * - if current node is rootvp, set pnodeid to nodeid. 8954 * - else if parent is in the dnlc, get its nodeid from there. 8955 * - else add LOOKUPP+GETATTR to compound. 
8956 */ 8957 nodeid = rp->r_attr.va_nodeid; 8958 if (vp->v_flag & VROOT) { 8959 pnodeid = nodeid; /* root of mount point */ 8960 } else { 8961 dvp = dnlc_lookup(vp, ".."); 8962 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 8963 /* parent in dnlc cache - no need for otw */ 8964 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 8965 } else { 8966 /* 8967 * parent not in dnlc cache, 8968 * do lookupp to get its id 8969 */ 8970 num_ops = 5; 8971 pnodeid = 0; /* set later by getattr parent */ 8972 } 8973 if (dvp) 8974 VN_RELE(dvp); 8975 } 8976 } 8977 recov_state.rs_flags = 0; 8978 recov_state.rs_num_retry_despite_err = 0; 8979 8980 /* Save the original mount point security flavor */ 8981 (void) save_mnt_secinfo(mi->mi_curr_serv); 8982 8983 recov_retry: 8984 args.ctag = TAG_READDIR; 8985 8986 args.array = argop; 8987 args.array_len = num_ops; 8988 8989 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 8990 &recov_state, NULL)) { 8991 /* 8992 * If readdir a node that is a stub for a crossed mount point, 8993 * keep the original secinfo flavor for the current file 8994 * system, not the crossed one. 8995 */ 8996 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 8997 rdc->error = e.error; 8998 return; 8999 } 9000 9001 /* 9002 * Determine which attrs to request for dirents. This code 9003 * must be protected by nfs4_start/end_fop because of r_server 9004 * (which will change during failover recovery). 9005 * 9006 */ 9007 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 9008 /* 9009 * Get all vattr attrs plus filehandle and rdattr_error 9010 */ 9011 rd_bitsval = NFS4_VATTR_MASK | 9012 FATTR4_RDATTR_ERROR_MASK | 9013 FATTR4_FILEHANDLE_MASK; 9014 9015 if (rp->r_flags & R4READDIRWATTR) { 9016 mutex_enter(&rp->r_statelock); 9017 rp->r_flags &= ~R4READDIRWATTR; 9018 mutex_exit(&rp->r_statelock); 9019 } 9020 } else { 9021 servinfo4_t *svp = rp->r_server; 9022 9023 /* 9024 * Already read directory. Use readdir with 9025 * no attrs (except for mounted_on_fileid) for updates. 
9026 */ 9027 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9028 9029 /* 9030 * request mounted on fileid if supported, else request 9031 * fileid. maybe we should verify that fileid is supported 9032 * and request something else if not. 9033 */ 9034 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9035 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9036 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9037 nfs_rw_exit(&svp->sv_lock); 9038 } 9039 9040 /* putfh directory fh */ 9041 argop[0].argop = OP_CPUTFH; 9042 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9043 9044 argop[1].argop = OP_READDIR; 9045 rargs = &argop[1].nfs_argop4_u.opreaddir; 9046 /* 9047 * 1 and 2 are reserved for client "." and ".." entry offset. 9048 * cookie 0 should be used over-the-wire to start reading at 9049 * the beginning of the directory excluding "." and "..". 9050 */ 9051 if (rdc->nfs4_cookie == 0 || 9052 rdc->nfs4_cookie == 1 || 9053 rdc->nfs4_cookie == 2) { 9054 rargs->cookie = (nfs_cookie4)0; 9055 rargs->cookieverf = 0; 9056 } else { 9057 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9058 mutex_enter(&rp->r_statelock); 9059 rargs->cookieverf = rp->r_cookieverf4; 9060 mutex_exit(&rp->r_statelock); 9061 } 9062 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9063 rargs->maxcount = mi->mi_tsize; 9064 rargs->attr_request = rd_bitsval; 9065 rargs->rdc = rdc; 9066 rargs->dvp = vp; 9067 rargs->mi = mi; 9068 rargs->cr = cr; 9069 9070 9071 /* 9072 * If count < than the minimum required, we return no entries 9073 * and fail with EINVAL 9074 */ 9075 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9076 rdc->error = EINVAL; 9077 goto out; 9078 } 9079 9080 if (args.array_len == 5) { 9081 /* 9082 * Add lookupp and getattr for parent nodeid. 
9083 */ 9084 argop[2].argop = OP_LOOKUPP; 9085 9086 argop[3].argop = OP_GETFH; 9087 9088 /* getattr parent */ 9089 argop[4].argop = OP_GETATTR; 9090 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9091 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9092 } 9093 9094 doqueue = 1; 9095 9096 if (mi->mi_io_kstats) { 9097 mutex_enter(&mi->mi_lock); 9098 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9099 mutex_exit(&mi->mi_lock); 9100 } 9101 9102 /* capture the time of this call */ 9103 rargs->t = t = gethrtime(); 9104 9105 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9106 9107 if (mi->mi_io_kstats) { 9108 mutex_enter(&mi->mi_lock); 9109 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9110 mutex_exit(&mi->mi_lock); 9111 } 9112 9113 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9114 9115 /* 9116 * If RPC error occurred and it isn't an error that 9117 * triggers recovery, then go ahead and fail now. 9118 */ 9119 if (e.error != 0 && !needrecov) { 9120 rdc->error = e.error; 9121 goto out; 9122 } 9123 9124 if (needrecov) { 9125 bool_t abort; 9126 9127 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9128 "nfs4readdir: initiating recovery.\n")); 9129 9130 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9131 NULL, OP_READDIR, NULL); 9132 if (abort == FALSE) { 9133 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9134 &recov_state, needrecov); 9135 if (!e.error) 9136 (void) xdr_free(xdr_COMPOUND4res_clnt, 9137 (caddr_t)&res); 9138 if (rdc->entries != NULL) { 9139 kmem_free(rdc->entries, rdc->entlen); 9140 rdc->entries = NULL; 9141 } 9142 goto recov_retry; 9143 } 9144 9145 if (e.error != 0) { 9146 rdc->error = e.error; 9147 goto out; 9148 } 9149 9150 /* fall through for res.status case */ 9151 } 9152 9153 res_opcnt = res.array_len; 9154 9155 /* 9156 * If compound failed first 2 ops (PUTFH+READDIR), then return 9157 * failure here. 
Subsequent ops are for filling out dot-dot 9158 * dirent, and if they fail, we still want to give the caller 9159 * the dirents returned by (the successful) READDIR op, so we need 9160 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9161 * 9162 * One example where PUTFH+READDIR ops would succeed but 9163 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9164 * but lacks x. In this case, a POSIX server's VOP_READDIR 9165 * would succeed; however, VOP_LOOKUP(..) would fail since no 9166 * x perm. We need to come up with a non-vendor-specific way 9167 * for a POSIX server to return d_ino from dotdot's dirent if 9168 * client only requests mounted_on_fileid, and just say the 9169 * LOOKUPP succeeded and fill out the GETATTR. However, if 9170 * client requested any mandatory attrs, server would be required 9171 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9172 * for dotdot. 9173 */ 9174 9175 if (res.status) { 9176 if (res_opcnt <= 2) { 9177 e.error = geterrno4(res.status); 9178 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9179 &recov_state, needrecov); 9180 nfs4_purge_stale_fh(e.error, vp, cr); 9181 rdc->error = e.error; 9182 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9183 if (rdc->entries != NULL) { 9184 kmem_free(rdc->entries, rdc->entlen); 9185 rdc->entries = NULL; 9186 } 9187 /* 9188 * If readdir a node that is a stub for a 9189 * crossed mount point, keep the original 9190 * secinfo flavor for the current file system, 9191 * not the crossed one. 9192 */ 9193 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9194 return; 9195 } 9196 } 9197 9198 resop = &res.array[1]; /* readdir res */ 9199 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9200 9201 mutex_enter(&rp->r_statelock); 9202 rp->r_cookieverf4 = rd_res->cookieverf; 9203 mutex_exit(&rp->r_statelock); 9204 9205 /* 9206 * For "." and ".." entries 9207 * e.g. 9208 * seek(cookie=0) -> "." entry with d_off = 1 9209 * seek(cookie=1) -> ".." 
entry with d_off = 2 9210 */ 9211 if (cookie == (nfs_cookie4) 0) { 9212 if (rd_res->dotp) 9213 rd_res->dotp->d_ino = nodeid; 9214 if (rd_res->dotdotp) 9215 rd_res->dotdotp->d_ino = pnodeid; 9216 } 9217 if (cookie == (nfs_cookie4) 1) { 9218 if (rd_res->dotdotp) 9219 rd_res->dotdotp->d_ino = pnodeid; 9220 } 9221 9222 9223 /* LOOKUPP+GETATTR attemped */ 9224 if (args.array_len == 5 && rd_res->dotdotp) { 9225 if (res.status == NFS4_OK && res_opcnt == 5) { 9226 nfs_fh4 *fhp; 9227 nfs4_sharedfh_t *sfhp; 9228 vnode_t *pvp; 9229 nfs4_ga_res_t *garp; 9230 9231 resop++; /* lookupp */ 9232 resop++; /* getfh */ 9233 fhp = &resop->nfs_resop4_u.opgetfh.object; 9234 9235 resop++; /* getattr of parent */ 9236 9237 /* 9238 * First, take care of finishing the 9239 * readdir results. 9240 */ 9241 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9242 /* 9243 * The d_ino of .. must be the inode number 9244 * of the mounted filesystem. 9245 */ 9246 if (garp->n4g_va.va_mask & AT_NODEID) 9247 rd_res->dotdotp->d_ino = 9248 garp->n4g_va.va_nodeid; 9249 9250 9251 /* 9252 * Next, create the ".." dnlc entry 9253 */ 9254 sfhp = sfh4_get(fhp, mi); 9255 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9256 dnlc_update(vp, "..", pvp); 9257 VN_RELE(pvp); 9258 } 9259 sfh4_rele(&sfhp); 9260 } 9261 } 9262 9263 if (mi->mi_io_kstats) { 9264 mutex_enter(&mi->mi_lock); 9265 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9266 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9267 mutex_exit(&mi->mi_lock); 9268 } 9269 9270 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9271 9272 out: 9273 /* 9274 * If readdir a node that is a stub for a crossed mount point, 9275 * keep the original secinfo flavor for the current file system, 9276 * not the crossed one. 
9277 */ 9278 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9279 9280 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9281 } 9282 9283 9284 static int 9285 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9286 { 9287 rnode4_t *rp = VTOR4(bp->b_vp); 9288 int count; 9289 int error; 9290 cred_t *cred_otw = NULL; 9291 offset_t offset; 9292 nfs4_open_stream_t *osp = NULL; 9293 bool_t first_time = TRUE; /* first time getting otw cred */ 9294 bool_t last_time = FALSE; /* last time getting otw cred */ 9295 9296 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9297 9298 DTRACE_IO1(start, struct buf *, bp); 9299 offset = ldbtob(bp->b_lblkno); 9300 9301 if (bp->b_flags & B_READ) { 9302 read_again: 9303 /* 9304 * Releases the osp, if it is provided. 9305 * Puts a hold on the cred_otw and the new osp (if found). 9306 */ 9307 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9308 &first_time, &last_time); 9309 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9310 offset, bp->b_bcount, 9311 &bp->b_resid, cred_otw, 9312 readahead, NULL); 9313 crfree(cred_otw); 9314 if (!error) { 9315 if (bp->b_resid) { 9316 /* 9317 * Didn't get it all because we hit EOF, 9318 * zero all the memory beyond the EOF. 9319 */ 9320 /* bzero(rdaddr + */ 9321 bzero(bp->b_un.b_addr + 9322 bp->b_bcount - bp->b_resid, bp->b_resid); 9323 } 9324 mutex_enter(&rp->r_statelock); 9325 if (bp->b_resid == bp->b_bcount && 9326 offset >= rp->r_size) { 9327 /* 9328 * We didn't read anything at all as we are 9329 * past EOF. Return an error indicator back 9330 * but don't destroy the pages (yet). 9331 */ 9332 error = NFS_EOF; 9333 } 9334 mutex_exit(&rp->r_statelock); 9335 } else if (error == EACCES && last_time == FALSE) { 9336 goto read_again; 9337 } 9338 } else { 9339 if (!(rp->r_flags & R4STALE)) { 9340 write_again: 9341 /* 9342 * Releases the osp, if it is provided. 9343 * Puts a hold on the cred_otw and the new 9344 * osp (if found). 
9345 */ 9346 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9347 &first_time, &last_time); 9348 mutex_enter(&rp->r_statelock); 9349 count = MIN(bp->b_bcount, rp->r_size - offset); 9350 mutex_exit(&rp->r_statelock); 9351 if (count < 0) 9352 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9353 #ifdef DEBUG 9354 if (count == 0) { 9355 zoneid_t zoneid = getzoneid(); 9356 9357 zcmn_err(zoneid, CE_WARN, 9358 "nfs4_bio: zero length write at %lld", 9359 offset); 9360 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9361 "b_bcount=%ld, file size=%lld", 9362 rp->r_flags, (long)bp->b_bcount, 9363 rp->r_size); 9364 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9365 if (nfs4_bio_do_stop) 9366 debug_enter("nfs4_bio"); 9367 } 9368 #endif 9369 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9370 count, cred_otw, stab_comm); 9371 if (error == EACCES && last_time == FALSE) { 9372 crfree(cred_otw); 9373 goto write_again; 9374 } 9375 bp->b_error = error; 9376 if (error && error != EINTR && 9377 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9378 /* 9379 * Don't print EDQUOT errors on the console. 9380 * Don't print asynchronous EACCES errors. 9381 * Don't print EFBIG errors. 9382 * Print all other write errors. 9383 */ 9384 if (error != EDQUOT && error != EFBIG && 9385 (error != EACCES || 9386 !(bp->b_flags & B_ASYNC))) 9387 nfs4_write_error(bp->b_vp, 9388 error, cred_otw); 9389 /* 9390 * Update r_error and r_flags as appropriate. 9391 * If the error was ESTALE, then mark the 9392 * rnode as not being writeable and save 9393 * the error status. Otherwise, save any 9394 * errors which occur from asynchronous 9395 * page invalidations. Any errors occurring 9396 * from other operations should be saved 9397 * by the caller. 
9398 */ 9399 mutex_enter(&rp->r_statelock); 9400 if (error == ESTALE) { 9401 rp->r_flags |= R4STALE; 9402 if (!rp->r_error) 9403 rp->r_error = error; 9404 } else if (!rp->r_error && 9405 (bp->b_flags & 9406 (B_INVAL|B_FORCE|B_ASYNC)) == 9407 (B_INVAL|B_FORCE|B_ASYNC)) { 9408 rp->r_error = error; 9409 } 9410 mutex_exit(&rp->r_statelock); 9411 } 9412 crfree(cred_otw); 9413 } else 9414 error = rp->r_error; 9415 } 9416 9417 if (error != 0 && error != NFS_EOF) 9418 bp->b_flags |= B_ERROR; 9419 9420 if (osp) 9421 open_stream_rele(osp, rp); 9422 9423 DTRACE_IO1(done, struct buf *, bp); 9424 9425 return (error); 9426 } 9427 9428 /* ARGSUSED */ 9429 static int 9430 nfs4_fid(vnode_t *vp, fid_t *fidp) 9431 { 9432 return (EREMOTE); 9433 } 9434 9435 /* ARGSUSED2 */ 9436 static int 9437 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9438 { 9439 rnode4_t *rp = VTOR4(vp); 9440 9441 if (!write_lock) { 9442 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9443 return (V_WRITELOCK_FALSE); 9444 } 9445 9446 if ((rp->r_flags & R4DIRECTIO) || 9447 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9448 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9449 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9450 return (V_WRITELOCK_FALSE); 9451 nfs_rw_exit(&rp->r_rwlock); 9452 } 9453 9454 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9455 return (V_WRITELOCK_TRUE); 9456 } 9457 9458 /* ARGSUSED */ 9459 static void 9460 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9461 { 9462 rnode4_t *rp = VTOR4(vp); 9463 9464 nfs_rw_exit(&rp->r_rwlock); 9465 } 9466 9467 /* ARGSUSED */ 9468 static int 9469 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 9470 { 9471 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9472 return (EIO); 9473 9474 /* 9475 * Because we stuff the readdir cookie into the offset field 9476 * someone may attempt to do an lseek with the cookie which 9477 * we want to succeed. 
9478 */ 9479 if (vp->v_type == VDIR) 9480 return (0); 9481 if (*noffp < 0) 9482 return (EINVAL); 9483 return (0); 9484 } 9485 9486 9487 /* 9488 * Return all the pages from [off..off+len) in file 9489 */ 9490 static int 9491 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9492 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9493 enum seg_rw rw, cred_t *cr) 9494 { 9495 rnode4_t *rp; 9496 int error; 9497 mntinfo4_t *mi; 9498 9499 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9500 return (EIO); 9501 rp = VTOR4(vp); 9502 if (IS_SHADOW(vp, rp)) 9503 vp = RTOV4(rp); 9504 9505 if (vp->v_flag & VNOMAP) 9506 return (ENOSYS); 9507 9508 if (protp != NULL) 9509 *protp = PROT_ALL; 9510 9511 /* 9512 * Now validate that the caches are up to date. 9513 */ 9514 if (error = nfs4_validate_caches(vp, cr)) 9515 return (error); 9516 9517 mi = VTOMI4(vp); 9518 retry: 9519 mutex_enter(&rp->r_statelock); 9520 9521 /* 9522 * Don't create dirty pages faster than they 9523 * can be cleaned so that the system doesn't 9524 * get imbalanced. If the async queue is 9525 * maxed out, then wait for it to drain before 9526 * creating more dirty pages. Also, wait for 9527 * any threads doing pagewalks in the vop_getattr 9528 * entry points so that they don't block for 9529 * long periods. 9530 */ 9531 if (rw == S_CREATE) { 9532 while ((mi->mi_max_threads != 0 && 9533 rp->r_awcount > 2 * mi->mi_max_threads) || 9534 rp->r_gcount > 0) 9535 cv_wait(&rp->r_cv, &rp->r_statelock); 9536 } 9537 9538 /* 9539 * If we are getting called as a side effect of an nfs_write() 9540 * operation the local file size might not be extended yet. 9541 * In this case we want to be able to return pages of zeroes. 
9542 */ 9543 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9544 NFS4_DEBUG(nfs4_pageio_debug, 9545 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9546 "len=%llu, size=%llu, attrsize =%llu", off, 9547 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9548 mutex_exit(&rp->r_statelock); 9549 return (EFAULT); /* beyond EOF */ 9550 } 9551 9552 mutex_exit(&rp->r_statelock); 9553 9554 if (len <= PAGESIZE) { 9555 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9556 seg, addr, rw, cr); 9557 NFS4_DEBUG(nfs4_pageio_debug && error, 9558 (CE_NOTE, "getpage error %d; off=%lld, " 9559 "len=%lld", error, off, (u_longlong_t)len)); 9560 } else { 9561 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9562 pl, plsz, seg, addr, rw, cr); 9563 NFS4_DEBUG(nfs4_pageio_debug && error, 9564 (CE_NOTE, "getpages error %d; off=%lld, " 9565 "len=%lld", error, off, (u_longlong_t)len)); 9566 } 9567 9568 switch (error) { 9569 case NFS_EOF: 9570 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9571 goto retry; 9572 case ESTALE: 9573 nfs4_purge_stale_fh(error, vp, cr); 9574 } 9575 9576 return (error); 9577 } 9578 9579 /* 9580 * Called from pvn_getpages or nfs4_getpage to get a particular page. 
9581 */ 9582 /* ARGSUSED */ 9583 static int 9584 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9585 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9586 enum seg_rw rw, cred_t *cr) 9587 { 9588 rnode4_t *rp; 9589 uint_t bsize; 9590 struct buf *bp; 9591 page_t *pp; 9592 u_offset_t lbn; 9593 u_offset_t io_off; 9594 u_offset_t blkoff; 9595 u_offset_t rablkoff; 9596 size_t io_len; 9597 uint_t blksize; 9598 int error; 9599 int readahead; 9600 int readahead_issued = 0; 9601 int ra_window; /* readahead window */ 9602 page_t *pagefound; 9603 page_t *savepp; 9604 9605 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9606 return (EIO); 9607 9608 rp = VTOR4(vp); 9609 ASSERT(!IS_SHADOW(vp, rp)); 9610 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9611 9612 reread: 9613 bp = NULL; 9614 pp = NULL; 9615 pagefound = NULL; 9616 9617 if (pl != NULL) 9618 pl[0] = NULL; 9619 9620 error = 0; 9621 lbn = off / bsize; 9622 blkoff = lbn * bsize; 9623 9624 /* 9625 * Queueing up the readahead before doing the synchronous read 9626 * results in a significant increase in read throughput because 9627 * of the increased parallelism between the async threads and 9628 * the process context. 9629 */ 9630 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9631 rw != S_CREATE && 9632 !(vp->v_flag & VNOCACHE)) { 9633 mutex_enter(&rp->r_statelock); 9634 9635 /* 9636 * Calculate the number of readaheads to do. 9637 * a) No readaheads at offset = 0. 9638 * b) Do maximum(nfs4_nra) readaheads when the readahead 9639 * window is closed. 9640 * c) Do readaheads between 1 to (nfs4_nra - 1) depending 9641 * upon how far the readahead window is open or close. 9642 * d) No readaheads if rp->r_nextr is not within the scope 9643 * of the readahead window (random i/o). 
9644 */ 9645 9646 if (off == 0) 9647 readahead = 0; 9648 else if (blkoff == rp->r_nextr) 9649 readahead = nfs4_nra; 9650 else if (rp->r_nextr > blkoff && 9651 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9652 <= (nfs4_nra - 1))) 9653 readahead = nfs4_nra - ra_window; 9654 else 9655 readahead = 0; 9656 9657 rablkoff = rp->r_nextr; 9658 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9659 mutex_exit(&rp->r_statelock); 9660 if (nfs4_async_readahead(vp, rablkoff + bsize, 9661 addr + (rablkoff + bsize - off), 9662 seg, cr, nfs4_readahead) < 0) { 9663 mutex_enter(&rp->r_statelock); 9664 break; 9665 } 9666 readahead--; 9667 rablkoff += bsize; 9668 /* 9669 * Indicate that we did a readahead so 9670 * readahead offset is not updated 9671 * by the synchronous read below. 9672 */ 9673 readahead_issued = 1; 9674 mutex_enter(&rp->r_statelock); 9675 /* 9676 * set readahead offset to 9677 * offset of last async readahead 9678 * request. 9679 */ 9680 rp->r_nextr = rablkoff; 9681 } 9682 mutex_exit(&rp->r_statelock); 9683 } 9684 9685 again: 9686 if ((pagefound = page_exists(vp, off)) == NULL) { 9687 if (pl == NULL) { 9688 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9689 nfs4_readahead); 9690 } else if (rw == S_CREATE) { 9691 /* 9692 * Block for this page is not allocated, or the offset 9693 * is beyond the current allocation size, or we're 9694 * allocating a swap slot and the page was not found, 9695 * so allocate it and return a zero page. 
9696 */ 9697 if ((pp = page_create_va(vp, off, 9698 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9699 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9700 io_len = PAGESIZE; 9701 mutex_enter(&rp->r_statelock); 9702 rp->r_nextr = off + PAGESIZE; 9703 mutex_exit(&rp->r_statelock); 9704 } else { 9705 /* 9706 * Need to go to server to get a block 9707 */ 9708 mutex_enter(&rp->r_statelock); 9709 if (blkoff < rp->r_size && 9710 blkoff + bsize > rp->r_size) { 9711 /* 9712 * If less than a block left in 9713 * file read less than a block. 9714 */ 9715 if (rp->r_size <= off) { 9716 /* 9717 * Trying to access beyond EOF, 9718 * set up to get at least one page. 9719 */ 9720 blksize = off + PAGESIZE - blkoff; 9721 } else 9722 blksize = rp->r_size - blkoff; 9723 } else if ((off == 0) || 9724 (off != rp->r_nextr && !readahead_issued)) { 9725 blksize = PAGESIZE; 9726 blkoff = off; /* block = page here */ 9727 } else 9728 blksize = bsize; 9729 mutex_exit(&rp->r_statelock); 9730 9731 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 9732 &io_len, blkoff, blksize, 0); 9733 9734 /* 9735 * Some other thread has entered the page, 9736 * so just use it. 9737 */ 9738 if (pp == NULL) 9739 goto again; 9740 9741 /* 9742 * Now round the request size up to page boundaries. 9743 * This ensures that the entire page will be 9744 * initialized to zeroes if EOF is encountered. 9745 */ 9746 io_len = ptob(btopr(io_len)); 9747 9748 bp = pageio_setup(pp, io_len, vp, B_READ); 9749 ASSERT(bp != NULL); 9750 9751 /* 9752 * pageio_setup should have set b_addr to 0. This 9753 * is correct since we want to do I/O on a page 9754 * boundary. bp_mapin will use this addr to calculate 9755 * an offset, and then set b_addr to the kernel virtual 9756 * address it allocated for us. 
9757 */ 9758 ASSERT(bp->b_un.b_addr == 0); 9759 9760 bp->b_edev = 0; 9761 bp->b_dev = 0; 9762 bp->b_lblkno = lbtodb(io_off); 9763 bp->b_file = vp; 9764 bp->b_offset = (offset_t)off; 9765 bp_mapin(bp); 9766 9767 /* 9768 * If doing a write beyond what we believe is EOF, 9769 * don't bother trying to read the pages from the 9770 * server, we'll just zero the pages here. We 9771 * don't check that the rw flag is S_WRITE here 9772 * because some implementations may attempt a 9773 * read access to the buffer before copying data. 9774 */ 9775 mutex_enter(&rp->r_statelock); 9776 if (io_off >= rp->r_size && seg == segkmap) { 9777 mutex_exit(&rp->r_statelock); 9778 bzero(bp->b_un.b_addr, io_len); 9779 } else { 9780 mutex_exit(&rp->r_statelock); 9781 error = nfs4_bio(bp, NULL, cr, FALSE); 9782 } 9783 9784 /* 9785 * Unmap the buffer before freeing it. 9786 */ 9787 bp_mapout(bp); 9788 pageio_done(bp); 9789 9790 savepp = pp; 9791 do { 9792 pp->p_fsdata = C_NOCOMMIT; 9793 } while ((pp = pp->p_next) != savepp); 9794 9795 if (error == NFS_EOF) { 9796 /* 9797 * If doing a write system call just return 9798 * zeroed pages, else user tried to get pages 9799 * beyond EOF, return error. We don't check 9800 * that the rw flag is S_WRITE here because 9801 * some implementations may attempt a read 9802 * access to the buffer before copying data. 9803 */ 9804 if (seg == segkmap) 9805 error = 0; 9806 else 9807 error = EFAULT; 9808 } 9809 9810 if (!readahead_issued && !error) { 9811 mutex_enter(&rp->r_statelock); 9812 rp->r_nextr = io_off + io_len; 9813 mutex_exit(&rp->r_statelock); 9814 } 9815 } 9816 } 9817 9818 out: 9819 if (pl == NULL) 9820 return (error); 9821 9822 if (error) { 9823 if (pp != NULL) 9824 pvn_read_done(pp, B_ERROR); 9825 return (error); 9826 } 9827 9828 if (pagefound) { 9829 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 9830 9831 /* 9832 * Page exists in the cache, acquire the appropriate lock. 9833 * If this fails, start all over again. 
9834 */ 9835 if ((pp = page_lookup(vp, off, se)) == NULL) { 9836 #ifdef DEBUG 9837 nfs4_lostpage++; 9838 #endif 9839 goto reread; 9840 } 9841 pl[0] = pp; 9842 pl[1] = NULL; 9843 return (0); 9844 } 9845 9846 if (pp != NULL) 9847 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 9848 9849 return (error); 9850 } 9851 9852 static void 9853 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 9854 cred_t *cr) 9855 { 9856 int error; 9857 page_t *pp; 9858 u_offset_t io_off; 9859 size_t io_len; 9860 struct buf *bp; 9861 uint_t bsize, blksize; 9862 rnode4_t *rp = VTOR4(vp); 9863 page_t *savepp; 9864 9865 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9866 9867 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9868 9869 mutex_enter(&rp->r_statelock); 9870 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 9871 /* 9872 * If less than a block left in file read less 9873 * than a block. 9874 */ 9875 blksize = rp->r_size - blkoff; 9876 } else 9877 blksize = bsize; 9878 mutex_exit(&rp->r_statelock); 9879 9880 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 9881 &io_off, &io_len, blkoff, blksize, 1); 9882 /* 9883 * The isra flag passed to the kluster function is 1, we may have 9884 * gotten a return value of NULL for a variety of reasons (# of free 9885 * pages < minfree, someone entered the page on the vnode etc). In all 9886 * cases, we want to punt on the readahead. 9887 */ 9888 if (pp == NULL) 9889 return; 9890 9891 /* 9892 * Now round the request size up to page boundaries. 9893 * This ensures that the entire page will be 9894 * initialized to zeroes if EOF is encountered. 9895 */ 9896 io_len = ptob(btopr(io_len)); 9897 9898 bp = pageio_setup(pp, io_len, vp, B_READ); 9899 ASSERT(bp != NULL); 9900 9901 /* 9902 * pageio_setup should have set b_addr to 0. This is correct since 9903 * we want to do I/O on a page boundary. bp_mapin() will use this addr 9904 * to calculate an offset, and then set b_addr to the kernel virtual 9905 * address it allocated for us. 
9906 */ 9907 ASSERT(bp->b_un.b_addr == 0); 9908 9909 bp->b_edev = 0; 9910 bp->b_dev = 0; 9911 bp->b_lblkno = lbtodb(io_off); 9912 bp->b_file = vp; 9913 bp->b_offset = (offset_t)blkoff; 9914 bp_mapin(bp); 9915 9916 /* 9917 * If doing a write beyond what we believe is EOF, don't bother trying 9918 * to read the pages from the server, we'll just zero the pages here. 9919 * We don't check that the rw flag is S_WRITE here because some 9920 * implementations may attempt a read access to the buffer before 9921 * copying data. 9922 */ 9923 mutex_enter(&rp->r_statelock); 9924 if (io_off >= rp->r_size && seg == segkmap) { 9925 mutex_exit(&rp->r_statelock); 9926 bzero(bp->b_un.b_addr, io_len); 9927 error = 0; 9928 } else { 9929 mutex_exit(&rp->r_statelock); 9930 error = nfs4_bio(bp, NULL, cr, TRUE); 9931 if (error == NFS_EOF) 9932 error = 0; 9933 } 9934 9935 /* 9936 * Unmap the buffer before freeing it. 9937 */ 9938 bp_mapout(bp); 9939 pageio_done(bp); 9940 9941 savepp = pp; 9942 do { 9943 pp->p_fsdata = C_NOCOMMIT; 9944 } while ((pp = pp->p_next) != savepp); 9945 9946 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 9947 9948 /* 9949 * In case of error set readahead offset 9950 * to the lowest offset. 9951 * pvn_read_done() calls VN_DISPOSE to destroy the pages 9952 */ 9953 if (error && rp->r_nextr > io_off) { 9954 mutex_enter(&rp->r_statelock); 9955 if (rp->r_nextr > io_off) 9956 rp->r_nextr = io_off; 9957 mutex_exit(&rp->r_statelock); 9958 } 9959 } 9960 9961 /* 9962 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 9963 * If len == 0, do from off to EOF. 9964 * 9965 * The normal cases should be len == 0 && off == 0 (entire vp list) or 9966 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 9967 * (from pageout). 
9968 */ 9969 static int 9970 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 9971 { 9972 int error; 9973 rnode4_t *rp; 9974 9975 ASSERT(cr != NULL); 9976 9977 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 9978 return (EIO); 9979 9980 rp = VTOR4(vp); 9981 if (IS_SHADOW(vp, rp)) 9982 vp = RTOV4(rp); 9983 9984 /* 9985 * XXX - Why should this check be made here? 9986 */ 9987 if (vp->v_flag & VNOMAP) 9988 return (ENOSYS); 9989 9990 if (len == 0 && !(flags & B_INVAL) && 9991 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 9992 return (0); 9993 9994 mutex_enter(&rp->r_statelock); 9995 rp->r_count++; 9996 mutex_exit(&rp->r_statelock); 9997 error = nfs4_putpages(vp, off, len, flags, cr); 9998 mutex_enter(&rp->r_statelock); 9999 rp->r_count--; 10000 cv_broadcast(&rp->r_cv); 10001 mutex_exit(&rp->r_statelock); 10002 10003 return (error); 10004 } 10005 10006 /* 10007 * Write out a single page, possibly klustering adjacent dirty pages. 10008 */ 10009 int 10010 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10011 int flags, cred_t *cr) 10012 { 10013 u_offset_t io_off; 10014 u_offset_t lbn_off; 10015 u_offset_t lbn; 10016 size_t io_len; 10017 uint_t bsize; 10018 int error; 10019 rnode4_t *rp; 10020 10021 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10022 ASSERT(pp != NULL); 10023 ASSERT(cr != NULL); 10024 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10025 10026 rp = VTOR4(vp); 10027 ASSERT(rp->r_count > 0); 10028 ASSERT(!IS_SHADOW(vp, rp)); 10029 10030 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10031 lbn = pp->p_offset / bsize; 10032 lbn_off = lbn * bsize; 10033 10034 /* 10035 * Find a kluster that fits in one block, or in 10036 * one page if pages are bigger than blocks. If 10037 * there is less file space allocated than a whole 10038 * page, we'll shorten the i/o request below. 
10039 */ 10040 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10041 roundup(bsize, PAGESIZE), flags); 10042 10043 /* 10044 * pvn_write_kluster shouldn't have returned a page with offset 10045 * behind the original page we were given. Verify that. 10046 */ 10047 ASSERT((pp->p_offset / bsize) >= lbn); 10048 10049 /* 10050 * Now pp will have the list of kept dirty pages marked for 10051 * write back. It will also handle invalidation and freeing 10052 * of pages that are not dirty. Check for page length rounding 10053 * problems. 10054 */ 10055 if (io_off + io_len > lbn_off + bsize) { 10056 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10057 io_len = lbn_off + bsize - io_off; 10058 } 10059 /* 10060 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10061 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10062 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10063 * progress and the r_size has not been made consistent with the 10064 * new size of the file. When the uiomove() completes the r_size is 10065 * updated and the R4MODINPROGRESS flag is cleared. 10066 * 10067 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10068 * consistent value of r_size. Without this handshaking, it is 10069 * possible that nfs4_bio() picks up the old value of r_size 10070 * before the uiomove() in writerp4() completes. This will result 10071 * in the write through nfs4_bio() being dropped. 10072 * 10073 * More precisely, there is a window between the time the uiomove() 10074 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10075 * operation intervenes in this window, the page will be picked up, 10076 * because it is dirty (it will be unlocked, unless it was 10077 * pagecreate'd). When the page is picked up as dirty, the dirty 10078 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10079 * checked. This will still be the old size. Therefore the page will 10080 * not be written out. 
When segmap_release() calls VOP_PUTPAGE(), 10081 * the page will be found to be clean and the write will be dropped. 10082 */ 10083 if (rp->r_flags & R4MODINPROGRESS) { 10084 mutex_enter(&rp->r_statelock); 10085 if ((rp->r_flags & R4MODINPROGRESS) && 10086 rp->r_modaddr + MAXBSIZE > io_off && 10087 rp->r_modaddr < io_off + io_len) { 10088 page_t *plist; 10089 /* 10090 * A write is in progress for this region of the file. 10091 * If we did not detect R4MODINPROGRESS here then this 10092 * path through nfs_putapage() would eventually go to 10093 * nfs4_bio() and may not write out all of the data 10094 * in the pages. We end up losing data. So we decide 10095 * to set the modified bit on each page in the page 10096 * list and mark the rnode with R4DIRTY. This write 10097 * will be restarted at some later time. 10098 */ 10099 plist = pp; 10100 while (plist != NULL) { 10101 pp = plist; 10102 page_sub(&plist, pp); 10103 hat_setmod(pp); 10104 page_io_unlock(pp); 10105 page_unlock(pp); 10106 } 10107 rp->r_flags |= R4DIRTY; 10108 mutex_exit(&rp->r_statelock); 10109 if (offp) 10110 *offp = io_off; 10111 if (lenp) 10112 *lenp = io_len; 10113 return (0); 10114 } 10115 mutex_exit(&rp->r_statelock); 10116 } 10117 10118 if (flags & B_ASYNC) { 10119 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10120 nfs4_sync_putapage); 10121 } else 10122 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10123 10124 if (offp) 10125 *offp = io_off; 10126 if (lenp) 10127 *lenp = io_len; 10128 return (error); 10129 } 10130 10131 static int 10132 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10133 int flags, cred_t *cr) 10134 { 10135 int error; 10136 rnode4_t *rp; 10137 10138 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10139 10140 flags |= B_WRITE; 10141 10142 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10143 10144 rp = VTOR4(vp); 10145 10146 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10147 error == EACCES) && 
10148 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10149 if (!(rp->r_flags & R4OUTOFSPACE)) { 10150 mutex_enter(&rp->r_statelock); 10151 rp->r_flags |= R4OUTOFSPACE; 10152 mutex_exit(&rp->r_statelock); 10153 } 10154 flags |= B_ERROR; 10155 pvn_write_done(pp, flags); 10156 /* 10157 * If this was not an async thread, then try again to 10158 * write out the pages, but this time, also destroy 10159 * them whether or not the write is successful. This 10160 * will prevent memory from filling up with these 10161 * pages and destroying them is the only alternative 10162 * if they can't be written out. 10163 * 10164 * Don't do this if this is an async thread because 10165 * when the pages are unlocked in pvn_write_done, 10166 * some other thread could have come along, locked 10167 * them, and queued for an async thread. It would be 10168 * possible for all of the async threads to be tied 10169 * up waiting to lock the pages again and they would 10170 * all already be locked and waiting for an async 10171 * thread to handle them. Deadlock. 
10172 */ 10173 if (!(flags & B_ASYNC)) { 10174 error = nfs4_putpage(vp, io_off, io_len, 10175 B_INVAL | B_FORCE, cr); 10176 } 10177 } else { 10178 if (error) 10179 flags |= B_ERROR; 10180 else if (rp->r_flags & R4OUTOFSPACE) { 10181 mutex_enter(&rp->r_statelock); 10182 rp->r_flags &= ~R4OUTOFSPACE; 10183 mutex_exit(&rp->r_statelock); 10184 } 10185 pvn_write_done(pp, flags); 10186 if (freemem < desfree) 10187 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10188 NFS4_WRITE_NOWAIT); 10189 } 10190 10191 return (error); 10192 } 10193 10194 #ifdef DEBUG 10195 int nfs4_force_open_before_mmap = 0; 10196 #endif 10197 10198 static int 10199 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10200 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10201 { 10202 struct segvn_crargs vn_a; 10203 int error = 0; 10204 rnode4_t *rp = VTOR4(vp); 10205 mntinfo4_t *mi = VTOMI4(vp); 10206 10207 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10208 return (EIO); 10209 10210 if (vp->v_flag & VNOMAP) 10211 return (ENOSYS); 10212 10213 if (off < 0 || (off + len) < 0) 10214 return (ENXIO); 10215 10216 if (vp->v_type != VREG) 10217 return (ENODEV); 10218 10219 /* 10220 * If the file is delegated to the client don't do anything. 10221 * If the file is not delegated, then validate the data cache. 10222 */ 10223 mutex_enter(&rp->r_statev4_lock); 10224 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10225 mutex_exit(&rp->r_statev4_lock); 10226 error = nfs4_validate_caches(vp, cr); 10227 if (error) 10228 return (error); 10229 } else { 10230 mutex_exit(&rp->r_statev4_lock); 10231 } 10232 10233 /* 10234 * Check to see if the vnode is currently marked as not cachable. 10235 * This means portions of the file are locked (through VOP_FRLOCK). 10236 * In this case the map request must be refused. We use 10237 * rp->r_lkserlock to avoid a race with concurrent lock requests. 
10238 */ 10239 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 10240 return (EINTR); 10241 10242 if (vp->v_flag & VNOCACHE) { 10243 error = EAGAIN; 10244 goto done; 10245 } 10246 10247 /* 10248 * Don't allow concurrent locks and mapping if mandatory locking is 10249 * enabled. 10250 */ 10251 if (flk_has_remote_locks(vp)) { 10252 struct vattr va; 10253 va.va_mask = AT_MODE; 10254 error = nfs4getattr(vp, &va, cr); 10255 if (error != 0) 10256 goto done; 10257 if (MANDLOCK(vp, va.va_mode)) { 10258 error = EAGAIN; 10259 goto done; 10260 } 10261 } 10262 10263 /* 10264 * It is possible that the rnode has a lost lock request that we 10265 * are still trying to recover, and that the request conflicts with 10266 * this map request. 10267 * 10268 * An alternative approach would be for nfs4_safemap() to consider 10269 * queued lock requests when deciding whether to set or clear 10270 * VNOCACHE. This would require the frlock code path to call 10271 * nfs4_safemap() after enqueing a lost request. 
10272 */ 10273 if (nfs4_map_lost_lock_conflict(vp)) { 10274 error = EAGAIN; 10275 goto done; 10276 } 10277 10278 as_rangelock(as); 10279 if (!(flags & MAP_FIXED)) { 10280 map_addr(addrp, len, off, 1, flags); 10281 if (*addrp == NULL) { 10282 as_rangeunlock(as); 10283 error = ENOMEM; 10284 goto done; 10285 } 10286 } else { 10287 /* 10288 * User specified address - blow away any previous mappings 10289 */ 10290 (void) as_unmap(as, *addrp, len); 10291 } 10292 10293 if (vp->v_type == VREG) { 10294 /* 10295 * We need to retrieve the open stream 10296 */ 10297 nfs4_open_stream_t *osp = NULL; 10298 nfs4_open_owner_t *oop = NULL; 10299 10300 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10301 if (oop != NULL) { 10302 /* returns with 'os_sync_lock' held */ 10303 osp = find_open_stream(oop, rp); 10304 open_owner_rele(oop); 10305 } 10306 if (osp == NULL) { 10307 #ifdef DEBUG 10308 if (nfs4_force_open_before_mmap) { 10309 error = EIO; 10310 goto done; 10311 } 10312 #endif 10313 /* returns with 'os_sync_lock' held */ 10314 error = open_and_get_osp(vp, cr, &osp); 10315 if (osp == NULL) { 10316 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10317 "nfs4_map: we tried to OPEN the file " 10318 "but again no osp, so fail with EIO")); 10319 goto done; 10320 } 10321 } 10322 10323 if (osp->os_failed_reopen) { 10324 mutex_exit(&osp->os_sync_lock); 10325 open_stream_rele(osp, rp); 10326 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10327 "nfs4_map: os_failed_reopen set on " 10328 "osp %p, cr %p, rp %s", (void *)osp, 10329 (void *)cr, rnode4info(rp))); 10330 error = EIO; 10331 goto done; 10332 } 10333 mutex_exit(&osp->os_sync_lock); 10334 open_stream_rele(osp, rp); 10335 } 10336 10337 vn_a.vp = vp; 10338 vn_a.offset = off; 10339 vn_a.type = (flags & MAP_TYPE); 10340 vn_a.prot = (uchar_t)prot; 10341 vn_a.maxprot = (uchar_t)maxprot; 10342 vn_a.flags = (flags & ~MAP_TYPE); 10343 vn_a.cred = cr; 10344 vn_a.amp = NULL; 10345 vn_a.szc = 0; 10346 vn_a.lgrp_mem_policy_flags = 0; 10347 10348 error = 
as_map(as, *addrp, len, segvn_create, &vn_a); 10349 as_rangeunlock(as); 10350 10351 done: 10352 nfs_rw_exit(&rp->r_lkserlock); 10353 return (error); 10354 } 10355 10356 /* 10357 * We're most likely dealing with a kernel module that likes to READ 10358 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10359 * officially OPEN the file to create the necessary client state 10360 * for bookkeeping of os_mmap_read/write counts. 10361 * 10362 * Since VOP_MAP only passes in a pointer to the vnode rather than 10363 * a double pointer, we can't handle the case where nfs4open_otw() 10364 * returns a different vnode than the one passed into VOP_MAP (since 10365 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10366 * we return NULL and let nfs4_map() fail. Note: the only case where 10367 * this should happen is if the file got removed and replaced with the 10368 * same name on the server (in addition to the fact that we're trying 10369 * to VOP_MAP withouth VOP_OPENing the file in the first place). 
10370 */ 10371 static int 10372 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10373 { 10374 rnode4_t *rp, *drp; 10375 vnode_t *dvp, *open_vp; 10376 char file_name[MAXNAMELEN]; 10377 int just_created; 10378 nfs4_open_stream_t *osp; 10379 nfs4_open_owner_t *oop; 10380 int error; 10381 10382 *ospp = NULL; 10383 open_vp = map_vp; 10384 10385 rp = VTOR4(open_vp); 10386 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10387 return (error); 10388 drp = VTOR4(dvp); 10389 10390 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10391 VN_RELE(dvp); 10392 return (EINTR); 10393 } 10394 10395 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10396 nfs_rw_exit(&drp->r_rwlock); 10397 VN_RELE(dvp); 10398 return (error); 10399 } 10400 10401 mutex_enter(&rp->r_statev4_lock); 10402 if (rp->created_v4) { 10403 rp->created_v4 = 0; 10404 mutex_exit(&rp->r_statev4_lock); 10405 10406 dnlc_update(dvp, file_name, open_vp); 10407 /* This is needed so we don't bump the open ref count */ 10408 just_created = 1; 10409 } else { 10410 mutex_exit(&rp->r_statev4_lock); 10411 just_created = 0; 10412 } 10413 10414 VN_HOLD(map_vp); 10415 10416 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10417 just_created); 10418 if (error) { 10419 nfs_rw_exit(&drp->r_rwlock); 10420 VN_RELE(dvp); 10421 VN_RELE(map_vp); 10422 return (error); 10423 } 10424 10425 nfs_rw_exit(&drp->r_rwlock); 10426 VN_RELE(dvp); 10427 10428 /* 10429 * If nfs4open_otw() returned a different vnode then "undo" 10430 * the open and return failure to the caller. 10431 */ 10432 if (!VN_CMP(open_vp, map_vp)) { 10433 nfs4_error_t e; 10434 10435 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10436 "open returned a different vnode")); 10437 /* 10438 * If there's an error, ignore it, 10439 * and let VOP_INACTIVE handle it. 
10440 */ 10441 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10442 CLOSE_NORM, 0, 0, 0); 10443 VN_RELE(map_vp); 10444 return (EIO); 10445 } 10446 10447 VN_RELE(map_vp); 10448 10449 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10450 if (!oop) { 10451 nfs4_error_t e; 10452 10453 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10454 "no open owner")); 10455 /* 10456 * If there's an error, ignore it, 10457 * and let VOP_INACTIVE handle it. 10458 */ 10459 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10460 CLOSE_NORM, 0, 0, 0); 10461 return (EIO); 10462 } 10463 osp = find_open_stream(oop, rp); 10464 open_owner_rele(oop); 10465 *ospp = osp; 10466 return (0); 10467 } 10468 10469 /* 10470 * Please be aware that when this function is called, the address space write 10471 * a_lock is held. Do not put over the wire calls in this function. 10472 */ 10473 /* ARGSUSED */ 10474 static int 10475 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10476 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10477 { 10478 rnode4_t *rp; 10479 int error = 0; 10480 mntinfo4_t *mi; 10481 10482 mi = VTOMI4(vp); 10483 rp = VTOR4(vp); 10484 10485 if (nfs_zone() != mi->mi_zone) 10486 return (EIO); 10487 if (vp->v_flag & VNOMAP) 10488 return (ENOSYS); 10489 10490 /* 10491 * Need to hold rwlock while incrementing the mapcnt so that 10492 * mmap'ing can be serialized with writes so that the caching 10493 * can be handled correctly. 10494 * 10495 * Don't need to update the open stream first, since this 10496 * mmap can't add any additional share access that isn't 10497 * already contained in the open stream (for the case where we 10498 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10499 * take into account os_mmap_read[write] counts). 
10500 */ 10501 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10502 return (EINTR); 10503 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10504 nfs_rw_exit(&rp->r_rwlock); 10505 10506 if (vp->v_type == VREG) { 10507 /* 10508 * We need to retrieve the open stream and update the counts. 10509 * If there is no open stream here, something is wrong. 10510 */ 10511 nfs4_open_stream_t *osp = NULL; 10512 nfs4_open_owner_t *oop = NULL; 10513 10514 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10515 if (oop != NULL) { 10516 /* returns with 'os_sync_lock' held */ 10517 osp = find_open_stream(oop, rp); 10518 open_owner_rele(oop); 10519 } 10520 if (osp == NULL) { 10521 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10522 "nfs4_addmap: we should have an osp" 10523 "but we don't, so fail with EIO")); 10524 error = EIO; 10525 goto out; 10526 } 10527 10528 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10529 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10530 10531 /* 10532 * Update the map count in the open stream. 10533 * This is necessary in the case where we 10534 * open/mmap/close/, then the server reboots, and we 10535 * attempt to reopen. If the mmap doesn't add share 10536 * access then we send an invalid reopen with 10537 * access = NONE. 10538 * 10539 * We need to specifically check each PROT_* so a mmap 10540 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10541 * read and write access. A simple comparison of prot 10542 * to ~PROT_WRITE to determine read access is insufficient 10543 * since prot can be |= with PROT_USER, etc. 
10544 */ 10545 10546 /* 10547 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10548 */ 10549 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10550 osp->os_mmap_write += btopr(len); 10551 if (maxprot & PROT_READ) 10552 osp->os_mmap_read += btopr(len); 10553 if (maxprot & PROT_EXEC) 10554 osp->os_mmap_read += btopr(len); 10555 /* 10556 * Ensure that os_mmap_read gets incremented, even if 10557 * maxprot were to look like PROT_NONE. 10558 */ 10559 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10560 !(maxprot & PROT_EXEC)) 10561 osp->os_mmap_read += btopr(len); 10562 osp->os_mapcnt += btopr(len); 10563 mutex_exit(&osp->os_sync_lock); 10564 open_stream_rele(osp, rp); 10565 } 10566 10567 out: 10568 /* 10569 * If we got an error, then undo our 10570 * incrementing of 'r_mapcnt'. 10571 */ 10572 10573 if (error) { 10574 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10575 ASSERT(rp->r_mapcnt >= 0); 10576 } 10577 return (error); 10578 } 10579 10580 static int 10581 nfs4_cmp(vnode_t *vp1, vnode_t *vp2) 10582 { 10583 10584 return (VTOR4(vp1) == VTOR4(vp2)); 10585 } 10586 10587 static int 10588 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10589 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 10590 { 10591 int rc; 10592 u_offset_t start, end; 10593 rnode4_t *rp; 10594 int error = 0, intr = INTR4(vp); 10595 nfs4_error_t e; 10596 10597 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10598 return (EIO); 10599 10600 /* check for valid cmd parameter */ 10601 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10602 return (EINVAL); 10603 10604 /* Verify l_type. 
*/ 10605 switch (bfp->l_type) { 10606 case F_RDLCK: 10607 if (cmd != F_GETLK && !(flag & FREAD)) 10608 return (EBADF); 10609 break; 10610 case F_WRLCK: 10611 if (cmd != F_GETLK && !(flag & FWRITE)) 10612 return (EBADF); 10613 break; 10614 case F_UNLCK: 10615 intr = 0; 10616 break; 10617 10618 default: 10619 return (EINVAL); 10620 } 10621 10622 /* check the validity of the lock range */ 10623 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10624 return (rc); 10625 if (rc = flk_check_lock_data(start, end, MAXEND)) 10626 return (rc); 10627 10628 /* 10629 * If the filesystem is mounted using local locking, pass the 10630 * request off to the local locking code. 10631 */ 10632 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10633 if (cmd == F_SETLK || cmd == F_SETLKW) { 10634 /* 10635 * For complete safety, we should be holding 10636 * r_lkserlock. However, we can't call 10637 * nfs4_safelock and then fs_frlock while 10638 * holding r_lkserlock, so just invoke 10639 * nfs4_safelock and expect that this will 10640 * catch enough of the cases. 10641 */ 10642 if (!nfs4_safelock(vp, bfp, cr)) 10643 return (EAGAIN); 10644 } 10645 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 10646 } 10647 10648 rp = VTOR4(vp); 10649 10650 /* 10651 * Check whether the given lock request can proceed, given the 10652 * current file mappings. 10653 */ 10654 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10655 return (EINTR); 10656 if (cmd == F_SETLK || cmd == F_SETLKW) { 10657 if (!nfs4_safelock(vp, bfp, cr)) { 10658 rc = EAGAIN; 10659 goto done; 10660 } 10661 } 10662 10663 /* 10664 * Flush the cache after waiting for async I/O to finish. For new 10665 * locks, this is so that the process gets the latest bits from the 10666 * server. For unlocks, this is so that other clients see the 10667 * latest bits once the file has been unlocked. If currently dirty 10668 * pages can't be flushed, then don't allow a lock to be set. 
But 10669 * allow unlocks to succeed, to avoid having orphan locks on the 10670 * server. 10671 */ 10672 if (cmd != F_GETLK) { 10673 mutex_enter(&rp->r_statelock); 10674 while (rp->r_count > 0) { 10675 if (intr) { 10676 klwp_t *lwp = ttolwp(curthread); 10677 10678 if (lwp != NULL) 10679 lwp->lwp_nostop++; 10680 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 10681 if (lwp != NULL) 10682 lwp->lwp_nostop--; 10683 rc = EINTR; 10684 break; 10685 } 10686 if (lwp != NULL) 10687 lwp->lwp_nostop--; 10688 } else 10689 cv_wait(&rp->r_cv, &rp->r_statelock); 10690 } 10691 mutex_exit(&rp->r_statelock); 10692 if (rc != 0) 10693 goto done; 10694 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 10695 if (error) { 10696 if (error == ENOSPC || error == EDQUOT) { 10697 mutex_enter(&rp->r_statelock); 10698 if (!rp->r_error) 10699 rp->r_error = error; 10700 mutex_exit(&rp->r_statelock); 10701 } 10702 if (bfp->l_type != F_UNLCK) { 10703 rc = ENOLCK; 10704 goto done; 10705 } 10706 } 10707 } 10708 10709 /* 10710 * Call the lock manager to do the real work of contacting 10711 * the server and obtaining the lock. 10712 */ 10713 10714 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10715 cr, &e, NULL, NULL); 10716 rc = e.error; 10717 10718 if (rc == 0) 10719 nfs4_lockcompletion(vp, cmd); 10720 10721 done: 10722 nfs_rw_exit(&rp->r_lkserlock); 10723 10724 return (rc); 10725 } 10726 10727 /* 10728 * Free storage space associated with the specified vnode. The portion 10729 * to be freed is specified by bfp->l_start and bfp->l_len (already 10730 * normalized to a "whence" of 0). 10731 * 10732 * This is an experimental facility whose continued existence is not 10733 * guaranteed. Currently, we only support the special case 10734 * of l_len == 0, meaning free to end of file. 
10735 */ 10736 /* ARGSUSED */ 10737 static int 10738 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10739 offset_t offset, cred_t *cr, caller_context_t *ct) 10740 { 10741 int error; 10742 10743 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10744 return (EIO); 10745 ASSERT(vp->v_type == VREG); 10746 if (cmd != F_FREESP) 10747 return (EINVAL); 10748 10749 error = convoff(vp, bfp, 0, offset); 10750 if (!error) { 10751 ASSERT(bfp->l_start >= 0); 10752 if (bfp->l_len == 0) { 10753 struct vattr va; 10754 10755 va.va_mask = AT_SIZE; 10756 va.va_size = bfp->l_start; 10757 error = nfs4setattr(vp, &va, 0, cr, NULL); 10758 } else 10759 error = EINVAL; 10760 } 10761 10762 return (error); 10763 } 10764 10765 /* ARGSUSED */ 10766 static int 10767 nfs4_realvp(vnode_t *vp, vnode_t **vpp) 10768 { 10769 return (EINVAL); 10770 } 10771 10772 /* 10773 * Setup and add an address space callback to do the work of the delmap call. 10774 * The callback will (and must be) deleted in the actual callback function. 10775 * 10776 * This is done in order to take care of the problem that we have with holding 10777 * the address space's a_lock for a long period of time (e.g. if the NFS server 10778 * is down). Callbacks will be executed in the address space code while the 10779 * a_lock is not held. Holding the address space's a_lock causes things such 10780 * as ps and fork to hang because they are trying to acquire this lock as well. 10781 */ 10782 /* ARGSUSED */ 10783 static int 10784 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10785 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 10786 { 10787 int caller_found; 10788 int error; 10789 rnode4_t *rp; 10790 nfs4_delmap_args_t *dmapp; 10791 nfs4_delmapcall_t *delmap_call; 10792 10793 if (vp->v_flag & VNOMAP) 10794 return (ENOSYS); 10795 10796 /* 10797 * A process may not change zones if it has NFS pages mmap'ed 10798 * in, so we can't legitimately get here from the wrong zone. 
10799 */ 10800 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10801 10802 rp = VTOR4(vp); 10803 10804 /* 10805 * The way that the address space of this process deletes its mapping 10806 * of this file is via the following call chains: 10807 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10808 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10809 * 10810 * With the use of address space callbacks we are allowed to drop the 10811 * address space lock, a_lock, while executing the NFS operations that 10812 * need to go over the wire. Returning EAGAIN to the caller of this 10813 * function is what drives the execution of the callback that we add 10814 * below. The callback will be executed by the address space code 10815 * after dropping the a_lock. When the callback is finished, since 10816 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 10817 * is called again on the same segment to finish the rest of the work 10818 * that needs to happen during unmapping. 10819 * 10820 * This action of calling back into the segment driver causes 10821 * nfs4_delmap() to get called again, but since the callback was 10822 * already executed at this point, it already did the work and there 10823 * is nothing left for us to do. 10824 * 10825 * To Summarize: 10826 * - The first time nfs4_delmap is called by the current thread is when 10827 * we add the caller associated with this delmap to the delmap caller 10828 * list, add the callback, and return EAGAIN. 10829 * - The second time in this call chain when nfs4_delmap is called we 10830 * will find this caller in the delmap caller list and realize there 10831 * is no more work to do thus removing this caller from the list and 10832 * returning the error that was set in the callback execution. 10833 */ 10834 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 10835 if (caller_found) { 10836 /* 10837 * 'error' is from the actual delmap operations. 
To avoid 10838 * hangs, we need to handle the return of EAGAIN differently 10839 * since this is what drives the callback execution. 10840 * In this case, we don't want to return EAGAIN and do the 10841 * callback execution because there are none to execute. 10842 */ 10843 if (error == EAGAIN) 10844 return (0); 10845 else 10846 return (error); 10847 } 10848 10849 /* current caller was not in the list */ 10850 delmap_call = nfs4_init_delmapcall(); 10851 10852 mutex_enter(&rp->r_statelock); 10853 list_insert_tail(&rp->r_indelmap, delmap_call); 10854 mutex_exit(&rp->r_statelock); 10855 10856 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 10857 10858 dmapp->vp = vp; 10859 dmapp->off = off; 10860 dmapp->addr = addr; 10861 dmapp->len = len; 10862 dmapp->prot = prot; 10863 dmapp->maxprot = maxprot; 10864 dmapp->flags = flags; 10865 dmapp->cr = cr; 10866 dmapp->caller = delmap_call; 10867 10868 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 10869 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 10870 10871 return (error ? error : EAGAIN); 10872 } 10873 10874 static nfs4_delmapcall_t * 10875 nfs4_init_delmapcall() 10876 { 10877 nfs4_delmapcall_t *delmap_call; 10878 10879 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 10880 delmap_call->call_id = curthread; 10881 delmap_call->error = 0; 10882 10883 return (delmap_call); 10884 } 10885 10886 static void 10887 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 10888 { 10889 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 10890 } 10891 10892 /* 10893 * Searches for the current delmap caller (based on curthread) in the list of 10894 * callers. If it is found, we remove it and free the delmap caller. 10895 * Returns: 10896 * 0 if the caller wasn't found 10897 * 1 if the caller was found, removed and freed. *errp will be set 10898 * to what the result of the delmap was. 
10899 */ 10900 static int 10901 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 10902 { 10903 nfs4_delmapcall_t *delmap_call; 10904 10905 /* 10906 * If the list doesn't exist yet, we create it and return 10907 * that the caller wasn't found. No list = no callers. 10908 */ 10909 mutex_enter(&rp->r_statelock); 10910 if (!(rp->r_flags & R4DELMAPLIST)) { 10911 /* The list does not exist */ 10912 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 10913 offsetof(nfs4_delmapcall_t, call_node)); 10914 rp->r_flags |= R4DELMAPLIST; 10915 mutex_exit(&rp->r_statelock); 10916 return (0); 10917 } else { 10918 /* The list exists so search it */ 10919 for (delmap_call = list_head(&rp->r_indelmap); 10920 delmap_call != NULL; 10921 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 10922 if (delmap_call->call_id == curthread) { 10923 /* current caller is in the list */ 10924 *errp = delmap_call->error; 10925 list_remove(&rp->r_indelmap, delmap_call); 10926 mutex_exit(&rp->r_statelock); 10927 nfs4_free_delmapcall(delmap_call); 10928 return (1); 10929 } 10930 } 10931 } 10932 mutex_exit(&rp->r_statelock); 10933 return (0); 10934 } 10935 10936 /* 10937 * Remove some pages from an mmap'd vnode. Just update the 10938 * count of pages. If doing close-to-open, then flush and 10939 * commit all of the pages associated with this file. 10940 * Otherwise, start an asynchronous page flush to write out 10941 * any dirty pages. This will also associate a credential 10942 * with the rnode which can be used to write the pages. 
*/
/* ARGSUSED */
static void
nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
{
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;

	rp = VTOR4(dmapp->vp);
	mi = VTOMI4(dmapp->vp);

	/* Drop the page count that nfs4_addmap() added for this mapping. */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush and potential commit if there are
	 * pages, the file system was not mounted readonly, the segment
	 * was mapped shared, and the pages themselves were writeable.
	 */
	if (nfs4_has_pages(dmapp->vp) &&
	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);
		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
		    dmapp->len, dmapp->cr);
		if (!e.error) {
			/* Pick up, and clear, any error saved on the rnode. */
			mutex_enter(&rp->r_statelock);
			e.error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		e.error = 0;

	/* With direct i/o in effect, also invalidate the cached pages. */
	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr);

	if (e.error) {
		/* Record the failure for the second nfs4_delmap() pass. */
		e.stat = puterrno4(e.error);
		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
		dmapp->caller->error = e.error;
	}

	/* Check to see if we need to close the file */

	if (dmapp->vp->v_type == VREG) {
		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);

		if (e.error != 0 || e.stat != NFS4_OK) {
			/*
			 * Since it is possible that e.error == 0 and
			 * e.stat != NFS4_OK (and vice versa),
			 * we do the proper checking in order to get both
			 * e.error and e.stat reporting the correct info.
			 */
			if (e.stat == NFS4_OK)
				e.stat = puterrno4(e.error);
			if (e.error == 0)
				e.error = geterrno4(e.stat);

			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
			dmapp->caller->error = e.error;
		}
	}

	/* The callback removes itself and frees its argument block. */
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
}


/*
 * Return the number of bits needed to represent ll, i.e. the 1-based
 * position of its highest set bit, or 0 when ll is 0.  The highest bit
 * is located by successively halving the search width (32, 16, 8, 4, 2, 1).
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	uint_t l = 1;

	if (ll == 0) {
		return (0);
	}

	if (ll & 0xffffffff00000000) {
		l += 32; ll >>= 32;
	}
	if (ll & 0xffff0000) {
		l += 16; ll >>= 16;
	}
	if (ll & 0xff00) {
		l += 8; ll >>= 8;
	}
	if (ll & 0xf0) {
		l += 4; ll >>= 4;
	}
	if (ll & 0xc) {
		l += 2; ll >>= 2;
	}
	if (ll & 0x2) {
		l += 1;
	}
	return (l);
}

/*
 * Return the pathconf value selected by cmd in *valp.
 *
 * _PC_PATH_MAX, _PC_SYMLINK_MAX and _PC_ACL_ENABLED are answered with
 * constants.  Other values come from the cached rp->r_pathconf when the
 * attribute cache is valid, and otherwise from an over-the-wire getattr
 * whose result is stored with nfs4_attr_cache().
 *
 * Returns 0, EIO (wrong zone), EINVAL (unsupported cmd), or the error
 * from nfs4_attr_otw().
 */
static int
nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
{
	int error;
	hrtime_t t;
	rnode4_t *rp;
	nfs4_ga_res_t gar;
	nfs4_ga_ext_res_t ger;

	gar.n4g_ext_res = &ger;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
		*valp = MAXPATHLEN;
		return (0);
	}
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	rp = VTOR4(vp);
	if (cmd == _PC_XATTR_EXISTS) {
		/*
		 * Eventually should attempt small client readdir before
		 * going otw with GETATTR(FATTR4_NAMED_ATTR).  For now
		 * just drive the OTW getattr.  This is required because
		 * _PC_XATTR_EXISTS can only return true if attributes
		 * exist -- simply checking for existence of the attrdir
		 * is not sufficient.
		 *
		 * pc4_xattr_valid can only be trusted when r_xattr_dir
		 * is NULL.  Once the xadir vp exists, we can create xattrs,
		 * and we don't have any way to update the "base" object's
		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
		 * could help out.
		 */
		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
		    rp->r_xattr_dir == NULL) {
			*valp = rp->r_pathconf.pc4_xattr_exists;
			return (0);
		}
	} else {  /* OLD CODE */
		if (ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_pathconf.pc4_cache_valid) {
				error = 0;
				switch (cmd) {
				case _PC_FILESIZEBITS:
					*valp =
					    rp->r_pathconf.pc4_filesizebits;
					break;
				case _PC_LINK_MAX:
					*valp =
					    rp->r_pathconf.pc4_link_max;
					break;
				case _PC_NAME_MAX:
					*valp =
					    rp->r_pathconf.pc4_name_max;
					break;
				case _PC_CHOWN_RESTRICTED:
					*valp =
					    rp->r_pathconf.pc4_chown_restricted;
					break;
				case _PC_NO_TRUNC:
					*valp =
					    rp->r_pathconf.pc4_no_trunc;
					break;
				default:
					error = EINVAL;
					break;
				}
				mutex_exit(&rp->r_statelock);
#ifdef DEBUG
				nfs4_pathconf_cache_hits++;
#endif
				return (error);
			}
			mutex_exit(&rp->r_statelock);
		}
	}
#ifdef DEBUG
	nfs4_pathconf_cache_misses++;
#endif

	/* Timestamp taken before the call; passed to nfs4_attr_cache(). */
	t = gethrtime();

	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);

	if (error) {
		/* Mark both cached pathconf flavors invalid on failure. */
		mutex_enter(&rp->r_statelock);
		rp->r_pathconf.pc4_cache_valid = FALSE;
		rp->r_pathconf.pc4_xattr_valid = FALSE;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* interpret the max filesize */
	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);

	/* Store the attributes we just received */
	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);

	switch (cmd) {
	case _PC_FILESIZEBITS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
		break;
	case _PC_LINK_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
		break;
	case _PC_NAME_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
		break;
	case _PC_CHOWN_RESTRICTED:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
		break;
	case _PC_NO_TRUNC:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
		break;
	case _PC_XATTR_EXISTS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists;
		break;
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Called by async thread to do synchronous pageio.  Do the i/o, wait
 * for it to complete, and cleanup the page list when done.
 */
static int
nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	/* Finish the pages according to the direction of the transfer. */
	if (flags & B_READ)
		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
	else
		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
	return (error);
}

/*
 * Page i/o entry point.  B_ASYNC requests are queued through
 * nfs4_async_pageio() with nfs4_sync_pageio() as the worker routine;
 * other requests call nfs4_rdwrlbn() directly.  rp->r_count is held
 * across the call.  Returns EIO for a synchronous request from the
 * wrong zone and EINVAL when pp is NULL.
 */
static int
nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	if (pp == NULL)
		return (EINVAL);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (flags & B_ASYNC) {
		error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_pageio);
	} else
		error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	return (error);
}

static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
11261 */ 11262 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11263 pp->p_fsdata = C_NOCOMMIT; 11264 if (fl == B_FREE) 11265 page_free(pp, dn); 11266 else 11267 page_destroy(pp, dn); 11268 return; 11269 } 11270 11271 /* 11272 * If there is a page invalidation operation going on, then 11273 * if this is one of the pages being destroyed, then just 11274 * clear the p_fsdata byte and then either free or destroy 11275 * the page as appropriate. 11276 */ 11277 mutex_enter(&rp->r_statelock); 11278 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11279 mutex_exit(&rp->r_statelock); 11280 pp->p_fsdata = C_NOCOMMIT; 11281 if (fl == B_FREE) 11282 page_free(pp, dn); 11283 else 11284 page_destroy(pp, dn); 11285 return; 11286 } 11287 11288 /* 11289 * If we are freeing this page and someone else is already 11290 * waiting to do a commit, then just unlock the page and 11291 * return. That other thread will take care of commiting 11292 * this page. The page can be freed sometime after the 11293 * commit has finished. Otherwise, if the page is marked 11294 * as delay commit, then we may be getting called from 11295 * pvn_write_done, one page at a time. This could result 11296 * in one commit per page, so we end up doing lots of small 11297 * commits instead of fewer larger commits. This is bad, 11298 * we want do as few commits as possible. 11299 */ 11300 if (fl == B_FREE) { 11301 if (rp->r_flags & R4COMMITWAIT) { 11302 page_unlock(pp); 11303 mutex_exit(&rp->r_statelock); 11304 return; 11305 } 11306 if (pp->p_fsdata == C_DELAYCOMMIT) { 11307 pp->p_fsdata = C_COMMIT; 11308 page_unlock(pp); 11309 mutex_exit(&rp->r_statelock); 11310 return; 11311 } 11312 } 11313 11314 /* 11315 * Check to see if there is a signal which would prevent an 11316 * attempt to commit the pages from being successful. If so, 11317 * then don't bother with all of the work to gather pages and 11318 * generate the unsuccessful RPC. 
Just return from here and 11319 * let the page be committed at some later time. 11320 */ 11321 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11322 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11323 sigunintr(&smask); 11324 page_unlock(pp); 11325 mutex_exit(&rp->r_statelock); 11326 return; 11327 } 11328 sigunintr(&smask); 11329 11330 /* 11331 * We are starting to need to commit pages, so let's try 11332 * to commit as many as possible at once to reduce the 11333 * overhead. 11334 * 11335 * Set the `commit inprogress' state bit. We must 11336 * first wait until any current one finishes. Then 11337 * we initialize the c_pages list with this page. 11338 */ 11339 while (rp->r_flags & R4COMMIT) { 11340 rp->r_flags |= R4COMMITWAIT; 11341 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11342 rp->r_flags &= ~R4COMMITWAIT; 11343 } 11344 rp->r_flags |= R4COMMIT; 11345 mutex_exit(&rp->r_statelock); 11346 ASSERT(rp->r_commit.c_pages == NULL); 11347 rp->r_commit.c_pages = pp; 11348 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11349 rp->r_commit.c_commlen = PAGESIZE; 11350 11351 /* 11352 * Gather together all other pages which can be committed. 11353 * They will all be chained off r_commit.c_pages. 11354 */ 11355 nfs4_get_commit(vp); 11356 11357 /* 11358 * Clear the `commit inprogress' status and disconnect 11359 * the list of pages to be committed from the rnode. 11360 * At this same time, we also save the starting offset 11361 * and length of data to be committed on the server. 
11362 */ 11363 plist = rp->r_commit.c_pages; 11364 rp->r_commit.c_pages = NULL; 11365 offset = rp->r_commit.c_commbase; 11366 len = rp->r_commit.c_commlen; 11367 mutex_enter(&rp->r_statelock); 11368 rp->r_flags &= ~R4COMMIT; 11369 cv_broadcast(&rp->r_commit.c_cv); 11370 mutex_exit(&rp->r_statelock); 11371 11372 if (curproc == proc_pageout || curproc == proc_fsflush || 11373 nfs_zone() != VTOMI4(vp)->mi_zone) { 11374 nfs4_async_commit(vp, plist, offset, len, 11375 cr, do_nfs4_async_commit); 11376 return; 11377 } 11378 11379 /* 11380 * Actually generate the COMMIT op over the wire operation. 11381 */ 11382 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11383 11384 /* 11385 * If we got an error during the commit, just unlock all 11386 * of the pages. The pages will get retransmitted to the 11387 * server during a putpage operation. 11388 */ 11389 if (error) { 11390 while (plist != NULL) { 11391 pptr = plist; 11392 page_sub(&plist, pptr); 11393 page_unlock(pptr); 11394 } 11395 return; 11396 } 11397 11398 /* 11399 * We've tried as hard as we can to commit the data to stable 11400 * storage on the server. We just unlock the rest of the pages 11401 * and clear the commit required state. They will be put 11402 * onto the tail of the cachelist if they are nolonger 11403 * mapped. 11404 */ 11405 while (plist != pp) { 11406 pptr = plist; 11407 page_sub(&plist, pptr); 11408 pptr->p_fsdata = C_NOCOMMIT; 11409 page_unlock(pptr); 11410 } 11411 11412 /* 11413 * It is possible that nfs4_commit didn't return error but 11414 * some other thread has modified the page we are going 11415 * to free/destroy. 11416 * In this case we need to rewrite the page. Do an explicit check 11417 * before attempting to free/destroy the page. If modified, needs to 11418 * be rewritten so unlock the page and return. 
11419 */ 11420 if (hat_ismod(pp)) { 11421 pp->p_fsdata = C_NOCOMMIT; 11422 page_unlock(pp); 11423 return; 11424 } 11425 11426 /* 11427 * Now, as appropriate, either free or destroy the page 11428 * that we were called with. 11429 */ 11430 pp->p_fsdata = C_NOCOMMIT; 11431 if (fl == B_FREE) 11432 page_free(pp, dn); 11433 else 11434 page_destroy(pp, dn); 11435 } 11436 11437 /* 11438 * Commit requires that the current fh be the file written to. 11439 * The compound op structure is: 11440 * PUTFH(file), COMMIT 11441 */ 11442 static int 11443 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11444 { 11445 COMPOUND4args_clnt args; 11446 COMPOUND4res_clnt res; 11447 COMMIT4res *cm_res; 11448 nfs_argop4 argop[2]; 11449 nfs_resop4 *resop; 11450 int doqueue; 11451 mntinfo4_t *mi; 11452 rnode4_t *rp; 11453 cred_t *cred_otw = NULL; 11454 bool_t needrecov = FALSE; 11455 nfs4_recov_state_t recov_state; 11456 nfs4_open_stream_t *osp = NULL; 11457 bool_t first_time = TRUE; /* first time getting OTW cred */ 11458 bool_t last_time = FALSE; /* last time getting OTW cred */ 11459 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11460 11461 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11462 11463 rp = VTOR4(vp); 11464 11465 mi = VTOMI4(vp); 11466 recov_state.rs_flags = 0; 11467 recov_state.rs_num_retry_despite_err = 0; 11468 get_commit_cred: 11469 /* 11470 * Releases the osp, if a valid open stream is provided. 11471 * Puts a hold on the cred_otw and the new osp (if found). 
11472 */ 11473 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11474 &first_time, &last_time); 11475 args.ctag = TAG_COMMIT; 11476 recov_retry: 11477 /* 11478 * Commit ops: putfh file; commit 11479 */ 11480 args.array_len = 2; 11481 args.array = argop; 11482 11483 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11484 &recov_state, NULL); 11485 if (e.error) { 11486 crfree(cred_otw); 11487 if (osp != NULL) 11488 open_stream_rele(osp, rp); 11489 return (e.error); 11490 } 11491 11492 /* putfh directory */ 11493 argop[0].argop = OP_CPUTFH; 11494 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11495 11496 /* commit */ 11497 argop[1].argop = OP_COMMIT; 11498 argop[1].nfs_argop4_u.opcommit.offset = offset; 11499 argop[1].nfs_argop4_u.opcommit.count = count; 11500 11501 doqueue = 1; 11502 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11503 11504 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11505 if (!needrecov && e.error) { 11506 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11507 needrecov); 11508 crfree(cred_otw); 11509 if (e.error == EACCES && last_time == FALSE) 11510 goto get_commit_cred; 11511 if (osp != NULL) 11512 open_stream_rele(osp, rp); 11513 return (e.error); 11514 } 11515 11516 if (needrecov) { 11517 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11518 NULL, OP_COMMIT, NULL) == FALSE) { 11519 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11520 &recov_state, needrecov); 11521 if (!e.error) 11522 (void) xdr_free(xdr_COMPOUND4res_clnt, 11523 (caddr_t)&res); 11524 goto recov_retry; 11525 } 11526 if (e.error) { 11527 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11528 &recov_state, needrecov); 11529 crfree(cred_otw); 11530 if (osp != NULL) 11531 open_stream_rele(osp, rp); 11532 return (e.error); 11533 } 11534 /* fall through for res.status case */ 11535 } 11536 11537 if (res.status) { 11538 e.error = geterrno4(res.status); 11539 if (e.error == EACCES && last_time == FALSE) { 11540 crfree(cred_otw); 11541 
nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11542 &recov_state, needrecov); 11543 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11544 goto get_commit_cred; 11545 } 11546 /* 11547 * Can't do a nfs4_purge_stale_fh here because this 11548 * can cause a deadlock. nfs4_commit can 11549 * be called from nfs4_dispose which can be called 11550 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11551 * can call back to pvn_vplist_dirty. 11552 */ 11553 if (e.error == ESTALE) { 11554 mutex_enter(&rp->r_statelock); 11555 rp->r_flags |= R4STALE; 11556 if (!rp->r_error) 11557 rp->r_error = e.error; 11558 mutex_exit(&rp->r_statelock); 11559 PURGE_ATTRCACHE4(vp); 11560 } else { 11561 mutex_enter(&rp->r_statelock); 11562 if (!rp->r_error) 11563 rp->r_error = e.error; 11564 mutex_exit(&rp->r_statelock); 11565 } 11566 } else { 11567 ASSERT(rp->r_flags & R4HAVEVERF); 11568 resop = &res.array[1]; /* commit res */ 11569 cm_res = &resop->nfs_resop4_u.opcommit; 11570 mutex_enter(&rp->r_statelock); 11571 if (cm_res->writeverf == rp->r_writeverf) { 11572 mutex_exit(&rp->r_statelock); 11573 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11574 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11575 &recov_state, needrecov); 11576 crfree(cred_otw); 11577 if (osp != NULL) 11578 open_stream_rele(osp, rp); 11579 return (0); 11580 } 11581 nfs4_set_mod(vp); 11582 rp->r_writeverf = cm_res->writeverf; 11583 mutex_exit(&rp->r_statelock); 11584 e.error = NFS_VERF_MISMATCH; 11585 } 11586 11587 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11588 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11589 crfree(cred_otw); 11590 if (osp != NULL) 11591 open_stream_rele(osp, rp); 11592 11593 return (e.error); 11594 } 11595 11596 static void 11597 nfs4_set_mod(vnode_t *vp) 11598 { 11599 page_t *pp; 11600 kmutex_t *vphm; 11601 rnode4_t *rp; 11602 11603 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11604 11605 /* make sure we're looking at the master vnode, not a 
shadow */ 11606 11607 rp = VTOR4(vp); 11608 if (IS_SHADOW(vp, rp)) 11609 vp = RTOV4(rp); 11610 11611 vphm = page_vnode_mutex(vp); 11612 mutex_enter(vphm); 11613 /* 11614 * If there are no pages associated with this vnode, then 11615 * just return. 11616 */ 11617 if ((pp = vp->v_pages) == NULL) { 11618 mutex_exit(vphm); 11619 return; 11620 } 11621 11622 do { 11623 if (pp->p_fsdata != C_NOCOMMIT) { 11624 hat_setmod(pp); 11625 pp->p_fsdata = C_NOCOMMIT; 11626 } 11627 } while ((pp = pp->p_vpnext) != vp->v_pages); 11628 mutex_exit(vphm); 11629 } 11630 11631 /* 11632 * This function is used to gather a page list of the pages which 11633 * can be committed on the server. 11634 * 11635 * The calling thread must have set R4COMMIT. This bit is used to 11636 * serialize access to the commit structure in the rnode. As long 11637 * as the thread has set R4COMMIT, then it can manipulate the commit 11638 * structure without requiring any other locks. 11639 * 11640 * When this function is called from nfs4_dispose() the page passed 11641 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11642 * will skip it. This is not a problem since we initially add the 11643 * page to the r_commit page list. 11644 * 11645 */ 11646 static void 11647 nfs4_get_commit(vnode_t *vp) 11648 { 11649 rnode4_t *rp; 11650 page_t *pp; 11651 kmutex_t *vphm; 11652 11653 rp = VTOR4(vp); 11654 11655 ASSERT(rp->r_flags & R4COMMIT); 11656 11657 /* make sure we're looking at the master vnode, not a shadow */ 11658 11659 if (IS_SHADOW(vp, rp)) 11660 vp = RTOV4(rp); 11661 11662 vphm = page_vnode_mutex(vp); 11663 mutex_enter(vphm); 11664 11665 /* 11666 * If there are no pages associated with this vnode, then 11667 * just return. 11668 */ 11669 if ((pp = vp->v_pages) == NULL) { 11670 mutex_exit(vphm); 11671 return; 11672 } 11673 11674 /* 11675 * Step through all of the pages associated with this vnode 11676 * looking for pages which need to be committed. 
11677 */ 11678 do { 11679 /* 11680 * First short-cut everything (without the page_lock) 11681 * and see if this page does not need to be committed 11682 * or is modified if so then we'll just skip it. 11683 */ 11684 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11685 continue; 11686 11687 /* 11688 * Attempt to lock the page. If we can't, then 11689 * someone else is messing with it or we have been 11690 * called from nfs4_dispose and this is the page that 11691 * nfs4_dispose was called with.. anyway just skip it. 11692 */ 11693 if (!page_trylock(pp, SE_EXCL)) 11694 continue; 11695 11696 /* 11697 * Lets check again now that we have the page lock. 11698 */ 11699 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11700 page_unlock(pp); 11701 continue; 11702 } 11703 11704 /* this had better not be a free page */ 11705 ASSERT(PP_ISFREE(pp) == 0); 11706 11707 /* 11708 * The page needs to be committed and we locked it. 11709 * Update the base and length parameters and add it 11710 * to r_pages. 11711 */ 11712 if (rp->r_commit.c_pages == NULL) { 11713 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11714 rp->r_commit.c_commlen = PAGESIZE; 11715 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11716 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11717 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11718 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11719 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11720 <= pp->p_offset) { 11721 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11722 rp->r_commit.c_commbase + PAGESIZE; 11723 } 11724 page_add(&rp->r_commit.c_pages, pp); 11725 } while ((pp = pp->p_vpnext) != vp->v_pages); 11726 11727 mutex_exit(vphm); 11728 } 11729 11730 /* 11731 * This routine is used to gather together a page list of the pages 11732 * which are to be committed on the server. This routine must not 11733 * be called if the calling thread holds any locked pages. 11734 * 11735 * The calling thread must have set R4COMMIT. 
This bit is used to 11736 * serialize access to the commit structure in the rnode. As long 11737 * as the thread has set R4COMMIT, then it can manipulate the commit 11738 * structure without requiring any other locks. 11739 */ 11740 static void 11741 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11742 { 11743 11744 rnode4_t *rp; 11745 page_t *pp; 11746 u_offset_t end; 11747 u_offset_t off; 11748 ASSERT(len != 0); 11749 rp = VTOR4(vp); 11750 ASSERT(rp->r_flags & R4COMMIT); 11751 11752 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11753 11754 /* make sure we're looking at the master vnode, not a shadow */ 11755 11756 if (IS_SHADOW(vp, rp)) 11757 vp = RTOV4(rp); 11758 11759 /* 11760 * If there are no pages associated with this vnode, then 11761 * just return. 11762 */ 11763 if ((pp = vp->v_pages) == NULL) 11764 return; 11765 /* 11766 * Calculate the ending offset. 11767 */ 11768 end = soff + len; 11769 for (off = soff; off < end; off += PAGESIZE) { 11770 /* 11771 * Lookup each page by vp, offset. 11772 */ 11773 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11774 continue; 11775 /* 11776 * If this page does not need to be committed or is 11777 * modified, then just skip it. 11778 */ 11779 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11780 page_unlock(pp); 11781 continue; 11782 } 11783 11784 ASSERT(PP_ISFREE(pp) == 0); 11785 /* 11786 * The page needs to be committed and we locked it. 11787 * Update the base and length parameters and add it 11788 * to r_pages. 11789 */ 11790 if (rp->r_commit.c_pages == NULL) { 11791 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11792 rp->r_commit.c_commlen = PAGESIZE; 11793 } else { 11794 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11795 rp->r_commit.c_commbase + PAGESIZE; 11796 } 11797 page_add(&rp->r_commit.c_pages, pp); 11798 } 11799 } 11800 11801 /* 11802 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 11803 * Flushes and commits data to the server. 
11804 */ 11805 static int 11806 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 11807 { 11808 int error; 11809 verifier4 write_verf; 11810 rnode4_t *rp = VTOR4(vp); 11811 11812 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11813 11814 /* 11815 * Flush the data portion of the file and then commit any 11816 * portions which need to be committed. This may need to 11817 * be done twice if the server has changed state since 11818 * data was last written. The data will need to be 11819 * rewritten to the server and then a new commit done. 11820 * 11821 * In fact, this may need to be done several times if the 11822 * server is having problems and crashing while we are 11823 * attempting to do this. 11824 */ 11825 11826 top: 11827 /* 11828 * Do a flush based on the poff and plen arguments. This 11829 * will synchronously write out any modified pages in the 11830 * range specified by (poff, plen). This starts all of the 11831 * i/o operations which will be waited for in the next 11832 * call to nfs4_putpage 11833 */ 11834 11835 mutex_enter(&rp->r_statelock); 11836 write_verf = rp->r_writeverf; 11837 mutex_exit(&rp->r_statelock); 11838 11839 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr); 11840 if (error == EAGAIN) 11841 error = 0; 11842 11843 /* 11844 * Do a flush based on the poff and plen arguments. This 11845 * will synchronously write out any modified pages in the 11846 * range specified by (poff, plen) and wait until all of 11847 * the asynchronous i/o's in that range are done as well. 11848 */ 11849 if (!error) 11850 error = nfs4_putpage(vp, poff, plen, 0, cr); 11851 11852 if (error) 11853 return (error); 11854 11855 mutex_enter(&rp->r_statelock); 11856 if (rp->r_writeverf != write_verf) { 11857 mutex_exit(&rp->r_statelock); 11858 goto top; 11859 } 11860 mutex_exit(&rp->r_statelock); 11861 11862 /* 11863 * Now commit any pages which might need to be committed. 
11864 * If the error, NFS_VERF_MISMATCH, is returned, then 11865 * start over with the flush operation. 11866 */ 11867 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 11868 11869 if (error == NFS_VERF_MISMATCH) 11870 goto top; 11871 11872 return (error); 11873 } 11874 11875 /* 11876 * nfs4_commit_vp() will wait for other pending commits and 11877 * will either commit the whole file or a range, plen dictates 11878 * if we commit whole file. a value of zero indicates the whole 11879 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 11880 */ 11881 static int 11882 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 11883 cred_t *cr, int wait_on_writes) 11884 { 11885 rnode4_t *rp; 11886 page_t *plist; 11887 offset3 offset; 11888 count3 len; 11889 11890 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11891 11892 rp = VTOR4(vp); 11893 11894 /* 11895 * before we gather commitable pages make 11896 * sure there are no outstanding async writes 11897 */ 11898 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 11899 mutex_enter(&rp->r_statelock); 11900 while (rp->r_count > 0) { 11901 cv_wait(&rp->r_cv, &rp->r_statelock); 11902 } 11903 mutex_exit(&rp->r_statelock); 11904 } 11905 11906 /* 11907 * Set the `commit inprogress' state bit. We must 11908 * first wait until any current one finishes. 11909 */ 11910 mutex_enter(&rp->r_statelock); 11911 while (rp->r_flags & R4COMMIT) { 11912 rp->r_flags |= R4COMMITWAIT; 11913 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11914 rp->r_flags &= ~R4COMMITWAIT; 11915 } 11916 rp->r_flags |= R4COMMIT; 11917 mutex_exit(&rp->r_statelock); 11918 11919 /* 11920 * Gather all of the pages which need to be 11921 * committed. 11922 */ 11923 if (plen == 0) 11924 nfs4_get_commit(vp); 11925 else 11926 nfs4_get_commit_range(vp, poff, plen); 11927 11928 /* 11929 * Clear the `commit inprogress' bit and disconnect the 11930 * page list which was gathered by nfs4_get_commit. 
11931 */ 11932 plist = rp->r_commit.c_pages; 11933 rp->r_commit.c_pages = NULL; 11934 offset = rp->r_commit.c_commbase; 11935 len = rp->r_commit.c_commlen; 11936 mutex_enter(&rp->r_statelock); 11937 rp->r_flags &= ~R4COMMIT; 11938 cv_broadcast(&rp->r_commit.c_cv); 11939 mutex_exit(&rp->r_statelock); 11940 11941 /* 11942 * If any pages need to be committed, commit them and 11943 * then unlock them so that they can be freed some 11944 * time later. 11945 */ 11946 if (plist == NULL) 11947 return (0); 11948 11949 /* 11950 * No error occurred during the flush portion 11951 * of this operation, so now attempt to commit 11952 * the data to stable storage on the server. 11953 * 11954 * This will unlock all of the pages on the list. 11955 */ 11956 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 11957 } 11958 11959 static int 11960 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11961 cred_t *cr) 11962 { 11963 int error; 11964 page_t *pp; 11965 11966 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11967 11968 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 11969 11970 /* 11971 * If we got an error, then just unlock all of the pages 11972 * on the list. 11973 */ 11974 if (error) { 11975 while (plist != NULL) { 11976 pp = plist; 11977 page_sub(&plist, pp); 11978 page_unlock(pp); 11979 } 11980 return (error); 11981 } 11982 /* 11983 * We've tried as hard as we can to commit the data to stable 11984 * storage on the server. We just unlock the pages and clear 11985 * the commit required state. They will get freed later. 
11986 */ 11987 while (plist != NULL) { 11988 pp = plist; 11989 page_sub(&plist, pp); 11990 pp->p_fsdata = C_NOCOMMIT; 11991 page_unlock(pp); 11992 } 11993 11994 return (error); 11995 } 11996 11997 static void 11998 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11999 cred_t *cr) 12000 { 12001 12002 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12003 } 12004 12005 /*ARGSUSED*/ 12006 static int 12007 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 12008 { 12009 int error = 0; 12010 mntinfo4_t *mi; 12011 vattr_t va; 12012 vsecattr_t nfsace4_vsap; 12013 12014 mi = VTOMI4(vp); 12015 if (nfs_zone() != mi->mi_zone) 12016 return (EIO); 12017 if (mi->mi_flags & MI4_ACL) { 12018 /* if we have a delegation, return it */ 12019 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12020 (void) nfs4delegreturn(VTOR4(vp), 12021 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12022 12023 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12024 NFS4_ACL_SET); 12025 if (error) /* EINVAL */ 12026 return (error); 12027 12028 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12029 /* 12030 * These are aclent_t type entries. 12031 */ 12032 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12033 vp->v_type == VDIR, FALSE); 12034 if (error) 12035 return (error); 12036 } else { 12037 /* 12038 * These are ace_t type entries. 
12039 */ 12040 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12041 FALSE); 12042 if (error) 12043 return (error); 12044 } 12045 bzero(&va, sizeof (va)); 12046 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12047 vs_ace4_destroy(&nfsace4_vsap); 12048 return (error); 12049 } 12050 return (ENOSYS); 12051 } 12052 12053 static int 12054 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 12055 { 12056 int error; 12057 mntinfo4_t *mi; 12058 nfs4_ga_res_t gar; 12059 rnode4_t *rp = VTOR4(vp); 12060 12061 mi = VTOMI4(vp); 12062 if (nfs_zone() != mi->mi_zone) 12063 return (EIO); 12064 12065 bzero(&gar, sizeof (gar)); 12066 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12067 12068 /* 12069 * vsecattr->vsa_mask holds the original acl request mask. 12070 * This is needed when determining what to return. 12071 * (See: nfs4_create_getsecattr_return()) 12072 */ 12073 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12074 if (error) /* EINVAL */ 12075 return (error); 12076 12077 if (mi->mi_flags & MI4_ACL) { 12078 /* 12079 * Check if the data is cached and the cache is valid. If it 12080 * is we don't go over the wire. 12081 */ 12082 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12083 mutex_enter(&rp->r_statelock); 12084 if (rp->r_secattr != NULL) { 12085 error = nfs4_create_getsecattr_return( 12086 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12087 rp->r_attr.va_gid, 12088 vp->v_type == VDIR); 12089 if (!error) { /* error == 0 - Success! */ 12090 mutex_exit(&rp->r_statelock); 12091 return (error); 12092 } 12093 } 12094 mutex_exit(&rp->r_statelock); 12095 } 12096 12097 /* 12098 * The getattr otw call will always get both the acl, in 12099 * the form of a list of nfsace4's, and the number of acl 12100 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
12101 */ 12102 gar.n4g_va.va_mask = AT_ALL; 12103 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12104 if (error) { 12105 vs_ace4_destroy(&gar.n4g_vsa); 12106 if (error == ENOTSUP || error == EOPNOTSUPP) 12107 error = fs_fab_acl(vp, vsecattr, flag, cr); 12108 return (error); 12109 } 12110 12111 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12112 /* 12113 * No error was returned, but according to the response 12114 * bitmap, neither was an acl. 12115 */ 12116 vs_ace4_destroy(&gar.n4g_vsa); 12117 error = fs_fab_acl(vp, vsecattr, flag, cr); 12118 return (error); 12119 } 12120 12121 /* 12122 * Update the cache with the ACL. 12123 */ 12124 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12125 12126 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12127 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12128 vp->v_type == VDIR); 12129 vs_ace4_destroy(&gar.n4g_vsa); 12130 if ((error) && (vsecattr->vsa_mask & 12131 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12132 (error != EACCES)) { 12133 error = fs_fab_acl(vp, vsecattr, flag, cr); 12134 } 12135 return (error); 12136 } 12137 error = fs_fab_acl(vp, vsecattr, flag, cr); 12138 return (error); 12139 } 12140 12141 /* 12142 * The function returns: 12143 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12144 * - EINVAL if the passed in "acl_mask" is an invalid request. 12145 * 12146 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12147 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12148 * 12149 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12150 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12151 * - We have a count field set without the corresponding acl field set. (e.g. - 12152 * VSA_ACECNT is set, but VSA_ACE is not) 12153 */ 12154 static int 12155 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12156 { 12157 /* Shortcut the masks that are always valid. 
*/ 12158 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12159 return (0); 12160 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12161 return (0); 12162 12163 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12164 /* 12165 * We can't have any VSA_ACL type stuff in the mask now. 12166 */ 12167 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12168 VSA_DFACLCNT)) 12169 return (EINVAL); 12170 12171 if (op == NFS4_ACL_SET) { 12172 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12173 return (EINVAL); 12174 } 12175 } 12176 12177 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12178 /* 12179 * We can't have any VSA_ACE type stuff in the mask now. 12180 */ 12181 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12182 return (EINVAL); 12183 12184 if (op == NFS4_ACL_SET) { 12185 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12186 return (EINVAL); 12187 12188 if ((acl_mask & VSA_DFACLCNT) && 12189 !(acl_mask & VSA_DFACL)) 12190 return (EINVAL); 12191 } 12192 } 12193 return (0); 12194 } 12195 12196 /* 12197 * The theory behind creating the correct getsecattr return is simply this: 12198 * "Don't return anything that the caller is not expecting to have to free." 12199 */ 12200 static int 12201 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12202 uid_t uid, gid_t gid, int isdir) 12203 { 12204 int error = 0; 12205 /* Save the mask since the translators modify it. */ 12206 uint_t orig_mask = vsap->vsa_mask; 12207 12208 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12209 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12210 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12211 12212 if (error) 12213 return (error); 12214 12215 /* 12216 * If the caller only asked for the ace count (VSA_ACECNT) 12217 * don't give them the full acl (VSA_ACE), free it. 
12218 */ 12219 if (!orig_mask & VSA_ACE) { 12220 if (vsap->vsa_aclentp != NULL) { 12221 kmem_free(vsap->vsa_aclentp, 12222 vsap->vsa_aclcnt * sizeof (ace_t)); 12223 vsap->vsa_aclentp = NULL; 12224 } 12225 } 12226 vsap->vsa_mask = orig_mask; 12227 12228 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12229 VSA_DFACLCNT)) { 12230 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12231 isdir, FALSE, 12232 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12233 12234 if (error) 12235 return (error); 12236 12237 /* 12238 * If the caller only asked for the acl count (VSA_ACLCNT) 12239 * and/or the default acl count (VSA_DFACLCNT) don't give them 12240 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 12241 */ 12242 if (!orig_mask & VSA_ACL) { 12243 if (vsap->vsa_aclentp != NULL) { 12244 kmem_free(vsap->vsa_aclentp, 12245 vsap->vsa_aclcnt * sizeof (aclent_t)); 12246 vsap->vsa_aclentp = NULL; 12247 } 12248 } 12249 12250 if (!orig_mask & VSA_DFACL) { 12251 if (vsap->vsa_dfaclentp != NULL) { 12252 kmem_free(vsap->vsa_dfaclentp, 12253 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12254 vsap->vsa_dfaclentp = NULL; 12255 } 12256 } 12257 vsap->vsa_mask = orig_mask; 12258 } 12259 return (0); 12260 } 12261 12262 static int 12263 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr) 12264 { 12265 int error; 12266 12267 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12268 return (EIO); 12269 /* 12270 * check for valid cmd parameter 12271 */ 12272 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12273 return (EINVAL); 12274 12275 /* 12276 * Check access permissions 12277 */ 12278 if ((cmd & F_SHARE) && 12279 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12280 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12281 return (EBADF); 12282 12283 /* 12284 * If the filesystem is mounted using local locking, pass the 12285 * request off to the local share code. 
12286 */ 12287 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12288 return (fs_shrlock(vp, cmd, shr, flag, cr)); 12289 12290 switch (cmd) { 12291 case F_SHARE: 12292 case F_UNSHARE: 12293 /* 12294 * This will be properly implemented later, 12295 * see RFE: 4823948 . 12296 */ 12297 error = EAGAIN; 12298 break; 12299 12300 case F_HASREMOTELOCKS: 12301 /* 12302 * NFS client can't store remote locks itself 12303 */ 12304 shr->s_access = 0; 12305 error = 0; 12306 break; 12307 12308 default: 12309 error = EINVAL; 12310 break; 12311 } 12312 12313 return (error); 12314 } 12315 12316 /* 12317 * Common code called by directory ops to update the attrcache 12318 */ 12319 static int 12320 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12321 hrtime_t t, vnode_t *vp, cred_t *cr) 12322 { 12323 int error = 0; 12324 12325 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12326 12327 if (status != NFS4_OK) { 12328 /* getattr not done or failed */ 12329 PURGE_ATTRCACHE4(vp); 12330 return (error); 12331 } 12332 12333 if (garp) { 12334 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12335 } else { 12336 PURGE_ATTRCACHE4(vp); 12337 } 12338 return (error); 12339 } 12340 12341 /* 12342 * Update directory caches for directory modification ops (link, rename, etc.) 12343 * When dinfo is NULL, manage dircaches in the old way. 12344 */ 12345 static void 12346 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12347 dirattr_info_t *dinfo) 12348 { 12349 rnode4_t *drp = VTOR4(dvp); 12350 12351 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12352 12353 /* Purge rddir cache for dir since it changed */ 12354 if (drp->r_dir != NULL) 12355 nfs4_purge_rddir_cache(dvp); 12356 12357 /* 12358 * If caller provided dinfo, then use it to manage dir caches. 
12359 */ 12360 if (dinfo != NULL) { 12361 if (vp != NULL) { 12362 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12363 if (!VTOR4(vp)->created_v4) { 12364 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12365 dnlc_update(dvp, nm, vp); 12366 } else { 12367 /* 12368 * XXX don't update if the created_v4 flag is 12369 * set 12370 */ 12371 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12372 NFS4_DEBUG(nfs4_client_state_debug, 12373 (CE_NOTE, "nfs4_update_dircaches: " 12374 "don't update dnlc: created_v4 flag")); 12375 } 12376 } 12377 12378 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12379 dinfo->di_cred, FALSE, cinfo); 12380 12381 return; 12382 } 12383 12384 /* 12385 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12386 * Since caller modified dir but didn't receive post-dirmod-op dir 12387 * attrs, the dir's attrs must be purged. 12388 * 12389 * XXX this check and dnlc update/purge should really be atomic, 12390 * XXX but can't use rnode statelock because it'll deadlock in 12391 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12392 * XXX does occur. 12393 * 12394 * XXX We also may want to check that atomic is true in the 12395 * XXX change_info struct. If it is not, the change_info may 12396 * XXX reflect changes by more than one clients which means that 12397 * XXX our cache may not be valid. 
12398 */ 12399 PURGE_ATTRCACHE4(dvp); 12400 if (drp->r_change == cinfo->before) { 12401 /* no changes took place in the directory prior to our link */ 12402 if (vp != NULL) { 12403 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12404 if (!VTOR4(vp)->created_v4) { 12405 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12406 dnlc_update(dvp, nm, vp); 12407 } else { 12408 /* 12409 * XXX dont' update if the created_v4 flag 12410 * is set 12411 */ 12412 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12413 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12414 "nfs4_update_dircaches: don't" 12415 " update dnlc: created_v4 flag")); 12416 } 12417 } 12418 } else { 12419 /* Another client modified directory - purge its dnlc cache */ 12420 dnlc_purge_vp(dvp); 12421 } 12422 } 12423 12424 /* 12425 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12426 * file. 12427 * 12428 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12429 * file (ie: client recovery) and otherwise set to FALSE. 12430 * 12431 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12432 * initiated) calling functions. 12433 * 12434 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12435 * of resending a 'lost' open request. 12436 * 12437 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12438 * server that hands out BAD_SEQID on open confirm. 12439 * 12440 * Errors are returned via the nfs4_error_t parameter. 
12441 */ 12442 void 12443 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12444 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12445 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12446 { 12447 COMPOUND4args_clnt args; 12448 COMPOUND4res_clnt res; 12449 nfs_argop4 argop[2]; 12450 nfs_resop4 *resop; 12451 int doqueue = 1; 12452 mntinfo4_t *mi; 12453 OPEN_CONFIRM4args *open_confirm_args; 12454 int needrecov; 12455 12456 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12457 #if DEBUG 12458 mutex_enter(&oop->oo_lock); 12459 ASSERT(oop->oo_seqid_inuse); 12460 mutex_exit(&oop->oo_lock); 12461 #endif 12462 12463 recov_retry_confirm: 12464 nfs4_error_zinit(ep); 12465 *retry_open = FALSE; 12466 12467 if (resend) 12468 args.ctag = TAG_OPEN_CONFIRM_LOST; 12469 else 12470 args.ctag = TAG_OPEN_CONFIRM; 12471 12472 args.array_len = 2; 12473 args.array = argop; 12474 12475 /* putfh target fh */ 12476 argop[0].argop = OP_CPUTFH; 12477 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12478 12479 argop[1].argop = OP_OPEN_CONFIRM; 12480 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12481 12482 (*seqid) += 1; 12483 open_confirm_args->seqid = *seqid; 12484 open_confirm_args->open_stateid = *stateid; 12485 12486 mi = VTOMI4(vp); 12487 12488 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12489 12490 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12491 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12492 } 12493 12494 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12495 if (!needrecov && ep->error) 12496 return; 12497 12498 if (needrecov) { 12499 bool_t abort = FALSE; 12500 12501 if (reopening_file == FALSE) { 12502 nfs4_bseqid_entry_t *bsep = NULL; 12503 12504 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12505 bsep = nfs4_create_bseqid_entry(oop, NULL, 12506 vp, 0, args.ctag, 12507 open_confirm_args->seqid); 12508 12509 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12510 NULL, NULL, NULL, 
OP_OPEN_CONFIRM, bsep); 12511 if (bsep) { 12512 kmem_free(bsep, sizeof (*bsep)); 12513 if (num_bseqid_retryp && 12514 --(*num_bseqid_retryp) == 0) 12515 abort = TRUE; 12516 } 12517 } 12518 if ((ep->error == ETIMEDOUT || 12519 res.status == NFS4ERR_RESOURCE) && 12520 abort == FALSE && resend == FALSE) { 12521 if (!ep->error) 12522 (void) xdr_free(xdr_COMPOUND4res_clnt, 12523 (caddr_t)&res); 12524 12525 delay(SEC_TO_TICK(confirm_retry_sec)); 12526 goto recov_retry_confirm; 12527 } 12528 /* State may have changed so retry the entire OPEN op */ 12529 if (abort == FALSE) 12530 *retry_open = TRUE; 12531 else 12532 *retry_open = FALSE; 12533 if (!ep->error) 12534 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12535 return; 12536 } 12537 12538 if (res.status) { 12539 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12540 return; 12541 } 12542 12543 resop = &res.array[1]; /* open confirm res */ 12544 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12545 stateid, sizeof (*stateid)); 12546 12547 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12548 } 12549 12550 /* 12551 * Return the credentials associated with a client state object. The 12552 * caller is responsible for freeing the credentials. 12553 */ 12554 12555 static cred_t * 12556 state_to_cred(nfs4_open_stream_t *osp) 12557 { 12558 cred_t *cr; 12559 12560 /* 12561 * It's ok to not lock the open stream and open owner to get 12562 * the oo_cred since this is only written once (upon creation) 12563 * and will not change. 12564 */ 12565 cr = osp->os_open_owner->oo_cred; 12566 crhold(cr); 12567 12568 return (cr); 12569 } 12570 12571 /* 12572 * nfs4_find_sysid 12573 * 12574 * Find the sysid for the knetconfig associated with the given mi. 
12575 */ 12576 static struct lm_sysid * 12577 nfs4_find_sysid(mntinfo4_t *mi) 12578 { 12579 ASSERT(nfs_zone() == mi->mi_zone); 12580 12581 /* 12582 * Switch from RDMA knconf to original mount knconf 12583 */ 12584 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12585 mi->mi_curr_serv->sv_hostname, NULL)); 12586 } 12587 12588 #ifdef DEBUG 12589 /* 12590 * Return a string version of the call type for easy reading. 12591 */ 12592 static char * 12593 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12594 { 12595 switch (ctype) { 12596 case NFS4_LCK_CTYPE_NORM: 12597 return ("NORMAL"); 12598 case NFS4_LCK_CTYPE_RECLAIM: 12599 return ("RECLAIM"); 12600 case NFS4_LCK_CTYPE_RESEND: 12601 return ("RESEND"); 12602 case NFS4_LCK_CTYPE_REINSTATE: 12603 return ("REINSTATE"); 12604 default: 12605 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12606 "type %d", ctype); 12607 return (""); 12608 } 12609 } 12610 #endif 12611 12612 /* 12613 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12614 * Unlock requests don't have an over-the-wire locktype, so we just return 12615 * something non-threatening. 12616 */ 12617 12618 static nfs_lock_type4 12619 flk_to_locktype(int cmd, int l_type) 12620 { 12621 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12622 12623 switch (l_type) { 12624 case F_UNLCK: 12625 return (READ_LT); 12626 case F_RDLCK: 12627 if (cmd == F_SETLK) 12628 return (READ_LT); 12629 else 12630 return (READW_LT); 12631 case F_WRLCK: 12632 if (cmd == F_SETLK) 12633 return (WRITE_LT); 12634 else 12635 return (WRITEW_LT); 12636 } 12637 panic("flk_to_locktype"); 12638 /*NOTREACHED*/ 12639 } 12640 12641 /* 12642 * Do some preliminary checks for nfs4frlock. 
12643 */ 12644 static int 12645 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12646 u_offset_t offset) 12647 { 12648 int error = 0; 12649 12650 /* 12651 * If we are setting a lock, check that the file is opened 12652 * with the correct mode. 12653 */ 12654 if (cmd == F_SETLK || cmd == F_SETLKW) { 12655 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12656 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12657 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12658 "nfs4frlock_validate_args: file was opened with " 12659 "incorrect mode")); 12660 return (EBADF); 12661 } 12662 } 12663 12664 /* Convert the offset. It may need to be restored before returning. */ 12665 if (error = convoff(vp, flk, 0, offset)) { 12666 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12667 "nfs4frlock_validate_args: convoff => error= %d\n", 12668 error)); 12669 return (error); 12670 } 12671 12672 return (error); 12673 } 12674 12675 /* 12676 * Set the flock64's lm_sysid for nfs4frlock. 12677 */ 12678 static int 12679 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12680 { 12681 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12682 12683 /* Find the lm_sysid */ 12684 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12685 12686 if (*lspp == NULL) { 12687 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12688 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12689 return (ENOLCK); 12690 } 12691 12692 flk->l_sysid = lm_sysidt(*lspp); 12693 12694 return (0); 12695 } 12696 12697 /* 12698 * Do the remaining preliminary setup for nfs4frlock. 12699 */ 12700 static void 12701 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12702 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12703 cred_t **cred_otw) 12704 { 12705 /* 12706 * set tick_delay to the base delay time. 
12707 * (NFS4_BASE_WAIT_TIME is in secs) 12708 */ 12709 12710 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12711 12712 /* 12713 * If lock is relative to EOF, we need the newest length of the 12714 * file. Therefore invalidate the ATTR_CACHE. 12715 */ 12716 12717 *whencep = flk->l_whence; 12718 12719 if (*whencep == 2) /* SEEK_END */ 12720 PURGE_ATTRCACHE4(vp); 12721 12722 recov_statep->rs_flags = 0; 12723 recov_statep->rs_num_retry_despite_err = 0; 12724 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12725 } 12726 12727 /* 12728 * Initialize and allocate the data structures necessary for 12729 * the nfs4frlock call. 12730 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12731 */ 12732 static void 12733 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12734 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12735 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12736 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12737 { 12738 int argoplist_size; 12739 int num_ops = 2; 12740 12741 *retry = FALSE; 12742 *did_start_fop = FALSE; 12743 *skip_get_err = FALSE; 12744 lost_rqstp->lr_op = 0; 12745 argoplist_size = num_ops * sizeof (nfs_argop4); 12746 /* fill array with zero */ 12747 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12748 12749 *argspp = argsp; 12750 *respp = NULL; 12751 12752 argsp->array_len = num_ops; 12753 argsp->array = *argopp; 12754 12755 /* initialize in case of error; will get real value down below */ 12756 argsp->ctag = TAG_NONE; 12757 12758 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12759 *op_hintp = OH_LOCKU; 12760 else 12761 *op_hintp = OH_OTHER; 12762 } 12763 12764 /* 12765 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12766 * the proper nfs4_server_t for this instance of nfs4frlock. 12767 * Returns 0 (success) or an errno value. 
12768 */ 12769 static int 12770 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12771 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12772 bool_t *did_start_fop, bool_t *startrecovp) 12773 { 12774 int error = 0; 12775 rnode4_t *rp; 12776 12777 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12778 12779 if (ctype == NFS4_LCK_CTYPE_NORM) { 12780 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12781 recov_statep, startrecovp); 12782 if (error) 12783 return (error); 12784 *did_start_fop = TRUE; 12785 } else { 12786 *did_start_fop = FALSE; 12787 *startrecovp = FALSE; 12788 } 12789 12790 if (!error) { 12791 rp = VTOR4(vp); 12792 12793 /* If the file failed recovery, just quit. */ 12794 mutex_enter(&rp->r_statelock); 12795 if (rp->r_flags & R4RECOVERR) { 12796 error = EIO; 12797 } 12798 mutex_exit(&rp->r_statelock); 12799 } 12800 12801 return (error); 12802 } 12803 12804 /* 12805 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 12806 * resend nfs4frlock call is initiated by the recovery framework. 12807 * Acquires the lop and oop seqid synchronization. 
12808 */ 12809 static void 12810 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 12811 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 12812 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 12813 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 12814 { 12815 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 12816 int error; 12817 12818 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 12819 (CE_NOTE, 12820 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 12821 ASSERT(resend_rqstp != NULL); 12822 ASSERT(resend_rqstp->lr_op == OP_LOCK || 12823 resend_rqstp->lr_op == OP_LOCKU); 12824 12825 *oopp = resend_rqstp->lr_oop; 12826 if (resend_rqstp->lr_oop) { 12827 open_owner_hold(resend_rqstp->lr_oop); 12828 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 12829 ASSERT(error == 0); /* recov thread always succeeds */ 12830 } 12831 12832 /* Must resend this lost lock/locku request. */ 12833 ASSERT(resend_rqstp->lr_lop != NULL); 12834 *lopp = resend_rqstp->lr_lop; 12835 lock_owner_hold(resend_rqstp->lr_lop); 12836 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 12837 ASSERT(error == 0); /* recov thread always succeeds */ 12838 12839 *ospp = resend_rqstp->lr_osp; 12840 if (*ospp) 12841 open_stream_hold(resend_rqstp->lr_osp); 12842 12843 if (resend_rqstp->lr_op == OP_LOCK) { 12844 LOCK4args *lock_args; 12845 12846 argop->argop = OP_LOCK; 12847 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 12848 lock_args->locktype = resend_rqstp->lr_locktype; 12849 lock_args->reclaim = 12850 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 12851 lock_args->offset = resend_rqstp->lr_flk->l_start; 12852 lock_args->length = resend_rqstp->lr_flk->l_len; 12853 if (lock_args->length == 0) 12854 lock_args->length = ~lock_args->length; 12855 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 12856 mi2clientid(mi), &lock_args->locker); 12857 12858 switch (resend_rqstp->lr_ctype) { 12859 case 
NFS4_LCK_CTYPE_RESEND: 12860 argsp->ctag = TAG_LOCK_RESEND; 12861 break; 12862 case NFS4_LCK_CTYPE_REINSTATE: 12863 argsp->ctag = TAG_LOCK_REINSTATE; 12864 break; 12865 case NFS4_LCK_CTYPE_RECLAIM: 12866 argsp->ctag = TAG_LOCK_RECLAIM; 12867 break; 12868 default: 12869 argsp->ctag = TAG_LOCK_UNKNOWN; 12870 break; 12871 } 12872 } else { 12873 LOCKU4args *locku_args; 12874 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 12875 12876 argop->argop = OP_LOCKU; 12877 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 12878 locku_args->locktype = READ_LT; 12879 locku_args->seqid = lop->lock_seqid + 1; 12880 mutex_enter(&lop->lo_lock); 12881 locku_args->lock_stateid = lop->lock_stateid; 12882 mutex_exit(&lop->lo_lock); 12883 locku_args->offset = resend_rqstp->lr_flk->l_start; 12884 locku_args->length = resend_rqstp->lr_flk->l_len; 12885 if (locku_args->length == 0) 12886 locku_args->length = ~locku_args->length; 12887 12888 switch (resend_rqstp->lr_ctype) { 12889 case NFS4_LCK_CTYPE_RESEND: 12890 argsp->ctag = TAG_LOCKU_RESEND; 12891 break; 12892 case NFS4_LCK_CTYPE_REINSTATE: 12893 argsp->ctag = TAG_LOCKU_REINSTATE; 12894 break; 12895 default: 12896 argsp->ctag = TAG_LOCK_UNKNOWN; 12897 break; 12898 } 12899 } 12900 } 12901 12902 /* 12903 * Setup the LOCKT4 arguments. 12904 */ 12905 static void 12906 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 12907 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 12908 rnode4_t *rp) 12909 { 12910 LOCKT4args *lockt_args; 12911 12912 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 12913 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 12914 argop->argop = OP_LOCKT; 12915 argsp->ctag = TAG_LOCKT; 12916 lockt_args = &argop->nfs_argop4_u.oplockt; 12917 12918 /* 12919 * The locktype will be READ_LT unless it's 12920 * a write lock. We do this because the Solaris 12921 * system call allows the combination of 12922 * F_UNLCK and F_GETLK* and so in that case the 12923 * unlock is mapped to a read. 
12924 */ 12925 if (flk->l_type == F_WRLCK) 12926 lockt_args->locktype = WRITE_LT; 12927 else 12928 lockt_args->locktype = READ_LT; 12929 12930 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 12931 /* set the lock owner4 args */ 12932 nfs4_setlockowner_args(&lockt_args->owner, rp, 12933 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 12934 flk->l_pid); 12935 lockt_args->offset = flk->l_start; 12936 lockt_args->length = flk->l_len; 12937 if (flk->l_len == 0) 12938 lockt_args->length = ~lockt_args->length; 12939 12940 *lockt_argsp = lockt_args; 12941 } 12942 12943 /* 12944 * If the client is holding a delegation, and the open stream to be used 12945 * with this lock request is a delegation open stream, then re-open the stream. 12946 * Sets the nfs4_error_t to all zeros unless the open stream has already 12947 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 12948 * means the caller should retry (like a recovery retry). 12949 */ 12950 static void 12951 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 12952 { 12953 open_delegation_type4 dt; 12954 bool_t reopen_needed, force; 12955 nfs4_open_stream_t *osp; 12956 open_claim_type4 oclaim; 12957 rnode4_t *rp = VTOR4(vp); 12958 mntinfo4_t *mi = VTOMI4(vp); 12959 12960 ASSERT(nfs_zone() == mi->mi_zone); 12961 12962 nfs4_error_zinit(ep); 12963 12964 mutex_enter(&rp->r_statev4_lock); 12965 dt = rp->r_deleg_type; 12966 mutex_exit(&rp->r_statev4_lock); 12967 12968 if (dt != OPEN_DELEGATE_NONE) { 12969 nfs4_open_owner_t *oop; 12970 12971 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 12972 if (!oop) { 12973 ep->stat = NFS4ERR_IO; 12974 return; 12975 } 12976 /* returns with 'os_sync_lock' held */ 12977 osp = find_open_stream(oop, rp); 12978 if (!osp) { 12979 open_owner_rele(oop); 12980 ep->stat = NFS4ERR_IO; 12981 return; 12982 } 12983 12984 if (osp->os_failed_reopen) { 12985 NFS4_DEBUG((nfs4_open_stream_debug || 12986 nfs4_client_lock_debug), (CE_NOTE, 12987 
"nfs4frlock_check_deleg: os_failed_reopen set " 12988 "for osp %p, cr %p, rp %s", (void *)osp, 12989 (void *)cr, rnode4info(rp))); 12990 mutex_exit(&osp->os_sync_lock); 12991 open_stream_rele(osp, rp); 12992 open_owner_rele(oop); 12993 ep->stat = NFS4ERR_IO; 12994 return; 12995 } 12996 12997 /* 12998 * Determine whether a reopen is needed. If this 12999 * is a delegation open stream, then send the open 13000 * to the server to give visibility to the open owner. 13001 * Even if it isn't a delegation open stream, we need 13002 * to check if the previous open CLAIM_DELEGATE_CUR 13003 * was sufficient. 13004 */ 13005 13006 reopen_needed = osp->os_delegation || 13007 ((lt == F_RDLCK && 13008 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13009 (lt == F_WRLCK && 13010 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13011 13012 mutex_exit(&osp->os_sync_lock); 13013 open_owner_rele(oop); 13014 13015 if (reopen_needed) { 13016 /* 13017 * Always use CLAIM_PREVIOUS after server reboot. 13018 * The server will reject CLAIM_DELEGATE_CUR if 13019 * it is used during the grace period. 13020 */ 13021 mutex_enter(&mi->mi_lock); 13022 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13023 oclaim = CLAIM_PREVIOUS; 13024 force = TRUE; 13025 } else { 13026 oclaim = CLAIM_DELEGATE_CUR; 13027 force = FALSE; 13028 } 13029 mutex_exit(&mi->mi_lock); 13030 13031 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13032 if (ep->error == EAGAIN) { 13033 nfs4_error_zinit(ep); 13034 ep->stat = NFS4ERR_DELAY; 13035 } 13036 } 13037 open_stream_rele(osp, rp); 13038 osp = NULL; 13039 } 13040 } 13041 13042 /* 13043 * Setup the LOCKU4 arguments. 13044 * Returns errors via the nfs4_error_t. 13045 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13046 * over-the-wire. The caller must release the 13047 * reference on *lopp. 13048 * NFS4ERR_DELAY caller should retry (like recovery retry) 13049 * (other) unrecoverable error. 
13050 */ 13051 static void 13052 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13053 LOCKU4args **locku_argsp, flock64_t *flk, 13054 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13055 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13056 bool_t *skip_get_err, bool_t *go_otwp) 13057 { 13058 nfs4_lock_owner_t *lop = NULL; 13059 LOCKU4args *locku_args; 13060 pid_t pid; 13061 bool_t is_spec = FALSE; 13062 rnode4_t *rp = VTOR4(vp); 13063 13064 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13065 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13066 13067 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13068 if (ep->error || ep->stat) 13069 return; 13070 13071 argop->argop = OP_LOCKU; 13072 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13073 argsp->ctag = TAG_LOCKU_REINSTATE; 13074 else 13075 argsp->ctag = TAG_LOCKU; 13076 locku_args = &argop->nfs_argop4_u.oplocku; 13077 *locku_argsp = locku_args; 13078 13079 /* 13080 * XXX what should locku_args->locktype be? 13081 * setting to ALWAYS be READ_LT so at least 13082 * it is a valid locktype. 13083 */ 13084 13085 locku_args->locktype = READ_LT; 13086 13087 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13088 flk->l_pid; 13089 13090 /* 13091 * Get the lock owner stateid. If no lock owner 13092 * exists, return success. 13093 */ 13094 lop = find_lock_owner(rp, pid, LOWN_ANY); 13095 *lopp = lop; 13096 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13097 is_spec = TRUE; 13098 if (!lop || is_spec) { 13099 /* 13100 * No lock owner so no locks to unlock. 13101 * Return success. If there was a failed 13102 * reclaim earlier, the lock might still be 13103 * registered with the local locking code, 13104 * so notify it of the unlock. 13105 * 13106 * If the lockowner is using a special stateid, 13107 * then the original lock request (that created 13108 * this lockowner) was never successful, so we 13109 * have no lock to undo OTW. 
13110 */ 13111 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13112 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13113 "(%ld) so return success", (long)pid)); 13114 13115 if (ctype == NFS4_LCK_CTYPE_NORM) 13116 flk->l_pid = curproc->p_pid; 13117 nfs4_register_lock_locally(vp, flk, flag, offset); 13118 /* 13119 * Release our hold and NULL out so final_cleanup 13120 * doesn't try to end a lock seqid sync we 13121 * never started. 13122 */ 13123 if (is_spec) { 13124 lock_owner_rele(lop); 13125 *lopp = NULL; 13126 } 13127 *skip_get_err = TRUE; 13128 *go_otwp = FALSE; 13129 return; 13130 } 13131 13132 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13133 if (ep->error == EAGAIN) { 13134 lock_owner_rele(lop); 13135 *lopp = NULL; 13136 return; 13137 } 13138 13139 mutex_enter(&lop->lo_lock); 13140 locku_args->lock_stateid = lop->lock_stateid; 13141 mutex_exit(&lop->lo_lock); 13142 locku_args->seqid = lop->lock_seqid + 1; 13143 13144 /* leave the ref count on lop, rele after RPC call */ 13145 13146 locku_args->offset = flk->l_start; 13147 locku_args->length = flk->l_len; 13148 if (flk->l_len == 0) 13149 locku_args->length = ~locku_args->length; 13150 13151 *go_otwp = TRUE; 13152 } 13153 13154 /* 13155 * Setup the LOCK4 arguments. 13156 * 13157 * Returns errors via the nfs4_error_t. 
13158 * NFS4_OK no problems 13159 * NFS4ERR_DELAY caller should retry (like recovery retry) 13160 * (other) unrecoverable error 13161 */ 13162 static void 13163 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13164 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13165 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13166 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13167 { 13168 LOCK4args *lock_args; 13169 nfs4_open_owner_t *oop = NULL; 13170 nfs4_open_stream_t *osp = NULL; 13171 nfs4_lock_owner_t *lop = NULL; 13172 pid_t pid; 13173 rnode4_t *rp = VTOR4(vp); 13174 13175 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13176 13177 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13178 if (ep->error || ep->stat != NFS4_OK) 13179 return; 13180 13181 argop->argop = OP_LOCK; 13182 if (ctype == NFS4_LCK_CTYPE_NORM) 13183 argsp->ctag = TAG_LOCK; 13184 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13185 argsp->ctag = TAG_RELOCK; 13186 else 13187 argsp->ctag = TAG_LOCK_REINSTATE; 13188 lock_args = &argop->nfs_argop4_u.oplock; 13189 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13190 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13191 /* 13192 * Get the lock owner. If no lock owner exists, 13193 * create a 'temporary' one and grab the open seqid 13194 * synchronization (which puts a hold on the open 13195 * owner and open stream). 13196 * This also grabs the lock seqid synchronization. 13197 */ 13198 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13199 ep->stat = 13200 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13201 13202 if (ep->stat != NFS4_OK) 13203 goto out; 13204 13205 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13206 &lock_args->locker); 13207 13208 lock_args->offset = flk->l_start; 13209 lock_args->length = flk->l_len; 13210 if (flk->l_len == 0) 13211 lock_args->length = ~lock_args->length; 13212 *lock_argsp = lock_args; 13213 out: 13214 *oopp = oop; 13215 *ospp = osp; 13216 *lopp = lop; 13217 } 13218 13219 /* 13220 * After we get the reply from the server, record the proper information 13221 * for possible resend lock requests. 13222 * 13223 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13224 */ 13225 static void 13226 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13227 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13228 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13229 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13230 { 13231 bool_t unlock = (flk->l_type == F_UNLCK); 13232 13233 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13234 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13235 ctype == NFS4_LCK_CTYPE_REINSTATE); 13236 13237 if (error != 0 && !unlock) { 13238 NFS4_DEBUG((nfs4_lost_rqst_debug || 13239 nfs4_client_lock_debug), (CE_NOTE, 13240 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13241 " for lop %p", (void *)lop)); 13242 ASSERT(lop != NULL); 13243 mutex_enter(&lop->lo_lock); 13244 lop->lo_pending_rqsts = 1; 13245 mutex_exit(&lop->lo_lock); 13246 } 13247 13248 lost_rqstp->lr_putfirst = FALSE; 13249 lost_rqstp->lr_op = 0; 13250 13251 /* 13252 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13253 * recovery purposes so that the lock request that was sent 13254 * can be saved and re-issued later. Ditto for EIO from a forced 13255 * unmount. 
This is done to have the client's local locking state 13256 * match the v4 server's state; that is, the request was 13257 * potentially received and accepted by the server but the client 13258 * thinks it was not. 13259 */ 13260 if (error == ETIMEDOUT || error == EINTR || 13261 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13262 NFS4_DEBUG((nfs4_lost_rqst_debug || 13263 nfs4_client_lock_debug), (CE_NOTE, 13264 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13265 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13266 (void *)lop, (void *)oop, (void *)osp)); 13267 if (unlock) 13268 lost_rqstp->lr_op = OP_LOCKU; 13269 else { 13270 lost_rqstp->lr_op = OP_LOCK; 13271 lost_rqstp->lr_locktype = locktype; 13272 } 13273 /* 13274 * Objects are held and rele'd via the recovery code. 13275 * See nfs4_save_lost_rqst. 13276 */ 13277 lost_rqstp->lr_vp = vp; 13278 lost_rqstp->lr_dvp = NULL; 13279 lost_rqstp->lr_oop = oop; 13280 lost_rqstp->lr_osp = osp; 13281 lost_rqstp->lr_lop = lop; 13282 lost_rqstp->lr_cr = cr; 13283 switch (ctype) { 13284 case NFS4_LCK_CTYPE_NORM: 13285 flk->l_pid = ttoproc(curthread)->p_pid; 13286 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13287 break; 13288 case NFS4_LCK_CTYPE_REINSTATE: 13289 lost_rqstp->lr_putfirst = TRUE; 13290 lost_rqstp->lr_ctype = ctype; 13291 break; 13292 default: 13293 break; 13294 } 13295 lost_rqstp->lr_flk = flk; 13296 } 13297 } 13298 13299 /* 13300 * Update lop's seqid. Also update the seqid stored in a resend request, 13301 * if any. (Some recovery errors increment the seqid, and we may have to 13302 * send the resend request again.) 
13303 */ 13304 13305 static void 13306 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13307 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13308 { 13309 if (lock_args) { 13310 if (lock_args->locker.new_lock_owner == TRUE) 13311 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13312 else { 13313 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13314 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13315 } 13316 } else if (locku_args) { 13317 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13318 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13319 } 13320 } 13321 13322 /* 13323 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13324 * COMPOUND4 args/res for calls that need to retry. 13325 * Switches the *cred_otwp to base_cr. 13326 */ 13327 static void 13328 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13329 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13330 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13331 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13332 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13333 { 13334 nfs4_open_owner_t *oop = *oopp; 13335 nfs4_open_stream_t *osp = *ospp; 13336 nfs4_lock_owner_t *lop = *lopp; 13337 nfs_argop4 *argop = (*argspp)->array; 13338 13339 if (*did_start_fop) { 13340 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13341 needrecov); 13342 *did_start_fop = FALSE; 13343 } 13344 ASSERT((*argspp)->array_len == 2); 13345 if (argop[1].argop == OP_LOCK) 13346 nfs4args_lock_free(&argop[1]); 13347 else if (argop[1].argop == OP_LOCKT) 13348 nfs4args_lockt_free(&argop[1]); 13349 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13350 if (!error) 13351 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13352 *argspp = NULL; 13353 *respp = NULL; 13354 13355 if (lop) { 13356 nfs4_end_lock_seqid_sync(lop); 13357 lock_owner_rele(lop); 13358 *lopp = NULL; 13359 } 13360 13361 /* need to free up 
the reference on osp for lock args */ 13362 if (osp != NULL) { 13363 open_stream_rele(osp, VTOR4(vp)); 13364 *ospp = NULL; 13365 } 13366 13367 /* need to free up the reference on oop for lock args */ 13368 if (oop != NULL) { 13369 nfs4_end_open_seqid_sync(oop); 13370 open_owner_rele(oop); 13371 *oopp = NULL; 13372 } 13373 13374 crfree(*cred_otwp); 13375 *cred_otwp = base_cr; 13376 crhold(*cred_otwp); 13377 } 13378 13379 /* 13380 * Function to process the client's recovery for nfs4frlock. 13381 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13382 * 13383 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13384 * COMPOUND4 args/res for calls that need to retry. 13385 * 13386 * Note: the rp's r_lkserlock is *not* dropped during this path. 13387 */ 13388 static bool_t 13389 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13390 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13391 LOCK4args *lock_args, LOCKU4args *locku_args, 13392 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13393 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13394 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13395 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13396 { 13397 nfs4_open_owner_t *oop = *oopp; 13398 nfs4_open_stream_t *osp = *ospp; 13399 nfs4_lock_owner_t *lop = *lopp; 13400 13401 bool_t abort, retry; 13402 13403 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13404 ASSERT((*argspp) != NULL); 13405 ASSERT((*respp) != NULL); 13406 if (lock_args || locku_args) 13407 ASSERT(lop != NULL); 13408 13409 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13410 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13411 13412 retry = TRUE; 13413 abort = FALSE; 13414 if (needrecov) { 13415 nfs4_bseqid_entry_t *bsep = NULL; 13416 nfs_opnum4 op; 13417 13418 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13419 13420 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13421 seqid4 seqid; 13422 13423 if (lock_args) { 13424 if (lock_args->locker.new_lock_owner == TRUE) 13425 seqid = lock_args->locker.locker4_u. 13426 open_owner.open_seqid; 13427 else 13428 seqid = lock_args->locker.locker4_u. 13429 lock_owner.lock_seqid; 13430 } else if (locku_args) { 13431 seqid = locku_args->seqid; 13432 } else { 13433 seqid = 0; 13434 } 13435 13436 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13437 flk->l_pid, (*argspp)->ctag, seqid); 13438 } 13439 13440 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13441 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13442 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13443 NULL, op, bsep); 13444 13445 if (bsep) 13446 kmem_free(bsep, sizeof (*bsep)); 13447 } 13448 13449 /* 13450 * Return that we do not want to retry the request for 3 cases: 13451 * 1. If we received EINTR or are bailing out because of a forced 13452 * unmount, we came into this code path just for the sake of 13453 * initiating recovery, we now need to return the error. 13454 * 2. If we have aborted recovery. 13455 * 3. We received NFS4ERR_BAD_SEQID. 
13456 */ 13457 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13458 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13459 retry = FALSE; 13460 13461 if (*did_start_fop == TRUE) { 13462 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13463 needrecov); 13464 *did_start_fop = FALSE; 13465 } 13466 13467 if (retry == TRUE) { 13468 nfs_argop4 *argop; 13469 13470 argop = (*argspp)->array; 13471 ASSERT((*argspp)->array_len == 2); 13472 13473 if (argop[1].argop == OP_LOCK) 13474 nfs4args_lock_free(&argop[1]); 13475 else if (argop[1].argop == OP_LOCKT) 13476 nfs4args_lockt_free(&argop[1]); 13477 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13478 if (!ep->error) 13479 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13480 *respp = NULL; 13481 *argspp = NULL; 13482 } 13483 13484 if (lop != NULL) { 13485 nfs4_end_lock_seqid_sync(lop); 13486 lock_owner_rele(lop); 13487 } 13488 13489 *lopp = NULL; 13490 13491 /* need to free up the reference on osp for lock args */ 13492 if (osp != NULL) { 13493 open_stream_rele(osp, rp); 13494 *ospp = NULL; 13495 } 13496 13497 /* need to free up the reference on oop for lock args */ 13498 if (oop != NULL) { 13499 nfs4_end_open_seqid_sync(oop); 13500 open_owner_rele(oop); 13501 *oopp = NULL; 13502 } 13503 13504 return (retry); 13505 } 13506 13507 /* 13508 * Handles the succesful reply from the server for nfs4frlock. 
13509 */ 13510 static void 13511 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13512 vnode_t *vp, int flag, u_offset_t offset, 13513 nfs4_lost_rqst_t *resend_rqstp) 13514 { 13515 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13516 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13517 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13518 if (ctype == NFS4_LCK_CTYPE_NORM) { 13519 flk->l_pid = ttoproc(curthread)->p_pid; 13520 /* 13521 * We do not register lost locks locally in 13522 * the 'resend' case since the user/application 13523 * doesn't think we have the lock. 13524 */ 13525 ASSERT(!resend_rqstp); 13526 nfs4_register_lock_locally(vp, flk, flag, offset); 13527 } 13528 } 13529 } 13530 13531 /* 13532 * Handle the DENIED reply from the server for nfs4frlock. 13533 * Returns TRUE if we should retry the request; FALSE otherwise. 13534 * 13535 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13536 * COMPOUND4 args/res for calls that need to retry. Can also 13537 * drop and regrab the r_lkserlock. 13538 */ 13539 static bool_t 13540 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13541 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13542 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13543 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13544 nfs4_recov_state_t *recov_statep, int needrecov, 13545 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13546 clock_t *tick_delayp, short *whencep, int *errorp, 13547 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13548 bool_t *skip_get_err) 13549 { 13550 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13551 13552 if (lock_args) { 13553 nfs4_open_owner_t *oop = *oopp; 13554 nfs4_open_stream_t *osp = *ospp; 13555 nfs4_lock_owner_t *lop = *lopp; 13556 int intr; 13557 13558 /* 13559 * Blocking lock needs to sleep and retry from the request. 
13560 * 13561 * Do not block and wait for 'resend' or 'reinstate' 13562 * lock requests, just return the error. 13563 * 13564 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13565 */ 13566 if (cmd == F_SETLKW) { 13567 rnode4_t *rp = VTOR4(vp); 13568 nfs_argop4 *argop = (*argspp)->array; 13569 13570 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13571 13572 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13573 recov_statep, needrecov); 13574 *did_start_fop = FALSE; 13575 ASSERT((*argspp)->array_len == 2); 13576 if (argop[1].argop == OP_LOCK) 13577 nfs4args_lock_free(&argop[1]); 13578 else if (argop[1].argop == OP_LOCKT) 13579 nfs4args_lockt_free(&argop[1]); 13580 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13581 if (*respp) 13582 (void) xdr_free(xdr_COMPOUND4res_clnt, 13583 (caddr_t)*respp); 13584 *argspp = NULL; 13585 *respp = NULL; 13586 nfs4_end_lock_seqid_sync(lop); 13587 lock_owner_rele(lop); 13588 *lopp = NULL; 13589 if (osp != NULL) { 13590 open_stream_rele(osp, rp); 13591 *ospp = NULL; 13592 } 13593 if (oop != NULL) { 13594 nfs4_end_open_seqid_sync(oop); 13595 open_owner_rele(oop); 13596 *oopp = NULL; 13597 } 13598 13599 nfs_rw_exit(&rp->r_lkserlock); 13600 13601 intr = nfs4_block_and_wait(tick_delayp, rp); 13602 13603 if (intr) { 13604 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13605 RW_WRITER, FALSE); 13606 *errorp = EINTR; 13607 return (FALSE); 13608 } 13609 13610 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13611 RW_WRITER, FALSE); 13612 13613 /* 13614 * Make sure we are still safe to lock with 13615 * regards to mmapping. 
13616 */ 13617 if (!nfs4_safelock(vp, flk, cr)) { 13618 *errorp = EAGAIN; 13619 return (FALSE); 13620 } 13621 13622 return (TRUE); 13623 } 13624 if (ctype == NFS4_LCK_CTYPE_NORM) 13625 *errorp = EAGAIN; 13626 *skip_get_err = TRUE; 13627 flk->l_whence = 0; 13628 *whencep = 0; 13629 return (FALSE); 13630 } else if (lockt_args) { 13631 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13632 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13633 13634 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13635 flk, lockt_args); 13636 13637 /* according to NLM code */ 13638 *errorp = 0; 13639 *whencep = 0; 13640 *skip_get_err = TRUE; 13641 return (FALSE); 13642 } 13643 return (FALSE); 13644 } 13645 13646 /* 13647 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13648 */ 13649 static void 13650 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13651 { 13652 switch (resp->status) { 13653 case NFS4ERR_ACCESS: 13654 case NFS4ERR_ADMIN_REVOKED: 13655 case NFS4ERR_BADHANDLE: 13656 case NFS4ERR_BAD_RANGE: 13657 case NFS4ERR_BAD_SEQID: 13658 case NFS4ERR_BAD_STATEID: 13659 case NFS4ERR_BADXDR: 13660 case NFS4ERR_DEADLOCK: 13661 case NFS4ERR_DELAY: 13662 case NFS4ERR_EXPIRED: 13663 case NFS4ERR_FHEXPIRED: 13664 case NFS4ERR_GRACE: 13665 case NFS4ERR_INVAL: 13666 case NFS4ERR_ISDIR: 13667 case NFS4ERR_LEASE_MOVED: 13668 case NFS4ERR_LOCK_NOTSUPP: 13669 case NFS4ERR_LOCK_RANGE: 13670 case NFS4ERR_MOVED: 13671 case NFS4ERR_NOFILEHANDLE: 13672 case NFS4ERR_NO_GRACE: 13673 case NFS4ERR_OLD_STATEID: 13674 case NFS4ERR_OPENMODE: 13675 case NFS4ERR_RECLAIM_BAD: 13676 case NFS4ERR_RECLAIM_CONFLICT: 13677 case NFS4ERR_RESOURCE: 13678 case NFS4ERR_SERVERFAULT: 13679 case NFS4ERR_STALE: 13680 case NFS4ERR_STALE_CLIENTID: 13681 case NFS4ERR_STALE_STATEID: 13682 return; 13683 default: 13684 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13685 "nfs4frlock_results_default: got unrecognizable " 13686 "res.status %d", resp->status)); 13687 *errorp = NFS4ERR_INVAL; 
13688 } 13689 } 13690 13691 /* 13692 * The lock request was successful, so update the client's state. 13693 */ 13694 static void 13695 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13696 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13697 vnode_t *vp, flock64_t *flk, cred_t *cr, 13698 nfs4_lost_rqst_t *resend_rqstp) 13699 { 13700 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13701 13702 if (lock_args) { 13703 LOCK4res *lock_res; 13704 13705 lock_res = &resop->nfs_resop4_u.oplock; 13706 /* update the stateid with server's response */ 13707 13708 if (lock_args->locker.new_lock_owner == TRUE) { 13709 mutex_enter(&lop->lo_lock); 13710 lop->lo_just_created = NFS4_PERM_CREATED; 13711 mutex_exit(&lop->lo_lock); 13712 } 13713 13714 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13715 13716 /* 13717 * If the lock was the result of a resending a lost 13718 * request, we've synched up the stateid and seqid 13719 * with the server, but now the server might be out of sync 13720 * with what the application thinks it has for locks. 13721 * Clean that up here. It's unclear whether we should do 13722 * this even if the filesystem has been forcibly unmounted. 13723 * For most servers, it's probably wasted effort, but 13724 * RFC3530 lets servers require that unlocks exactly match 13725 * the locks that are held. 
13726 */ 13727 if (resend_rqstp != NULL && 13728 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13729 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13730 } else { 13731 flk->l_whence = 0; 13732 } 13733 } else if (locku_args) { 13734 LOCKU4res *locku_res; 13735 13736 locku_res = &resop->nfs_resop4_u.oplocku; 13737 13738 /* Update the stateid with the server's response */ 13739 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13740 } else if (lockt_args) { 13741 /* Switch the lock type to express success, see fcntl */ 13742 flk->l_type = F_UNLCK; 13743 flk->l_whence = 0; 13744 } 13745 } 13746 13747 /* 13748 * Do final cleanup before exiting nfs4frlock. 13749 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13750 * COMPOUND4 args/res for calls that haven't already. 13751 */ 13752 static void 13753 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13754 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13755 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13756 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13757 short whence, u_offset_t offset, struct lm_sysid *ls, 13758 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13759 bool_t did_start_fop, bool_t skip_get_err, 13760 cred_t *cred_otw, cred_t *cred) 13761 { 13762 mntinfo4_t *mi = VTOMI4(vp); 13763 rnode4_t *rp = VTOR4(vp); 13764 int error = *errorp; 13765 nfs_argop4 *argop; 13766 13767 ASSERT(nfs_zone() == mi->mi_zone); 13768 /* 13769 * The client recovery code wants the raw status information, 13770 * so don't map the NFS status code to an errno value for 13771 * non-normal call types. 
13772 */ 13773 if (ctype == NFS4_LCK_CTYPE_NORM) { 13774 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13775 *errorp = geterrno4(resp->status); 13776 if (did_start_fop == TRUE) 13777 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13778 needrecov); 13779 13780 if (!error && resp && resp->status == NFS4_OK) { 13781 /* 13782 * We've established a new lock on the server, so invalidate 13783 * the pages associated with the vnode to get the most up to 13784 * date pages from the server after acquiring the lock. We 13785 * want to be sure that the read operation gets the newest data. 13786 * N.B. 13787 * We used to do this in nfs4frlock_results_ok but that doesn't 13788 * work since VOP_PUTPAGE can call nfs4_commit which calls 13789 * nfs4_start_fop. We flush the pages below after calling 13790 * nfs4_end_fop above 13791 */ 13792 int error; 13793 13794 error = VOP_PUTPAGE(vp, (u_offset_t)0, 13795 0, B_INVAL, cred); 13796 13797 if (error && (error == ENOSPC || error == EDQUOT)) { 13798 rnode4_t *rp = VTOR4(vp); 13799 13800 mutex_enter(&rp->r_statelock); 13801 if (!rp->r_error) 13802 rp->r_error = error; 13803 mutex_exit(&rp->r_statelock); 13804 } 13805 } 13806 } 13807 if (argsp) { 13808 ASSERT(argsp->array_len == 2); 13809 argop = argsp->array; 13810 if (argop[1].argop == OP_LOCK) 13811 nfs4args_lock_free(&argop[1]); 13812 else if (argop[1].argop == OP_LOCKT) 13813 nfs4args_lockt_free(&argop[1]); 13814 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13815 if (resp) 13816 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 13817 } 13818 13819 /* free the reference on the lock owner */ 13820 if (lop != NULL) { 13821 nfs4_end_lock_seqid_sync(lop); 13822 lock_owner_rele(lop); 13823 } 13824 13825 /* need to free up the reference on osp for lock args */ 13826 if (osp != NULL) 13827 open_stream_rele(osp, rp); 13828 13829 /* need to free up the reference on oop for lock args */ 13830 if (oop != NULL) { 13831 nfs4_end_open_seqid_sync(oop); 13832 open_owner_rele(oop); 
13833 } 13834 13835 (void) convoff(vp, flk, whence, offset); 13836 13837 lm_rel_sysid(ls); 13838 13839 /* 13840 * Record debug information in the event we get EINVAL. 13841 */ 13842 mutex_enter(&mi->mi_lock); 13843 if (*errorp == EINVAL && (lock_args || locku_args) && 13844 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 13845 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 13846 zcmn_err(getzoneid(), CE_NOTE, 13847 "%s operation failed with " 13848 "EINVAL probably since the server, %s," 13849 " doesn't support POSIX style locking", 13850 lock_args ? "LOCK" : "LOCKU", 13851 mi->mi_curr_serv->sv_hostname); 13852 mi->mi_flags |= MI4_LOCK_DEBUG; 13853 } 13854 } 13855 mutex_exit(&mi->mi_lock); 13856 13857 if (cred_otw) 13858 crfree(cred_otw); 13859 } 13860 13861 /* 13862 * This calls the server and the local locking code. 13863 * 13864 * Client locks are registerred locally by oring the sysid with 13865 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 13866 * We need to distinguish between the two to avoid collision in case one 13867 * machine is used as both client and server. 13868 * 13869 * Blocking lock requests will continually retry to acquire the lock 13870 * forever. 13871 * 13872 * The ctype is defined as follows: 13873 * NFS4_LCK_CTYPE_NORM: normal lock request. 13874 * 13875 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 13876 * recovery, get the pid from flk instead of curproc, and don't reregister 13877 * the lock locally. 13878 * 13879 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 13880 * that we will use the information passed in via resend_rqstp to setup the 13881 * lock/locku request. This resend is the exact same request as the 'lost 13882 * lock', and is initiated by the recovery framework. A successful resend 13883 * request can initiate one or more reinstate requests. 
/*
 * This calls the server and the local locking code.
 *
 * Client locks are registered locally by oring the sysid with
 * LM_SYSID_CLIENT.  The server registers locks locally using just the sysid.
 * We need to distinguish between the two to avoid collision in case one
 * machine is used as both client and server.
 *
 * Blocking lock requests will continually retry to acquire the lock
 * forever.
 *
 * The ctype is defined as follows:
 * NFS4_LCK_CTYPE_NORM: normal lock request.
 *
 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
 * recovery, get the pid from flk instead of curproc, and don't reregister
 * the lock locally.
 *
 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
 * that we will use the information passed in via resend_rqstp to setup the
 * lock/locku request.  This resend is the exact same request as the 'lost
 * lock', and is initiated by the recovery framework.  A successful resend
 * request can initiate one or more reinstate requests.
 *
 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
 * does not trigger additional reinstate requests.  This lock call type is
 * set for setting the v4 server's locking state back to match what the
 * client's local locking state is in the event of a received 'lost lock'.
 *
 * Errors are returned via the nfs4_error_t parameter.
 *
 * The body is a retry loop built around the recov_retry label: each pass
 * re-initializes the call, builds a two-op compound (CPUTFH plus one of
 * LOCK/LOCKU/LOCKT), sends it, and then either retries, hands the request
 * to recovery, or falls through to the common cleanup at "out".
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay; /* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly; /* just queue request */
	int frc_no_reclaim = 0;	/* set after NFS4ERR_NO_GRACE on a reclaim */
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * The two early returns below happen before anything that the
	 * common cleanup at "out" releases has been set up.
	 */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		/*
		 * EINTR makes the save routine record this unlock as a
		 * lost request; it is turned back into success for
		 * unlocks at the end of this function.
		 */
		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		/* cleared by the LOCKU setup when no server call is needed */
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/* Undo this pass and start over. */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A previous pass got NFS4ERR_NO_GRACE for a reclaim; send this
	 * one as an ordinary (non-reclaim) LOCK and tell the caller.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	/* EACCES is not fatal yet; it may be retried with cr below. */
	if (ep->error != 0 && !needrecov && ep->error != EACCES)
		goto out;

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Access was denied using a credential other than the caller's:
	 * tear the call down, switch cred_otw to cr and try again.
	 */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch.  Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}

/*
 * nfs4_safelock:
 *
 * Return non-zero if the given lock request can be handled without
 * violating the constraints on concurrent mapping and locking.
 *
 * A file that is not mapped (r_mapcnt == 0) is always safe.  For a mapped
 * file the request must cover the whole file (l_start and l_len both 0)
 * and the file's mode must not select mandatory locking; a failure to
 * fetch the mode is treated as unsafe.
 */

static int
nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	struct vattr va;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(rp->r_mapcnt >= 0);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
	    "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
	    "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
	    bfp->l_start, bfp->l_len, rp->r_mapcnt));

	if (rp->r_mapcnt == 0)
		return (1);		/* always safe if not mapped */

	/*
	 * If the file is already mapped and there are locks, then they
	 * should be all safe locks.  So adding or removing a lock is safe
	 * as long as the new request is safe (i.e., whole-file, meaning
	 * length and starting offset are both zero).
	 */

	if (bfp->l_start != 0 || bfp->l_len != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot lock a memory mapped file unless locking the "
		    "entire file: start %"PRIx64", len %"PRIx64,
		    bfp->l_start, bfp->l_len));
		return (0);
	}

	/* mandatory locking and mapping don't mix */
	va.va_mask = AT_MODE;
	error = VOP_GETATTR(vp, &va, 0, cr);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "getattr error %d", error));
		return (0);		/* treat errors conservatively */
	}
	if (MANDLOCK(vp, va.va_mode)) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot mandatory lock and mmap a file"));
		return (0);
	}

	return (1);
}
14294 */ 14295 void 14296 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14297 u_offset_t offset) 14298 { 14299 int oldsysid; 14300 int error; 14301 #ifdef DEBUG 14302 char *name; 14303 #endif 14304 14305 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14306 14307 #ifdef DEBUG 14308 name = fn_name(VTOSV(vp)->sv_name); 14309 NFS4_DEBUG(nfs4_client_lock_debug, 14310 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14311 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14312 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14313 flk->l_sysid)); 14314 kmem_free(name, MAXNAMELEN); 14315 #endif 14316 14317 /* register the lock with local locking */ 14318 oldsysid = flk->l_sysid; 14319 flk->l_sysid |= LM_SYSID_CLIENT; 14320 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14321 #ifdef DEBUG 14322 if (error != 0) { 14323 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14324 "nfs4_register_lock_locally: could not register with" 14325 " local locking")); 14326 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14327 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14328 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14329 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14330 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14331 flk->l_type, flk->l_start, flk->l_len)); 14332 (void) reclock(vp, flk, 0, flag, offset, NULL); 14333 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14334 "blocked by pid %d sysid 0x%x type %d " 14335 "off 0x%" PRIx64 " len 0x%" PRIx64, 14336 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14337 flk->l_len)); 14338 } 14339 #endif 14340 flk->l_sysid = oldsysid; 14341 } 14342 14343 /* 14344 * nfs4_lockrelease: 14345 * 14346 * Release any locks on the given vnode that are held by the current 14347 * process. Also removes the lock owner (if one exists) from the rnode's 14348 * list. 
14349 */ 14350 static int 14351 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14352 { 14353 flock64_t ld; 14354 int ret, error; 14355 rnode4_t *rp; 14356 nfs4_lock_owner_t *lop; 14357 nfs4_recov_state_t recov_state; 14358 mntinfo4_t *mi; 14359 bool_t possible_orphan = FALSE; 14360 bool_t recovonly; 14361 14362 ASSERT((uintptr_t)vp > KERNELBASE); 14363 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14364 14365 rp = VTOR4(vp); 14366 mi = VTOMI4(vp); 14367 14368 /* 14369 * If we have not locked anything then we can 14370 * just return since we have no work to do. 14371 */ 14372 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14373 return (0); 14374 } 14375 14376 /* 14377 * We need to comprehend that another thread may 14378 * kick off recovery and the lock_owner we have stashed 14379 * in lop might be invalid so we should NOT cache it 14380 * locally! 14381 */ 14382 recov_state.rs_flags = 0; 14383 recov_state.rs_num_retry_despite_err = 0; 14384 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14385 &recovonly); 14386 if (error) { 14387 mutex_enter(&rp->r_statelock); 14388 rp->r_flags |= R4LODANGLERS; 14389 mutex_exit(&rp->r_statelock); 14390 return (error); 14391 } 14392 14393 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14394 14395 /* 14396 * Check if the lock owner might have a lock (request was sent but 14397 * no response was received). Also check if there are any remote 14398 * locks on the file. (In theory we shouldn't have to make this 14399 * second check if there's no lock owner, but for now we'll be 14400 * conservative and do it anyway.) If either condition is true, 14401 * send an unlock for the entire file to the server. 14402 * 14403 * Note that no explicit synchronization is needed here. At worst, 14404 * flk_has_remote_locks() will return a false positive, in which case 14405 * the unlock call wastes time but doesn't harm correctness. 
14406 */ 14407 14408 if (lop) { 14409 mutex_enter(&lop->lo_lock); 14410 possible_orphan = lop->lo_pending_rqsts; 14411 mutex_exit(&lop->lo_lock); 14412 lock_owner_rele(lop); 14413 } 14414 14415 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14416 14417 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14418 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14419 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14420 (void *)lop)); 14421 14422 if (possible_orphan || flk_has_remote_locks(vp)) { 14423 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14424 ld.l_whence = 0; /* unlock from start of file */ 14425 ld.l_start = 0; 14426 ld.l_len = 0; /* do entire file */ 14427 14428 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 14429 14430 if (ret != 0) { 14431 /* 14432 * If VOP_FRLOCK fails, make sure we unregister 14433 * local locks before we continue. 14434 */ 14435 ld.l_pid = ttoproc(curthread)->p_pid; 14436 nfs4_register_lock_locally(vp, &ld, flag, offset); 14437 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14438 "nfs4_lockrelease: lock release error on vp" 14439 " %p: error %d.\n", (void *)vp, ret)); 14440 } 14441 } 14442 14443 recov_state.rs_flags = 0; 14444 recov_state.rs_num_retry_despite_err = 0; 14445 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14446 &recovonly); 14447 if (error) { 14448 mutex_enter(&rp->r_statelock); 14449 rp->r_flags |= R4LODANGLERS; 14450 mutex_exit(&rp->r_statelock); 14451 return (error); 14452 } 14453 14454 /* 14455 * So, here we're going to need to retrieve the lock-owner 14456 * again (in case recovery has done a switch-a-roo) and 14457 * remove it because we can. 14458 */ 14459 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14460 14461 if (lop) { 14462 nfs4_rnode_remove_lock_owner(rp, lop); 14463 lock_owner_rele(lop); 14464 } 14465 14466 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14467 return (0); 14468 } 14469 14470 /* 14471 * Wait for 'tick_delay' clock ticks. 
14472 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14473 * NOTE: lock_lease_time is in seconds. 14474 * 14475 * XXX For future improvements, should implement a waiting queue scheme. 14476 */ 14477 static int 14478 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14479 { 14480 long milliseconds_delay; 14481 time_t lock_lease_time; 14482 14483 /* wait tick_delay clock ticks or siginteruptus */ 14484 if (delay_sig(*tick_delay)) { 14485 return (EINTR); 14486 } 14487 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14488 "reissue the lock request: blocked for %ld clock ticks: %ld " 14489 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14490 14491 /* get the lease time */ 14492 lock_lease_time = r2lease_time(rp); 14493 14494 /* drv_hztousec converts ticks to microseconds */ 14495 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14496 if (milliseconds_delay < lock_lease_time * 1000) { 14497 *tick_delay = 2 * *tick_delay; 14498 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14499 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14500 } 14501 return (0); 14502 } 14503 14504 14505 void 14506 nfs4_vnops_init(void) 14507 { 14508 } 14509 14510 void 14511 nfs4_vnops_fini(void) 14512 { 14513 } 14514 14515 /* 14516 * Return a reference to the directory (parent) vnode for a given vnode, 14517 * using the saved pathname information and the directory file handle. The 14518 * caller is responsible for disposing of the reference. 14519 * Returns zero or an errno value. 14520 * 14521 * Caller should set need_start_op to FALSE if it is the recovery 14522 * thread, or if a start_fop has already been done. Otherwise, TRUE. 
14523 */ 14524 int 14525 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14526 { 14527 svnode_t *svnp; 14528 vnode_t *dvp = NULL; 14529 servinfo4_t *svp; 14530 nfs4_fname_t *mfname; 14531 int error; 14532 14533 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14534 14535 if (vp->v_flag & VROOT) { 14536 nfs4_sharedfh_t *sfh; 14537 nfs_fh4 fh; 14538 mntinfo4_t *mi; 14539 14540 ASSERT(vp->v_type == VREG); 14541 14542 mi = VTOMI4(vp); 14543 svp = mi->mi_curr_serv; 14544 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14545 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14546 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14547 sfh = sfh4_get(&fh, VTOMI4(vp)); 14548 nfs_rw_exit(&svp->sv_lock); 14549 mfname = mi->mi_fname; 14550 fn_hold(mfname); 14551 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14552 sfh4_rele(&sfh); 14553 14554 if (dvp->v_type == VNON) 14555 dvp->v_type = VDIR; 14556 *dvpp = dvp; 14557 return (0); 14558 } 14559 14560 svnp = VTOSV(vp); 14561 14562 if (svnp == NULL) { 14563 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14564 "shadow node is NULL")); 14565 return (EINVAL); 14566 } 14567 14568 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14569 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14570 "shadow node name or dfh val == NULL")); 14571 return (EINVAL); 14572 } 14573 14574 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14575 (int)need_start_op); 14576 if (error != 0) { 14577 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14578 "nfs4_make_dotdot returned %d", error)); 14579 return (error); 14580 } 14581 if (!dvp) { 14582 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14583 "nfs4_make_dotdot returned a NULL dvp")); 14584 return (EIO); 14585 } 14586 if (dvp->v_type == VNON) 14587 dvp->v_type = VDIR; 14588 ASSERT(dvp->v_type == VDIR); 14589 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14590 mutex_enter(&dvp->v_lock); 14591 dvp->v_flag |= V_XATTRDIR; 14592 
mutex_exit(&dvp->v_lock); 14593 } 14594 *dvpp = dvp; 14595 return (0); 14596 } 14597 14598 /* 14599 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14600 * length that fnamep can accept, including the trailing null. 14601 * Returns 0 if okay, returns an errno value if there was a problem. 14602 */ 14603 14604 int 14605 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14606 { 14607 char *fn; 14608 int err = 0; 14609 servinfo4_t *svp; 14610 svnode_t *shvp; 14611 14612 /* 14613 * If the file being opened has VROOT set, then this is 14614 * a "file" mount. sv_name will not be interesting, so 14615 * go back to the servinfo4 to get the original mount 14616 * path and strip off all but the final edge. Otherwise 14617 * just return the name from the shadow vnode. 14618 */ 14619 14620 if (vp->v_flag & VROOT) { 14621 14622 svp = VTOMI4(vp)->mi_curr_serv; 14623 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14624 14625 fn = strrchr(svp->sv_path, '/'); 14626 if (fn == NULL) 14627 err = EINVAL; 14628 else 14629 fn++; 14630 } else { 14631 shvp = VTOSV(vp); 14632 fn = fn_name(shvp->sv_name); 14633 } 14634 14635 if (err == 0) 14636 if (strlen(fn) < maxlen) 14637 (void) strcpy(fnamep, fn); 14638 else 14639 err = ENAMETOOLONG; 14640 14641 if (vp->v_flag & VROOT) 14642 nfs_rw_exit(&svp->sv_lock); 14643 else 14644 kmem_free(fn, MAXNAMELEN); 14645 14646 return (err); 14647 } 14648 14649 /* 14650 * Bookkeeping for a close that doesn't need to go over the wire. 14651 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14652 * it is left at 1. 
14653 */ 14654 void 14655 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14656 { 14657 rnode4_t *rp; 14658 mntinfo4_t *mi; 14659 14660 mi = VTOMI4(vp); 14661 rp = VTOR4(vp); 14662 14663 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14664 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14665 ASSERT(nfs_zone() == mi->mi_zone); 14666 ASSERT(mutex_owned(&osp->os_sync_lock)); 14667 ASSERT(*have_lockp); 14668 14669 if (!osp->os_valid || 14670 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14671 return; 14672 } 14673 14674 /* 14675 * This removes the reference obtained at OPEN; ie, 14676 * when the open stream structure was created. 14677 * 14678 * We don't have to worry about calling 'open_stream_rele' 14679 * since we our currently holding a reference to this 14680 * open stream which means the count can not go to 0 with 14681 * this decrement. 14682 */ 14683 ASSERT(osp->os_ref_count >= 2); 14684 osp->os_ref_count--; 14685 osp->os_valid = 0; 14686 mutex_exit(&osp->os_sync_lock); 14687 *have_lockp = 0; 14688 14689 nfs4_dec_state_ref_count(mi); 14690 } 14691 14692 /* 14693 * Close all remaining open streams on the rnode. These open streams 14694 * could be here because: 14695 * - The close attempted at either close or delmap failed 14696 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14697 * - Someone did mknod on a regular file but never opened it 14698 */ 14699 int 14700 nfs4close_all(vnode_t *vp, cred_t *cr) 14701 { 14702 nfs4_open_stream_t *osp; 14703 int error; 14704 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14705 rnode4_t *rp; 14706 14707 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14708 14709 error = 0; 14710 rp = VTOR4(vp); 14711 14712 /* 14713 * At this point, all we know is that the last time 14714 * someone called vn_rele, the count was 1. Since then, 14715 * the vnode could have been re-activated. 
We want to 14716 * loop through the open streams and close each one, but 14717 * we have to be careful since once we release the rnode 14718 * hash bucket lock, someone else is free to come in and 14719 * re-activate the rnode and add new open streams. The 14720 * strategy is take the rnode hash bucket lock, verify that 14721 * the count is still 1, grab the open stream off the 14722 * head of the list and mark it invalid, then release the 14723 * rnode hash bucket lock and proceed with that open stream. 14724 * This is ok because nfs4close_one() will acquire the proper 14725 * open/create to close/destroy synchronization for open 14726 * streams, and will ensure that if someone has reopened 14727 * the open stream after we've dropped the hash bucket lock 14728 * then we'll just simply return without destroying the 14729 * open stream. 14730 * Repeat until the list is empty. 14731 */ 14732 14733 for (;;) { 14734 14735 /* make sure vnode hasn't been reactivated */ 14736 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14737 mutex_enter(&vp->v_lock); 14738 if (vp->v_count > 1) { 14739 mutex_exit(&vp->v_lock); 14740 rw_exit(&rp->r_hashq->r_lock); 14741 break; 14742 } 14743 /* 14744 * Grabbing r_os_lock before releasing v_lock prevents 14745 * a window where the rnode/open stream could get 14746 * reactivated (and os_force_close set to 0) before we 14747 * had a chance to set os_force_close to 1. 
14748 */ 14749 mutex_enter(&rp->r_os_lock); 14750 mutex_exit(&vp->v_lock); 14751 14752 osp = list_head(&rp->r_open_streams); 14753 if (!osp) { 14754 /* nothing left to CLOSE OTW, so return */ 14755 mutex_exit(&rp->r_os_lock); 14756 rw_exit(&rp->r_hashq->r_lock); 14757 break; 14758 } 14759 14760 mutex_enter(&rp->r_statev4_lock); 14761 /* the file can't still be mem mapped */ 14762 ASSERT(rp->r_mapcnt == 0); 14763 if (rp->created_v4) 14764 rp->created_v4 = 0; 14765 mutex_exit(&rp->r_statev4_lock); 14766 14767 /* 14768 * Grab a ref on this open stream; nfs4close_one 14769 * will mark it as invalid 14770 */ 14771 mutex_enter(&osp->os_sync_lock); 14772 osp->os_ref_count++; 14773 osp->os_force_close = 1; 14774 mutex_exit(&osp->os_sync_lock); 14775 mutex_exit(&rp->r_os_lock); 14776 rw_exit(&rp->r_hashq->r_lock); 14777 14778 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 14779 14780 /* Update error if it isn't already non-zero */ 14781 if (error == 0) { 14782 if (e.error) 14783 error = e.error; 14784 else if (e.stat) 14785 error = geterrno4(e.stat); 14786 } 14787 14788 #ifdef DEBUG 14789 nfs4close_all_cnt++; 14790 #endif 14791 /* Release the ref on osp acquired above. */ 14792 open_stream_rele(osp, rp); 14793 14794 /* Proceed to the next open stream, if any */ 14795 } 14796 return (error); 14797 } 14798 14799 /* 14800 * nfs4close_one - close one open stream for a file if needed. 14801 * 14802 * "close_type" indicates which close path this is: 14803 * CLOSE_NORM: close initiated via VOP_CLOSE. 14804 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 14805 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 14806 * the close and release of client state for this open stream 14807 * (unless someone else has the open stream open). 14808 * CLOSE_RESEND: indicates the request is a replay of an earlier request 14809 * (e.g., due to abort because of a signal). 14810 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 
*
 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
 * recovery.  Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.  When it
 * is not passed, the open owner is looked up from 'cr' and the open stream
 * from that open owner and the rnode.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is
 * the number of bytes we are unmapping, 'maxprot' is the mmap protection,
 * and 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * 'lrp' is passed through to nfs4_open_downgrade(); it is asserted to be
 * NULL on the path that queues a new lost CLOSE request.
 *
 * Errors are returned via the nfs4_error_t, which is zeroed on entry.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	int did_dec_count = 0;
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	/* Resend-type closes skip start_fop and are never retried here. */
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
	/*
	 * Every retry restarts here.  All per-attempt state is reset below;
	 * did_dec_count and num_retries are deliberately NOT reset, so the
	 * reference/map counts are adjusted at most once and the number of
	 * retries stays bounded.
	 */
recov_retry:
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes.  Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		/* All map counts are kept in pages, hence btopr(len). */
		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	/*
	 * start_fop only allowed recovery-related work: do not go OTW.
	 * Hand the close to the recovery framework as a lost request and
	 * leave the state removal to it.
	 */
	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL);
		close_failed = 1;
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too.  So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If the force close path failed to obtain start_fop
	 * then skip the OTW close and just remove the state.
	 * (close_failed is also set by the bad-seqid and failed-recovery
	 * checks just above, which take the same exit.)
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream.  If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t	new_lost_rqst;
		bool_t			needrecov = FALSE;
		cred_t			*odg_cred_otw = NULL;
		seqid4			open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			/*
			 * Undo everything acquired during this attempt
			 * before starting over at recov_retry, which
			 * clears the corresponding flags and osp.
			 */
			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close.  Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery.  In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		/* Release everything held for this attempt, then start over. */
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		/* os_pending_close is only changed under os_sync_lock. */
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

	/*
	 * Common exit: release whatever is still held, as recorded by the
	 * flags (and by osp/oop/cred_otw being non-NULL).
	 */
out:
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}

/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it).
If not, then 15358 * fabricate a sysid and pid. Note that the l_sysid field 15359 * in *flk already has the local sysid. 15360 */ 15361 15362 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15363 15364 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15365 lo = (nfs4_lo_name_t *) 15366 lockt_denied->owner.owner_val; 15367 15368 flk->l_pid = lo->ln_pid; 15369 } else { 15370 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15371 "denied_to_flk: bad lock owner length\n")); 15372 15373 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15374 } 15375 } else { 15376 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15377 "denied_to_flk: foreign clientid\n")); 15378 15379 /* 15380 * Construct a new sysid which should be different from 15381 * sysids of other systems. 15382 */ 15383 15384 flk->l_sysid++; 15385 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15386 } 15387 } 15388 15389 static pid_t 15390 lo_to_pid(lock_owner4 *lop) 15391 { 15392 pid_t pid = 0; 15393 uchar_t *cp; 15394 int i; 15395 15396 cp = (uchar_t *)&lop->clientid; 15397 15398 for (i = 0; i < sizeof (lop->clientid); i++) 15399 pid += (pid_t)*cp++; 15400 15401 cp = (uchar_t *)lop->owner_val; 15402 15403 for (i = 0; i < lop->owner_len; i++) 15404 pid += (pid_t)*cp++; 15405 15406 return (pid); 15407 } 15408 15409 /* 15410 * Given a lock pointer, returns the length of that lock. 15411 * "end" is the last locked offset the "l_len" covers from 15412 * the start of the lock. 15413 */ 15414 static off64_t 15415 lock_to_end(flock64_t *lock) 15416 { 15417 off64_t lock_end; 15418 15419 if (lock->l_len == 0) 15420 lock_end = (off64_t)MAXEND; 15421 else 15422 lock_end = lock->l_start + lock->l_len - 1; 15423 15424 return (lock_end); 15425 } 15426 15427 /* 15428 * Given the end of a lock, it will return you the length "l_len" for that lock. 
15429 */ 15430 static off64_t 15431 end_to_len(off64_t start, off64_t end) 15432 { 15433 off64_t lock_len; 15434 15435 ASSERT(end >= start); 15436 if (end == MAXEND) 15437 lock_len = 0; 15438 else 15439 lock_len = end - start + 1; 15440 15441 return (lock_len); 15442 } 15443 15444 /* 15445 * On given end for a lock it determines if it is the last locked offset 15446 * or not, if so keeps it as is, else adds one to return the length for 15447 * valid start. 15448 */ 15449 static off64_t 15450 start_check(off64_t x) 15451 { 15452 if (x == MAXEND) 15453 return (x); 15454 else 15455 return (x + 1); 15456 } 15457 15458 /* 15459 * See if these two locks overlap, and if so return 1; 15460 * otherwise, return 0. 15461 */ 15462 static int 15463 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15464 { 15465 off64_t llfp_end, curfp_end; 15466 15467 llfp_end = lock_to_end(llfp); 15468 curfp_end = lock_to_end(curfp); 15469 15470 if (((llfp_end >= curfp->l_start) && 15471 (llfp->l_start <= curfp->l_start)) || 15472 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15473 return (1); 15474 return (0); 15475 } 15476 15477 /* 15478 * Determine what the interseting lock region is, and add that to the 15479 * 'nl_llpp' locklist in increasing order (by l_start). 
15480 */ 15481 static void 15482 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15483 locklist_t **nl_llpp, vnode_t *vp) 15484 { 15485 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15486 off64_t lost_flp_end, local_flp_end, len, start; 15487 15488 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15489 15490 if (!locks_intersect(lost_flp, local_flp)) 15491 return; 15492 15493 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15494 "locks intersect")); 15495 15496 lost_flp_end = lock_to_end(lost_flp); 15497 local_flp_end = lock_to_end(local_flp); 15498 15499 /* Find the starting point of the intersecting region */ 15500 if (local_flp->l_start > lost_flp->l_start) 15501 start = local_flp->l_start; 15502 else 15503 start = lost_flp->l_start; 15504 15505 /* Find the lenght of the intersecting region */ 15506 if (lost_flp_end < local_flp_end) 15507 len = end_to_len(start, lost_flp_end); 15508 else 15509 len = end_to_len(start, local_flp_end); 15510 15511 /* 15512 * Prepare the flock structure for the intersection found and insert 15513 * it into the new list in increasing l_start order. This list contains 15514 * intersections of locks registered by the client with the local host 15515 * and the lost lock. 15516 * The lock type of this lock is the same as that of the local_flp. 
15517 */ 15518 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15519 intersect_llp->ll_flock.l_start = start; 15520 intersect_llp->ll_flock.l_len = len; 15521 intersect_llp->ll_flock.l_type = local_flp->l_type; 15522 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15523 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15524 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15525 intersect_llp->ll_vp = vp; 15526 15527 tmp_fllp = *nl_llpp; 15528 cur_fllp = NULL; 15529 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15530 intersect_llp->ll_flock.l_start) { 15531 cur_fllp = tmp_fllp; 15532 tmp_fllp = tmp_fllp->ll_next; 15533 } 15534 if (cur_fllp == NULL) { 15535 /* first on the list */ 15536 intersect_llp->ll_next = *nl_llpp; 15537 *nl_llpp = intersect_llp; 15538 } else { 15539 intersect_llp->ll_next = cur_fllp->ll_next; 15540 cur_fllp->ll_next = intersect_llp; 15541 } 15542 15543 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15544 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15545 intersect_llp->ll_flock.l_start, 15546 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15547 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15548 } 15549 15550 /* 15551 * Our local locking current state is potentially different than 15552 * what the NFSv4 server thinks we have due to a lost lock that was 15553 * resent and then received. We need to reset our "NFSv4" locking 15554 * state to match the current local locking state for this pid since 15555 * that is what the user/application sees as what the world is. 15556 * 15557 * We cannot afford to drop the open/lock seqid sync since then we can 15558 * get confused about what the current local locking state "is" versus 15559 * "was". 15560 * 15561 * If we are unable to fix up the locks, we send SIGLOST to the affected 15562 * process. 
This is not done if the filesystem has been forcibly 15563 * unmounted, in case the process has already exited and a new process 15564 * exists with the same pid. 15565 */ 15566 static void 15567 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15568 nfs4_lock_owner_t *lop) 15569 { 15570 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15571 mntinfo4_t *mi = VTOMI4(vp); 15572 const int cmd = F_SETLK; 15573 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15574 flock64_t ul_fl; 15575 15576 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15577 "nfs4_reinstitute_local_lock_state")); 15578 15579 /* 15580 * Find active locks for this vp from the local locking code. 15581 * Scan through this list and find out the locks that intersect with 15582 * the lost lock. Once we find the lock that intersects, add the 15583 * intersection area as a new lock to a new list "ri_llp". The lock 15584 * type of the intersection region lock added to ri_llp is the same 15585 * as that found in the active lock list, "list". The intersecting 15586 * region locks are added to ri_llp in increasing l_start order. 15587 */ 15588 ASSERT(nfs_zone() == mi->mi_zone); 15589 15590 locks = flk_active_locks_for_vp(vp); 15591 ri_llp = NULL; 15592 15593 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15594 ASSERT(llp->ll_vp == vp); 15595 /* 15596 * Pick locks that belong to this pid/lockowner 15597 */ 15598 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15599 continue; 15600 15601 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15602 } 15603 15604 /* 15605 * Now we have the list of intersections with the lost lock. These are 15606 * the locks that were/are active before the server replied to the 15607 * last/lost lock. Issue these locks to the server here. Playing these 15608 * locks to the server will re-establish aur current local locking state 15609 * with the v4 server. 15610 * If we get an error, send SIGLOST to the application for that lock. 
15611 */ 15612 15613 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15614 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15615 "nfs4_reinstitute_local_lock_state: need to issue " 15616 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15617 llp->ll_flock.l_start, 15618 llp->ll_flock.l_start + llp->ll_flock.l_len, 15619 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15620 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15621 /* 15622 * No need to relock what we already have 15623 */ 15624 if (llp->ll_flock.l_type == lost_flp->l_type) 15625 continue; 15626 15627 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15628 } 15629 15630 /* 15631 * Now keeping the start of the lost lock as our reference parse the 15632 * newly created ri_llp locklist to find the ranges that we have locked 15633 * with the v4 server but not in the current local locking. We need 15634 * to unlock these ranges. 15635 * These ranges can also be reffered to as those ranges, where the lost 15636 * lock does not overlap with the locks in the ri_llp but are locked 15637 * since the server replied to the lost lock. 
15638 */ 15639 cur_start = lost_flp->l_start; 15640 lost_flp_end = lock_to_end(lost_flp); 15641 15642 ul_fl.l_type = F_UNLCK; 15643 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15644 ul_fl.l_sysid = lost_flp->l_sysid; 15645 ul_fl.l_pid = lost_flp->l_pid; 15646 15647 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15648 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15649 15650 if (llp->ll_flock.l_start <= cur_start) { 15651 cur_start = start_check(llp_ll_flock_end); 15652 continue; 15653 } 15654 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15655 "nfs4_reinstitute_local_lock_state: " 15656 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15657 cur_start, llp->ll_flock.l_start)); 15658 15659 ul_fl.l_start = cur_start; 15660 ul_fl.l_len = end_to_len(cur_start, 15661 (llp->ll_flock.l_start - 1)); 15662 15663 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15664 cur_start = start_check(llp_ll_flock_end); 15665 } 15666 15667 /* 15668 * In the case where the lost lock ends after all intersecting locks, 15669 * unlock the last part of the lost lock range. 15670 */ 15671 if (cur_start != start_check(lost_flp_end)) { 15672 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15673 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15674 "lost lock region [%"PRIx64" - %"PRIx64"]", 15675 cur_start, lost_flp->l_start + lost_flp->l_len)); 15676 15677 ul_fl.l_start = cur_start; 15678 /* 15679 * Is it an to-EOF lock? 
if so unlock till the end 15680 */ 15681 if (lost_flp->l_len == 0) 15682 ul_fl.l_len = 0; 15683 else 15684 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15685 15686 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15687 } 15688 15689 if (locks != NULL) 15690 flk_free_locklist(locks); 15691 15692 /* Free up our newly created locklist */ 15693 for (llp = ri_llp; llp != NULL; ) { 15694 tmp_llp = llp->ll_next; 15695 kmem_free(llp, sizeof (locklist_t)); 15696 llp = tmp_llp; 15697 } 15698 15699 /* 15700 * Now return back to the original calling nfs4frlock() 15701 * and let us naturally drop our seqid syncs. 15702 */ 15703 } 15704 15705 /* 15706 * Create a lost state record for the given lock reinstantiation request 15707 * and push it onto the lost state queue. 15708 */ 15709 static void 15710 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15711 nfs4_lock_owner_t *lop) 15712 { 15713 nfs4_lost_rqst_t req; 15714 nfs_lock_type4 locktype; 15715 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15716 15717 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15718 15719 locktype = flk_to_locktype(cmd, flk->l_type); 15720 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15721 NULL, NULL, lop, flk, &req, cr, vp); 15722 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15723 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15724 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15725 NULL); 15726 } 15727