1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 
29 * All Rights Reserved 30 */ 31 32 #pragma ident "%Z%%M% %I% %E% SMI" 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/time.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/file.h> 42 #include <sys/filio.h> 43 #include <sys/uio.h> 44 #include <sys/buf.h> 45 #include <sys/mman.h> 46 #include <sys/pathname.h> 47 #include <sys/dirent.h> 48 #include <sys/debug.h> 49 #include <sys/vmsystm.h> 50 #include <sys/fcntl.h> 51 #include <sys/flock.h> 52 #include <sys/swap.h> 53 #include <sys/errno.h> 54 #include <sys/strsubr.h> 55 #include <sys/sysmacros.h> 56 #include <sys/kmem.h> 57 #include <sys/cmn_err.h> 58 #include <sys/pathconf.h> 59 #include <sys/utsname.h> 60 #include <sys/dnlc.h> 61 #include <sys/acl.h> 62 #include <sys/systeminfo.h> 63 #include <sys/policy.h> 64 #include <sys/sdt.h> 65 #include <sys/list.h> 66 #include <sys/stat.h> 67 68 #include <rpc/types.h> 69 #include <rpc/auth.h> 70 #include <rpc/clnt.h> 71 72 #include <nfs/nfs.h> 73 #include <nfs/nfs_clnt.h> 74 #include <nfs/nfs_acl.h> 75 #include <nfs/lm.h> 76 #include <nfs/nfs4.h> 77 #include <nfs/nfs4_kprot.h> 78 #include <nfs/rnode4.h> 79 #include <nfs/nfs4_clnt.h> 80 81 #include <vm/hat.h> 82 #include <vm/as.h> 83 #include <vm/page.h> 84 #include <vm/pvn.h> 85 #include <vm/seg.h> 86 #include <vm/seg_map.h> 87 #include <vm/seg_kpm.h> 88 #include <vm/seg_vn.h> 89 90 #include <fs/fs_subr.h> 91 92 #include <sys/ddi.h> 93 #include <sys/int_fmtio.h> 94 95 typedef struct { 96 nfs4_ga_res_t *di_garp; 97 cred_t *di_cred; 98 hrtime_t di_time_call; 99 } dirattr_info_t; 100 101 typedef enum nfs4_acl_op { 102 NFS4_ACL_GET, 103 NFS4_ACL_SET 104 } nfs4_acl_op_t; 105 106 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 107 char *, dirattr_info_t *); 108 109 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 110 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 111 nfs4_error_t *, 
int *); 112 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 113 cred_t *); 114 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 115 stable_how4 *); 116 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 117 cred_t *, bool_t, struct uio *); 118 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 119 vsecattr_t *); 120 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 121 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 122 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 123 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 124 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 125 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 126 int, vnode_t **, cred_t *); 127 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 128 cred_t *, int, int, enum createmode4, int); 129 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 130 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 131 vnode_t *, char *, cred_t *, nfsstat4 *); 132 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 133 vnode_t *, char *, cred_t *, nfsstat4 *); 134 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 135 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 136 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 137 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 138 page_t *[], size_t, struct seg *, caddr_t, 139 enum seg_rw, cred_t *); 140 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 141 cred_t *); 142 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 143 int, cred_t *); 144 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 145 int, cred_t *); 146 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 147 static 
void nfs4_set_mod(vnode_t *); 148 static void nfs4_get_commit(vnode_t *); 149 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 150 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 151 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 152 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 153 cred_t *); 154 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 155 cred_t *); 156 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 157 hrtime_t, vnode_t *, cred_t *); 158 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 159 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 160 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 161 u_offset_t); 162 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 163 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 164 static cred_t *state_to_cred(nfs4_open_stream_t *); 165 static int vtoname(vnode_t *, char *, ssize_t); 166 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 167 static pid_t lo_to_pid(lock_owner4 *); 168 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 169 cred_t *, nfs4_lock_owner_t *); 170 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 171 nfs4_lock_owner_t *); 172 static nfs4_open_stream_t *open_and_get_osp(vnode_t *, cred_t *, mntinfo4_t *); 173 static void nfs4_delmap_callback(struct as *, void *, uint_t); 174 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 175 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 176 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 177 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 178 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 179 uid_t, gid_t, int); 180 181 /* 182 * Routines that implement the setting of v4 args for the misc. 
ops 183 */ 184 static void nfs4args_lock_free(nfs_argop4 *); 185 static void nfs4args_lockt_free(nfs_argop4 *); 186 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 187 int, rnode4_t *, cred_t *, bitmap4, int *, 188 nfs4_stateid_types_t *); 189 static void nfs4args_setattr_free(nfs_argop4 *); 190 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 191 bitmap4); 192 static void nfs4args_verify_free(nfs_argop4 *); 193 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 194 WRITE4args **, nfs4_stateid_types_t *); 195 196 /* 197 * These are the vnode ops functions that implement the vnode interface to 198 * the networked file system. See more comments below at nfs4_vnodeops. 199 */ 200 static int nfs4_open(vnode_t **, int, cred_t *); 201 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *); 202 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 203 caller_context_t *); 204 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 205 caller_context_t *); 206 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *); 207 static int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *); 208 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 209 caller_context_t *); 210 static int nfs4_access(vnode_t *, int, int, cred_t *); 211 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *); 212 static int nfs4_fsync(vnode_t *, int, cred_t *); 213 static void nfs4_inactive(vnode_t *, cred_t *); 214 static int nfs4_lookup(vnode_t *, char *, vnode_t **, 215 struct pathname *, int, vnode_t *, cred_t *); 216 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 217 int, vnode_t **, cred_t *, int); 218 static int nfs4_remove(vnode_t *, char *, cred_t *); 219 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *); 220 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *); 221 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, 
222 vnode_t **, cred_t *); 223 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *); 224 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 225 cred_t *); 226 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *); 227 static int nfs4_fid(vnode_t *, fid_t *); 228 static int nfs4_rwlock(vnode_t *, int, caller_context_t *); 229 static void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 230 static int nfs4_seek(vnode_t *, offset_t, offset_t *); 231 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 232 page_t *[], size_t, struct seg *, caddr_t, 233 enum seg_rw, cred_t *); 234 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *); 235 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, 236 size_t, uchar_t, uchar_t, uint_t, cred_t *); 237 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, 238 size_t, uchar_t, uchar_t, uint_t, cred_t *); 239 static int nfs4_cmp(vnode_t *, vnode_t *); 240 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 241 struct flk_callback *, cred_t *); 242 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 243 cred_t *, caller_context_t *); 244 static int nfs4_realvp(vnode_t *, vnode_t **); 245 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, 246 size_t, uint_t, uint_t, uint_t, cred_t *); 247 static int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *); 248 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 249 cred_t *); 250 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *); 251 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 252 static int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 253 static int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *); 254 255 /* 256 * Used for nfs4_commit_vp() to indicate if we should 257 * wait on pending writes. 
258 */ 259 #define NFS4_WRITE_NOWAIT 0 260 #define NFS4_WRITE_WAIT 1 261 262 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 263 264 /* 265 * Error flags used to pass information about certain special errors 266 * which need to be handled specially. 267 */ 268 #define NFS_EOF -98 269 #define NFS_VERF_MISMATCH -97 270 271 /* 272 * Flags used to differentiate between which operation drove the 273 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary) 274 */ 275 #define NFS4_CLOSE_OP 0x1 276 #define NFS4_DELMAP_OP 0x2 277 #define NFS4_INACTIVE_OP 0x3 278 279 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 280 281 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 282 #define ALIGN64(x, ptr, sz) \ 283 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 284 if (x) { \ 285 x = sizeof (uint64_t) - (x); \ 286 sz -= (x); \ 287 ptr += (x); \ 288 } 289 290 #ifdef DEBUG 291 int nfs4_client_attr_debug = 0; 292 int nfs4_client_state_debug = 0; 293 int nfs4_client_shadow_debug = 0; 294 int nfs4_client_lock_debug = 0; 295 int nfs4_seqid_sync = 0; 296 int nfs4_client_map_debug = 0; 297 static int nfs4_pageio_debug = 0; 298 int nfs4_client_inactive_debug = 0; 299 int nfs4_client_recov_debug = 0; 300 int nfs4_client_recov_stub_debug = 0; 301 int nfs4_client_failover_debug = 0; 302 int nfs4_client_call_debug = 0; 303 int nfs4_client_lookup_debug = 0; 304 int nfs4_client_zone_debug = 0; 305 int nfs4_lost_rqst_debug = 0; 306 int nfs4_rdattrerr_debug = 0; 307 int nfs4_open_stream_debug = 0; 308 309 int nfs4read_error_inject; 310 311 static int nfs4_create_misses = 0; 312 313 static int nfs4_readdir_cache_shorts = 0; 314 static int nfs4_readdir_readahead = 0; 315 316 static int nfs4_bio_do_stop = 0; 317 318 static int nfs4_lostpage = 0; /* number of times we lost original page */ 319 320 int nfs4_mmap_debug = 0; 321 322 static int nfs4_pathconf_cache_hits = 0; 323 static int nfs4_pathconf_cache_misses = 0; 324 325 int nfs4close_all_cnt; 326 int 
nfs4close_one_debug = 0; 327 int nfs4close_notw_debug = 0; 328 329 int denied_to_flk_debug = 0; 330 void *lockt_denied_debug; 331 332 #endif 333 334 /* 335 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 336 * or NFS4ERR_RESOURCE. 337 */ 338 static int confirm_retry_sec = 30; 339 340 static int nfs4_lookup_neg_cache = 1; 341 342 /* 343 * number of pages to read ahead 344 * optimized for 100 base-T. 345 */ 346 static int nfs4_nra = 4; 347 348 static int nfs4_do_symlink_cache = 1; 349 350 static int nfs4_pathconf_disable_cache = 0; 351 352 /* 353 * These are the vnode ops routines which implement the vnode interface to 354 * the networked file system. These routines just take their parameters, 355 * make them look networkish by putting the right info into interface structs, 356 * and then calling the appropriate remote routine(s) to do the work. 357 * 358 * Note on directory name lookup cacheing: If we detect a stale fhandle, 359 * we purge the directory cache relative to that vnode. This way, the 360 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 361 * more details on rnode locking. 
362 */ 363 364 struct vnodeops *nfs4_vnodeops; 365 366 const fs_operation_def_t nfs4_vnodeops_template[] = { 367 VOPNAME_OPEN, nfs4_open, 368 VOPNAME_CLOSE, nfs4_close, 369 VOPNAME_READ, nfs4_read, 370 VOPNAME_WRITE, nfs4_write, 371 VOPNAME_IOCTL, nfs4_ioctl, 372 VOPNAME_GETATTR, nfs4_getattr, 373 VOPNAME_SETATTR, nfs4_setattr, 374 VOPNAME_ACCESS, nfs4_access, 375 VOPNAME_LOOKUP, nfs4_lookup, 376 VOPNAME_CREATE, nfs4_create, 377 VOPNAME_REMOVE, nfs4_remove, 378 VOPNAME_LINK, nfs4_link, 379 VOPNAME_RENAME, nfs4_rename, 380 VOPNAME_MKDIR, nfs4_mkdir, 381 VOPNAME_RMDIR, nfs4_rmdir, 382 VOPNAME_READDIR, nfs4_readdir, 383 VOPNAME_SYMLINK, nfs4_symlink, 384 VOPNAME_READLINK, nfs4_readlink, 385 VOPNAME_FSYNC, nfs4_fsync, 386 VOPNAME_INACTIVE, (fs_generic_func_p) nfs4_inactive, 387 VOPNAME_FID, nfs4_fid, 388 VOPNAME_RWLOCK, nfs4_rwlock, 389 VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs4_rwunlock, 390 VOPNAME_SEEK, nfs4_seek, 391 VOPNAME_FRLOCK, nfs4_frlock, 392 VOPNAME_SPACE, nfs4_space, 393 VOPNAME_REALVP, nfs4_realvp, 394 VOPNAME_GETPAGE, nfs4_getpage, 395 VOPNAME_PUTPAGE, nfs4_putpage, 396 VOPNAME_MAP, (fs_generic_func_p) nfs4_map, 397 VOPNAME_ADDMAP, (fs_generic_func_p) nfs4_addmap, 398 VOPNAME_DELMAP, nfs4_delmap, 399 VOPNAME_DUMP, nfs_dump, /* there is no separate nfs4_dump */ 400 VOPNAME_PATHCONF, nfs4_pathconf, 401 VOPNAME_PAGEIO, nfs4_pageio, 402 VOPNAME_DISPOSE, (fs_generic_func_p) nfs4_dispose, 403 VOPNAME_SETSECATTR, nfs4_setsecattr, 404 VOPNAME_GETSECATTR, nfs4_getsecattr, 405 VOPNAME_SHRLOCK, nfs4_shrlock, 406 NULL, NULL 407 }; 408 409 /* 410 * The following are subroutines and definitions to set args or get res 411 * for the different nfsv4 ops 412 */ 413 414 void 415 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 416 { 417 int i; 418 419 for (i = 0; i < arglen; i++) { 420 if (argop[i].argop == OP_LOOKUP) 421 kmem_free( 422 argop[i].nfs_argop4_u.oplookup.objname.utf8string_val, 423 argop[i].nfs_argop4_u.oplookup.objname.utf8string_len); 424 } 425 } 426 427 
static void 428 nfs4args_lock_free(nfs_argop4 *argop) 429 { 430 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 431 432 if (locker->new_lock_owner == TRUE) { 433 open_to_lock_owner4 *open_owner; 434 435 open_owner = &locker->locker4_u.open_owner; 436 if (open_owner->lock_owner.owner_val != NULL) { 437 kmem_free(open_owner->lock_owner.owner_val, 438 open_owner->lock_owner.owner_len); 439 } 440 } 441 } 442 443 static void 444 nfs4args_lockt_free(nfs_argop4 *argop) 445 { 446 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 447 448 if (lowner->owner_val != NULL) { 449 kmem_free(lowner->owner_val, lowner->owner_len); 450 } 451 } 452 453 static void 454 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 455 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 456 nfs4_stateid_types_t *sid_types) 457 { 458 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 459 mntinfo4_t *mi; 460 461 argop->argop = OP_SETATTR; 462 /* 463 * The stateid is set to 0 if client is not modifying the size 464 * and otherwise to whatever nfs4_get_stateid() returns. 465 * 466 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 467 * state struct could be found for the process/file pair. We may 468 * want to change this in the future (by OPENing the file). See 469 * bug # 4474852. 
470 */ 471 if (vap->va_mask & AT_SIZE) { 472 473 ASSERT(rp != NULL); 474 mi = VTOMI4(RTOV4(rp)); 475 476 argop->nfs_argop4_u.opsetattr.stateid = 477 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 478 OP_SETATTR, sid_types, FALSE); 479 } else { 480 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 481 sizeof (stateid4)); 482 } 483 484 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 485 if (*error) 486 bzero(attr, sizeof (*attr)); 487 } 488 489 static void 490 nfs4args_setattr_free(nfs_argop4 *argop) 491 { 492 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 493 } 494 495 static int 496 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 497 bitmap4 supp) 498 { 499 fattr4 *attr; 500 int error = 0; 501 502 argop->argop = op; 503 switch (op) { 504 case OP_VERIFY: 505 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 506 break; 507 case OP_NVERIFY: 508 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 509 break; 510 default: 511 return (EINVAL); 512 /*NOTREACHED*/ 513 break; 514 } 515 if (!error) 516 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 517 if (error) 518 bzero(attr, sizeof (*attr)); 519 return (error); 520 } 521 522 static void 523 nfs4args_verify_free(nfs_argop4 *argop) 524 { 525 switch (argop->argop) { 526 case OP_VERIFY: 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 528 break; 529 case OP_NVERIFY: 530 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 531 break; 532 default: 533 break; 534 } 535 } 536 537 static void 538 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 539 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 540 { 541 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 542 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 543 544 argop->argop = OP_WRITE; 545 wargs->stable = stable; 546 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 547 mi, OP_WRITE, sid_tp); 548 wargs->mblk = NULL; 549 *wargs_pp = 
wargs; 550 } 551 552 void 553 nfs4args_copen_free(OPEN4cargs *open_args) 554 { 555 if (open_args->owner.owner_val) { 556 kmem_free(open_args->owner.owner_val, 557 open_args->owner.owner_len); 558 } 559 if ((open_args->opentype == OPEN4_CREATE) && 560 (open_args->mode != EXCLUSIVE4)) { 561 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 562 } 563 } 564 565 /* 566 * XXX: This is referenced in modstubs.s 567 */ 568 struct vnodeops * 569 nfs4_getvnodeops(void) 570 { 571 return (nfs4_vnodeops); 572 } 573 574 /* 575 * The OPEN operation opens a regular file. 576 * 577 * ARGSUSED 578 */ 579 static int 580 nfs4_open(vnode_t **vpp, int flag, cred_t *cr) 581 { 582 vnode_t *dvp = NULL; 583 rnode4_t *rp; 584 int error; 585 int just_been_created; 586 char fn[MAXNAMELEN]; 587 588 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 589 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 590 return (EIO); 591 rp = VTOR4(*vpp); 592 593 /* 594 * Check to see if opening something besides a regular file; 595 * if so skip the OTW call 596 */ 597 if ((*vpp)->v_type != VREG) { 598 error = nfs4_open_non_reg_file(vpp, flag, cr); 599 return (error); 600 } 601 602 /* 603 * XXX - would like a check right here to know if the file is 604 * executable or not, so as to skip OTW 605 */ 606 607 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) 608 return (error); 609 610 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 611 return (error); 612 613 /* 614 * See if this file has just been CREATEd. 615 * If so, clear the flag and update the dnlc, which was previously 616 * skipped in nfs4_create. 617 * XXX need better serilization on this. 618 * XXX move this into the nf4open_otw call, after we have 619 * XXX acquired the open owner seqid sync. 
620 */ 621 mutex_enter(&rp->r_statev4_lock); 622 if (rp->created_v4) { 623 rp->created_v4 = 0; 624 mutex_exit(&rp->r_statev4_lock); 625 626 dnlc_update(dvp, fn, *vpp); 627 /* This is needed so we don't bump the open ref count */ 628 just_been_created = 1; 629 } else { 630 mutex_exit(&rp->r_statev4_lock); 631 just_been_created = 0; 632 } 633 634 /* 635 * If caller specified O_TRUNC/FTRUNC, then be sure to set 636 * FWRITE (to drive successful setattr(size=0) after open) 637 */ 638 if (flag & FTRUNC) 639 flag |= FWRITE; 640 641 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 642 just_been_created); 643 644 if (!error && !((*vpp)->v_flag & VROOT)) 645 dnlc_update(dvp, fn, *vpp); 646 647 /* release the hold from vtodv */ 648 VN_RELE(dvp); 649 650 /* exchange the shadow for the master vnode, if needed */ 651 652 if (error == 0 && IS_SHADOW(*vpp, rp)) 653 sv_exchange(vpp); 654 655 return (error); 656 } 657 658 /* 659 * See if there's a "lost open" request to be saved and recovered. 660 */ 661 static void 662 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 663 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 664 vnode_t *dvp, OPEN4cargs *open_args) 665 { 666 vfs_t *vfsp; 667 char *srccfp; 668 669 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 670 671 if (error != ETIMEDOUT && error != EINTR && 672 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 673 lost_rqstp->lr_op = 0; 674 return; 675 } 676 677 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 678 "nfs4open_save_lost_rqst: error %d", error)); 679 680 lost_rqstp->lr_op = OP_OPEN; 681 /* 682 * The vp (if it is not NULL) and dvp are held and rele'd via 683 * the recovery code. See nfs4_save_lost_rqst. 
684 */ 685 lost_rqstp->lr_vp = vp; 686 lost_rqstp->lr_dvp = dvp; 687 lost_rqstp->lr_oop = oop; 688 lost_rqstp->lr_osp = NULL; 689 lost_rqstp->lr_lop = NULL; 690 lost_rqstp->lr_cr = cr; 691 lost_rqstp->lr_flk = NULL; 692 lost_rqstp->lr_oacc = open_args->share_access; 693 lost_rqstp->lr_odeny = open_args->share_deny; 694 lost_rqstp->lr_oclaim = open_args->claim; 695 if (open_args->claim == CLAIM_DELEGATE_CUR) { 696 lost_rqstp->lr_ostateid = 697 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 698 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 699 } else { 700 srccfp = open_args->open_claim4_u.cfile; 701 } 702 lost_rqstp->lr_ofile.utf8string_len = 0; 703 lost_rqstp->lr_ofile.utf8string_val = NULL; 704 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 705 lost_rqstp->lr_putfirst = FALSE; 706 } 707 708 struct nfs4_excl_time { 709 uint32 seconds; 710 uint32 nseconds; 711 }; 712 713 /* 714 * The OPEN operation creates and/or opens a regular file 715 * 716 * ARGSUSED 717 */ 718 static int 719 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 720 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 721 enum createmode4 createmode, int file_just_been_created) 722 { 723 rnode4_t *rp; 724 rnode4_t *drp = VTOR4(dvp); 725 vnode_t *vp = NULL; 726 vnode_t *vpi = *vpp; 727 bool_t needrecov = FALSE; 728 729 int doqueue = 1; 730 731 COMPOUND4args_clnt args; 732 COMPOUND4res_clnt res; 733 nfs_argop4 *argop; 734 nfs_resop4 *resop; 735 int argoplist_size; 736 int idx_open, idx_fattr; 737 738 GETFH4res *gf_res = NULL; 739 OPEN4res *op_res = NULL; 740 nfs4_ga_res_t *garp; 741 fattr4 *attr = NULL; 742 struct nfs4_excl_time verf; 743 bool_t did_excl_setup = FALSE; 744 int created_osp; 745 746 OPEN4cargs *open_args; 747 nfs4_open_owner_t *oop = NULL; 748 nfs4_open_stream_t *osp = NULL; 749 seqid4 seqid = 0; 750 bool_t retry_open = FALSE; 751 nfs4_recov_state_t recov_state; 752 nfs4_lost_rqst_t lost_rqst; 753 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 754 hrtime_t t; 755 int acc = 0; 756 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 757 cred_t *ncr = NULL; 758 759 nfs4_sharedfh_t *otw_sfh; 760 nfs4_sharedfh_t *orig_sfh; 761 int fh_differs = 0; 762 int numops, setgid_flag; 763 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 764 765 /* 766 * Make sure we properly deal with setting the right gid on 767 * a newly created file to reflect the parent's setgid bit 768 */ 769 setgid_flag = 0; 770 if (create_flag && in_va) { 771 772 /* 773 * If the parent's directory has the setgid bit set 774 * _and_ the client was able to get a valid mapping 775 * for the parent dir's owner_group, we want to 776 * append NVERIFY(owner_group == dva.va_gid) and 777 * SETATTR to the CREATE compound. 778 */ 779 mutex_enter(&drp->r_statelock); 780 if (drp->r_attr.va_mode & VSGID && 781 drp->r_attr.va_gid != GID_NOBODY) { 782 in_va->va_gid = drp->r_attr.va_gid; 783 setgid_flag = 1; 784 } 785 mutex_exit(&drp->r_statelock); 786 } 787 788 /* 789 * Normal/non-create compound: 790 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 791 * 792 * Open(create) compound no setgid: 793 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 794 * RESTOREFH + GETATTR 795 * 796 * Open(create) setgid: 797 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 798 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 799 * NVERIFY(grp) + SETATTR 800 */ 801 if (setgid_flag) { 802 numops = 10; 803 idx_open = 1; 804 idx_fattr = 3; 805 } else if (create_flag) { 806 numops = 7; 807 idx_open = 2; 808 idx_fattr = 4; 809 } else { 810 numops = 4; 811 idx_open = 1; 812 idx_fattr = 3; 813 } 814 815 args.array_len = numops; 816 argoplist_size = numops * sizeof (nfs_argop4); 817 argop = kmem_alloc(argoplist_size, KM_SLEEP); 818 819 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 820 "open %s open flag 0x%x cred %p", file_name, open_flag, 821 (void *)cr)); 822 823 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 824 if 
(create_flag) { 825 /* 826 * We are to create a file. Initialize the passed in vnode 827 * pointer. 828 */ 829 vpi = NULL; 830 } else { 831 /* 832 * Check to see if the client owns a read delegation and is 833 * trying to open for write. If so, then return the delegation 834 * to avoid the server doing a cb_recall and returning DELAY. 835 * NB - we don't use the statev4_lock here because we'd have 836 * to drop the lock anyway and the result would be stale. 837 */ 838 if ((open_flag & FWRITE) && 839 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 840 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 841 842 /* 843 * If the file has a delegation, then do an access check up 844 * front. This avoids having to an access check later after 845 * we've already done start_op, which could deadlock. 846 */ 847 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 848 if (open_flag & FREAD && 849 nfs4_access(vpi, VREAD, 0, cr) == 0) 850 acc |= VREAD; 851 if (open_flag & FWRITE && 852 nfs4_access(vpi, VWRITE, 0, cr) == 0) 853 acc |= VWRITE; 854 } 855 } 856 857 drp = VTOR4(dvp); 858 859 recov_state.rs_flags = 0; 860 recov_state.rs_num_retry_despite_err = 0; 861 cred_otw = cr; 862 863 recov_retry: 864 fh_differs = 0; 865 nfs4_error_zinit(&e); 866 867 /* argop is empty here */ 868 869 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 870 if (ncr != NULL) 871 crfree(ncr); 872 kmem_free(argop, argoplist_size); 873 return (EINTR); 874 } 875 876 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 877 if (e.error) { 878 nfs_rw_exit(&drp->r_rwlock); 879 if (ncr != NULL) 880 crfree(ncr); 881 kmem_free(argop, argoplist_size); 882 return (e.error); 883 } 884 885 args.ctag = TAG_OPEN; 886 args.array_len = numops; 887 args.array = argop; 888 889 /* putfh directory fh */ 890 argop[0].argop = OP_CPUTFH; 891 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 892 893 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 894 argop[idx_open].argop = OP_COPEN; 
895 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 896 open_args->claim = CLAIM_NULL; 897 898 /* name of file */ 899 open_args->open_claim4_u.cfile = file_name; 900 open_args->owner.owner_len = 0; 901 open_args->owner.owner_val = NULL; 902 903 if (create_flag) { 904 /* CREATE a file */ 905 open_args->opentype = OPEN4_CREATE; 906 open_args->mode = createmode; 907 if (createmode == EXCLUSIVE4) { 908 if (did_excl_setup == FALSE) { 909 verf.seconds = nfs_atoi(hw_serial); 910 if (verf.seconds != 0) 911 verf.nseconds = newnum(); 912 else { 913 timestruc_t now; 914 915 gethrestime(&now); 916 verf.seconds = now.tv_sec; 917 verf.nseconds = now.tv_nsec; 918 } 919 /* 920 * Since the server will use this value for the 921 * mtime, make sure that it can't overflow. Zero 922 * out the MSB. The actual value does not matter 923 * here, only its uniqeness. 924 */ 925 verf.seconds &= INT32_MAX; 926 did_excl_setup = TRUE; 927 } 928 929 /* Now copy over verifier to OPEN4args. */ 930 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 931 } else { 932 int v_error; 933 bitmap4 supp_attrs; 934 servinfo4_t *svp; 935 936 attr = &open_args->createhow4_u.createattrs; 937 938 svp = drp->r_server; 939 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 940 supp_attrs = svp->sv_supp_attrs; 941 nfs_rw_exit(&svp->sv_lock); 942 943 /* GUARDED4 or UNCHECKED4 */ 944 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 945 supp_attrs); 946 if (v_error) { 947 bzero(attr, sizeof (*attr)); 948 nfs4args_copen_free(open_args); 949 nfs_rw_exit(&drp->r_rwlock); 950 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 951 &recov_state, FALSE); 952 if (ncr != NULL) 953 crfree(ncr); 954 kmem_free(argop, argoplist_size); 955 return (v_error); 956 } 957 } 958 } else { 959 /* NO CREATE */ 960 open_args->opentype = OPEN4_NOCREATE; 961 } 962 963 if (recov_state.rs_sp != NULL) { 964 mutex_enter(&recov_state.rs_sp->s_lock); 965 open_args->owner.clientid = recov_state.rs_sp->clientid; 966 
mutex_exit(&recov_state.rs_sp->s_lock); 967 } else { 968 /* XXX should we just fail here? */ 969 open_args->owner.clientid = 0; 970 } 971 972 /* 973 * This increments oop's ref count or creates a temporary 'just_created' 974 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 975 * completes. 976 */ 977 mutex_enter(&VTOMI4(dvp)->mi_lock); 978 979 /* See if a permanent or just created open owner exists */ 980 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 981 if (!oop) { 982 /* 983 * This open owner does not exist so create a temporary 984 * just created one. 985 */ 986 oop = create_open_owner(cr, VTOMI4(dvp)); 987 ASSERT(oop != NULL); 988 } 989 mutex_exit(&VTOMI4(dvp)->mi_lock); 990 991 /* this length never changes, do alloc before seqid sync */ 992 open_args->owner.owner_len = sizeof (oop->oo_name); 993 open_args->owner.owner_val = 994 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 995 996 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 997 if (e.error == EAGAIN) { 998 open_owner_rele(oop); 999 nfs4args_copen_free(open_args); 1000 nfs_rw_exit(&drp->r_rwlock); 1001 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1002 if (ncr != NULL) { 1003 crfree(ncr); 1004 ncr = NULL; 1005 } 1006 goto recov_retry; 1007 } 1008 1009 /* Check to see if we need to do the OTW call */ 1010 if (!create_flag) { 1011 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1012 file_just_been_created, &e.error, acc, &recov_state)) { 1013 1014 /* 1015 * The OTW open is not necessary. Either 1016 * the open can succeed without it (eg. 1017 * delegation, error == 0) or the open 1018 * must fail due to an access failure 1019 * (error != 0). In either case, tidy 1020 * up and return. 
1021 */ 1022 1023 nfs4_end_open_seqid_sync(oop); 1024 open_owner_rele(oop); 1025 nfs4args_copen_free(open_args); 1026 nfs_rw_exit(&drp->r_rwlock); 1027 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1028 if (ncr != NULL) 1029 crfree(ncr); 1030 kmem_free(argop, argoplist_size); 1031 return (e.error); 1032 } 1033 } 1034 1035 bcopy(&oop->oo_name, open_args->owner.owner_val, 1036 open_args->owner.owner_len); 1037 1038 seqid = nfs4_get_open_seqid(oop) + 1; 1039 open_args->seqid = seqid; 1040 open_args->share_access = 0; 1041 if (open_flag & FREAD) 1042 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1043 if (open_flag & FWRITE) 1044 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1045 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1046 1047 1048 1049 /* 1050 * getfh w/sanity check for idx_open/idx_fattr 1051 */ 1052 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1053 argop[idx_open + 1].argop = OP_GETFH; 1054 1055 /* getattr */ 1056 argop[idx_fattr].argop = OP_GETATTR; 1057 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1058 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1059 1060 if (setgid_flag) { 1061 vattr_t _v; 1062 servinfo4_t *svp; 1063 bitmap4 supp_attrs; 1064 1065 svp = drp->r_server; 1066 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1067 supp_attrs = svp->sv_supp_attrs; 1068 nfs_rw_exit(&svp->sv_lock); 1069 1070 /* 1071 * For setgid case, we need to: 1072 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1073 */ 1074 argop[4].argop = OP_SAVEFH; 1075 1076 argop[5].argop = OP_CPUTFH; 1077 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1078 1079 argop[6].argop = OP_GETATTR; 1080 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1081 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1082 1083 argop[7].argop = OP_RESTOREFH; 1084 1085 /* 1086 * nverify 1087 */ 1088 _v.va_mask = AT_GID; 1089 _v.va_gid = in_va->va_gid; 1090 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 
1091 supp_attrs))) { 1092 1093 /* 1094 * setattr 1095 * 1096 * We _know_ we're not messing with AT_SIZE or 1097 * AT_XTIME, so no need for stateid or flags. 1098 * Also we specify NULL rp since we're only 1099 * interested in setting owner_group attributes. 1100 */ 1101 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1102 supp_attrs, &e.error, 0); 1103 if (e.error) 1104 nfs4args_verify_free(&argop[8]); 1105 } 1106 1107 if (e.error) { 1108 /* 1109 * XXX - Revisit the last argument to nfs4_end_op() 1110 * once 5020486 is fixed. 1111 */ 1112 nfs4_end_open_seqid_sync(oop); 1113 open_owner_rele(oop); 1114 nfs4args_copen_free(open_args); 1115 nfs_rw_exit(&drp->r_rwlock); 1116 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1117 if (ncr != NULL) 1118 crfree(ncr); 1119 kmem_free(argop, argoplist_size); 1120 return (e.error); 1121 } 1122 } else if (create_flag) { 1123 /* 1124 * For setgid case, we need to: 1125 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1126 */ 1127 argop[1].argop = OP_SAVEFH; 1128 1129 argop[5].argop = OP_RESTOREFH; 1130 1131 argop[6].argop = OP_GETATTR; 1132 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1133 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1134 } 1135 1136 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1137 "nfs4open_otw: %s call, nm %s, rp %s", 1138 needrecov ? 
"recov" : "first", file_name, 1139 rnode4info(VTOR4(dvp)))); 1140 1141 t = gethrtime(); 1142 1143 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1144 1145 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1146 nfs4_set_open_seqid(seqid, oop, args.ctag); 1147 1148 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1149 1150 if (e.error || needrecov) { 1151 bool_t abort = FALSE; 1152 1153 if (needrecov) { 1154 nfs4_bseqid_entry_t *bsep = NULL; 1155 1156 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1157 cred_otw, vpi, dvp, open_args); 1158 1159 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1160 bsep = nfs4_create_bseqid_entry(oop, NULL, 1161 vpi, 0, args.ctag, open_args->seqid); 1162 num_bseqid_retry--; 1163 } 1164 1165 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1166 NULL, lost_rqst.lr_op == OP_OPEN ? 1167 &lost_rqst : NULL, OP_OPEN, bsep); 1168 1169 if (bsep) 1170 kmem_free(bsep, sizeof (*bsep)); 1171 /* give up if we keep getting BAD_SEQID */ 1172 if (num_bseqid_retry == 0) 1173 abort = TRUE; 1174 if (abort == TRUE && e.error == 0) 1175 e.error = geterrno4(res.status); 1176 } 1177 nfs4_end_open_seqid_sync(oop); 1178 open_owner_rele(oop); 1179 nfs_rw_exit(&drp->r_rwlock); 1180 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1181 nfs4args_copen_free(open_args); 1182 if (setgid_flag) { 1183 nfs4args_verify_free(&argop[8]); 1184 nfs4args_setattr_free(&argop[9]); 1185 } 1186 if (!e.error) 1187 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1188 if (ncr != NULL) { 1189 crfree(ncr); 1190 ncr = NULL; 1191 } 1192 if (!needrecov || abort == TRUE || e.error == EINTR || 1193 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1194 kmem_free(argop, argoplist_size); 1195 return (e.error); 1196 } 1197 goto recov_retry; 1198 } 1199 1200 /* 1201 * Will check and update lease after checking the rflag for 1202 * OPEN_CONFIRM in the successful OPEN call. 
1203 */ 1204 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1205 1206 /* 1207 * XXX what if we're crossing mount points from server1:/drp 1208 * to server2:/drp/rp. 1209 */ 1210 1211 /* Signal our end of use of the open seqid */ 1212 nfs4_end_open_seqid_sync(oop); 1213 1214 /* 1215 * This will destroy the open owner if it was just created, 1216 * and no one else has put a reference on it. 1217 */ 1218 open_owner_rele(oop); 1219 if (create_flag && (createmode != EXCLUSIVE4) && 1220 res.status == NFS4ERR_BADOWNER) 1221 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1222 1223 e.error = geterrno4(res.status); 1224 nfs4args_copen_free(open_args); 1225 if (setgid_flag) { 1226 nfs4args_verify_free(&argop[8]); 1227 nfs4args_setattr_free(&argop[9]); 1228 } 1229 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1230 nfs_rw_exit(&drp->r_rwlock); 1231 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1232 /* 1233 * If the reply is NFS4ERR_ACCESS, it may be because 1234 * we are root (no root net access). If the real uid 1235 * is not root, then retry with the real uid instead. 
1236 */ 1237 if (ncr != NULL) { 1238 crfree(ncr); 1239 ncr = NULL; 1240 } 1241 if (res.status == NFS4ERR_ACCESS && 1242 (ncr = crnetadjust(cred_otw)) != NULL) { 1243 cred_otw = ncr; 1244 goto recov_retry; 1245 } 1246 kmem_free(argop, argoplist_size); 1247 return (e.error); 1248 } 1249 1250 resop = &res.array[idx_open]; /* open res */ 1251 op_res = &resop->nfs_resop4_u.opopen; 1252 1253 #ifdef DEBUG 1254 /* 1255 * verify attrset bitmap 1256 */ 1257 if (create_flag && 1258 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1259 /* make sure attrset returned is what we asked for */ 1260 /* XXX Ignore this 'error' for now */ 1261 if (attr->attrmask != op_res->attrset) 1262 /* EMPTY */; 1263 } 1264 #endif 1265 1266 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1267 mutex_enter(&VTOMI4(dvp)->mi_lock); 1268 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1269 mutex_exit(&VTOMI4(dvp)->mi_lock); 1270 } 1271 1272 resop = &res.array[idx_open + 1]; /* getfh res */ 1273 gf_res = &resop->nfs_resop4_u.opgetfh; 1274 1275 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1276 1277 /* 1278 * The open stateid has been updated on the server but not 1279 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1280 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1281 * WRITE call. That, however, will use the old stateid, so go ahead 1282 * and upate the open stateid now, before any call to makenfs4node. 1283 */ 1284 if (vpi) { 1285 nfs4_open_stream_t *tmp_osp; 1286 rnode4_t *tmp_rp = VTOR4(vpi); 1287 1288 tmp_osp = find_open_stream(oop, tmp_rp); 1289 if (tmp_osp) { 1290 tmp_osp->open_stateid = op_res->stateid; 1291 mutex_exit(&tmp_osp->os_sync_lock); 1292 open_stream_rele(tmp_osp, tmp_rp); 1293 } 1294 1295 /* 1296 * We must determine if the file handle given by the otw open 1297 * is the same as the file handle which was passed in with 1298 * *vpp. 
This case can be reached if the file we are trying 1299 * to open has been removed and another file has been created 1300 * having the same file name. The passed in vnode is released 1301 * later. 1302 */ 1303 orig_sfh = VTOR4(vpi)->r_fh; 1304 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1305 } 1306 1307 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1308 1309 if (create_flag || fh_differs) { 1310 int rnode_err = 0; 1311 1312 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1313 dvp, fn_get(VTOSV(dvp)->sv_name, file_name)); 1314 1315 if (e.error) 1316 PURGE_ATTRCACHE4(vp); 1317 /* 1318 * For the newly created vp case, make sure the rnode 1319 * isn't bad before using it. 1320 */ 1321 mutex_enter(&(VTOR4(vp))->r_statelock); 1322 if (VTOR4(vp)->r_flags & R4RECOVERR) 1323 rnode_err = EIO; 1324 mutex_exit(&(VTOR4(vp))->r_statelock); 1325 1326 if (rnode_err) { 1327 nfs4_end_open_seqid_sync(oop); 1328 nfs4args_copen_free(open_args); 1329 if (setgid_flag) { 1330 nfs4args_verify_free(&argop[8]); 1331 nfs4args_setattr_free(&argop[9]); 1332 } 1333 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1334 nfs_rw_exit(&drp->r_rwlock); 1335 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1336 needrecov); 1337 open_owner_rele(oop); 1338 VN_RELE(vp); 1339 if (ncr != NULL) 1340 crfree(ncr); 1341 sfh4_rele(&otw_sfh); 1342 kmem_free(argop, argoplist_size); 1343 return (EIO); 1344 } 1345 } else { 1346 vp = vpi; 1347 } 1348 sfh4_rele(&otw_sfh); 1349 1350 /* 1351 * It seems odd to get a full set of attrs and then not update 1352 * the object's attrcache in the non-create case. Create case uses 1353 * the attrs since makenfs4node checks to see if the attrs need to 1354 * be updated (and then updates them). The non-create case should 1355 * update attrs also. 1356 */ 1357 if (! create_flag && ! 
fh_differs && !e.error) { 1358 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1359 } 1360 1361 nfs4_error_zinit(&e); 1362 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1363 /* This does not do recovery for vp explicitly. */ 1364 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1365 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1366 1367 if (e.error || e.stat) { 1368 nfs4_end_open_seqid_sync(oop); 1369 nfs4args_copen_free(open_args); 1370 if (setgid_flag) { 1371 nfs4args_verify_free(&argop[8]); 1372 nfs4args_setattr_free(&argop[9]); 1373 } 1374 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1375 nfs_rw_exit(&drp->r_rwlock); 1376 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1377 needrecov); 1378 open_owner_rele(oop); 1379 if (create_flag || fh_differs) { 1380 /* rele the makenfs4node */ 1381 VN_RELE(vp); 1382 } 1383 if (ncr != NULL) { 1384 crfree(ncr); 1385 ncr = NULL; 1386 } 1387 if (retry_open == TRUE) { 1388 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1389 "nfs4open_otw: retry the open since OPEN " 1390 "CONFIRM failed with error %d stat %d", 1391 e.error, e.stat)); 1392 if (create_flag && createmode == GUARDED4) { 1393 NFS4_DEBUG(nfs4_client_recov_debug, 1394 (CE_NOTE, "nfs4open_otw: switch " 1395 "createmode from GUARDED4 to " 1396 "UNCHECKED4")); 1397 createmode = UNCHECKED4; 1398 } 1399 goto recov_retry; 1400 } 1401 if (!e.error) { 1402 if (create_flag && (createmode != EXCLUSIVE4) && 1403 e.stat == NFS4ERR_BADOWNER) 1404 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1405 1406 e.error = geterrno4(e.stat); 1407 } 1408 kmem_free(argop, argoplist_size); 1409 return (e.error); 1410 } 1411 } 1412 1413 rp = VTOR4(vp); 1414 1415 mutex_enter(&rp->r_statev4_lock); 1416 if (create_flag) 1417 rp->created_v4 = 1; 1418 mutex_exit(&rp->r_statev4_lock); 1419 1420 mutex_enter(&oop->oo_lock); 1421 /* Doesn't matter if 'oo_just_created' already was set as this */ 1422 oop->oo_just_created = NFS4_PERM_CREATED; 1423 if (oop->oo_cred_otw) 1424 
crfree(oop->oo_cred_otw); 1425 oop->oo_cred_otw = cred_otw; 1426 crhold(oop->oo_cred_otw); 1427 mutex_exit(&oop->oo_lock); 1428 1429 /* returns with 'os_sync_lock' held */ 1430 osp = find_or_create_open_stream(oop, rp, &created_osp); 1431 if (!osp) { 1432 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1433 "nfs4open_otw: failed to create an open stream")); 1434 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1435 "signal our end of use of the open seqid")); 1436 1437 nfs4_end_open_seqid_sync(oop); 1438 open_owner_rele(oop); 1439 nfs4args_copen_free(open_args); 1440 if (setgid_flag) { 1441 nfs4args_verify_free(&argop[8]); 1442 nfs4args_setattr_free(&argop[9]); 1443 } 1444 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1445 nfs_rw_exit(&drp->r_rwlock); 1446 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1447 if (create_flag || fh_differs) 1448 VN_RELE(vp); 1449 if (ncr != NULL) 1450 crfree(ncr); 1451 1452 kmem_free(argop, argoplist_size); 1453 return (EINVAL); 1454 1455 } 1456 1457 osp->open_stateid = op_res->stateid; 1458 1459 if (open_flag & FREAD) 1460 osp->os_share_acc_read++; 1461 if (open_flag & FWRITE) 1462 osp->os_share_acc_write++; 1463 osp->os_share_deny_none++; 1464 1465 /* 1466 * Need to reset this bitfield for the possible case where we were 1467 * going to OTW CLOSE the file, got a non-recoverable error, and before 1468 * we could retry the CLOSE, OPENed the file again. 
1469 */ 1470 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1471 osp->os_final_close = 0; 1472 osp->os_force_close = 0; 1473 #ifdef DEBUG 1474 if (osp->os_failed_reopen) 1475 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1476 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1477 (void *)osp, (void *)cr, rnode4info(rp))); 1478 #endif 1479 osp->os_failed_reopen = 0; 1480 1481 mutex_exit(&osp->os_sync_lock); 1482 1483 nfs4_end_open_seqid_sync(oop); 1484 1485 if (created_osp && recov_state.rs_sp != NULL) { 1486 mutex_enter(&recov_state.rs_sp->s_lock); 1487 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1488 mutex_exit(&recov_state.rs_sp->s_lock); 1489 } 1490 1491 /* get rid of our reference to find oop */ 1492 open_owner_rele(oop); 1493 1494 open_stream_rele(osp, rp); 1495 1496 /* accept delegation, if any */ 1497 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1498 1499 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1500 1501 if (createmode == EXCLUSIVE4 && 1502 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1503 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1504 " EXCLUSIVE4: sending a SETATTR")); 1505 /* 1506 * If doing an exclusive create, then generate 1507 * a SETATTR to set the initial attributes. 1508 * Try to set the mtime and the atime to the 1509 * server's current time. It is somewhat 1510 * expected that these fields will be used to 1511 * store the exclusive create cookie. If not, 1512 * server implementors will need to know that 1513 * a SETATTR will follow an exclusive create 1514 * and the cookie should be destroyed if 1515 * appropriate. 1516 * 1517 * The AT_GID and AT_SIZE bits are turned off 1518 * so that the SETATTR request will not attempt 1519 * to process these. The gid will be set 1520 * separately if appropriate. 
The size is turned 1521 * off because it is assumed that a new file will 1522 * be created empty and if the file wasn't empty, 1523 * then the exclusive create will have failed 1524 * because the file must have existed already. 1525 * Therefore, no truncate operation is needed. 1526 */ 1527 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1528 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1529 1530 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1531 if (e.error) { 1532 /* 1533 * Couldn't correct the attributes of 1534 * the newly created file and the 1535 * attributes are wrong. Remove the 1536 * file and return an error to the 1537 * application. 1538 */ 1539 /* XXX will this take care of client state ? */ 1540 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1541 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1542 " remove file", e.error)); 1543 VN_RELE(vp); 1544 (void) nfs4_remove(dvp, file_name, cr); 1545 nfs_rw_exit(&drp->r_rwlock); 1546 goto skip_rwlock_exit; 1547 } 1548 } 1549 1550 /* 1551 * If we created or found the correct vnode, due to create_flag or 1552 * fh_differs being set, then update directory cache attribute, readdir 1553 * and dnlc caches. 1554 */ 1555 if (create_flag || fh_differs) { 1556 dirattr_info_t dinfo, *dinfop; 1557 1558 /* 1559 * Make sure getattr succeeded before using results. 1560 * note: op 7 is getattr(dir) for both flavors of 1561 * open(create). 1562 */ 1563 if (create_flag && res.status == NFS4_OK) { 1564 dinfo.di_time_call = t; 1565 dinfo.di_cred = cr; 1566 dinfo.di_garp = 1567 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1568 dinfop = &dinfo; 1569 } else { 1570 dinfop = NULL; 1571 } 1572 1573 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1574 dinfop); 1575 } 1576 nfs_rw_exit(&drp->r_rwlock); 1577 skip_rwlock_exit: 1578 1579 /* 1580 * If the page cache for this file was flushed from actions 1581 * above, it was done asynchronously and if that is true, 1582 * there is a need to wait here for it to complete. 
This must 1583 * be done outside of start_fop/end_fop. 1584 */ 1585 (void) nfs4_waitfor_purge_complete(vp); 1586 1587 /* 1588 * It is implicit that we are in the open case (create_flag == 0) since 1589 * fh_differs can only be set to a non-zero value in the open case. 1590 */ 1591 if (fh_differs != 0 && vpi != NULL) 1592 VN_RELE(vpi); 1593 1594 /* 1595 * Be sure to set *vpp to the correct value before returning. 1596 */ 1597 *vpp = vp; 1598 1599 nfs4args_copen_free(open_args); 1600 if (setgid_flag) { 1601 nfs4args_verify_free(&argop[8]); 1602 nfs4args_setattr_free(&argop[9]); 1603 } 1604 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1605 1606 if (ncr) 1607 crfree(ncr); 1608 kmem_free(argop, argoplist_size); 1609 return (e.error); 1610 } 1611 1612 /* 1613 * Reopen an open instance. cf. nfs4open_otw(). 1614 * 1615 * Errors are returned by the nfs4_error_t parameter. 1616 * - ep->error contains an errno value or zero. 1617 * - if it is zero, ep->stat is set to an NFS status code, if any. 1618 * If the file could not be reopened, but the caller should continue, the 1619 * file is marked dead and no error values are returned. If the caller 1620 * should stop recovering open files and start over, either the ep->error 1621 * value or ep->stat will indicate an error (either something that requires 1622 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile 1623 * filehandles) may be handled silently by this routine. 1624 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1625 * will be started, so the caller should not do it. 1626 * 1627 * Gotos: 1628 * - kill_file : reopen failed in such a fashion to constitute marking the 1629 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1630 * is for cases where recovery is not possible. 1631 * - failed_reopen : same as above, except that the file has already been 1632 * marked dead, so no need to do it again. 
1633 * - bailout : reopen failed but we are able to recover and retry the reopen - 1634 * either within this function immediately or via the calling function. 1635 */ 1636 1637 void 1638 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1639 open_claim_type4 claim, bool_t frc_use_claim_previous, 1640 bool_t is_recov) 1641 { 1642 COMPOUND4args_clnt args; 1643 COMPOUND4res_clnt res; 1644 nfs_argop4 argop[4]; 1645 nfs_resop4 *resop; 1646 OPEN4res *op_res = NULL; 1647 OPEN4cargs *open_args; 1648 GETFH4res *gf_res; 1649 rnode4_t *rp = VTOR4(vp); 1650 int doqueue = 1; 1651 cred_t *cr = NULL, *cred_otw = NULL; 1652 nfs4_open_owner_t *oop = NULL; 1653 seqid4 seqid; 1654 nfs4_ga_res_t *garp; 1655 char fn[MAXNAMELEN]; 1656 nfs4_recov_state_t recov = {NULL, 0}; 1657 nfs4_lost_rqst_t lost_rqst; 1658 mntinfo4_t *mi = VTOMI4(vp); 1659 bool_t abort; 1660 char *failed_msg = ""; 1661 int fh_different; 1662 hrtime_t t; 1663 nfs4_bseqid_entry_t *bsep = NULL; 1664 1665 ASSERT(nfs4_consistent_type(vp)); 1666 ASSERT(nfs_zone() == mi->mi_zone); 1667 1668 nfs4_error_zinit(ep); 1669 1670 /* this is the cred used to find the open owner */ 1671 cr = state_to_cred(osp); 1672 if (cr == NULL) { 1673 failed_msg = "Couldn't reopen: no cred"; 1674 goto kill_file; 1675 } 1676 /* use this cred for OTW operations */ 1677 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1678 1679 top: 1680 nfs4_error_zinit(ep); 1681 1682 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1683 /* File system has been unmounted, quit */ 1684 ep->error = EIO; 1685 failed_msg = "Couldn't reopen: file system has been unmounted"; 1686 goto kill_file; 1687 } 1688 1689 oop = osp->os_open_owner; 1690 1691 ASSERT(oop != NULL); 1692 if (oop == NULL) { /* be defensive in non-DEBUG */ 1693 failed_msg = "can't reopen: no open owner"; 1694 goto kill_file; 1695 } 1696 open_owner_hold(oop); 1697 1698 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1699 if (ep->error) { 1700 open_owner_rele(oop); 1701 oop = NULL; 
1702 goto bailout; 1703 } 1704 1705 /* 1706 * If the rnode has a delegation and the delegation has been 1707 * recovered and the server didn't request a recall and the caller 1708 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1709 * recovery) and the rnode hasn't been marked dead, then install 1710 * the delegation stateid in the open stream. Otherwise, proceed 1711 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1712 */ 1713 mutex_enter(&rp->r_statev4_lock); 1714 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1715 !rp->r_deleg_return_pending && 1716 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1717 !rp->r_deleg_needs_recall && 1718 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1719 !(rp->r_flags & R4RECOVERR)) { 1720 mutex_enter(&osp->os_sync_lock); 1721 osp->os_delegation = 1; 1722 osp->open_stateid = rp->r_deleg_stateid; 1723 mutex_exit(&osp->os_sync_lock); 1724 mutex_exit(&rp->r_statev4_lock); 1725 goto bailout; 1726 } 1727 mutex_exit(&rp->r_statev4_lock); 1728 1729 /* 1730 * If the file failed recovery, just quit. This failure need not 1731 * affect other reopens, so don't return an error. 1732 */ 1733 mutex_enter(&rp->r_statelock); 1734 if (rp->r_flags & R4RECOVERR) { 1735 mutex_exit(&rp->r_statelock); 1736 ep->error = 0; 1737 goto failed_reopen; 1738 } 1739 mutex_exit(&rp->r_statelock); 1740 1741 /* 1742 * argop is empty here 1743 * 1744 * PUTFH, OPEN, GETATTR 1745 */ 1746 args.ctag = TAG_REOPEN; 1747 args.array_len = 4; 1748 args.array = argop; 1749 1750 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1751 "nfs4_reopen: file is type %d, id %s", 1752 vp->v_type, rnode4info(VTOR4(vp)))); 1753 1754 argop[0].argop = OP_CPUTFH; 1755 1756 if (claim != CLAIM_PREVIOUS) { 1757 /* 1758 * if this is a file mount then 1759 * use the mntinfo parentfh 1760 */ 1761 argop[0].nfs_argop4_u.opcputfh.sfh = 1762 (vp->v_flag & VROOT) ? 
mi->mi_srvparentfh : 1763 VTOSV(vp)->sv_dfh; 1764 } else { 1765 /* putfh fh to reopen */ 1766 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1767 } 1768 1769 argop[1].argop = OP_COPEN; 1770 open_args = &argop[1].nfs_argop4_u.opcopen; 1771 open_args->claim = claim; 1772 1773 if (claim == CLAIM_NULL) { 1774 1775 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1776 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1777 "failed for vp 0x%p for CLAIM_NULL with %m", 1778 (void *)vp); 1779 failed_msg = "Couldn't reopen: vtoname failed for " 1780 "CLAIM_NULL"; 1781 /* nothing allocated yet */ 1782 goto kill_file; 1783 } 1784 1785 open_args->open_claim4_u.cfile = fn; 1786 } else if (claim == CLAIM_PREVIOUS) { 1787 1788 /* 1789 * We have two cases to deal with here: 1790 * 1) We're being called to reopen files in order to satisfy 1791 * a lock operation request which requires us to explicitly 1792 * reopen files which were opened under a delegation. If 1793 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1794 * that case, frc_use_claim_previous is TRUE and we must 1795 * use the rnode's current delegation type (r_deleg_type). 1796 * 2) We're reopening files during some form of recovery. 1797 * In this case, frc_use_claim_previous is FALSE and we 1798 * use the delegation type appropriate for recovery 1799 * (r_deleg_needs_recovery). 1800 */ 1801 mutex_enter(&rp->r_statev4_lock); 1802 open_args->open_claim4_u.delegate_type = 1803 frc_use_claim_previous ? 
1804 rp->r_deleg_type : 1805 rp->r_deleg_needs_recovery; 1806 mutex_exit(&rp->r_statev4_lock); 1807 1808 } else if (claim == CLAIM_DELEGATE_CUR) { 1809 1810 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1811 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1812 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1813 "with %m", (void *)vp); 1814 failed_msg = "Couldn't reopen: vtoname failed for " 1815 "CLAIM_DELEGATE_CUR"; 1816 /* nothing allocated yet */ 1817 goto kill_file; 1818 } 1819 1820 mutex_enter(&rp->r_statev4_lock); 1821 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1822 rp->r_deleg_stateid; 1823 mutex_exit(&rp->r_statev4_lock); 1824 1825 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1826 } 1827 open_args->opentype = OPEN4_NOCREATE; 1828 open_args->owner.clientid = mi2clientid(mi); 1829 open_args->owner.owner_len = sizeof (oop->oo_name); 1830 open_args->owner.owner_val = 1831 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1832 bcopy(&oop->oo_name, open_args->owner.owner_val, 1833 open_args->owner.owner_len); 1834 open_args->share_access = 0; 1835 open_args->share_deny = 0; 1836 1837 mutex_enter(&osp->os_sync_lock); 1838 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1839 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1840 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1841 (void *)osp, (void *)rp, osp->os_share_acc_read, 1842 osp->os_share_acc_write, osp->os_open_ref_count, 1843 osp->os_mmap_read, osp->os_mmap_write, claim)); 1844 1845 if (osp->os_share_acc_read || osp->os_mmap_read) 1846 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1847 if (osp->os_share_acc_write || osp->os_mmap_write) 1848 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1849 if (osp->os_share_deny_read) 1850 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1851 if (osp->os_share_deny_write) 1852 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1853 mutex_exit(&osp->os_sync_lock); 1854 1855 
seqid = nfs4_get_open_seqid(oop) + 1; 1856 open_args->seqid = seqid; 1857 1858 /* Construct the getfh part of the compound */ 1859 argop[2].argop = OP_GETFH; 1860 1861 /* Construct the getattr part of the compound */ 1862 argop[3].argop = OP_GETATTR; 1863 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1864 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1865 1866 t = gethrtime(); 1867 1868 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1869 1870 if (ep->error) { 1871 if (!is_recov && !frc_use_claim_previous && 1872 (ep->error == EINTR || ep->error == ETIMEDOUT || 1873 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1874 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1875 cred_otw, vp, NULL, open_args); 1876 abort = nfs4_start_recovery(ep, 1877 VTOMI4(vp), vp, NULL, NULL, 1878 lost_rqst.lr_op == OP_OPEN ? 1879 &lost_rqst : NULL, OP_OPEN, NULL); 1880 nfs4args_copen_free(open_args); 1881 goto bailout; 1882 } 1883 1884 nfs4args_copen_free(open_args); 1885 1886 if (ep->error == EACCES && cred_otw != cr) { 1887 crfree(cred_otw); 1888 cred_otw = cr; 1889 crhold(cred_otw); 1890 nfs4_end_open_seqid_sync(oop); 1891 open_owner_rele(oop); 1892 oop = NULL; 1893 goto top; 1894 } 1895 if (ep->error == ETIMEDOUT) 1896 goto bailout; 1897 failed_msg = "Couldn't reopen: rpc error"; 1898 goto kill_file; 1899 } 1900 1901 if (nfs4_need_to_bump_seqid(&res)) 1902 nfs4_set_open_seqid(seqid, oop, args.ctag); 1903 1904 switch (res.status) { 1905 case NFS4_OK: 1906 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1907 mutex_enter(&rp->r_statelock); 1908 rp->r_delay_interval = 0; 1909 mutex_exit(&rp->r_statelock); 1910 } 1911 break; 1912 case NFS4ERR_BAD_SEQID: 1913 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1914 args.ctag, open_args->seqid); 1915 1916 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1917 NULL, lost_rqst.lr_op == OP_OPEN ? 
&lost_rqst : 1918 NULL, OP_OPEN, bsep); 1919 1920 nfs4args_copen_free(open_args); 1921 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1922 nfs4_end_open_seqid_sync(oop); 1923 open_owner_rele(oop); 1924 oop = NULL; 1925 kmem_free(bsep, sizeof (*bsep)); 1926 1927 goto kill_file; 1928 case NFS4ERR_NO_GRACE: 1929 nfs4args_copen_free(open_args); 1930 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1931 nfs4_end_open_seqid_sync(oop); 1932 open_owner_rele(oop); 1933 oop = NULL; 1934 if (claim == CLAIM_PREVIOUS) { 1935 /* 1936 * Retry as a plain open. We don't need to worry about 1937 * checking the changeinfo: it is acceptable for a 1938 * client to re-open a file and continue processing 1939 * (in the absence of locks). 1940 */ 1941 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1942 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1943 "will retry as CLAIM_NULL")); 1944 claim = CLAIM_NULL; 1945 nfs4_mi_kstat_inc_no_grace(mi); 1946 goto top; 1947 } 1948 failed_msg = 1949 "Couldn't reopen: tried reclaim outside grace period. 
"; 1950 goto kill_file; 1951 case NFS4ERR_GRACE: 1952 nfs4_set_grace_wait(mi); 1953 nfs4args_copen_free(open_args); 1954 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1955 nfs4_end_open_seqid_sync(oop); 1956 open_owner_rele(oop); 1957 oop = NULL; 1958 ep->error = nfs4_wait_for_grace(mi, &recov); 1959 if (ep->error != 0) 1960 goto bailout; 1961 goto top; 1962 case NFS4ERR_DELAY: 1963 nfs4_set_delay_wait(vp); 1964 nfs4args_copen_free(open_args); 1965 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1966 nfs4_end_open_seqid_sync(oop); 1967 open_owner_rele(oop); 1968 oop = NULL; 1969 ep->error = nfs4_wait_for_delay(vp, &recov); 1970 nfs4_mi_kstat_inc_delay(mi); 1971 if (ep->error != 0) 1972 goto bailout; 1973 goto top; 1974 case NFS4ERR_FHEXPIRED: 1975 /* recover filehandle and retry */ 1976 abort = nfs4_start_recovery(ep, 1977 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 1978 nfs4args_copen_free(open_args); 1979 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1980 nfs4_end_open_seqid_sync(oop); 1981 open_owner_rele(oop); 1982 oop = NULL; 1983 if (abort == FALSE) 1984 goto top; 1985 failed_msg = "Couldn't reopen: recovery aborted"; 1986 goto kill_file; 1987 case NFS4ERR_RESOURCE: 1988 case NFS4ERR_STALE_CLIENTID: 1989 case NFS4ERR_WRONGSEC: 1990 case NFS4ERR_EXPIRED: 1991 /* 1992 * Do not mark the file dead and let the calling 1993 * function initiate recovery. 
1994 */ 1995 nfs4args_copen_free(open_args); 1996 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1997 nfs4_end_open_seqid_sync(oop); 1998 open_owner_rele(oop); 1999 oop = NULL; 2000 goto bailout; 2001 case NFS4ERR_ACCESS: 2002 if (cred_otw != cr) { 2003 crfree(cred_otw); 2004 cred_otw = cr; 2005 crhold(cred_otw); 2006 nfs4args_copen_free(open_args); 2007 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2008 nfs4_end_open_seqid_sync(oop); 2009 open_owner_rele(oop); 2010 oop = NULL; 2011 goto top; 2012 } 2013 /* fall through */ 2014 default: 2015 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2016 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2017 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2018 rnode4info(VTOR4(vp)))); 2019 failed_msg = "Couldn't reopen: NFSv4 error"; 2020 nfs4args_copen_free(open_args); 2021 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2022 goto kill_file; 2023 } 2024 2025 resop = &res.array[1]; /* open res */ 2026 op_res = &resop->nfs_resop4_u.opopen; 2027 2028 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2029 2030 /* 2031 * Check if the path we reopened really is the same 2032 * file. We could end up in a situation where the file 2033 * was removed and a new file created with the same name. 
2034 */ 2035 resop = &res.array[2]; 2036 gf_res = &resop->nfs_resop4_u.opgetfh; 2037 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2038 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2039 if (fh_different) { 2040 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2041 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2042 /* Oops, we don't have the same file */ 2043 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2044 failed_msg = "Couldn't reopen: Persistent " 2045 "file handle changed"; 2046 else 2047 failed_msg = "Couldn't reopen: Volatile " 2048 "(no expire on open) file handle changed"; 2049 2050 nfs4args_copen_free(open_args); 2051 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2052 nfs_rw_exit(&mi->mi_fh_lock); 2053 goto kill_file; 2054 2055 } else { 2056 /* 2057 * We have volatile file handles that don't compare. 2058 * If the fids are the same then we assume that the 2059 * file handle expired but the rnode still refers to 2060 * the same file object. 2061 * 2062 * First check that we have fids or not. 2063 * If we don't we have a dumb server so we will 2064 * just assume every thing is ok for now. 2065 */ 2066 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2067 rp->r_attr.va_mask & AT_NODEID && 2068 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2069 /* 2070 * We have fids, but they don't 2071 * compare. So kill the file. 2072 */ 2073 failed_msg = 2074 "Couldn't reopen: file handle changed" 2075 " due to mismatched fids"; 2076 nfs4args_copen_free(open_args); 2077 (void) xdr_free(xdr_COMPOUND4res_clnt, 2078 (caddr_t)&res); 2079 nfs_rw_exit(&mi->mi_fh_lock); 2080 goto kill_file; 2081 } else { 2082 /* 2083 * We have volatile file handles that refers 2084 * to the same file (at least they have the 2085 * same fid) or we don't have fids so we 2086 * can't tell. :(. We'll be a kind and accepting 2087 * client so we'll update the rnode's file 2088 * handle with the otw handle. 
2089 * 2090 * We need to drop mi->mi_fh_lock since 2091 * sh4_update acquires it. Since there is 2092 * only one recovery thread there is no 2093 * race. 2094 */ 2095 nfs_rw_exit(&mi->mi_fh_lock); 2096 sfh4_update(rp->r_fh, &gf_res->object); 2097 } 2098 } 2099 } else { 2100 nfs_rw_exit(&mi->mi_fh_lock); 2101 } 2102 2103 ASSERT(nfs4_consistent_type(vp)); 2104 2105 /* 2106 * If the server wanted an OPEN_CONFIRM but that fails, just start 2107 * over. Presumably if there is a persistent error it will show up 2108 * when we resend the OPEN. 2109 */ 2110 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2111 bool_t retry_open = FALSE; 2112 2113 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2114 cred_otw, is_recov, &retry_open, 2115 oop, FALSE, ep, NULL); 2116 if (ep->error || ep->stat) { 2117 nfs4args_copen_free(open_args); 2118 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2119 nfs4_end_open_seqid_sync(oop); 2120 open_owner_rele(oop); 2121 oop = NULL; 2122 goto top; 2123 } 2124 } 2125 2126 mutex_enter(&osp->os_sync_lock); 2127 osp->open_stateid = op_res->stateid; 2128 osp->os_delegation = 0; 2129 /* 2130 * Need to reset this bitfield for the possible case where we were 2131 * going to OTW CLOSE the file, got a non-recoverable error, and before 2132 * we could retry the CLOSE, OPENed the file again. 
2133 */ 2134 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2135 osp->os_final_close = 0; 2136 osp->os_force_close = 0; 2137 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2138 osp->os_dc_openacc = open_args->share_access; 2139 mutex_exit(&osp->os_sync_lock); 2140 2141 nfs4_end_open_seqid_sync(oop); 2142 2143 /* accept delegation, if any */ 2144 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2145 2146 nfs4args_copen_free(open_args); 2147 2148 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2149 2150 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2151 2152 ASSERT(nfs4_consistent_type(vp)); 2153 2154 open_owner_rele(oop); 2155 crfree(cr); 2156 crfree(cred_otw); 2157 return; 2158 2159 kill_file: 2160 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2161 failed_reopen: 2162 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2163 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2164 (void *)osp, (void *)cr, rnode4info(rp))); 2165 mutex_enter(&osp->os_sync_lock); 2166 osp->os_failed_reopen = 1; 2167 mutex_exit(&osp->os_sync_lock); 2168 bailout: 2169 if (oop != NULL) { 2170 nfs4_end_open_seqid_sync(oop); 2171 open_owner_rele(oop); 2172 } 2173 if (cr != NULL) 2174 crfree(cr); 2175 if (cred_otw != NULL) 2176 crfree(cred_otw); 2177 } 2178 2179 /* for . and .. OPENs */ 2180 /* ARGSUSED */ 2181 static int 2182 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2183 { 2184 rnode4_t *rp; 2185 nfs4_ga_res_t gar; 2186 2187 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2188 2189 /* 2190 * If close-to-open consistency checking is turned off or 2191 * if there is no cached data, we can avoid 2192 * the over the wire getattr. Otherwise, force a 2193 * call to the server to get fresh attributes and to 2194 * check caches. This is required for close-to-open 2195 * consistency. 
2196 */ 2197 rp = VTOR4(*vpp); 2198 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2199 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2200 return (0); 2201 2202 gar.n4g_va.va_mask = AT_ALL; 2203 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2204 } 2205 2206 /* 2207 * CLOSE a file 2208 */ 2209 static int 2210 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) 2211 { 2212 rnode4_t *rp; 2213 int pc_err = 0; 2214 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2215 2216 /* 2217 * Remove client state for this (lockowner, file) pair. 2218 * Issue otw v4 call to have the server do the same. 2219 */ 2220 2221 rp = VTOR4(vp); 2222 2223 /* 2224 * zone_enter(2) prevents processes from changing zones with NFS files 2225 * open; if we happen to get here from the wrong zone we can't do 2226 * anything over the wire. 2227 */ 2228 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2229 /* 2230 * We could attempt to clean up locks, except we're sure 2231 * that the current process didn't acquire any locks on 2232 * the file: any attempt to lock a file belong to another zone 2233 * will fail, and one can't lock an NFS file and then change 2234 * zones, as that fails too. 2235 * 2236 * Returning an error here is the sane thing to do. A 2237 * subsequent call to VN_RELE() which translates to a 2238 * nfs4_inactive() will clean up state: if the zone of the 2239 * vnode's origin is still alive and kicking, the inactive 2240 * thread will handle the request (from the correct zone), and 2241 * everything (minus the OTW close call) should be OK. If the 2242 * zone is going away nfs4_async_inactive() will throw away 2243 * delegations, open streams and cached pages inline. 2244 */ 2245 return (EIO); 2246 } 2247 2248 /* 2249 * If we are using local locking for this filesystem, then 2250 * release all of the SYSV style record locks. Otherwise, 2251 * we are doing network locking and we need to release all 2252 * of the network locks. 
All of the locks held by this 2253 * process on this file are released no matter what the 2254 * incoming reference count is. 2255 */ 2256 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2257 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2258 cleanshares(vp, ttoproc(curthread)->p_pid); 2259 } else 2260 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2261 2262 if (e.error) 2263 return (e.error); 2264 2265 if (count > 1) 2266 return (0); 2267 2268 /* 2269 * If the file has been `unlinked', then purge the 2270 * DNLC so that this vnode will get reycled quicker 2271 * and the .nfs* file on the server will get removed. 2272 */ 2273 if (rp->r_unldvp != NULL) 2274 dnlc_purge_vp(vp); 2275 2276 /* 2277 * If the file was open for write and there are pages, 2278 * do a synchronous flush and commit of all of the 2279 * dirty and uncommitted pages. 2280 */ 2281 ASSERT(!e.error); 2282 if ((flag & FWRITE) && nfs4_has_pages(vp)) { 2283 pc_err = nfs4_putpage_commit(vp, 0, 0, cr); 2284 } 2285 2286 mutex_enter(&rp->r_statelock); 2287 e.error = rp->r_error; 2288 rp->r_error = 0; 2289 mutex_exit(&rp->r_statelock); 2290 2291 /* Check to see if we need to close the file */ 2292 2293 if (vp->v_type != VREG) 2294 return (pc_err ? pc_err : e.error); 2295 2296 /* Let nfs4close_one figure out if an OTW close is needed. */ 2297 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2298 2299 if (pc_err) 2300 return (pc_err); 2301 2302 return (e.error ? e.error : geterrno4(e.stat)); 2303 } 2304 2305 /* 2306 * Initialize *lost_rqstp. 
2307 */ 2308 2309 static void 2310 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2311 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2312 vnode_t *vp) 2313 { 2314 if (error != ETIMEDOUT && error != EINTR && 2315 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2316 lost_rqstp->lr_op = 0; 2317 return; 2318 } 2319 2320 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2321 "nfs4close_save_lost_rqst: error %d", error)); 2322 2323 lost_rqstp->lr_op = OP_CLOSE; 2324 /* 2325 * The vp is held and rele'd via the recovery code. 2326 * See nfs4_save_lost_rqst. 2327 */ 2328 lost_rqstp->lr_vp = vp; 2329 lost_rqstp->lr_dvp = NULL; 2330 lost_rqstp->lr_oop = oop; 2331 lost_rqstp->lr_osp = osp; 2332 ASSERT(osp != NULL); 2333 ASSERT(mutex_owned(&osp->os_sync_lock)); 2334 osp->os_pending_close = 1; 2335 lost_rqstp->lr_lop = NULL; 2336 lost_rqstp->lr_cr = cr; 2337 lost_rqstp->lr_flk = NULL; 2338 lost_rqstp->lr_putfirst = FALSE; 2339 } 2340 2341 /* 2342 * Assumes you already have the open seqid sync grabbed as well as the 2343 * 'os_sync_lock'. Note: this will release the open seqid sync and 2344 * 'os_sync_lock' if client recovery starts. Calling functions have to 2345 * be prepared to handle this. 2346 * 2347 * 'recov' is returned as 1 if the CLOSE operation detected client recovery 2348 * was needed and was started, and that the calling function should retry 2349 * this function; otherwise it is returned as 0. 2350 * 2351 * Errors are returned via the nfs4_error_t parameter. 
/*
 * Send the over-the-wire CLOSE for an open stream.  The compound is
 * PUTFH, GETATTR, CLOSE, using the open owner's next seqid and the
 * open stream's current stateid.
 *
 * Assumes you already have the open seqid sync grabbed as well as the
 * 'os_sync_lock'.  Note: this will release the open seqid sync and
 * 'os_sync_lock' if client recovery starts (in which case
 * '*did_start_seqid_syncp' and '*have_sync_lockp' are both cleared).
 * Calling functions have to be prepared to handle this.
 *
 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
 * was needed and was started, and that the calling function should retry
 * this function; otherwise it is returned as 0.
 *
 * On success the open stream is marked invalid (os_valid = 0), takes the
 * stateid from the CLOSE result, and loses the reference taken at OPEN.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
	nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
	nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	CLOSE4args *close_args;
	nfs_resop4 *resop;
	nfs_argop4 argop[3];
	int doqueue = 1;
	mntinfo4_t *mi;
	seqid4 seqid;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	nfs4_lost_rqst_t lost_rqst;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);

	ASSERT(MUTEX_HELD(&osp->os_sync_lock));

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));

	/* Only set this to 1 if recovery is started */
	*recov = 0;

	/* do the OTW call to close the file */

	/* the compound tag records why this CLOSE is being sent */
	if (close_type == CLOSE_RESEND)
		args.ctag = TAG_CLOSE_LOST;
	else if (close_type == CLOSE_AFTER_RESEND)
		args.ctag = TAG_CLOSE_UNDO;
	else
		args.ctag = TAG_CLOSE;

	args.array_len = 3;
	args.array = argop;

	vp = RTOV4(rp);

	mi = VTOMI4(vp);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr; its result is cached below if the CLOSE succeeds */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	argop[2].argop = OP_CLOSE;
	close_args = &argop[2].nfs_argop4_u.opclose;

	seqid = nfs4_get_open_seqid(oop) + 1;

	close_args->seqid = seqid;
	close_args->open_stateid = osp->open_stateid;

	/* needrecov is still FALSE here, so this always reports "first" */
	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/* timestamp taken before the call, passed to nfs4_attr_cache() */
	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	/* commit the seqid we used only when the reply says to bump it */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid(seqid, oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
	if (ep->error && !needrecov) {
		/*
		 * If there was an error and no recovery is to be done
		 * then set up the file to flush its cache if
		 * needed for the next caller.
		 */
		mutex_enter(&rp->r_statelock);
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags &= ~R4WRITEMODIFIED;
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (needrecov) {
		bool_t abort;
		nfs4_bseqid_entry_t *bsep = NULL;

		/*
		 * A resend is itself a replay of a lost request, so only
		 * first-time closes are recorded as (possibly) lost.
		 */
		if (close_type != CLOSE_RESEND)
			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
			    osp, cred_otw, vp);

		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
			    0, args.ctag, close_args->seqid);

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_otw: initiating recovery. error %d "
		    "res.status %d", ep->error, res.status));

		/*
		 * Drop the 'os_sync_lock' here so we don't hit
		 * a potential recursive mutex_enter via an
		 * 'open_stream_hold()'.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;
		/*
		 * lost_rqst is only initialized when close_type is not
		 * CLOSE_RESEND, hence the close_type test before lr_op.
		 */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (close_type != CLOSE_RESEND &&
		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
		    OP_CLOSE, bsep);

		/* drop open seq sync, and let the calling function regrab it */
		nfs4_end_open_seqid_sync(oop);
		*did_start_seqid_syncp = 0;

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
		/*
		 * For signals, the caller wants to quit, so don't say to
		 * retry.  For forced unmount, if it's a user thread, it
		 * wants to quit.  If it's a recovery thread, the retry
		 * will happen higher-up on the call stack.  Either way,
		 * don't say to retry.
		 */
		if (abort == FALSE && ep->error != EINTR &&
		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
		    close_type != CLOSE_RESEND &&
		    close_type != CLOSE_AFTER_RESEND)
			*recov = 1;
		else
			*recov = 0;

		/* the result is only freed when the call itself did not fail */
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* NFSv4-level failure that needs no recovery: just free the reply */
	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	mutex_enter(&rp->r_statev4_lock);
	rp->created_v4 = 0;
	mutex_exit(&rp->r_statev4_lock);

	/* res.array[2] is the CLOSE result (third op of the compound) */
	resop = &res.array[2];
	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
	osp->os_valid = 0;

	/*
	 * This removes the reference obtained at OPEN; ie, when the
	 * open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to the open
	 * stream which means the count cannot go to 0 with this
	 * decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;

	/* res.array[1] is the GETATTR result */
	if (!ep->error)
		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
	    " returning %d", ep->error));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
/*
 * Read from a regular file into 'uiop'.
 *
 * The caller must hold rp->r_rwlock as reader (asserted).  Returns
 * EISDIR for anything that is not VREG, EIO when called from a zone
 * other than the mount's zone, 0 for a zero-length request, EINVAL for
 * a negative start or end offset, and the saved rnode error (or EIO)
 * when the rnode is flagged R4RECOVERRP.
 *
 * Otherwise the data comes either straight from nfs4read() (caching
 * disabled / direct I/O) or from the page cache through segmap, one
 * MAXBSIZE window at a time, stopping at the cached file size.
 */
/* ARGSUSED */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/* operate on the rnode's own vnode rather than a shadow vnode */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/* an rnode flagged R4RECOVERRP fails the read with its saved error */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		/* no kernel buffer (NULL base): nfs4read() is given the uio */
		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* revalidate cached attributes/data before every window */
		if (error = nfs4_validate_caches(vp, cr))
			break;

		/* clip the transfer at the (cached) end of file */
		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (uint_t)diff;

		base = segmap_getmapflt(segkmap, vp, off + on, n, 1, S_READ);

		error = uiomove(base + on, n, UIO_READ, uiop);

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2613 */ 2614 mutex_enter(&rp->r_statelock); 2615 if (n + on == MAXBSIZE || 2616 uiop->uio_loffset == rp->r_size) 2617 flags = SM_DONTNEED; 2618 else 2619 flags = 0; 2620 mutex_exit(&rp->r_statelock); 2621 error = segmap_release(segkmap, base, flags); 2622 } else 2623 (void) segmap_release(segkmap, base, 0); 2624 } while (!error && uiop->uio_resid > 0); 2625 2626 return (error); 2627 } 2628 2629 /* ARGSUSED */ 2630 static int 2631 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2632 caller_context_t *ct) 2633 { 2634 rlim64_t limit = uiop->uio_llimit; 2635 rnode4_t *rp; 2636 u_offset_t off; 2637 caddr_t base; 2638 uint_t flags; 2639 int remainder; 2640 size_t n; 2641 int on; 2642 int error; 2643 int resid; 2644 u_offset_t offset; 2645 mntinfo4_t *mi; 2646 uint_t bsize; 2647 2648 rp = VTOR4(vp); 2649 2650 if (IS_SHADOW(vp, rp)) 2651 vp = RTOV4(rp); 2652 2653 if (vp->v_type != VREG) 2654 return (EISDIR); 2655 2656 mi = VTOMI4(vp); 2657 2658 if (nfs_zone() != mi->mi_zone) 2659 return (EIO); 2660 2661 if (uiop->uio_resid == 0) 2662 return (0); 2663 2664 mutex_enter(&rp->r_statelock); 2665 if (rp->r_flags & R4RECOVERRP) 2666 error = (rp->r_error ? rp->r_error : EIO); 2667 else 2668 error = 0; 2669 mutex_exit(&rp->r_statelock); 2670 if (error) 2671 return (error); 2672 2673 if (ioflag & FAPPEND) { 2674 struct vattr va; 2675 2676 /* 2677 * Must serialize if appending. 
2678 */ 2679 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2680 nfs_rw_exit(&rp->r_rwlock); 2681 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2682 INTR(vp))) 2683 return (EINTR); 2684 } 2685 2686 va.va_mask = AT_SIZE; 2687 error = nfs4getattr(vp, &va, cr); 2688 if (error) 2689 return (error); 2690 uiop->uio_loffset = va.va_size; 2691 } 2692 2693 offset = uiop->uio_loffset + uiop->uio_resid; 2694 2695 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2696 return (EINVAL); 2697 2698 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2699 limit = MAXOFFSET_T; 2700 2701 /* 2702 * Check to make sure that the process will not exceed 2703 * its limit on file size. It is okay to write up to 2704 * the limit, but not beyond. Thus, the write which 2705 * reaches the limit will be short and the next write 2706 * will return an error. 2707 */ 2708 remainder = 0; 2709 if (offset > uiop->uio_llimit) { 2710 remainder = offset - uiop->uio_llimit; 2711 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2712 if (uiop->uio_resid <= 0) { 2713 proc_t *p = ttoproc(curthread); 2714 2715 uiop->uio_resid += remainder; 2716 mutex_enter(&p->p_lock); 2717 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2718 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2719 mutex_exit(&p->p_lock); 2720 return (EFBIG); 2721 } 2722 } 2723 2724 /* update the change attribute, if we have a write delegation */ 2725 2726 mutex_enter(&rp->r_statev4_lock); 2727 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2728 rp->r_deleg_change++; 2729 2730 mutex_exit(&rp->r_statev4_lock); 2731 2732 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2733 return (EINTR); 2734 2735 /* 2736 * Bypass VM if caching has been disabled (e.g., locking) or if 2737 * using client-side direct I/O and the file is not mmap'd and 2738 * there are no cached pages. 
2739 */ 2740 if ((vp->v_flag & VNOCACHE) || 2741 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2742 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2743 size_t bufsize; 2744 int count; 2745 u_offset_t org_offset; 2746 stable_how4 stab_comm; 2747 nfs4_fwrite: 2748 if (rp->r_flags & R4STALE) { 2749 resid = uiop->uio_resid; 2750 offset = uiop->uio_loffset; 2751 error = rp->r_error; 2752 goto bottom; 2753 } 2754 2755 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2756 base = kmem_alloc(bufsize, KM_SLEEP); 2757 do { 2758 if (ioflag & FDSYNC) 2759 stab_comm = DATA_SYNC4; 2760 else 2761 stab_comm = FILE_SYNC4; 2762 resid = uiop->uio_resid; 2763 offset = uiop->uio_loffset; 2764 count = MIN(uiop->uio_resid, bufsize); 2765 org_offset = uiop->uio_loffset; 2766 error = uiomove(base, count, UIO_WRITE, uiop); 2767 if (!error) { 2768 error = nfs4write(vp, base, org_offset, 2769 count, cr, &stab_comm); 2770 if (!error) { 2771 mutex_enter(&rp->r_statelock); 2772 if (rp->r_size < uiop->uio_loffset) 2773 rp->r_size = uiop->uio_loffset; 2774 mutex_exit(&rp->r_statelock); 2775 } 2776 } 2777 } while (!error && uiop->uio_resid > 0); 2778 kmem_free(base, bufsize); 2779 goto bottom; 2780 } 2781 2782 bsize = vp->v_vfsp->vfs_bsize; 2783 2784 do { 2785 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2786 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2787 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2788 2789 resid = uiop->uio_resid; 2790 offset = uiop->uio_loffset; 2791 2792 if (rp->r_flags & R4STALE) { 2793 error = rp->r_error; 2794 break; 2795 } 2796 2797 /* 2798 * Don't create dirty pages faster than they 2799 * can be cleaned so that the system doesn't 2800 * get imbalanced. If the async queue is 2801 * maxed out, then wait for it to drain before 2802 * creating more dirty pages. Also, wait for 2803 * any threads doing pagewalks in the vop_getattr 2804 * entry points so that they don't block for 2805 * long periods. 
2806 */ 2807 mutex_enter(&rp->r_statelock); 2808 while ((mi->mi_max_threads != 0 && 2809 rp->r_awcount > 2 * mi->mi_max_threads) || 2810 rp->r_gcount > 0) 2811 cv_wait(&rp->r_cv, &rp->r_statelock); 2812 mutex_exit(&rp->r_statelock); 2813 2814 if (segmap_kpm) { 2815 int pon = uiop->uio_loffset & PAGEOFFSET; 2816 size_t pn = MIN(PAGESIZE - pon, uiop->uio_resid); 2817 int pagecreate; 2818 2819 mutex_enter(&rp->r_statelock); 2820 pagecreate = (pon == 0) && (pn == PAGESIZE || 2821 uiop->uio_loffset + pn >= rp->r_size); 2822 mutex_exit(&rp->r_statelock); 2823 2824 base = segmap_getmapflt(segkmap, vp, off + on, 2825 pn, !pagecreate, S_WRITE); 2826 2827 error = writerp4(rp, base + pon, n, uiop, pagecreate); 2828 2829 } else { 2830 base = segmap_getmapflt(segkmap, vp, off + on, 2831 n, 0, S_READ); 2832 error = writerp4(rp, base + on, n, uiop, 0); 2833 } 2834 2835 if (!error) { 2836 if (mi->mi_flags & MI4_NOAC) 2837 flags = SM_WRITE; 2838 else if ((uiop->uio_loffset % bsize) == 0 || 2839 IS_SWAPVP(vp)) { 2840 /* 2841 * Have written a whole block. 2842 * Start an asynchronous write 2843 * and mark the buffer to 2844 * indicate that it won't be 2845 * needed again soon. 2846 */ 2847 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2848 } else 2849 flags = 0; 2850 if ((ioflag & (FSYNC|FDSYNC)) || 2851 (rp->r_flags & R4OUTOFSPACE)) { 2852 flags &= ~SM_ASYNC; 2853 flags |= SM_WRITE; 2854 } 2855 error = segmap_release(segkmap, base, flags); 2856 } else { 2857 (void) segmap_release(segkmap, base, 0); 2858 /* 2859 * In the event that we got an access error while 2860 * faulting in a page for a write-only file just 2861 * force a write. 
2862 */ 2863 if (error == EACCES) 2864 goto nfs4_fwrite; 2865 } 2866 } while (!error && uiop->uio_resid > 0); 2867 2868 bottom: 2869 if (error) { 2870 uiop->uio_resid = resid + remainder; 2871 uiop->uio_loffset = offset; 2872 } else { 2873 uiop->uio_resid += remainder; 2874 2875 mutex_enter(&rp->r_statev4_lock); 2876 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2877 gethrestime(&rp->r_attr.va_mtime); 2878 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2879 } 2880 mutex_exit(&rp->r_statev4_lock); 2881 } 2882 2883 nfs_rw_exit(&rp->r_lkserlock); 2884 2885 return (error); 2886 } 2887 2888 /* 2889 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2890 */ 2891 static int 2892 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2893 int flags, cred_t *cr) 2894 { 2895 struct buf *bp; 2896 int error; 2897 page_t *savepp; 2898 uchar_t fsdata; 2899 stable_how4 stab_comm; 2900 2901 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 2902 bp = pageio_setup(pp, len, vp, flags); 2903 ASSERT(bp != NULL); 2904 2905 /* 2906 * pageio_setup should have set b_addr to 0. This 2907 * is correct since we want to do I/O on a page 2908 * boundary. bp_mapin will use this addr to calculate 2909 * an offset, and then set b_addr to the kernel virtual 2910 * address it allocated for us. 
2911 */ 2912 ASSERT(bp->b_un.b_addr == 0); 2913 2914 bp->b_edev = 0; 2915 bp->b_dev = 0; 2916 bp->b_lblkno = lbtodb(off); 2917 bp->b_file = vp; 2918 bp->b_offset = (offset_t)off; 2919 bp_mapin(bp); 2920 2921 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 2922 freemem > desfree) 2923 stab_comm = UNSTABLE4; 2924 else 2925 stab_comm = FILE_SYNC4; 2926 2927 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 2928 2929 bp_mapout(bp); 2930 pageio_done(bp); 2931 2932 if (stab_comm == UNSTABLE4) 2933 fsdata = C_DELAYCOMMIT; 2934 else 2935 fsdata = C_NOCOMMIT; 2936 2937 savepp = pp; 2938 do { 2939 pp->p_fsdata = fsdata; 2940 } while ((pp = pp->p_next) != savepp); 2941 2942 return (error); 2943 } 2944 2945 /* 2946 */ 2947 static int 2948 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 2949 { 2950 nfs4_open_owner_t *oop; 2951 nfs4_open_stream_t *osp; 2952 rnode4_t *rp = VTOR4(vp); 2953 mntinfo4_t *mi = VTOMI4(vp); 2954 int reopen_needed; 2955 2956 ASSERT(nfs_zone() == mi->mi_zone); 2957 2958 2959 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 2960 if (!oop) 2961 return (EIO); 2962 2963 /* returns with 'os_sync_lock' held */ 2964 osp = find_open_stream(oop, rp); 2965 if (!osp) { 2966 open_owner_rele(oop); 2967 return (EIO); 2968 } 2969 2970 if (osp->os_failed_reopen) { 2971 mutex_exit(&osp->os_sync_lock); 2972 open_stream_rele(osp, rp); 2973 open_owner_rele(oop); 2974 return (EIO); 2975 } 2976 2977 /* 2978 * Determine whether a reopen is needed. If this 2979 * is a delegation open stream, then the os_delegation bit 2980 * should be set. 
2981 */ 2982 2983 reopen_needed = osp->os_delegation; 2984 2985 mutex_exit(&osp->os_sync_lock); 2986 open_owner_rele(oop); 2987 2988 if (reopen_needed) { 2989 nfs4_error_zinit(ep); 2990 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 2991 mutex_enter(&osp->os_sync_lock); 2992 if (ep->error || ep->stat || osp->os_failed_reopen) { 2993 mutex_exit(&osp->os_sync_lock); 2994 open_stream_rele(osp, rp); 2995 return (EIO); 2996 } 2997 mutex_exit(&osp->os_sync_lock); 2998 } 2999 open_stream_rele(osp, rp); 3000 3001 return (0); 3002 } 3003 3004 /* 3005 * Write to file. Writes to remote server in largest size 3006 * chunks that the server can handle. Write is synchronous. 3007 */ 3008 static int 3009 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3010 stable_how4 *stab_comm) 3011 { 3012 mntinfo4_t *mi; 3013 COMPOUND4args_clnt args; 3014 COMPOUND4res_clnt res; 3015 WRITE4args *wargs; 3016 WRITE4res *wres; 3017 nfs_argop4 argop[2]; 3018 nfs_resop4 *resop; 3019 int tsize; 3020 stable_how4 stable; 3021 rnode4_t *rp; 3022 int doqueue = 1; 3023 bool_t needrecov; 3024 nfs4_recov_state_t recov_state; 3025 nfs4_stateid_types_t sid_types; 3026 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3027 3028 rp = VTOR4(vp); 3029 mi = VTOMI4(vp); 3030 3031 ASSERT(nfs_zone() == mi->mi_zone); 3032 3033 stable = *stab_comm; 3034 *stab_comm = FILE_SYNC4; 3035 3036 needrecov = FALSE; 3037 recov_state.rs_flags = 0; 3038 recov_state.rs_num_retry_despite_err = 0; 3039 nfs4_init_stateid_types(&sid_types); 3040 3041 recov_retry: 3042 args.ctag = TAG_WRITE; 3043 args.array_len = 2; 3044 args.array = argop; 3045 3046 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3047 &recov_state, NULL); 3048 if (e.error) 3049 return (e.error); 3050 3051 /* 0. putfh target fh */ 3052 argop[0].argop = OP_CPUTFH; 3053 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3054 3055 /* 1. 
write */ 3056 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3057 3058 do { 3059 3060 wargs->offset = (offset4)offset; 3061 wargs->data_val = base; 3062 3063 if (mi->mi_io_kstats) { 3064 mutex_enter(&mi->mi_lock); 3065 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3066 mutex_exit(&mi->mi_lock); 3067 } 3068 3069 if ((vp->v_flag & VNOCACHE) || 3070 (rp->r_flags & R4DIRECTIO) || 3071 (mi->mi_flags & MI4_DIRECTIO)) 3072 tsize = MIN(mi->mi_stsize, count); 3073 else 3074 tsize = MIN(mi->mi_curwrite, count); 3075 wargs->data_len = (uint_t)tsize; 3076 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3077 3078 if (mi->mi_io_kstats) { 3079 mutex_enter(&mi->mi_lock); 3080 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3081 mutex_exit(&mi->mi_lock); 3082 } 3083 3084 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3085 if (e.error && !needrecov) { 3086 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3087 &recov_state, needrecov); 3088 return (e.error); 3089 } 3090 3091 3092 /* 3093 * Do handling of OLD_STATEID outside 3094 * of the normal recovery framework. 3095 * 3096 * If write receives a BAD stateid error while using a 3097 * delegation stateid, retry using the open stateid (if it 3098 * exists). If it doesn't have an open stateid, reopen the 3099 * file first, then retry. 
3100 */ 3101 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3102 sid_types.cur_sid_type != SPEC_SID) { 3103 nfs4_save_stateid(&wargs->stateid, &sid_types); 3104 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3105 &recov_state, needrecov); 3106 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3107 goto recov_retry; 3108 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3109 sid_types.cur_sid_type == DEL_SID) { 3110 nfs4_save_stateid(&wargs->stateid, &sid_types); 3111 mutex_enter(&rp->r_statev4_lock); 3112 rp->r_deleg_return_pending = TRUE; 3113 mutex_exit(&rp->r_statev4_lock); 3114 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3115 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3116 &recov_state, needrecov); 3117 (void) xdr_free(xdr_COMPOUND4res_clnt, 3118 (caddr_t)&res); 3119 return (EIO); 3120 } 3121 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3122 &recov_state, needrecov); 3123 /* hold needed for nfs4delegreturn_thread */ 3124 VN_HOLD(vp); 3125 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3126 NFS4_DR_DISCARD), FALSE); 3127 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3128 goto recov_retry; 3129 } 3130 3131 if (needrecov) { 3132 bool_t abort; 3133 3134 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3135 "nfs4write: client got error %d, res.status %d" 3136 ", so start recovery", e.error, res.status)); 3137 3138 abort = nfs4_start_recovery(&e, 3139 VTOMI4(vp), vp, NULL, &wargs->stateid, 3140 NULL, OP_WRITE, NULL); 3141 if (!e.error) { 3142 e.error = geterrno4(res.status); 3143 (void) xdr_free(xdr_COMPOUND4res_clnt, 3144 (caddr_t)&res); 3145 } 3146 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3147 &recov_state, needrecov); 3148 if (abort == FALSE) 3149 goto recov_retry; 3150 return (e.error); 3151 } 3152 3153 if (res.status) { 3154 e.error = geterrno4(res.status); 3155 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3156 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3157 &recov_state, needrecov); 3158 return (e.error); 3159 } 3160 3161 
resop = &res.array[1]; /* write res */ 3162 wres = &resop->nfs_resop4_u.opwrite; 3163 3164 if ((int)wres->count > tsize) { 3165 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3166 3167 zcmn_err(getzoneid(), CE_WARN, 3168 "nfs4write: server wrote %u, requested was %u", 3169 (int)wres->count, tsize); 3170 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3171 &recov_state, needrecov); 3172 return (EIO); 3173 } 3174 if (wres->committed == UNSTABLE4) { 3175 *stab_comm = UNSTABLE4; 3176 if (wargs->stable == DATA_SYNC4 || 3177 wargs->stable == FILE_SYNC4) { 3178 (void) xdr_free(xdr_COMPOUND4res_clnt, 3179 (caddr_t)&res); 3180 zcmn_err(getzoneid(), CE_WARN, 3181 "nfs4write: server %s did not commit " 3182 "to stable storage", 3183 rp->r_server->sv_hostname); 3184 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3185 &recov_state, needrecov); 3186 return (EIO); 3187 } 3188 } 3189 3190 tsize = (int)wres->count; 3191 count -= tsize; 3192 base += tsize; 3193 offset += tsize; 3194 if (mi->mi_io_kstats) { 3195 mutex_enter(&mi->mi_lock); 3196 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3197 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3198 tsize; 3199 mutex_exit(&mi->mi_lock); 3200 } 3201 lwp_stat_update(LWP_STAT_OUBLK, 1); 3202 mutex_enter(&rp->r_statelock); 3203 if (rp->r_flags & R4HAVEVERF) { 3204 if (rp->r_writeverf != wres->writeverf) { 3205 nfs4_set_mod(vp); 3206 rp->r_writeverf = wres->writeverf; 3207 } 3208 } else { 3209 rp->r_writeverf = wres->writeverf; 3210 rp->r_flags |= R4HAVEVERF; 3211 } 3212 PURGE_ATTRCACHE4_LOCKED(rp); 3213 rp->r_flags |= R4WRITEMODIFIED; 3214 gethrestime(&rp->r_attr.va_mtime); 3215 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3216 mutex_exit(&rp->r_statelock); 3217 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3218 } while (count); 3219 3220 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, needrecov); 3221 3222 return (e.error); 3223 } 3224 3225 /* 3226 * Read from a file. Reads data in largest chunks our interface can handle. 
 *
 * Issues { PUTFH; READ } compounds in a loop until `count' bytes have been
 * transferred or the server reports EOF.  Each request asks for at most
 * mi_tsize bytes (VNOCACHE vnode or direct I/O) or mi_curread bytes
 * (otherwise).
 *
 *	vp	- vnode to read from
 *	base	- destination buffer; used only when uiop is NULL
 *	offset	- starting file offset
 *	count	- number of bytes requested
 *	residp	- set to the number of bytes left unread when the loop
 *		  finishes normally
 *	cr	- credentials used for the over-the-wire call
 *	async	- selects the TAG_READAHEAD tag instead of TAG_READ and
 *		  changes the stateid retry policy (see comment in the loop)
 *	uiop	- if non-NULL, receives the data instead of base
 *
 * Returns 0 on success, otherwise an errno value.
 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Every retry path below jumps back here after calling
	 * nfs4_end_fop(), so the start/end calls stay paired and the
	 * stateid is re-selected for the new attempt.
	 */
recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		/* account for the request on the I/O kstat run queue */
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/* pick the per-request transfer size */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);
		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		/*
		 * Clear all result destinations, then point exactly one of
		 * them (the uio or the flat buffer) at the caller's storage.
		 */
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef DEBUG
		/* one-shot override of the reply status, then reset */
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework.  There are two differences
		 * between async and sync reads.  The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			if (sid_types.cur_sid_type == SPEC_SID) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			/* flag the delegation for return before retrying */
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));

			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
stateid. This avoids looping with a broken server. 3399 */ 3400 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3401 sid_types.cur_sid_type == SPEC_SID) 3402 abort = TRUE; 3403 3404 if (abort == FALSE) { 3405 /* 3406 * Need to retry all possible stateids in 3407 * case the recovery error wasn't stateid 3408 * related or the stateids have become 3409 * stale (server reboot). 3410 */ 3411 nfs4_init_stateid_types(&sid_types); 3412 (void) xdr_free(xdr_COMPOUND4res_clnt, 3413 (caddr_t)&res); 3414 goto recov_retry; 3415 } 3416 3417 if (!e.error) { 3418 e.error = geterrno4(res.status); 3419 (void) xdr_free(xdr_COMPOUND4res_clnt, 3420 (caddr_t)&res); 3421 } 3422 return (e.error); 3423 } 3424 3425 if (res.status) { 3426 e.error = geterrno4(res.status); 3427 nfs4_end_fop(mi, vp, NULL, OH_READ, 3428 &recov_state, needrecov); 3429 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3430 return (e.error); 3431 } 3432 3433 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3434 count -= data_len; 3435 if (base) 3436 base += data_len; 3437 offset += data_len; 3438 if (mi->mi_io_kstats) { 3439 mutex_enter(&mi->mi_lock); 3440 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3441 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3442 mutex_exit(&mi->mi_lock); 3443 } 3444 lwp_stat_update(LWP_STAT_INBLK, 1); 3445 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3446 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3447 3448 } while (count && !is_eof); 3449 3450 *residp = count; 3451 3452 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3453 3454 return (e.error); 3455 } 3456 3457 /* ARGSUSED */ 3458 static int 3459 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) 3460 { 3461 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3462 return (EIO); 3463 switch (cmd) { 3464 case _FIODIRECTIO: 3465 return (nfs4_directio(vp, (int)arg, cr)); 3466 default: 3467 return (ENOTTY); 3468 } 3469 } 3470 3471 static int 3472 nfs4_getattr(vnode_t *vp, struct 
vattr *vap, int flags, cred_t *cr) 3473 { 3474 int error; 3475 rnode4_t *rp = VTOR4(vp); 3476 3477 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3478 return (EIO); 3479 /* 3480 * If it has been specified that the return value will 3481 * just be used as a hint, and we are only being asked 3482 * for size, fsid or rdevid, then return the client's 3483 * notion of these values without checking to make sure 3484 * that the attribute cache is up to date. 3485 * The whole point is to avoid an over the wire GETATTR 3486 * call. 3487 */ 3488 if (flags & ATTR_HINT) { 3489 if (vap->va_mask == 3490 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3491 mutex_enter(&rp->r_statelock); 3492 if (vap->va_mask | AT_SIZE) 3493 vap->va_size = rp->r_size; 3494 if (vap->va_mask | AT_FSID) 3495 vap->va_fsid = rp->r_attr.va_fsid; 3496 if (vap->va_mask | AT_RDEV) 3497 vap->va_rdev = rp->r_attr.va_rdev; 3498 mutex_exit(&rp->r_statelock); 3499 return (0); 3500 } 3501 } 3502 3503 /* 3504 * Only need to flush pages if asking for the mtime 3505 * and if there any dirty pages or any outstanding 3506 * asynchronous (write) requests for this file. 
3507 */ 3508 if (vap->va_mask & AT_MTIME) { 3509 rp = VTOR4(vp); 3510 if (nfs4_has_pages(vp)) { 3511 mutex_enter(&rp->r_statev4_lock); 3512 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3513 mutex_exit(&rp->r_statev4_lock); 3514 if (rp->r_flags & R4DIRTY || 3515 rp->r_awcount > 0) { 3516 mutex_enter(&rp->r_statelock); 3517 rp->r_gcount++; 3518 mutex_exit(&rp->r_statelock); 3519 error = 3520 nfs4_putpage(vp, (u_offset_t)0, 3521 0, 0, cr); 3522 mutex_enter(&rp->r_statelock); 3523 if (error && (error == ENOSPC || 3524 error == EDQUOT)) { 3525 if (!rp->r_error) 3526 rp->r_error = error; 3527 } 3528 if (--rp->r_gcount == 0) 3529 cv_broadcast(&rp->r_cv); 3530 mutex_exit(&rp->r_statelock); 3531 } 3532 } else { 3533 mutex_exit(&rp->r_statev4_lock); 3534 } 3535 } 3536 } 3537 return (nfs4getattr(vp, vap, cr)); 3538 } 3539 3540 int 3541 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3542 { 3543 /* 3544 * If these are the only two bits cleared 3545 * on the server then return 0 (OK) else 3546 * return 1 (BAD). 3547 */ 3548 on_client &= ~(S_ISUID|S_ISGID); 3549 if (on_client == from_server) 3550 return (0); 3551 else 3552 return (1); 3553 } 3554 3555 /*ARGSUSED4*/ 3556 static int 3557 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3558 caller_context_t *ct) 3559 { 3560 if (vap->va_mask & AT_NOSET) 3561 return (EINVAL); 3562 3563 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3564 return (EIO); 3565 3566 /* 3567 * Don't call secpolicy_vnode_setattr, the client cannot 3568 * use its cached attributes to make security decisions 3569 * as the server may be faking mode bits or mapping uid/gid. 3570 * Always just let the server to the checking. 3571 * If we provide the ability to remove basic priviledges 3572 * to setattr (e.g. basic without chmod) then we will 3573 * need to add a check here before calling the server. 
	 */

	return (nfs4setattr(vp, vap, flags, cr, NULL));
}

/*
 * To replace the "guarded" version 3 setattr, we use two types of compound
 * setattr requests:
 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
 * 2. If the size is changed, precede Setattr with: Getattr; Verify
 *    with only ctime as the argument. If the server ctime differs from
 *    what is cached on the client, the verify will fail, but we would
 *    already have the ctime from the preceding getattr, so just set it
 *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
 *	Setattr; Getattr }.
 *
 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
 * this setattr and NULL if they are not.
 *
 *	vp	- vnode whose attributes are being set
 *	vap	- attributes to set; va_mask selects which ones
 *	flags	- passed through to nfs4args_setattr()
 *	cr	- credentials used for the over-the-wire call
 *	vsap	- ACL to set, or NULL
 *
 * Returns 0 or an errno value.
 */
static int
nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    vsecattr_t *vsap)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs4_ga_res_t *garp = NULL;
	int numops = 3;			/* { Putfh; Setattr; Getattr } */
	nfs_argop4 argop[5];
	int verify_argop = -1;		/* index of Verify op, -1 if unused */
	int setattr_argop = 1;		/* index of Setattr op */
	nfs_resop4 *resop;
	vattr_t va;
	rnode4_t *rp;
	int doqueue = 1;
	uint_t mask = vap->va_mask;
	mode_t omode;
	vsecattr_t *vsp;
	timestruc_t ctime;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	stateid4 stateid;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	rp = VTOR4(vp);
	nfs4_init_stateid_types(&sid_types);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with R4DIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr);
		/* remember the first ENOSPC/EDQUOT in the rnode */
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	if (mask & AT_SIZE) {
		/*
		 * Verification setattr compound for non-deleg AT_SIZE:
		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
		 * Set ctime local here (outside the do_again label)
		 * so that subsequent retries (after failed VERIFY)
		 * will use ctime from GETATTR results (from failed
		 * verify compound) as VERIFY arg.
		 * If file has delegation, then VERIFY(time_metadata)
		 * is of little added value, so don't bother.
		 */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
		    rp->r_deleg_return_pending) {
			numops = 5;
			ctime = rp->r_attr.va_ctime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	args.ctag = TAG_SETATTR;
	/*
	 * Both labels rebuild the whole compound; every jump back here is
	 * preceded by freeing the previous setattr/verify arguments.
	 */
do_again:
recov_retry:
	/* Setattr is always the second-to-last op in the compound */
	setattr_argop = numops - 2;

	args.array = argop;
	args.array_len = numops;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error)
		return (e.error);


	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	if (numops == 5) {
		/*
		 * We only care about the ctime, but need to get mtime
		 * and size for proper cache update.
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	/* snapshot the server's supported attributes under sv_lock */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	/* keep a copy of the stateid used, for the OLD_STATEID retry below */
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	/* cached mode before the call; used by the setuid/setgid fixup */
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime. In that case, verify the ctime first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime match before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		/* timestamp taken before the call, used for cache updates */
		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				/* detach under the lock, free outside it */
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (! e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if verify failed to see if try again
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			/*
			 * NOTE(review): on this path res was already passed
			 * to xdr_free() in the else-branch just above;
			 * confirm that freeing the same result a second
			 * time is harmless.
			 */
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed,
				 * Check to see that the server just cleared
				 * I_SUID and I_GUID from it. If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			/* the result of this corrective call is ignored */
			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}

/*
 * access entry point.  Translates the VREAD/VWRITE/VEXEC bits in `mode'
 * into ACCESS4_* bits, consults the rnode's access cache via
 * nfs4_access_check(), and on a cache miss sends { Putfh; Access } to the
 * server, with a trailing Getattr when r_deleg_type is OPEN_DELEGATE_NONE.
 * When the caller's credentials are denied and crnetadjust() supplied an
 * alternate credential, the check is repeated once with that credential.
 *
 * Returns 0 if access is allowed, EACCES if denied, EROFS for a write
 * request to a non-device file on a read-only file system, EIO when
 * called from the wrong zone, or another errno value on failure.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* acc: the access bits the caller is actually asking about */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	rp = VTOR4(vp);
	/*
	 * argacc: the bits sent to the server.  The full set for the
	 * vnode type is requested, not just acc, and the whole answer is
	 * cached below.
	 */
	if (vp->v_type == VDIR) {
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	} else {
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	/* consult the cached access results for the current credential */
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/*
	 * Remember whether the RPC itself failed; e.error may be
	 * overwritten below, and `out' frees res only when it did not.
	 */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages.  The operation was already ended
		 * by the nfs4_end_fop() call above, which avoids a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}

static int
nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	READLINK4res *lr_res;
	nfs4_ga_res_t *garp;
	uint_t len;
	char *linkdata;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * Can't readlink anything other than a symbolic link.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	rp = VTOR4(vp);
	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			e.error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (e.error);
		}
		mutex_exit(&rp->r_statelock);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.array_len = 3;
	args.array = argop;
	args.ctag = TAG_READLINK;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error) {
		return (e.error);
	}

	/* 0.
putfh symlink fh */ 4359 argop[0].argop = OP_CPUTFH; 4360 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4361 4362 /* 1. readlink */ 4363 argop[1].argop = OP_READLINK; 4364 4365 /* 2. getattr */ 4366 argop[2].argop = OP_GETATTR; 4367 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4368 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4369 4370 doqueue = 1; 4371 4372 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4373 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4374 rnode4info(VTOR4(vp)))); 4375 4376 t = gethrtime(); 4377 4378 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4379 4380 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4381 if (needrecov) { 4382 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4383 "nfs4_readlink: initiating recovery\n")); 4384 4385 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4386 NULL, OP_READLINK, NULL) == FALSE) { 4387 if (!e.error) 4388 (void) xdr_free(xdr_COMPOUND4res_clnt, 4389 (caddr_t)&res); 4390 4391 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4392 needrecov); 4393 goto recov_retry; 4394 } 4395 } 4396 4397 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4398 4399 if (e.error) 4400 return (e.error); 4401 4402 /* 4403 * There is an path in the code below which calls 4404 * nfs4_purge_stale_fh(), which may generate otw calls through 4405 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4406 * here to avoid nfs4_start_op() deadlock. 
4407 */ 4408 4409 if (res.status && (res.array_len < args.array_len)) { 4410 /* 4411 * either Putfh or Link failed 4412 */ 4413 e.error = geterrno4(res.status); 4414 nfs4_purge_stale_fh(e.error, vp, cr); 4415 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4416 return (e.error); 4417 } 4418 4419 resop = &res.array[1]; /* readlink res */ 4420 lr_res = &resop->nfs_resop4_u.opreadlink; 4421 4422 /* 4423 * treat symlink names as data 4424 */ 4425 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4426 if (linkdata != NULL) { 4427 int uio_len = len - 1; 4428 /* len includes null byte, which we won't uiomove */ 4429 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4430 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4431 mutex_enter(&rp->r_statelock); 4432 if (rp->r_symlink.contents == NULL) { 4433 rp->r_symlink.contents = linkdata; 4434 rp->r_symlink.len = uio_len; 4435 rp->r_symlink.size = len; 4436 mutex_exit(&rp->r_statelock); 4437 } else { 4438 mutex_exit(&rp->r_statelock); 4439 kmem_free(linkdata, len); 4440 } 4441 } else { 4442 kmem_free(linkdata, len); 4443 } 4444 } 4445 if (res.status == NFS4_OK) { 4446 resop++; /* getattr res */ 4447 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4448 } 4449 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4450 4451 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4452 4453 /* 4454 * The over the wire error for attempting to readlink something 4455 * other than a symbolic link is ENXIO. However, we need to 4456 * return EINVAL instead of ENXIO, so we map it here. 4457 */ 4458 return (e.error == ENXIO ? EINVAL : e.error); 4459 } 4460 4461 /* 4462 * Flush local dirty pages to stable storage on the server. 4463 * 4464 * If FNODSYNC is specified, then there is nothing to do because 4465 * metadata changes are not cached on the client before being 4466 * sent to the server. 
4467 */ 4468 static int 4469 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr) 4470 { 4471 int error; 4472 4473 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4474 return (0); 4475 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4476 return (EIO); 4477 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4478 if (!error) 4479 error = VTOR4(vp)->r_error; 4480 return (error); 4481 } 4482 4483 /* 4484 * Weirdness: if the file was removed or the target of a rename 4485 * operation while it was open, it got renamed instead. Here we 4486 * remove the renamed file. 4487 */ 4488 static void 4489 nfs4_inactive(vnode_t *vp, cred_t *cr) 4490 { 4491 rnode4_t *rp; 4492 4493 ASSERT(vp != DNLC_NO_VNODE); 4494 4495 rp = VTOR4(vp); 4496 4497 if (IS_SHADOW(vp, rp)) { 4498 sv_inactive(vp); 4499 return; 4500 } 4501 4502 /* 4503 * If this is coming from the wrong zone, we let someone in the right 4504 * zone take care of it asynchronously. We can get here due to 4505 * VN_RELE() being called from pageout() or fsflush(). This call may 4506 * potentially turn into an expensive no-op if, for instance, v_count 4507 * gets incremented in the meantime, but it's still correct. 4508 */ 4509 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4510 nfs4_async_inactive(vp, cr); 4511 return; 4512 } 4513 4514 /* 4515 * Some of the cleanup steps might require over-the-wire 4516 * operations. Since VOP_INACTIVE can get called as a result of 4517 * other over-the-wire operations (e.g., an attribute cache update 4518 * can lead to a DNLC purge), doing those steps now would lead to a 4519 * nested call to the recovery framework, which can deadlock. So 4520 * do any over-the-wire cleanups asynchronously, in a separate 4521 * thread. 
4522 */ 4523 4524 mutex_enter(&rp->r_os_lock); 4525 mutex_enter(&rp->r_statelock); 4526 mutex_enter(&rp->r_statev4_lock); 4527 4528 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4529 mutex_exit(&rp->r_statev4_lock); 4530 mutex_exit(&rp->r_statelock); 4531 mutex_exit(&rp->r_os_lock); 4532 nfs4_async_inactive(vp, cr); 4533 return; 4534 } 4535 4536 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4537 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4538 mutex_exit(&rp->r_statev4_lock); 4539 mutex_exit(&rp->r_statelock); 4540 mutex_exit(&rp->r_os_lock); 4541 nfs4_async_inactive(vp, cr); 4542 return; 4543 } 4544 4545 if (rp->r_unldvp != NULL) { 4546 mutex_exit(&rp->r_statev4_lock); 4547 mutex_exit(&rp->r_statelock); 4548 mutex_exit(&rp->r_os_lock); 4549 nfs4_async_inactive(vp, cr); 4550 return; 4551 } 4552 mutex_exit(&rp->r_statev4_lock); 4553 mutex_exit(&rp->r_statelock); 4554 mutex_exit(&rp->r_os_lock); 4555 4556 rp4_addfree(rp, cr); 4557 } 4558 4559 /* 4560 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4561 * various bits of state. The caller must not refer to vp after this call. 
4562 */ 4563 4564 void 4565 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4566 { 4567 rnode4_t *rp = VTOR4(vp); 4568 nfs4_recov_state_t recov_state; 4569 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4570 vnode_t *unldvp; 4571 char *unlname; 4572 cred_t *unlcred; 4573 COMPOUND4args_clnt args; 4574 COMPOUND4res_clnt res, *resp; 4575 nfs_argop4 argop[2]; 4576 int doqueue; 4577 #ifdef DEBUG 4578 char *name; 4579 #endif 4580 4581 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4582 ASSERT(!IS_SHADOW(vp, rp)); 4583 4584 #ifdef DEBUG 4585 name = fn_name(VTOSV(vp)->sv_name); 4586 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4587 "release vnode %s", name)); 4588 kmem_free(name, MAXNAMELEN); 4589 #endif 4590 4591 if (vp->v_type == VREG) { 4592 bool_t recov_failed = FALSE; 4593 4594 e.error = nfs4close_all(vp, cr); 4595 if (e.error) { 4596 /* Check to see if recovery failed */ 4597 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4598 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4599 recov_failed = TRUE; 4600 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4601 if (!recov_failed) { 4602 mutex_enter(&rp->r_statelock); 4603 if (rp->r_flags & R4RECOVERR) 4604 recov_failed = TRUE; 4605 mutex_exit(&rp->r_statelock); 4606 } 4607 if (recov_failed) { 4608 NFS4_DEBUG(nfs4_client_recov_debug, 4609 (CE_NOTE, "nfs4_inactive_otw: " 4610 "close failed (recovery failure)")); 4611 } 4612 } 4613 } 4614 4615 redo: 4616 if (rp->r_unldvp == NULL) { 4617 rp4_addfree(rp, cr); 4618 return; 4619 } 4620 4621 /* 4622 * Save the vnode pointer for the directory where the 4623 * unlinked-open file got renamed, then set it to NULL 4624 * to prevent another thread from getting here before 4625 * we're done with the remove. While we have the 4626 * statelock, make local copies of the pertinent rnode 4627 * fields. If we weren't to do this in an atomic way, the 4628 * the unl* fields could become inconsistent with respect 4629 * to each other due to a race condition between this 4630 * code and nfs_remove(). 
See bug report 1034328. 4631 */ 4632 mutex_enter(&rp->r_statelock); 4633 if (rp->r_unldvp == NULL) { 4634 mutex_exit(&rp->r_statelock); 4635 rp4_addfree(rp, cr); 4636 return; 4637 } 4638 4639 unldvp = rp->r_unldvp; 4640 rp->r_unldvp = NULL; 4641 unlname = rp->r_unlname; 4642 rp->r_unlname = NULL; 4643 unlcred = rp->r_unlcred; 4644 rp->r_unlcred = NULL; 4645 mutex_exit(&rp->r_statelock); 4646 4647 /* 4648 * If there are any dirty pages left, then flush 4649 * them. This is unfortunate because they just 4650 * may get thrown away during the remove operation, 4651 * but we have to do this for correctness. 4652 */ 4653 if (nfs4_has_pages(vp) && 4654 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4655 ASSERT(vp->v_type != VCHR); 4656 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr); 4657 if (e.error) { 4658 mutex_enter(&rp->r_statelock); 4659 if (!rp->r_error) 4660 rp->r_error = e.error; 4661 mutex_exit(&rp->r_statelock); 4662 } 4663 } 4664 4665 recov_state.rs_flags = 0; 4666 recov_state.rs_num_retry_despite_err = 0; 4667 recov_retry_remove: 4668 /* 4669 * Do the remove operation on the renamed file 4670 */ 4671 args.ctag = TAG_INACTIVE; 4672 4673 /* 4674 * Remove ops: putfh dir; remove 4675 */ 4676 args.array_len = 2; 4677 args.array = argop; 4678 4679 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4680 if (e.error) { 4681 kmem_free(unlname, MAXNAMELEN); 4682 crfree(unlcred); 4683 VN_RELE(unldvp); 4684 /* 4685 * Try again; this time around r_unldvp will be NULL, so we'll 4686 * just call rp4_addfree() and return. 4687 */ 4688 goto redo; 4689 } 4690 4691 /* putfh directory */ 4692 argop[0].argop = OP_CPUTFH; 4693 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4694 4695 /* remove */ 4696 argop[1].argop = OP_CREMOVE; 4697 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4698 4699 doqueue = 1; 4700 resp = &res; 4701 4702 #if 0 /* notyet */ 4703 /* 4704 * Can't do this yet. 
We may be being called from 4705 * dnlc_purge_XXX while that routine is holding a 4706 * mutex lock to the nc_rele list. The calls to 4707 * nfs3_cache_wcc_data may result in calls to 4708 * dnlc_purge_XXX. This will result in a deadlock. 4709 */ 4710 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4711 if (e.error) { 4712 PURGE_ATTRCACHE4(unldvp); 4713 resp = NULL; 4714 } else if (res.status) { 4715 e.error = geterrno4(res.status); 4716 PURGE_ATTRCACHE4(unldvp); 4717 /* 4718 * This code is inactive right now 4719 * but if made active there should 4720 * be a nfs4_end_op() call before 4721 * nfs4_purge_stale_fh to avoid start_op() 4722 * deadlock. See BugId: 4948726 4723 */ 4724 nfs4_purge_stale_fh(error, unldvp, cr); 4725 } else { 4726 nfs_resop4 *resop; 4727 REMOVE4res *rm_res; 4728 4729 resop = &res.array[1]; 4730 rm_res = &resop->nfs_resop4_u.opremove; 4731 /* 4732 * Update directory cache attribute, 4733 * readdir and dnlc caches. 4734 */ 4735 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4736 } 4737 #else 4738 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4739 4740 PURGE_ATTRCACHE4(unldvp); 4741 #endif 4742 4743 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4744 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4745 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4746 if (!e.error) 4747 (void) xdr_free(xdr_COMPOUND4res_clnt, 4748 (caddr_t)&res); 4749 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4750 &recov_state, TRUE); 4751 goto recov_retry_remove; 4752 } 4753 } 4754 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4755 4756 /* 4757 * Release stuff held for the remove 4758 */ 4759 VN_RELE(unldvp); 4760 if (!e.error && resp) 4761 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4762 4763 kmem_free(unlname, MAXNAMELEN); 4764 crfree(unlcred); 4765 goto redo; 4766 } 4767 4768 /* 4769 * Remote file system operations having to do with directory manipulation. 
4770 */ 4771 /* ARGSUSED3 */ 4772 static int 4773 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4774 int flags, vnode_t *rdir, cred_t *cr) 4775 { 4776 int error; 4777 vnode_t *vp, *avp = NULL; 4778 rnode4_t *drp; 4779 4780 *vpp = NULL; 4781 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4782 return (EPERM); 4783 /* 4784 * if LOOKUP_XATTR, must replace dvp (object) with 4785 * object's attrdir before continuing with lookup 4786 */ 4787 if (flags & LOOKUP_XATTR) { 4788 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4789 if (error) 4790 return (error); 4791 4792 dvp = avp; 4793 4794 /* 4795 * If lookup is for "", just return dvp now. The attrdir 4796 * has already been activated (from nfs4lookup_xattr), and 4797 * the caller will RELE the original dvp -- not 4798 * the attrdir. So, set vpp and return. 4799 * Currently, when the LOOKUP_XATTR flag is 4800 * passed to VOP_LOOKUP, the name is always empty, and 4801 * shortcircuiting here avoids 3 unneeded lock/unlock 4802 * pairs. 4803 * 4804 * If a non-empty name was provided, then it is the 4805 * attribute name, and it will be looked up below. 4806 */ 4807 if (*nm == '\0') { 4808 *vpp = dvp; 4809 return (0); 4810 } 4811 4812 /* 4813 * The vfs layer never sends a name when asking for the 4814 * attrdir, so we should never get here (unless of course 4815 * name is passed at some time in future -- at which time 4816 * we'll blow up here). 4817 */ 4818 ASSERT(0); 4819 } 4820 4821 drp = VTOR4(dvp); 4822 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4823 return (EINTR); 4824 4825 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4826 nfs_rw_exit(&drp->r_rwlock); 4827 4828 /* 4829 * If vnode is a device, create special vnode. 
4830 */ 4831 if (!error && ISVDEV((*vpp)->v_type)) { 4832 vp = *vpp; 4833 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4834 VN_RELE(vp); 4835 } 4836 4837 return (error); 4838 } 4839 4840 /* ARGSUSED */ 4841 static int 4842 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 4843 { 4844 int error; 4845 rnode4_t *drp; 4846 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 4847 mntinfo4_t *mi; 4848 4849 mi = VTOMI4(dvp); 4850 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR)) 4851 return (EINVAL); 4852 4853 drp = VTOR4(dvp); 4854 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4855 return (EINTR); 4856 4857 mutex_enter(&drp->r_statelock); 4858 /* 4859 * If the server doesn't support xattrs just return EINVAL 4860 */ 4861 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 4862 mutex_exit(&drp->r_statelock); 4863 nfs_rw_exit(&drp->r_rwlock); 4864 return (EINVAL); 4865 } 4866 4867 /* 4868 * If there is a cached xattr directory entry, 4869 * use it as long as the attributes are valid. If the 4870 * attributes are not valid, take the simple approach and 4871 * free the cached value and re-fetch a new value. 4872 * 4873 * We don't negative entry cache for now, if we did we 4874 * would need to check if the file has changed on every 4875 * lookup. But xattrs don't exist very often and failing 4876 * an openattr is not much more expensive than and NVERIFY or GETATTR 4877 * so do an openattr over the wire for now. 
4878 */ 4879 if (drp->r_xattr_dir != NULL) { 4880 if (ATTRCACHE4_VALID(dvp)) { 4881 VN_HOLD(drp->r_xattr_dir); 4882 *vpp = drp->r_xattr_dir; 4883 mutex_exit(&drp->r_statelock); 4884 nfs_rw_exit(&drp->r_rwlock); 4885 return (0); 4886 } 4887 VN_RELE(drp->r_xattr_dir); 4888 drp->r_xattr_dir = NULL; 4889 } 4890 mutex_exit(&drp->r_statelock); 4891 4892 error = nfs4openattr(dvp, vpp, cflag, cr); 4893 4894 nfs_rw_exit(&drp->r_rwlock); 4895 4896 return (error); 4897 } 4898 4899 static int 4900 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 4901 { 4902 int error; 4903 rnode4_t *drp; 4904 4905 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 4906 4907 /* 4908 * If lookup is for "", just return dvp. Don't need 4909 * to send it over the wire, look it up in the dnlc, 4910 * or perform any access checks. 4911 */ 4912 if (*nm == '\0') { 4913 VN_HOLD(dvp); 4914 *vpp = dvp; 4915 return (0); 4916 } 4917 4918 /* 4919 * Can't do lookups in non-directories. 4920 */ 4921 if (dvp->v_type != VDIR) 4922 return (ENOTDIR); 4923 4924 /* 4925 * If lookup is for ".", just return dvp. Don't need 4926 * to send it over the wire or look it up in the dnlc, 4927 * just need to check access. 4928 */ 4929 if (nm[0] == '.' && nm[1] == '\0') { 4930 error = nfs4_access(dvp, VEXEC, 0, cr); 4931 if (error) 4932 return (error); 4933 VN_HOLD(dvp); 4934 *vpp = dvp; 4935 return (0); 4936 } 4937 4938 drp = VTOR4(dvp); 4939 if (!(drp->r_flags & R4LOOKUP)) { 4940 mutex_enter(&drp->r_statelock); 4941 drp->r_flags |= R4LOOKUP; 4942 mutex_exit(&drp->r_statelock); 4943 } 4944 4945 *vpp = NULL; 4946 /* 4947 * Lookup this name in the DNLC. If there is no entry 4948 * lookup over the wire. 4949 */ 4950 if (!skipdnlc) 4951 *vpp = dnlc_lookup(dvp, nm); 4952 if (*vpp == NULL) { 4953 /* 4954 * We need to go over the wire to lookup the name. 
4955 */ 4956 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 4957 } 4958 4959 /* 4960 * We hit on the dnlc 4961 */ 4962 if (*vpp != DNLC_NO_VNODE || 4963 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 4964 /* 4965 * But our attrs may not be valid. 4966 */ 4967 if (ATTRCACHE4_VALID(dvp)) { 4968 error = nfs4_waitfor_purge_complete(dvp); 4969 if (error) { 4970 VN_RELE(*vpp); 4971 *vpp = NULL; 4972 return (error); 4973 } 4974 4975 /* 4976 * If after the purge completes, check to make sure 4977 * our attrs are still valid. 4978 */ 4979 if (ATTRCACHE4_VALID(dvp)) { 4980 /* 4981 * If we waited for a purge we may have 4982 * lost our vnode so look it up again. 4983 */ 4984 VN_RELE(*vpp); 4985 *vpp = dnlc_lookup(dvp, nm); 4986 if (*vpp == NULL) 4987 return (nfs4lookupnew_otw(dvp, 4988 nm, vpp, cr)); 4989 4990 /* 4991 * The access cache should almost always hit 4992 */ 4993 error = nfs4_access(dvp, VEXEC, 0, cr); 4994 4995 if (error) { 4996 VN_RELE(*vpp); 4997 *vpp = NULL; 4998 return (error); 4999 } 5000 if (*vpp == DNLC_NO_VNODE) { 5001 VN_RELE(*vpp); 5002 *vpp = NULL; 5003 return (ENOENT); 5004 } 5005 return (0); 5006 } 5007 } 5008 } 5009 5010 ASSERT(*vpp != NULL); 5011 5012 /* 5013 * We may have gotten here we have one of the following cases: 5014 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5015 * need to validate them. 5016 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5017 * must validate. 5018 * 5019 * Go to the server and check if the directory has changed, if 5020 * it hasn't we are done and can use the dnlc entry. 5021 */ 5022 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5023 } 5024 5025 /* 5026 * Go to the server and check if the directory has changed, if 5027 * it hasn't we are done and can use the dnlc entry. If it 5028 * has changed we get a new copy of its attributes and check 5029 * the access for VEXEC, then relookup the filename and 5030 * get its filehandle and attributes. 
5031 * 5032 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5033 * if the NVERIFY failed we must 5034 * purge the caches 5035 * cache new attributes (will set r_time_attr_inval) 5036 * cache new access 5037 * recheck VEXEC access 5038 * add name to dnlc, possibly negative 5039 * if LOOKUP succeeded 5040 * cache new attributes 5041 * else 5042 * set a new r_time_attr_inval for dvp 5043 * check to make sure we have access 5044 * 5045 * The vpp returned is the vnode passed in if the directory is valid, 5046 * a new vnode if successful lookup, or NULL on error. 5047 */ 5048 static int 5049 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5050 { 5051 COMPOUND4args_clnt args; 5052 COMPOUND4res_clnt res; 5053 fattr4 *ver_fattr; 5054 fattr4_change dchange; 5055 int32_t *ptr; 5056 int argoplist_size = 7 * sizeof (nfs_argop4); 5057 nfs_argop4 *argop; 5058 int doqueue; 5059 mntinfo4_t *mi; 5060 nfs4_recov_state_t recov_state; 5061 hrtime_t t; 5062 int isdotdot; 5063 vnode_t *nvp; 5064 nfs_fh4 *fhp; 5065 nfs4_sharedfh_t *sfhp; 5066 nfs4_access_type_t cacc; 5067 rnode4_t *nrp; 5068 rnode4_t *drp = VTOR4(dvp); 5069 nfs4_ga_res_t *garp = NULL; 5070 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5071 5072 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5073 ASSERT(nm != NULL); 5074 ASSERT(nm[0] != '\0'); 5075 ASSERT(dvp->v_type == VDIR); 5076 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5077 ASSERT(*vpp != NULL); 5078 5079 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5080 isdotdot = 1; 5081 args.ctag = TAG_LOOKUP_VPARENT; 5082 } else { 5083 /* 5084 * Do not allow crossing of server mount points. The 5085 * only visible entries in a SRVSTUB dir are . and .. 5086 * This code handles the non-.. case. We can't even get 5087 * this far if looking up ".". 
5088 */ 5089 if (VTOR4(dvp)->r_flags & R4SRVSTUB) { 5090 VN_RELE(*vpp); 5091 *vpp = NULL; 5092 return (ENOENT); 5093 } 5094 isdotdot = 0; 5095 args.ctag = TAG_LOOKUP_VALID; 5096 } 5097 5098 mi = VTOMI4(dvp); 5099 recov_state.rs_flags = 0; 5100 recov_state.rs_num_retry_despite_err = 0; 5101 5102 nvp = NULL; 5103 5104 /* Save the original mount point security information */ 5105 (void) save_mnt_secinfo(mi->mi_curr_serv); 5106 5107 recov_retry: 5108 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5109 &recov_state, NULL); 5110 if (e.error) { 5111 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5112 VN_RELE(*vpp); 5113 *vpp = NULL; 5114 return (e.error); 5115 } 5116 5117 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5118 5119 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5120 args.array_len = 7; 5121 args.array = argop; 5122 5123 /* 0. putfh file */ 5124 argop[0].argop = OP_CPUTFH; 5125 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5126 5127 /* 1. nverify the change info */ 5128 argop[1].argop = OP_NVERIFY; 5129 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5130 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5131 ver_fattr->attrlist4 = (char *)&dchange; 5132 ptr = (int32_t *)&dchange; 5133 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5134 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5135 5136 /* 2. getattr directory */ 5137 argop[2].argop = OP_GETATTR; 5138 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5139 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5140 5141 /* 3. access directory */ 5142 argop[3].argop = OP_ACCESS; 5143 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5144 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5145 5146 /* 4. lookup name */ 5147 if (isdotdot) { 5148 argop[4].argop = OP_LOOKUPP; 5149 } else { 5150 argop[4].argop = OP_CLOOKUP; 5151 argop[4].nfs_argop4_u.opclookup.cname = nm; 5152 } 5153 5154 /* 5. 
resulting file handle */ 5155 argop[5].argop = OP_GETFH; 5156 5157 /* 6. resulting file attributes */ 5158 argop[6].argop = OP_GETATTR; 5159 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5160 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5161 5162 doqueue = 1; 5163 t = gethrtime(); 5164 5165 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5166 5167 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5168 /* 5169 * For WRONGSEC of a non-dotdot case, send secinfo directly 5170 * from this thread, do not go thru the recovery thread since 5171 * we need the nm information. 5172 * 5173 * Not doing dotdot case because there is no specification 5174 * for (PUTFH, SECINFO "..") yet. 5175 */ 5176 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5177 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5178 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5179 &recov_state, FALSE); 5180 } else { 5181 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5182 &recov_state, TRUE); 5183 } 5184 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5185 kmem_free(argop, argoplist_size); 5186 if (!e.error) 5187 goto recov_retry; 5188 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5189 VN_RELE(*vpp); 5190 *vpp = NULL; 5191 return (e.error); 5192 } 5193 5194 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5195 OP_LOOKUP, NULL) == FALSE) { 5196 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5197 &recov_state, TRUE); 5198 5199 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5200 kmem_free(argop, argoplist_size); 5201 goto recov_retry; 5202 } 5203 } 5204 5205 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5206 5207 if (e.error || res.array_len == 0) { 5208 /* 5209 * If e.error isn't set, then reply has no ops (or we couldn't 5210 * be here). The only legal way to reply without an op array 5211 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5212 * be in the reply for all other status values. 
5213 * 5214 * For valid replies without an ops array, return ENOTSUP 5215 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5216 * return EIO -- don't trust status. 5217 */ 5218 if (e.error == 0) 5219 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5220 ENOTSUP : EIO; 5221 VN_RELE(*vpp); 5222 *vpp = NULL; 5223 kmem_free(argop, argoplist_size); 5224 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5225 return (e.error); 5226 } 5227 5228 if (res.status != NFS4ERR_SAME) { 5229 e.error = geterrno4(res.status); 5230 5231 /* 5232 * The NVERIFY "failed" so the directory has changed 5233 * First make sure PUTFH succeeded and NVERIFY "failed" 5234 * cleanly. 5235 */ 5236 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5237 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5238 nfs4_purge_stale_fh(e.error, dvp, cr); 5239 VN_RELE(*vpp); 5240 *vpp = NULL; 5241 goto exit; 5242 } 5243 5244 /* 5245 * We know the NVERIFY "failed" so we must: 5246 * purge the caches (access and indirectly dnlc if needed) 5247 */ 5248 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5249 5250 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5251 nfs4_purge_stale_fh(e.error, dvp, cr); 5252 VN_RELE(*vpp); 5253 *vpp = NULL; 5254 goto exit; 5255 } 5256 5257 /* 5258 * Install new cached attributes for the directory 5259 */ 5260 nfs4_attr_cache(dvp, 5261 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5262 t, cr, FALSE, NULL); 5263 5264 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5265 nfs4_purge_stale_fh(e.error, dvp, cr); 5266 VN_RELE(*vpp); 5267 *vpp = NULL; 5268 e.error = geterrno4(res.status); 5269 goto exit; 5270 } 5271 5272 /* 5273 * Now we know the directory is valid, 5274 * cache new directory access 5275 */ 5276 nfs4_access_cache(drp, 5277 args.array[3].nfs_argop4_u.opaccess.access, 5278 res.array[3].nfs_resop4_u.opaccess.access, cr); 5279 5280 /* 5281 * recheck VEXEC access 5282 */ 5283 cacc = nfs4_access_check(drp, 
ACCESS4_LOOKUP, cr); 5284 if (cacc != NFS4_ACCESS_ALLOWED) { 5285 /* 5286 * Directory permissions might have been revoked 5287 */ 5288 if (cacc == NFS4_ACCESS_DENIED) { 5289 e.error = EACCES; 5290 VN_RELE(*vpp); 5291 *vpp = NULL; 5292 goto exit; 5293 } 5294 5295 /* 5296 * Somehow we must not have asked for enough 5297 * so try a singleton ACCESS, should never happen. 5298 */ 5299 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5300 if (e.error) { 5301 VN_RELE(*vpp); 5302 *vpp = NULL; 5303 goto exit; 5304 } 5305 } 5306 5307 e.error = geterrno4(res.status); 5308 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5309 /* 5310 * The lookup failed, probably no entry 5311 */ 5312 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5313 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5314 } else { 5315 /* 5316 * Might be some other error, so remove 5317 * the dnlc entry to make sure we start all 5318 * over again, next time. 5319 */ 5320 dnlc_remove(dvp, nm); 5321 } 5322 VN_RELE(*vpp); 5323 *vpp = NULL; 5324 goto exit; 5325 } 5326 5327 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5328 /* 5329 * The file exists but we can't get its fh for 5330 * some unknown reason. Remove it from the dnlc 5331 * and error out to be safe. 5332 */ 5333 dnlc_remove(dvp, nm); 5334 VN_RELE(*vpp); 5335 *vpp = NULL; 5336 goto exit; 5337 } 5338 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5339 if (fhp->nfs_fh4_len == 0) { 5340 /* 5341 * The file exists but a bogus fh 5342 * some unknown reason. Remove it from the dnlc 5343 * and error out to be safe. 
5344 */ 5345 e.error = ENOENT; 5346 dnlc_remove(dvp, nm); 5347 VN_RELE(*vpp); 5348 *vpp = NULL; 5349 goto exit; 5350 } 5351 sfhp = sfh4_get(fhp, mi); 5352 5353 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5354 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5355 5356 /* 5357 * Make the new rnode 5358 */ 5359 if (isdotdot) { 5360 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5361 if (e.error) { 5362 sfh4_rele(&sfhp); 5363 VN_RELE(*vpp); 5364 *vpp = NULL; 5365 goto exit; 5366 } 5367 /* 5368 * XXX if nfs4_make_dotdot uses an existing rnode 5369 * XXX it doesn't update the attributes. 5370 * XXX for now just save them again to save an OTW 5371 */ 5372 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5373 } else { 5374 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5375 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5376 /* 5377 * If v_type == VNON, then garp was NULL because 5378 * the last op in the compound failed and makenfs4node 5379 * could not find the vnode for sfhp. It created 5380 * a new vnode, so we have nothing to purge here. 5381 */ 5382 if (nvp->v_type == VNON) { 5383 vattr_t vattr; 5384 5385 vattr.va_mask = AT_TYPE; 5386 /* 5387 * N.B. We've already called nfs4_end_fop above. 
5388 */ 5389 e.error = nfs4getattr(nvp, &vattr, cr); 5390 if (e.error) { 5391 sfh4_rele(&sfhp); 5392 VN_RELE(*vpp); 5393 *vpp = NULL; 5394 VN_RELE(nvp); 5395 goto exit; 5396 } 5397 nvp->v_type = vattr.va_type; 5398 } 5399 } 5400 sfh4_rele(&sfhp); 5401 5402 nrp = VTOR4(nvp); 5403 mutex_enter(&nrp->r_statev4_lock); 5404 if (!nrp->created_v4) { 5405 mutex_exit(&nrp->r_statev4_lock); 5406 dnlc_update(dvp, nm, nvp); 5407 } else 5408 mutex_exit(&nrp->r_statev4_lock); 5409 5410 VN_RELE(*vpp); 5411 *vpp = nvp; 5412 } else { 5413 hrtime_t now; 5414 hrtime_t delta = 0; 5415 5416 e.error = 0; 5417 5418 /* 5419 * Because the NVERIFY "succeeded" we know that the 5420 * directory attributes are still valid 5421 * so update r_time_attr_inval 5422 */ 5423 now = gethrtime(); 5424 mutex_enter(&drp->r_statelock); 5425 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5426 delta = now - drp->r_time_attr_saved; 5427 if (delta < mi->mi_acdirmin) 5428 delta = mi->mi_acdirmin; 5429 else if (delta > mi->mi_acdirmax) 5430 delta = mi->mi_acdirmax; 5431 } 5432 drp->r_time_attr_inval = now + delta; 5433 mutex_exit(&drp->r_statelock); 5434 dnlc_update(dvp, nm, *vpp); 5435 5436 /* 5437 * Even though we have a valid directory attr cache 5438 * and dnlc entry, we may not have access. 5439 * This should almost always hit the cache. 
5440 */ 5441 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5442 if (e.error) { 5443 VN_RELE(*vpp); 5444 *vpp = NULL; 5445 } 5446 5447 if (*vpp == DNLC_NO_VNODE) { 5448 VN_RELE(*vpp); 5449 *vpp = NULL; 5450 e.error = ENOENT; 5451 } 5452 } 5453 5454 exit: 5455 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5456 kmem_free(argop, argoplist_size); 5457 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5458 return (e.error); 5459 } 5460 5461 /* 5462 * We need to go over the wire to lookup the name, but 5463 * while we are there verify the directory has not 5464 * changed but if it has, get new attributes and check access 5465 * 5466 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5467 * NVERIFY GETATTR ACCESS 5468 * 5469 * With the results: 5470 * if the NVERIFY failed we must purge the caches, add new attributes, 5471 * and cache new access. 5472 * set a new r_time_attr_inval 5473 * add name to dnlc, possibly negative 5474 * if LOOKUP succeeded 5475 * cache new attributes 5476 */ 5477 static int 5478 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5479 { 5480 COMPOUND4args_clnt args; 5481 COMPOUND4res_clnt res; 5482 fattr4 *ver_fattr; 5483 fattr4_change dchange; 5484 int32_t *ptr; 5485 nfs4_ga_res_t *garp = NULL; 5486 int argoplist_size = 9 * sizeof (nfs_argop4); 5487 nfs_argop4 *argop; 5488 int doqueue; 5489 mntinfo4_t *mi; 5490 nfs4_recov_state_t recov_state; 5491 hrtime_t t; 5492 int isdotdot; 5493 vnode_t *nvp; 5494 nfs_fh4 *fhp; 5495 nfs4_sharedfh_t *sfhp; 5496 nfs4_access_type_t cacc; 5497 rnode4_t *nrp; 5498 rnode4_t *drp = VTOR4(dvp); 5499 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5500 5501 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5502 ASSERT(nm != NULL); 5503 ASSERT(nm[0] != '\0'); 5504 ASSERT(dvp->v_type == VDIR); 5505 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5506 ASSERT(*vpp == NULL); 5507 5508 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5509 isdotdot = 1; 5510 args.ctag = TAG_LOOKUP_PARENT; 5511 } else { 5512 /* 5513 * Do not allow crossing of server mount points. The 5514 * only visible entries in a SRVSTUB dir are . and .. 5515 * This code handles the non-.. case. We can't even get 5516 * this far if looking up ".". 5517 */ 5518 if (VTOR4(dvp)->r_flags & R4SRVSTUB) 5519 return (ENOENT); 5520 5521 isdotdot = 0; 5522 args.ctag = TAG_LOOKUP; 5523 } 5524 5525 mi = VTOMI4(dvp); 5526 recov_state.rs_flags = 0; 5527 recov_state.rs_num_retry_despite_err = 0; 5528 5529 nvp = NULL; 5530 5531 /* Save the original mount point security information */ 5532 (void) save_mnt_secinfo(mi->mi_curr_serv); 5533 5534 recov_retry: 5535 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5536 &recov_state, NULL); 5537 if (e.error) { 5538 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5539 return (e.error); 5540 } 5541 5542 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5543 5544 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5545 args.array_len = 9; 5546 args.array = argop; 5547 5548 /* 0. putfh file */ 5549 argop[0].argop = OP_CPUTFH; 5550 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5551 5552 /* 1. savefh for the nverify */ 5553 argop[1].argop = OP_SAVEFH; 5554 5555 /* 2. lookup name */ 5556 if (isdotdot) { 5557 argop[2].argop = OP_LOOKUPP; 5558 } else { 5559 argop[2].argop = OP_CLOOKUP; 5560 argop[2].nfs_argop4_u.opclookup.cname = nm; 5561 } 5562 5563 /* 3. resulting file handle */ 5564 argop[3].argop = OP_GETFH; 5565 5566 /* 4. resulting file attributes */ 5567 argop[4].argop = OP_GETATTR; 5568 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5569 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5570 5571 /* 5. restorefh back the directory for the nverify */ 5572 argop[5].argop = OP_RESTOREFH; 5573 5574 /* 6. 
nverify the change info */ 5575 argop[6].argop = OP_NVERIFY; 5576 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5577 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5578 ver_fattr->attrlist4 = (char *)&dchange; 5579 ptr = (int32_t *)&dchange; 5580 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5581 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5582 5583 /* 7. getattr directory */ 5584 argop[7].argop = OP_GETATTR; 5585 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5586 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5587 5588 /* 8. access directory */ 5589 argop[8].argop = OP_ACCESS; 5590 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5591 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5592 5593 doqueue = 1; 5594 t = gethrtime(); 5595 5596 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5597 5598 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5599 /* 5600 * For WRONGSEC of a non-dotdot case, send secinfo directly 5601 * from this thread, do not go thru the recovery thread since 5602 * we need the nm information. 5603 * 5604 * Not doing dotdot case because there is no specification 5605 * for (PUTFH, SECINFO "..") yet. 
5606 */ 5607 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5608 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) { 5609 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5610 &recov_state, FALSE); 5611 } else { 5612 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5613 &recov_state, TRUE); 5614 } 5615 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5616 kmem_free(argop, argoplist_size); 5617 if (!e.error) 5618 goto recov_retry; 5619 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5620 return (e.error); 5621 } 5622 5623 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5624 OP_LOOKUP, NULL) == FALSE) { 5625 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5626 &recov_state, TRUE); 5627 5628 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5629 kmem_free(argop, argoplist_size); 5630 goto recov_retry; 5631 } 5632 } 5633 5634 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5635 5636 if (e.error || res.array_len == 0) { 5637 /* 5638 * If e.error isn't set, then reply has no ops (or we couldn't 5639 * be here). The only legal way to reply without an op array 5640 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5641 * be in the reply for all other status values. 5642 * 5643 * For valid replies without an ops array, return ENOTSUP 5644 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5645 * return EIO -- don't trust status. 5646 */ 5647 if (e.error == 0) 5648 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5649 ENOTSUP : EIO; 5650 5651 kmem_free(argop, argoplist_size); 5652 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5653 return (e.error); 5654 } 5655 5656 e.error = geterrno4(res.status); 5657 5658 /* 5659 * The PUTFH and SAVEFH may have failed. 
5660 */ 5661 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5662 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5663 nfs4_purge_stale_fh(e.error, dvp, cr); 5664 goto exit; 5665 } 5666 5667 /* 5668 * Check if the file exists, if it does delay entering 5669 * into the dnlc until after we update the directory 5670 * attributes so we don't cause it to get purged immediately. 5671 */ 5672 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5673 /* 5674 * The lookup failed, probably no entry 5675 */ 5676 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5677 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5678 } 5679 goto exit; 5680 } 5681 5682 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5683 /* 5684 * The file exists but we can't get its fh for 5685 * some unknown reason. Error out to be safe. 5686 */ 5687 goto exit; 5688 } 5689 5690 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5691 if (fhp->nfs_fh4_len == 0) { 5692 /* 5693 * The file exists but a bogus fh 5694 * some unknown reason. Error out to be safe. 5695 */ 5696 e.error = EIO; 5697 goto exit; 5698 } 5699 sfhp = sfh4_get(fhp, mi); 5700 5701 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5702 sfh4_rele(&sfhp); 5703 e.error = EIO; 5704 goto exit; 5705 } 5706 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5707 5708 /* 5709 * The RESTOREFH may have failed 5710 */ 5711 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5712 sfh4_rele(&sfhp); 5713 e.error = EIO; 5714 goto exit; 5715 } 5716 5717 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5718 /* 5719 * First make sure the NVERIFY failed as we expected, 5720 * if it didn't then be conservative and error out 5721 * as we can't trust the directory. 
5722 */ 5723 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5724 sfh4_rele(&sfhp); 5725 e.error = EIO; 5726 goto exit; 5727 } 5728 5729 /* 5730 * We know the NVERIFY "failed" so the directory has changed, 5731 * so we must: 5732 * purge the caches (access and indirectly dnlc if needed) 5733 */ 5734 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5735 5736 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5737 sfh4_rele(&sfhp); 5738 goto exit; 5739 } 5740 nfs4_attr_cache(dvp, 5741 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5742 t, cr, FALSE, NULL); 5743 5744 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5745 nfs4_purge_stale_fh(e.error, dvp, cr); 5746 sfh4_rele(&sfhp); 5747 e.error = geterrno4(res.status); 5748 goto exit; 5749 } 5750 5751 /* 5752 * Now we know the directory is valid, 5753 * cache new directory access 5754 */ 5755 nfs4_access_cache(drp, 5756 args.array[8].nfs_argop4_u.opaccess.access, 5757 res.array[8].nfs_resop4_u.opaccess.access, cr); 5758 5759 /* 5760 * recheck VEXEC access 5761 */ 5762 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5763 if (cacc != NFS4_ACCESS_ALLOWED) { 5764 /* 5765 * Directory permissions might have been revoked 5766 */ 5767 if (cacc == NFS4_ACCESS_DENIED) { 5768 sfh4_rele(&sfhp); 5769 e.error = EACCES; 5770 goto exit; 5771 } 5772 5773 /* 5774 * Somehow we must not have asked for enough 5775 * so try a singleton ACCESS should never happen 5776 */ 5777 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5778 if (e.error) { 5779 sfh4_rele(&sfhp); 5780 goto exit; 5781 } 5782 } 5783 5784 e.error = geterrno4(res.status); 5785 } else { 5786 hrtime_t now; 5787 hrtime_t delta = 0; 5788 5789 e.error = 0; 5790 5791 /* 5792 * Because the NVERIFY "succeeded" we know that the 5793 * directory attributes are still valid 5794 * so update r_time_attr_inval 5795 */ 5796 now = gethrtime(); 5797 mutex_enter(&drp->r_statelock); 5798 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5799 delta = 
now - drp->r_time_attr_saved; 5800 if (delta < mi->mi_acdirmin) 5801 delta = mi->mi_acdirmin; 5802 else if (delta > mi->mi_acdirmax) 5803 delta = mi->mi_acdirmax; 5804 } 5805 drp->r_time_attr_inval = now + delta; 5806 mutex_exit(&drp->r_statelock); 5807 5808 /* 5809 * Even though we have a valid directory attr cache, 5810 * we may not have access. 5811 * This should almost always hit the cache. 5812 */ 5813 e.error = nfs4_access(dvp, VEXEC, 0, cr); 5814 if (e.error) { 5815 sfh4_rele(&sfhp); 5816 goto exit; 5817 } 5818 } 5819 5820 /* 5821 * Now we have successfully completed the lookup, if the 5822 * directory has changed we now have the valid attributes. 5823 * We also know we have directory access. 5824 * Create the new rnode and insert it in the dnlc. 5825 */ 5826 if (isdotdot) { 5827 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5828 if (e.error) { 5829 sfh4_rele(&sfhp); 5830 goto exit; 5831 } 5832 /* 5833 * XXX if nfs4_make_dotdot uses an existing rnode 5834 * XXX it doesn't update the attributes. 
5835 * XXX for now just save them again to save an OTW 5836 */ 5837 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5838 } else { 5839 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5840 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 5841 } 5842 sfh4_rele(&sfhp); 5843 5844 nrp = VTOR4(nvp); 5845 mutex_enter(&nrp->r_statev4_lock); 5846 if (!nrp->created_v4) { 5847 mutex_exit(&nrp->r_statev4_lock); 5848 dnlc_update(dvp, nm, nvp); 5849 } else 5850 mutex_exit(&nrp->r_statev4_lock); 5851 5852 *vpp = nvp; 5853 5854 exit: 5855 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5856 kmem_free(argop, argoplist_size); 5857 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5858 return (e.error); 5859 } 5860 5861 #ifdef DEBUG 5862 void 5863 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 5864 { 5865 uint_t i, len; 5866 zoneid_t zoneid = getzoneid(); 5867 char *s; 5868 5869 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 5870 for (i = 0; i < argcnt; i++) { 5871 nfs_argop4 *op = &argbase[i]; 5872 switch (op->argop) { 5873 case OP_CPUTFH: 5874 case OP_PUTFH: 5875 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 5876 break; 5877 case OP_PUTROOTFH: 5878 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 5879 break; 5880 case OP_CLOOKUP: 5881 s = op->nfs_argop4_u.opclookup.cname; 5882 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5883 break; 5884 case OP_LOOKUP: 5885 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 5886 &len, NULL); 5887 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 5888 kmem_free(s, len); 5889 break; 5890 case OP_LOOKUPP: 5891 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 5892 break; 5893 case OP_GETFH: 5894 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 5895 break; 5896 case OP_GETATTR: 5897 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 5898 break; 5899 case OP_OPENATTR: 5900 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 5901 break; 5902 default: 5903 zcmn_err(zoneid, CE_NOTE, "\t op %d, 
opcode %d", i, 5904 op->argop); 5905 break; 5906 } 5907 } 5908 } 5909 #endif 5910 5911 /* 5912 * nfs4lookup_setup - constructs a multi-lookup compound request. 5913 * 5914 * Given the path "nm1/nm2/.../nmn", the following compound requests 5915 * may be created: 5916 * 5917 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 5918 * is faster, for now. 5919 * 5920 * l4_getattrs indicates the type of compound requested. 5921 * 5922 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 5923 * 5924 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 5925 * 5926 * total number of ops is n + 1. 5927 * 5928 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 5929 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 5930 * before the last component, and only get attributes 5931 * for the last component. Note that the second-to-last 5932 * pathname component is XATTR_RPATH, which does NOT go 5933 * over-the-wire as a lookup. 5934 * 5935 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 5936 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 5937 * 5938 * and total number of ops is n + 5. 5939 * 5940 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 5941 * attribute directory: create lookups plus an OPENATTR 5942 * replacing the last lookup. Note that the last pathname 5943 * component is XATTR_RPATH, which does NOT go over-the-wire 5944 * as a lookup. 5945 * 5946 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 5947 * Openattr; Getfh; Getattr } 5948 * 5949 * and total number of ops is n + 5. 5950 * 5951 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 5952 * nodes too. 5953 * 5954 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 5955 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 5956 * 5957 * and total number of ops is 3*n + 1. 
5958 * 5959 * All cases: returns the index in the arg array of the final LOOKUP op, or 5960 * -1 if no LOOKUPs were used. 5961 */ 5962 int 5963 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 5964 { 5965 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 5966 nfs_argop4 *argbase, *argop; 5967 int arglen, argcnt; 5968 int n = 1; /* number of components */ 5969 int nga = 1; /* number of Getattr's in request */ 5970 char c = '\0', *s, *p; 5971 int lookup_idx = -1; 5972 int argoplist_size; 5973 5974 /* set lookuparg response result to 0 */ 5975 lookupargp->resp->status = NFS4_OK; 5976 5977 /* skip leading "/" or "." e.g. ".//./" if there is */ 5978 for (; ; nm++) { 5979 if (*nm != '/' && *nm != '.') 5980 break; 5981 5982 /* ".." is counted as 1 component */ 5983 if (*nm == '.' && *(nm + 1) == '.') 5984 break; 5985 } 5986 5987 /* 5988 * Find n = number of components - nm must be null terminated 5989 * Skip "." components. 5990 */ 5991 if (*nm != '\0') { 5992 for (n = 1, s = nm; *s != '\0'; s++) { 5993 if ((*s == '/') && (*(s + 1) != '/') && 5994 (*(s + 1) != '\0') && 5995 !(*(s + 1) == '.' && (*(s + 2) == '/' || 5996 *(s + 2) == '\0'))) 5997 n++; 5998 } 5999 } else 6000 n = 0; 6001 6002 /* 6003 * nga is number of components that need Getfh+Getattr 6004 */ 6005 switch (l4_getattrs) { 6006 case LKP4_NO_ATTRIBUTES: 6007 nga = 0; 6008 break; 6009 case LKP4_ALL_ATTRIBUTES: 6010 nga = n; 6011 /* 6012 * Always have at least 1 getfh, getattr pair 6013 */ 6014 if (nga == 0) 6015 nga++; 6016 break; 6017 case LKP4_LAST_ATTRDIR: 6018 case LKP4_LAST_NAMED_ATTR: 6019 nga = n+1; 6020 break; 6021 } 6022 6023 /* 6024 * If change to use the filehandle attr instead of getfh 6025 * the following line can be deleted. 
6026 */ 6027 nga *= 2; 6028 6029 /* 6030 * calculate number of ops in request as 6031 * header + trailer + lookups + getattrs 6032 */ 6033 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6034 6035 argoplist_size = arglen * sizeof (nfs_argop4); 6036 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6037 lookupargp->argsp->array = argop; 6038 6039 argcnt = lookupargp->header_len; 6040 argop += argcnt; 6041 6042 /* 6043 * loop and create a lookup op and possibly getattr/getfh for 6044 * each component. Skip "." components. 6045 */ 6046 for (s = nm; *s != '\0'; s = p) { 6047 /* 6048 * Set up a pathname struct for each component if needed 6049 */ 6050 while (*s == '/') 6051 s++; 6052 if (*s == '\0') 6053 break; 6054 for (p = s; (*p != '/') && (*p != '\0'); p++); 6055 c = *p; 6056 *p = '\0'; 6057 6058 if (s[0] == '.' && s[1] == '\0') { 6059 *p = c; 6060 continue; 6061 } 6062 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6063 strcmp(s, XATTR_RPATH) == 0) { 6064 /* getfh XXX may not be needed in future */ 6065 argop->argop = OP_GETFH; 6066 argop++; 6067 argcnt++; 6068 6069 /* getattr */ 6070 argop->argop = OP_GETATTR; 6071 argop->nfs_argop4_u.opgetattr.attr_request = 6072 lookupargp->ga_bits; 6073 argop->nfs_argop4_u.opgetattr.mi = 6074 lookupargp->mi; 6075 argop++; 6076 argcnt++; 6077 6078 /* openattr */ 6079 argop->argop = OP_OPENATTR; 6080 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6081 strcmp(s, XATTR_RPATH) == 0) { 6082 /* openattr */ 6083 argop->argop = OP_OPENATTR; 6084 argop++; 6085 argcnt++; 6086 6087 /* getfh XXX may not be needed in future */ 6088 argop->argop = OP_GETFH; 6089 argop++; 6090 argcnt++; 6091 6092 /* getattr */ 6093 argop->argop = OP_GETATTR; 6094 argop->nfs_argop4_u.opgetattr.attr_request = 6095 lookupargp->ga_bits; 6096 argop->nfs_argop4_u.opgetattr.mi = 6097 lookupargp->mi; 6098 argop++; 6099 argcnt++; 6100 *p = c; 6101 continue; 6102 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6103 /* lookupp */ 6104 argop->argop = OP_LOOKUPP; 6105 } else { 6106 /* lookup */ 6107 argop->argop = OP_LOOKUP; 6108 (void) str_to_utf8(s, 6109 &argop->nfs_argop4_u.oplookup.objname); 6110 } 6111 lookup_idx = argcnt; 6112 argop++; 6113 argcnt++; 6114 6115 *p = c; 6116 6117 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6118 /* getfh XXX may not be needed in future */ 6119 argop->argop = OP_GETFH; 6120 argop++; 6121 argcnt++; 6122 6123 /* getattr */ 6124 argop->argop = OP_GETATTR; 6125 argop->nfs_argop4_u.opgetattr.attr_request = 6126 lookupargp->ga_bits; 6127 argop->nfs_argop4_u.opgetattr.mi = 6128 lookupargp->mi; 6129 argop++; 6130 argcnt++; 6131 } 6132 } 6133 6134 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6135 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6136 if (needgetfh) { 6137 /* stick in a post-lookup getfh */ 6138 argop->argop = OP_GETFH; 6139 argcnt++; 6140 argop++; 6141 } 6142 /* post-lookup getattr */ 6143 argop->argop = OP_GETATTR; 6144 argop->nfs_argop4_u.opgetattr.attr_request = 6145 lookupargp->ga_bits; 6146 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6147 argcnt++; 6148 } 6149 argcnt += lookupargp->trailer_len; /* actual op count */ 6150 lookupargp->argsp->array_len = argcnt; 6151 lookupargp->arglen = arglen; 6152 6153 #ifdef DEBUG 6154 if (nfs4_client_lookup_debug) 6155 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6156 #endif 6157 6158 return (lookup_idx); 6159 } 6160 6161 static int 6162 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6163 { 6164 COMPOUND4args_clnt args; 6165 COMPOUND4res_clnt res; 6166 GETFH4res *gf_res = NULL; 6167 nfs_argop4 argop[4]; 6168 nfs_resop4 *resop = NULL; 6169 nfs4_sharedfh_t *sfhp; 6170 hrtime_t t; 6171 nfs4_error_t e; 6172 6173 rnode4_t *drp; 6174 int doqueue = 1; 6175 vnode_t *vp; 6176 int needrecov = 0; 6177 nfs4_recov_state_t recov_state; 6178 6179 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6180 6181 *avp = NULL; 6182 
recov_state.rs_flags = 0; 6183 recov_state.rs_num_retry_despite_err = 0; 6184 6185 recov_retry: 6186 /* COMPOUND: putfh, openattr, getfh, getattr */ 6187 args.array_len = 4; 6188 args.array = argop; 6189 args.ctag = TAG_OPENATTR; 6190 6191 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6192 if (e.error) 6193 return (e.error); 6194 6195 drp = VTOR4(dvp); 6196 6197 /* putfh */ 6198 argop[0].argop = OP_CPUTFH; 6199 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6200 6201 /* openattr */ 6202 argop[1].argop = OP_OPENATTR; 6203 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6204 6205 /* getfh */ 6206 argop[2].argop = OP_GETFH; 6207 6208 /* getattr */ 6209 argop[3].argop = OP_GETATTR; 6210 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6211 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6212 6213 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6214 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6215 rnode4info(drp))); 6216 6217 t = gethrtime(); 6218 6219 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6220 6221 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6222 if (needrecov) { 6223 bool_t abort; 6224 6225 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6226 "nfs4openattr: initiating recovery\n")); 6227 6228 abort = nfs4_start_recovery(&e, 6229 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6230 OP_OPENATTR, NULL); 6231 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6232 if (!e.error) { 6233 e.error = geterrno4(res.status); 6234 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6235 } 6236 if (abort == FALSE) 6237 goto recov_retry; 6238 return (e.error); 6239 } 6240 6241 if (e.error) { 6242 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6243 return (e.error); 6244 } 6245 6246 if (res.status) { 6247 /* 6248 * If OTW errro is NOTSUPP, then it should be 6249 * translated to EINVAL. 
All Solaris file system 6250 * implementations return EINVAL to the syscall layer 6251 * when the attrdir cannot be created due to an 6252 * implementation restriction or noxattr mount option. 6253 */ 6254 if (res.status == NFS4ERR_NOTSUPP) { 6255 mutex_enter(&drp->r_statelock); 6256 if (drp->r_xattr_dir) 6257 VN_RELE(drp->r_xattr_dir); 6258 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6259 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6260 mutex_exit(&drp->r_statelock); 6261 6262 e.error = EINVAL; 6263 } else { 6264 e.error = geterrno4(res.status); 6265 } 6266 6267 if (e.error) { 6268 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6269 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6270 needrecov); 6271 return (e.error); 6272 } 6273 } 6274 6275 resop = &res.array[0]; /* putfh res */ 6276 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6277 6278 resop = &res.array[1]; /* openattr res */ 6279 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6280 6281 resop = &res.array[2]; /* getfh res */ 6282 gf_res = &resop->nfs_resop4_u.opgetfh; 6283 if (gf_res->object.nfs_fh4_len == 0) { 6284 *avp = NULL; 6285 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6286 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6287 return (ENOENT); 6288 } 6289 6290 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6291 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6292 dvp->v_vfsp, t, cr, dvp, 6293 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH)); 6294 sfh4_rele(&sfhp); 6295 6296 if (e.error) 6297 PURGE_ATTRCACHE4(vp); 6298 6299 mutex_enter(&vp->v_lock); 6300 vp->v_flag |= V_XATTRDIR; 6301 mutex_exit(&vp->v_lock); 6302 6303 *avp = vp; 6304 6305 mutex_enter(&drp->r_statelock); 6306 if (drp->r_xattr_dir) 6307 VN_RELE(drp->r_xattr_dir); 6308 VN_HOLD(vp); 6309 drp->r_xattr_dir = vp; 6310 6311 /* 6312 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6313 * NULL. 
xattrs could be created at any time, and we have no 6314 * way to update pc4_xattr_exists in the base object if/when 6315 * it happens. 6316 */ 6317 drp->r_pathconf.pc4_xattr_valid = 0; 6318 6319 mutex_exit(&drp->r_statelock); 6320 6321 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6322 6323 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6324 6325 return (0); 6326 } 6327 6328 /* ARGSUSED */ 6329 static int 6330 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6331 int mode, vnode_t **vpp, cred_t *cr, int flags) 6332 { 6333 int error; 6334 vnode_t *vp = NULL; 6335 rnode4_t *rp; 6336 struct vattr vattr; 6337 rnode4_t *drp; 6338 vnode_t *tempvp; 6339 enum createmode4 createmode; 6340 bool_t must_trunc = FALSE; 6341 6342 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6343 return (EPERM); 6344 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6345 return (EINVAL); 6346 } 6347 6348 /* . and .. have special meaning in the protocol, reject them. */ 6349 6350 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6351 return (EISDIR); 6352 6353 drp = VTOR4(dvp); 6354 6355 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6356 return (EINTR); 6357 6358 top: 6359 /* 6360 * We make a copy of the attributes because the caller does not 6361 * expect us to change what va points to. 6362 */ 6363 vattr = *va; 6364 6365 /* 6366 * If the pathname is "", then dvp is the root vnode of 6367 * a remote file mounted over a local directory. 6368 * All that needs to be done is access 6369 * checking and truncation. Note that we avoid doing 6370 * open w/ create because the parent directory might 6371 * be in pseudo-fs and the open would fail. 6372 */ 6373 if (*nm == '\0') { 6374 error = 0; 6375 VN_HOLD(dvp); 6376 vp = dvp; 6377 must_trunc = TRUE; 6378 } else { 6379 /* 6380 * We need to go over the wire, just to be sure whether the 6381 * file exists or not. 
Using the DNLC can be dangerous in 6382 * this case when making a decision regarding existence. 6383 */ 6384 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6385 } 6386 6387 if (exclusive) 6388 createmode = EXCLUSIVE4; 6389 else 6390 createmode = GUARDED4; 6391 6392 /* 6393 * error would be set if the file does not exist on the 6394 * server, so lets go create it. 6395 */ 6396 if (error) { 6397 goto create_otw; 6398 } 6399 6400 /* 6401 * File does exist on the server 6402 */ 6403 if (exclusive == EXCL) 6404 error = EEXIST; 6405 else if (vp->v_type == VDIR && (mode & VWRITE)) 6406 error = EISDIR; 6407 else { 6408 /* 6409 * If vnode is a device, create special vnode. 6410 */ 6411 if (ISVDEV(vp->v_type)) { 6412 tempvp = vp; 6413 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6414 VN_RELE(tempvp); 6415 } 6416 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) { 6417 if ((vattr.va_mask & AT_SIZE) && 6418 vp->v_type == VREG) { 6419 rp = VTOR4(vp); 6420 /* 6421 * Check here for large file handled 6422 * by LF-unaware process (as 6423 * ufs_create() does) 6424 */ 6425 if (!(flags & FOFFMAX)) { 6426 mutex_enter(&rp->r_statelock); 6427 if (rp->r_size > MAXOFF32_T) 6428 error = EOVERFLOW; 6429 mutex_exit(&rp->r_statelock); 6430 } 6431 6432 /* if error is set then we need to return */ 6433 if (error) { 6434 nfs_rw_exit(&drp->r_rwlock); 6435 VN_RELE(vp); 6436 return (error); 6437 } 6438 6439 if (must_trunc) { 6440 vattr.va_mask = AT_SIZE; 6441 error = nfs4setattr(vp, &vattr, 0, cr, 6442 NULL); 6443 } else { 6444 /* 6445 * we know we have a regular file that already 6446 * exists and we may end up truncating the file 6447 * as a result of the open_otw, so flush out 6448 * any dirty pages for this file first. 
6449 */ 6450 if (nfs4_has_pages(vp) && 6451 ((rp->r_flags & R4DIRTY) || 6452 rp->r_count > 0 || 6453 rp->r_mapcnt > 0)) { 6454 error = nfs4_putpage(vp, 6455 (offset_t)0, 0, 0, cr); 6456 if (error && (error == ENOSPC || 6457 error == EDQUOT)) { 6458 mutex_enter( 6459 &rp->r_statelock); 6460 if (!rp->r_error) 6461 rp->r_error = 6462 error; 6463 mutex_exit( 6464 &rp->r_statelock); 6465 } 6466 } 6467 vattr.va_mask = (AT_SIZE | 6468 AT_TYPE | AT_MODE); 6469 vattr.va_type = VREG; 6470 createmode = UNCHECKED4; 6471 goto create_otw; 6472 } 6473 } 6474 } 6475 } 6476 nfs_rw_exit(&drp->r_rwlock); 6477 if (error) { 6478 VN_RELE(vp); 6479 } else { 6480 *vpp = vp; 6481 } 6482 return (error); 6483 6484 create_otw: 6485 dnlc_remove(dvp, nm); 6486 6487 ASSERT(vattr.va_mask & AT_TYPE); 6488 6489 /* 6490 * If not a regular file let nfs4mknod() handle it. 6491 */ 6492 if (vattr.va_type != VREG) { 6493 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6494 nfs_rw_exit(&drp->r_rwlock); 6495 return (error); 6496 } 6497 6498 /* 6499 * It _is_ a regular file. 6500 */ 6501 ASSERT(vattr.va_mask & AT_MODE); 6502 if (MANDMODE(vattr.va_mode)) { 6503 nfs_rw_exit(&drp->r_rwlock); 6504 return (EACCES); 6505 } 6506 6507 /* 6508 * If this happens to be a mknod of a regular file, then flags will 6509 * have neither FREAD or FWRITE. However, we must set at least one 6510 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6511 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6512 * set (based on openmode specified by app). 
6513 */ 6514 if ((flags & (FREAD|FWRITE)) == 0) 6515 flags |= (FREAD|FWRITE); 6516 6517 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6518 6519 if (vp != NULL) { 6520 /* if create was successful, throw away the file's pages */ 6521 if (!error && (vattr.va_mask & AT_SIZE)) 6522 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6523 cr); 6524 /* release the lookup hold */ 6525 VN_RELE(vp); 6526 vp = NULL; 6527 } 6528 6529 /* 6530 * validate that we opened a regular file. This handles a misbehaving 6531 * server that returns an incorrect FH. 6532 */ 6533 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6534 error = EISDIR; 6535 VN_RELE(*vpp); 6536 } 6537 6538 /* 6539 * If this is not an exclusive create, then the CREATE 6540 * request will be made with the GUARDED mode set. This 6541 * means that the server will return EEXIST if the file 6542 * exists. The file could exist because of a retransmitted 6543 * request. In this case, we recover by starting over and 6544 * checking to see whether the file exists. This second 6545 * time through it should and a CREATE request will not be 6546 * sent. 6547 * 6548 * This handles the problem of a dangling CREATE request 6549 * which contains attributes which indicate that the file 6550 * should be truncated. This retransmitted request could 6551 * possibly truncate valid data in the file if not caught 6552 * by the duplicate request mechanism on the server or if 6553 * not caught by other means. The scenario is: 6554 * 6555 * Client transmits CREATE request with size = 0 6556 * Client times out, retransmits request. 6557 * Response to the first request arrives from the server 6558 * and the client proceeds on. 6559 * Client writes data to the file. 6560 * The server now processes retransmitted CREATE request 6561 * and truncates file. 
6562 * 6563 * The use of the GUARDED CREATE request prevents this from 6564 * happening because the retransmitted CREATE would fail 6565 * with EEXIST and would not truncate the file. 6566 */ 6567 if (error == EEXIST && exclusive == NONEXCL) { 6568 #ifdef DEBUG 6569 nfs4_create_misses++; 6570 #endif 6571 goto top; 6572 } 6573 nfs_rw_exit(&drp->r_rwlock); 6574 return (error); 6575 } 6576 6577 /* 6578 * Create compound (for mkdir, mknod, symlink): 6579 * { Putfh <dfh>; Create; Getfh; Getattr } 6580 * It's okay if setattr failed to set gid - this is not considered 6581 * an error, but purge attrs in that case. 6582 */ 6583 static int 6584 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6585 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6586 { 6587 int need_end_op = FALSE; 6588 COMPOUND4args_clnt args; 6589 COMPOUND4res_clnt res, *resp = NULL; 6590 nfs_argop4 *argop; 6591 nfs_resop4 *resop; 6592 int doqueue; 6593 mntinfo4_t *mi; 6594 rnode4_t *drp = VTOR4(dvp); 6595 change_info4 *cinfo; 6596 GETFH4res *gf_res; 6597 struct vattr vattr; 6598 vnode_t *vp; 6599 fattr4 *crattr; 6600 bool_t needrecov = FALSE; 6601 nfs4_recov_state_t recov_state; 6602 nfs4_sharedfh_t *sfhp = NULL; 6603 hrtime_t t; 6604 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6605 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6606 dirattr_info_t dinfo, *dinfop; 6607 servinfo4_t *svp; 6608 bitmap4 supp_attrs; 6609 6610 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6611 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6612 6613 mi = VTOMI4(dvp); 6614 6615 /* 6616 * Make sure we properly deal with setting the right gid 6617 * on a new directory to reflect the parent's setgid bit 6618 */ 6619 setgid_flag = 0; 6620 if (type == NF4DIR) { 6621 struct vattr dva; 6622 6623 va->va_mode &= ~VSGID; 6624 dva.va_mask = AT_MODE | AT_GID; 6625 if (VOP_GETATTR(dvp, &dva, 0, cr) == 0) { 6626 6627 /* 6628 * If the parent's directory has the setgid bit set 6629 
* _and_ the client was able to get a valid mapping 6630 * for the parent dir's owner_group, we want to 6631 * append NVERIFY(owner_group == dva.va_gid) and 6632 * SETTATTR to the CREATE compound. 6633 */ 6634 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6635 setgid_flag = 1; 6636 va->va_mode |= VSGID; 6637 if (dva.va_gid != GID_NOBODY) { 6638 va->va_mask |= AT_GID; 6639 va->va_gid = dva.va_gid; 6640 } 6641 } 6642 } 6643 } 6644 6645 /* 6646 * Create ops: 6647 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6648 * 5:restorefh(dir) 6:getattr(dir) 6649 * 6650 * if (setgid) 6651 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6652 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6653 * 8:nverify 9:setattr 6654 */ 6655 if (setgid_flag) { 6656 numops = 10; 6657 idx_create = 1; 6658 idx_fattr = 3; 6659 } else { 6660 numops = 7; 6661 idx_create = 2; 6662 idx_fattr = 4; 6663 } 6664 6665 ASSERT(nfs_zone() == mi->mi_zone); 6666 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6667 return (EINTR); 6668 } 6669 recov_state.rs_flags = 0; 6670 recov_state.rs_num_retry_despite_err = 0; 6671 6672 argoplist_size = numops * sizeof (nfs_argop4); 6673 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6674 6675 recov_retry: 6676 if (type == NF4LNK) 6677 args.ctag = TAG_SYMLINK; 6678 else if (type == NF4DIR) 6679 args.ctag = TAG_MKDIR; 6680 else 6681 args.ctag = TAG_MKNOD; 6682 6683 args.array_len = numops; 6684 args.array = argop; 6685 6686 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6687 nfs_rw_exit(&drp->r_rwlock); 6688 kmem_free(argop, argoplist_size); 6689 return (e.error); 6690 } 6691 need_end_op = TRUE; 6692 6693 6694 /* 0: putfh directory */ 6695 argop[0].argop = OP_CPUTFH; 6696 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6697 6698 /* 1/2: Create object */ 6699 argop[idx_create].argop = OP_CCREATE; 6700 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6701 argop[idx_create].nfs_argop4_u.opccreate.type = 
type; 6702 if (type == NF4LNK) { 6703 /* 6704 * symlink, treat name as data 6705 */ 6706 ASSERT(data != NULL); 6707 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6708 (char *)data; 6709 } 6710 if (type == NF4BLK || type == NF4CHR) { 6711 ASSERT(data != NULL); 6712 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6713 *((specdata4 *)data); 6714 } 6715 6716 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6717 6718 svp = drp->r_server; 6719 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6720 supp_attrs = svp->sv_supp_attrs; 6721 nfs_rw_exit(&svp->sv_lock); 6722 6723 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6724 nfs_rw_exit(&drp->r_rwlock); 6725 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6726 e.error = EINVAL; 6727 kmem_free(argop, argoplist_size); 6728 return (e.error); 6729 } 6730 6731 /* 2/3: getfh fh of created object */ 6732 ASSERT(idx_create + 1 == idx_fattr - 1); 6733 argop[idx_create + 1].argop = OP_GETFH; 6734 6735 /* 3/4: getattr of new object */ 6736 argop[idx_fattr].argop = OP_GETATTR; 6737 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6738 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6739 6740 if (setgid_flag) { 6741 vattr_t _v; 6742 6743 argop[4].argop = OP_SAVEFH; 6744 6745 argop[5].argop = OP_CPUTFH; 6746 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6747 6748 argop[6].argop = OP_GETATTR; 6749 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6750 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6751 6752 argop[7].argop = OP_RESTOREFH; 6753 6754 /* 6755 * nverify 6756 * 6757 * XXX - Revisit the last argument to nfs4_end_op() 6758 * once 5020486 is fixed. 
6759 */ 6760 _v.va_mask = AT_GID; 6761 _v.va_gid = va->va_gid; 6762 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6763 supp_attrs)) { 6764 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6765 nfs_rw_exit(&drp->r_rwlock); 6766 nfs4_fattr4_free(crattr); 6767 kmem_free(argop, argoplist_size); 6768 return (e.error); 6769 } 6770 6771 /* 6772 * setattr 6773 * 6774 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6775 * so no need for stateid or flags. Also we specify NULL 6776 * rp since we're only interested in setting owner_group 6777 * attributes. 6778 */ 6779 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6780 &e.error, 0); 6781 6782 if (e.error) { 6783 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6784 nfs_rw_exit(&drp->r_rwlock); 6785 nfs4_fattr4_free(crattr); 6786 nfs4args_verify_free(&argop[8]); 6787 kmem_free(argop, argoplist_size); 6788 return (e.error); 6789 } 6790 } else { 6791 argop[1].argop = OP_SAVEFH; 6792 6793 argop[5].argop = OP_RESTOREFH; 6794 6795 argop[6].argop = OP_GETATTR; 6796 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6797 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6798 } 6799 6800 dnlc_remove(dvp, nm); 6801 6802 doqueue = 1; 6803 t = gethrtime(); 6804 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6805 6806 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6807 if (e.error) { 6808 PURGE_ATTRCACHE4(dvp); 6809 if (!needrecov) 6810 goto out; 6811 } 6812 6813 if (needrecov) { 6814 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6815 OP_CREATE, NULL) == FALSE) { 6816 nfs4_end_op(mi, dvp, NULL, &recov_state, 6817 needrecov); 6818 need_end_op = FALSE; 6819 nfs4_fattr4_free(crattr); 6820 if (setgid_flag) { 6821 nfs4args_verify_free(&argop[8]); 6822 nfs4args_setattr_free(&argop[9]); 6823 } 6824 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6825 goto recov_retry; 6826 } 6827 } 6828 6829 resp = &res; 6830 6831 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6832 
6833 if (res.status == NFS4ERR_BADOWNER) 6834 nfs4_log_badowner(mi, OP_CREATE); 6835 6836 e.error = geterrno4(res.status); 6837 6838 /* 6839 * This check is left over from when create was implemented 6840 * using a setattr op (instead of createattrs). If the 6841 * putfh/create/getfh failed, the error was returned. If 6842 * setattr/getattr failed, we keep going. 6843 * 6844 * It might be better to get rid of the GETFH also, and just 6845 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6846 * Then if any of the operations failed, we could return the 6847 * error now, and remove much of the error code below. 6848 */ 6849 if (res.array_len <= idx_fattr) { 6850 /* 6851 * Either Putfh, Create or Getfh failed. 6852 */ 6853 PURGE_ATTRCACHE4(dvp); 6854 /* 6855 * nfs4_purge_stale_fh() may generate otw calls through 6856 * nfs4_invalidate_pages. Hence the need to call 6857 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 6858 */ 6859 nfs4_end_op(mi, dvp, NULL, &recov_state, 6860 needrecov); 6861 need_end_op = FALSE; 6862 nfs4_purge_stale_fh(e.error, dvp, cr); 6863 goto out; 6864 } 6865 } 6866 6867 resop = &res.array[idx_create]; /* create res */ 6868 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 6869 6870 resop = &res.array[idx_create + 1]; /* getfh res */ 6871 gf_res = &resop->nfs_resop4_u.opgetfh; 6872 6873 sfhp = sfh4_get(&gf_res->object, mi); 6874 if (e.error) { 6875 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 6876 fn_get(VTOSV(dvp)->sv_name, nm)); 6877 if (vp->v_type == VNON) { 6878 vattr.va_mask = AT_TYPE; 6879 /* 6880 * Need to call nfs4_end_op before nfs4getattr to avoid 6881 * potential nfs4_start_op deadlock. See RFE 4777612. 
6882 */ 6883 nfs4_end_op(mi, dvp, NULL, &recov_state, 6884 needrecov); 6885 need_end_op = FALSE; 6886 e.error = nfs4getattr(vp, &vattr, cr); 6887 if (e.error) { 6888 VN_RELE(vp); 6889 *vpp = NULL; 6890 goto out; 6891 } 6892 vp->v_type = vattr.va_type; 6893 } 6894 e.error = 0; 6895 } else { 6896 *vpp = vp = makenfs4node(sfhp, 6897 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 6898 dvp->v_vfsp, t, cr, 6899 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 6900 } 6901 6902 /* 6903 * If compound succeeded, then update dir attrs 6904 */ 6905 if (res.status == NFS4_OK) { 6906 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 6907 dinfo.di_cred = cr; 6908 dinfo.di_time_call = t; 6909 dinfop = &dinfo; 6910 } else 6911 dinfop = NULL; 6912 6913 /* Update directory cache attribute, readdir and dnlc caches */ 6914 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 6915 6916 out: 6917 if (sfhp != NULL) 6918 sfh4_rele(&sfhp); 6919 nfs_rw_exit(&drp->r_rwlock); 6920 nfs4_fattr4_free(crattr); 6921 if (setgid_flag) { 6922 nfs4args_verify_free(&argop[8]); 6923 nfs4args_setattr_free(&argop[9]); 6924 } 6925 if (resp) 6926 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 6927 if (need_end_op) 6928 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6929 6930 kmem_free(argop, argoplist_size); 6931 return (e.error); 6932 } 6933 6934 /* ARGSUSED */ 6935 static int 6936 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6937 int mode, vnode_t **vpp, cred_t *cr) 6938 { 6939 int error; 6940 vnode_t *vp; 6941 nfs_ftype4 type; 6942 specdata4 spec, *specp = NULL; 6943 6944 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6945 6946 switch (va->va_type) { 6947 case VCHR: 6948 case VBLK: 6949 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 6950 spec.specdata1 = getmajor(va->va_rdev); 6951 spec.specdata2 = getminor(va->va_rdev); 6952 specp = &spec; 6953 break; 6954 6955 case VFIFO: 6956 type = NF4FIFO; 6957 break; 6958 case VSOCK: 6959 type = NF4SOCK; 6960 break; 6961 6962 default: 6963 return (EINVAL); 6964 } 6965 6966 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 6967 if (error) { 6968 return (error); 6969 } 6970 6971 /* 6972 * This might not be needed any more; special case to deal 6973 * with problematic v2/v3 servers. Since create was unable 6974 * to set group correctly, not sure what hope setattr has. 6975 */ 6976 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 6977 va->va_mask = AT_GID; 6978 (void) nfs4setattr(vp, va, 0, cr, NULL); 6979 } 6980 6981 /* 6982 * If vnode is a device create special vnode 6983 */ 6984 if (ISVDEV(vp->v_type)) { 6985 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6986 VN_RELE(vp); 6987 } else { 6988 *vpp = vp; 6989 } 6990 return (error); 6991 } 6992 6993 /* 6994 * Remove requires that the current fh be the target directory. 6995 * After the operation, the current fh is unchanged. 6996 * The compound op structure is: 6997 * PUTFH(targetdir), REMOVE 6998 * 6999 * Weirdness: if the vnode to be removed is open 7000 * we rename it instead of removing it and nfs_inactive 7001 * will remove the new name. 
7002 */ 7003 static int 7004 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr) 7005 { 7006 COMPOUND4args_clnt args; 7007 COMPOUND4res_clnt res, *resp = NULL; 7008 REMOVE4res *rm_res; 7009 nfs_argop4 argop[3]; 7010 nfs_resop4 *resop; 7011 vnode_t *vp; 7012 char *tmpname; 7013 int doqueue; 7014 mntinfo4_t *mi; 7015 rnode4_t *rp; 7016 rnode4_t *drp; 7017 int needrecov = 0; 7018 nfs4_recov_state_t recov_state; 7019 int isopen; 7020 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7021 dirattr_info_t dinfo; 7022 7023 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7024 return (EPERM); 7025 drp = VTOR4(dvp); 7026 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7027 return (EINTR); 7028 7029 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7030 if (e.error) { 7031 nfs_rw_exit(&drp->r_rwlock); 7032 return (e.error); 7033 } 7034 7035 if (vp->v_type == VDIR) { 7036 VN_RELE(vp); 7037 nfs_rw_exit(&drp->r_rwlock); 7038 return (EISDIR); 7039 } 7040 7041 /* 7042 * First just remove the entry from the name cache, as it 7043 * is most likely the only entry for this vp. 7044 */ 7045 dnlc_remove(dvp, nm); 7046 7047 rp = VTOR4(vp); 7048 7049 /* 7050 * For regular file types, check to see if the file is open by looking 7051 * at the open streams. 7052 * For all other types, check the reference count on the vnode. Since 7053 * they are not opened OTW they never have an open stream. 7054 * 7055 * If the file is open, rename it to .nfsXXXX. 7056 */ 7057 if (vp->v_type != VREG) { 7058 /* 7059 * If the file has a v_count > 1 then there may be more than one 7060 * entry in the name cache due multiple links or an open file, 7061 * but we don't have the real reference count so flush all 7062 * possible entries. 7063 */ 7064 if (vp->v_count > 1) 7065 dnlc_purge_vp(vp); 7066 7067 /* 7068 * Now we have the real reference count. 
7069 */ 7070 isopen = vp->v_count > 1; 7071 } else { 7072 mutex_enter(&rp->r_os_lock); 7073 isopen = list_head(&rp->r_open_streams) != NULL; 7074 mutex_exit(&rp->r_os_lock); 7075 } 7076 7077 mutex_enter(&rp->r_statelock); 7078 if (isopen && 7079 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7080 mutex_exit(&rp->r_statelock); 7081 tmpname = newname(); 7082 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr); 7083 if (e.error) 7084 kmem_free(tmpname, MAXNAMELEN); 7085 else { 7086 mutex_enter(&rp->r_statelock); 7087 if (rp->r_unldvp == NULL) { 7088 VN_HOLD(dvp); 7089 rp->r_unldvp = dvp; 7090 if (rp->r_unlcred != NULL) 7091 crfree(rp->r_unlcred); 7092 crhold(cr); 7093 rp->r_unlcred = cr; 7094 rp->r_unlname = tmpname; 7095 } else { 7096 kmem_free(rp->r_unlname, MAXNAMELEN); 7097 rp->r_unlname = tmpname; 7098 } 7099 mutex_exit(&rp->r_statelock); 7100 } 7101 VN_RELE(vp); 7102 nfs_rw_exit(&drp->r_rwlock); 7103 return (e.error); 7104 } 7105 /* 7106 * Actually remove the file/dir 7107 */ 7108 mutex_exit(&rp->r_statelock); 7109 7110 /* 7111 * We need to flush any dirty pages which happen to 7112 * be hanging around before removing the file. 7113 * This shouldn't happen very often since in NFSv4 7114 * we should be close to open consistent. 
7115 */ 7116 if (nfs4_has_pages(vp) && 7117 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7118 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr); 7119 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7120 mutex_enter(&rp->r_statelock); 7121 if (!rp->r_error) 7122 rp->r_error = e.error; 7123 mutex_exit(&rp->r_statelock); 7124 } 7125 } 7126 7127 mi = VTOMI4(dvp); 7128 7129 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7130 recov_state.rs_flags = 0; 7131 recov_state.rs_num_retry_despite_err = 0; 7132 7133 recov_retry: 7134 /* 7135 * Remove ops: putfh dir; remove 7136 */ 7137 args.ctag = TAG_REMOVE; 7138 args.array_len = 3; 7139 args.array = argop; 7140 7141 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7142 if (e.error) { 7143 nfs_rw_exit(&drp->r_rwlock); 7144 VN_RELE(vp); 7145 return (e.error); 7146 } 7147 7148 /* putfh directory */ 7149 argop[0].argop = OP_CPUTFH; 7150 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7151 7152 /* remove */ 7153 argop[1].argop = OP_CREMOVE; 7154 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7155 7156 /* getattr dir */ 7157 argop[2].argop = OP_GETATTR; 7158 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7159 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7160 7161 doqueue = 1; 7162 dinfo.di_time_call = gethrtime(); 7163 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7164 7165 PURGE_ATTRCACHE4(vp); 7166 7167 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7168 if (e.error) 7169 PURGE_ATTRCACHE4(dvp); 7170 7171 if (needrecov) { 7172 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7173 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) { 7174 if (!e.error) 7175 (void) xdr_free(xdr_COMPOUND4res_clnt, 7176 (caddr_t)&res); 7177 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7178 needrecov); 7179 goto recov_retry; 7180 } 7181 } 7182 7183 /* 7184 * Matching nfs4_end_op() for start_op() above. 
7185 * There is a path in the code below which calls 7186 * nfs4_purge_stale_fh(), which may generate otw calls through 7187 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7188 * here to avoid nfs4_start_op() deadlock. 7189 */ 7190 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7191 7192 if (!e.error) { 7193 resp = &res; 7194 7195 if (res.status) { 7196 e.error = geterrno4(res.status); 7197 PURGE_ATTRCACHE4(dvp); 7198 nfs4_purge_stale_fh(e.error, dvp, cr); 7199 } else { 7200 resop = &res.array[1]; /* remove res */ 7201 rm_res = &resop->nfs_resop4_u.opremove; 7202 7203 dinfo.di_garp = 7204 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7205 dinfo.di_cred = cr; 7206 7207 /* Update directory attr, readdir and dnlc caches */ 7208 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7209 &dinfo); 7210 } 7211 } 7212 nfs_rw_exit(&drp->r_rwlock); 7213 if (resp) 7214 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7215 7216 VN_RELE(vp); 7217 return (e.error); 7218 } 7219 7220 /* 7221 * Link requires that the current fh be the target directory and the 7222 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7223 * Thus the compound op structure is: 7224 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7225 * GETATTR(file) 7226 */ 7227 static int 7228 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 7229 { 7230 COMPOUND4args_clnt args; 7231 COMPOUND4res_clnt res, *resp = NULL; 7232 LINK4res *ln_res; 7233 int argoplist_size = 7 * sizeof (nfs_argop4); 7234 nfs_argop4 *argop; 7235 nfs_resop4 *resop; 7236 vnode_t *realvp, *nvp; 7237 int doqueue; 7238 mntinfo4_t *mi; 7239 rnode4_t *tdrp; 7240 bool_t needrecov = FALSE; 7241 nfs4_recov_state_t recov_state; 7242 hrtime_t t; 7243 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7244 dirattr_info_t dinfo; 7245 7246 ASSERT(*tnm != '\0'); 7247 ASSERT(tdvp->v_type == VDIR); 7248 ASSERT(nfs4_consistent_type(tdvp)); 7249 ASSERT(nfs4_consistent_type(svp)); 7250 7251 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7252 return (EPERM); 7253 if (VOP_REALVP(svp, &realvp) == 0) { 7254 svp = realvp; 7255 ASSERT(nfs4_consistent_type(svp)); 7256 } 7257 7258 tdrp = VTOR4(tdvp); 7259 mi = VTOMI4(svp); 7260 7261 if (!(mi->mi_flags & MI4_LINK)) { 7262 return (EOPNOTSUPP); 7263 } 7264 recov_state.rs_flags = 0; 7265 recov_state.rs_num_retry_despite_err = 0; 7266 7267 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7268 return (EINTR); 7269 7270 recov_retry: 7271 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7272 7273 args.ctag = TAG_LINK; 7274 7275 /* 7276 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7277 * restorefh; getattr(fl) 7278 */ 7279 args.array_len = 7; 7280 args.array = argop; 7281 7282 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7283 if (e.error) { 7284 kmem_free(argop, argoplist_size); 7285 nfs_rw_exit(&tdrp->r_rwlock); 7286 return (e.error); 7287 } 7288 7289 /* 0. putfh file */ 7290 argop[0].argop = OP_CPUTFH; 7291 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7292 7293 /* 1. 
save current fh to free up the space for the dir */ 7294 argop[1].argop = OP_SAVEFH; 7295 7296 /* 2. putfh targetdir */ 7297 argop[2].argop = OP_CPUTFH; 7298 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7299 7300 /* 3. link: current_fh is targetdir, saved_fh is source */ 7301 argop[3].argop = OP_CLINK; 7302 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7303 7304 /* 4. Get attributes of dir */ 7305 argop[4].argop = OP_GETATTR; 7306 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7307 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7308 7309 /* 5. If link was successful, restore current vp to file */ 7310 argop[5].argop = OP_RESTOREFH; 7311 7312 /* 6. Get attributes of linked object */ 7313 argop[6].argop = OP_GETATTR; 7314 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7315 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7316 7317 dnlc_remove(tdvp, tnm); 7318 7319 doqueue = 1; 7320 t = gethrtime(); 7321 7322 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7323 7324 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7325 if (e.error != 0 && !needrecov) { 7326 PURGE_ATTRCACHE4(tdvp); 7327 PURGE_ATTRCACHE4(svp); 7328 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7329 goto out; 7330 } 7331 7332 if (needrecov) { 7333 bool_t abort; 7334 7335 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7336 NULL, NULL, OP_LINK, NULL); 7337 if (abort == FALSE) { 7338 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7339 needrecov); 7340 kmem_free(argop, argoplist_size); 7341 if (!e.error) 7342 (void) xdr_free(xdr_COMPOUND4res_clnt, 7343 (caddr_t)&res); 7344 goto recov_retry; 7345 } else { 7346 if (e.error != 0) { 7347 PURGE_ATTRCACHE4(tdvp); 7348 PURGE_ATTRCACHE4(svp); 7349 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7350 &recov_state, needrecov); 7351 goto out; 7352 } 7353 /* fall through for res.status case */ 7354 } 7355 } 7356 7357 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7358 7359 resp = &res; 7360 if 
(res.status) { 7361 /* If link succeeded, then don't return error */ 7362 e.error = geterrno4(res.status); 7363 if (res.array_len <= 4) { 7364 /* 7365 * Either Putfh, Savefh, Putfh dir, or Link failed 7366 */ 7367 PURGE_ATTRCACHE4(svp); 7368 PURGE_ATTRCACHE4(tdvp); 7369 if (e.error == EOPNOTSUPP) { 7370 mutex_enter(&mi->mi_lock); 7371 mi->mi_flags &= ~MI4_LINK; 7372 mutex_exit(&mi->mi_lock); 7373 } 7374 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7375 /* XXX-LP */ 7376 if (e.error == EISDIR && crgetuid(cr) != 0) 7377 e.error = EPERM; 7378 goto out; 7379 } 7380 } 7381 7382 /* either no error or one of the postop getattr failed */ 7383 7384 /* 7385 * XXX - if LINK succeeded, but no attrs were returned for link 7386 * file, purge its cache. 7387 * 7388 * XXX Perform a simplified version of wcc checking. Instead of 7389 * have another getattr to get pre-op, just purge cache if 7390 * any of the ops prior to and including the getattr failed. 7391 * If the getattr succeeded then update the attrcache accordingly. 7392 */ 7393 7394 /* 7395 * update cache with link file postattrs. 7396 * Note: at this point resop points to link res. 7397 */ 7398 resop = &res.array[3]; /* link res */ 7399 ln_res = &resop->nfs_resop4_u.oplink; 7400 if (res.status == NFS4_OK) { 7401 e.error = nfs4_update_attrcache(res.status, 7402 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7403 t, svp, cr); 7404 } 7405 7406 /* 7407 * Call makenfs4node to create the new shadow vp for tnm. 7408 * We pass NULL attrs because we just cached attrs for 7409 * the src object. All we're trying to accomplish is to 7410 * to create the new shadow vnode. 
7411 */ 7412 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7413 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm)); 7414 7415 /* Update target cache attribute, readdir and dnlc caches */ 7416 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7417 dinfo.di_time_call = t; 7418 dinfo.di_cred = cr; 7419 7420 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7421 ASSERT(nfs4_consistent_type(tdvp)); 7422 ASSERT(nfs4_consistent_type(svp)); 7423 ASSERT(nfs4_consistent_type(nvp)); 7424 VN_RELE(nvp); 7425 7426 out: 7427 kmem_free(argop, argoplist_size); 7428 if (resp) 7429 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7430 7431 nfs_rw_exit(&tdrp->r_rwlock); 7432 7433 return (e.error); 7434 } 7435 7436 static int 7437 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7438 { 7439 vnode_t *realvp; 7440 7441 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7442 return (EPERM); 7443 if (VOP_REALVP(ndvp, &realvp) == 0) 7444 ndvp = realvp; 7445 7446 return (nfs4rename(odvp, onm, ndvp, nnm, cr)); 7447 } 7448 7449 /* 7450 * nfs4rename does the real work of renaming in NFS Version 4. 7451 * 7452 * A file handle is considered volatile for renaming purposes if either 7453 * of the volatile bits are turned on. However, the compound may differ 7454 * based on the likelihood of the filehandle to change during rename. 7455 */ 7456 static int 7457 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr) 7458 { 7459 int error; 7460 mntinfo4_t *mi; 7461 vnode_t *nvp; 7462 vnode_t *ovp = NULL; 7463 char *tmpname = NULL; 7464 rnode4_t *rp; 7465 rnode4_t *odrp; 7466 rnode4_t *ndrp; 7467 int did_link = 0; 7468 int do_link = 1; 7469 nfsstat4 stat = NFS4_OK; 7470 7471 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7472 ASSERT(nfs4_consistent_type(odvp)); 7473 ASSERT(nfs4_consistent_type(ndvp)); 7474 7475 if (onm[0] == '.' && (onm[1] == '\0' || 7476 (onm[1] == '.' 
&& onm[2] == '\0'))) 7477 return (EINVAL); 7478 7479 if (nnm[0] == '.' && (nnm[1] == '\0' || 7480 (nnm[1] == '.' && nnm[2] == '\0'))) 7481 return (EINVAL); 7482 7483 odrp = VTOR4(odvp); 7484 ndrp = VTOR4(ndvp); 7485 if ((intptr_t)odrp < (intptr_t)ndrp) { 7486 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7487 return (EINTR); 7488 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7489 nfs_rw_exit(&odrp->r_rwlock); 7490 return (EINTR); 7491 } 7492 } else { 7493 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7494 return (EINTR); 7495 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7496 nfs_rw_exit(&ndrp->r_rwlock); 7497 return (EINTR); 7498 } 7499 } 7500 7501 /* 7502 * Lookup the target file. If it exists, it needs to be 7503 * checked to see whether it is a mount point and whether 7504 * it is active (open). 7505 */ 7506 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7507 if (!error) { 7508 int isactive; 7509 7510 ASSERT(nfs4_consistent_type(nvp)); 7511 /* 7512 * If this file has been mounted on, then just 7513 * return busy because renaming to it would remove 7514 * the mounted file system from the name space. 7515 */ 7516 if (vn_ismntpt(nvp)) { 7517 VN_RELE(nvp); 7518 nfs_rw_exit(&odrp->r_rwlock); 7519 nfs_rw_exit(&ndrp->r_rwlock); 7520 return (EBUSY); 7521 } 7522 7523 /* 7524 * First just remove the entry from the name cache, as it 7525 * is most likely the only entry for this vp. 7526 */ 7527 dnlc_remove(ndvp, nnm); 7528 7529 rp = VTOR4(nvp); 7530 7531 if (nvp->v_type != VREG) { 7532 /* 7533 * Purge the name cache of all references to this vnode 7534 * so that we can check the reference count to infer 7535 * whether it is active or not. 
7536 */ 7537 if (nvp->v_count > 1) 7538 dnlc_purge_vp(nvp); 7539 7540 isactive = nvp->v_count > 1; 7541 } else { 7542 mutex_enter(&rp->r_os_lock); 7543 isactive = list_head(&rp->r_open_streams) != NULL; 7544 mutex_exit(&rp->r_os_lock); 7545 } 7546 7547 /* 7548 * If the vnode is active and is not a directory, 7549 * arrange to rename it to a 7550 * temporary file so that it will continue to be 7551 * accessible. This implements the "unlink-open-file" 7552 * semantics for the target of a rename operation. 7553 * Before doing this though, make sure that the 7554 * source and target files are not already the same. 7555 */ 7556 if (isactive && nvp->v_type != VDIR) { 7557 /* 7558 * Lookup the source name. 7559 */ 7560 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7561 7562 /* 7563 * The source name *should* already exist. 7564 */ 7565 if (error) { 7566 VN_RELE(nvp); 7567 nfs_rw_exit(&odrp->r_rwlock); 7568 nfs_rw_exit(&ndrp->r_rwlock); 7569 return (error); 7570 } 7571 7572 ASSERT(nfs4_consistent_type(ovp)); 7573 7574 /* 7575 * Compare the two vnodes. If they are the same, 7576 * just release all held vnodes and return success. 7577 */ 7578 if (VN_CMP(ovp, nvp)) { 7579 VN_RELE(ovp); 7580 VN_RELE(nvp); 7581 nfs_rw_exit(&odrp->r_rwlock); 7582 nfs_rw_exit(&ndrp->r_rwlock); 7583 return (0); 7584 } 7585 7586 /* 7587 * Can't mix and match directories and non- 7588 * directories in rename operations. We already 7589 * know that the target is not a directory. If 7590 * the source is a directory, return an error. 7591 */ 7592 if (ovp->v_type == VDIR) { 7593 VN_RELE(ovp); 7594 VN_RELE(nvp); 7595 nfs_rw_exit(&odrp->r_rwlock); 7596 nfs_rw_exit(&ndrp->r_rwlock); 7597 return (ENOTDIR); 7598 } 7599 link_call: 7600 /* 7601 * The target file exists, is not the same as 7602 * the source file, and is active. 
We first 7603 * try to Link it to a temporary filename to 7604 * avoid having the server removing the file 7605 * completely (which could cause data loss to 7606 * the user's POV in the event the Rename fails 7607 * -- see bug 1165874). 7608 */ 7609 /* 7610 * The do_link and did_link booleans are 7611 * introduced in the event we get NFS4ERR_FILE_OPEN 7612 * returned for the Rename. Some servers can 7613 * not Rename over an Open file, so they return 7614 * this error. The client needs to Remove the 7615 * newly created Link and do two Renames, just 7616 * as if the server didn't support LINK. 7617 */ 7618 tmpname = newname(); 7619 error = 0; 7620 7621 if (do_link) { 7622 error = nfs4_link(ndvp, nvp, tmpname, cr); 7623 } 7624 if (error == EOPNOTSUPP || !do_link) { 7625 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7626 cr); 7627 did_link = 0; 7628 } else { 7629 did_link = 1; 7630 } 7631 if (error) { 7632 kmem_free(tmpname, MAXNAMELEN); 7633 VN_RELE(ovp); 7634 VN_RELE(nvp); 7635 nfs_rw_exit(&odrp->r_rwlock); 7636 nfs_rw_exit(&ndrp->r_rwlock); 7637 return (error); 7638 } 7639 7640 mutex_enter(&rp->r_statelock); 7641 if (rp->r_unldvp == NULL) { 7642 VN_HOLD(ndvp); 7643 rp->r_unldvp = ndvp; 7644 if (rp->r_unlcred != NULL) 7645 crfree(rp->r_unlcred); 7646 crhold(cr); 7647 rp->r_unlcred = cr; 7648 rp->r_unlname = tmpname; 7649 } else { 7650 if (rp->r_unlname) 7651 kmem_free(rp->r_unlname, MAXNAMELEN); 7652 rp->r_unlname = tmpname; 7653 } 7654 mutex_exit(&rp->r_statelock); 7655 } 7656 7657 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7658 7659 ASSERT(nfs4_consistent_type(nvp)); 7660 VN_RELE(nvp); 7661 } 7662 7663 if (ovp == NULL) { 7664 /* 7665 * When renaming directories to be a subdirectory of a 7666 * different parent, the dnlc entry for ".." will no 7667 * longer be valid, so it must be removed. 7668 * 7669 * We do a lookup here to determine whether we are renaming 7670 * a directory and we need to check if we are renaming 7671 * an unlinked file. 
This might have already been done 7672 * in previous code, so we check ovp == NULL to avoid 7673 * doing it twice. 7674 */ 7675 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7676 /* 7677 * The source name *should* already exist. 7678 */ 7679 if (error) { 7680 nfs_rw_exit(&odrp->r_rwlock); 7681 nfs_rw_exit(&ndrp->r_rwlock); 7682 return (error); 7683 } 7684 ASSERT(ovp != NULL); 7685 ASSERT(nfs4_consistent_type(ovp)); 7686 } 7687 7688 /* 7689 * Is the object being renamed a dir, and if so, is 7690 * it being renamed to a child of itself? The underlying 7691 * fs should ultimately return EINVAL for this case; 7692 * however, buggy beta non-Solaris NFSv4 servers at 7693 * interop testing events have allowed this behavior, 7694 * and it caused our client to panic due to a recursive 7695 * mutex_enter in fn_move. 7696 * 7697 * The tedious locking in fn_move could be changed to 7698 * deal with this case, and the client could avoid the 7699 * panic; however, the client would just confuse itself 7700 * later and misbehave. A better way to handle the broken 7701 * server is to detect this condition and return EINVAL 7702 * without ever sending the the bogus rename to the server. 7703 * We know the rename is invalid -- just fail it now. 7704 */ 7705 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7706 VN_RELE(ovp); 7707 nfs_rw_exit(&odrp->r_rwlock); 7708 nfs_rw_exit(&ndrp->r_rwlock); 7709 return (EINVAL); 7710 } 7711 7712 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7713 7714 /* 7715 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7716 * possible for the filehandle to change due to the rename. 7717 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7718 * the fh will not change because of the rename, but we still need 7719 * to update its rnode entry with the new name for 7720 * an eventual fh change due to migration. 
The FH4_NOEXPIRE_ON_OPEN 7721 * has no effect on these for now, but for future improvements, 7722 * we might want to use it too to simplify handling of files 7723 * that are open with that flag on. (XXX) 7724 */ 7725 mi = VTOMI4(odvp); 7726 if (NFS4_VOLATILE_FH(mi)) { 7727 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7728 &stat); 7729 } else { 7730 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7731 &stat); 7732 } 7733 ASSERT(nfs4_consistent_type(odvp)); 7734 ASSERT(nfs4_consistent_type(ndvp)); 7735 ASSERT(nfs4_consistent_type(ovp)); 7736 7737 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7738 do_link = 0; 7739 /* 7740 * Before the 'link_call' code, we did a nfs4_lookup 7741 * that puts a VN_HOLD on nvp. After the nfs4_link 7742 * call we call VN_RELE to match that hold. We need 7743 * to place an additional VN_HOLD here since we will 7744 * be hitting that VN_RELE again. 7745 */ 7746 VN_HOLD(nvp); 7747 7748 (void) nfs4_remove(ndvp, tmpname, cr); 7749 7750 /* Undo the unlinked file naming stuff we just did */ 7751 mutex_enter(&rp->r_statelock); 7752 if (rp->r_unldvp) { 7753 VN_RELE(ndvp); 7754 rp->r_unldvp = NULL; 7755 if (rp->r_unlcred != NULL) 7756 crfree(rp->r_unlcred); 7757 rp->r_unlcred = NULL; 7758 /* rp->r_unlanme points to tmpname */ 7759 if (rp->r_unlname) 7760 kmem_free(rp->r_unlname, MAXNAMELEN); 7761 rp->r_unlname = NULL; 7762 } 7763 mutex_exit(&rp->r_statelock); 7764 7765 goto link_call; 7766 } 7767 7768 if (error) { 7769 VN_RELE(ovp); 7770 nfs_rw_exit(&odrp->r_rwlock); 7771 nfs_rw_exit(&ndrp->r_rwlock); 7772 return (error); 7773 } 7774 7775 /* 7776 * when renaming directories to be a subdirectory of a 7777 * different parent, the dnlc entry for ".." 
will no 7778 * longer be valid, so it must be removed 7779 */ 7780 rp = VTOR4(ovp); 7781 if (ndvp != odvp) { 7782 if (ovp->v_type == VDIR) { 7783 dnlc_remove(ovp, ".."); 7784 if (rp->r_dir != NULL) 7785 nfs4_purge_rddir_cache(ovp); 7786 } 7787 } 7788 7789 /* 7790 * If we are renaming the unlinked file, update the 7791 * r_unldvp and r_unlname as needed. 7792 */ 7793 mutex_enter(&rp->r_statelock); 7794 if (rp->r_unldvp != NULL) { 7795 if (strcmp(rp->r_unlname, onm) == 0) { 7796 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 7797 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 7798 if (ndvp != rp->r_unldvp) { 7799 VN_RELE(rp->r_unldvp); 7800 rp->r_unldvp = ndvp; 7801 VN_HOLD(ndvp); 7802 } 7803 } 7804 } 7805 mutex_exit(&rp->r_statelock); 7806 7807 VN_RELE(ovp); 7808 7809 nfs_rw_exit(&odrp->r_rwlock); 7810 nfs_rw_exit(&ndrp->r_rwlock); 7811 7812 return (error); 7813 } 7814 7815 /* 7816 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 7817 * when it is known that the filehandle is persistent through rename. 7818 * 7819 * Rename requires that the current fh be the target directory and the 7820 * saved fh be the source directory. After the operation, the current fh 7821 * is unchanged. 7822 * The compound op structure for persistent fh rename is: 7823 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 7824 * Rather than bother with the directory postop args, we'll simply 7825 * update that a change occured in the cache, so no post-op getattrs. 
7826 */ 7827 static int 7828 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 7829 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 7830 { 7831 COMPOUND4args_clnt args; 7832 COMPOUND4res_clnt res, *resp = NULL; 7833 nfs_argop4 *argop; 7834 nfs_resop4 *resop; 7835 int doqueue, argoplist_size; 7836 mntinfo4_t *mi; 7837 rnode4_t *odrp = VTOR4(odvp); 7838 rnode4_t *ndrp = VTOR4(ndvp); 7839 RENAME4res *rn_res; 7840 bool_t needrecov; 7841 nfs4_recov_state_t recov_state; 7842 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7843 dirattr_info_t dinfo, *dinfop; 7844 7845 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7846 7847 recov_state.rs_flags = 0; 7848 recov_state.rs_num_retry_despite_err = 0; 7849 7850 /* 7851 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 7852 * 7853 * If source/target are different dirs, then append putfh(src); getattr 7854 */ 7855 args.array_len = (odvp == ndvp) ? 5 : 7; 7856 argoplist_size = args.array_len * sizeof (nfs_argop4); 7857 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 7858 7859 recov_retry: 7860 *statp = NFS4_OK; 7861 7862 /* No need to Lookup the file, persistent fh */ 7863 args.ctag = TAG_RENAME; 7864 7865 mi = VTOMI4(odvp); 7866 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 7867 if (e.error) { 7868 kmem_free(argop, argoplist_size); 7869 return (e.error); 7870 } 7871 7872 /* 0: putfh source directory */ 7873 argop[0].argop = OP_CPUTFH; 7874 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 7875 7876 /* 1: Save source fh to free up current for target */ 7877 argop[1].argop = OP_SAVEFH; 7878 7879 /* 2: putfh targetdir */ 7880 argop[2].argop = OP_CPUTFH; 7881 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7882 7883 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 7884 argop[3].argop = OP_CRENAME; 7885 argop[3].nfs_argop4_u.opcrename.coldname = onm; 7886 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 7887 7888 /* 4: getattr (targetdir) */ 7889 argop[4].argop = 
OP_GETATTR; 7890 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7891 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7892 7893 if (ndvp != odvp) { 7894 7895 /* 5: putfh (sourcedir) */ 7896 argop[5].argop = OP_CPUTFH; 7897 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7898 7899 /* 6: getattr (sourcedir) */ 7900 argop[6].argop = OP_GETATTR; 7901 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7902 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7903 } 7904 7905 dnlc_remove(odvp, onm); 7906 dnlc_remove(ndvp, nnm); 7907 7908 doqueue = 1; 7909 dinfo.di_time_call = gethrtime(); 7910 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7911 7912 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7913 if (e.error) { 7914 PURGE_ATTRCACHE4(odvp); 7915 PURGE_ATTRCACHE4(ndvp); 7916 } else { 7917 *statp = res.status; 7918 } 7919 7920 if (needrecov) { 7921 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 7922 OP_RENAME, NULL) == FALSE) { 7923 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 7924 if (!e.error) 7925 (void) xdr_free(xdr_COMPOUND4res_clnt, 7926 (caddr_t)&res); 7927 goto recov_retry; 7928 } 7929 } 7930 7931 if (!e.error) { 7932 resp = &res; 7933 /* 7934 * as long as OP_RENAME 7935 */ 7936 if (res.status != NFS4_OK && res.array_len <= 4) { 7937 e.error = geterrno4(res.status); 7938 PURGE_ATTRCACHE4(odvp); 7939 PURGE_ATTRCACHE4(ndvp); 7940 /* 7941 * System V defines rename to return EEXIST, not 7942 * ENOTEMPTY if the target directory is not empty. 7943 * Over the wire, the error is NFSERR_ENOTEMPTY 7944 * which geterrno4 maps to ENOTEMPTY. 7945 */ 7946 if (e.error == ENOTEMPTY) 7947 e.error = EEXIST; 7948 } else { 7949 7950 resop = &res.array[3]; /* rename res */ 7951 rn_res = &resop->nfs_resop4_u.oprename; 7952 7953 if (res.status == NFS4_OK) { 7954 /* 7955 * Update target attribute, readdir and dnlc 7956 * caches. 
7957 */ 7958 dinfo.di_garp = 7959 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7960 dinfo.di_cred = cr; 7961 dinfop = &dinfo; 7962 } else 7963 dinfop = NULL; 7964 7965 nfs4_update_dircaches(&rn_res->target_cinfo, 7966 ndvp, NULL, NULL, dinfop); 7967 7968 /* 7969 * Update source attribute, readdir and dnlc caches 7970 * 7971 */ 7972 if (ndvp != odvp) { 7973 if (dinfop) 7974 dinfo.di_garp = 7975 &(res.array[6].nfs_resop4_u. 7976 opgetattr.ga_res); 7977 7978 nfs4_update_dircaches(&rn_res->source_cinfo, 7979 odvp, NULL, NULL, dinfop); 7980 } 7981 7982 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 7983 nnm); 7984 } 7985 } 7986 7987 if (resp) 7988 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7989 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 7990 kmem_free(argop, argoplist_size); 7991 7992 return (e.error); 7993 } 7994 7995 /* 7996 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 7997 * it is possible for the filehandle to change due to the rename. 7998 * 7999 * The compound req in this case includes a post-rename lookup and getattr 8000 * to ensure that we have the correct fh and attributes for the object. 8001 * 8002 * Rename requires that the current fh be the target directory and the 8003 * saved fh be the source directory. After the operation, the current fh 8004 * is unchanged. 8005 * 8006 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8007 * update the filehandle for the renamed object. We also get the old 8008 * filehandle for historical reasons; this should be taken out sometime. 8009 * This results in a rather cumbersome compound... 
8010 * 8011 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8012 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8013 * 8014 */ 8015 static int 8016 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8017 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8018 { 8019 COMPOUND4args_clnt args; 8020 COMPOUND4res_clnt res, *resp = NULL; 8021 int argoplist_size; 8022 nfs_argop4 *argop; 8023 nfs_resop4 *resop; 8024 int doqueue; 8025 mntinfo4_t *mi; 8026 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8027 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8028 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8029 RENAME4res *rn_res; 8030 GETFH4res *ngf_res; 8031 bool_t needrecov; 8032 nfs4_recov_state_t recov_state; 8033 hrtime_t t; 8034 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8035 dirattr_info_t dinfo, *dinfop = &dinfo; 8036 8037 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8038 8039 recov_state.rs_flags = 0; 8040 recov_state.rs_num_retry_despite_err = 0; 8041 8042 recov_retry: 8043 *statp = NFS4_OK; 8044 8045 /* 8046 * There is a window between the RPC and updating the path and 8047 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8048 * code, so that it doesn't try to use the old path during that 8049 * window. 8050 */ 8051 mutex_enter(&orp->r_statelock); 8052 while (orp->r_flags & R4RECEXPFH) { 8053 klwp_t *lwp = ttolwp(curthread); 8054 8055 if (lwp != NULL) 8056 lwp->lwp_nostop++; 8057 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8058 mutex_exit(&orp->r_statelock); 8059 if (lwp != NULL) 8060 lwp->lwp_nostop--; 8061 return (EINTR); 8062 } 8063 if (lwp != NULL) 8064 lwp->lwp_nostop--; 8065 } 8066 orp->r_flags |= R4RECEXPFH; 8067 mutex_exit(&orp->r_statelock); 8068 8069 mi = VTOMI4(odvp); 8070 8071 args.ctag = TAG_RENAME_VFH; 8072 args.array_len = (odvp == ndvp) ? 
10 : 12; 8073 argoplist_size = args.array_len * sizeof (nfs_argop4); 8074 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8075 8076 /* 8077 * Rename ops: 8078 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8079 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8080 * LOOKUP(trgt), GETFH(new), GETATTR, 8081 * 8082 * if (odvp != ndvp) 8083 * add putfh(sourcedir), getattr(sourcedir) } 8084 */ 8085 args.array = argop; 8086 8087 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8088 &recov_state, NULL); 8089 if (e.error) { 8090 kmem_free(argop, argoplist_size); 8091 mutex_enter(&orp->r_statelock); 8092 orp->r_flags &= ~R4RECEXPFH; 8093 cv_broadcast(&orp->r_cv); 8094 mutex_exit(&orp->r_statelock); 8095 return (e.error); 8096 } 8097 8098 /* 0: putfh source directory */ 8099 argop[0].argop = OP_CPUTFH; 8100 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8101 8102 /* 1: Save source fh to free up current for target */ 8103 argop[1].argop = OP_SAVEFH; 8104 8105 /* 2: Lookup pre-rename fh of renamed object */ 8106 argop[2].argop = OP_CLOOKUP; 8107 argop[2].nfs_argop4_u.opclookup.cname = onm; 8108 8109 /* 3: getfh fh of renamed object (before rename) */ 8110 argop[3].argop = OP_GETFH; 8111 8112 /* 4: putfh targetdir */ 8113 argop[4].argop = OP_CPUTFH; 8114 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8115 8116 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8117 argop[5].argop = OP_CRENAME; 8118 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8119 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8120 8121 /* 6: getattr of target dir (post op attrs) */ 8122 argop[6].argop = OP_GETATTR; 8123 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8124 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8125 8126 /* 7: Lookup post-rename fh of renamed object */ 8127 argop[7].argop = OP_CLOOKUP; 8128 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8129 8130 /* 8: getfh fh of renamed object (after rename) */ 8131 argop[8].argop = OP_GETFH; 8132 8133 /* 9: getattr of 
renamed object */ 8134 argop[9].argop = OP_GETATTR; 8135 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8136 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8137 8138 /* 8139 * If source/target dirs are different, then get new post-op 8140 * attrs for source dir also. 8141 */ 8142 if (ndvp != odvp) { 8143 /* 10: putfh (sourcedir) */ 8144 argop[10].argop = OP_CPUTFH; 8145 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8146 8147 /* 11: getattr (sourcedir) */ 8148 argop[11].argop = OP_GETATTR; 8149 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8150 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8151 } 8152 8153 dnlc_remove(odvp, onm); 8154 dnlc_remove(ndvp, nnm); 8155 8156 doqueue = 1; 8157 t = gethrtime(); 8158 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8159 8160 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8161 if (e.error) { 8162 PURGE_ATTRCACHE4(odvp); 8163 PURGE_ATTRCACHE4(ndvp); 8164 if (!needrecov) { 8165 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8166 &recov_state, needrecov); 8167 goto out; 8168 } 8169 } else { 8170 *statp = res.status; 8171 } 8172 8173 if (needrecov) { 8174 bool_t abort; 8175 8176 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8177 OP_RENAME, NULL); 8178 if (abort == FALSE) { 8179 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8180 &recov_state, needrecov); 8181 kmem_free(argop, argoplist_size); 8182 if (!e.error) 8183 (void) xdr_free(xdr_COMPOUND4res_clnt, 8184 (caddr_t)&res); 8185 mutex_enter(&orp->r_statelock); 8186 orp->r_flags &= ~R4RECEXPFH; 8187 cv_broadcast(&orp->r_cv); 8188 mutex_exit(&orp->r_statelock); 8189 goto recov_retry; 8190 } else { 8191 if (e.error != 0) { 8192 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8193 &recov_state, needrecov); 8194 goto out; 8195 } 8196 /* fall through for res.status case */ 8197 } 8198 } 8199 8200 resp = &res; 8201 /* 8202 * If OP_RENAME (or any prev op) failed, then return an error. 
8203 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8204 */ 8205 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8206 /* 8207 * Error in an op other than last Getattr 8208 */ 8209 e.error = geterrno4(res.status); 8210 PURGE_ATTRCACHE4(odvp); 8211 PURGE_ATTRCACHE4(ndvp); 8212 /* 8213 * System V defines rename to return EEXIST, not 8214 * ENOTEMPTY if the target directory is not empty. 8215 * Over the wire, the error is NFSERR_ENOTEMPTY 8216 * which geterrno4 maps to ENOTEMPTY. 8217 */ 8218 if (e.error == ENOTEMPTY) 8219 e.error = EEXIST; 8220 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8221 needrecov); 8222 goto out; 8223 } 8224 8225 /* rename results */ 8226 rn_res = &res.array[5].nfs_resop4_u.oprename; 8227 8228 if (res.status == NFS4_OK) { 8229 /* Update target attribute, readdir and dnlc caches */ 8230 dinfo.di_garp = 8231 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8232 dinfo.di_cred = cr; 8233 dinfo.di_time_call = t; 8234 } else 8235 dinfop = NULL; 8236 8237 /* Update source cache attribute, readdir and dnlc caches */ 8238 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8239 8240 /* Update source cache attribute, readdir and dnlc caches */ 8241 if (ndvp != odvp) { 8242 8243 /* 8244 * If dinfop is non-NULL, then compound succeded, so 8245 * set di_garp to attrs for source dir. dinfop is only 8246 * set to NULL when compound fails. 8247 */ 8248 if (dinfop) 8249 dinfo.di_garp = 8250 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8251 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8252 dinfop); 8253 } 8254 8255 /* 8256 * Update the rnode with the new component name and args, 8257 * and if the file handle changed, also update it with the new fh. 8258 * This is only necessary if the target object has an rnode 8259 * entry and there is no need to create one for it. 
8260 */ 8261 resop = &res.array[8]; /* getfh new res */ 8262 ngf_res = &resop->nfs_resop4_u.opgetfh; 8263 8264 /* 8265 * Update the path and filehandle for the renamed object. 8266 */ 8267 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8268 8269 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8270 8271 if (res.status == NFS4_OK) { 8272 resop++; /* getattr res */ 8273 e.error = nfs4_update_attrcache(res.status, 8274 &resop->nfs_resop4_u.opgetattr.ga_res, 8275 t, ovp, cr); 8276 } 8277 8278 out: 8279 kmem_free(argop, argoplist_size); 8280 if (resp) 8281 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8282 mutex_enter(&orp->r_statelock); 8283 orp->r_flags &= ~R4RECEXPFH; 8284 cv_broadcast(&orp->r_cv); 8285 mutex_exit(&orp->r_statelock); 8286 8287 return (e.error); 8288 } 8289 8290 static int 8291 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 8292 { 8293 int error; 8294 vnode_t *vp; 8295 8296 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8297 return (EPERM); 8298 /* 8299 * As ".." has special meaning and rather than send a mkdir 8300 * over the wire to just let the server freak out, we just 8301 * short circuit it here and return EEXIST 8302 */ 8303 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8304 return (EEXIST); 8305 8306 /* 8307 * Decision to get the right gid and setgid bit of the 8308 * new directory is now made in call_nfs4_create_req. 8309 */ 8310 va->va_mask |= AT_MODE; 8311 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8312 if (error) 8313 return (error); 8314 8315 *vpp = vp; 8316 return (0); 8317 } 8318 8319 8320 /* 8321 * rmdir is using the same remove v4 op as does remove. 8322 * Remove requires that the current fh be the target directory. 8323 * After the operation, the current fh is unchanged. 
8324 * The compound op structure is: 8325 * PUTFH(targetdir), REMOVE 8326 */ 8327 static int 8328 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 8329 { 8330 int need_end_op = FALSE; 8331 COMPOUND4args_clnt args; 8332 COMPOUND4res_clnt res, *resp = NULL; 8333 REMOVE4res *rm_res; 8334 nfs_argop4 argop[3]; 8335 nfs_resop4 *resop; 8336 vnode_t *vp; 8337 int doqueue; 8338 mntinfo4_t *mi; 8339 rnode4_t *drp; 8340 bool_t needrecov = FALSE; 8341 nfs4_recov_state_t recov_state; 8342 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8343 dirattr_info_t dinfo, *dinfop; 8344 8345 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8346 return (EPERM); 8347 /* 8348 * As ".." has special meaning and rather than send a rmdir 8349 * over the wire to just let the server freak out, we just 8350 * short circuit it here and return EEXIST 8351 */ 8352 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8353 return (EEXIST); 8354 8355 drp = VTOR4(dvp); 8356 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8357 return (EINTR); 8358 8359 /* 8360 * Attempt to prevent a rmdir(".") from succeeding. 8361 */ 8362 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8363 if (e.error) { 8364 nfs_rw_exit(&drp->r_rwlock); 8365 return (e.error); 8366 } 8367 if (vp == cdir) { 8368 VN_RELE(vp); 8369 nfs_rw_exit(&drp->r_rwlock); 8370 return (EINVAL); 8371 } 8372 8373 /* 8374 * Since nfsv4 remove op works on both files and directories, 8375 * check that the removed object is indeed a directory. 8376 */ 8377 if (vp->v_type != VDIR) { 8378 VN_RELE(vp); 8379 nfs_rw_exit(&drp->r_rwlock); 8380 return (ENOTDIR); 8381 } 8382 8383 /* 8384 * First just remove the entry from the name cache, as it 8385 * is most likely an entry for this vp. 8386 */ 8387 dnlc_remove(dvp, nm); 8388 8389 /* 8390 * If there vnode reference count is greater than one, then 8391 * there may be additional references in the DNLC which will 8392 * need to be purged. 
First, trying removing the entry for 8393 * the parent directory and see if that removes the additional 8394 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8395 * to completely remove any references to the directory which 8396 * might still exist in the DNLC. 8397 */ 8398 if (vp->v_count > 1) { 8399 dnlc_remove(vp, ".."); 8400 if (vp->v_count > 1) 8401 dnlc_purge_vp(vp); 8402 } 8403 8404 mi = VTOMI4(dvp); 8405 recov_state.rs_flags = 0; 8406 recov_state.rs_num_retry_despite_err = 0; 8407 8408 recov_retry: 8409 args.ctag = TAG_RMDIR; 8410 8411 /* 8412 * Rmdir ops: putfh dir; remove 8413 */ 8414 args.array_len = 3; 8415 args.array = argop; 8416 8417 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8418 if (e.error) { 8419 nfs_rw_exit(&drp->r_rwlock); 8420 return (e.error); 8421 } 8422 need_end_op = TRUE; 8423 8424 /* putfh directory */ 8425 argop[0].argop = OP_CPUTFH; 8426 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8427 8428 /* remove */ 8429 argop[1].argop = OP_CREMOVE; 8430 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8431 8432 /* getattr (postop attrs for dir that contained removed dir) */ 8433 argop[2].argop = OP_GETATTR; 8434 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8435 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8436 8437 dinfo.di_time_call = gethrtime(); 8438 doqueue = 1; 8439 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8440 8441 PURGE_ATTRCACHE4(vp); 8442 8443 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8444 if (e.error) { 8445 PURGE_ATTRCACHE4(dvp); 8446 } 8447 8448 if (needrecov) { 8449 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8450 NULL, OP_REMOVE, NULL) == FALSE) { 8451 if (!e.error) 8452 (void) xdr_free(xdr_COMPOUND4res_clnt, 8453 (caddr_t)&res); 8454 8455 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8456 needrecov); 8457 need_end_op = FALSE; 8458 goto recov_retry; 8459 } 8460 } 8461 8462 if (!e.error) { 8463 resp = &res; 8464 8465 /* 8466 * Only return error if 
first 2 ops (OP_REMOVE or earlier) 8467 * failed. 8468 */ 8469 if (res.status != NFS4_OK && res.array_len <= 2) { 8470 e.error = geterrno4(res.status); 8471 PURGE_ATTRCACHE4(dvp); 8472 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8473 &recov_state, needrecov); 8474 need_end_op = FALSE; 8475 nfs4_purge_stale_fh(e.error, dvp, cr); 8476 /* 8477 * System V defines rmdir to return EEXIST, not 8478 * ENOTEMPTY if the directory is not empty. Over 8479 * the wire, the error is NFSERR_ENOTEMPTY which 8480 * geterrno4 maps to ENOTEMPTY. 8481 */ 8482 if (e.error == ENOTEMPTY) 8483 e.error = EEXIST; 8484 } else { 8485 resop = &res.array[1]; /* remove res */ 8486 rm_res = &resop->nfs_resop4_u.opremove; 8487 8488 if (res.status == NFS4_OK) { 8489 resop = &res.array[2]; /* dir attrs */ 8490 dinfo.di_garp = 8491 &resop->nfs_resop4_u.opgetattr.ga_res; 8492 dinfo.di_cred = cr; 8493 dinfop = &dinfo; 8494 } else 8495 dinfop = NULL; 8496 8497 /* Update dir attribute, readdir and dnlc caches */ 8498 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8499 dinfop); 8500 8501 /* destroy rddir cache for dir that was removed */ 8502 if (VTOR4(vp)->r_dir != NULL) 8503 nfs4_purge_rddir_cache(vp); 8504 } 8505 } 8506 8507 if (need_end_op) 8508 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8509 8510 nfs_rw_exit(&drp->r_rwlock); 8511 8512 if (resp) 8513 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8514 8515 VN_RELE(vp); 8516 8517 return (e.error); 8518 } 8519 8520 static int 8521 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 8522 { 8523 int error; 8524 vnode_t *vp; 8525 rnode4_t *rp; 8526 char *contents; 8527 mntinfo4_t *mi = VTOMI4(dvp); 8528 8529 if (nfs_zone() != mi->mi_zone) 8530 return (EPERM); 8531 if (!(mi->mi_flags & MI4_SYMLINK)) 8532 return (EOPNOTSUPP); 8533 8534 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8535 if (error) { 8536 return (error); 8537 } 8538 8539 ASSERT(nfs4_consistent_type(vp)); 8540 rp = 
VTOR4(vp); 8541 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8542 8543 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8544 8545 if (contents != NULL) { 8546 mutex_enter(&rp->r_statelock); 8547 if (rp->r_symlink.contents == NULL) { 8548 rp->r_symlink.len = strlen(tnm); 8549 bcopy(tnm, contents, rp->r_symlink.len); 8550 rp->r_symlink.contents = contents; 8551 rp->r_symlink.size = MAXPATHLEN; 8552 mutex_exit(&rp->r_statelock); 8553 } else { 8554 mutex_exit(&rp->r_statelock); 8555 kmem_free((void *)contents, MAXPATHLEN); 8556 } 8557 } 8558 } 8559 VN_RELE(vp); 8560 8561 return (error); 8562 } 8563 8564 8565 /* 8566 * Read directory entries. 8567 * There are some weird things to look out for here. The uio_loffset 8568 * field is either 0 or it is the offset returned from a previous 8569 * readdir. It is an opaque value used by the server to find the 8570 * correct directory block to read. The count field is the number 8571 * of blocks to read on the server. This is advisory only, the server 8572 * may return only one block's worth of entries. Entries may be compressed 8573 * on the server. 8574 */ 8575 static int 8576 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 8577 { 8578 int error; 8579 uint_t count; 8580 rnode4_t *rp; 8581 rddir4_cache *rdc; 8582 rddir4_cache *rrdc; 8583 8584 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8585 return (EIO); 8586 rp = VTOR4(vp); 8587 8588 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8589 8590 /* 8591 * Make sure that the directory cache is valid. 8592 */ 8593 if (rp->r_dir != NULL) { 8594 if (nfs_disable_rddir_cache != 0) { 8595 /* 8596 * Setting nfs_disable_rddir_cache in /etc/system 8597 * allows interoperability with servers that do not 8598 * properly update the attributes of directories. 8599 * Any cached information gets purged before an 8600 * access is made to it. 
8601 */ 8602 nfs4_purge_rddir_cache(vp); 8603 } 8604 8605 error = nfs4_validate_caches(vp, cr); 8606 if (error) 8607 return (error); 8608 } 8609 8610 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8611 8612 /* 8613 * Short circuit last readdir which always returns 0 bytes. 8614 * This can be done after the directory has been read through 8615 * completely at least once. This will set r_direof which 8616 * can be used to find the value of the last cookie. 8617 */ 8618 mutex_enter(&rp->r_statelock); 8619 if (rp->r_direof != NULL && 8620 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8621 mutex_exit(&rp->r_statelock); 8622 #ifdef DEBUG 8623 nfs4_readdir_cache_shorts++; 8624 #endif 8625 if (eofp) 8626 *eofp = 1; 8627 return (0); 8628 } 8629 8630 /* 8631 * Look for a cache entry. Cache entries are identified 8632 * by the NFS cookie value and the byte count requested. 8633 */ 8634 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8635 8636 /* 8637 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8638 */ 8639 if (rdc == NULL) { 8640 mutex_exit(&rp->r_statelock); 8641 return (EINTR); 8642 } 8643 8644 /* 8645 * Check to see if we need to fill this entry in. 8646 */ 8647 if (rdc->flags & RDDIRREQ) { 8648 rdc->flags &= ~RDDIRREQ; 8649 rdc->flags |= RDDIR; 8650 mutex_exit(&rp->r_statelock); 8651 8652 /* 8653 * Do the readdir. 8654 */ 8655 nfs4readdir(vp, rdc, cr); 8656 8657 /* 8658 * Reaquire the lock, so that we can continue 8659 */ 8660 mutex_enter(&rp->r_statelock); 8661 /* 8662 * The entry is now complete 8663 */ 8664 rdc->flags &= ~RDDIR; 8665 } 8666 8667 ASSERT(!(rdc->flags & RDDIR)); 8668 8669 /* 8670 * If an error occurred while attempting 8671 * to fill the cache entry, mark the entry invalid and 8672 * just return the error. 
8673 */ 8674 if (rdc->error) { 8675 error = rdc->error; 8676 rdc->flags |= RDDIRREQ; 8677 rddir4_cache_rele(rp, rdc); 8678 mutex_exit(&rp->r_statelock); 8679 return (error); 8680 } 8681 8682 /* 8683 * The cache entry is complete and good, 8684 * copyout the dirent structs to the calling 8685 * thread. 8686 */ 8687 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 8688 8689 /* 8690 * If no error occurred during the copyout, 8691 * update the offset in the uio struct to 8692 * contain the value of the next NFS 4 cookie 8693 * and set the eof value appropriately. 8694 */ 8695 if (!error) { 8696 uiop->uio_loffset = rdc->nfs4_ncookie; 8697 if (eofp) 8698 *eofp = rdc->eof; 8699 } 8700 8701 /* 8702 * Decide whether to do readahead. Don't if we 8703 * have already read to the end of directory. 8704 */ 8705 if (rdc->eof) { 8706 /* 8707 * Make the entry the direof only if it is cached 8708 */ 8709 if (rdc->flags & RDDIRCACHED) 8710 rp->r_direof = rdc; 8711 rddir4_cache_rele(rp, rdc); 8712 mutex_exit(&rp->r_statelock); 8713 return (error); 8714 } 8715 8716 /* Determine if a readdir readahead should be done */ 8717 if (!(rp->r_flags & R4LOOKUP)) { 8718 rddir4_cache_rele(rp, rdc); 8719 mutex_exit(&rp->r_statelock); 8720 return (error); 8721 } 8722 8723 /* 8724 * Now look for a readahead entry. 8725 * 8726 * Check to see whether we found an entry for the readahead. 8727 * If so, we don't need to do anything further, so free the new 8728 * entry if one was allocated. Otherwise, allocate a new entry, add 8729 * it to the cache, and then initiate an asynchronous readdir 8730 * operation to fill it. 8731 */ 8732 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 8733 8734 /* 8735 * A readdir cache entry could not be obtained for the readahead. In 8736 * this case we skip the readahead and return. 
8737 */ 8738 if (rrdc == NULL) { 8739 rddir4_cache_rele(rp, rdc); 8740 mutex_exit(&rp->r_statelock); 8741 return (error); 8742 } 8743 8744 /* 8745 * Check to see if we need to fill this entry in. 8746 */ 8747 if (rrdc->flags & RDDIRREQ) { 8748 rrdc->flags &= ~RDDIRREQ; 8749 rrdc->flags |= RDDIR; 8750 rddir4_cache_rele(rp, rdc); 8751 mutex_exit(&rp->r_statelock); 8752 #ifdef DEBUG 8753 nfs4_readdir_readahead++; 8754 #endif 8755 /* 8756 * Do the readdir. 8757 */ 8758 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 8759 return (error); 8760 } 8761 8762 rddir4_cache_rele(rp, rrdc); 8763 rddir4_cache_rele(rp, rdc); 8764 mutex_exit(&rp->r_statelock); 8765 return (error); 8766 } 8767 8768 static int 8769 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8770 { 8771 int error; 8772 rnode4_t *rp; 8773 8774 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 8775 8776 rp = VTOR4(vp); 8777 8778 /* 8779 * Obtain the readdir results for the caller. 8780 */ 8781 nfs4readdir(vp, rdc, cr); 8782 8783 mutex_enter(&rp->r_statelock); 8784 /* 8785 * The entry is now complete 8786 */ 8787 rdc->flags &= ~RDDIR; 8788 8789 error = rdc->error; 8790 if (error) 8791 rdc->flags |= RDDIRREQ; 8792 rddir4_cache_rele(rp, rdc); 8793 mutex_exit(&rp->r_statelock); 8794 8795 return (error); 8796 } 8797 8798 static void 8799 nfs4readdir_stub(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8800 { 8801 int stublength; 8802 dirent64_t *dp; 8803 u_longlong_t nodeid, pnodeid; 8804 vnode_t *dotdotvp = NULL; 8805 rnode4_t *rp = VTOR4(vp); 8806 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8807 8808 rdc->error = 0; 8809 rdc->entries = 0; 8810 rdc->actlen = rdc->entlen = 0; 8811 rdc->eof = TRUE; 8812 8813 /* Check for EOF case for readdir of stub */ 8814 if (cookie != 0 && cookie != 1) 8815 return; 8816 8817 nodeid = rp->r_attr.va_nodeid; 8818 if (vp->v_flag & VROOT) { 8819 pnodeid = nodeid; /* root of mount point */ 8820 } else { 8821 if (rdc->error = nfs4_lookup(vp, "..", &dotdotvp, 0, 0, 0, cr)) 8822 
return; 8823 pnodeid = VTOR4(dotdotvp)->r_attr.va_nodeid; 8824 VN_RELE(dotdotvp); 8825 } 8826 8827 stublength = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8828 rdc->entries = kmem_alloc(stublength, KM_SLEEP); 8829 rdc->entlen = rdc->buflen = stublength; 8830 rdc->eof = TRUE; 8831 8832 dp = (dirent64_t *)rdc->entries; 8833 8834 if (rdc->nfs4_cookie == (nfs_cookie4)0) { 8835 bcopy(nfs4_dot_entries, rdc->entries, 8836 DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2)); 8837 dp->d_ino = nodeid; 8838 dp = (struct dirent64 *)(((char *)dp) + DIRENT64_RECLEN(1)); 8839 dp->d_ino = pnodeid; 8840 rdc->actlen = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8841 } else { /* for ".." entry */ 8842 bcopy(nfs4_dot_dot_entry, rdc->entries, DIRENT64_RECLEN(2)); 8843 dp->d_ino = pnodeid; 8844 rdc->actlen = DIRENT64_RECLEN(2); 8845 } 8846 rdc->nfs4_ncookie = rdc->actlen; 8847 } 8848 8849 /* 8850 * Read directory entries. 8851 * There are some weird things to look out for here. The uio_loffset 8852 * field is either 0 or it is the offset returned from a previous 8853 * readdir. It is an opaque value used by the server to find the 8854 * correct directory block to read. The count field is the number 8855 * of blocks to read on the server. This is advisory only, the server 8856 * may return only one block's worth of entries. Entries may be compressed 8857 * on the server. 8858 * 8859 * Generates the following compound request: 8860 * 1. If readdir offset is zero and no dnlc entry for parent exists, 8861 * must include a Lookupp as well. In this case, send: 8862 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 8863 * 2. Otherwise just do: { Putfh <fh>; Readdir } 8864 * 8865 * Get complete attributes and filehandles for entries if this is the 8866 * first read of the directory. Otherwise, just get fileid's. 
 */
/*
 * Arguments:
 *	vp  - vnode of the directory being read.
 *	rdc - readdir cache entry describing the request; rdc->nfs4_cookie
 *	      selects the starting point and rdc->buflen bounds dircount.
 *	cr  - credential handed to rfs4call().
 *
 * Nothing is returned.  Failures detected in this function are reported
 * through rdc->error.
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* ids of dir and its parent */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/* Stub rnodes are answered by nfs4readdir_stub(), not by this code. */
	if (rp->r_flags & R4SRVSTUB) {
		nfs4readdir_stub(vp, rdc, cr);
		return;
	}

	/* Default compound is { Putfh; Readdir }. */
	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			/* Drop whatever dnlc_lookup() handed back. */
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents.  This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		/* R4READDIRWATTR is consumed by this request. */
		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid.  maybe we should verify that fileid is supported
		 * and request something else if not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count is less than the minimum required (room for the "."
	 * and ".." records), we return no entries and fail with EINVAL.
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL);
		if (abort == FALSE) {
			/*
			 * Recovery was started and the request may be
			 * retried: close this fop, release anything
			 * allocated for this attempt, and start over.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here.  Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm.  We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* Remember the verifier; it is sent back with non-initial cookies. */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 *	seek(cookie=0) -> "." entry with d_off = 1
	 *	seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh   */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
9230 */ 9231 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9232 9233 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9234 } 9235 9236 9237 static int 9238 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9239 { 9240 rnode4_t *rp = VTOR4(bp->b_vp); 9241 int count; 9242 int error; 9243 cred_t *cred_otw = NULL; 9244 offset_t offset; 9245 nfs4_open_stream_t *osp = NULL; 9246 bool_t first_time = TRUE; /* first time getting otw cred */ 9247 bool_t last_time = FALSE; /* last time getting otw cred */ 9248 9249 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9250 9251 DTRACE_IO1(start, struct buf *, bp); 9252 offset = ldbtob(bp->b_lblkno); 9253 9254 if (bp->b_flags & B_READ) { 9255 read_again: 9256 /* 9257 * Releases the osp, if it is provided. 9258 * Puts a hold on the cred_otw and the new osp (if found). 9259 */ 9260 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9261 &first_time, &last_time); 9262 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9263 offset, bp->b_bcount, 9264 &bp->b_resid, cred_otw, 9265 readahead, NULL); 9266 crfree(cred_otw); 9267 if (!error) { 9268 if (bp->b_resid) { 9269 /* 9270 * Didn't get it all because we hit EOF, 9271 * zero all the memory beyond the EOF. 9272 */ 9273 /* bzero(rdaddr + */ 9274 bzero(bp->b_un.b_addr + 9275 bp->b_bcount - bp->b_resid, bp->b_resid); 9276 } 9277 mutex_enter(&rp->r_statelock); 9278 if (bp->b_resid == bp->b_bcount && 9279 offset >= rp->r_size) { 9280 /* 9281 * We didn't read anything at all as we are 9282 * past EOF. Return an error indicator back 9283 * but don't destroy the pages (yet). 9284 */ 9285 error = NFS_EOF; 9286 } 9287 mutex_exit(&rp->r_statelock); 9288 } else if (error == EACCES && last_time == FALSE) { 9289 goto read_again; 9290 } 9291 } else { 9292 if (!(rp->r_flags & R4STALE)) { 9293 write_again: 9294 /* 9295 * Releases the osp, if it is provided. 9296 * Puts a hold on the cred_otw and the new 9297 * osp (if found). 
9298 */ 9299 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9300 &first_time, &last_time); 9301 mutex_enter(&rp->r_statelock); 9302 count = MIN(bp->b_bcount, rp->r_size - offset); 9303 mutex_exit(&rp->r_statelock); 9304 if (count < 0) 9305 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9306 #ifdef DEBUG 9307 if (count == 0) { 9308 zoneid_t zoneid = getzoneid(); 9309 9310 zcmn_err(zoneid, CE_WARN, 9311 "nfs4_bio: zero length write at %lld", 9312 offset); 9313 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9314 "b_bcount=%ld, file size=%lld", 9315 rp->r_flags, (long)bp->b_bcount, 9316 rp->r_size); 9317 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9318 if (nfs4_bio_do_stop) 9319 debug_enter("nfs4_bio"); 9320 } 9321 #endif 9322 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9323 count, cred_otw, stab_comm); 9324 if (error == EACCES && last_time == FALSE) { 9325 crfree(cred_otw); 9326 goto write_again; 9327 } 9328 bp->b_error = error; 9329 if (error && error != EINTR && 9330 !(bp->b_vp->v_vfsp->vfs_flag && VFS_UNMOUNTED)) { 9331 /* 9332 * Don't print EDQUOT errors on the console. 9333 * Don't print asynchronous EACCES errors. 9334 * Don't print EFBIG errors. 9335 * Print all other write errors. 9336 */ 9337 if (error != EDQUOT && error != EFBIG && 9338 (error != EACCES || 9339 !(bp->b_flags & B_ASYNC))) 9340 nfs4_write_error(bp->b_vp, 9341 error, cred_otw); 9342 /* 9343 * Update r_error and r_flags as appropriate. 9344 * If the error was ESTALE, then mark the 9345 * rnode as not being writeable and save 9346 * the error status. Otherwise, save any 9347 * errors which occur from asynchronous 9348 * page invalidations. Any errors occurring 9349 * from other operations should be saved 9350 * by the caller. 
9351 */ 9352 mutex_enter(&rp->r_statelock); 9353 if (error == ESTALE) { 9354 rp->r_flags |= R4STALE; 9355 if (!rp->r_error) 9356 rp->r_error = error; 9357 } else if (!rp->r_error && 9358 (bp->b_flags & 9359 (B_INVAL|B_FORCE|B_ASYNC)) == 9360 (B_INVAL|B_FORCE|B_ASYNC)) { 9361 rp->r_error = error; 9362 } 9363 mutex_exit(&rp->r_statelock); 9364 } 9365 crfree(cred_otw); 9366 } else 9367 error = rp->r_error; 9368 } 9369 9370 if (error != 0 && error != NFS_EOF) 9371 bp->b_flags |= B_ERROR; 9372 9373 if (osp) 9374 open_stream_rele(osp, rp); 9375 9376 DTRACE_IO1(done, struct buf *, bp); 9377 9378 return (error); 9379 } 9380 9381 /* ARGSUSED */ 9382 static int 9383 nfs4_fid(vnode_t *vp, fid_t *fidp) 9384 { 9385 return (EREMOTE); 9386 } 9387 9388 /* ARGSUSED2 */ 9389 static int 9390 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9391 { 9392 rnode4_t *rp = VTOR4(vp); 9393 9394 if (!write_lock) { 9395 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9396 return (V_WRITELOCK_FALSE); 9397 } 9398 9399 if ((rp->r_flags & R4DIRECTIO) || 9400 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9401 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9402 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9403 return (V_WRITELOCK_FALSE); 9404 nfs_rw_exit(&rp->r_rwlock); 9405 } 9406 9407 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9408 return (V_WRITELOCK_TRUE); 9409 } 9410 9411 /* ARGSUSED */ 9412 static void 9413 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9414 { 9415 rnode4_t *rp = VTOR4(vp); 9416 9417 nfs_rw_exit(&rp->r_rwlock); 9418 } 9419 9420 /* ARGSUSED */ 9421 static int 9422 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 9423 { 9424 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9425 return (EIO); 9426 9427 /* 9428 * Because we stuff the readdir cookie into the offset field 9429 * someone may attempt to do an lseek with the cookie which 9430 * we want to succeed. 
9431 */ 9432 if (vp->v_type == VDIR) 9433 return (0); 9434 if (*noffp < 0) 9435 return (EINVAL); 9436 return (0); 9437 } 9438 9439 9440 /* 9441 * Return all the pages from [off..off+len) in file 9442 */ 9443 static int 9444 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9445 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9446 enum seg_rw rw, cred_t *cr) 9447 { 9448 rnode4_t *rp; 9449 int error; 9450 mntinfo4_t *mi; 9451 9452 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9453 return (EIO); 9454 rp = VTOR4(vp); 9455 if (IS_SHADOW(vp, rp)) 9456 vp = RTOV4(rp); 9457 9458 if (vp->v_flag & VNOMAP) 9459 return (ENOSYS); 9460 9461 if (protp != NULL) 9462 *protp = PROT_ALL; 9463 9464 /* 9465 * Now validate that the caches are up to date. 9466 */ 9467 if (error = nfs4_validate_caches(vp, cr)) 9468 return (error); 9469 9470 mi = VTOMI4(vp); 9471 retry: 9472 mutex_enter(&rp->r_statelock); 9473 9474 /* 9475 * Don't create dirty pages faster than they 9476 * can be cleaned so that the system doesn't 9477 * get imbalanced. If the async queue is 9478 * maxed out, then wait for it to drain before 9479 * creating more dirty pages. Also, wait for 9480 * any threads doing pagewalks in the vop_getattr 9481 * entry points so that they don't block for 9482 * long periods. 9483 */ 9484 if (rw == S_CREATE) { 9485 while ((mi->mi_max_threads != 0 && 9486 rp->r_awcount > 2 * mi->mi_max_threads) || 9487 rp->r_gcount > 0) 9488 cv_wait(&rp->r_cv, &rp->r_statelock); 9489 } 9490 9491 /* 9492 * If we are getting called as a side effect of an nfs_write() 9493 * operation the local file size might not be extended yet. 9494 * In this case we want to be able to return pages of zeroes. 
9495 */ 9496 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9497 NFS4_DEBUG(nfs4_pageio_debug, 9498 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9499 "len=%llu, size=%llu, attrsize =%llu", off, 9500 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9501 mutex_exit(&rp->r_statelock); 9502 return (EFAULT); /* beyond EOF */ 9503 } 9504 9505 mutex_exit(&rp->r_statelock); 9506 9507 if (len <= PAGESIZE) { 9508 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9509 seg, addr, rw, cr); 9510 NFS4_DEBUG(nfs4_pageio_debug && error, 9511 (CE_NOTE, "getpage error %d; off=%lld, " 9512 "len=%lld", error, off, (u_longlong_t)len)); 9513 } else { 9514 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9515 pl, plsz, seg, addr, rw, cr); 9516 NFS4_DEBUG(nfs4_pageio_debug && error, 9517 (CE_NOTE, "getpages error %d; off=%lld, " 9518 "len=%lld", error, off, (u_longlong_t)len)); 9519 } 9520 9521 switch (error) { 9522 case NFS_EOF: 9523 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9524 goto retry; 9525 case ESTALE: 9526 nfs4_purge_stale_fh(error, vp, cr); 9527 } 9528 9529 return (error); 9530 } 9531 9532 /* 9533 * Called from pvn_getpages or nfs4_getpage to get a particular page. 
 */
/*
 * Outline:
 *   1. For block-aligned, non-create, cacheable requests, queue async
 *	readaheads first.
 *   2. If the page is not already in the page cache, either queue an
 *	async read (pl == NULL), create a page (S_CREATE), or read a
 *	kluster of pages through nfs4_bio().
 *   3. Hand the page(s) back through pl[], or return the error.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			/* r_statelock is not held across the async request. */
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				/* Retake the lock so the exit below balances. */
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			/* Caller wants no pages back: just start the read. */
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				/* Non-sequential access: read one page only. */
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			/*
			 * Walk the circular page list once, tagging every
			 * page; pp is back at the list head afterwards.
			 */
			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

	/* NOTE(review): no goto in this function targets "out". */
out:
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
9787 */ 9788 if ((pp = page_lookup(vp, off, se)) == NULL) { 9789 #ifdef DEBUG 9790 nfs4_lostpage++; 9791 #endif 9792 goto reread; 9793 } 9794 pl[0] = pp; 9795 pl[1] = NULL; 9796 return (0); 9797 } 9798 9799 if (pp != NULL) 9800 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 9801 9802 return (error); 9803 } 9804 9805 static void 9806 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 9807 cred_t *cr) 9808 { 9809 int error; 9810 page_t *pp; 9811 u_offset_t io_off; 9812 size_t io_len; 9813 struct buf *bp; 9814 uint_t bsize, blksize; 9815 rnode4_t *rp = VTOR4(vp); 9816 page_t *savepp; 9817 9818 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9819 9820 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9821 9822 mutex_enter(&rp->r_statelock); 9823 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 9824 /* 9825 * If less than a block left in file read less 9826 * than a block. 9827 */ 9828 blksize = rp->r_size - blkoff; 9829 } else 9830 blksize = bsize; 9831 mutex_exit(&rp->r_statelock); 9832 9833 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 9834 &io_off, &io_len, blkoff, blksize, 1); 9835 /* 9836 * The isra flag passed to the kluster function is 1, we may have 9837 * gotten a return value of NULL for a variety of reasons (# of free 9838 * pages < minfree, someone entered the page on the vnode etc). In all 9839 * cases, we want to punt on the readahead. 9840 */ 9841 if (pp == NULL) 9842 return; 9843 9844 /* 9845 * Now round the request size up to page boundaries. 9846 * This ensures that the entire page will be 9847 * initialized to zeroes if EOF is encountered. 9848 */ 9849 io_len = ptob(btopr(io_len)); 9850 9851 bp = pageio_setup(pp, io_len, vp, B_READ); 9852 ASSERT(bp != NULL); 9853 9854 /* 9855 * pageio_setup should have set b_addr to 0. This is correct since 9856 * we want to do I/O on a page boundary. bp_mapin() will use this addr 9857 * to calculate an offset, and then set b_addr to the kernel virtual 9858 * address it allocated for us. 
9859 */ 9860 ASSERT(bp->b_un.b_addr == 0); 9861 9862 bp->b_edev = 0; 9863 bp->b_dev = 0; 9864 bp->b_lblkno = lbtodb(io_off); 9865 bp->b_file = vp; 9866 bp->b_offset = (offset_t)blkoff; 9867 bp_mapin(bp); 9868 9869 /* 9870 * If doing a write beyond what we believe is EOF, don't bother trying 9871 * to read the pages from the server, we'll just zero the pages here. 9872 * We don't check that the rw flag is S_WRITE here because some 9873 * implementations may attempt a read access to the buffer before 9874 * copying data. 9875 */ 9876 mutex_enter(&rp->r_statelock); 9877 if (io_off >= rp->r_size && seg == segkmap) { 9878 mutex_exit(&rp->r_statelock); 9879 bzero(bp->b_un.b_addr, io_len); 9880 error = 0; 9881 } else { 9882 mutex_exit(&rp->r_statelock); 9883 error = nfs4_bio(bp, NULL, cr, TRUE); 9884 if (error == NFS_EOF) 9885 error = 0; 9886 } 9887 9888 /* 9889 * Unmap the buffer before freeing it. 9890 */ 9891 bp_mapout(bp); 9892 pageio_done(bp); 9893 9894 savepp = pp; 9895 do { 9896 pp->p_fsdata = C_NOCOMMIT; 9897 } while ((pp = pp->p_next) != savepp); 9898 9899 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 9900 9901 /* 9902 * In case of error set readahead offset 9903 * to the lowest offset. 9904 * pvn_read_done() calls VN_DISPOSE to destroy the pages 9905 */ 9906 if (error && rp->r_nextr > io_off) { 9907 mutex_enter(&rp->r_statelock); 9908 if (rp->r_nextr > io_off) 9909 rp->r_nextr = io_off; 9910 mutex_exit(&rp->r_statelock); 9911 } 9912 } 9913 9914 /* 9915 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 9916 * If len == 0, do from off to EOF. 9917 * 9918 * The normal cases should be len == 0 && off == 0 (entire vp list) or 9919 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 9920 * (from pageout). 
 */
/*
 * r_count is raised for the duration of nfs4_putpages() and waiters on
 * r_cv are woken once it has been dropped again.
 */
static int
nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(cr != NULL);

	/* Only synchronous requests are checked against the caller's zone. */
	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * XXX - Why should this check be made here?
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* Whole-file flush without B_INVAL on a read-only fs: no work. */
	if (len == 0 && !(flags & B_INVAL) &&
	    (vp->v_vfsp->vfs_flag & VFS_RDONLY))
		return (0);

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	error = nfs4_putpages(vp, off, len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 *
 * When offp/lenp are non-NULL they receive the offset and length of the
 * range that was handled.  Returns 0 or the error from the sync/async
 * putapage routine.
 */
int
nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
	int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode4_t *rp;

	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);
	ASSERT(!IS_SHADOW(vp, rp));

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks.  If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with offset
	 * behind the original page we were given.  Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back.  It will also handle invalidation and freeing
	 * of pages that are not dirty.  Check for page length rounding
	 * problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size.  R4MODINPROGRESS is set in writerp4().
	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file.  When the uiomove() completes the r_size is
	 * updated and the R4MODINPROGRESS flag is cleared.
	 *
	 * Without this handshaking, it is possible that nfs4_bio() picks
	 * up the old value of r_size before the uiomove() in writerp4()
	 * completes.  This will result in the write through nfs4_bio()
	 * being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated.  If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd).  When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()).  In nfs4write(), r_size is
	 * checked.  This will still be the old size.  Therefore the page will
	 * not be written out.  When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
	if (rp->r_flags & R4MODINPROGRESS) {
		mutex_enter(&rp->r_statelock);
		/* Re-test under r_statelock and check for range overlap. */
		if ((rp->r_flags & R4MODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;
			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect R4MODINPROGRESS here then this
			 * path through nfs4_putapage() would eventually go to
			 * nfs4_bio() and may not write out all of the data
			 * in the pages.  We end up losing data.  So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with R4DIRTY.  This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= R4DIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_putapage);
	} else
		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}

static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
10101 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10102 if (!(rp->r_flags & R4OUTOFSPACE)) { 10103 mutex_enter(&rp->r_statelock); 10104 rp->r_flags |= R4OUTOFSPACE; 10105 mutex_exit(&rp->r_statelock); 10106 } 10107 flags |= B_ERROR; 10108 pvn_write_done(pp, flags); 10109 /* 10110 * If this was not an async thread, then try again to 10111 * write out the pages, but this time, also destroy 10112 * them whether or not the write is successful. This 10113 * will prevent memory from filling up with these 10114 * pages and destroying them is the only alternative 10115 * if they can't be written out. 10116 * 10117 * Don't do this if this is an async thread because 10118 * when the pages are unlocked in pvn_write_done, 10119 * some other thread could have come along, locked 10120 * them, and queued for an async thread. It would be 10121 * possible for all of the async threads to be tied 10122 * up waiting to lock the pages again and they would 10123 * all already be locked and waiting for an async 10124 * thread to handle them. Deadlock. 
10125 */ 10126 if (!(flags & B_ASYNC)) { 10127 error = nfs4_putpage(vp, io_off, io_len, 10128 B_INVAL | B_FORCE, cr); 10129 } 10130 } else { 10131 if (error) 10132 flags |= B_ERROR; 10133 else if (rp->r_flags & R4OUTOFSPACE) { 10134 mutex_enter(&rp->r_statelock); 10135 rp->r_flags &= ~R4OUTOFSPACE; 10136 mutex_exit(&rp->r_statelock); 10137 } 10138 pvn_write_done(pp, flags); 10139 if (freemem < desfree) 10140 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10141 NFS4_WRITE_NOWAIT); 10142 } 10143 10144 return (error); 10145 } 10146 10147 #ifdef DEBUG 10148 int nfs4_force_open_before_mmap = 0; 10149 #endif 10150 10151 static int 10152 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10153 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10154 { 10155 struct segvn_crargs vn_a; 10156 int error = 0; 10157 rnode4_t *rp = VTOR4(vp); 10158 mntinfo4_t *mi = VTOMI4(vp); 10159 10160 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10161 return (EIO); 10162 10163 if (vp->v_flag & VNOMAP) 10164 return (ENOSYS); 10165 10166 if (off < 0 || (off + len) < 0) 10167 return (ENXIO); 10168 10169 if (vp->v_type != VREG) 10170 return (ENODEV); 10171 10172 /* 10173 * If the file is delegated to the client don't do anything. 10174 * If the file is not delegated, then validate the data cache. 10175 */ 10176 mutex_enter(&rp->r_statev4_lock); 10177 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10178 mutex_exit(&rp->r_statev4_lock); 10179 error = nfs4_validate_caches(vp, cr); 10180 if (error) 10181 return (error); 10182 } else { 10183 mutex_exit(&rp->r_statev4_lock); 10184 } 10185 10186 /* 10187 * Check to see if the vnode is currently marked as not cachable. 10188 * This means portions of the file are locked (through VOP_FRLOCK). 10189 * In this case the map request must be refused. We use 10190 * rp->r_lkserlock to avoid a race with concurrent lock requests. 
10191 */ 10192 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 10193 return (EINTR); 10194 10195 if (vp->v_flag & VNOCACHE) { 10196 error = EAGAIN; 10197 goto done; 10198 } 10199 10200 /* 10201 * Don't allow concurrent locks and mapping if mandatory locking is 10202 * enabled. 10203 */ 10204 if (flk_has_remote_locks(vp)) { 10205 struct vattr va; 10206 va.va_mask = AT_MODE; 10207 error = nfs4getattr(vp, &va, cr); 10208 if (error != 0) 10209 goto done; 10210 if (MANDLOCK(vp, va.va_mode)) { 10211 error = EAGAIN; 10212 goto done; 10213 } 10214 } 10215 10216 /* 10217 * It is possible that the rnode has a lost lock request that we 10218 * are still trying to recover, and that the request conflicts with 10219 * this map request. 10220 * 10221 * An alternative approach would be for nfs4_safemap() to consider 10222 * queued lock requests when deciding whether to set or clear 10223 * VNOCACHE. This would require the frlock code path to call 10224 * nfs4_safemap() after enqueing a lost request. 
10225 */ 10226 if (nfs4_map_lost_lock_conflict(vp)) { 10227 error = EAGAIN; 10228 goto done; 10229 } 10230 10231 as_rangelock(as); 10232 if (!(flags & MAP_FIXED)) { 10233 map_addr(addrp, len, off, 1, flags); 10234 if (*addrp == NULL) { 10235 as_rangeunlock(as); 10236 error = ENOMEM; 10237 goto done; 10238 } 10239 } else { 10240 /* 10241 * User specified address - blow away any previous mappings 10242 */ 10243 (void) as_unmap(as, *addrp, len); 10244 } 10245 10246 if (vp->v_type == VREG) { 10247 /* 10248 * We need to retrieve the open stream 10249 */ 10250 nfs4_open_stream_t *osp = NULL; 10251 nfs4_open_owner_t *oop = NULL; 10252 10253 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10254 if (oop != NULL) { 10255 /* returns with 'os_sync_lock' held */ 10256 osp = find_open_stream(oop, rp); 10257 open_owner_rele(oop); 10258 } 10259 if (osp == NULL) { 10260 #ifdef DEBUG 10261 if (nfs4_force_open_before_mmap) { 10262 error = EIO; 10263 goto done; 10264 } 10265 #endif 10266 /* returns with 'os_sync_lock' held */ 10267 osp = open_and_get_osp(vp, cr, mi); 10268 if (osp == NULL) { 10269 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10270 "nfs4_map: we tried to OPEN the file " 10271 "but again no osp, so fail with EIO")); 10272 error = EIO; 10273 goto done; 10274 } 10275 } 10276 10277 if (osp->os_failed_reopen) { 10278 mutex_exit(&osp->os_sync_lock); 10279 open_stream_rele(osp, rp); 10280 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10281 "nfs4_map: os_failed_reopen set on " 10282 "osp %p, cr %p, rp %s", (void *)osp, 10283 (void *)cr, rnode4info(rp))); 10284 error = EIO; 10285 goto done; 10286 } 10287 mutex_exit(&osp->os_sync_lock); 10288 open_stream_rele(osp, rp); 10289 } 10290 10291 vn_a.vp = vp; 10292 vn_a.offset = off; 10293 vn_a.type = (flags & MAP_TYPE); 10294 vn_a.prot = (uchar_t)prot; 10295 vn_a.maxprot = (uchar_t)maxprot; 10296 vn_a.flags = (flags & ~MAP_TYPE); 10297 vn_a.cred = cr; 10298 vn_a.amp = NULL; 10299 vn_a.szc = 0; 10300 vn_a.lgrp_mem_policy_flags = 0; 10301 
10302 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10303 as_rangeunlock(as); 10304 10305 done: 10306 nfs_rw_exit(&rp->r_lkserlock); 10307 return (error); 10308 } 10309 10310 /* 10311 * We're most likely dealing with a kernel module that likes to READ 10312 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10313 * officially OPEN the file to create the necessary client state 10314 * for bookkeeping of os_mmap_read/write counts. 10315 * 10316 * Since VOP_MAP only passes in a pointer to the vnode rather than 10317 * a double pointer, we can't handle the case where nfs4open_otw() 10318 * returns a different vnode than the one passed into VOP_MAP (since 10319 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10320 * we return NULL and let nfs4_map() fail. Note: the only case where 10321 * this should happen is if the file got removed and replaced with the 10322 * same name on the server (in addition to the fact that we're trying 10323 * to VOP_MAP withouth VOP_OPENing the file in the first place). 10324 */ 10325 static nfs4_open_stream_t * 10326 open_and_get_osp(vnode_t *map_vp, cred_t *cr, mntinfo4_t *mi) 10327 { 10328 rnode4_t *rp, *drp; 10329 vnode_t *dvp, *open_vp; 10330 char *file_name; 10331 int just_created; 10332 nfs4_sharedfh_t *sfh; 10333 nfs4_open_stream_t *osp; 10334 nfs4_open_owner_t *oop; 10335 10336 open_vp = map_vp; 10337 sfh = (open_vp->v_flag & VROOT) ? 
mi->mi_srvparentfh : 10338 VTOSV(open_vp)->sv_dfh; 10339 drp = r4find_unlocked(sfh, open_vp->v_vfsp); 10340 if (!drp) 10341 return (NULL); 10342 10343 file_name = fn_name(VTOSV(open_vp)->sv_name); 10344 10345 rp = VTOR4(open_vp); 10346 dvp = RTOV4(drp); 10347 mutex_enter(&rp->r_statev4_lock); 10348 if (rp->created_v4) { 10349 rp->created_v4 = 0; 10350 mutex_exit(&rp->r_statev4_lock); 10351 10352 dnlc_update(dvp, file_name, open_vp); 10353 /* This is needed so we don't bump the open ref count */ 10354 just_created = 1; 10355 } else { 10356 mutex_exit(&rp->r_statev4_lock); 10357 just_created = 0; 10358 } 10359 10360 VN_HOLD(map_vp); 10361 10362 if (nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10363 just_created)) { 10364 kmem_free(file_name, MAXNAMELEN); 10365 VN_RELE(dvp); 10366 VN_RELE(map_vp); 10367 return (NULL); 10368 } 10369 10370 kmem_free(file_name, MAXNAMELEN); 10371 VN_RELE(dvp); 10372 10373 /* 10374 * If nfs4open_otw() returned a different vnode then "undo" 10375 * the open and return failure to the caller. 10376 */ 10377 if (!VN_CMP(open_vp, map_vp)) { 10378 nfs4_error_t e; 10379 10380 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10381 "open returned a different vnode")); 10382 /* 10383 * If there's an error, ignore it, 10384 * and let VOP_INACTIVE handle it. 10385 */ 10386 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10387 CLOSE_NORM, 0, 0, 0); 10388 VN_RELE(map_vp); 10389 return (NULL); 10390 } 10391 10392 VN_RELE(map_vp); 10393 10394 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10395 if (!oop) { 10396 nfs4_error_t e; 10397 10398 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10399 "no open owner")); 10400 /* 10401 * If there's an error, ignore it, 10402 * and let VOP_INACTIVE handle it. 
10403 */ 10404 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10405 CLOSE_NORM, 0, 0, 0); 10406 return (NULL); 10407 } 10408 osp = find_open_stream(oop, rp); 10409 open_owner_rele(oop); 10410 return (osp); 10411 } 10412 10413 /* 10414 * Please be aware that when this function is called, the address space write 10415 * a_lock is held. Do not put over the wire calls in this function. 10416 */ 10417 /* ARGSUSED */ 10418 static int 10419 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10420 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10421 { 10422 rnode4_t *rp; 10423 int error = 0; 10424 mntinfo4_t *mi; 10425 10426 mi = VTOMI4(vp); 10427 rp = VTOR4(vp); 10428 10429 if (nfs_zone() != mi->mi_zone) 10430 return (EIO); 10431 if (vp->v_flag & VNOMAP) 10432 return (ENOSYS); 10433 10434 /* 10435 * Need to hold rwlock while incrementing the mapcnt so that 10436 * mmap'ing can be serialized with writes so that the caching 10437 * can be handled correctly. 10438 * 10439 * Don't need to update the open stream first, since this 10440 * mmap can't add any additional share access that isn't 10441 * already contained in the open stream (for the case where we 10442 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10443 * take into account os_mmap_read[write] counts). 10444 */ 10445 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10446 return (EINTR); 10447 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10448 nfs_rw_exit(&rp->r_rwlock); 10449 10450 if (vp->v_type == VREG) { 10451 /* 10452 * We need to retrieve the open stream and update the counts. 10453 * If there is no open stream here, something is wrong. 
10454 */ 10455 nfs4_open_stream_t *osp = NULL; 10456 nfs4_open_owner_t *oop = NULL; 10457 10458 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10459 if (oop != NULL) { 10460 /* returns with 'os_sync_lock' held */ 10461 osp = find_open_stream(oop, rp); 10462 open_owner_rele(oop); 10463 } 10464 if (osp == NULL) { 10465 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10466 "nfs4_addmap: we should have an osp" 10467 "but we don't, so fail with EIO")); 10468 error = EIO; 10469 goto out; 10470 } 10471 10472 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10473 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10474 10475 /* 10476 * Update the map count in the open stream. 10477 * This is necessary in the case where we 10478 * open/mmap/close/, then the server reboots, and we 10479 * attempt to reopen. If the mmap doesn't add share 10480 * access then we send an invalid reopen with 10481 * access = NONE. 10482 * 10483 * We need to specifically check each PROT_* so a mmap 10484 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10485 * read and write access. A simple comparison of prot 10486 * to ~PROT_WRITE to determine read access is insufficient 10487 * since prot can be |= with PROT_USER, etc. 10488 */ 10489 10490 /* 10491 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10492 */ 10493 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10494 osp->os_mmap_write += btopr(len); 10495 if (maxprot & PROT_READ) 10496 osp->os_mmap_read += btopr(len); 10497 if (maxprot & PROT_EXEC) 10498 osp->os_mmap_read += btopr(len); 10499 /* 10500 * Ensure that os_mmap_read gets incremented, even if 10501 * maxprot were to look like PROT_NONE. 
10502 */ 10503 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10504 !(maxprot & PROT_EXEC)) 10505 osp->os_mmap_read += btopr(len); 10506 osp->os_mapcnt += btopr(len); 10507 mutex_exit(&osp->os_sync_lock); 10508 open_stream_rele(osp, rp); 10509 } 10510 10511 out: 10512 /* 10513 * If we got an error, then undo our 10514 * incrementing of 'r_mapcnt'. 10515 */ 10516 10517 if (error) { 10518 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10519 ASSERT(rp->r_mapcnt >= 0); 10520 } 10521 return (error); 10522 } 10523 10524 static int 10525 nfs4_cmp(vnode_t *vp1, vnode_t *vp2) 10526 { 10527 10528 return (VTOR4(vp1) == VTOR4(vp2)); 10529 } 10530 10531 static int 10532 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10533 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 10534 { 10535 int rc; 10536 u_offset_t start, end; 10537 rnode4_t *rp; 10538 int error = 0, intr = INTR4(vp); 10539 nfs4_error_t e; 10540 10541 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10542 return (EIO); 10543 10544 /* check for valid cmd parameter */ 10545 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10546 return (EINVAL); 10547 10548 /* Verify l_type. */ 10549 switch (bfp->l_type) { 10550 case F_RDLCK: 10551 if (cmd != F_GETLK && !(flag & FREAD)) 10552 return (EBADF); 10553 break; 10554 case F_WRLCK: 10555 if (cmd != F_GETLK && !(flag & FWRITE)) 10556 return (EBADF); 10557 break; 10558 case F_UNLCK: 10559 intr = 0; 10560 break; 10561 10562 default: 10563 return (EINVAL); 10564 } 10565 10566 /* check the validity of the lock range */ 10567 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10568 return (rc); 10569 if (rc = flk_check_lock_data(start, end, MAXEND)) 10570 return (rc); 10571 10572 /* 10573 * If the filesystem is mounted using local locking, pass the 10574 * request off to the local locking code. 
10575 */ 10576 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10577 if (cmd == F_SETLK || cmd == F_SETLKW) { 10578 /* 10579 * For complete safety, we should be holding 10580 * r_lkserlock. However, we can't call 10581 * nfs4_safelock and then fs_frlock while 10582 * holding r_lkserlock, so just invoke 10583 * nfs4_safelock and expect that this will 10584 * catch enough of the cases. 10585 */ 10586 if (!nfs4_safelock(vp, bfp, cr)) 10587 return (EAGAIN); 10588 } 10589 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 10590 } 10591 10592 rp = VTOR4(vp); 10593 10594 /* 10595 * Check whether the given lock request can proceed, given the 10596 * current file mappings. 10597 */ 10598 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10599 return (EINTR); 10600 if (cmd == F_SETLK || cmd == F_SETLKW) { 10601 if (!nfs4_safelock(vp, bfp, cr)) { 10602 rc = EAGAIN; 10603 goto done; 10604 } 10605 } 10606 10607 /* 10608 * Flush the cache after waiting for async I/O to finish. For new 10609 * locks, this is so that the process gets the latest bits from the 10610 * server. For unlocks, this is so that other clients see the 10611 * latest bits once the file has been unlocked. If currently dirty 10612 * pages can't be flushed, then don't allow a lock to be set. But 10613 * allow unlocks to succeed, to avoid having orphan locks on the 10614 * server. 
10615 */ 10616 if (cmd != F_GETLK) { 10617 mutex_enter(&rp->r_statelock); 10618 while (rp->r_count > 0) { 10619 if (intr) { 10620 klwp_t *lwp = ttolwp(curthread); 10621 10622 if (lwp != NULL) 10623 lwp->lwp_nostop++; 10624 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 10625 if (lwp != NULL) 10626 lwp->lwp_nostop--; 10627 rc = EINTR; 10628 break; 10629 } 10630 if (lwp != NULL) 10631 lwp->lwp_nostop--; 10632 } else 10633 cv_wait(&rp->r_cv, &rp->r_statelock); 10634 } 10635 mutex_exit(&rp->r_statelock); 10636 if (rc != 0) 10637 goto done; 10638 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 10639 if (error) { 10640 if (error == ENOSPC || error == EDQUOT) { 10641 mutex_enter(&rp->r_statelock); 10642 if (!rp->r_error) 10643 rp->r_error = error; 10644 mutex_exit(&rp->r_statelock); 10645 } 10646 if (bfp->l_type != F_UNLCK) { 10647 rc = ENOLCK; 10648 goto done; 10649 } 10650 } 10651 } 10652 10653 /* 10654 * Call the lock manager to do the real work of contacting 10655 * the server and obtaining the lock. 10656 */ 10657 10658 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10659 cr, &e, NULL, NULL); 10660 rc = e.error; 10661 10662 if (rc == 0) 10663 nfs4_lockcompletion(vp, cmd); 10664 10665 done: 10666 nfs_rw_exit(&rp->r_lkserlock); 10667 10668 return (rc); 10669 } 10670 10671 /* 10672 * Free storage space associated with the specified vnode. The portion 10673 * to be freed is specified by bfp->l_start and bfp->l_len (already 10674 * normalized to a "whence" of 0). 10675 * 10676 * This is an experimental facility whose continued existence is not 10677 * guaranteed. Currently, we only support the special case 10678 * of l_len == 0, meaning free to end of file. 
10679 */ 10680 /* ARGSUSED */ 10681 static int 10682 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10683 offset_t offset, cred_t *cr, caller_context_t *ct) 10684 { 10685 int error; 10686 10687 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10688 return (EIO); 10689 ASSERT(vp->v_type == VREG); 10690 if (cmd != F_FREESP) 10691 return (EINVAL); 10692 10693 error = convoff(vp, bfp, 0, offset); 10694 if (!error) { 10695 ASSERT(bfp->l_start >= 0); 10696 if (bfp->l_len == 0) { 10697 struct vattr va; 10698 10699 va.va_mask = AT_SIZE; 10700 va.va_size = bfp->l_start; 10701 error = nfs4setattr(vp, &va, 0, cr, NULL); 10702 } else 10703 error = EINVAL; 10704 } 10705 10706 return (error); 10707 } 10708 10709 /* ARGSUSED */ 10710 static int 10711 nfs4_realvp(vnode_t *vp, vnode_t **vpp) 10712 { 10713 return (EINVAL); 10714 } 10715 10716 /* 10717 * Setup and add an address space callback to do the work of the delmap call. 10718 * The callback will (and must be) deleted in the actual callback function. 10719 * 10720 * This is done in order to take care of the problem that we have with holding 10721 * the address space's a_lock for a long period of time (e.g. if the NFS server 10722 * is down). Callbacks will be executed in the address space code while the 10723 * a_lock is not held. Holding the address space's a_lock causes things such 10724 * as ps and fork to hang because they are trying to acquire this lock as well. 10725 */ 10726 /* ARGSUSED */ 10727 static int 10728 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10729 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 10730 { 10731 int caller_found; 10732 int error; 10733 rnode4_t *rp; 10734 nfs4_delmap_args_t *dmapp; 10735 nfs4_delmapcall_t *delmap_call; 10736 10737 if (vp->v_flag & VNOMAP) 10738 return (ENOSYS); 10739 10740 /* 10741 * A process may not change zones if it has NFS pages mmap'ed 10742 * in, so we can't legitimately get here from the wrong zone. 
10743 */ 10744 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10745 10746 rp = VTOR4(vp); 10747 10748 /* 10749 * The way that the address space of this process deletes its mapping 10750 * of this file is via the following call chains: 10751 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10752 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10753 * 10754 * With the use of address space callbacks we are allowed to drop the 10755 * address space lock, a_lock, while executing the NFS operations that 10756 * need to go over the wire. Returning EAGAIN to the caller of this 10757 * function is what drives the execution of the callback that we add 10758 * below. The callback will be executed by the address space code 10759 * after dropping the a_lock. When the callback is finished, since 10760 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 10761 * is called again on the same segment to finish the rest of the work 10762 * that needs to happen during unmapping. 10763 * 10764 * This action of calling back into the segment driver causes 10765 * nfs4_delmap() to get called again, but since the callback was 10766 * already executed at this point, it already did the work and there 10767 * is nothing left for us to do. 10768 * 10769 * To Summarize: 10770 * - The first time nfs4_delmap is called by the current thread is when 10771 * we add the caller associated with this delmap to the delmap caller 10772 * list, add the callback, and return EAGAIN. 10773 * - The second time in this call chain when nfs4_delmap is called we 10774 * will find this caller in the delmap caller list and realize there 10775 * is no more work to do thus removing this caller from the list and 10776 * returning the error that was set in the callback execution. 10777 */ 10778 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 10779 if (caller_found) { 10780 /* 10781 * 'error' is from the actual delmap operations. 
To avoid 10782 * hangs, we need to handle the return of EAGAIN differently 10783 * since this is what drives the callback execution. 10784 * In this case, we don't want to return EAGAIN and do the 10785 * callback execution because there are none to execute. 10786 */ 10787 if (error == EAGAIN) 10788 return (0); 10789 else 10790 return (error); 10791 } 10792 10793 /* current caller was not in the list */ 10794 delmap_call = nfs4_init_delmapcall(); 10795 10796 mutex_enter(&rp->r_statelock); 10797 list_insert_tail(&rp->r_indelmap, delmap_call); 10798 mutex_exit(&rp->r_statelock); 10799 10800 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 10801 10802 dmapp->vp = vp; 10803 dmapp->off = off; 10804 dmapp->addr = addr; 10805 dmapp->len = len; 10806 dmapp->prot = prot; 10807 dmapp->maxprot = maxprot; 10808 dmapp->flags = flags; 10809 dmapp->cr = cr; 10810 dmapp->caller = delmap_call; 10811 10812 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 10813 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 10814 10815 return (error ? error : EAGAIN); 10816 } 10817 10818 static nfs4_delmapcall_t * 10819 nfs4_init_delmapcall() 10820 { 10821 nfs4_delmapcall_t *delmap_call; 10822 10823 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 10824 delmap_call->call_id = curthread; 10825 delmap_call->error = 0; 10826 10827 return (delmap_call); 10828 } 10829 10830 static void 10831 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 10832 { 10833 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 10834 } 10835 10836 /* 10837 * Searches for the current delmap caller (based on curthread) in the list of 10838 * callers. If it is found, we remove it and free the delmap caller. 10839 * Returns: 10840 * 0 if the caller wasn't found 10841 * 1 if the caller was found, removed and freed. *errp will be set 10842 * to what the result of the delmap was. 
10843 */ 10844 static int 10845 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 10846 { 10847 nfs4_delmapcall_t *delmap_call; 10848 10849 /* 10850 * If the list doesn't exist yet, we create it and return 10851 * that the caller wasn't found. No list = no callers. 10852 */ 10853 mutex_enter(&rp->r_statelock); 10854 if (!(rp->r_flags & R4DELMAPLIST)) { 10855 /* The list does not exist */ 10856 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 10857 offsetof(nfs4_delmapcall_t, call_node)); 10858 rp->r_flags |= R4DELMAPLIST; 10859 mutex_exit(&rp->r_statelock); 10860 return (0); 10861 } else { 10862 /* The list exists so search it */ 10863 for (delmap_call = list_head(&rp->r_indelmap); 10864 delmap_call != NULL; 10865 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 10866 if (delmap_call->call_id == curthread) { 10867 /* current caller is in the list */ 10868 *errp = delmap_call->error; 10869 list_remove(&rp->r_indelmap, delmap_call); 10870 mutex_exit(&rp->r_statelock); 10871 nfs4_free_delmapcall(delmap_call); 10872 return (1); 10873 } 10874 } 10875 } 10876 mutex_exit(&rp->r_statelock); 10877 return (0); 10878 } 10879 10880 /* 10881 * Remove some pages from an mmap'd vnode. Just update the 10882 * count of pages. If doing close-to-open, then flush and 10883 * commit all of the pages associated with this file. 10884 * Otherwise, start an asynchronous page flush to write out 10885 * any dirty pages. This will also associate a credential 10886 * with the rnode which can be used to write the pages. 
10887 */ 10888 /* ARGSUSED */ 10889 static void 10890 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 10891 { 10892 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 10893 rnode4_t *rp; 10894 mntinfo4_t *mi; 10895 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 10896 10897 rp = VTOR4(dmapp->vp); 10898 mi = VTOMI4(dmapp->vp); 10899 10900 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 10901 ASSERT(rp->r_mapcnt >= 0); 10902 10903 /* 10904 * Initiate a page flush and potential commit if there are 10905 * pages, the file system was not mounted readonly, the segment 10906 * was mapped shared, and the pages themselves were writeable. 10907 */ 10908 if (nfs4_has_pages(dmapp->vp) && 10909 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 10910 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 10911 mutex_enter(&rp->r_statelock); 10912 rp->r_flags |= R4DIRTY; 10913 mutex_exit(&rp->r_statelock); 10914 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 10915 dmapp->len, dmapp->cr); 10916 if (!e.error) { 10917 mutex_enter(&rp->r_statelock); 10918 e.error = rp->r_error; 10919 rp->r_error = 0; 10920 mutex_exit(&rp->r_statelock); 10921 } 10922 } else 10923 e.error = 0; 10924 10925 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 10926 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 10927 B_INVAL, dmapp->cr); 10928 10929 if (e.error) { 10930 e.stat = puterrno4(e.error); 10931 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 10932 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 10933 dmapp->caller->error = e.error; 10934 } 10935 10936 /* Check to see if we need to close the file */ 10937 10938 if (dmapp->vp->v_type == VREG) { 10939 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 10940 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 10941 10942 if (e.error != 0 || e.stat != NFS4_OK) { 10943 /* 10944 * Since it is possible that e.error == 0 and 10945 * e.stat != NFS4_OK (and vice versa), 10946 * we do the proper 
checking in order to get both 10947 * e.error and e.stat reporting the correct info. 10948 */ 10949 if (e.stat == NFS4_OK) 10950 e.stat = puterrno4(e.error); 10951 if (e.error == 0) 10952 e.error = geterrno4(e.stat); 10953 10954 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 10955 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 10956 dmapp->caller->error = e.error; 10957 } 10958 } 10959 10960 (void) as_delete_callback(as, arg); 10961 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 10962 } 10963 10964 10965 static uint_t 10966 fattr4_maxfilesize_to_bits(uint64_t ll) 10967 { 10968 uint_t l = 1; 10969 10970 if (ll == 0) { 10971 return (0); 10972 } 10973 10974 if (ll & 0xffffffff00000000) { 10975 l += 32; ll >>= 32; 10976 } 10977 if (ll & 0xffff0000) { 10978 l += 16; ll >>= 16; 10979 } 10980 if (ll & 0xff00) { 10981 l += 8; ll >>= 8; 10982 } 10983 if (ll & 0xf0) { 10984 l += 4; ll >>= 4; 10985 } 10986 if (ll & 0xc) { 10987 l += 2; ll >>= 2; 10988 } 10989 if (ll & 0x2) { 10990 l += 1; 10991 } 10992 return (l); 10993 } 10994 10995 static int 10996 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 10997 { 10998 int error; 10999 hrtime_t t; 11000 rnode4_t *rp; 11001 nfs4_ga_res_t gar; 11002 nfs4_ga_ext_res_t ger; 11003 11004 gar.n4g_ext_res = &ger; 11005 11006 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11007 return (EIO); 11008 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11009 *valp = MAXPATHLEN; 11010 return (0); 11011 } 11012 if (cmd == _PC_ACL_ENABLED) { 11013 *valp = _ACL_ACE_ENABLED; 11014 return (0); 11015 } 11016 11017 rp = VTOR4(vp); 11018 if (cmd == _PC_XATTR_EXISTS) { 11019 /* 11020 * Eventually should attempt small client readdir before 11021 * going otw with GETATTR(FATTR4_NAMED_ATTR). For now 11022 * just drive the OTW getattr. This is required because 11023 * _PC_XATTR_EXISTS can only return true if attributes 11024 * exist -- simply checking for existance of the attrdir 11025 * is not sufficient. 
11026 * 11027 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11028 * is NULL. Once the xadir vp exists, we can create xattrs, 11029 * and we don't have any way to update the "base" object's 11030 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11031 * could help out. 11032 */ 11033 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11034 rp->r_xattr_dir == NULL) { 11035 *valp = rp->r_pathconf.pc4_xattr_exists; 11036 return (0); 11037 } 11038 } else { /* OLD CODE */ 11039 if (ATTRCACHE4_VALID(vp)) { 11040 mutex_enter(&rp->r_statelock); 11041 if (rp->r_pathconf.pc4_cache_valid) { 11042 error = 0; 11043 switch (cmd) { 11044 case _PC_FILESIZEBITS: 11045 *valp = 11046 rp->r_pathconf.pc4_filesizebits; 11047 break; 11048 case _PC_LINK_MAX: 11049 *valp = 11050 rp->r_pathconf.pc4_link_max; 11051 break; 11052 case _PC_NAME_MAX: 11053 *valp = 11054 rp->r_pathconf.pc4_name_max; 11055 break; 11056 case _PC_CHOWN_RESTRICTED: 11057 *valp = 11058 rp->r_pathconf.pc4_chown_restricted; 11059 break; 11060 case _PC_NO_TRUNC: 11061 *valp = 11062 rp->r_pathconf.pc4_no_trunc; 11063 break; 11064 default: 11065 error = EINVAL; 11066 break; 11067 } 11068 mutex_exit(&rp->r_statelock); 11069 #ifdef DEBUG 11070 nfs4_pathconf_cache_hits++; 11071 #endif 11072 return (error); 11073 } 11074 mutex_exit(&rp->r_statelock); 11075 } 11076 } 11077 #ifdef DEBUG 11078 nfs4_pathconf_cache_misses++; 11079 #endif 11080 11081 t = gethrtime(); 11082 11083 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11084 11085 if (error) { 11086 mutex_enter(&rp->r_statelock); 11087 rp->r_pathconf.pc4_cache_valid = FALSE; 11088 rp->r_pathconf.pc4_xattr_valid = FALSE; 11089 mutex_exit(&rp->r_statelock); 11090 return (error); 11091 } 11092 11093 /* interpret the max filesize */ 11094 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11095 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11096 11097 /* Store the attributes we just received */ 11098 nfs4_attr_cache(vp, 
&gar, t, cr, TRUE, NULL); 11099 11100 switch (cmd) { 11101 case _PC_FILESIZEBITS: 11102 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11103 break; 11104 case _PC_LINK_MAX: 11105 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11106 break; 11107 case _PC_NAME_MAX: 11108 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11109 break; 11110 case _PC_CHOWN_RESTRICTED: 11111 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11112 break; 11113 case _PC_NO_TRUNC: 11114 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11115 break; 11116 case _PC_XATTR_EXISTS: 11117 *valp = gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists; 11118 break; 11119 default: 11120 return (EINVAL); 11121 } 11122 11123 return (0); 11124 } 11125 11126 /* 11127 * Called by async thread to do synchronous pageio. Do the i/o, wait 11128 * for it to complete, and cleanup the page list when done. 11129 */ 11130 static int 11131 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11132 int flags, cred_t *cr) 11133 { 11134 int error; 11135 11136 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11137 11138 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11139 if (flags & B_READ) 11140 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11141 else 11142 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11143 return (error); 11144 } 11145 11146 static int 11147 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11148 int flags, cred_t *cr) 11149 { 11150 int error; 11151 rnode4_t *rp; 11152 11153 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11154 return (EIO); 11155 11156 if (pp == NULL) 11157 return (EINVAL); 11158 11159 rp = VTOR4(vp); 11160 mutex_enter(&rp->r_statelock); 11161 rp->r_count++; 11162 mutex_exit(&rp->r_statelock); 11163 11164 if (flags & B_ASYNC) { 11165 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11166 nfs4_sync_pageio); 11167 } else 11168 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11169 mutex_enter(&rp->r_statelock); 11170 rp->r_count--; 11171 cv_broadcast(&rp->r_cv); 11172 mutex_exit(&rp->r_statelock); 11173 return (error); 11174 } 11175 11176 static void 11177 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) 11178 { 11179 int error; 11180 rnode4_t *rp; 11181 page_t *plist; 11182 page_t *pptr; 11183 offset3 offset; 11184 count3 len; 11185 k_sigset_t smask; 11186 11187 /* 11188 * We should get called with fl equal to either B_FREE or 11189 * B_INVAL. Any other value is illegal. 11190 * 11191 * The page that we are either supposed to free or destroy 11192 * should be exclusive locked and its io lock should not 11193 * be held. 11194 */ 11195 ASSERT(fl == B_FREE || fl == B_INVAL); 11196 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11197 11198 rp = VTOR4(vp); 11199 11200 /* 11201 * If the page doesn't need to be committed or we shouldn't 11202 * even bother attempting to commit it, then just make sure 11203 * that the p_fsdata byte is clear and then either free or 11204 * destroy the page as appropriate. 
11205 */ 11206 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11207 pp->p_fsdata = C_NOCOMMIT; 11208 if (fl == B_FREE) 11209 page_free(pp, dn); 11210 else 11211 page_destroy(pp, dn); 11212 return; 11213 } 11214 11215 /* 11216 * If there is a page invalidation operation going on, then 11217 * if this is one of the pages being destroyed, then just 11218 * clear the p_fsdata byte and then either free or destroy 11219 * the page as appropriate. 11220 */ 11221 mutex_enter(&rp->r_statelock); 11222 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11223 mutex_exit(&rp->r_statelock); 11224 pp->p_fsdata = C_NOCOMMIT; 11225 if (fl == B_FREE) 11226 page_free(pp, dn); 11227 else 11228 page_destroy(pp, dn); 11229 return; 11230 } 11231 11232 /* 11233 * If we are freeing this page and someone else is already 11234 * waiting to do a commit, then just unlock the page and 11235 * return. That other thread will take care of commiting 11236 * this page. The page can be freed sometime after the 11237 * commit has finished. Otherwise, if the page is marked 11238 * as delay commit, then we may be getting called from 11239 * pvn_write_done, one page at a time. This could result 11240 * in one commit per page, so we end up doing lots of small 11241 * commits instead of fewer larger commits. This is bad, 11242 * we want do as few commits as possible. 11243 */ 11244 if (fl == B_FREE) { 11245 if (rp->r_flags & R4COMMITWAIT) { 11246 page_unlock(pp); 11247 mutex_exit(&rp->r_statelock); 11248 return; 11249 } 11250 if (pp->p_fsdata == C_DELAYCOMMIT) { 11251 pp->p_fsdata = C_COMMIT; 11252 page_unlock(pp); 11253 mutex_exit(&rp->r_statelock); 11254 return; 11255 } 11256 } 11257 11258 /* 11259 * Check to see if there is a signal which would prevent an 11260 * attempt to commit the pages from being successful. If so, 11261 * then don't bother with all of the work to gather pages and 11262 * generate the unsuccessful RPC. 
Just return from here and 11263 * let the page be committed at some later time. 11264 */ 11265 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11266 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11267 sigunintr(&smask); 11268 page_unlock(pp); 11269 mutex_exit(&rp->r_statelock); 11270 return; 11271 } 11272 sigunintr(&smask); 11273 11274 /* 11275 * We are starting to need to commit pages, so let's try 11276 * to commit as many as possible at once to reduce the 11277 * overhead. 11278 * 11279 * Set the `commit inprogress' state bit. We must 11280 * first wait until any current one finishes. Then 11281 * we initialize the c_pages list with this page. 11282 */ 11283 while (rp->r_flags & R4COMMIT) { 11284 rp->r_flags |= R4COMMITWAIT; 11285 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11286 rp->r_flags &= ~R4COMMITWAIT; 11287 } 11288 rp->r_flags |= R4COMMIT; 11289 mutex_exit(&rp->r_statelock); 11290 ASSERT(rp->r_commit.c_pages == NULL); 11291 rp->r_commit.c_pages = pp; 11292 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11293 rp->r_commit.c_commlen = PAGESIZE; 11294 11295 /* 11296 * Gather together all other pages which can be committed. 11297 * They will all be chained off r_commit.c_pages. 11298 */ 11299 nfs4_get_commit(vp); 11300 11301 /* 11302 * Clear the `commit inprogress' status and disconnect 11303 * the list of pages to be committed from the rnode. 11304 * At this same time, we also save the starting offset 11305 * and length of data to be committed on the server. 
11306 */ 11307 plist = rp->r_commit.c_pages; 11308 rp->r_commit.c_pages = NULL; 11309 offset = rp->r_commit.c_commbase; 11310 len = rp->r_commit.c_commlen; 11311 mutex_enter(&rp->r_statelock); 11312 rp->r_flags &= ~R4COMMIT; 11313 cv_broadcast(&rp->r_commit.c_cv); 11314 mutex_exit(&rp->r_statelock); 11315 11316 if (curproc == proc_pageout || curproc == proc_fsflush || 11317 nfs_zone() != VTOMI4(vp)->mi_zone) { 11318 nfs4_async_commit(vp, plist, offset, len, 11319 cr, do_nfs4_async_commit); 11320 return; 11321 } 11322 11323 /* 11324 * Actually generate the COMMIT op over the wire operation. 11325 */ 11326 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11327 11328 /* 11329 * If we got an error during the commit, just unlock all 11330 * of the pages. The pages will get retransmitted to the 11331 * server during a putpage operation. 11332 */ 11333 if (error) { 11334 while (plist != NULL) { 11335 pptr = plist; 11336 page_sub(&plist, pptr); 11337 page_unlock(pptr); 11338 } 11339 return; 11340 } 11341 11342 /* 11343 * We've tried as hard as we can to commit the data to stable 11344 * storage on the server. We just unlock the rest of the pages 11345 * and clear the commit required state. They will be put 11346 * onto the tail of the cachelist if they are nolonger 11347 * mapped. 11348 */ 11349 while (plist != pp) { 11350 pptr = plist; 11351 page_sub(&plist, pptr); 11352 pptr->p_fsdata = C_NOCOMMIT; 11353 page_unlock(pptr); 11354 } 11355 11356 /* 11357 * It is possible that nfs4_commit didn't return error but 11358 * some other thread has modified the page we are going 11359 * to free/destroy. 11360 * In this case we need to rewrite the page. Do an explicit check 11361 * before attempting to free/destroy the page. If modified, needs to 11362 * be rewritten so unlock the page and return. 
11363 */ 11364 if (hat_ismod(pp)) { 11365 pp->p_fsdata = C_NOCOMMIT; 11366 page_unlock(pp); 11367 return; 11368 } 11369 11370 /* 11371 * Now, as appropriate, either free or destroy the page 11372 * that we were called with. 11373 */ 11374 pp->p_fsdata = C_NOCOMMIT; 11375 if (fl == B_FREE) 11376 page_free(pp, dn); 11377 else 11378 page_destroy(pp, dn); 11379 } 11380 11381 /* 11382 * Commit requires that the current fh be the file written to. 11383 * The compound op structure is: 11384 * PUTFH(file), COMMIT 11385 */ 11386 static int 11387 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11388 { 11389 COMPOUND4args_clnt args; 11390 COMPOUND4res_clnt res; 11391 COMMIT4res *cm_res; 11392 nfs_argop4 argop[2]; 11393 nfs_resop4 *resop; 11394 int doqueue; 11395 mntinfo4_t *mi; 11396 rnode4_t *rp; 11397 cred_t *cred_otw = NULL; 11398 bool_t needrecov = FALSE; 11399 nfs4_recov_state_t recov_state; 11400 nfs4_open_stream_t *osp = NULL; 11401 bool_t first_time = TRUE; /* first time getting OTW cred */ 11402 bool_t last_time = FALSE; /* last time getting OTW cred */ 11403 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11404 11405 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11406 11407 rp = VTOR4(vp); 11408 11409 mi = VTOMI4(vp); 11410 recov_state.rs_flags = 0; 11411 recov_state.rs_num_retry_despite_err = 0; 11412 get_commit_cred: 11413 /* 11414 * Releases the osp, if a valid open stream is provided. 11415 * Puts a hold on the cred_otw and the new osp (if found). 
11416 */ 11417 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11418 &first_time, &last_time); 11419 args.ctag = TAG_COMMIT; 11420 recov_retry: 11421 /* 11422 * Commit ops: putfh file; commit 11423 */ 11424 args.array_len = 2; 11425 args.array = argop; 11426 11427 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11428 &recov_state, NULL); 11429 if (e.error) { 11430 crfree(cred_otw); 11431 if (osp != NULL) 11432 open_stream_rele(osp, rp); 11433 return (e.error); 11434 } 11435 11436 /* putfh directory */ 11437 argop[0].argop = OP_CPUTFH; 11438 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11439 11440 /* commit */ 11441 argop[1].argop = OP_COMMIT; 11442 argop[1].nfs_argop4_u.opcommit.offset = offset; 11443 argop[1].nfs_argop4_u.opcommit.count = count; 11444 11445 doqueue = 1; 11446 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11447 11448 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11449 if (!needrecov && e.error) { 11450 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11451 needrecov); 11452 crfree(cred_otw); 11453 if (e.error == EACCES && last_time == FALSE) 11454 goto get_commit_cred; 11455 if (osp != NULL) 11456 open_stream_rele(osp, rp); 11457 return (e.error); 11458 } 11459 11460 if (needrecov) { 11461 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11462 NULL, OP_COMMIT, NULL) == FALSE) { 11463 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11464 &recov_state, needrecov); 11465 if (!e.error) 11466 (void) xdr_free(xdr_COMPOUND4res_clnt, 11467 (caddr_t)&res); 11468 goto recov_retry; 11469 } 11470 if (e.error) { 11471 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11472 &recov_state, needrecov); 11473 crfree(cred_otw); 11474 if (osp != NULL) 11475 open_stream_rele(osp, rp); 11476 return (e.error); 11477 } 11478 /* fall through for res.status case */ 11479 } 11480 11481 if (res.status) { 11482 e.error = geterrno4(res.status); 11483 if (e.error == EACCES && last_time == FALSE) { 11484 crfree(cred_otw); 11485 
nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11486 &recov_state, needrecov); 11487 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11488 goto get_commit_cred; 11489 } 11490 /* 11491 * Can't do a nfs4_purge_stale_fh here because this 11492 * can cause a deadlock. nfs4_commit can 11493 * be called from nfs4_dispose which can be called 11494 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11495 * can call back to pvn_vplist_dirty. 11496 */ 11497 if (e.error == ESTALE) { 11498 mutex_enter(&rp->r_statelock); 11499 rp->r_flags |= R4STALE; 11500 if (!rp->r_error) 11501 rp->r_error = e.error; 11502 mutex_exit(&rp->r_statelock); 11503 PURGE_ATTRCACHE4(vp); 11504 } else { 11505 mutex_enter(&rp->r_statelock); 11506 if (!rp->r_error) 11507 rp->r_error = e.error; 11508 mutex_exit(&rp->r_statelock); 11509 } 11510 } else { 11511 ASSERT(rp->r_flags & R4HAVEVERF); 11512 resop = &res.array[1]; /* commit res */ 11513 cm_res = &resop->nfs_resop4_u.opcommit; 11514 mutex_enter(&rp->r_statelock); 11515 if (cm_res->writeverf == rp->r_writeverf) { 11516 mutex_exit(&rp->r_statelock); 11517 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11518 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11519 &recov_state, needrecov); 11520 crfree(cred_otw); 11521 if (osp != NULL) 11522 open_stream_rele(osp, rp); 11523 return (0); 11524 } 11525 nfs4_set_mod(vp); 11526 rp->r_writeverf = cm_res->writeverf; 11527 mutex_exit(&rp->r_statelock); 11528 e.error = NFS_VERF_MISMATCH; 11529 } 11530 11531 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11532 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11533 crfree(cred_otw); 11534 if (osp != NULL) 11535 open_stream_rele(osp, rp); 11536 11537 return (e.error); 11538 } 11539 11540 static void 11541 nfs4_set_mod(vnode_t *vp) 11542 { 11543 page_t *pp; 11544 kmutex_t *vphm; 11545 rnode4_t *rp; 11546 11547 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11548 11549 /* make sure we're looking at the master vnode, not a 
shadow */ 11550 11551 rp = VTOR4(vp); 11552 if (IS_SHADOW(vp, rp)) 11553 vp = RTOV4(rp); 11554 11555 vphm = page_vnode_mutex(vp); 11556 mutex_enter(vphm); 11557 /* 11558 * If there are no pages associated with this vnode, then 11559 * just return. 11560 */ 11561 if ((pp = vp->v_pages) == NULL) { 11562 mutex_exit(vphm); 11563 return; 11564 } 11565 11566 do { 11567 if (pp->p_fsdata != C_NOCOMMIT) { 11568 hat_setmod(pp); 11569 pp->p_fsdata = C_NOCOMMIT; 11570 } 11571 } while ((pp = pp->p_vpnext) != vp->v_pages); 11572 mutex_exit(vphm); 11573 } 11574 11575 /* 11576 * This function is used to gather a page list of the pages which 11577 * can be committed on the server. 11578 * 11579 * The calling thread must have set R4COMMIT. This bit is used to 11580 * serialize access to the commit structure in the rnode. As long 11581 * as the thread has set R4COMMIT, then it can manipulate the commit 11582 * structure without requiring any other locks. 11583 * 11584 * When this function is called from nfs4_dispose() the page passed 11585 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11586 * will skip it. This is not a problem since we initially add the 11587 * page to the r_commit page list. 11588 * 11589 */ 11590 static void 11591 nfs4_get_commit(vnode_t *vp) 11592 { 11593 rnode4_t *rp; 11594 page_t *pp; 11595 kmutex_t *vphm; 11596 11597 rp = VTOR4(vp); 11598 11599 ASSERT(rp->r_flags & R4COMMIT); 11600 11601 /* make sure we're looking at the master vnode, not a shadow */ 11602 11603 if (IS_SHADOW(vp, rp)) 11604 vp = RTOV4(rp); 11605 11606 vphm = page_vnode_mutex(vp); 11607 mutex_enter(vphm); 11608 11609 /* 11610 * If there are no pages associated with this vnode, then 11611 * just return. 11612 */ 11613 if ((pp = vp->v_pages) == NULL) { 11614 mutex_exit(vphm); 11615 return; 11616 } 11617 11618 /* 11619 * Step through all of the pages associated with this vnode 11620 * looking for pages which need to be committed. 
11621 */ 11622 do { 11623 /* 11624 * First short-cut everything (without the page_lock) 11625 * and see if this page does not need to be committed 11626 * or is modified if so then we'll just skip it. 11627 */ 11628 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11629 continue; 11630 11631 /* 11632 * Attempt to lock the page. If we can't, then 11633 * someone else is messing with it or we have been 11634 * called from nfs4_dispose and this is the page that 11635 * nfs4_dispose was called with.. anyway just skip it. 11636 */ 11637 if (!page_trylock(pp, SE_EXCL)) 11638 continue; 11639 11640 /* 11641 * Lets check again now that we have the page lock. 11642 */ 11643 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11644 page_unlock(pp); 11645 continue; 11646 } 11647 11648 /* this had better not be a free page */ 11649 ASSERT(PP_ISFREE(pp) == 0); 11650 11651 /* 11652 * The page needs to be committed and we locked it. 11653 * Update the base and length parameters and add it 11654 * to r_pages. 11655 */ 11656 if (rp->r_commit.c_pages == NULL) { 11657 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11658 rp->r_commit.c_commlen = PAGESIZE; 11659 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11660 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11661 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11662 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11663 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11664 <= pp->p_offset) { 11665 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11666 rp->r_commit.c_commbase + PAGESIZE; 11667 } 11668 page_add(&rp->r_commit.c_pages, pp); 11669 } while ((pp = pp->p_vpnext) != vp->v_pages); 11670 11671 mutex_exit(vphm); 11672 } 11673 11674 /* 11675 * This routine is used to gather together a page list of the pages 11676 * which are to be committed on the server. This routine must not 11677 * be called if the calling thread holds any locked pages. 11678 * 11679 * The calling thread must have set R4COMMIT. 
This bit is used to 11680 * serialize access to the commit structure in the rnode. As long 11681 * as the thread has set R4COMMIT, then it can manipulate the commit 11682 * structure without requiring any other locks. 11683 */ 11684 static void 11685 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11686 { 11687 11688 rnode4_t *rp; 11689 page_t *pp; 11690 u_offset_t end; 11691 u_offset_t off; 11692 ASSERT(len != 0); 11693 rp = VTOR4(vp); 11694 ASSERT(rp->r_flags & R4COMMIT); 11695 11696 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11697 11698 /* make sure we're looking at the master vnode, not a shadow */ 11699 11700 if (IS_SHADOW(vp, rp)) 11701 vp = RTOV4(rp); 11702 11703 /* 11704 * If there are no pages associated with this vnode, then 11705 * just return. 11706 */ 11707 if ((pp = vp->v_pages) == NULL) 11708 return; 11709 /* 11710 * Calculate the ending offset. 11711 */ 11712 end = soff + len; 11713 for (off = soff; off < end; off += PAGESIZE) { 11714 /* 11715 * Lookup each page by vp, offset. 11716 */ 11717 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11718 continue; 11719 /* 11720 * If this page does not need to be committed or is 11721 * modified, then just skip it. 11722 */ 11723 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11724 page_unlock(pp); 11725 continue; 11726 } 11727 11728 ASSERT(PP_ISFREE(pp) == 0); 11729 /* 11730 * The page needs to be committed and we locked it. 11731 * Update the base and length parameters and add it 11732 * to r_pages. 11733 */ 11734 if (rp->r_commit.c_pages == NULL) { 11735 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11736 rp->r_commit.c_commlen = PAGESIZE; 11737 } else { 11738 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11739 rp->r_commit.c_commbase + PAGESIZE; 11740 } 11741 page_add(&rp->r_commit.c_pages, pp); 11742 } 11743 } 11744 11745 /* 11746 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 11747 * Flushes and commits data to the server. 
11748 */ 11749 static int 11750 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 11751 { 11752 int error; 11753 verifier4 write_verf; 11754 rnode4_t *rp = VTOR4(vp); 11755 11756 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11757 11758 /* 11759 * Flush the data portion of the file and then commit any 11760 * portions which need to be committed. This may need to 11761 * be done twice if the server has changed state since 11762 * data was last written. The data will need to be 11763 * rewritten to the server and then a new commit done. 11764 * 11765 * In fact, this may need to be done several times if the 11766 * server is having problems and crashing while we are 11767 * attempting to do this. 11768 */ 11769 11770 top: 11771 /* 11772 * Do a flush based on the poff and plen arguments. This 11773 * will synchronously write out any modified pages in the 11774 * range specified by (poff, plen). This starts all of the 11775 * i/o operations which will be waited for in the next 11776 * call to nfs4_putpage 11777 */ 11778 11779 mutex_enter(&rp->r_statelock); 11780 write_verf = rp->r_writeverf; 11781 mutex_exit(&rp->r_statelock); 11782 11783 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr); 11784 if (error == EAGAIN) 11785 error = 0; 11786 11787 /* 11788 * Do a flush based on the poff and plen arguments. This 11789 * will synchronously write out any modified pages in the 11790 * range specified by (poff, plen) and wait until all of 11791 * the asynchronous i/o's in that range are done as well. 11792 */ 11793 if (!error) 11794 error = nfs4_putpage(vp, poff, plen, 0, cr); 11795 11796 if (error) 11797 return (error); 11798 11799 mutex_enter(&rp->r_statelock); 11800 if (rp->r_writeverf != write_verf) { 11801 mutex_exit(&rp->r_statelock); 11802 goto top; 11803 } 11804 mutex_exit(&rp->r_statelock); 11805 11806 /* 11807 * Now commit any pages which might need to be committed. 
11808 * If the error, NFS_VERF_MISMATCH, is returned, then 11809 * start over with the flush operation. 11810 */ 11811 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 11812 11813 if (error == NFS_VERF_MISMATCH) 11814 goto top; 11815 11816 return (error); 11817 } 11818 11819 /* 11820 * nfs4_commit_vp() will wait for other pending commits and 11821 * will either commit the whole file or a range, plen dictates 11822 * if we commit whole file. a value of zero indicates the whole 11823 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 11824 */ 11825 static int 11826 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 11827 cred_t *cr, int wait_on_writes) 11828 { 11829 rnode4_t *rp; 11830 page_t *plist; 11831 offset3 offset; 11832 count3 len; 11833 11834 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11835 11836 rp = VTOR4(vp); 11837 11838 /* 11839 * before we gather commitable pages make 11840 * sure there are no outstanding async writes 11841 */ 11842 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 11843 mutex_enter(&rp->r_statelock); 11844 while (rp->r_count > 0) { 11845 cv_wait(&rp->r_cv, &rp->r_statelock); 11846 } 11847 mutex_exit(&rp->r_statelock); 11848 } 11849 11850 /* 11851 * Set the `commit inprogress' state bit. We must 11852 * first wait until any current one finishes. 11853 */ 11854 mutex_enter(&rp->r_statelock); 11855 while (rp->r_flags & R4COMMIT) { 11856 rp->r_flags |= R4COMMITWAIT; 11857 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11858 rp->r_flags &= ~R4COMMITWAIT; 11859 } 11860 rp->r_flags |= R4COMMIT; 11861 mutex_exit(&rp->r_statelock); 11862 11863 /* 11864 * Gather all of the pages which need to be 11865 * committed. 11866 */ 11867 if (plen == 0) 11868 nfs4_get_commit(vp); 11869 else 11870 nfs4_get_commit_range(vp, poff, plen); 11871 11872 /* 11873 * Clear the `commit inprogress' bit and disconnect the 11874 * page list which was gathered by nfs4_get_commit. 
11875 */ 11876 plist = rp->r_commit.c_pages; 11877 rp->r_commit.c_pages = NULL; 11878 offset = rp->r_commit.c_commbase; 11879 len = rp->r_commit.c_commlen; 11880 mutex_enter(&rp->r_statelock); 11881 rp->r_flags &= ~R4COMMIT; 11882 cv_broadcast(&rp->r_commit.c_cv); 11883 mutex_exit(&rp->r_statelock); 11884 11885 /* 11886 * If any pages need to be committed, commit them and 11887 * then unlock them so that they can be freed some 11888 * time later. 11889 */ 11890 if (plist == NULL) 11891 return (0); 11892 11893 /* 11894 * No error occurred during the flush portion 11895 * of this operation, so now attempt to commit 11896 * the data to stable storage on the server. 11897 * 11898 * This will unlock all of the pages on the list. 11899 */ 11900 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 11901 } 11902 11903 static int 11904 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11905 cred_t *cr) 11906 { 11907 int error; 11908 page_t *pp; 11909 11910 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11911 11912 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 11913 11914 /* 11915 * If we got an error, then just unlock all of the pages 11916 * on the list. 11917 */ 11918 if (error) { 11919 while (plist != NULL) { 11920 pp = plist; 11921 page_sub(&plist, pp); 11922 page_unlock(pp); 11923 } 11924 return (error); 11925 } 11926 /* 11927 * We've tried as hard as we can to commit the data to stable 11928 * storage on the server. We just unlock the pages and clear 11929 * the commit required state. They will get freed later. 
11930 */ 11931 while (plist != NULL) { 11932 pp = plist; 11933 page_sub(&plist, pp); 11934 pp->p_fsdata = C_NOCOMMIT; 11935 page_unlock(pp); 11936 } 11937 11938 return (error); 11939 } 11940 11941 static void 11942 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11943 cred_t *cr) 11944 { 11945 11946 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 11947 } 11948 11949 /*ARGSUSED*/ 11950 static int 11951 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 11952 { 11953 int error = 0; 11954 mntinfo4_t *mi; 11955 vattr_t va; 11956 vsecattr_t nfsace4_vsap; 11957 11958 mi = VTOMI4(vp); 11959 if (nfs_zone() != mi->mi_zone) 11960 return (EIO); 11961 if (mi->mi_flags & MI4_ACL) { 11962 /* if we have a delegation, return it */ 11963 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 11964 (void) nfs4delegreturn(VTOR4(vp), 11965 NFS4_DR_REOPEN|NFS4_DR_PUSH); 11966 11967 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 11968 NFS4_ACL_SET); 11969 if (error) /* EINVAL */ 11970 return (error); 11971 11972 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 11973 /* 11974 * These are aclent_t type entries. 11975 */ 11976 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 11977 vp->v_type == VDIR, FALSE); 11978 if (error) 11979 return (error); 11980 } else { 11981 /* 11982 * These are ace_t type entries. 
11983 */ 11984 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 11985 FALSE); 11986 if (error) 11987 return (error); 11988 } 11989 bzero(&va, sizeof (va)); 11990 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 11991 vs_ace4_destroy(&nfsace4_vsap); 11992 return (error); 11993 } 11994 return (ENOSYS); 11995 } 11996 11997 static int 11998 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 11999 { 12000 int error; 12001 mntinfo4_t *mi; 12002 nfs4_ga_res_t gar; 12003 rnode4_t *rp = VTOR4(vp); 12004 12005 mi = VTOMI4(vp); 12006 if (nfs_zone() != mi->mi_zone) 12007 return (EIO); 12008 12009 bzero(&gar, sizeof (gar)); 12010 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12011 12012 /* 12013 * vsecattr->vsa_mask holds the original acl request mask. 12014 * This is needed when determining what to return. 12015 * (See: nfs4_create_getsecattr_return()) 12016 */ 12017 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12018 if (error) /* EINVAL */ 12019 return (error); 12020 12021 if (mi->mi_flags & MI4_ACL) { 12022 /* 12023 * Check if the data is cached and the cache is valid. If it 12024 * is we don't go over the wire. 12025 */ 12026 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12027 mutex_enter(&rp->r_statelock); 12028 if (rp->r_secattr != NULL) { 12029 error = nfs4_create_getsecattr_return( 12030 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12031 rp->r_attr.va_gid, 12032 vp->v_type == VDIR); 12033 if (!error) { /* error == 0 - Success! */ 12034 mutex_exit(&rp->r_statelock); 12035 return (error); 12036 } 12037 } 12038 mutex_exit(&rp->r_statelock); 12039 } 12040 12041 /* 12042 * The getattr otw call will always get both the acl, in 12043 * the form of a list of nfsace4's, and the number of acl 12044 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
12045 */ 12046 gar.n4g_va.va_mask = AT_ALL; 12047 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12048 if (error) { 12049 vs_ace4_destroy(&gar.n4g_vsa); 12050 if (error == ENOTSUP || error == EOPNOTSUPP) 12051 error = fs_fab_acl(vp, vsecattr, flag, cr); 12052 return (error); 12053 } 12054 12055 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12056 /* 12057 * No error was returned, but according to the response 12058 * bitmap, neither was an acl. 12059 */ 12060 vs_ace4_destroy(&gar.n4g_vsa); 12061 error = fs_fab_acl(vp, vsecattr, flag, cr); 12062 return (error); 12063 } 12064 12065 /* 12066 * Update the cache with the ACL. 12067 */ 12068 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12069 12070 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12071 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12072 vp->v_type == VDIR); 12073 vs_ace4_destroy(&gar.n4g_vsa); 12074 if ((error) && (vsecattr->vsa_mask & 12075 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12076 (error != EACCES)) { 12077 error = fs_fab_acl(vp, vsecattr, flag, cr); 12078 } 12079 return (error); 12080 } 12081 error = fs_fab_acl(vp, vsecattr, flag, cr); 12082 return (error); 12083 } 12084 12085 /* 12086 * The function returns: 12087 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12088 * - EINVAL if the passed in "acl_mask" is an invalid request. 12089 * 12090 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12091 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12092 * 12093 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12094 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12095 * - We have a count field set without the corresponding acl field set. (e.g. - 12096 * VSA_ACECNT is set, but VSA_ACE is not) 12097 */ 12098 static int 12099 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12100 { 12101 /* Shortcut the masks that are always valid. 
*/ 12102 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12103 return (0); 12104 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12105 return (0); 12106 12107 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12108 /* 12109 * We can't have any VSA_ACL type stuff in the mask now. 12110 */ 12111 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12112 VSA_DFACLCNT)) 12113 return (EINVAL); 12114 12115 if (op == NFS4_ACL_SET) { 12116 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12117 return (EINVAL); 12118 } 12119 } 12120 12121 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12122 /* 12123 * We can't have any VSA_ACE type stuff in the mask now. 12124 */ 12125 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12126 return (EINVAL); 12127 12128 if (op == NFS4_ACL_SET) { 12129 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12130 return (EINVAL); 12131 12132 if ((acl_mask & VSA_DFACLCNT) && 12133 !(acl_mask & VSA_DFACL)) 12134 return (EINVAL); 12135 } 12136 } 12137 return (0); 12138 } 12139 12140 /* 12141 * The theory behind creating the correct getsecattr return is simply this: 12142 * "Don't return anything that the caller is not expecting to have to free." 12143 */ 12144 static int 12145 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12146 uid_t uid, gid_t gid, int isdir) 12147 { 12148 int error = 0; 12149 /* Save the mask since the translators modify it. */ 12150 uint_t orig_mask = vsap->vsa_mask; 12151 12152 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12153 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12154 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12155 12156 if (error) 12157 return (error); 12158 12159 /* 12160 * If the caller only asked for the ace count (VSA_ACECNT) 12161 * don't give them the full acl (VSA_ACE), free it. 
12162 */ 12163 if (!orig_mask & VSA_ACE) { 12164 if (vsap->vsa_aclentp != NULL) { 12165 kmem_free(vsap->vsa_aclentp, 12166 vsap->vsa_aclcnt * sizeof (ace_t)); 12167 vsap->vsa_aclentp = NULL; 12168 } 12169 } 12170 vsap->vsa_mask = orig_mask; 12171 12172 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12173 VSA_DFACLCNT)) { 12174 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12175 isdir, FALSE, 12176 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12177 12178 if (error) 12179 return (error); 12180 12181 /* 12182 * If the caller only asked for the acl count (VSA_ACLCNT) 12183 * and/or the default acl count (VSA_DFACLCNT) don't give them 12184 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 12185 */ 12186 if (!orig_mask & VSA_ACL) { 12187 if (vsap->vsa_aclentp != NULL) { 12188 kmem_free(vsap->vsa_aclentp, 12189 vsap->vsa_aclcnt * sizeof (aclent_t)); 12190 vsap->vsa_aclentp = NULL; 12191 } 12192 } 12193 12194 if (!orig_mask & VSA_DFACL) { 12195 if (vsap->vsa_dfaclentp != NULL) { 12196 kmem_free(vsap->vsa_dfaclentp, 12197 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12198 vsap->vsa_dfaclentp = NULL; 12199 } 12200 } 12201 vsap->vsa_mask = orig_mask; 12202 } 12203 return (0); 12204 } 12205 12206 static int 12207 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr) 12208 { 12209 int error; 12210 12211 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12212 return (EIO); 12213 /* 12214 * check for valid cmd parameter 12215 */ 12216 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12217 return (EINVAL); 12218 12219 /* 12220 * Check access permissions 12221 */ 12222 if ((cmd & F_SHARE) && 12223 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12224 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12225 return (EBADF); 12226 12227 /* 12228 * If the filesystem is mounted using local locking, pass the 12229 * request off to the local share code. 
12230 */ 12231 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12232 return (fs_shrlock(vp, cmd, shr, flag, cr)); 12233 12234 switch (cmd) { 12235 case F_SHARE: 12236 case F_UNSHARE: 12237 /* 12238 * This will be properly implemented later, 12239 * see RFE: 4823948 . 12240 */ 12241 error = EAGAIN; 12242 break; 12243 12244 case F_HASREMOTELOCKS: 12245 /* 12246 * NFS client can't store remote locks itself 12247 */ 12248 shr->s_access = 0; 12249 error = 0; 12250 break; 12251 12252 default: 12253 error = EINVAL; 12254 break; 12255 } 12256 12257 return (error); 12258 } 12259 12260 /* 12261 * Common code called by directory ops to update the attrcache 12262 */ 12263 static int 12264 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12265 hrtime_t t, vnode_t *vp, cred_t *cr) 12266 { 12267 int error = 0; 12268 12269 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12270 12271 if (status != NFS4_OK) { 12272 /* getattr not done or failed */ 12273 PURGE_ATTRCACHE4(vp); 12274 return (error); 12275 } 12276 12277 if (garp) { 12278 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12279 } else { 12280 PURGE_ATTRCACHE4(vp); 12281 } 12282 return (error); 12283 } 12284 12285 /* 12286 * Update directory caches for directory modification ops (link, rename, etc.) 12287 * When dinfo is NULL, manage dircaches in the old way. 12288 */ 12289 static void 12290 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12291 dirattr_info_t *dinfo) 12292 { 12293 rnode4_t *drp = VTOR4(dvp); 12294 12295 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12296 12297 /* Purge rddir cache for dir since it changed */ 12298 if (drp->r_dir != NULL) 12299 nfs4_purge_rddir_cache(dvp); 12300 12301 /* 12302 * If caller provided dinfo, then use it to manage dir caches. 
12303 */ 12304 if (dinfo != NULL) { 12305 if (vp != NULL) { 12306 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12307 if (!VTOR4(vp)->created_v4) { 12308 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12309 dnlc_update(dvp, nm, vp); 12310 } else { 12311 /* 12312 * XXX don't update if the created_v4 flag is 12313 * set 12314 */ 12315 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12316 NFS4_DEBUG(nfs4_client_state_debug, 12317 (CE_NOTE, "nfs4_update_dircaches: " 12318 "don't update dnlc: created_v4 flag")); 12319 } 12320 } 12321 12322 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12323 dinfo->di_cred, FALSE, cinfo); 12324 12325 return; 12326 } 12327 12328 /* 12329 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12330 * Since caller modified dir but didn't receive post-dirmod-op dir 12331 * attrs, the dir's attrs must be purged. 12332 * 12333 * XXX this check and dnlc update/purge should really be atomic, 12334 * XXX but can't use rnode statelock because it'll deadlock in 12335 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12336 * XXX does occur. 12337 * 12338 * XXX We also may want to check that atomic is true in the 12339 * XXX change_info struct. If it is not, the change_info may 12340 * XXX reflect changes by more than one clients which means that 12341 * XXX our cache may not be valid. 
12342 */ 12343 PURGE_ATTRCACHE4(dvp); 12344 if (drp->r_change == cinfo->before) { 12345 /* no changes took place in the directory prior to our link */ 12346 if (vp != NULL) { 12347 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12348 if (!VTOR4(vp)->created_v4) { 12349 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12350 dnlc_update(dvp, nm, vp); 12351 } else { 12352 /* 12353 * XXX dont' update if the created_v4 flag 12354 * is set 12355 */ 12356 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12357 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12358 "nfs4_update_dircaches: don't" 12359 " update dnlc: created_v4 flag")); 12360 } 12361 } 12362 } else { 12363 /* Another client modified directory - purge its dnlc cache */ 12364 dnlc_purge_vp(dvp); 12365 } 12366 } 12367 12368 /* 12369 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12370 * file. 12371 * 12372 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12373 * file (ie: client recovery) and otherwise set to FALSE. 12374 * 12375 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12376 * initiated) calling functions. 12377 * 12378 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12379 * of resending a 'lost' open request. 12380 * 12381 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12382 * server that hands out BAD_SEQID on open confirm. 12383 * 12384 * Errors are returned via the nfs4_error_t parameter. 
12385 */ 12386 void 12387 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12388 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12389 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12390 { 12391 COMPOUND4args_clnt args; 12392 COMPOUND4res_clnt res; 12393 nfs_argop4 argop[2]; 12394 nfs_resop4 *resop; 12395 int doqueue = 1; 12396 mntinfo4_t *mi; 12397 OPEN_CONFIRM4args *open_confirm_args; 12398 int needrecov; 12399 12400 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12401 #if DEBUG 12402 mutex_enter(&oop->oo_lock); 12403 ASSERT(oop->oo_seqid_inuse); 12404 mutex_exit(&oop->oo_lock); 12405 #endif 12406 12407 recov_retry_confirm: 12408 nfs4_error_zinit(ep); 12409 *retry_open = FALSE; 12410 12411 if (resend) 12412 args.ctag = TAG_OPEN_CONFIRM_LOST; 12413 else 12414 args.ctag = TAG_OPEN_CONFIRM; 12415 12416 args.array_len = 2; 12417 args.array = argop; 12418 12419 /* putfh target fh */ 12420 argop[0].argop = OP_CPUTFH; 12421 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12422 12423 argop[1].argop = OP_OPEN_CONFIRM; 12424 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12425 12426 (*seqid) += 1; 12427 open_confirm_args->seqid = *seqid; 12428 open_confirm_args->open_stateid = *stateid; 12429 12430 mi = VTOMI4(vp); 12431 12432 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12433 12434 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12435 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12436 } 12437 12438 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12439 if (!needrecov && ep->error) 12440 return; 12441 12442 if (needrecov) { 12443 bool_t abort = FALSE; 12444 12445 if (reopening_file == FALSE) { 12446 nfs4_bseqid_entry_t *bsep = NULL; 12447 12448 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12449 bsep = nfs4_create_bseqid_entry(oop, NULL, 12450 vp, 0, args.ctag, 12451 open_confirm_args->seqid); 12452 12453 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12454 NULL, NULL, NULL, 
OP_OPEN_CONFIRM, bsep); 12455 if (bsep) { 12456 kmem_free(bsep, sizeof (*bsep)); 12457 if (num_bseqid_retryp && 12458 --(*num_bseqid_retryp) == 0) 12459 abort = TRUE; 12460 } 12461 } 12462 if ((ep->error == ETIMEDOUT || 12463 res.status == NFS4ERR_RESOURCE) && 12464 abort == FALSE && resend == FALSE) { 12465 if (!ep->error) 12466 (void) xdr_free(xdr_COMPOUND4res_clnt, 12467 (caddr_t)&res); 12468 12469 delay(SEC_TO_TICK(confirm_retry_sec)); 12470 goto recov_retry_confirm; 12471 } 12472 /* State may have changed so retry the entire OPEN op */ 12473 if (abort == FALSE) 12474 *retry_open = TRUE; 12475 else 12476 *retry_open = FALSE; 12477 if (!ep->error) 12478 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12479 return; 12480 } 12481 12482 if (res.status) { 12483 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12484 return; 12485 } 12486 12487 resop = &res.array[1]; /* open confirm res */ 12488 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12489 stateid, sizeof (*stateid)); 12490 12491 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12492 } 12493 12494 /* 12495 * Return the credentials associated with a client state object. The 12496 * caller is responsible for freeing the credentials. 12497 */ 12498 12499 static cred_t * 12500 state_to_cred(nfs4_open_stream_t *osp) 12501 { 12502 cred_t *cr; 12503 12504 /* 12505 * It's ok to not lock the open stream and open owner to get 12506 * the oo_cred since this is only written once (upon creation) 12507 * and will not change. 12508 */ 12509 cr = osp->os_open_owner->oo_cred; 12510 crhold(cr); 12511 12512 return (cr); 12513 } 12514 12515 /* 12516 * nfs4_find_sysid 12517 * 12518 * Find the sysid for the knetconfig associated with the given mi. 
12519 */ 12520 static struct lm_sysid * 12521 nfs4_find_sysid(mntinfo4_t *mi) 12522 { 12523 ASSERT(nfs_zone() == mi->mi_zone); 12524 12525 /* 12526 * Switch from RDMA knconf to original mount knconf 12527 */ 12528 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12529 mi->mi_curr_serv->sv_hostname, NULL)); 12530 } 12531 12532 #ifdef DEBUG 12533 /* 12534 * Return a string version of the call type for easy reading. 12535 */ 12536 static char * 12537 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12538 { 12539 switch (ctype) { 12540 case NFS4_LCK_CTYPE_NORM: 12541 return ("NORMAL"); 12542 case NFS4_LCK_CTYPE_RECLAIM: 12543 return ("RECLAIM"); 12544 case NFS4_LCK_CTYPE_RESEND: 12545 return ("RESEND"); 12546 case NFS4_LCK_CTYPE_REINSTATE: 12547 return ("REINSTATE"); 12548 default: 12549 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12550 "type %d", ctype); 12551 return (""); 12552 } 12553 } 12554 #endif 12555 12556 /* 12557 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12558 * Unlock requests don't have an over-the-wire locktype, so we just return 12559 * something non-threatening. 12560 */ 12561 12562 static nfs_lock_type4 12563 flk_to_locktype(int cmd, int l_type) 12564 { 12565 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12566 12567 switch (l_type) { 12568 case F_UNLCK: 12569 return (READ_LT); 12570 case F_RDLCK: 12571 if (cmd == F_SETLK) 12572 return (READ_LT); 12573 else 12574 return (READW_LT); 12575 case F_WRLCK: 12576 if (cmd == F_SETLK) 12577 return (WRITE_LT); 12578 else 12579 return (WRITEW_LT); 12580 } 12581 panic("flk_to_locktype"); 12582 /*NOTREACHED*/ 12583 } 12584 12585 /* 12586 * Do some preliminary checks for nfs4frlock. 
12587 */ 12588 static int 12589 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12590 u_offset_t offset) 12591 { 12592 int error = 0; 12593 12594 /* 12595 * If we are setting a lock, check that the file is opened 12596 * with the correct mode. 12597 */ 12598 if (cmd == F_SETLK || cmd == F_SETLKW) { 12599 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12600 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12601 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12602 "nfs4frlock_validate_args: file was opened with " 12603 "incorrect mode")); 12604 return (EBADF); 12605 } 12606 } 12607 12608 /* Convert the offset. It may need to be restored before returning. */ 12609 if (error = convoff(vp, flk, 0, offset)) { 12610 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12611 "nfs4frlock_validate_args: convoff => error= %d\n", 12612 error)); 12613 return (error); 12614 } 12615 12616 return (error); 12617 } 12618 12619 /* 12620 * Set the flock64's lm_sysid for nfs4frlock. 12621 */ 12622 static int 12623 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12624 { 12625 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12626 12627 /* Find the lm_sysid */ 12628 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12629 12630 if (*lspp == NULL) { 12631 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12632 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12633 return (ENOLCK); 12634 } 12635 12636 flk->l_sysid = lm_sysidt(*lspp); 12637 12638 return (0); 12639 } 12640 12641 /* 12642 * Do the remaining preliminary setup for nfs4frlock. 12643 */ 12644 static void 12645 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12646 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12647 cred_t **cred_otw) 12648 { 12649 /* 12650 * set tick_delay to the base delay time. 
12651 * (NFS4_BASE_WAIT_TIME is in secs) 12652 */ 12653 12654 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12655 12656 /* 12657 * If lock is relative to EOF, we need the newest length of the 12658 * file. Therefore invalidate the ATTR_CACHE. 12659 */ 12660 12661 *whencep = flk->l_whence; 12662 12663 if (*whencep == 2) /* SEEK_END */ 12664 PURGE_ATTRCACHE4(vp); 12665 12666 recov_statep->rs_flags = 0; 12667 recov_statep->rs_num_retry_despite_err = 0; 12668 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12669 } 12670 12671 /* 12672 * Initialize and allocate the data structures necessary for 12673 * the nfs4frlock call. 12674 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12675 */ 12676 static void 12677 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12678 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12679 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12680 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12681 { 12682 int argoplist_size; 12683 int num_ops = 2; 12684 12685 *retry = FALSE; 12686 *did_start_fop = FALSE; 12687 *skip_get_err = FALSE; 12688 lost_rqstp->lr_op = 0; 12689 argoplist_size = num_ops * sizeof (nfs_argop4); 12690 /* fill array with zero */ 12691 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12692 12693 *argspp = argsp; 12694 *respp = NULL; 12695 12696 argsp->array_len = num_ops; 12697 argsp->array = *argopp; 12698 12699 /* initialize in case of error; will get real value down below */ 12700 argsp->ctag = TAG_NONE; 12701 12702 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12703 *op_hintp = OH_LOCKU; 12704 else 12705 *op_hintp = OH_OTHER; 12706 } 12707 12708 /* 12709 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12710 * the proper nfs4_server_t for this instance of nfs4frlock. 12711 * Returns 0 (success) or an errno value. 
12712 */ 12713 static int 12714 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12715 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12716 bool_t *did_start_fop, bool_t *startrecovp) 12717 { 12718 int error = 0; 12719 rnode4_t *rp; 12720 12721 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12722 12723 if (ctype == NFS4_LCK_CTYPE_NORM) { 12724 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12725 recov_statep, startrecovp); 12726 if (error) 12727 return (error); 12728 *did_start_fop = TRUE; 12729 } else { 12730 *did_start_fop = FALSE; 12731 *startrecovp = FALSE; 12732 } 12733 12734 if (!error) { 12735 rp = VTOR4(vp); 12736 12737 /* If the file failed recovery, just quit. */ 12738 mutex_enter(&rp->r_statelock); 12739 if (rp->r_flags & R4RECOVERR) { 12740 error = EIO; 12741 } 12742 mutex_exit(&rp->r_statelock); 12743 } 12744 12745 return (error); 12746 } 12747 12748 /* 12749 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 12750 * resend nfs4frlock call is initiated by the recovery framework. 12751 * Acquires the lop and oop seqid synchronization. 
12752 */ 12753 static void 12754 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 12755 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 12756 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 12757 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 12758 { 12759 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 12760 int error; 12761 12762 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 12763 (CE_NOTE, 12764 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 12765 ASSERT(resend_rqstp != NULL); 12766 ASSERT(resend_rqstp->lr_op == OP_LOCK || 12767 resend_rqstp->lr_op == OP_LOCKU); 12768 12769 *oopp = resend_rqstp->lr_oop; 12770 if (resend_rqstp->lr_oop) { 12771 open_owner_hold(resend_rqstp->lr_oop); 12772 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 12773 ASSERT(error == 0); /* recov thread always succeeds */ 12774 } 12775 12776 /* Must resend this lost lock/locku request. */ 12777 ASSERT(resend_rqstp->lr_lop != NULL); 12778 *lopp = resend_rqstp->lr_lop; 12779 lock_owner_hold(resend_rqstp->lr_lop); 12780 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 12781 ASSERT(error == 0); /* recov thread always succeeds */ 12782 12783 *ospp = resend_rqstp->lr_osp; 12784 if (*ospp) 12785 open_stream_hold(resend_rqstp->lr_osp); 12786 12787 if (resend_rqstp->lr_op == OP_LOCK) { 12788 LOCK4args *lock_args; 12789 12790 argop->argop = OP_LOCK; 12791 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 12792 lock_args->locktype = resend_rqstp->lr_locktype; 12793 lock_args->reclaim = 12794 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 12795 lock_args->offset = resend_rqstp->lr_flk->l_start; 12796 lock_args->length = resend_rqstp->lr_flk->l_len; 12797 if (lock_args->length == 0) 12798 lock_args->length = ~lock_args->length; 12799 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 12800 mi2clientid(mi), &lock_args->locker); 12801 12802 switch (resend_rqstp->lr_ctype) { 12803 case 
NFS4_LCK_CTYPE_RESEND: 12804 argsp->ctag = TAG_LOCK_RESEND; 12805 break; 12806 case NFS4_LCK_CTYPE_REINSTATE: 12807 argsp->ctag = TAG_LOCK_REINSTATE; 12808 break; 12809 case NFS4_LCK_CTYPE_RECLAIM: 12810 argsp->ctag = TAG_LOCK_RECLAIM; 12811 break; 12812 default: 12813 argsp->ctag = TAG_LOCK_UNKNOWN; 12814 break; 12815 } 12816 } else { 12817 LOCKU4args *locku_args; 12818 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 12819 12820 argop->argop = OP_LOCKU; 12821 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 12822 locku_args->locktype = READ_LT; 12823 locku_args->seqid = lop->lock_seqid + 1; 12824 mutex_enter(&lop->lo_lock); 12825 locku_args->lock_stateid = lop->lock_stateid; 12826 mutex_exit(&lop->lo_lock); 12827 locku_args->offset = resend_rqstp->lr_flk->l_start; 12828 locku_args->length = resend_rqstp->lr_flk->l_len; 12829 if (locku_args->length == 0) 12830 locku_args->length = ~locku_args->length; 12831 12832 switch (resend_rqstp->lr_ctype) { 12833 case NFS4_LCK_CTYPE_RESEND: 12834 argsp->ctag = TAG_LOCKU_RESEND; 12835 break; 12836 case NFS4_LCK_CTYPE_REINSTATE: 12837 argsp->ctag = TAG_LOCKU_REINSTATE; 12838 break; 12839 default: 12840 argsp->ctag = TAG_LOCK_UNKNOWN; 12841 break; 12842 } 12843 } 12844 } 12845 12846 /* 12847 * Setup the LOCKT4 arguments. 12848 */ 12849 static void 12850 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 12851 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 12852 rnode4_t *rp) 12853 { 12854 LOCKT4args *lockt_args; 12855 12856 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 12857 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 12858 argop->argop = OP_LOCKT; 12859 argsp->ctag = TAG_LOCKT; 12860 lockt_args = &argop->nfs_argop4_u.oplockt; 12861 12862 /* 12863 * The locktype will be READ_LT unless it's 12864 * a write lock. We do this because the Solaris 12865 * system call allows the combination of 12866 * F_UNLCK and F_GETLK* and so in that case the 12867 * unlock is mapped to a read. 
12868 */ 12869 if (flk->l_type == F_WRLCK) 12870 lockt_args->locktype = WRITE_LT; 12871 else 12872 lockt_args->locktype = READ_LT; 12873 12874 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 12875 /* set the lock owner4 args */ 12876 nfs4_setlockowner_args(&lockt_args->owner, rp, 12877 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 12878 flk->l_pid); 12879 lockt_args->offset = flk->l_start; 12880 lockt_args->length = flk->l_len; 12881 if (flk->l_len == 0) 12882 lockt_args->length = ~lockt_args->length; 12883 12884 *lockt_argsp = lockt_args; 12885 } 12886 12887 /* 12888 * If the client is holding a delegation, and the open stream to be used 12889 * with this lock request is a delegation open stream, then re-open the stream. 12890 * Sets the nfs4_error_t to all zeros unless the open stream has already 12891 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 12892 * means the caller should retry (like a recovery retry). 12893 */ 12894 static void 12895 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 12896 { 12897 open_delegation_type4 dt; 12898 bool_t reopen_needed, force; 12899 nfs4_open_stream_t *osp; 12900 open_claim_type4 oclaim; 12901 rnode4_t *rp = VTOR4(vp); 12902 mntinfo4_t *mi = VTOMI4(vp); 12903 12904 ASSERT(nfs_zone() == mi->mi_zone); 12905 12906 nfs4_error_zinit(ep); 12907 12908 mutex_enter(&rp->r_statev4_lock); 12909 dt = rp->r_deleg_type; 12910 mutex_exit(&rp->r_statev4_lock); 12911 12912 if (dt != OPEN_DELEGATE_NONE) { 12913 nfs4_open_owner_t *oop; 12914 12915 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 12916 if (!oop) { 12917 ep->stat = NFS4ERR_IO; 12918 return; 12919 } 12920 /* returns with 'os_sync_lock' held */ 12921 osp = find_open_stream(oop, rp); 12922 if (!osp) { 12923 open_owner_rele(oop); 12924 ep->stat = NFS4ERR_IO; 12925 return; 12926 } 12927 12928 if (osp->os_failed_reopen) { 12929 NFS4_DEBUG((nfs4_open_stream_debug || 12930 nfs4_client_lock_debug), (CE_NOTE, 12931 
"nfs4frlock_check_deleg: os_failed_reopen set " 12932 "for osp %p, cr %p, rp %s", (void *)osp, 12933 (void *)cr, rnode4info(rp))); 12934 mutex_exit(&osp->os_sync_lock); 12935 open_stream_rele(osp, rp); 12936 open_owner_rele(oop); 12937 ep->stat = NFS4ERR_IO; 12938 return; 12939 } 12940 12941 /* 12942 * Determine whether a reopen is needed. If this 12943 * is a delegation open stream, then send the open 12944 * to the server to give visibility to the open owner. 12945 * Even if it isn't a delegation open stream, we need 12946 * to check if the previous open CLAIM_DELEGATE_CUR 12947 * was sufficient. 12948 */ 12949 12950 reopen_needed = osp->os_delegation || 12951 ((lt == F_RDLCK && 12952 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 12953 (lt == F_WRLCK && 12954 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 12955 12956 mutex_exit(&osp->os_sync_lock); 12957 open_owner_rele(oop); 12958 12959 if (reopen_needed) { 12960 /* 12961 * Always use CLAIM_PREVIOUS after server reboot. 12962 * The server will reject CLAIM_DELEGATE_CUR if 12963 * it is used during the grace period. 12964 */ 12965 mutex_enter(&mi->mi_lock); 12966 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 12967 oclaim = CLAIM_PREVIOUS; 12968 force = TRUE; 12969 } else { 12970 oclaim = CLAIM_DELEGATE_CUR; 12971 force = FALSE; 12972 } 12973 mutex_exit(&mi->mi_lock); 12974 12975 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 12976 if (ep->error == EAGAIN) { 12977 nfs4_error_zinit(ep); 12978 ep->stat = NFS4ERR_DELAY; 12979 } 12980 } 12981 open_stream_rele(osp, rp); 12982 osp = NULL; 12983 } 12984 } 12985 12986 /* 12987 * Setup the LOCKU4 arguments. 12988 * Returns errors via the nfs4_error_t. 12989 * NFS4_OK no problems. *go_otwp is TRUE if call should go 12990 * over-the-wire. The caller must release the 12991 * reference on *lopp. 12992 * NFS4ERR_DELAY caller should retry (like recovery retry) 12993 * (other) unrecoverable error. 
12994 */ 12995 static void 12996 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 12997 LOCKU4args **locku_argsp, flock64_t *flk, 12998 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 12999 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13000 bool_t *skip_get_err, bool_t *go_otwp) 13001 { 13002 nfs4_lock_owner_t *lop = NULL; 13003 LOCKU4args *locku_args; 13004 pid_t pid; 13005 bool_t is_spec = FALSE; 13006 rnode4_t *rp = VTOR4(vp); 13007 13008 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13009 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13010 13011 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13012 if (ep->error || ep->stat) 13013 return; 13014 13015 argop->argop = OP_LOCKU; 13016 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13017 argsp->ctag = TAG_LOCKU_REINSTATE; 13018 else 13019 argsp->ctag = TAG_LOCKU; 13020 locku_args = &argop->nfs_argop4_u.oplocku; 13021 *locku_argsp = locku_args; 13022 13023 /* 13024 * XXX what should locku_args->locktype be? 13025 * setting to ALWAYS be READ_LT so at least 13026 * it is a valid locktype. 13027 */ 13028 13029 locku_args->locktype = READ_LT; 13030 13031 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13032 flk->l_pid; 13033 13034 /* 13035 * Get the lock owner stateid. If no lock owner 13036 * exists, return success. 13037 */ 13038 lop = find_lock_owner(rp, pid, LOWN_ANY); 13039 *lopp = lop; 13040 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13041 is_spec = TRUE; 13042 if (!lop || is_spec) { 13043 /* 13044 * No lock owner so no locks to unlock. 13045 * Return success. If there was a failed 13046 * reclaim earlier, the lock might still be 13047 * registered with the local locking code, 13048 * so notify it of the unlock. 13049 * 13050 * If the lockowner is using a special stateid, 13051 * then the original lock request (that created 13052 * this lockowner) was never successful, so we 13053 * have no lock to undo OTW. 
13054 */ 13055 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13056 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13057 "(%ld) so return success", (long)pid)); 13058 13059 if (ctype == NFS4_LCK_CTYPE_NORM) 13060 flk->l_pid = curproc->p_pid; 13061 nfs4_register_lock_locally(vp, flk, flag, offset); 13062 /* 13063 * Release our hold and NULL out so final_cleanup 13064 * doesn't try to end a lock seqid sync we 13065 * never started. 13066 */ 13067 if (is_spec) { 13068 lock_owner_rele(lop); 13069 *lopp = NULL; 13070 } 13071 *skip_get_err = TRUE; 13072 *go_otwp = FALSE; 13073 return; 13074 } 13075 13076 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13077 if (ep->error == EAGAIN) { 13078 lock_owner_rele(lop); 13079 *lopp = NULL; 13080 return; 13081 } 13082 13083 mutex_enter(&lop->lo_lock); 13084 locku_args->lock_stateid = lop->lock_stateid; 13085 mutex_exit(&lop->lo_lock); 13086 locku_args->seqid = lop->lock_seqid + 1; 13087 13088 /* leave the ref count on lop, rele after RPC call */ 13089 13090 locku_args->offset = flk->l_start; 13091 locku_args->length = flk->l_len; 13092 if (flk->l_len == 0) 13093 locku_args->length = ~locku_args->length; 13094 13095 *go_otwp = TRUE; 13096 } 13097 13098 /* 13099 * Setup the LOCK4 arguments. 13100 * 13101 * Returns errors via the nfs4_error_t. 
13102 * NFS4_OK no problems 13103 * NFS4ERR_DELAY caller should retry (like recovery retry) 13104 * (other) unrecoverable error 13105 */ 13106 static void 13107 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13108 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13109 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13110 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13111 { 13112 LOCK4args *lock_args; 13113 nfs4_open_owner_t *oop = NULL; 13114 nfs4_open_stream_t *osp = NULL; 13115 nfs4_lock_owner_t *lop = NULL; 13116 pid_t pid; 13117 rnode4_t *rp = VTOR4(vp); 13118 13119 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13120 13121 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13122 if (ep->error || ep->stat != NFS4_OK) 13123 return; 13124 13125 argop->argop = OP_LOCK; 13126 if (ctype == NFS4_LCK_CTYPE_NORM) 13127 argsp->ctag = TAG_LOCK; 13128 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13129 argsp->ctag = TAG_RELOCK; 13130 else 13131 argsp->ctag = TAG_LOCK_REINSTATE; 13132 lock_args = &argop->nfs_argop4_u.oplock; 13133 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13134 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13135 /* 13136 * Get the lock owner. If no lock owner exists, 13137 * create a 'temporary' one and grab the open seqid 13138 * synchronization (which puts a hold on the open 13139 * owner and open stream). 13140 * This also grabs the lock seqid synchronization. 13141 */ 13142 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13143 ep->stat = 13144 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13145 13146 if (ep->stat != NFS4_OK) 13147 goto out; 13148 13149 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13150 &lock_args->locker); 13151 13152 lock_args->offset = flk->l_start; 13153 lock_args->length = flk->l_len; 13154 if (flk->l_len == 0) 13155 lock_args->length = ~lock_args->length; 13156 *lock_argsp = lock_args; 13157 out: 13158 *oopp = oop; 13159 *ospp = osp; 13160 *lopp = lop; 13161 } 13162 13163 /* 13164 * After we get the reply from the server, record the proper information 13165 * for possible resend lock requests. 13166 * 13167 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13168 */ 13169 static void 13170 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13171 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13172 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13173 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13174 { 13175 bool_t unlock = (flk->l_type == F_UNLCK); 13176 13177 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13178 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13179 ctype == NFS4_LCK_CTYPE_REINSTATE); 13180 13181 if (error != 0 && !unlock) { 13182 NFS4_DEBUG((nfs4_lost_rqst_debug || 13183 nfs4_client_lock_debug), (CE_NOTE, 13184 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13185 " for lop %p", (void *)lop)); 13186 ASSERT(lop != NULL); 13187 mutex_enter(&lop->lo_lock); 13188 lop->lo_pending_rqsts = 1; 13189 mutex_exit(&lop->lo_lock); 13190 } 13191 13192 lost_rqstp->lr_putfirst = FALSE; 13193 lost_rqstp->lr_op = 0; 13194 13195 /* 13196 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13197 * recovery purposes so that the lock request that was sent 13198 * can be saved and re-issued later. Ditto for EIO from a forced 13199 * unmount. 
This is done to have the client's local locking state 13200 * match the v4 server's state; that is, the request was 13201 * potentially received and accepted by the server but the client 13202 * thinks it was not. 13203 */ 13204 if (error == ETIMEDOUT || error == EINTR || 13205 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13206 NFS4_DEBUG((nfs4_lost_rqst_debug || 13207 nfs4_client_lock_debug), (CE_NOTE, 13208 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13209 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13210 (void *)lop, (void *)oop, (void *)osp)); 13211 if (unlock) 13212 lost_rqstp->lr_op = OP_LOCKU; 13213 else { 13214 lost_rqstp->lr_op = OP_LOCK; 13215 lost_rqstp->lr_locktype = locktype; 13216 } 13217 /* 13218 * Objects are held and rele'd via the recovery code. 13219 * See nfs4_save_lost_rqst. 13220 */ 13221 lost_rqstp->lr_vp = vp; 13222 lost_rqstp->lr_dvp = NULL; 13223 lost_rqstp->lr_oop = oop; 13224 lost_rqstp->lr_osp = osp; 13225 lost_rqstp->lr_lop = lop; 13226 lost_rqstp->lr_cr = cr; 13227 switch (ctype) { 13228 case NFS4_LCK_CTYPE_NORM: 13229 flk->l_pid = ttoproc(curthread)->p_pid; 13230 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13231 break; 13232 case NFS4_LCK_CTYPE_REINSTATE: 13233 lost_rqstp->lr_putfirst = TRUE; 13234 lost_rqstp->lr_ctype = ctype; 13235 break; 13236 default: 13237 break; 13238 } 13239 lost_rqstp->lr_flk = flk; 13240 } 13241 } 13242 13243 /* 13244 * Update lop's seqid. Also update the seqid stored in a resend request, 13245 * if any. (Some recovery errors increment the seqid, and we may have to 13246 * send the resend request again.) 
13247 */ 13248 13249 static void 13250 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13251 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13252 { 13253 if (lock_args) { 13254 if (lock_args->locker.new_lock_owner == TRUE) 13255 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13256 else { 13257 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13258 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13259 } 13260 } else if (locku_args) { 13261 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13262 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13263 } 13264 } 13265 13266 /* 13267 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13268 * COMPOUND4 args/res for calls that need to retry. 13269 * Switches the *cred_otwp to base_cr. 13270 */ 13271 static void 13272 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13273 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13274 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13275 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13276 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13277 { 13278 nfs4_open_owner_t *oop = *oopp; 13279 nfs4_open_stream_t *osp = *ospp; 13280 nfs4_lock_owner_t *lop = *lopp; 13281 nfs_argop4 *argop = (*argspp)->array; 13282 13283 if (*did_start_fop) { 13284 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13285 needrecov); 13286 *did_start_fop = FALSE; 13287 } 13288 ASSERT((*argspp)->array_len == 2); 13289 if (argop[1].argop == OP_LOCK) 13290 nfs4args_lock_free(&argop[1]); 13291 else if (argop[1].argop == OP_LOCKT) 13292 nfs4args_lockt_free(&argop[1]); 13293 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13294 if (!error) 13295 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13296 *argspp = NULL; 13297 *respp = NULL; 13298 13299 if (lop) { 13300 nfs4_end_lock_seqid_sync(lop); 13301 lock_owner_rele(lop); 13302 *lopp = NULL; 13303 } 13304 13305 /* need to free up 
the reference on osp for lock args */ 13306 if (osp != NULL) { 13307 open_stream_rele(osp, VTOR4(vp)); 13308 *ospp = NULL; 13309 } 13310 13311 /* need to free up the reference on oop for lock args */ 13312 if (oop != NULL) { 13313 nfs4_end_open_seqid_sync(oop); 13314 open_owner_rele(oop); 13315 *oopp = NULL; 13316 } 13317 13318 crfree(*cred_otwp); 13319 *cred_otwp = base_cr; 13320 crhold(*cred_otwp); 13321 } 13322 13323 /* 13324 * Function to process the client's recovery for nfs4frlock. 13325 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13326 * 13327 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13328 * COMPOUND4 args/res for calls that need to retry. 13329 * 13330 * Note: the rp's r_lkserlock is *not* dropped during this path. 13331 */ 13332 static bool_t 13333 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13334 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13335 LOCK4args *lock_args, LOCKU4args *locku_args, 13336 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13337 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13338 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13339 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13340 { 13341 nfs4_open_owner_t *oop = *oopp; 13342 nfs4_open_stream_t *osp = *ospp; 13343 nfs4_lock_owner_t *lop = *lopp; 13344 13345 bool_t abort, retry; 13346 13347 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13348 ASSERT((*argspp) != NULL); 13349 ASSERT((*respp) != NULL); 13350 if (lock_args || locku_args) 13351 ASSERT(lop != NULL); 13352 13353 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13354 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13355 13356 retry = TRUE; 13357 abort = FALSE; 13358 if (needrecov) { 13359 nfs4_bseqid_entry_t *bsep = NULL; 13360 nfs_opnum4 op; 13361 13362 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13363 13364 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13365 seqid4 seqid; 13366 13367 if (lock_args) { 13368 if (lock_args->locker.new_lock_owner == TRUE) 13369 seqid = lock_args->locker.locker4_u. 13370 open_owner.open_seqid; 13371 else 13372 seqid = lock_args->locker.locker4_u. 13373 lock_owner.lock_seqid; 13374 } else if (locku_args) { 13375 seqid = locku_args->seqid; 13376 } else { 13377 seqid = 0; 13378 } 13379 13380 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13381 flk->l_pid, (*argspp)->ctag, seqid); 13382 } 13383 13384 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13385 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13386 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13387 NULL, op, bsep); 13388 13389 if (bsep) 13390 kmem_free(bsep, sizeof (*bsep)); 13391 } 13392 13393 /* 13394 * Return that we do not want to retry the request for 3 cases: 13395 * 1. If we received EINTR or are bailing out because of a forced 13396 * unmount, we came into this code path just for the sake of 13397 * initiating recovery, we now need to return the error. 13398 * 2. If we have aborted recovery. 13399 * 3. We received NFS4ERR_BAD_SEQID. 
13400 */ 13401 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13402 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13403 retry = FALSE; 13404 13405 if (*did_start_fop == TRUE) { 13406 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13407 needrecov); 13408 *did_start_fop = FALSE; 13409 } 13410 13411 if (retry == TRUE) { 13412 nfs_argop4 *argop; 13413 13414 argop = (*argspp)->array; 13415 ASSERT((*argspp)->array_len == 2); 13416 13417 if (argop[1].argop == OP_LOCK) 13418 nfs4args_lock_free(&argop[1]); 13419 else if (argop[1].argop == OP_LOCKT) 13420 nfs4args_lockt_free(&argop[1]); 13421 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13422 if (!ep->error) 13423 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13424 *respp = NULL; 13425 *argspp = NULL; 13426 } 13427 13428 if (lop != NULL) { 13429 nfs4_end_lock_seqid_sync(lop); 13430 lock_owner_rele(lop); 13431 } 13432 13433 *lopp = NULL; 13434 13435 /* need to free up the reference on osp for lock args */ 13436 if (osp != NULL) { 13437 open_stream_rele(osp, rp); 13438 *ospp = NULL; 13439 } 13440 13441 /* need to free up the reference on oop for lock args */ 13442 if (oop != NULL) { 13443 nfs4_end_open_seqid_sync(oop); 13444 open_owner_rele(oop); 13445 *oopp = NULL; 13446 } 13447 13448 return (retry); 13449 } 13450 13451 /* 13452 * Handles the succesful reply from the server for nfs4frlock. 
13453 */ 13454 static void 13455 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13456 vnode_t *vp, int flag, u_offset_t offset, 13457 nfs4_lost_rqst_t *resend_rqstp) 13458 { 13459 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13460 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13461 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13462 if (ctype == NFS4_LCK_CTYPE_NORM) { 13463 flk->l_pid = ttoproc(curthread)->p_pid; 13464 /* 13465 * We do not register lost locks locally in 13466 * the 'resend' case since the user/application 13467 * doesn't think we have the lock. 13468 */ 13469 ASSERT(!resend_rqstp); 13470 nfs4_register_lock_locally(vp, flk, flag, offset); 13471 } 13472 } 13473 } 13474 13475 /* 13476 * Handle the DENIED reply from the server for nfs4frlock. 13477 * Returns TRUE if we should retry the request; FALSE otherwise. 13478 * 13479 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13480 * COMPOUND4 args/res for calls that need to retry. Can also 13481 * drop and regrab the r_lkserlock. 13482 */ 13483 static bool_t 13484 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13485 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13486 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13487 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13488 nfs4_recov_state_t *recov_statep, int needrecov, 13489 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13490 clock_t *tick_delayp, short *whencep, int *errorp, 13491 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13492 bool_t *skip_get_err) 13493 { 13494 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13495 13496 if (lock_args) { 13497 nfs4_open_owner_t *oop = *oopp; 13498 nfs4_open_stream_t *osp = *ospp; 13499 nfs4_lock_owner_t *lop = *lopp; 13500 int intr; 13501 13502 /* 13503 * Blocking lock needs to sleep and retry from the request. 
13504 * 13505 * Do not block and wait for 'resend' or 'reinstate' 13506 * lock requests, just return the error. 13507 * 13508 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13509 */ 13510 if (cmd == F_SETLKW) { 13511 rnode4_t *rp = VTOR4(vp); 13512 nfs_argop4 *argop = (*argspp)->array; 13513 13514 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13515 13516 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13517 recov_statep, needrecov); 13518 *did_start_fop = FALSE; 13519 ASSERT((*argspp)->array_len == 2); 13520 if (argop[1].argop == OP_LOCK) 13521 nfs4args_lock_free(&argop[1]); 13522 else if (argop[1].argop == OP_LOCKT) 13523 nfs4args_lockt_free(&argop[1]); 13524 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13525 if (*respp) 13526 (void) xdr_free(xdr_COMPOUND4res_clnt, 13527 (caddr_t)*respp); 13528 *argspp = NULL; 13529 *respp = NULL; 13530 nfs4_end_lock_seqid_sync(lop); 13531 lock_owner_rele(lop); 13532 *lopp = NULL; 13533 if (osp != NULL) { 13534 open_stream_rele(osp, rp); 13535 *ospp = NULL; 13536 } 13537 if (oop != NULL) { 13538 nfs4_end_open_seqid_sync(oop); 13539 open_owner_rele(oop); 13540 *oopp = NULL; 13541 } 13542 13543 nfs_rw_exit(&rp->r_lkserlock); 13544 13545 intr = nfs4_block_and_wait(tick_delayp, rp); 13546 13547 if (intr) { 13548 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13549 RW_WRITER, FALSE); 13550 *errorp = EINTR; 13551 return (FALSE); 13552 } 13553 13554 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13555 RW_WRITER, FALSE); 13556 13557 /* 13558 * Make sure we are still safe to lock with 13559 * regards to mmapping. 
13560 */ 13561 if (!nfs4_safelock(vp, flk, cr)) { 13562 *errorp = EAGAIN; 13563 return (FALSE); 13564 } 13565 13566 return (TRUE); 13567 } 13568 if (ctype == NFS4_LCK_CTYPE_NORM) 13569 *errorp = EAGAIN; 13570 *skip_get_err = TRUE; 13571 flk->l_whence = 0; 13572 *whencep = 0; 13573 return (FALSE); 13574 } else if (lockt_args) { 13575 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13576 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13577 13578 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13579 flk, lockt_args); 13580 13581 /* according to NLM code */ 13582 *errorp = 0; 13583 *whencep = 0; 13584 *skip_get_err = TRUE; 13585 return (FALSE); 13586 } 13587 return (FALSE); 13588 } 13589 13590 /* 13591 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13592 */ 13593 static void 13594 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13595 { 13596 switch (resp->status) { 13597 case NFS4ERR_ACCESS: 13598 case NFS4ERR_ADMIN_REVOKED: 13599 case NFS4ERR_BADHANDLE: 13600 case NFS4ERR_BAD_RANGE: 13601 case NFS4ERR_BAD_SEQID: 13602 case NFS4ERR_BAD_STATEID: 13603 case NFS4ERR_BADXDR: 13604 case NFS4ERR_DEADLOCK: 13605 case NFS4ERR_DELAY: 13606 case NFS4ERR_EXPIRED: 13607 case NFS4ERR_FHEXPIRED: 13608 case NFS4ERR_GRACE: 13609 case NFS4ERR_INVAL: 13610 case NFS4ERR_ISDIR: 13611 case NFS4ERR_LEASE_MOVED: 13612 case NFS4ERR_LOCK_NOTSUPP: 13613 case NFS4ERR_LOCK_RANGE: 13614 case NFS4ERR_MOVED: 13615 case NFS4ERR_NOFILEHANDLE: 13616 case NFS4ERR_NO_GRACE: 13617 case NFS4ERR_OLD_STATEID: 13618 case NFS4ERR_OPENMODE: 13619 case NFS4ERR_RECLAIM_BAD: 13620 case NFS4ERR_RECLAIM_CONFLICT: 13621 case NFS4ERR_RESOURCE: 13622 case NFS4ERR_SERVERFAULT: 13623 case NFS4ERR_STALE: 13624 case NFS4ERR_STALE_CLIENTID: 13625 case NFS4ERR_STALE_STATEID: 13626 return; 13627 default: 13628 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13629 "nfs4frlock_results_default: got unrecognizable " 13630 "res.status %d", resp->status)); 13631 *errorp = NFS4ERR_INVAL; 
13632 } 13633 } 13634 13635 /* 13636 * The lock request was successful, so update the client's state. 13637 */ 13638 static void 13639 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13640 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13641 vnode_t *vp, flock64_t *flk, cred_t *cr, 13642 nfs4_lost_rqst_t *resend_rqstp) 13643 { 13644 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13645 13646 if (lock_args) { 13647 LOCK4res *lock_res; 13648 13649 lock_res = &resop->nfs_resop4_u.oplock; 13650 /* update the stateid with server's response */ 13651 13652 if (lock_args->locker.new_lock_owner == TRUE) { 13653 mutex_enter(&lop->lo_lock); 13654 lop->lo_just_created = NFS4_PERM_CREATED; 13655 mutex_exit(&lop->lo_lock); 13656 } 13657 13658 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13659 13660 /* 13661 * If the lock was the result of a resending a lost 13662 * request, we've synched up the stateid and seqid 13663 * with the server, but now the server might be out of sync 13664 * with what the application thinks it has for locks. 13665 * Clean that up here. It's unclear whether we should do 13666 * this even if the filesystem has been forcibly unmounted. 13667 * For most servers, it's probably wasted effort, but 13668 * RFC3530 lets servers require that unlocks exactly match 13669 * the locks that are held. 
13670 */ 13671 if (resend_rqstp != NULL && 13672 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13673 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13674 } else { 13675 flk->l_whence = 0; 13676 } 13677 } else if (locku_args) { 13678 LOCKU4res *locku_res; 13679 13680 locku_res = &resop->nfs_resop4_u.oplocku; 13681 13682 /* Update the stateid with the server's response */ 13683 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13684 } else if (lockt_args) { 13685 /* Switch the lock type to express success, see fcntl */ 13686 flk->l_type = F_UNLCK; 13687 flk->l_whence = 0; 13688 } 13689 } 13690 13691 /* 13692 * Do final cleanup before exiting nfs4frlock. 13693 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13694 * COMPOUND4 args/res for calls that haven't already. 13695 */ 13696 static void 13697 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13698 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13699 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13700 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13701 short whence, u_offset_t offset, struct lm_sysid *ls, 13702 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13703 bool_t did_start_fop, bool_t skip_get_err, 13704 cred_t *cred_otw, cred_t *cred) 13705 { 13706 mntinfo4_t *mi = VTOMI4(vp); 13707 rnode4_t *rp = VTOR4(vp); 13708 int error = *errorp; 13709 nfs_argop4 *argop; 13710 13711 ASSERT(nfs_zone() == mi->mi_zone); 13712 /* 13713 * The client recovery code wants the raw status information, 13714 * so don't map the NFS status code to an errno value for 13715 * non-normal call types. 
13716 */ 13717 if (ctype == NFS4_LCK_CTYPE_NORM) { 13718 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13719 *errorp = geterrno4(resp->status); 13720 if (did_start_fop == TRUE) 13721 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13722 needrecov); 13723 13724 if (!error && resp && resp->status == NFS4_OK) { 13725 /* 13726 * We've established a new lock on the server, so invalidate 13727 * the pages associated with the vnode to get the most up to 13728 * date pages from the server after acquiring the lock. We 13729 * want to be sure that the read operation gets the newest data. 13730 * N.B. 13731 * We used to do this in nfs4frlock_results_ok but that doesn't 13732 * work since VOP_PUTPAGE can call nfs4_commit which calls 13733 * nfs4_start_fop. We flush the pages below after calling 13734 * nfs4_end_fop above 13735 */ 13736 int error; 13737 13738 error = VOP_PUTPAGE(vp, (u_offset_t)0, 13739 0, B_INVAL, cred); 13740 13741 if (error && (error == ENOSPC || error == EDQUOT)) { 13742 rnode4_t *rp = VTOR4(vp); 13743 13744 mutex_enter(&rp->r_statelock); 13745 if (!rp->r_error) 13746 rp->r_error = error; 13747 mutex_exit(&rp->r_statelock); 13748 } 13749 } 13750 } 13751 if (argsp) { 13752 ASSERT(argsp->array_len == 2); 13753 argop = argsp->array; 13754 if (argop[1].argop == OP_LOCK) 13755 nfs4args_lock_free(&argop[1]); 13756 else if (argop[1].argop == OP_LOCKT) 13757 nfs4args_lockt_free(&argop[1]); 13758 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13759 if (resp) 13760 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 13761 } 13762 13763 /* free the reference on the lock owner */ 13764 if (lop != NULL) { 13765 nfs4_end_lock_seqid_sync(lop); 13766 lock_owner_rele(lop); 13767 } 13768 13769 /* need to free up the reference on osp for lock args */ 13770 if (osp != NULL) 13771 open_stream_rele(osp, rp); 13772 13773 /* need to free up the reference on oop for lock args */ 13774 if (oop != NULL) { 13775 nfs4_end_open_seqid_sync(oop); 13776 open_owner_rele(oop); 
13777 } 13778 13779 (void) convoff(vp, flk, whence, offset); 13780 13781 lm_rel_sysid(ls); 13782 13783 /* 13784 * Record debug information in the event we get EINVAL. 13785 */ 13786 mutex_enter(&mi->mi_lock); 13787 if (*errorp == EINVAL && (lock_args || locku_args) && 13788 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 13789 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 13790 zcmn_err(getzoneid(), CE_NOTE, 13791 "%s operation failed with " 13792 "EINVAL probably since the server, %s," 13793 " doesn't support POSIX style locking", 13794 lock_args ? "LOCK" : "LOCKU", 13795 mi->mi_curr_serv->sv_hostname); 13796 mi->mi_flags |= MI4_LOCK_DEBUG; 13797 } 13798 } 13799 mutex_exit(&mi->mi_lock); 13800 13801 if (cred_otw) 13802 crfree(cred_otw); 13803 } 13804 13805 /* 13806 * This calls the server and the local locking code. 13807 * 13808 * Client locks are registerred locally by oring the sysid with 13809 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 13810 * We need to distinguish between the two to avoid collision in case one 13811 * machine is used as both client and server. 13812 * 13813 * Blocking lock requests will continually retry to acquire the lock 13814 * forever. 13815 * 13816 * The ctype is defined as follows: 13817 * NFS4_LCK_CTYPE_NORM: normal lock request. 13818 * 13819 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 13820 * recovery, get the pid from flk instead of curproc, and don't reregister 13821 * the lock locally. 13822 * 13823 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 13824 * that we will use the information passed in via resend_rqstp to setup the 13825 * lock/locku request. This resend is the exact same request as the 'lost 13826 * lock', and is initiated by the recovery framework. A successful resend 13827 * request can initiate one or more reinstate requests. 
*
 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
 * does not trigger additional reinstate requests.  This lock call type is
 * set for setting the v4 server's locking state back to match what the
 * client's local locking state is in the event of a received 'lost lock'.
 *
 * Errors are returned via the nfs4_error_t parameter.
 *
 * Parameters:
 *	ctype		call type, see above.
 *	vp		vnode of the file being locked.
 *	cmd		F_GETLK / F_O_GETLK (sent as LOCKT) or F_SETLK /
 *			F_SETLKW (sent as LOCK, or LOCKU when flk->l_type
 *			is F_UNLCK).  Any other value yields EINVAL.
 *	flk		the lock description; updated in place (l_pid,
 *			l_whence, and for LOCKT the conflicting lock or
 *			F_UNLCK).
 *	flag, offset	passed through to argument validation, local lock
 *			registration and the final convoff().
 *	cr		caller's credential; also the fallback credential if
 *			the server refuses the initial over-the-wire one.
 *	ep		out: errno and NFSv4 status of the operation.
 *	resend_rqstp	the lost request being replayed for RESEND and
 *			REINSTATE; must be NULL for the other call types.
 *	did_reclaimp	optional; cleared when a reclaim had to be retried
 *			as an ordinary lock after NFS4ERR_NO_GRACE.
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
	int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
	nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay;	/* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly;	/* just queue request */
	int frc_no_reclaim = 0;	/* set once a reclaim got NFS4ERR_NO_GRACE */
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * Nothing has been acquired yet, so failures up to and including
	 * the sysid lookup return directly instead of going through 'out'.
	 */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Every retry (recovery, credential switch, blocking-lock wait,
	 * reclaim fallback, NFS4ERR_DELAY during setup) restarts from here.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		/*
		 * Pretend the unlock was interrupted so that it is saved as
		 * a lost LOCKU; the EINTR is turned back into success at
		 * the end of this function.
		 */
		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		/* Apply the unlock to the local lock state right away. */
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		/* may be cleared by nfs4frlock_setup_locku_args below */
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * End the fop and free the argument array
				 * by hand, then start over; argsp is
				 * cleared so nothing is freed twice.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		/* Setup decided that no over-the-wire call is needed. */
		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A previous pass got NFS4ERR_NO_GRACE for a reclaim: resend it as
	 * an ordinary lock and tell the caller no reclaim took place.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	/* Hard failure that neither recovery nor a cred switch can fix. */
	if (ep->error != 0 && !needrecov && ep->error != EACCES)
		goto out;

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Access was refused with a credential other than the caller's own:
	 * tear the call down and retry with cr.
	 */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* recovery released and cleared all references */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}

/*
 * nfs4_safelock:
 *
 * Return non-zero if the given lock request can be handled without
 * violating the constraints on concurrent mapping and locking.
 *
 * vp is the file, bfp the requested lock (only l_start, l_len and, for
 * debug output, l_type are examined) and cr the credential used for the
 * attribute lookup.  A file that is not mapped is always safe; a mapped
 * file is safe only for a whole-file request on a file without
 * mandatory locking.  A failed attribute lookup is reported as unsafe.
 */

static int
nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	struct vattr va;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(rp->r_mapcnt >= 0);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
	    "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
	    "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
	    bfp->l_start, bfp->l_len, rp->r_mapcnt));

	if (rp->r_mapcnt == 0)
		return (1);		/* always safe if not mapped */

	/*
	 * If the file is already mapped and there are locks, then they
	 * should be all safe locks.  So adding or removing a lock is safe
	 * as long as the new request is safe (i.e., whole-file, meaning
	 * length and starting offset are both zero).
	 */

	if (bfp->l_start != 0 || bfp->l_len != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot lock a memory mapped file unless locking the "
		    "entire file: start %"PRIx64", len %"PRIx64,
		    bfp->l_start, bfp->l_len));
		return (0);
	}

	/* mandatory locking and mapping don't mix */
	va.va_mask = AT_MODE;
	error = VOP_GETATTR(vp, &va, 0, cr);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "getattr error %d", error));
		return (0);		/* treat errors conservatively */
	}
	if (MANDLOCK(vp, va.va_mode)) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot mandatory lock and mmap a file"));
		return (0);
	}

	return (1);
}


/*
 * Register the lock locally within Solaris.
 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
 * recording locks locally.
 *
 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
 * are registered locally.
14238 */ 14239 void 14240 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14241 u_offset_t offset) 14242 { 14243 int oldsysid; 14244 int error; 14245 #ifdef DEBUG 14246 char *name; 14247 #endif 14248 14249 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14250 14251 #ifdef DEBUG 14252 name = fn_name(VTOSV(vp)->sv_name); 14253 NFS4_DEBUG(nfs4_client_lock_debug, 14254 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14255 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14256 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14257 flk->l_sysid)); 14258 kmem_free(name, MAXNAMELEN); 14259 #endif 14260 14261 /* register the lock with local locking */ 14262 oldsysid = flk->l_sysid; 14263 flk->l_sysid |= LM_SYSID_CLIENT; 14264 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14265 #ifdef DEBUG 14266 if (error != 0) { 14267 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14268 "nfs4_register_lock_locally: could not register with" 14269 " local locking")); 14270 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14271 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14272 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14273 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14274 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14275 flk->l_type, flk->l_start, flk->l_len)); 14276 (void) reclock(vp, flk, 0, flag, offset, NULL); 14277 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14278 "blocked by pid %d sysid 0x%x type %d " 14279 "off 0x%" PRIx64 " len 0x%" PRIx64, 14280 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14281 flk->l_len)); 14282 } 14283 #endif 14284 flk->l_sysid = oldsysid; 14285 } 14286 14287 /* 14288 * nfs4_lockrelease: 14289 * 14290 * Release any locks on the given vnode that are held by the current 14291 * process. Also removes the lock owner (if one exists) from the rnode's 14292 * list. 
14293 */ 14294 static int 14295 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14296 { 14297 flock64_t ld; 14298 int ret, error; 14299 rnode4_t *rp; 14300 nfs4_lock_owner_t *lop; 14301 nfs4_recov_state_t recov_state; 14302 mntinfo4_t *mi; 14303 bool_t possible_orphan = FALSE; 14304 bool_t recovonly; 14305 14306 ASSERT((uintptr_t)vp > KERNELBASE); 14307 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14308 14309 rp = VTOR4(vp); 14310 mi = VTOMI4(vp); 14311 14312 /* 14313 * If we have not locked anything then we can 14314 * just return since we have no work to do. 14315 */ 14316 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14317 return (0); 14318 } 14319 14320 /* 14321 * We need to comprehend that another thread may 14322 * kick off recovery and the lock_owner we have stashed 14323 * in lop might be invalid so we should NOT cache it 14324 * locally! 14325 */ 14326 recov_state.rs_flags = 0; 14327 recov_state.rs_num_retry_despite_err = 0; 14328 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14329 &recovonly); 14330 if (error) { 14331 mutex_enter(&rp->r_statelock); 14332 rp->r_flags |= R4LODANGLERS; 14333 mutex_exit(&rp->r_statelock); 14334 return (error); 14335 } 14336 14337 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14338 14339 /* 14340 * Check if the lock owner might have a lock (request was sent but 14341 * no response was received). Also check if there are any remote 14342 * locks on the file. (In theory we shouldn't have to make this 14343 * second check if there's no lock owner, but for now we'll be 14344 * conservative and do it anyway.) If either condition is true, 14345 * send an unlock for the entire file to the server. 14346 * 14347 * Note that no explicit synchronization is needed here. At worst, 14348 * flk_has_remote_locks() will return a false positive, in which case 14349 * the unlock call wastes time but doesn't harm correctness. 
14350 */ 14351 14352 if (lop) { 14353 mutex_enter(&lop->lo_lock); 14354 possible_orphan = lop->lo_pending_rqsts; 14355 mutex_exit(&lop->lo_lock); 14356 lock_owner_rele(lop); 14357 } 14358 14359 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14360 14361 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14362 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14363 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14364 (void *)lop)); 14365 14366 if (possible_orphan || flk_has_remote_locks(vp)) { 14367 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14368 ld.l_whence = 0; /* unlock from start of file */ 14369 ld.l_start = 0; 14370 ld.l_len = 0; /* do entire file */ 14371 14372 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 14373 14374 if (ret != 0) { 14375 /* 14376 * If VOP_FRLOCK fails, make sure we unregister 14377 * local locks before we continue. 14378 */ 14379 ld.l_pid = ttoproc(curthread)->p_pid; 14380 nfs4_register_lock_locally(vp, &ld, flag, offset); 14381 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14382 "nfs4_lockrelease: lock release error on vp" 14383 " %p: error %d.\n", (void *)vp, ret)); 14384 } 14385 } 14386 14387 recov_state.rs_flags = 0; 14388 recov_state.rs_num_retry_despite_err = 0; 14389 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14390 &recovonly); 14391 if (error) { 14392 mutex_enter(&rp->r_statelock); 14393 rp->r_flags |= R4LODANGLERS; 14394 mutex_exit(&rp->r_statelock); 14395 return (error); 14396 } 14397 14398 /* 14399 * So, here we're going to need to retrieve the lock-owner 14400 * again (in case recovery has done a switch-a-roo) and 14401 * remove it because we can. 14402 */ 14403 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14404 14405 if (lop) { 14406 nfs4_rnode_remove_lock_owner(rp, lop); 14407 lock_owner_rele(lop); 14408 } 14409 14410 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14411 return (0); 14412 } 14413 14414 /* 14415 * Wait for 'tick_delay' clock ticks. 
14416 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14417 * NOTE: lock_lease_time is in seconds. 14418 * 14419 * XXX For future improvements, should implement a waiting queue scheme. 14420 */ 14421 static int 14422 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14423 { 14424 long milliseconds_delay; 14425 time_t lock_lease_time; 14426 14427 /* wait tick_delay clock ticks or siginteruptus */ 14428 if (delay_sig(*tick_delay)) { 14429 return (EINTR); 14430 } 14431 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14432 "reissue the lock request: blocked for %ld clock ticks: %ld " 14433 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14434 14435 /* get the lease time */ 14436 lock_lease_time = r2lease_time(rp); 14437 14438 /* drv_hztousec converts ticks to microseconds */ 14439 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14440 if (milliseconds_delay < lock_lease_time * 1000) { 14441 *tick_delay = 2 * *tick_delay; 14442 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14443 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14444 } 14445 return (0); 14446 } 14447 14448 14449 void 14450 nfs4_vnops_init(void) 14451 { 14452 } 14453 14454 void 14455 nfs4_vnops_fini(void) 14456 { 14457 } 14458 14459 /* 14460 * Return a reference to the directory (parent) vnode for a given vnode, 14461 * using the saved pathname information and the directory file handle. The 14462 * caller is responsible for disposing of the reference. 14463 * Returns zero or an errno value. 14464 * 14465 * Caller should set need_start_op to FALSE if it is the recovery 14466 * thread, or if a start_fop has already been done. Otherwise, TRUE. 
14467 */ 14468 int 14469 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14470 { 14471 svnode_t *svnp; 14472 vnode_t *dvp = NULL; 14473 servinfo4_t *svp; 14474 nfs4_fname_t *mfname; 14475 int error; 14476 14477 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14478 14479 if (vp->v_flag & VROOT) { 14480 nfs4_sharedfh_t *sfh; 14481 nfs_fh4 fh; 14482 mntinfo4_t *mi; 14483 14484 ASSERT(vp->v_type == VREG); 14485 14486 mi = VTOMI4(vp); 14487 svp = mi->mi_curr_serv; 14488 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14489 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14490 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14491 sfh = sfh4_get(&fh, VTOMI4(vp)); 14492 nfs_rw_exit(&svp->sv_lock); 14493 mfname = mi->mi_fname; 14494 fn_hold(mfname); 14495 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14496 sfh4_rele(&sfh); 14497 14498 if (dvp->v_type == VNON) 14499 dvp->v_type = VDIR; 14500 *dvpp = dvp; 14501 return (0); 14502 } 14503 14504 svnp = VTOSV(vp); 14505 14506 if (svnp == NULL) { 14507 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14508 "shadow node is NULL")); 14509 return (EINVAL); 14510 } 14511 14512 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14513 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14514 "shadow node name or dfh val == NULL")); 14515 return (EINVAL); 14516 } 14517 14518 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14519 (int)need_start_op); 14520 if (error != 0) { 14521 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14522 "nfs4_make_dotdot returned %d", error)); 14523 return (error); 14524 } 14525 if (!dvp) { 14526 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14527 "nfs4_make_dotdot returned a NULL dvp")); 14528 return (EIO); 14529 } 14530 if (dvp->v_type == VNON) 14531 dvp->v_type = VDIR; 14532 ASSERT(dvp->v_type == VDIR); 14533 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14534 mutex_enter(&dvp->v_lock); 14535 dvp->v_flag |= V_XATTRDIR; 14536 
mutex_exit(&dvp->v_lock); 14537 } 14538 *dvpp = dvp; 14539 return (0); 14540 } 14541 14542 /* 14543 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14544 * length that fnamep can accept, including the trailing null. 14545 * Returns 0 if okay, returns an errno value if there was a problem. 14546 */ 14547 14548 int 14549 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14550 { 14551 char *fn; 14552 int err = 0; 14553 servinfo4_t *svp; 14554 svnode_t *shvp; 14555 14556 /* 14557 * If the file being opened has VROOT set, then this is 14558 * a "file" mount. sv_name will not be interesting, so 14559 * go back to the servinfo4 to get the original mount 14560 * path and strip off all but the final edge. Otherwise 14561 * just return the name from the shadow vnode. 14562 */ 14563 14564 if (vp->v_flag & VROOT) { 14565 14566 svp = VTOMI4(vp)->mi_curr_serv; 14567 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14568 14569 fn = strrchr(svp->sv_path, '/'); 14570 if (fn == NULL) 14571 err = EINVAL; 14572 else 14573 fn++; 14574 } else { 14575 shvp = VTOSV(vp); 14576 fn = fn_name(shvp->sv_name); 14577 } 14578 14579 if (err == 0) 14580 if (strlen(fn) < maxlen) 14581 (void) strcpy(fnamep, fn); 14582 else 14583 err = ENAMETOOLONG; 14584 14585 if (vp->v_flag & VROOT) 14586 nfs_rw_exit(&svp->sv_lock); 14587 else 14588 kmem_free(fn, MAXNAMELEN); 14589 14590 return (err); 14591 } 14592 14593 /* 14594 * If the vnode has pages, run the list and check for 14595 * any that are still dangling. We call this function 14596 * before the OTW CLOSE occurs so we can B_INVAL the 14597 * danglers. 
14598 */ 14599 static int 14600 nfs4_dross_pages(vnode_t *vp) 14601 { 14602 page_t *pp; 14603 kmutex_t *vphm; 14604 rnode4_t *rp; 14605 14606 /* make sure we're looking at the master vnode, not a shadow */ 14607 rp = VTOR4(vp); 14608 if (IS_SHADOW(vp, rp)) 14609 vp = RTOV4(rp); 14610 14611 vphm = page_vnode_mutex(vp); 14612 mutex_enter(vphm); 14613 if ((pp = vp->v_pages) != NULL) { 14614 do { 14615 if (pp->p_fsdata != C_NOCOMMIT) { 14616 mutex_exit(vphm); 14617 return (1); 14618 } 14619 } while ((pp = pp->p_vpnext) != vp->v_pages); 14620 } 14621 mutex_exit(vphm); 14622 14623 return (0); 14624 } 14625 14626 /* 14627 * Bookkeeping for a close that doesn't need to go over the wire. 14628 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14629 * it is left at 1. 14630 */ 14631 void 14632 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14633 { 14634 rnode4_t *rp; 14635 mntinfo4_t *mi; 14636 14637 mi = VTOMI4(vp); 14638 rp = VTOR4(vp); 14639 14640 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14641 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14642 ASSERT(nfs_zone() == mi->mi_zone); 14643 ASSERT(mutex_owned(&osp->os_sync_lock)); 14644 ASSERT(*have_lockp); 14645 14646 if (!osp->os_valid || 14647 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14648 return; 14649 } 14650 14651 /* 14652 * This removes the reference obtained at OPEN; ie, 14653 * when the open stream structure was created. 14654 * 14655 * We don't have to worry about calling 'open_stream_rele' 14656 * since we our currently holding a reference to this 14657 * open stream which means the count can not go to 0 with 14658 * this decrement. 14659 */ 14660 ASSERT(osp->os_ref_count >= 2); 14661 osp->os_ref_count--; 14662 osp->os_valid = 0; 14663 mutex_exit(&osp->os_sync_lock); 14664 *have_lockp = 0; 14665 14666 nfs4_dec_state_ref_count(mi); 14667 } 14668 14669 /* 14670 * Close all remaining open streams on the rnode. 
These open streams 14671 * could be here because: 14672 * - The close attempted at either close or delmap failed 14673 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14674 * - Someone did mknod on a regular file but never opened it 14675 */ 14676 int 14677 nfs4close_all(vnode_t *vp, cred_t *cr) 14678 { 14679 nfs4_open_stream_t *osp; 14680 int error; 14681 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14682 rnode4_t *rp; 14683 14684 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14685 14686 error = 0; 14687 rp = VTOR4(vp); 14688 14689 /* 14690 * At this point, all we know is that the last time 14691 * someone called vn_rele, the count was 1. Since then, 14692 * the vnode could have been re-activated. We want to 14693 * loop through the open streams and close each one, but 14694 * we have to be careful since once we release the rnode 14695 * hash bucket lock, someone else is free to come in and 14696 * re-activate the rnode and add new open streams. The 14697 * strategy is take the rnode hash bucket lock, verify that 14698 * the count is still 1, grab the open stream off the 14699 * head of the list and mark it invalid, then release the 14700 * rnode hash bucket lock and proceed with that open stream. 14701 * This is ok because nfs4close_one() will acquire the proper 14702 * open/create to close/destroy synchronization for open 14703 * streams, and will ensure that if someone has reopened 14704 * the open stream after we've dropped the hash bucket lock 14705 * then we'll just simply return without destroying the 14706 * open stream. 14707 * Repeat until the list is empty. 
14708 */ 14709 14710 for (;;) { 14711 14712 /* make sure vnode hasn't been reactivated */ 14713 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14714 mutex_enter(&vp->v_lock); 14715 if (vp->v_count > 1) { 14716 mutex_exit(&vp->v_lock); 14717 rw_exit(&rp->r_hashq->r_lock); 14718 break; 14719 } 14720 /* 14721 * Grabbing r_os_lock before releasing v_lock prevents 14722 * a window where the rnode/open stream could get 14723 * reactivated (and os_force_close set to 0) before we 14724 * had a chance to set os_force_close to 1. 14725 */ 14726 mutex_enter(&rp->r_os_lock); 14727 mutex_exit(&vp->v_lock); 14728 14729 osp = list_head(&rp->r_open_streams); 14730 if (!osp) { 14731 /* nothing left to CLOSE OTW, so return */ 14732 mutex_exit(&rp->r_os_lock); 14733 rw_exit(&rp->r_hashq->r_lock); 14734 break; 14735 } 14736 14737 mutex_enter(&rp->r_statev4_lock); 14738 /* the file can't still be mem mapped */ 14739 ASSERT(rp->r_mapcnt == 0); 14740 if (rp->created_v4) 14741 rp->created_v4 = 0; 14742 mutex_exit(&rp->r_statev4_lock); 14743 14744 /* 14745 * Grab a ref on this open stream; nfs4close_one 14746 * will mark it as invalid 14747 */ 14748 mutex_enter(&osp->os_sync_lock); 14749 osp->os_ref_count++; 14750 osp->os_force_close = 1; 14751 mutex_exit(&osp->os_sync_lock); 14752 mutex_exit(&rp->r_os_lock); 14753 rw_exit(&rp->r_hashq->r_lock); 14754 14755 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 14756 14757 /* Update error if it isn't already non-zero */ 14758 if (error == 0) { 14759 if (e.error) 14760 error = e.error; 14761 else if (e.stat) 14762 error = geterrno4(e.stat); 14763 } 14764 14765 #ifdef DEBUG 14766 nfs4close_all_cnt++; 14767 #endif 14768 /* Release the ref on osp acquired above. */ 14769 open_stream_rele(osp, rp); 14770 14771 /* Proceed to the next open stream, if any */ 14772 } 14773 return (error); 14774 } 14775 14776 /* 14777 * nfs4close_one - close one open stream for a file if needed. 
 *
 * "close_type" indicates which close path this is:
 * CLOSE_NORM: close initiated via VOP_CLOSE.
 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
 *	the close and release of client state for this open stream
 *	(unless someone else has the open stream open).
 * CLOSE_RESEND: indicates the request is a replay of an earlier request
 *	(e.g., due to abort because of a signal).
 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
 *
 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
 * recovery. Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.  When it
 * is not passed in, the open owner is looked up from 'cr' and the open
 * stream from that open owner and the rnode.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * 'lrp' is handed to nfs4_open_downgrade() and is expected to be NULL when
 * the operation start reports that only recovery may proceed.
 *
 * Errors are returned via the nfs4_error_t.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
	int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
	nfs4_close_type_t close_type, size_t len, uint_t maxprot,
	uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	/*
	 * did_dec_count is deliberately NOT reset at recov_retry, so the
	 * open/mmap counts are decremented at most once per call.
	 */
	int did_dec_count = 0;
	/* the did_ and have_ flags below record what 'out' must undo */
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
recov_retry:
	/* every retry starts over with fresh per-attempt state */
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes. Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		/* all mmap counters are kept in pages, hence btopr(len) */
		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	/*
	 * Only recovery may run right now: save the close as a lost
	 * request, kick off recovery, and skip the OTW close.
	 */
	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL);
		close_failed = 1;
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too. So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If the force close path failed to obtain start_fop
	 * then skip the OTW close and just remove the state.
	 * (close_failed may also have been set just above for a changed
	 * oo_name or a file that failed recovery.)
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream. If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t new_lost_rqst;
		bool_t needrecov = FALSE;
		cred_t *odg_cred_otw = NULL;
		seqid4 open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			/* undo this attempt's holds by hand, then retry */
			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close. Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery. In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/* on a writable mount, invalidate the pages nfs4_dross_pages() flags */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    nfs4_dross_pages(vp)) {
		nfs4_invalidate_pages(vp, 0, cred_otw);
	}

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		/* os_pending_close is only changed under 'os_sync_lock' */
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	/* common exit: release whatever the flags say is still held */
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}

/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 *
 * lockt_args is only consulted for its owner.clientid, which is compared
 * with the clientid of the owner that holds the conflicting lock.
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	/* anything that is not a READ_LT is reported as a write lock */
	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it). If not, then
	 * fabricate a sysid and pid. Note that the l_sysid field
	 * in *flk already has the local sysid.
	 */

	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
			lo = (nfs4_lo_name_t *)
			    lockt_denied->owner.owner_val;

			flk->l_pid = lo->ln_pid;
		} else {
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "denied_to_flk: bad lock owner length\n"));

			flk->l_pid = lo_to_pid(&lockt_denied->owner);
		}
	} else {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		"denied_to_flk: foreign clientid\n"));

		/*
		 * Construct a new sysid which should be different from
		 * sysids of other systems.
		 */

		flk->l_sysid++;
		flk->l_pid = lo_to_pid(&lockt_denied->owner);
	}
}

/*
 * Fabricate a pid for a lock owner we cannot interpret: the sum of every
 * byte of the owner's clientid and of its owner_val.
 */
static pid_t
lo_to_pid(lock_owner4 *lop)
{
	pid_t pid = 0;
	uchar_t *cp;
	int i;

	cp = (uchar_t *)&lop->clientid;

	for (i = 0; i < sizeof (lop->clientid); i++)
		pid += (pid_t)*cp++;

	cp = (uchar_t *)lop->owner_val;

	for (i = 0; i < lop->owner_len; i++)
		pid += (pid_t)*cp++;

	return (pid);
}

/*
 * Given a lock pointer, returns the end of that lock.
 * "end" is the last locked offset the "l_len" covers from
 * the start of the lock.  An l_len of 0 yields MAXEND.
 */
static off64_t
lock_to_end(flock64_t *lock)
{
	off64_t lock_end;

	if (lock->l_len == 0)
		lock_end = (off64_t)MAXEND;
	else
		lock_end = lock->l_start + lock->l_len - 1;

	return (lock_end);
}

/*
 * Given the end of a lock, it will return you the length "l_len" for that lock.
15411 */ 15412 static off64_t 15413 end_to_len(off64_t start, off64_t end) 15414 { 15415 off64_t lock_len; 15416 15417 ASSERT(end >= start); 15418 if (end == MAXEND) 15419 lock_len = 0; 15420 else 15421 lock_len = end - start + 1; 15422 15423 return (lock_len); 15424 } 15425 15426 /* 15427 * On given end for a lock it determines if it is the last locked offset 15428 * or not, if so keeps it as is, else adds one to return the length for 15429 * valid start. 15430 */ 15431 static off64_t 15432 start_check(off64_t x) 15433 { 15434 if (x == MAXEND) 15435 return (x); 15436 else 15437 return (x + 1); 15438 } 15439 15440 /* 15441 * See if these two locks overlap, and if so return 1; 15442 * otherwise, return 0. 15443 */ 15444 static int 15445 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15446 { 15447 off64_t llfp_end, curfp_end; 15448 15449 llfp_end = lock_to_end(llfp); 15450 curfp_end = lock_to_end(curfp); 15451 15452 if (((llfp_end >= curfp->l_start) && 15453 (llfp->l_start <= curfp->l_start)) || 15454 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15455 return (1); 15456 return (0); 15457 } 15458 15459 /* 15460 * Determine what the interseting lock region is, and add that to the 15461 * 'nl_llpp' locklist in increasing order (by l_start). 
15462 */ 15463 static void 15464 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15465 locklist_t **nl_llpp, vnode_t *vp) 15466 { 15467 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15468 off64_t lost_flp_end, local_flp_end, len, start; 15469 15470 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15471 15472 if (!locks_intersect(lost_flp, local_flp)) 15473 return; 15474 15475 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15476 "locks intersect")); 15477 15478 lost_flp_end = lock_to_end(lost_flp); 15479 local_flp_end = lock_to_end(local_flp); 15480 15481 /* Find the starting point of the intersecting region */ 15482 if (local_flp->l_start > lost_flp->l_start) 15483 start = local_flp->l_start; 15484 else 15485 start = lost_flp->l_start; 15486 15487 /* Find the lenght of the intersecting region */ 15488 if (lost_flp_end < local_flp_end) 15489 len = end_to_len(start, lost_flp_end); 15490 else 15491 len = end_to_len(start, local_flp_end); 15492 15493 /* 15494 * Prepare the flock structure for the intersection found and insert 15495 * it into the new list in increasing l_start order. This list contains 15496 * intersections of locks registered by the client with the local host 15497 * and the lost lock. 15498 * The lock type of this lock is the same as that of the local_flp. 
15499 */ 15500 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15501 intersect_llp->ll_flock.l_start = start; 15502 intersect_llp->ll_flock.l_len = len; 15503 intersect_llp->ll_flock.l_type = local_flp->l_type; 15504 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15505 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15506 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15507 intersect_llp->ll_vp = vp; 15508 15509 tmp_fllp = *nl_llpp; 15510 cur_fllp = NULL; 15511 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15512 intersect_llp->ll_flock.l_start) { 15513 cur_fllp = tmp_fllp; 15514 tmp_fllp = tmp_fllp->ll_next; 15515 } 15516 if (cur_fllp == NULL) { 15517 /* first on the list */ 15518 intersect_llp->ll_next = *nl_llpp; 15519 *nl_llpp = intersect_llp; 15520 } else { 15521 intersect_llp->ll_next = cur_fllp->ll_next; 15522 cur_fllp->ll_next = intersect_llp; 15523 } 15524 15525 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15526 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15527 intersect_llp->ll_flock.l_start, 15528 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15529 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15530 } 15531 15532 /* 15533 * Our local locking current state is potentially different than 15534 * what the NFSv4 server thinks we have due to a lost lock that was 15535 * resent and then received. We need to reset our "NFSv4" locking 15536 * state to match the current local locking state for this pid since 15537 * that is what the user/application sees as what the world is. 15538 * 15539 * We cannot afford to drop the open/lock seqid sync since then we can 15540 * get confused about what the current local locking state "is" versus 15541 * "was". 15542 * 15543 * If we are unable to fix up the locks, we send SIGLOST to the affected 15544 * process. 
This is not done if the filesystem has been forcibly 15545 * unmounted, in case the process has already exited and a new process 15546 * exists with the same pid. 15547 */ 15548 static void 15549 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15550 nfs4_lock_owner_t *lop) 15551 { 15552 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15553 mntinfo4_t *mi = VTOMI4(vp); 15554 const int cmd = F_SETLK; 15555 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15556 flock64_t ul_fl; 15557 15558 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15559 "nfs4_reinstitute_local_lock_state")); 15560 15561 /* 15562 * Find active locks for this vp from the local locking code. 15563 * Scan through this list and find out the locks that intersect with 15564 * the lost lock. Once we find the lock that intersects, add the 15565 * intersection area as a new lock to a new list "ri_llp". The lock 15566 * type of the intersection region lock added to ri_llp is the same 15567 * as that found in the active lock list, "list". The intersecting 15568 * region locks are added to ri_llp in increasing l_start order. 15569 */ 15570 ASSERT(nfs_zone() == mi->mi_zone); 15571 15572 locks = flk_active_locks_for_vp(vp); 15573 ri_llp = NULL; 15574 15575 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15576 ASSERT(llp->ll_vp == vp); 15577 /* 15578 * Pick locks that belong to this pid/lockowner 15579 */ 15580 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15581 continue; 15582 15583 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15584 } 15585 15586 /* 15587 * Now we have the list of intersections with the lost lock. These are 15588 * the locks that were/are active before the server replied to the 15589 * last/lost lock. Issue these locks to the server here. Playing these 15590 * locks to the server will re-establish aur current local locking state 15591 * with the v4 server. 15592 * If we get an error, send SIGLOST to the application for that lock. 
15593 */ 15594 15595 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15596 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15597 "nfs4_reinstitute_local_lock_state: need to issue " 15598 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15599 llp->ll_flock.l_start, 15600 llp->ll_flock.l_start + llp->ll_flock.l_len, 15601 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15602 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15603 /* 15604 * No need to relock what we already have 15605 */ 15606 if (llp->ll_flock.l_type == lost_flp->l_type) 15607 continue; 15608 15609 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15610 } 15611 15612 /* 15613 * Now keeping the start of the lost lock as our reference parse the 15614 * newly created ri_llp locklist to find the ranges that we have locked 15615 * with the v4 server but not in the current local locking. We need 15616 * to unlock these ranges. 15617 * These ranges can also be reffered to as those ranges, where the lost 15618 * lock does not overlap with the locks in the ri_llp but are locked 15619 * since the server replied to the lost lock. 
15620 */ 15621 cur_start = lost_flp->l_start; 15622 lost_flp_end = lock_to_end(lost_flp); 15623 15624 ul_fl.l_type = F_UNLCK; 15625 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15626 ul_fl.l_sysid = lost_flp->l_sysid; 15627 ul_fl.l_pid = lost_flp->l_pid; 15628 15629 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15630 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15631 15632 if (llp->ll_flock.l_start <= cur_start) { 15633 cur_start = start_check(llp_ll_flock_end); 15634 continue; 15635 } 15636 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15637 "nfs4_reinstitute_local_lock_state: " 15638 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15639 cur_start, llp->ll_flock.l_start)); 15640 15641 ul_fl.l_start = cur_start; 15642 ul_fl.l_len = end_to_len(cur_start, 15643 (llp->ll_flock.l_start - 1)); 15644 15645 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15646 cur_start = start_check(llp_ll_flock_end); 15647 } 15648 15649 /* 15650 * In the case where the lost lock ends after all intersecting locks, 15651 * unlock the last part of the lost lock range. 15652 */ 15653 if (cur_start != start_check(lost_flp_end)) { 15654 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15655 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15656 "lost lock region [%"PRIx64" - %"PRIx64"]", 15657 cur_start, lost_flp->l_start + lost_flp->l_len)); 15658 15659 ul_fl.l_start = cur_start; 15660 /* 15661 * Is it an to-EOF lock? 
if so unlock till the end 15662 */ 15663 if (lost_flp->l_len == 0) 15664 ul_fl.l_len = 0; 15665 else 15666 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15667 15668 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15669 } 15670 15671 if (locks != NULL) 15672 flk_free_locklist(locks); 15673 15674 /* Free up our newly created locklist */ 15675 for (llp = ri_llp; llp != NULL; ) { 15676 tmp_llp = llp->ll_next; 15677 kmem_free(llp, sizeof (locklist_t)); 15678 llp = tmp_llp; 15679 } 15680 15681 /* 15682 * Now return back to the original calling nfs4frlock() 15683 * and let us naturally drop our seqid syncs. 15684 */ 15685 } 15686 15687 /* 15688 * Create a lost state record for the given lock reinstantiation request 15689 * and push it onto the lost state queue. 15690 */ 15691 static void 15692 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15693 nfs4_lock_owner_t *lop) 15694 { 15695 nfs4_lost_rqst_t req; 15696 nfs_lock_type4 locktype; 15697 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15698 15699 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15700 15701 locktype = flk_to_locktype(cmd, flk->l_type); 15702 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15703 NULL, NULL, lop, flk, &req, cr, vp); 15704 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15705 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15706 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15707 NULL); 15708 } 15709